even more testing/playing - with "Der_Spiegel", "settler_colonialism"+gender

db293256 · Schoeneh · 679e4b1b · db293256
Commit db293256 authored 3 months ago by Schoeneh
--- a/llms/gensim-test.ipynb
+++ b/llms/gensim-test.ipynb
@@ -131,6 +131,356 @@
   "source": [
    "wv.most_similar(\"nazi\")"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Donald_Trump', 0.8103920221328735),\n",
+       " ('impersonator_entertained', 0.5942257046699524),\n",
+       " ('Ivanka_Trump', 0.5924582481384277),\n",
+       " ('Ivanka', 0.5607207417488098),\n",
+       " ('mogul_Donald_Trump', 0.5592453479766846),\n",
+       " ('Trump_Tower', 0.548555314540863),\n",
+       " ('Kepcher', 0.5468589067459106),\n",
+       " ('billionaire_Donald_Trump', 0.5447269082069397),\n",
+       " ('Trumpster', 0.5412818193435669),\n",
+       " ('tycoon_Donald_Trump', 0.5383972525596619)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"Trump\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Israeli', 0.8130459785461426),\n",
+       " ('Israelis', 0.779090940952301),\n",
+       " ('Palestinians', 0.7580956220626831),\n",
+       " ('Palestinian', 0.7473597526550293),\n",
+       " ('Netanyahu', 0.7082809805870056),\n",
+       " ('Gaza', 0.7046299576759338),\n",
+       " ('Hamas', 0.6912718415260315),\n",
+       " ('Gaza_Strip', 0.6873201727867126),\n",
+       " ('Palestinian_Authority', 0.6648115515708923),\n",
+       " ('Prime_Minister_Binyamin_Netanyahu', 0.6640220880508423)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"Israel\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Fuchs', 0.5700803995132446),\n",
+       " ('Weil', 0.5526396632194519),\n",
+       " ('weekly_newsmagazine_Der', 0.5345348119735718),\n",
+       " ('Berman', 0.5249170660972595),\n",
+       " ('Stein', 0.5195315480232239),\n",
+       " ('Der_Spiegel', 0.5161873698234558),\n",
+       " ('Nussbaum', 0.5158510208129883),\n",
+       " ('Welt', 0.5128974914550781),\n",
+       " ('Klein', 0.5109302401542664),\n",
+       " ('Ulrich', 0.5099592208862305)]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"Spiegel\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('magazine_Der_Spiegel', 0.7872042655944824),\n",
+       " ('weekly_Der_Spiegel', 0.7623571753501892),\n",
+       " ('Die_Zeit', 0.7383401393890381),\n",
+       " ('Frankfurter_Allgemeine_Zeitung', 0.7346989512443542),\n",
+       " ('Die_Welt', 0.7314777374267578),\n",
+       " ('Der_Spiegel_magazine', 0.7263863682746887),\n",
+       " ('Süddeutsche_Zeitung', 0.7214947938919067),\n",
+       " ('Handelsblatt', 0.7061707377433777),\n",
+       " ('Tagesspiegel_daily', 0.7048733830451965),\n",
+       " ('Spiegel_Online', 0.7014873623847961)]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"Der_Spiegel\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.26165980100631714"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.distance(\"Der_Spiegel\", \"Die_Zeit\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.2685222625732422"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.distance(\"Der_Spiegel\", \"Die_Welt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.2785053253173828"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.distance(\"Der_Spiegel\", \"Süddeutsche_Zeitung\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('imperialism', 0.7371744513511658),\n",
+       " ('colonialists', 0.7273486852645874),\n",
+       " ('neo_colonialism', 0.7152635455131531),\n",
+       " ('Colonialism', 0.6945492029190063),\n",
+       " ('colonial_domination', 0.6901723146438599),\n",
+       " ('colonialist', 0.6886431574821472),\n",
+       " ('colonial', 0.6881863474845886),\n",
+       " ('slavery_colonialism', 0.6797659397125244),\n",
+       " ('colonial_rule', 0.6758955717086792),\n",
+       " ('colonization', 0.6730928421020508)]"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"colonialism\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('colonial', 0.8822019100189209),\n",
+       " ('oppression', 0.8728238940238953),\n",
+       " ('colonialists', 0.8726308941841125),\n",
+       " ('feminism', 0.8686202764511108),\n",
+       " ('imperialism', 0.8678603768348694),\n",
+       " ('patriarchy', 0.8666298389434814),\n",
+       " ('colonization', 0.8656938076019287),\n",
+       " ('colonial_rule', 0.86388099193573),\n",
+       " ('slavery', 0.8588250875473022),\n",
+       " ('subjugation', 0.8580973744392395)]"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar_cosmul(positive=['colonialism', 'woman'], negative=['man'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('neo_colonialism', 0.8733060956001282),\n",
+       " ('imperialism', 0.869311511516571),\n",
+       " ('slavery_colonialism', 0.866578996181488),\n",
+       " ('colonialists', 0.8548066020011902),\n",
+       " ('colonialist', 0.8444662094116211),\n",
+       " ('imperialist_domination', 0.8404235243797302),\n",
+       " ('Colonialism', 0.8400565981864929),\n",
+       " ('imperialism_colonialism', 0.8381094932556152),\n",
+       " ('colonial_domination', 0.8354700207710266),\n",
+       " ('colonialization', 0.8350632190704346)]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar_cosmul(positive=['colonialism', 'man'], negative=['woman'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('colonial_subjugation', 0.6939011216163635),\n",
+       " ('colonized_peoples', 0.678598940372467),\n",
+       " ('colonial_conquest', 0.6679588556289673),\n",
+       " ('imperialist_domination', 0.6554943323135376),\n",
+       " ('colonial_settler', 0.6515358686447144),\n",
+       " ('slavery_colonialism', 0.6513102054595947),\n",
+       " ('ethnocracy', 0.6486039161682129),\n",
+       " ('colonial_domination', 0.6479084491729736),\n",
+       " ('settler_colonial', 0.644547700881958),\n",
+       " ('imperialism_colonialism', 0.6408925652503967)]"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"settler_colonialism\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('colonial_subjugation', 0.8631111979484558),\n",
+       " ('subjugation', 0.8551343679428101),\n",
+       " ('colonized_peoples', 0.8545337915420532),\n",
+       " ('colonial_conquest', 0.8533400893211365),\n",
+       " ('colonial_settler', 0.8425801396369934),\n",
+       " ('colonialism', 0.8342924118041992),\n",
+       " ('patriarchy', 0.8340162634849548),\n",
+       " ('colonial_domination', 0.8334349393844604),\n",
+       " ('Zionist_expansionism', 0.8305999040603638),\n",
+       " ('colonial_empires', 0.8288437724113464)]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar_cosmul(positive=['settler_colonialism', 'woman'], negative=['man'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('slavery_colonialism', 0.8518989086151123),\n",
+       " ('settler_colonial', 0.8481582403182983),\n",
+       " ('Hitlerism', 0.8431347012519836),\n",
+       " ('imperialism_colonialism', 0.8403087258338928),\n",
+       " ('Nazism_fascism', 0.8380133509635925),\n",
+       " ('imperialist_domination', 0.8367621898651123),\n",
+       " ('totalitarian_ideologies', 0.8347264528274536),\n",
+       " ('neo_colonialists', 0.8338908553123474),\n",
+       " ('predatory_capitalism', 0.8327714800834656),\n",
+       " ('proletarian_internationalism', 0.8313636779785156)]"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar_cosmul(positive=['settler_colonialism', 'man'], negative=['woman'])"
+   ]
  }
 ],
 "metadata": {

 %% Cell type:markdown id: tags:
 # Testing gensim
 See https://radimrehurek.com/gensim/
 %% Cell type:code id: tags:
 ``` python
 !pip install --upgrade gensim
 ```
 %% Cell type:code id: tags:
 ``` python
 import gensim.downloader as api
 ```
 %% Cell type:code id: tags:
 ``` python
 info = api.info()
 for model_name, model_data in sorted(info['models'].items()):
    print('%s (%d records): %s' % (model_name, model_data.get('num_records', -1), model_data['description'][:80] + '...'))
 ```
 %% Output
    __testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....
    conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...
    fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...
    glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https:...
    glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
    glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
    glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...
    glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, ...
    glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...
    glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...
    glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...
    word2vec-google-news-300 (3000000 records): Pre-trained vectors trained on a part of the Google News dataset (about 100 bill...
    word2vec-ruscorpora-300 (184973 records): Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (ab...
 %% Cell type:code id: tags:
 ``` python
 wv = api.load('word2vec-google-news-300')
 ```
 %% Output
    [==================================================] 100.0% 1662.8/1662.8MB downloaded
 %% Cell type:code id: tags:
 ``` python
 wv.most_similar("jew")
 ```
 %% Output
    [('jews', 0.606805145740509),
     ('jewish', 0.5944611430168152),
     ('rahm', 0.5944365859031677),
     ('mhux', 0.5918845534324646),
     ('yid', 0.5769580006599426),
     ('jessie', 0.5755242109298706),
     ('yur', 0.5660163164138794),
     ('israel', 0.5639604330062866),
     ('gilbert', 0.5632734894752502),
     ('kol', 0.5615833401679993)]
 %% Cell type:code id: tags:
 ``` python
 wv.most_similar("nazi")
 ```
 %% Output
    [('nazis', 0.6923775672912598),
     ('fascist', 0.657628059387207),
     ('Nazi', 0.6324446201324463),
     ('facist', 0.6276720762252808),
     ('fascists', 0.6110973358154297),
     ('Hilter', 0.5978641510009766),
     ('Hitler', 0.5964925289154053),
     ('hitler', 0.5891590714454651),
     ('NAZI', 0.5822753310203552),
     ('Fascist', 0.5806231498718262)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("Trump")
+```
+%% Output
+    [('Donald_Trump', 0.8103920221328735),
+     ('impersonator_entertained', 0.5942257046699524),
+     ('Ivanka_Trump', 0.5924582481384277),
+     ('Ivanka', 0.5607207417488098),
+     ('mogul_Donald_Trump', 0.5592453479766846),
+     ('Trump_Tower', 0.548555314540863),
+     ('Kepcher', 0.5468589067459106),
+     ('billionaire_Donald_Trump', 0.5447269082069397),
+     ('Trumpster', 0.5412818193435669),
+     ('tycoon_Donald_Trump', 0.5383972525596619)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("Israel")
+```
+%% Output
+    [('Israeli', 0.8130459785461426),
+     ('Israelis', 0.779090940952301),
+     ('Palestinians', 0.7580956220626831),
+     ('Palestinian', 0.7473597526550293),
+     ('Netanyahu', 0.7082809805870056),
+     ('Gaza', 0.7046299576759338),
+     ('Hamas', 0.6912718415260315),
+     ('Gaza_Strip', 0.6873201727867126),
+     ('Palestinian_Authority', 0.6648115515708923),
+     ('Prime_Minister_Binyamin_Netanyahu', 0.6640220880508423)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("Spiegel")
+```
+%% Output
+    [('Fuchs', 0.5700803995132446),
+     ('Weil', 0.5526396632194519),
+     ('weekly_newsmagazine_Der', 0.5345348119735718),
+     ('Berman', 0.5249170660972595),
+     ('Stein', 0.5195315480232239),
+     ('Der_Spiegel', 0.5161873698234558),
+     ('Nussbaum', 0.5158510208129883),
+     ('Welt', 0.5128974914550781),
+     ('Klein', 0.5109302401542664),
+     ('Ulrich', 0.5099592208862305)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("Der_Spiegel")
+```
+%% Output
+    [('magazine_Der_Spiegel', 0.7872042655944824),
+     ('weekly_Der_Spiegel', 0.7623571753501892),
+     ('Die_Zeit', 0.7383401393890381),
+     ('Frankfurter_Allgemeine_Zeitung', 0.7346989512443542),
+     ('Die_Welt', 0.7314777374267578),
+     ('Der_Spiegel_magazine', 0.7263863682746887),
+     ('Süddeutsche_Zeitung', 0.7214947938919067),
+     ('Handelsblatt', 0.7061707377433777),
+     ('Tagesspiegel_daily', 0.7048733830451965),
+     ('Spiegel_Online', 0.7014873623847961)]
+%% Cell type:code id: tags:
+``` python
+wv.distance("Der_Spiegel", "Die_Zeit")
+```
+%% Output
+    0.26165980100631714
+%% Cell type:code id: tags:
+``` python
+wv.distance("Der_Spiegel", "Die_Welt")
+```
+%% Output
+    0.2685222625732422
+%% Cell type:code id: tags:
+``` python
+wv.distance("Der_Spiegel", "Süddeutsche_Zeitung")
+```
+%% Output
+    0.2785053253173828
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("colonialism")
+```
+%% Output
+    [('imperialism', 0.7371744513511658),
+     ('colonialists', 0.7273486852645874),
+     ('neo_colonialism', 0.7152635455131531),
+     ('Colonialism', 0.6945492029190063),
+     ('colonial_domination', 0.6901723146438599),
+     ('colonialist', 0.6886431574821472),
+     ('colonial', 0.6881863474845886),
+     ('slavery_colonialism', 0.6797659397125244),
+     ('colonial_rule', 0.6758955717086792),
+     ('colonization', 0.6730928421020508)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar_cosmul(positive=['colonialism', 'woman'], negative=['man'])
+```
+%% Output
+    [('colonial', 0.8822019100189209),
+     ('oppression', 0.8728238940238953),
+     ('colonialists', 0.8726308941841125),
+     ('feminism', 0.8686202764511108),
+     ('imperialism', 0.8678603768348694),
+     ('patriarchy', 0.8666298389434814),
+     ('colonization', 0.8656938076019287),
+     ('colonial_rule', 0.86388099193573),
+     ('slavery', 0.8588250875473022),
+     ('subjugation', 0.8580973744392395)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar_cosmul(positive=['colonialism', 'man'], negative=['woman'])
+```
+%% Output
+    [('neo_colonialism', 0.8733060956001282),
+     ('imperialism', 0.869311511516571),
+     ('slavery_colonialism', 0.866578996181488),
+     ('colonialists', 0.8548066020011902),
+     ('colonialist', 0.8444662094116211),
+     ('imperialist_domination', 0.8404235243797302),
+     ('Colonialism', 0.8400565981864929),
+     ('imperialism_colonialism', 0.8381094932556152),
+     ('colonial_domination', 0.8354700207710266),
+     ('colonialization', 0.8350632190704346)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar("settler_colonialism")
+```
+%% Output
+    [('colonial_subjugation', 0.6939011216163635),
+     ('colonized_peoples', 0.678598940372467),
+     ('colonial_conquest', 0.6679588556289673),
+     ('imperialist_domination', 0.6554943323135376),
+     ('colonial_settler', 0.6515358686447144),
+     ('slavery_colonialism', 0.6513102054595947),
+     ('ethnocracy', 0.6486039161682129),
+     ('colonial_domination', 0.6479084491729736),
+     ('settler_colonial', 0.644547700881958),
+     ('imperialism_colonialism', 0.6408925652503967)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar_cosmul(positive=['settler_colonialism', 'woman'], negative=['man'])
+```
+%% Output
+    [('colonial_subjugation', 0.8631111979484558),
+     ('subjugation', 0.8551343679428101),
+     ('colonized_peoples', 0.8545337915420532),
+     ('colonial_conquest', 0.8533400893211365),
+     ('colonial_settler', 0.8425801396369934),
+     ('colonialism', 0.8342924118041992),
+     ('patriarchy', 0.8340162634849548),
+     ('colonial_domination', 0.8334349393844604),
+     ('Zionist_expansionism', 0.8305999040603638),
+     ('colonial_empires', 0.8288437724113464)]
+%% Cell type:code id: tags:
+``` python
+wv.most_similar_cosmul(positive=['settler_colonialism', 'man'], negative=['woman'])
+```
+%% Output
+    [('slavery_colonialism', 0.8518989086151123),
+     ('settler_colonial', 0.8481582403182983),
+     ('Hitlerism', 0.8431347012519836),
+     ('imperialism_colonialism', 0.8403087258338928),
+     ('Nazism_fascism', 0.8380133509635925),
+     ('imperialist_domination', 0.8367621898651123),
+     ('totalitarian_ideologies', 0.8347264528274536),
+     ('neo_colonialists', 0.8338908553123474),
+     ('predatory_capitalism', 0.8327714800834656),
+     ('proletarian_internationalism', 0.8313636779785156)]