{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Testing gensim\n", "See https://radimrehurek.com/gensim/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install --upgrade gensim" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import gensim.downloader as api" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....\n", "conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...\n", "fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...\n", "glove-twitter-100 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:...\n", "glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n", "glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n", "glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n", "glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, ...\n", "glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n", "glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n", "glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n", "word2vec-google-news-300 (3000000 records): Pre-trained 
vectors trained on a part of the Google News dataset (about 100 bill...\n", "word2vec-ruscorpora-300 (184973 records): Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (ab...\n" ] } ], "source": [ "# Print one line per downloadable model: name, record count, truncated description.\n", "info = api.info()\n", "for model_name in sorted(info['models']):\n", "    model_data = info['models'][model_name]\n", "    num_records = model_data.get('num_records', -1)  # -1 when the entry carries no count\n", "    summary = model_data['description'][:80] + '...'\n", "    print('%s (%d records): %s' % (model_name, num_records, summary))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n" ] } ], "source": [ "# Load the Google News word2vec vectors through the gensim downloader.\n", "wv = api.load('word2vec-google-news-300')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('jews', 0.606805145740509),\n", " ('jewish', 0.5944611430168152),\n", " ('rahm', 0.5944365859031677),\n", " ('mhux', 0.5918845534324646),\n", " ('yid', 0.5769580006599426),\n", " ('jessie', 0.5755242109298706),\n", " ('yur', 0.5660163164138794),\n", " ('israel', 0.5639604330062866),\n", " ('gilbert', 0.5632734894752502),\n", " ('kol', 0.5615833401679993)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"jew\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('nazis', 0.6923775672912598),\n", " ('fascist', 0.657628059387207),\n", " ('Nazi', 0.6324446201324463),\n", " ('facist', 0.6276720762252808),\n", " ('fascists', 0.6110973358154297),\n", " ('Hilter', 0.5978641510009766),\n", " ('Hitler', 0.5964925289154053),\n", " ('hitler', 0.5891590714454651),\n", " ('NAZI', 0.5822753310203552),\n", " ('Fascist', 0.5806231498718262)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"nazi\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "[('Donald_Trump', 0.8103920221328735),\n", " ('impersonator_entertained', 0.5942257046699524),\n", " ('Ivanka_Trump', 0.5924582481384277),\n", " ('Ivanka', 0.5607207417488098),\n", " ('mogul_Donald_Trump', 0.5592453479766846),\n", " ('Trump_Tower', 0.548555314540863),\n", " ('Kepcher', 0.5468589067459106),\n", " ('billionaire_Donald_Trump', 0.5447269082069397),\n", " ('Trumpster', 0.5412818193435669),\n", " ('tycoon_Donald_Trump', 0.5383972525596619)]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"Trump\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Israeli', 0.8130459785461426),\n", " ('Israelis', 0.779090940952301),\n", " ('Palestinians', 0.7580956220626831),\n", " ('Palestinian', 0.7473597526550293),\n", " ('Netanyahu', 0.7082809805870056),\n", " ('Gaza', 0.7046299576759338),\n", " ('Hamas', 0.6912718415260315),\n", " ('Gaza_Strip', 0.6873201727867126),\n", " ('Palestinian_Authority', 0.6648115515708923),\n", " ('Prime_Minister_Binyamin_Netanyahu', 0.6640220880508423)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"Israel\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Fuchs', 0.5700803995132446),\n", " ('Weil', 0.5526396632194519),\n", " ('weekly_newsmagazine_Der', 0.5345348119735718),\n", " ('Berman', 0.5249170660972595),\n", " ('Stein', 0.5195315480232239),\n", " ('Der_Spiegel', 0.5161873698234558),\n", " ('Nussbaum', 0.5158510208129883),\n", " ('Welt', 0.5128974914550781),\n", " ('Klein', 0.5109302401542664),\n", " ('Ulrich', 0.5099592208862305)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"Spiegel\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "[('magazine_Der_Spiegel', 0.7872042655944824),\n", " ('weekly_Der_Spiegel', 0.7623571753501892),\n", " ('Die_Zeit', 0.7383401393890381),\n", " ('Frankfurter_Allgemeine_Zeitung', 0.7346989512443542),\n", " ('Die_Welt', 0.7314777374267578),\n", " ('Der_Spiegel_magazine', 0.7263863682746887),\n", " ('Süddeutsche_Zeitung', 0.7214947938919067),\n", " ('Handelsblatt', 0.7061707377433777),\n", " ('Tagesspiegel_daily', 0.7048733830451965),\n", " ('Spiegel_Online', 0.7014873623847961)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"Der_Spiegel\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.26165980100631714" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"Der_Spiegel\", \"Die_Zeit\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2685222625732422" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"Der_Spiegel\", \"Die_Welt\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2785053253173828" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"Der_Spiegel\", \"Süddeutsche_Zeitung\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('imperialism', 0.7371744513511658),\n", " ('colonialists', 0.7273486852645874),\n", " ('neo_colonialism', 0.7152635455131531),\n", " ('Colonialism', 0.6945492029190063),\n", " ('colonial_domination', 0.6901723146438599),\n", " ('colonialist', 0.6886431574821472),\n", " ('colonial', 0.6881863474845886),\n", " ('slavery_colonialism', 0.6797659397125244),\n", " ('colonial_rule', 0.6758955717086792),\n", " ('colonization', 0.6730928421020508)]" ] 
}, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"colonialism\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('colonial', 0.8822019100189209),\n", " ('oppression', 0.8728238940238953),\n", " ('colonialists', 0.8726308941841125),\n", " ('feminism', 0.8686202764511108),\n", " ('imperialism', 0.8678603768348694),\n", " ('patriarchy', 0.8666298389434814),\n", " ('colonization', 0.8656938076019287),\n", " ('colonial_rule', 0.86388099193573),\n", " ('slavery', 0.8588250875473022),\n", " ('subjugation', 0.8580973744392395)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['colonialism', 'woman'], negative=['man'])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('neo_colonialism', 0.8733060956001282),\n", " ('imperialism', 0.869311511516571),\n", " ('slavery_colonialism', 0.866578996181488),\n", " ('colonialists', 0.8548066020011902),\n", " ('colonialist', 0.8444662094116211),\n", " ('imperialist_domination', 0.8404235243797302),\n", " ('Colonialism', 0.8400565981864929),\n", " ('imperialism_colonialism', 0.8381094932556152),\n", " ('colonial_domination', 0.8354700207710266),\n", " ('colonialization', 0.8350632190704346)]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['colonialism', 'man'], negative=['woman'])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('colonial_subjugation', 0.6939011216163635),\n", " ('colonized_peoples', 0.678598940372467),\n", " ('colonial_conquest', 0.6679588556289673),\n", " ('imperialist_domination', 0.6554943323135376),\n", " ('colonial_settler', 0.6515358686447144),\n", " ('slavery_colonialism', 0.6513102054595947),\n", " 
('ethnocracy', 0.6486039161682129),\n", " ('colonial_domination', 0.6479084491729736),\n", " ('settler_colonial', 0.644547700881958),\n", " ('imperialism_colonialism', 0.6408925652503967)]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(\"settler_colonialism\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('colonial_subjugation', 0.8631111979484558),\n", " ('subjugation', 0.8551343679428101),\n", " ('colonized_peoples', 0.8545337915420532),\n", " ('colonial_conquest', 0.8533400893211365),\n", " ('colonial_settler', 0.8425801396369934),\n", " ('colonialism', 0.8342924118041992),\n", " ('patriarchy', 0.8340162634849548),\n", " ('colonial_domination', 0.8334349393844604),\n", " ('Zionist_expansionism', 0.8305999040603638),\n", " ('colonial_empires', 0.8288437724113464)]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['settler_colonialism', 'woman'], negative=['man'])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('slavery_colonialism', 0.8518989086151123),\n", " ('settler_colonial', 0.8481582403182983),\n", " ('Hitlerism', 0.8431347012519836),\n", " ('imperialism_colonialism', 0.8403087258338928),\n", " ('Nazism_fascism', 0.8380133509635925),\n", " ('imperialist_domination', 0.8367621898651123),\n", " ('totalitarian_ideologies', 0.8347264528274536),\n", " ('neo_colonialists', 0.8338908553123474),\n", " ('predatory_capitalism', 0.8327714800834656),\n", " ('proletarian_internationalism', 0.8313636779785156)]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['settler_colonialism', 'man'], negative=['woman'])" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"[('Zionism', 1.1978480815887451),\n", " ('Palestinians', 1.1952910423278809),\n", " ('Palestinains', 1.162667989730835),\n", " ('settler_colonialist', 1.1616851091384888),\n", " ('Israelis', 1.158406138420105),\n", " ('Kibbutz_Yad_Mordechai', 1.1381361484527588),\n", " ('Zionist_colonization', 1.134883165359497),\n", " (\"Hizb'allah\", 1.1295013427734375),\n", " ('Israeli', 1.1281832456588745),\n", " ('Palestinian', 1.127851128578186)]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['settler_colonialism', 'Israel'], negative=['USA'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('lifts_Squaw_Valley', 0.8459171056747437),\n", " ('Volleyball_Confederation_NORCECA', 0.8203061819076538),\n", " ('Mobility_NASDAQ_USMO', 0.8179371953010559),\n", " ('proudly_proclaims_Made', 0.8160853981971741),\n", " ('World_Premiere_Narrative', 0.8117023706436157),\n", " ('subsidiary_Powertech', 0.8103252649307251),\n", " ('Heavy_Duty_Waterproof_Flashlight', 0.808460533618927),\n", " ('Sodi_Racing', 0.8083812594413757),\n", " ('Massimo_Zanetti_Beverage', 0.8071674108505249),\n", " ('lifts_Sugarloaf', 0.8043990731239319)]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['settler_colonialism', 'USA'], negative=['Israel'])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('colonialist', 0.9835244417190552),\n", " ('colonialists', 0.9767362475395203),\n", " ('colonial_rule', 0.9680818319320679),\n", " ('chancellor_Konrad_Adenauer', 0.9659275412559509),\n", " ('Prussians', 0.9655771255493164),\n", " ('neo_colonialism', 0.9561163783073425),\n", " ('colonization', 0.9336003065109253),\n", " ('Belsen_concentration_camp', 0.93024080991745),\n", " ('economist_Frederic_Bastiat', 0.9301672577857971),\n", " 
('colonial', 0.9293596148490906)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['colonialism', 'Germany'], negative=['USA'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Afrikan', 0.8292856812477112),\n", " ('Colonialism', 0.8270210027694702),\n", " ('imperialist_domination', 0.8252911567687988),\n", " ('slavery_colonialism', 0.8215567469596863),\n", " ('imperialism', 0.8194811940193176),\n", " ('chattel_slavery', 0.8183201551437378),\n", " ('colonialism_neo_colonialism', 0.8140093684196472),\n", " ('oppression', 0.8125733733177185),\n", " ('imperialist', 0.8091658353805542),\n", " ('institutionalized_racism', 0.8086007237434387)]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['colonialism', 'USA'], negative=['Germany'])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.008988581597805" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"colonialism\", \"USA\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8896720111370087" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"colonialism\", \"Germany\")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7863316237926483" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"engineer\", \"doctor\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7539812177419662" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"wv.distance(\"engineer\", \"nurse\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6907496750354767" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"engineers\", \"doctors\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9487667307257652" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.distance(\"engineer\", \"nurses\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('electrical_engineer', 0.8471074104309082),\n", " ('Engineer', 0.8455448746681213),\n", " ('structural_engineer', 0.8157095313072205),\n", " ('mechanical_engineer', 0.8142104744911194),\n", " ('architect', 0.8123660683631897),\n", " ('engineering', 0.80747389793396),\n", " ('metallurgist', 0.8038502335548401),\n", " ('geotechnical_engineer', 0.7997499704360962),\n", " ('Guo_Xianliang', 0.7965970635414124),\n", " ('geologist', 0.788753867149353)]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['engineer', 'doctor'], negative=['nurse'])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('mechanical_engineer', 0.935675859451294),\n", " ('electrical_engineer', 0.9038631916046143),\n", " ('technologist', 0.8726279735565186),\n", " ('technician', 0.8719496130943298),\n", " ('engineering', 0.8703181147575378),\n", " ('computer_programmer', 0.862802267074585),\n", " ('machinist', 0.8552348017692566),\n", " ('engineeer', 0.8525202870368958),\n", " ('Engineer', 0.8455774784088135),\n", " ('pipe_fitter', 0.845426082611084)]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['engineer', 'nurse'], 
negative=['doctor'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('stalin', 0.8554697036743164),\n", " ('russians', 0.8503891825675964),\n", " ('george_bush', 0.8482958078384399),\n", " ('clintons', 0.8405222296714783),\n", " ('sharpton', 0.8373916149139404),\n", " ('ron_paul', 0.8371630907058716),\n", " ('sarah_palin', 0.8370876908302307),\n", " ('iranians', 0.8367903232574463),\n", " ('limbaugh', 0.8362519145011902),\n", " ('chavez', 0.8359288573265076)]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar_cosmul(positive=['hitler', 'russia'], negative=['germany'])" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('sarah_palin', 0.5195872783660889),\n", " ('john_mccain', 0.5010458827018738),\n", " ('russians', 0.49574121832847595),\n", " ('george_bush', 0.4951860010623932),\n", " ('obama', 0.49378806352615356),\n", " ('hillary', 0.4904623031616211),\n", " ('stalin', 0.4876244366168976),\n", " ('clintons', 0.4781968295574188),\n", " ('reagan', 0.4776139259338379),\n", " ('chavez', 0.47482872009277344)]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wv.most_similar(positive=['hitler', 'russia', 'female'], negative=['germany', 'male'])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }