{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing gensim\n",
    "See https://radimrehurek.com/gensim/\n",
    "\n",
    "Downloads the pre-trained `word2vec-google-news-300` vectors via gensim's downloader API and inspects the nearest neighbours of a couple of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install --upgrade gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim.downloader as api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....\n",
      "conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...\n",
      "fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...\n",
      "glove-twitter-100 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:...\n",
      "glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, ...\n",
      "glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "word2vec-google-news-300 (3000000 records): Pre-trained vectors trained on a part of the Google News dataset (about 100 bill...\n",
      "word2vec-ruscorpora-300 (184973 records): Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (ab...\n"
     ]
    }
   ],
   "source": [
    "info = api.info()\n",
    "for model_name, model_data in sorted(info['models'].items()):\n",
    "    print(f\"{model_name} ({model_data.get('num_records', -1)} records): {model_data['description'][:80]}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
     ]
    }
   ],
   "source": [
    "wv = api.load('word2vec-google-news-300')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('jews', 0.606805145740509),\n",
       " ('jewish', 0.5944611430168152),\n",
       " ('rahm', 0.5944365859031677),\n",
       " ('mhux', 0.5918845534324646),\n",
       " ('yid', 0.5769580006599426),\n",
       " ('jessie', 0.5755242109298706),\n",
       " ('yur', 0.5660163164138794),\n",
       " ('israel', 0.5639604330062866),\n",
       " ('gilbert', 0.5632734894752502),\n",
       " ('kol', 0.5615833401679993)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wv.most_similar(\"jew\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('nazis', 0.6923775672912598),\n",
       " ('fascist', 0.657628059387207),\n",
       " ('Nazi', 0.6324446201324463),\n",
       " ('facist', 0.6276720762252808),\n",
       " ('fascists', 0.6110973358154297),\n",
       " ('Hilter', 0.5978641510009766),\n",
       " ('Hitler', 0.5964925289154053),\n",
       " ('hitler', 0.5891590714454651),\n",
       " ('NAZI', 0.5822753310203552),\n",
       " ('Fascist', 0.5806231498718262)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wv.most_similar(\"nazi\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}