{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing gensim\n",
    "See https://radimrehurek.com/gensim/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install --upgrade gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim.downloader as api"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix....\n",
      "conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known...\n",
      "fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt...\n",
      "glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https:...\n",
      "glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https:/...\n",
      "glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, ...\n",
      "glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab,...\n",
      "word2vec-google-news-300 (3000000 records): Pre-trained vectors trained on a part of the Google News dataset (about 100 bill...\n",
      "word2vec-ruscorpora-300 (184973 records): Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (ab...\n"
     ]
    }
   ],
   "source": [
    "info = api.info()\n",
    "for model_name, model_data in sorted(info['models'].items()):\n",
    "    print('%s (%d records): %s' % (model_name, model_data.get('num_records', -1), model_data['description'][:80] + '...'))"
   ]
  },
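  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The downloader can also be asked about a single model. A quick sketch (not run here; the exact metadata keys may vary between gensim-data releases):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metadata for one model, e.g. file size and number of records\n",
    "api.info('word2vec-google-news-300')"
   ]
  },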
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
     ]
    }
   ],
   "source": [
    "wv = api.load('word2vec-google-news-300')"
   ]
  },
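  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (not run here): the loaded `KeyedVectors` object maps each word in the vocabulary to a 300-dimensional numpy vector."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up a single word vector and confirm its dimensionality\n",
    "vec = wv['computer']\n",
    "vec.shape"
   ]
  },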
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('jews', 0.606805145740509),\n",
       " ('jewish', 0.5944611430168152),\n",
       " ('rahm', 0.5944365859031677),\n",
       " ('mhux', 0.5918845534324646),\n",
       " ('yid', 0.5769580006599426),\n",
       " ('jessie', 0.5755242109298706),\n",
       " ('yur', 0.5660163164138794),\n",
       " ('israel', 0.5639604330062866),\n",
       " ('gilbert', 0.5632734894752502),\n",
       " ('kol', 0.5615833401679993)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wv.most_similar(\"jew\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('nazis', 0.6923775672912598),\n",
       " ('fascist', 0.657628059387207),\n",
       " ('Nazi', 0.6324446201324463),\n",
       " ('facist', 0.6276720762252808),\n",
       " ('fascists', 0.6110973358154297),\n",
       " ('Hilter', 0.5978641510009766),\n",
       " ('Hitler', 0.5964925289154053),\n",
       " ('hitler', 0.5891590714454651),\n",
       " ('NAZI', 0.5822753310203552),\n",
       " ('Fascist', 0.5806231498718262)]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wv.most_similar(\"nazi\")"
   ]
  },
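  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A few further queries sketching other `KeyedVectors` operations (not run here): pairwise cosine similarity, the classic analogy via `most_similar(positive=..., negative=...)`, and `doesnt_match`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cosine similarity between two words\n",
    "print(wv.similarity('jew', 'jewish'))\n",
    "\n",
    "# Analogy query: 'king' - 'man' + 'woman' ~ ?\n",
    "print(wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))\n",
    "\n",
    "# Odd one out\n",
    "print(wv.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch']))"
   ]
  }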
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}