Skip to content
Snippets Groups Projects
Commit b77c9c13 authored by Schoeneh's avatar Schoeneh
Browse files

fixed download

parent 474e421c
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Testing gensim
See https://radimrehurek.com/gensim/
%% Cell type:code id: tags:
``` python
import gensim
import requests, zipfile, io, os
from os.path import exists
# Download the zipped fastText embeddings once and unpack them next to the
# notebook. Both steps are skipped when their result already exists on disk.
file = 'spiegel_1948-1979_embeddings_fasttext_50epochs_2'
link = 'https://box.hu-berlin.de/f/19c5aa45779a461499d0/?dl=1'
model = 'fasttext_model_spiegel_corpus_neu_50epochs_2'

if not exists(f'{file}.zip'):
    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete archive on the next run.
    partial = f'{file}.zip.part'
    # The context manager releases the connection even if writing fails.
    with requests.get(link, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving an error page as .zip.
        r.raise_for_status()
        with open(partial, 'wb') as fd:
            # 1 MiB chunks: far fewer write calls than 128-byte chunks.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                fd.write(chunk)
    os.replace(partial, f'{file}.zip')
    print("File downloaded")
else:
    print("File already downloaded.")

if not exists(f'./{file}'):
    os.mkdir(f'./{file}')
    with zipfile.ZipFile(f'{file}.zip') as zf:
        zf.extractall(path=f'./{file}')
    print("File unzipped.")
else:
    print("File already unzipped.")
```
%% Output
File already downloaded.
File already unzipped.
%% Cell type:code id: tags:
``` python
# Load the trained fastText model from the directory the archive was
# extracted into. The path is relative to the working directory ('./'),
# matching the extraction target; a leading '/' would point at the
# filesystem root instead.
# Expected layout: <file>/<model>.model
spiegel_fasttext = gensim.models.FastText.load(f'./{file}/{model}.model')
```
%% Cell type:code id: tags:
``` python
# Nearest neighbours of a capitalized query. The recorded top hit is the
# lowercase 'kolonie' (~0.74), so the vocabulary is presumably lowercased and
# this vector is inferred from subword n-grams -- TODO confirm.
spiegel_fasttext.wv.most_similar("Kolonie")
```
%% Output
[('kolonie', 0.7393254637718201),
('exkolonie', 0.6531060338020325),
('kolonialreiche', 0.652650773525238),
('kolonien', 0.6513134837150574),
('musterkolonie', 0.6470878720283508),
('kolonialreiches', 0.6406031250953674),
('nachbarprovinz', 0.6389856338500977),
('kolonialreichs', 0.6318897008895874),
('kolonialreich', 0.6315282583236694),
('ferienkolonie', 0.6258662343025208)]
%% Cell type:code id: tags:
``` python
# Capitalized query; the recorded nearest neighbour is the lowercase
# 'kolonialismus' (~0.90), which suggests the capitalized form itself is not
# a vocabulary entry -- TODO confirm.
spiegel_fasttext.wv.most_similar("Kolonialismus")
```
%% Output
[('kolonialismus', 0.9018982648849487),
('kolonialist', 0.8311547040939331),
('kolonialisten', 0.7932776212692261),
('nachkolonialen', 0.7744893431663513),
('antikolonialen', 0.7676698565483093),
('kolonialpolitik', 0.7602006793022156),
('nachkoloniale', 0.7534233331680298),
('nationalismus', 0.7524922490119934),
('kolonialen', 0.7517328858375549),
('antikoloniale', 0.7438010573387146)]
%% Cell type:code id: tags:
``` python
# Lowercase query; the recorded neighbours are mostly inflected or derived
# forms ('nazis', 'nazistisch', ...) plus related terms such as 'ns' and 'kz'.
spiegel_fasttext.wv.most_similar("nazi")
```
%% Output
[('nazis', 0.8403863310813904),
('ns', 0.8173379898071289),
('nazistischer', 0.7557086944580078),
('nazistischen', 0.750986635684967),
('nazistische', 0.7342256903648376),
('nazistisch', 0.726861298084259),
('großdeutscher', 0.7023860216140747),
('nazismus', 0.7022897601127625),
('antisemitischer', 0.696376383304596),
('kz', 0.6951879262924194)]
%% Cell type:code id: tags:
``` python
# Sanity check with a term the corpus presumably does not contain (the file
# name suggests 1948-1979): the recorded neighbours ('krump', 'registrier',
# ...) look driven by subword overlap rather than meaning -- TODO confirm.
spiegel_fasttext.wv.most_similar("Trump")
```
%% Output
[('krump', 0.735675036907196),
('registrier', 0.5744073987007141),
('neuerkrankungen', 0.57392418384552),
('gemeldete', 0.5610890984535217),
('pressebericht', 0.5544757843017578),
('verlustanzeige', 0.5432375073432922),
('anmeldungen', 0.5428926348686218),
('aufgeflogene', 0.5423745512962341),
('alarmnachricht', 0.5374037623405457),
('alarmierende', 0.5313758254051208)]
%% Cell type:code id: tags:
``` python
# topn=30 requests 30 neighbours instead of the 10 returned by the calls
# that omit it.
spiegel_fasttext.wv.most_similar("kz", topn=30)
```
%% Output
[('buchenwald', 0.8792651891708374),
('dachau', 0.8258079290390015),
('auschwitz', 0.8050574064254761),
('mauthausen', 0.7775819897651672),
('sachsenhausen', 0.765381395816803),
('buchenwalder', 0.7609554529190063),
('theresienstadt', 0.754604697227478),
('häftlinge', 0.7309788465499878),
('häftling', 0.7256706357002258),
('gaskammern', 0.7118412256240845),
('ns', 0.697554349899292),
('nazi', 0.6951879858970642),
('häftlings', 0.6949462890625),
('auschwitzer', 0.6874114274978638),
('lagerkommandant', 0.6857143044471741),
('treblinka', 0.6774755120277405),
('nazis', 0.6764761209487915),
('ss', 0.6742820739746094),
('zwangsarbeiter', 0.6706141233444214),
('mitgefangene', 0.6703796982765198),
('natzweiler', 0.6701797246932983),
('häftlingsarzt', 0.6700323820114136),
('gestapo', 0.6683216094970703),
('gaskammer', 0.666691243648529),
('häftlingen', 0.6666845679283142),
('neuengamme', 0.665062665939331),
('flossenbürg', 0.6631953120231628),
('halbjuden', 0.6626685857772827),
('ravensbrück', 0.6606388688087463),
('euthanasie', 0.6560691595077515)]
%% Cell type:code id: tags:
``` python
# 30 nearest neighbours of 'suez'; the recorded output lists compounds
# ('suezkanal') first, followed by place names and other related terms.
spiegel_fasttext.wv.most_similar("suez", topn=30)
```
%% Output
[('suezkanal', 0.8312360644340515),
('suezkanals', 0.7922067642211914),
('akaba', 0.709147572517395),
('said', 0.698356032371521),
('anglo', 0.696181058883667),
('ismailia', 0.6857931613922119),
('kanals', 0.6778820157051086),
('abadan', 0.6718226671218872),
('israelisch', 0.6687085628509521),
('ägyptisch', 0.6670056581497192),
('ägypten', 0.6517369151115417),
('nahost', 0.6489949822425842),
('ägyptens', 0.6461713910102844),
('biserta', 0.6436704993247986),
('nassers', 0.6390738487243652),
('kanal', 0.6378217339515686),
('transarabische', 0.6368891000747681),
('britisch', 0.6345162391662598),
('jordanien', 0.6323387622833252),
('arabien', 0.6312205195426941),
('jordaniens', 0.6304064989089966),
('sinai', 0.6302697658538818),
('libanon', 0.6274277567863464),
('nasser', 0.6264342069625854),
('tiran', 0.6263170838356018),
('sudan', 0.623646080493927),
('irak', 0.6217209696769714),
('port', 0.621683657169342),
('transjordaniens', 0.6210361123085022),
('aden', 0.6208665370941162)]
%% Cell type:code id: tags:
``` python
# 30 nearest neighbours of 'ostafrika'; the recorded output mixes
# morphological variants ('ostafrikas') with country and region names.
spiegel_fasttext.wv.most_similar("ostafrika", topn=30)
```
%% Output
[('ostafrikaner', 0.8676508069038391),
('ostafrikas', 0.842673659324646),
('ostafrikanische', 0.8248769640922546),
('westafrika', 0.8198734521865845),
('kenia', 0.8148528337478638),
('afrika', 0.7896697521209717),
('tanganjika', 0.7732571363449097),
('tansania', 0.7536317706108093),
('nigeria', 0.7515606880187988),
('westafrikas', 0.7401487231254578),
('togo', 0.7241421937942505),
('kamerun', 0.7232619524002075),
('uganda', 0.722804069519043),
('ghana', 0.7129299640655518),
('guinea', 0.7060198783874512),
('nyassaland', 0.7059506773948669),
('kolonie', 0.7057479023933411),
('sambia', 0.7054694890975952),
('südafrika', 0.696257472038269),
('sudan', 0.6920241117477417),
('niger', 0.6908750534057617),
('safari', 0.689552366733551),
('niederländisch', 0.6891322731971741),
('afrikas', 0.6875596642494202),
('südwestafrikas', 0.6868376731872559),
('senegal', 0.6863754391670227),
('daressalam', 0.6860796213150024),
('afrikanischen', 0.6859438419342041),
('indien', 0.6857169270515442),
('äthiopien', 0.6830366253852844)]
%% Cell type:code id: tags:
``` python
# 30 nearest neighbours of 'volkspolen'; the recorded output starts with
# inflected forms ('volkspolens') and continues with names and places.
spiegel_fasttext.wv.most_similar("volkspolen", topn=30)
```
%% Output
[('volkspolens', 0.9186643362045288),
('volkspolnischen', 0.8382866978645325),
('westpolen', 0.6901859641075134),
('gomulkas', 0.6826123595237732),
('gomulka', 0.6729230284690857),
('polens', 0.6660183072090149),
('ostdeutschland', 0.6637937426567078),
('sed', 0.6632714867591858),
('sowjetzone', 0.6624323725700378),
('ostdeutschlands', 0.6607036590576172),
('vorkriegspolen', 0.6591367125511169),
('ostdeutsch', 0.6585639715194702),
('warschau', 0.6509412527084351),
('ulbrichts', 0.6508803963661194),
('cyrankiewicz', 0.6478868126869202),
('ddr', 0.645229160785675),
('ostdeutschen', 0.6429562568664551),
('ostberlin', 0.6405128836631775),
('ulbricht', 0.6396167278289795),
('ostpolnischen', 0.6330234408378601),
('honecker', 0.632041335105896),
('polen', 0.6277611255645752),
('rumänien', 0.624265193939209),
('gierek', 0.6231211423873901),
('ostpolen', 0.6198691129684448),
('cssr', 0.6192417144775391),
('prag', 0.6176929473876953),
('komsomolzen', 0.6167832612991333),
('bulgarien', 0.6152984499931335),
('ostdeutsche', 0.6143065690994263)]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment