Newer
Older
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pipeline to create research dataset"
]
},
{
"cell_type": "code",

Philipp Schneider
committed
"execution_count": 1,
"outputs": [],
"source": [
"from rdflib import Graph\n",
"\n",
"def merge_graphs(input_path_1, input_path_2, output_path):\n",
" g = Graph()\n",
" g.parse(input_path_1)\n",
" g.parse(input_path_2)\n",
" g.serialize(destination=output_path, format='ttl')"
]
},
{
"cell_type": "code",

Philipp Schneider
committed
"execution_count": 11,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [

Philipp Schneider
committed
"src/rdf-mappings/map-tblBranch.py:510: DtypeWarning: Columns (2,9,11,16,17,18,19,20,21,31,38,39,49,56,58,62,67,71,72,73,74,75,77,79,82,85,86,88,94,97,98,99,100,102,104,106,109,110,112,114,117,118,119,122,124,125,126,127,128,129,130,131,132,133,134,135,136,138,139,140,141,142,143,144,145,149,150,151,152,153,154,157,165,166,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,188,189,190,194,196) have mixed types. Specify dtype option on import or set low_memory=False.\n",
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
" df_tblBranch = pd.read_csv(csv_input_path, sep='\\t')\n",
" Column Column2 BranchID ... GpSel TinctChanged Column 201\n",
"0 0 NaN 1 ... NaN 1.0 NaN\n",
"1 1 NaN 2 ... NaN 1.0 NaN\n",
"2 2 NaN 3 ... NaN 1.0 NaN\n",
"3 3 NaN 4 ... NaN 1.0 NaN\n",
"4 4 NaN 5 ... NaN 1.0 NaN\n",
"... ... ... ... ... ... ... ...\n",
"40859 40859 NaN 8318_A_1 ... NaN NaN NaN\n",
"40860 40860 NaN 8425_1 ... 0.0 NaN 0.0\n",
"40861 40861 NaN 8425_A_1 ... NaN NaN NaN\n",
"40862 40862 NaN 8425_B_1 ... NaN NaN NaN\n",
"40863 40863 NaN 13770_1 ... 0.0 NaN 1.0\n",
"\n",
"[40864 rows x 205 columns]\n",
"chief\n",
"compon\n",
"chief\n",
"chief\n",
"chief\n",
"chief\n",
"chief\n",
"eschutch\n",
"chief\n",
"chief\n",
"chief\n",
"chief\n",
"chief\n",
"shield\n",
"stag\n",
"compon de Harcourt\n",
"chief\n",
"chief\n",
"chief\n",
"\n",
"chief\n",
"border\n",
"lozenge\n",
"lozenge\n",
"\n",
"\n",
"chief\n",
"chief\n",
"lozenge\n",
"chief\n",
"chief\n",
"compon\n",
"compon\n",
"escuch\n",
"chief\n",
"\n",
"\n",
"escurch\n",
"banner\n",
"chief\n",
"banner\n",
"banner\n",
"banner\n",
"banner\n",
"banner\n",
"escuch\n",

Philipp Schneider
committed
"Created data/rdf-output/test-dataset/knowledge-graph-blazon-test-dataset.ttl\n",
"Created data/rdf-output/test-dataset/knowledge-graph-blazon-test-dataset.jsonld\n",
"Export TBox to file...\n",
"Create data/rdf-output/digital-heraldry-ontology.ttl\n"
]
}
],
"source": [
"!python src/rdf-mappings/map-tblBranch.py"
]
},
{
"cell_type": "code",

Philipp Schneider
committed
"execution_count": 42,

Philipp Schneider
committed
"merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/digital-heraldry-ontology-heraldry_no-charges.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
"**IGNORE FOR NOW** Add class structure for `dhoh:Modifier`s"
"outputs": [],
"source": [
"merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/research-dataset/modifier-class-structure.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**IGNORE FOR NOW** Add class structure for `dhoh:Charge`s"
]
},
{
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": [
"merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/research-dataset/charge-class-structure_by-claude-ai.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the occurrences of the coats of arms (`dhor:CoatOfArmsRepresentation`) from `tblArmItem`"
]
},
{
"cell_type": "code",

Philipp Schneider
committed
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"src/rdf-mappings/map-tblArmItems.py:105: DtypeWarning: Columns (0,7,10,11,13,14,42) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df_tblArmItems = pd.read_csv(csv_input_path, sep='\\t')\n",
"src/rdf-mappings/map-tblArmItems.py:118: FutureWarning: Inferring datetime64[ns] from data containing strings is deprecated and will be removed in a future version. To retain the old behavior explicitly pass Series(data, dtype=datetime64[ns])\n",
" df_metadata = pd.read_excel(metadata_file)\n",
" Column ArmItemID ArmCode ItemNr ... CreMulti CreTint CreOrnIDR TGmember\n",

Philipp Schneider
committed
"0 18208.0 21653.0 A 58.0 ... NaN NaN NaN False\n",
"1 18227.0 21672.0 A 323.0 ... NaN NaN NaN False\n",
"2 18278.0 21727.0 A 160.0 ... NaN NaN NaN False\n",
"3 18333.0 21787.0 A 211.0 ... NaN NaN NaN False\n",
"4 18344.0 21798.0 A 82.0 ... NaN NaN NaN False\n",
"... ... ... ... ... ... ... ... ... ...\n",

Philipp Schneider
committed
"87832 33804.0 39070.0 ZUR 586.0 ... NaN NaN NaN False\n",
"87833 33805.0 39071.0 ZUR 587.0 ... NaN NaN NaN False\n",
"87834 4184.0 4306.0 BEL 360.0 ... NaN NaN NaN False\n",
"87835 11106-1 12011.0 BEL 1447.0 ... NaN NaN NaN False\n",
"87836 11214-1 12135.0 BEL 1513.0 ... NaN SO 2.0 False\n",

Philipp Schneider
committed
"[87837 rows x 46 columns]\n",
"src/rdf-mappings/map-tblArmItems.py:130: DtypeWarning: Columns (2,9,11,16,17,18,19,20,21,31,38,39,49,56,58,62,67,71,72,73,74,75,77,79,82,85,86,88,94,97,98,99,100,102,104,106,109,110,112,114,117,118,119,122,124,125,126,127,128,129,130,131,132,133,134,135,136,138,139,140,141,142,143,144,145,149,150,151,152,153,154,157,165,166,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,188,189,190,194,196) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df_tblBranch = pd.read_csv('data/input/tblBranch_research-dataset.tsv', sep='\\t')\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
"None\n",
Loading
Loading full blame...