research-dataset-creation-pipeline.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pipeline to create research dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from rdflib import Graph\n",
    "\n",
    "def merge_graphs(input_path_1, input_path_2, output_path):\n",
    "    g = Graph()\n",
    "    g.parse(input_path_1)\n",
    "    g.parse(input_path_2)\n",
    "    g.serialize(destination=output_path, format='ttl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "src/rdf-mappings/map-tblBranch.py:510: DtypeWarning: Columns (2,9,11,16,17,18,19,20,21,31,38,39,49,56,58,62,67,71,72,73,74,75,77,79,82,85,86,88,94,97,98,99,100,102,104,106,109,110,112,114,117,118,119,122,124,125,126,127,128,129,130,131,132,133,134,135,136,138,139,140,141,142,143,144,145,149,150,151,152,153,154,157,165,166,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,188,189,190,194,196) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df_tblBranch = pd.read_csv(csv_input_path, sep='\\t')\n",
      "       Column  Column2  BranchID  ... GpSel TinctChanged  Column 201\n",
      "0           0      NaN         1  ...   NaN          1.0         NaN\n",
      "1           1      NaN         2  ...   NaN          1.0         NaN\n",
      "2           2      NaN         3  ...   NaN          1.0         NaN\n",
      "3           3      NaN         4  ...   NaN          1.0         NaN\n",
      "4           4      NaN         5  ...   NaN          1.0         NaN\n",
      "...       ...      ...       ...  ...   ...          ...         ...\n",
      "40859   40859      NaN  8318_A_1  ...   NaN          NaN         NaN\n",
      "40860   40860      NaN    8425_1  ...   0.0          NaN         0.0\n",
      "40861   40861      NaN  8425_A_1  ...   NaN          NaN         NaN\n",
      "40862   40862      NaN  8425_B_1  ...   NaN          NaN         NaN\n",
      "40863   40863      NaN   13770_1  ...   0.0          NaN         1.0\n",
      "\n",
      "[40864 rows x 205 columns]\n",
      "chief\n",
      "compon\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "eschutch\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "shield\n",
      "stag\n",
      "compon de Harcourt\n",
      "chief\n",
      "chief\n",
      "chief\n",
      "\n",
      "chief\n",
      "border\n",
      "lozenge\n",
      "lozenge\n",
      "\n",
      "\n",
      "chief\n",
      "chief\n",
      "lozenge\n",
      "chief\n",
      "chief\n",
      "compon\n",
      "compon\n",
      "escuch\n",
      "chief\n",
      "\n",
      "\n",
      "escurch\n",
      "banner\n",
      "chief\n",
      "banner\n",
      "banner\n",
      "banner\n",
      "banner\n",
      "banner\n",
      "escuch\n",
      "Created data/rdf-output/test-dataset/knowledge-graph-blazon-test-dataset.ttl\n",
      "Created data/rdf-output/test-dataset/knowledge-graph-blazon-test-dataset.jsonld\n",
      "Export TBox to file...\n",
      "Create data/rdf-output/digital-heraldry-ontology.ttl\n"
     ]
    }
   ],
   "source": [
    "!python src/rdf-mappings/map-tblBranch.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/digital-heraldry-ontology-heraldry_no-charges.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**IGNORE FOR NOW** Add class structure for `dhoh:Modifier`s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/research-dataset/modifier-class-structure.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**IGNORE FOR NOW** Add class structure for `dhoh:Charge`s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "merge_graphs('data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl', 'data/ontologies/research-dataset/charge-class-structure_by-claude-ai.ttl', 'data/rdf-output/research-dataset/knowledge-graph-blazon-research-dataset.ttl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the occurances of the coats of arms (`dhor:CoatOfArmsRepresentation`) from `tblArmItem`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "src/rdf-mappings/map-tblArmItems.py:105: DtypeWarning: Columns (0,7,10,11,13,14,42) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df_tblArmItems = pd.read_csv(csv_input_path, sep='\\t')\n",
      "src/rdf-mappings/map-tblArmItems.py:118: FutureWarning: Inferring datetime64[ns] from data containing strings is deprecated and will be removed in a future version. To retain the old behavior explicitly pass Series(data, dtype=datetime64[ns])\n",
      "  df_metadata = pd.read_excel(metadata_file)\n",
      "        Column  ArmItemID ArmCode  ItemNr  ... CreMulti  CreTint  CreOrnIDR TGmember\n",
      "0      18208.0    21653.0       A    58.0  ...      NaN      NaN        NaN    False\n",
      "1      18227.0    21672.0       A   323.0  ...      NaN      NaN        NaN    False\n",
      "2      18278.0    21727.0       A   160.0  ...      NaN      NaN        NaN    False\n",
      "3      18333.0    21787.0       A   211.0  ...      NaN      NaN        NaN    False\n",
      "4      18344.0    21798.0       A    82.0  ...      NaN      NaN        NaN    False\n",
      "...        ...        ...     ...     ...  ...      ...      ...        ...      ...\n",
      "87832  33804.0    39070.0     ZUR   586.0  ...      NaN      NaN        NaN    False\n",
      "87833  33805.0    39071.0     ZUR   587.0  ...      NaN      NaN        NaN    False\n",
      "87834   4184.0     4306.0     BEL   360.0  ...      NaN      NaN        NaN    False\n",
      "87835  11106-1    12011.0     BEL  1447.0  ...      NaN      NaN        NaN    False\n",
      "87836  11214-1    12135.0     BEL  1513.0  ...      NaN       SO        2.0    False\n",
      "\n",
      "[87837 rows x 46 columns]\n",
      "src/rdf-mappings/map-tblArmItems.py:130: DtypeWarning: Columns (2,9,11,16,17,18,19,20,21,31,38,39,49,56,58,62,67,71,72,73,74,75,77,79,82,85,86,88,94,97,98,99,100,102,104,106,109,110,112,114,117,118,119,122,124,125,126,127,128,129,130,131,132,133,134,135,136,138,139,140,141,142,143,144,145,149,150,151,152,153,154,157,165,166,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,188,189,190,194,196) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df_tblBranch = pd.read_csv('data/input/tblBranch_research-dataset.tsv', sep='\\t')\n",
      "None\n",
      "None\n",
      "None\n",
      "None\n",
      "None\n",
      "None\n",
      "None\n",