From 97efbeb52a4271cfaf29bbcce1eaf4921a9d56f9 Mon Sep 17 00:00:00 2001
From: Philipp Schneider <schneider.philipp@uni-muenster.de>
Date: Mon, 25 Jul 2022 18:23:09 +0200
Subject: [PATCH] Create script to integrate metadata on manuscripts

---
 README.md                                     |   3 +
 .../integrate_manuscript_metadata_into_kg.py  | 131 ++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 src/rdf-mappings/integrate_manuscript_metadata_into_kg.py

diff --git a/README.md b/README.md
index 5aa21cf..a1485c8 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,9 @@ Merging is done by the script `merge_rdf_files_into_kg.py`. The input is given a
 * `existing_ontology`: File link to an existing knowledge graph. If set, this KG is loaded before adding any new data. The old data, including UUIDs, is then not overwritten, when `merge_rdf_files_into_kg.py` is run.
 * `output_files`: List of output files and corresponding format into which the results are to be serialized. The first output-object in the list is considered as preferred and therefore used by following steps in the pipeline.
 
+#### Integrate metadata
+The script `integrate_manuscript_metadata_into_kg.py` creates entities for the manuscripts in the Knowledge Graph and integrates their metadata. The script is only a preliminary version; its configuration is hard-coded in the script.
+
 ### Create ontology documentation
 The content of the documentation of all classes and properties is stored as TSV files in `data/input/documentation`. To integrate the whole content of the documentation directory into an RDF file, call the script `update_documentation.py` with the RDF file as a command line parameter (in most cases, this will be `digital-heraldry-ontology.ttl`)
 
diff --git a/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py b/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py
new file mode 100644
index 0000000..e6b7abd
--- /dev/null
+++ b/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py
@@ -0,0 +1,131 @@
+# integrate_manuscript_metadata_into_kg
+
+"""
+Integrate metadata on manuscripts into the Knowledge Graph
+
+Creates one manuscript entity per row of the metadata Excel sheet, loads the
+existing Knowledge Graph on top and links every coat of arms representation
+to the manuscript entity derived from its URI.
+"""
+
+from rdflib import Graph, URIRef, Literal
+from rdflib.namespace import RDF, RDFS, Namespace, OWL, XSD
+import argparse
+import json
+import functions
+import pandas as pd
+from dho_namespaces import *
+
+
+def parse_year_range(full_date):
+    """
+    Split the content of a 'date-year' cell into earliest and latest year.
+
+    A cell that contains a hyphen is read as 'earliest-latest'; any other
+    cell is read as a single year that is both the earliest and the latest.
+    The cell is converted to a string before it is split.
+
+    Returns a tuple (earliest_year, latest_year) of ints. Raises ValueError
+    if a part of the cell cannot be converted to an integer.
+    """
+    date_text = str(full_date)
+    if '-' in date_text:
+        parts = date_text.split('-')
+        return int(parts[0]), int(parts[1])
+    year = int(full_date)
+    return year, year
+
+
+def manuscript_id_from_uri(uri):
+    """
+    Derive the manuscript id from the URI of a coat of arms representation.
+
+    The id is the text between the first '#' of the URI and the first
+    following hyphen. Returns None if the URI contains no '#'.
+    """
+    parts = str(uri).split('#')
+    if len(parts) < 2:
+        return None
+    return parts[1].split('-')[0]
+
+
+if __name__ == '__main__':
+    # TODO: read the file locations below from command line arguments or a
+    # config file; for now the configuration is hard coded.
+    # NOTE: metadata_file is an absolute path inside a user's home directory
+    # and has to be adapted before the script is run on another machine.
+    metadata_file = '/Users/pschneider/SeaDrive/Für mich freigegeben/Projekte Digital History/Heraldikprojekt/Datenhaltung/armorial_manuscripts_summary.xlsx'
+    kg_file = 'data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.ttl'
+    object_ontology_file = 'data/ontologies/digital-heraldry-ontology-objects.ttl'
+
+    # Files and formats the resulting graph is serialized into. The first
+    # entry is the same path as kg_file.
+    output_files = [
+        {
+            "output_file": "data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.ttl",
+            "file_format": "turtle"
+        },
+        {
+            "output_file": "data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.jsonld",
+            "file_format": "json-ld"
+        }
+    ]
+
+    # Initialize rdflib graph object with the object ontology
+    g = Graph()
+    g.parse(object_ontology_file)
+
+    # Create objects graph with metadata on manuscripts
+    df_metadata = pd.read_excel(metadata_file)
+
+    print(df_metadata)
+    print('Creating manuscript entities...')
+
+    for _, row in df_metadata.iterrows():
+        # Skip rows without an ArmCode, as the manuscript URI is built from it
+        if pd.isnull(row['ArmCode']):
+            continue
+
+        manuscript_uri = URIRef(dho_obj_n + 'MS' + row['ArmCode'])
+        g.add((manuscript_uri, RDF.type, dho_obj_n.Manuscript))
+        g.add((manuscript_uri, RDF.type, OWL.NamedIndividual))
+
+        if not pd.isnull(row['date-year']):
+            full_date = row['date-year']
+            earliest_date, latest_date = parse_year_range(full_date)
+
+            g.add((manuscript_uri, dho_obj_n.hasDate, Literal(str(full_date), datatype=XSD.string)))
+            g.add((manuscript_uri, dho_obj_n.hasEarliestDate, Literal(earliest_date, datatype=XSD.int)))
+            g.add((manuscript_uri, dho_obj_n.hasLatestDate, Literal(latest_date, datatype=XSD.int)))
+
+        if not pd.isnull(row['date-century']):
+            g.add((manuscript_uri, dho_obj_n.datedInCentury, Literal(str(row['date-century']), datatype=XSD.string)))
+
+        if not pd.isnull(row['Titel']):
+            g.add((manuscript_uri, dho_obj_n.hasTitle, Literal(row['Titel'], datatype=XSD.string)))
+
+        if not pd.isnull(row['library-place']):
+            g.add((manuscript_uri, dho_obj_n.hasLibraryPlace, Literal(row['library-place'], datatype=XSD.string)))
+
+    # Integrate knowledge graph on heraldic data
+    print('Loading Knowledge Graph file...')
+    g.parse(kg_file)
+
+    # Link manuscript objects to each coat of arms representation in the KG.
+    # The subjects are collected first, because the loop modifies the graph.
+    print('Linking coat of arms representations to manuscript entities...')
+    representations = [
+        subject for subject, _, _ in g.triples((None, RDF.type, dho_rep_n.CoatOfArmsRepresentation))
+    ]
+    for representation in representations:
+        manuscript_id = manuscript_id_from_uri(representation)
+        if manuscript_id is None:
+            print(f'Skipping {representation}: URI has no fragment to derive a manuscript id from')
+            continue
+
+        manuscript_uri = URIRef(dho_obj_n + 'MS' + manuscript_id)
+        print(manuscript_uri)
+        g.add((representation, dho_rep_n.partOfObject, manuscript_uri))
+
+    # Serialize the prepared graph
+    functions.serialize_graph(g, output_files)
\ No newline at end of file
-- 
GitLab