From 97efbeb52a4271cfaf29bbcce1eaf4921a9d56f9 Mon Sep 17 00:00:00 2001 From: Philipp Schneider <schneider.philipp@uni-muenster.de> Date: Mon, 25 Jul 2022 18:23:09 +0200 Subject: [PATCH] Create script to integrate metadata on manuscripts --- README.md | 3 + .../integrate_manuscript_metadata_into_kg.py | 131 ++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 src/rdf-mappings/integrate_manuscript_metadata_into_kg.py diff --git a/README.md b/README.md index 5aa21cf..a1485c8 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,9 @@ Merging is done by the script `merge_rdf_files_into_kg.py`. The input is given a * `existing_ontology`: File link to an existing knowledge graph. If set, this KG is loaded before adding any new data. The old data, including UUIDs, is then not overwritten, when `merge_rdf_files_into_kg.py` is run. * `output_files`: List of output files and corresponding format into which the results are to be serialized. The first output-object in the list is considered as preferred and therefore used by following steps in the pipeline. +#### Integrate metadata +The script `integrate_manuscript_metadata_into_kg.py` creates entities for the manuscripts in the Knowledge Graph and integrates their metadata. The script is only a preliminary version; configuration is hard-coded into the script. + ### Create ontology documentation The content of the documentation of all classes and properties is stored as TSV files in `data/input/documentation`. 
To integrate the whole content of the documentation directory into an RDF file, call the script `update_documentation.py` with the RDF file as a command line parameter (in most cases, this will be `digital-heraldry-ontology.ttl`)
diff --git a/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py b/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py
new file mode 100644
index 0000000..e6b7abd
--- /dev/null
+++ b/src/rdf-mappings/integrate_manuscript_metadata_into_kg.py
@@ -0,0 +1,175 @@
+# integrate_manuscript_metadata_into_kg
+
+"""
+Integrate metadata on manuscripts into the Knowledge Graph.
+
+Manuscript entities are created from the rows of a spreadsheet and merged with
+the existing Knowledge Graph; every coat of arms representation found there is
+then linked to the manuscript named by the first part of its URI fragment.
+"""
+
+import argparse
+import json
+import os
+
+import pandas as pd
+from rdflib import Graph, URIRef, Literal
+from rdflib.namespace import RDF, RDFS, Namespace, OWL, XSD
+
+import functions
+from dho_namespaces import *
+
+# Defaults reproduce the previously hard-coded configuration, so running the
+# script without arguments behaves as before.
+DEFAULT_METADATA_FILE = '/Users/pschneider/SeaDrive/Für mich freigegeben/Projekte Digital History/Heraldikprojekt/Datenhaltung/armorial_manuscripts_summary.xlsx'
+DEFAULT_KG_FILE = 'data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.ttl'
+OBJECT_ONTOLOGY_FILE = 'data/ontologies/digital-heraldry-ontology-objects.ttl'
+
+
+def _cell_to_text(value):
+    """Return a spreadsheet cell as text, without a spurious trailing '.0'.
+
+    pandas reads a numeric column that contains empty cells as floats, so a
+    year such as 1450 arrives as 1450.0; integral floats are rendered as ints.
+    """
+    if isinstance(value, float) and value.is_integer():
+        return str(int(value))
+    return str(value)
+
+
+def _parse_year_range(date_text):
+    """Return ``(earliest, latest)`` years for ``'1450'`` or ``'1450-1460'``.
+
+    Returns ``None`` when the text holds no plain integer years.
+    """
+    parts = date_text.split('-')
+    try:
+        earliest = int(parts[0])
+        latest = int(parts[1]) if len(parts) > 1 else earliest
+    except ValueError:
+        return None
+    return earliest, latest
+
+
+def _add_text_property(g, subject, predicate, value):
+    """Add ``value`` as an ``xsd:string`` literal unless the cell is empty."""
+    if not pd.isnull(value):
+        g.add((subject, predicate, Literal(_cell_to_text(value), datatype=XSD.string)))
+
+
+def _add_manuscript_entities(g, df_metadata):
+    """Create a manuscript entity in ``g`` for each row that has an ``ArmCode``."""
+    for _, row in df_metadata.iterrows():
+        if pd.isnull(row['ArmCode']):
+            # The URI is derived from the code, so rows without one are skipped.
+            continue
+
+        manuscript_uri = URIRef(dho_obj_n + 'MS' + _cell_to_text(row['ArmCode']))
+        g.add((manuscript_uri, RDF.type, dho_obj_n.Manuscript))
+        g.add((manuscript_uri, RDF.type, OWL.NamedIndividual))
+
+        if not pd.isnull(row['date-year']):
+            full_date = _cell_to_text(row['date-year'])
+            g.add((manuscript_uri, dho_obj_n.hasDate, Literal(full_date, datatype=XSD.string)))
+
+            year_range = _parse_year_range(full_date)
+            if year_range is None:
+                # Keep the verbatim date, but do not abort the whole run.
+                print(f'Warning: no year range derived from date "{full_date}" of {manuscript_uri}')
+            else:
+                earliest_date, latest_date = year_range
+                g.add((manuscript_uri, dho_obj_n.hasEarliestDate, Literal(earliest_date, datatype=XSD.int)))
+                g.add((manuscript_uri, dho_obj_n.hasLatestDate, Literal(latest_date, datatype=XSD.int)))
+
+        _add_text_property(g, manuscript_uri, dho_obj_n.datedInCentury, row['date-century'])
+        _add_text_property(g, manuscript_uri, dho_obj_n.hasTitle, row['Titel'])
+        _add_text_property(g, manuscript_uri, dho_obj_n.hasLibraryPlace, row['library-place'])
+
+
+def _link_representations_to_manuscripts(g):
+    """Add a ``partOfObject`` link from each coat of arms representation to its manuscript."""
+    # Collect the subjects first because the graph is modified inside the loop.
+    representations = [
+        subject
+        for subject, _, _ in g.triples((None, RDF.type, dho_rep_n.CoatOfArmsRepresentation))
+    ]
+
+    for representation in representations:
+        uri_parts = str(representation).split('#')
+        if len(uri_parts) < 2:
+            print(f'Warning: {representation} has no fragment; no manuscript linked')
+            continue
+
+        # The part of the fragment before the first hyphen is used as the manuscript code.
+        manuscript_id = uri_parts[1].split('-')[0]
+        manuscript_uri = URIRef(dho_obj_n + 'MS' + manuscript_id)
+
+        print(manuscript_uri)
+
+        g.add((representation, dho_rep_n.partOfObject, manuscript_uri))
+
+
+def _parse_arguments():
+    """Read the optional command line overrides for the input files."""
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        '-d',
+        '--input-rdf-file',
+        default=DEFAULT_KG_FILE,
+        help='One Turtle file with the Knowledge Graph in which the metadata is to be integrated. This is also used as output file to store the data into.'
+    )
+    parser.add_argument(
+        '-m',
+        '--input-metadata-file',
+        default=DEFAULT_METADATA_FILE,
+        help='One excel file from which the metadata is to be read.'
+    )
+    return parser.parse_args()
+
+
+def main():
+    """Build the manuscript entities, merge them into the Knowledge Graph and serialize it."""
+    args = _parse_arguments()
+    metadata_file = args.input_metadata_file
+    kg_file = args.input_rdf_file
+
+    # The Knowledge Graph file is overwritten; a JSON-LD copy is written next to it.
+    output_files = [
+        {
+            "output_file": kg_file,
+            "file_format": "turtle"
+        },
+        {
+            "output_file": os.path.splitext(kg_file)[0] + '.jsonld',
+            "file_format": "json-ld"
+        }
+    ]
+
+    # Initialize rdflib graph object
+    g = Graph()
+    g.parse(OBJECT_ONTOLOGY_FILE)
+
+    # Create objects graph with metadata on manuscripts
+    df_metadata = pd.read_excel(metadata_file)
+
+    print(df_metadata)
+    print('Creating manuscript entities...')
+    _add_manuscript_entities(g, df_metadata)
+
+    # Integrate knowledge graph on heraldic data
+    print('Loading Knowledge Graph file...')
+    g.parse(kg_file)
+
+    # Link manuscript objects to each coat of arms representation in the KG
+    print('Linking coat of arms representations to manuscript entities...')
+    _link_representations_to_manuscripts(g)
+
+    # Serialize the prepared graph
+    functions.serialize_graph(g, output_files)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
-- 
GitLab