Skip to content
Snippets Groups Projects
Commit 97efbeb5 authored by Philipp Schneider's avatar Philipp Schneider
Browse files

Create script to integrate metadata on manuscripts

parent 0eaa5805
No related branches found
No related tags found
No related merge requests found
...@@ -47,6 +47,9 @@ Merging is done by the script `merge_rdf_files_into_kg.py`. The input is given a ...@@ -47,6 +47,9 @@ Merging is done by the script `merge_rdf_files_into_kg.py`. The input is given a
* `existing_ontology`: File link to an existing knowledge graph. If set, this KG is loaded before adding any new data. The old data, including UUIDs, is then not overwritten, when `merge_rdf_files_into_kg.py` is run. * `existing_ontology`: File link to an existing knowledge graph. If set, this KG is loaded before adding any new data. The old data, including UUIDs, is then not overwritten, when `merge_rdf_files_into_kg.py` is run.
* `output_files`: List of output files and corresponding format into which the results are to be serialized. The first output-object in the list is considered as preferred and therefore used by following steps in the pipeline. * `output_files`: List of output files and corresponding format into which the results are to be serialized. The first output-object in the list is considered as preferred and therefore used by following steps in the pipeline.
#### Integrate metadata
The script `integrate_manuscript_metadata_into_kg.py` creates entities for the manuscripts in the Knowledge Graph and integrates their metadata. The script is only a preliminary version; its configuration is hard-coded in the script.
### Create ontology documentation ### Create ontology documentation
The content of the documentation of all classes and properties is stored as TSV files in `data/input/documentation`. To integrate the whole content of the documentation directory into an RDF file, call the script `update_documentation.py` with the RDF file as a command line parameter (in most cases, this will be `digital-heraldry-ontology.ttl`) The content of the documentation of all classes and properties is stored as TSV files in `data/input/documentation`. To integrate the whole content of the documentation directory into an RDF file, call the script `update_documentation.py` with the RDF file as a command line parameter (in most cases, this will be `digital-heraldry-ontology.ttl`)
......
# integrate_manuscript_metadata_into_kg
"""
Integrate metadata on manuscripts into the Knowledge Graph
"""
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF, RDFS, Namespace, OWL, XSD
import argparse
import json
import functions
import pandas as pd
from dho_namespaces import *
def _cell_text(value):
    """Return a spreadsheet cell as text, rendering integral floats without '.0'.

    pandas stores a numeric column that contains empty cells as floats, so a
    value typed as 1450 may arrive here as 1450.0; it must still be rendered
    as '1450'.
    """
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value)


def parse_year_range(value):
    """Split a 'date-year' cell into its label and its year bounds.

    Args:
        value: Either a single year (int, float or str such as ``1450``) or a
            range written as ``'<earliest>-<latest>'``.

    Returns:
        A tuple ``(label, earliest, latest)``: the textual form of the cell
        and the two bounds as ints. For a single year both bounds are equal.

    Raises:
        ValueError: If a bound cannot be converted to an integer.
    """
    label = _cell_text(value)
    if '-' in label:
        bounds = label.split('-')
        return label, int(bounds[0]), int(bounds[1])
    year = int(value)
    return label, year, year


def manuscript_id_from_uri(uri):
    """Derive the manuscript ID from a coat of arms representation URI.

    The ID is the part of the URI fragment (the text after '#') that precedes
    the first '-'.

    Returns:
        The manuscript ID, or ``None`` if the URI has no fragment.
    """
    pieces = str(uri).split('#')
    if len(pieces) < 2:
        return None
    return pieces[1].split('-')[0]


if __name__ == '__main__':
    # TODO: replace the hard-coded configuration below by command line
    # arguments. Preliminary sketch of the intended interface:
    # parser = argparse.ArgumentParser(
    #     description=__doc__,
    #     formatter_class=argparse.RawDescriptionHelpFormatter
    # )
    # parser.add_argument(
    #     '-d',
    #     '--input-rdf-file',
    #     nargs=1,
    #     required=True,
    #     help='One rdf file with the Knowledge Graph in which the metadata is to be integrated. This is also used as output file to store the data into.'
    # )
    # parser.add_argument(
    #     '-m',
    #     '--input-metadata-file',
    #     nargs=1,
    #     required=True,
    #     help='One excel file from which the metadata is to be read.'
    # )
    # args = parser.parse_args()._get_kwargs()[0][1]

    # Hard-coded configuration (preliminary version of the script).
    metadata_file = '/Users/pschneider/SeaDrive/Für mich freigegeben/Projekte Digital History/Heraldikprojekt/Datenhaltung/armorial_manuscripts_summary.xlsx'
    kg_file = 'data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.ttl'
    object_ontology_file = 'data/ontologies/digital-heraldry-ontology-objects.ttl'
    # The first output file is the same path as kg_file, so the Knowledge
    # Graph file is overwritten in place.
    output_files = [
        {
            "output_file": "data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.ttl",
            "file_format": "turtle"
        },
        {
            "output_file": "data/rdf-output/research-dataset/digital-heraldry-knowledge-graph-research-dataset.jsonld",
            "file_format": "json-ld"
        }
    ]

    # Initialize rdflib graph object with the objects ontology
    g = Graph()
    g.parse(object_ontology_file)

    # Create objects graph with metadata on manuscripts
    df_metadata = pd.read_excel(metadata_file)
    print(df_metadata)
    print('Creating manuscript entities...')
    for _, row in df_metadata.iterrows():
        # Rows without an armorial code cannot be turned into an entity.
        if pd.isnull(row['ArmCode']):
            continue

        # str conversion guards against purely numeric codes, which would
        # otherwise make the concatenation fail with a TypeError.
        # NOTE(review): dho_obj_n comes from dho_namespaces and is presumably
        # an rdflib Namespace -- confirm.
        manuscript_uri = URIRef(dho_obj_n + 'MS' + _cell_text(row['ArmCode']))
        g.add((manuscript_uri, RDF.type, dho_obj_n.Manuscript))
        g.add((manuscript_uri, RDF.type, OWL.NamedIndividual))

        if not pd.isnull(row['date-year']):
            full_date, earliest_date, latest_date = parse_year_range(row['date-year'])
            g.add((manuscript_uri, dho_obj_n.hasDate, Literal(full_date, datatype=XSD.string)))
            g.add((manuscript_uri, dho_obj_n.hasEarliestDate, Literal(earliest_date, datatype=XSD.int)))
            g.add((manuscript_uri, dho_obj_n.hasLatestDate, Literal(latest_date, datatype=XSD.int)))
        if not pd.isnull(row['date-century']):
            g.add((manuscript_uri, dho_obj_n.datedInCentury, Literal(str(row['date-century']), datatype=XSD.string)))
        if not pd.isnull(row['Titel']):
            g.add((manuscript_uri, dho_obj_n.hasTitle, Literal(row['Titel'], datatype=XSD.string)))
        if not pd.isnull(row['library-place']):
            g.add((manuscript_uri, dho_obj_n.hasLibraryPlace, Literal(row['library-place'], datatype=XSD.string)))

    # Integrate knowledge graph on heraldic data
    print('Loading Knowledge Graph file...')
    g.parse(kg_file)

    # Link manuscript objects to each coat of arms representation in the KG
    print('Linking coat of arms representations to manuscript entities...')
    # Materialize the matches first: triples are added to the graph inside
    # the loop, and the graph should not be mutated while it is iterated.
    representations = list(g.triples((None, RDF.type, dho_rep_n.CoatOfArmsRepresentation)))
    for subject, _, _ in representations:
        manuscript_id = manuscript_id_from_uri(subject)
        if manuscript_id is None:
            print(f'Skipping {subject}: URI has no fragment to derive a manuscript ID from.')
            continue
        manuscript_uri = URIRef(dho_obj_n + 'MS' + manuscript_id)
        print(manuscript_uri)
        g.add((subject, dho_rep_n.partOfObject, manuscript_uri))

    # Serialize the prepared graph
    functions.serialize_graph(g, output_files)
# # Load config file
# with open("config/config-merge_rdf_files_into_kg.json", "r") as config_file:
# mapping_config = json.load(config_file)
# # Initialize rdflib graph object
# g = Graph()
# # Load the existing Knowledge Graph and integrate the new data there, if stated so in the config file
# if mapping_config['existing_ontology'] is not None:
# g.parse(mapping_config['existing_ontology'])
# # Integrate all given rdf graphs into a single one
# for rdf_file in args:
# print(f'Merging {rdf_file} into Knowledge Graph')
# g.parse(rdf_file)
# # Serialize the prepared graph according to options, set in the config file
# functions.serialize_graph(g, mapping_config['output_files'])
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment