Commit abc6d52e authored by Konstantin Schulz

notes on textual criticism are now removed automatically from Perseus texts

parent bd16be14
Pipeline #15607 passed with stages
in 3 minutes and 11 seconds
......@@ -369,6 +369,7 @@ class CorpusService:
return []
xml: etree.Element = etree.fromstring(resp)
XMLservice.strip_name_spaces(xml)
XMLservice.remove_notes(xml)
return XMLservice.get_text_parts_by_urn(cts_urn_raw, xml)
@staticmethod
......
......@@ -105,32 +105,44 @@ class XMLservice:
""" Parses an XML file for the various text parts and maps them to their respective URN. """
text_list: List[ReferenceableText] = []
base_urn: str = ":".join(cts_urn_raw.split(":")[:-1])
target_elements_string: str = "*[@n]"
n_attribute_xpath: str = "@n"
target_elements_string: str = f"*[{n_attribute_xpath}]"
level1_parts: List[etree._Element] = xml.xpath(
f"/GetPassage/reply/passage/TEI/text/body/div/{target_elements_string}")
text_xpath: str = ".//text()"
for l1p in level1_parts:
level2_parts: List[etree._Element] = l1p.xpath(f"./{target_elements_string}")
l1p_value: _ElementUnicodeResult = l1p.xpath("@n")[0]
l1p_value: _ElementUnicodeResult = l1p.xpath(n_attribute_xpath)[0]
if level2_parts:
for l2p in level2_parts:
l2p_value: _ElementUnicodeResult = l2p.xpath("@n")[0]
l2p_value: _ElementUnicodeResult = l2p.xpath(n_attribute_xpath)[0]
level3_parts: List[etree._Element] = l2p.xpath(f"./{target_elements_string}")
if level3_parts:
for l3p in level3_parts:
l3p_value: _ElementUnicodeResult = l3p.xpath("@n")[0]
text_values: List[str] = l3p.xpath(".//text()")
l3p_value: _ElementUnicodeResult = l3p.xpath(n_attribute_xpath)[0]
text_values: List[str] = l3p.xpath(text_xpath)
urn: str = f"{base_urn}:{str(l1p_value)}.{str(l2p_value)}.{str(l3p_value)}"
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
else:
text_values: List[str] = l2p.xpath(".//text()")
text_values: List[str] = l2p.xpath(text_xpath)
urn: str = f"{base_urn}:{str(l1p_value)}.{str(l2p_value)}"
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
else:
text_values: List[str] = l1p.xpath(".//text()")
text_values: List[str] = l1p.xpath(text_xpath)
urn: str = f"{base_urn}:{str(l1p_value)}"
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
return text_list
@staticmethod
def remove_notes(parent: etree._Element) -> None:
    """Removes all notes from an XML document, including textual criticism.

    Every descendant element of ``parent`` whose tag is exactly ``"note"`` is
    detached from the tree, in place. The comparison is against the plain tag
    name, so namespace-qualified tags (``{uri}note``) are not matched.

    Text that follows a removed note inside the same parent (the note's
    ``tail``) belongs to the surrounding passage, not to the note, so it is
    kept: it is appended to the tail of the preceding kept sibling or, when
    there is none, to the text of the parent.

    :param parent: root of the (sub)tree to clean; modified in place.
    """
    # Last sibling that stays in the tree; a removed note's tail is glued onto it.
    previous_kept = None
    # Iterate over a snapshot so that removing children cannot disturb the iteration.
    for child in list(parent):
        if child.tag == "note":
            if child.tail:
                # remove() drops the tail together with the element, so rescue it first.
                if previous_kept is None:
                    parent.text = (parent.text or "") + child.tail
                else:
                    previous_kept.tail = (previous_kept.tail or "") + child.tail
            parent.remove(child)
            continue
        if len(child):
            # Only elements that have children can contain nested notes.
            XMLservice.remove_notes(child)
        previous_kept = child
@staticmethod
def strip_name_spaces(xml: etree._Element) -> None:
"""Removes all namespaces from an XML document for easier parsing, e.g. with XPath."""
......
......@@ -676,7 +676,7 @@ class Mocks:
citation_level_1=CitationLevel.default.value)]
cts_capabilities_xml: str = '<GetCapabilities xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetInventory</requestName><requestFilters>urn=urn:cts:latinLit</requestFilters></request><reply><ti:TextInventory xmlns:ti=\'http://chs.harvard.edu/xmlns/cts\'><ti:textgroup urn=\'urn:cts:latinLit:phi0660\' xmlns:ti=\'http://chs.harvard.edu/xmlns/cts\'><ti:groupname xml:lang=\'eng\'>Tibullus</ti:groupname><ti:groupname xml:lang=\'lat\'>Corpus Tibullianum</ti:groupname><ti:work xml:lang="lat" urn=\'urn:cts:latinLit:phi0660.phi001\' groupUrn=\'urn:cts:latinLit:phi0660\' xmlns:ti=\'http://chs.harvard.edu/xmlns/cts\'><ti:title xml:lang=\'lat\'>Elegiae</ti:title><ti:edition urn=\'urn:cts:latinLit:phi0660.phi001.perseus-lat2\' workUrn=\'urn:cts:latinLit:phi0660.phi001\' xmlns:ti=\'http://chs.harvard.edu/xmlns/cts\'><ti:label xml:lang=\'eng\'>Elegiae, Aliorumque carminum libri tres</ti:label><ti:description xml:lang=\'eng\'>Tibullus, creator; Postgate, J. P. (John Percival), 1853- 1926, editor </ti:description><ti:online><ti:citationMapping><ti:citation label="book" xpath="/tei:div[@n=\'?\']" scope="/tei:TEI/tei:text/tei:body/tei:div"><ti:citation label="poem" xpath="/tei:div[@n=\'?\']" scope="/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'?\']"><ti:citation label="line" xpath="//tei:l[@n=\'?\']" scope="/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'?\']/tei:div[@n=\'?\']"></ti:citation></ti:citation></ti:citation></ti:citationMapping></ti:online></ti:edition></ti:work><ti:work xml:lang="lat" urn=\'urn:cts:latinLit:phi0660.phi003\' groupUrn=\'urn:cts:latinLit:phi0660\' xmlns:ti=\'http://chs.harvard.edu/xmlns/cts\'> </ti:work></ti:textgroup></ti:TextInventory></reply></GetCapabilities>'
cts_passage_xml: str = '<GetPassage xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetPassage</requestName><requestUrn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.2</requestUrn></request><reply><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.2</urn><passage><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><div type="edition" xml:lang="lat" n="urn:cts:latinLit:phi0448.phi001.perseus-lat2"><div n="1" type="textpart" subtype="book"><div type="textpart" subtype="chapter" n="1"><div type="textpart" subtype="section" n="1"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div><div type="textpart" subtype="section" n="2"><p>Hi omnes lingua, institutis, legibus inter se differunt. Gallos ab Aquitanis Garumna flumen, a Belgis Matrona et Sequana dividit.</p></div></div></div></div></body></text></TEI></passage></reply></GetPassage>'
cts_passage_xml_1_level: str = '<GetPassage xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetPassage</requestName><requestUrn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</requestUrn></request><reply><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</urn><passage><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><div type="edition" xml:lang="lat" n="urn:cts:latinLit:phi0448.phi001.perseus-lat2"><div n="1" type="textpart" subtype="book"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div><div n="2" type="textpart" subtype="book"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div><div n="3" type="textpart" subtype="book"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div></div></body></text></TEI></passage></reply></GetPassage>'
cts_passage_xml_1_level: str = '<GetPassage xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetPassage</requestName><requestUrn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</requestUrn></request><reply><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</urn><passage><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><div type="edition" xml:lang="lat" n="urn:cts:latinLit:phi0448.phi001.perseus-lat2"><div n="1" type="textpart" subtype="book"><note>fake textual criticism</note><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div><div n="2" type="textpart" subtype="book"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div><div n="3" type="textpart" subtype="book"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div></div></body></text></TEI></passage></reply></GetPassage>'
cts_passage_xml_2_levels: str = '<GetPassage xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetPassage</requestName><requestUrn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</requestUrn></request><reply><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1-1.2</urn><passage><TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><div type="edition" xml:lang="lat" n="urn:cts:latinLit:phi0448.phi001.perseus-lat2"><div n="1" type="textpart" subtype="book"><div type="textpart" subtype="section" n="1"><p>Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.</p></div></div></div></body></text></TEI></passage></reply></GetPassage>'
cts_reff_xml: str = '<GetValidReff xmlns:tei="http://www.tei-c.org/ns/1.0" xmlns="http://chs.harvard.edu/xmlns/cts"><request><requestName>GetValidReff</requestName><requestUrn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1</requestUrn><requestLevel>3</requestLevel></request><reply><reff><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.2</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.3</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.4</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.5</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.6</urn><urn>urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.7</urn></reff></reply></GetValidReff>'
exercise: Exercise = ExerciseMC.from_dict(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment