Commit ed685f27 authored by Konstantin Schulz

fixed sentence URN annotations for cases where the text repository does not provide enough information
parent 06143cb3
Pipeline #13199 passed with stages in 2 minutes and 42 seconds
@@ -458,3 +458,9 @@ class CustomCorpus:
self.corpus = corpus
self.file_path = file_path
self.text_parts: List[TextPart] = [] if text_parts is None else text_parts
class ReferenceableText:
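""" A text segment paired with the CTS URN that identifies its position in the source work. """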
def __init__(self, text: str = "", urn: str = ""):
self.text: str = text
self.urn: str = urn
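A minimal usage sketch for the new class (values are hypothetical): attribute access replaces the tuple indexing used before, so call sites read a part's .urn and .text instead of indices [0] and [1] on a Tuple[str, str].
section = ReferenceableText(text="Gallia est omnis divisa in partes tres", urn="urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1")
print(section.urn, section.text)  # formerly section[0] and section[1]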
@@ -11,7 +11,7 @@ from graphannis.graph import GraphUpdate
from networkx import MultiDiGraph, json_graph
from mcserver.app.models import Phenomenon, Case, PartOfSpeech, Dependency, Solution, ExerciseType, NodeMC, \
ExerciseData, GraphData, LinkMC, TextPart
ExerciseData, GraphData, LinkMC, TextPart, ReferenceableText
from mcserver.config import Config
@@ -82,30 +82,71 @@ class AnnotationService:
Phenomenon.LEMMA: {}}
@staticmethod
def add_urn_to_sentences(text_list: List[Tuple[str, str]], annotations: List[TokenList]) -> None:
def add_annotations_to_graph(
conll: List[TokenList], g: GraphUpdate, doc_name: str, doc_path: str) -> None:
""" Adds new annotations (provided in CONLL-U format) to a networkx graph. """
current_urn: str = ""
tok_before: str = ""
for tokenList in conll:
conllid_to_annisid = dict()
# read the sentence ID from the metadata
sentence_id: str = tokenList.metadata["sent_id"]
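# fall back to the most recently seen URN when the text repository did not provide one for this sentence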
current_urn = tokenList.metadata.get('urn', current_urn)
sentence_node_name: str = f"{current_urn}/{doc_name}#sent{sentence_id}"
# add nodes
for tok in tokenList.tokens:
token_id: int = tok["id"]
# map CONLL to graphANNIS
tok_id_final: str = sentence_node_name + f"tok{token_id}"
conllid_to_annisid[token_id] = tok_id_final
AnnotationService.map_token(tok, tok_id_final, g)
# a token belongs to its document
g.add_edge(tok_id_final, doc_path, 'annis', 'PartOf', '')
if tok_before:
# add ordering edge between the tokens
g.add_edge(tok_before, tok_id_final, 'annis', 'Ordering', '')
# remember the current token for the next iteration
tok_before = tok_id_final
# add pointing relations
for tok in tokenList.tokens:
head: int = tok.get("head", 0)
if head != 0:
tok_id_source = conllid_to_annisid[head]
tok_id_target = conllid_to_annisid[tok['id']]
g.add_edge(tok_id_source, tok_id_target, '', 'Pointing', 'dep')
if Config.AQL_DEPREL in tok:
g.add_edge_label(tok_id_source, tok_id_target, '', 'Pointing', 'dep', 'udep',
Config.AQL_DEPREL, tok[Config.AQL_DEPREL])
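For orientation, a sketch (not part of the commit) of the node names produced above, using the URN from the test mocks:
current_urn = "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.1"
sentence_node_name = f"{current_urn}/doc1#sent1"
tok_id_final = sentence_node_name + "tok3"  # "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.1/doc1#sent1tok3"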
@staticmethod
def add_urn_to_sentences(text_list: List[ReferenceableText], annotations: List[TokenList]) -> None:
""" Adds the relevant URN for every annotated sentence. """
current_text_list_index: int = 0
current_start_index: int = 0
for sent in annotations:
first_token: str = sent.tokens[0]["form"]
new_index: int = text_list[current_text_list_index][1].find(first_token, current_start_index)
# check if the text from the text list contains the first token from the annotations
new_index: int = text_list[current_text_list_index].text.find(first_token, current_start_index)
if new_index > -1:
# the text from the text list contains multiple sentences from the annotations
# thus, increment only the search index, not the text list index
current_start_index = new_index + len(first_token)
elif not first_token[-1].isalpha():
# account for cases where the parser failed to tokenize correctly, thus appending punctuation to the
# end of a regular word
new_index = text_list[current_text_list_index][1].find(first_token[:-1], current_start_index)
# account for cases where the parser failed to tokenize correctly
# this is detected by looking for punctuation appended to the end of a regular word
new_index = text_list[current_text_list_index].text.find(first_token[:-1], current_start_index)
if new_index < 0:
continue
else:
current_start_index = new_index + len(first_token)
else:
while new_index < 0 and len(text_list) > current_text_list_index:
while new_index < 0 and len(text_list) > current_text_list_index + 1:
current_text_list_index += 1
current_start_index = 0
new_index = text_list[current_text_list_index][1].find(first_token, current_start_index)
new_index = text_list[current_text_list_index].text.find(first_token, current_start_index)
current_start_index = new_index + len(first_token)
sent.metadata["urn"] = text_list[current_text_list_index][0]
# assign the URN from the corresponding text list entry to the annotations
sent.metadata["urn"] = text_list[current_text_list_index].urn
@staticmethod
def get_citation_label(text_parts: List[TextPart], citation_values: List[int]) -> str:
@@ -175,46 +216,16 @@ class AnnotationService:
# delete any existing corpus with this name
cs.delete_corpus(file_name)
# currently there is only one document because each text forms its own corpus
doc_name = 'doc1'
doc_name: str = 'doc1'
with GraphUpdate() as g:
doc_path = corpus_name + '/' + doc_name
doc_path: str = corpus_name + '/' + doc_name
# create a corpus and document node
# both nodes belong to the corpus graph, not the annotation graph
g.add_node(node_name=corpus_name, node_type="corpus")
g.add_node(node_name=doc_path, node_type="corpus")
# the document is part of the corpus
g.add_edge(doc_path, corpus_name, 'annis', 'PartOf', '')
tok_before = None
for tokenList in conll:
conllid_to_annisid = dict()
# create the sentence ID
sentence_id: int = tokenList.metadata["sent_id"]
sentence_node_name: str = f"{tokenList.metadata['urn']}/{doc_name}#sent{sentence_id}"
# add nodes
for tok in tokenList.tokens:
token_id: int = tok["id"]
# map CONLL to graphANNIS
tok_id_final = sentence_node_name + "tok{0}".format(token_id)
conllid_to_annisid[tok['id']] = tok_id_final
AnnotationService.map_token(tok, tok_id_final, g)
# a token belongs to its document
g.add_edge(tok_id_final, doc_path, 'annis', 'PartOf', '')
if tok_before is not None:
# add ordering edge between the tokens
g.add_edge(tok_before, tok_id_final, 'annis', 'Ordering', '')
# remember the current token for the next iteration
tok_before = tok_id_final
# add pointing relations
for tok in tokenList.tokens:
if 'head' in tok:
if tok['head'] != 0:
tok_id_source = conllid_to_annisid[tok['head']]
tok_id_target = conllid_to_annisid[tok['id']]
g.add_edge(tok_id_source, tok_id_target, '', 'Pointing', 'dep')
if Config.AQL_DEPREL in tok:
g.add_edge_label(tok_id_source, tok_id_target, '', 'Pointing', 'dep', 'udep',
Config.AQL_DEPREL, tok[Config.AQL_DEPREL])
AnnotationService.add_annotations_to_graph(conll, g, doc_name, doc_path)
cs.apply_update(file_name, g)
@staticmethod
@@ -18,7 +18,7 @@ from requests import HTTPError
from sqlalchemy.exc import OperationalError
from mcserver.app import db
from mcserver.app.models import CitationLevel, GraphData, Solution, ExerciseType, Phenomenon, AnnisResponse, CorpusMC, \
make_solution_element_from_salt_id, FrequencyItem, ResourceType
make_solution_element_from_salt_id, FrequencyItem, ResourceType, ReferenceableText
from mcserver.app.services import AnnotationService, XMLservice, TextService, FileService, FrequencyService, \
CustomCorpusService, DatabaseService
from mcserver.config import Config
@@ -182,8 +182,8 @@ class CorpusService:
try:
mdg = Config.CORPUS_STORAGE_MANAGER.subcorpus_graph(cts_urn_raw_disk, [doc_id])
except (NoSuchCorpus, GraphANNISException):
text_list: List[Tuple[str, str]] = CorpusService.load_text_list(cts_urn_raw=cts_urn_raw)
raw_text: str = TextService.strip_whitespace(" ".join([x[1] for x in text_list]))
text_list: List[ReferenceableText] = CorpusService.load_text_list(cts_urn_raw=cts_urn_raw)
raw_text: str = TextService.strip_whitespace(" ".join([x.text for x in text_list]))
annotations_conll: str = AnnotationService.get_udpipe(raw_text)
# parse CONLL and add root dependencies as separate node annotations
annotations = AnnotationService.parse_conll_string(annotations_conll)
@@ -355,7 +355,7 @@ class CorpusService:
return maybe_urn.startswith("urn:")
@staticmethod
def load_text_list(cts_urn_raw: str) -> List[Tuple[str, str]]:
def load_text_list(cts_urn_raw: str) -> List[ReferenceableText]:
""" Loads the text list for a new corpus. """
if CustomCorpusService.is_custom_corpus_urn(cts_urn_raw):
# this is a custom corpus, e.g. the VIVA textbook
@@ -6,7 +6,7 @@ import conllu
import rapidjson as json
from conllu import TokenList
from mcserver import Config
from mcserver.app.models import CustomCorpus, CitationLevel, TextPart, Citation, CorpusMC
from mcserver.app.models import CustomCorpus, CitationLevel, TextPart, Citation, CorpusMC, ReferenceableText
from mcserver.app.services import AnnotationService, FileService
@@ -56,9 +56,9 @@ class CustomCorpusService:
@staticmethod
def extract_custom_corpus_text(relevant_text_parts: List[TextPart], start_parts: List[str], end_parts: List[str],
base_urn: str, current_idx: int = 0, consider_start: List[bool] = None) \
-> List[Tuple[str, str]]:
-> List[ReferenceableText]:
""" Extracts text from the relevant parts of a (custom) corpus. """
text_list: List[Tuple[str, str]] = []
text_list: List[ReferenceableText] = []
nxt: callable = CustomCorpusService.extract_custom_corpus_text_next_level
for rtp in relevant_text_parts:
new_urn: str = ("." if current_idx else ":").join([base_urn, str(rtp.citation.value)])
@@ -80,11 +80,11 @@ class CustomCorpusService:
@staticmethod
def extract_custom_corpus_text_next_level(rtp: TextPart, start_parts: List[str], end_parts: List[str], new_urn: str,
text_list: List[Tuple[str, str]], current_idx: int = 0,
text_list: List[ReferenceableText], current_idx: int = 0,
consider_start: List[bool] = None) -> None:
""" Extracts text from the next level of relevant text parts for a (custom corpus). """
if current_idx == len(start_parts) - 1:
text_list.append((new_urn, rtp.text_value))
text_list.append(ReferenceableText(rtp.text_value, new_urn))
else:
current_idx += 1
text_list += CustomCorpusService.extract_custom_corpus_text(rtp.sub_text_parts, start_parts,
@@ -97,8 +97,8 @@ class CustomCorpusService:
if AnnotationService.has_urn_sentence_range(urn):
urn_split = urn.split("@")
urn = urn_split[0]
text_list: List[Tuple[str, str]] = CustomCorpusService.get_custom_corpus_text(urn)
annotations_conll: str = AnnotationService.get_udpipe(" ".join(x[1] for x in text_list))
text_list: List[ReferenceableText] = CustomCorpusService.get_custom_corpus_text(urn)
annotations_conll: str = AnnotationService.get_udpipe(" ".join(x.text for x in text_list))
conll: List[TokenList] = AnnotationService.parse_conll_string(annotations_conll)
if len(urn_split):
sentence_range: List[int] = list(map(lambda x: int(x), urn_split[1].split("-")))
@@ -159,7 +159,7 @@ class CustomCorpusService:
return []
@staticmethod
def get_custom_corpus_text(urn: str) -> List[Tuple[str, str]]:
def get_custom_corpus_text(urn: str) -> List[ReferenceableText]:
""" Retrieves the text for a custom corpus, e.g. a textbook. """
urn_parts: List[str] = urn.split(":")
base_urn: str = urn.replace(":" + urn_parts[-1], "")
@@ -354,7 +354,7 @@ class CustomCorpusService:
@staticmethod
def prepare_custom_corpus_text_next_level(rtp: TextPart, start_parts: List[str], end_parts: List[str], new_urn: str,
text_list: List[Tuple[str, str]], nxt: callable,
text_list: List[ReferenceableText], nxt: callable,
current_idx: int = 0) -> None:
""" Identifies possible candidates and relevant URN parts for the next text level. """
if int(start_parts[current_idx]) < rtp.citation.value < int(end_parts[current_idx]):
@@ -6,7 +6,7 @@ from lxml import etree, objectify
from lxml.etree import _ElementUnicodeResult
from collections import OrderedDict
from mcserver.app.models import ExerciseType, FileType, Solution
from mcserver.app.models import ExerciseType, FileType, Solution, ReferenceableText
from mcserver.app.services import TextService
from mcserver.models_auto import Exercise
@@ -101,9 +101,9 @@ class XMLservice:
return TextService.strip_whitespace(" ".join([y["form"] for x in conll for y in x]))
@staticmethod
def get_text_parts_by_urn(cts_urn_raw: str, xml: etree._Element) -> List[Tuple[str, str]]:
def get_text_parts_by_urn(cts_urn_raw: str, xml: etree._Element) -> List[ReferenceableText]:
""" Parses an XML file for the various text parts and maps them to their respective URN. """
text_list: List[Tuple[str, str]] = []
text_list: List[ReferenceableText] = []
base_urn: str = ":".join(cts_urn_raw.split(":")[:-1])
target_elements_string: str = "*[@n]"
level1_parts: List[etree._Element] = xml.xpath(
@@ -120,15 +120,15 @@ class XMLservice:
l3p_value: _ElementUnicodeResult = l3p.xpath("@n")[0]
text_values: List[str] = l3p.xpath(".//text()")
urn: str = f"{base_urn}:{str(l1p_value)}.{str(l2p_value)}.{str(l3p_value)}"
text_list.append((urn, " ".join(" ".join(text_values).split())))
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
else:
text_values: List[str] = l2p.xpath(".//text()")
urn: str = f"{base_urn}:{str(l1p_value)}.{str(l2p_value)}"
text_list.append((urn, " ".join(" ".join(text_values).split())))
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
else:
text_values: List[str] = l1p.xpath(".//text()")
urn: str = f"{base_urn}:{str(l1p_value)}"
text_list.append((urn, " ".join(" ".join(text_values).split())))
text_list.append(ReferenceableText(" ".join(" ".join(text_values).split()), urn))
return text_list
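An illustrative sketch (XML structure assumed, not taken from the repository): the URNs are built from the @n values of the nested divisions.
# <div n="1"><div n="1"><div n="1">Gallia est omnis divisa ...</div></div></div>
# with base_urn "urn:cts:latinLit:phi0448.phi001.perseus-lat2" would yield
# ReferenceableText("Gallia est omnis divisa ...", "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1")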
@staticmethod
@@ -20,7 +20,7 @@ from mcserver import Config, TestingConfig
from mcserver.app import db, shutdown_session
from mcserver.app.models import Phenomenon, PartOfSpeech, CitationLevel, ExerciseData, GraphData, \
LinkMC, NodeMC, Language, Dependency, Case, AnnisResponse, Solution, TextPart, Citation, ExerciseMC, CorpusMC, \
SolutionElement
SolutionElement, ReferenceableText
from mcserver.app.services import AnnotationService, CustomCorpusService, TextService, DatabaseService
from mcserver.models_auto import Corpus, Exercise, UpdateInfo
@@ -759,8 +759,9 @@ class Mocks:
subgraph_json: str = '{"exercise_id":"","exercise_type":"","frequency_analysis":null,"graph_data":{"directed":true,"graph":{},"links":[],"multigraph":true,"nodes":[{"annis_node_name":"urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.1/doc1#sent1tok3","annis_node_type":"node","annis_tok":"Galli","annis_type":"node","id":"salt:/urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.1/doc1#sent1tok3","is_oov":null,"udep_lemma":"Gallo","udep_upostag":"VERB","udep_xpostag":"L3|modQ|tem1|stAC","udep_feats":"Tense=Pres|VerbForm=Inf|Voice=Pass","solution":null}]},"solutions":[],"text_complexity":null,"uri":""}'
test_args: List[str] = ["tests.py", "-test"]
text_complexity_json_string: str = '{"all":54.53,"avg_w_len":5.79,"avg_w_per_sent":17.33,"lex_den":0.73,"n_abl_abs":0,"n_clause":1,"n_gerund":1,"n_inf":1,"n_part":1,"n_punct":3,"n_sent":3,"n_subclause":0,"n_types":48,"n_w":52,"pos":11}'
text_list: List[Tuple[str, str]] = [("urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1", raw_text.split(".")[0]),
("urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.2", raw_text.split(".")[1])]
text_list: List[ReferenceableText] = [
ReferenceableText(raw_text.split(".")[0], "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1"),
ReferenceableText(raw_text.split(".")[1], "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.2")]
text_parts: List[TextPart] = [
TextPart(citation=Citation(level=CitationLevel.book, label="2", value=2), text_value="text", sub_text_parts=[
TextPart(
@@ -769,7 +770,7 @@ class Mocks:
TextPart(citation=Citation(level=CitationLevel.section, label="1", value=1),
text_value="subtext"),
TextPart(citation=Citation(level=CitationLevel.section, label="2", value=2))])])]
udpipe_string: str = "# newpar\n# sent_id = 1\n# text = Caesar fortis est.\n1\tCaesar\tCaeso\tVERB\tC1|grn1|casA|gen1|stAN\tCase=Nom|Degree=Pos|Gender=Masc|Number=Sing\t2\tcsubj\t_\t_\n2\tfortis\tfortis\tADJ\tC1|grn1|casA|gen1|stAN\tCase=Nom|Degree=Pos|Gender=Masc|Number=Sing\t0\troot\t_\t_\n3\test\tsum\tAUX\tN3|modA|tem1|gen6|stAV\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act\t2\tcop\t_\tSpaceAfter=No\n4\t.\t.\tPUNCT\tPunc\t_\t2\tpunct\t_\t_\n\n# sent_id = 2\n# text = Galli moriuntur.\n1\tGalli\tGallus\tPRON\tF1|grn1|casJ|gen1|stPD\tCase=Nom|Degree=Pos|Gender=Masc|Number=Plur|PronType=Dem\t2\tnsubj:pass\t_\t_\n2\tmoriuntur\tmorior\tVERB\tL3|modJ|tem1|gen9|stAV\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass\t0\troot\t_\tSpaceAfter=No\n3\t.\t.\tPUNCT\tPunc\t_\t2\tpunct\t_\tSpacesAfter=\\n\n\n"
udpipe_string: str = "# newpar\n# sent_id = 1\n# text = Caesar fortis est. Galli moriuntur.\n1\tCaesar\tCaesar\tPROPN\tNe\tCase=Nom|Gender=Masc|Number=Sing\t4\tnsubj\t_\t_\n2\tfortis\tfortis\tADJ\tA-\tCase=Nom|Degree=Pos|Gender=Masc|Number=Sing\t3\tamod\t_\t_\n3\test.\test.\tX\tF-\t_\t4\tobj\t_\t_\n4\tGalli\tGalli\tNOUN\tNb\tCase=Nom|Gender=Masc|Number=Plur\t0\troot\t_\t_\n5\tmoriuntur.\tgreek.expression\tNOUN\tNb\t_\t3\tconj\t_\tSpacesAfter=\\n\n\n"
urn: str = "urn:cts:latinLit:phi0448.phi001.perseus-lat2:1.1.1-1.1.2"
urn_custom: str = f"{CustomCorpusService.custom_corpora[4].corpus.source_urn}:2.23.1-2.23.1"
xapi_json_string: str = '{"0":{"actor":{"account":{"name":"9a7eef78-b0b4-471d-b451-e47c9b20d231"},"objectType":"Agent"},"verb":{"id":"http://adlnet.gov/expapi/verbs/answered","display":{"en-US":"answered"}},"object":{"objectType":"Activity","definition":{"extensions":{"http://h5p.org/x-api/h5p-local-content-id":1},"interactionType":"fill-in","type":"http://adlnet.gov/expapi/activities/cmi.interaction","description":{"en-US":"<p>Matching: Assign the matching elements to each other!</p><br/>divisa __________<br/>dividit __________<br/>"},"correctResponsesPattern":["partes[,]Belgis"]}},"context":{"contextActivities":{"category":[{"id":"http://h5p.org/libraries/H5P.DragText-1.8","objectType":"Activity"}]}},"result":{"response":"Belgis[,]","score":{"min":0,"raw":0,"max":2,"scaled":0},"duration":"PT4.12S","completion":true}}}'
@@ -23,6 +23,7 @@ from typing import Dict, List, Tuple, Type, Any
from conllu import TokenList
from flask import Flask
from gensim.models import Word2Vec
from graphannis.graph import GraphUpdate
from lxml import etree
from networkx import MultiDiGraph, Graph
from requests import HTTPError
@@ -38,7 +39,7 @@ from mcserver.app.api.exerciseAPI import map_exercise_data_to_database
from mcserver.app.models import ResourceType, FileType, ExerciseType, ExerciseData, \
NodeMC, LinkMC, GraphData, Phenomenon, CustomCorpus, AnnisResponse, Solution, DownloadableFile, Language, \
VocabularyCorpus, TextComplexityMeasure, CitationLevel, FrequencyItem, TextComplexity, Dependency, PartOfSpeech, \
Choice, XapiStatement, ExerciseMC, CorpusMC, make_solution_element_from_salt_id, Sentence
Choice, XapiStatement, ExerciseMC, CorpusMC, make_solution_element_from_salt_id, Sentence, ReferenceableText
from mcserver.app.services import AnnotationService, CorpusService, FileService, CustomCorpusService, DatabaseService, \
XMLservice, TextService, FrequencyService, ExerciseService
from mcserver.config import TestingConfig, Config
@@ -802,11 +803,11 @@ class CsmTestCase(unittest.TestCase):
with patch.object(CorpusService, "load_text_list", return_value=Mocks.text_list):
with patch.object(CorpusService, "get_raw_text", return_value=Mocks.raw_text):
conll = CorpusService.get_annotations_from_string(Mocks.urn)
self.assertEqual(len(conll[0]), 4)
self.assertEqual(len(conll[0]), 5)
mdg: MultiDiGraph = CorpusService.get_graph(Mocks.urn)
self.assertEqual(len(mdg.nodes), 7)
self.assertEqual(len(mdg.nodes), 5)
mdg = CorpusService.get_graph(f"{Mocks.urn}@1-1")
self.assertEqual(len(mdg.nodes), 7)
self.assertEqual(len(mdg.nodes), 5)
with patch.object(CustomCorpusService, "get_treebank_annotations", return_value=Mocks.annotations):
conll = CorpusService.get_annotations_from_string(Mocks.urn_custom)
self.assertEqual(len(conll[0]), 6)
@@ -848,6 +849,19 @@ class CsmTestCase(unittest.TestCase):
db.session.query(UpdateInfo).delete()
assert not update_mock.called
def test_map_conll_to_graph(self):
""" Saves an annotated corpus in CONLL format to the ANNIS corpus storage. """
conll: List[TokenList] = Mocks.annotations + copy.deepcopy(Mocks.annotations)
conll[1].metadata = dict(sent_id="2")
disk_urn: str = AnnotationService.get_disk_urn(Mocks.urn_custom)
AnnotationService.map_conll_to_graph(corpus_name=Mocks.urn_custom, conll=conll,
cs=Config.CORPUS_STORAGE_MANAGER, file_name=disk_urn)
result: dict = CorpusService.process_corpus_data(
urn=Mocks.urn_custom, annotations=conll, aqls=["tok"], exercise_type=ExerciseType.cloze,
search_phenomena=[Phenomenon.UPOSTAG])
gd: GraphData = AnnotationService.map_graph_data(result["graph_data_raw"])
self.assertEqual(gd.nodes[-1].id.split("/")[0], gd.nodes[0].id.split("/")[0])
def test_process_corpus_data(self):
"""Builds a graph from annotated text data."""
disk_urn: str = AnnotationService.get_disk_urn(Mocks.urn_custom)
@@ -863,7 +877,7 @@ class CsmTestCase(unittest.TestCase):
target_corpus: CustomCorpus = next(
(x for x in CustomCorpusService.custom_corpora if x.corpus.source_urn == base_urn), None)
CustomCorpusService.init_custom_corpus(target_corpus)
text_parts_list: List[Tuple[str, str]] = CorpusService.load_text_list(Mocks.urn_custom)
text_parts_list: List[ReferenceableText] = CorpusService.load_text_list(Mocks.urn_custom)
self.assertEqual(len(text_parts_list), 1)
def test_run_app(self):
@@ -905,16 +919,20 @@ class CommonTestCase(unittest.TestCase):
def test_add_urn_to_sentences(self):
""" Adds the relevant URN for every annotated sentence. """
conll: List[TokenList] = copy.deepcopy(Mocks.annotations)
text_list: List[Tuple[str, str]] = [(Mocks.urn, conll[0].tokens[0]["form"]), (Mocks.urn_custom, "")]
text_list: List[ReferenceableText] = [
ReferenceableText(conll[0].tokens[0]["form"], Mocks.urn),
ReferenceableText("", Mocks.urn_custom), ReferenceableText(conll[0].tokens[0]["form"], Mocks.urn_custom)]
conll[0].tokens[0]["form"] += "."
conll.append(TokenList(tokens=[
{"id": 1, "form": "Caesar.", "lemma": "Caeso", "upostag": "VERB", "xpostag": "L3|modJ|tem3|gen4|stAV",
"feats": {"Mood": "Ind", "Number": "Sing", "Person": "1", "Tense": "Fut", "VerbForm": "Fin",
"Voice": "Pass"}, "head": 0, "deprel": "root", "deps": None, "misc": {"ref": "1.1"}}],
metadata=OrderedDict([("sent_id", "2"), ("urn", "")])))
conll += copy.deepcopy(Mocks.annotations)
AnnotationService.add_urn_to_sentences(text_list, conll)
self.assertEqual(conll[0].metadata["urn"], Mocks.urn)
self.assertEqual(conll[1].metadata["urn"], "")
self.assertEqual(conll[2].metadata["urn"], Mocks.urn_custom)
def test_create_xml_string(self):
"""Exports the exercise data to the Moodle XML format. See https://docs.moodle.org/35/en/Moodle_XML_format ."""
@@ -937,13 +955,13 @@ class CommonTestCase(unittest.TestCase):
def test_extract_custom_corpus_text(self):
""" Extracts text from the relevant parts of a (custom) corpus. """
new_text_parts: List[Tuple[str, str]] = CustomCorpusService.extract_custom_corpus_text(
new_text_parts: List[ReferenceableText] = CustomCorpusService.extract_custom_corpus_text(
Mocks.text_parts, ["", ""], ["", "0"], "", 1, [False, True])
self.assertEqual(len(new_text_parts), 0)
new_text_parts = CustomCorpusService.extract_custom_corpus_text(Mocks.text_parts, ["", ""], ["", "0"], "", 1)
self.assertEqual(new_text_parts[0][1], Mocks.text_parts[0].text_value)
self.assertEqual(new_text_parts[0].text, Mocks.text_parts[0].text_value)
new_text_parts = CustomCorpusService.extract_custom_corpus_text(Mocks.text_parts, ["1"], ["3"], "")
self.assertEqual(new_text_parts[0][1], Mocks.text_parts[0].text_value)
self.assertEqual(new_text_parts[0].text, Mocks.text_parts[0].text_value)
def test_get_concept_network(self):
"""Extracts a network of words from vector data in an AI model."""
@@ -984,7 +1002,7 @@ class CommonTestCase(unittest.TestCase):
def test_get_custom_corpus_text(self):
""" Retrieves the text for a custom corpus, e.g. a textbook. """
text_list: List[Tuple[str, str]] = CustomCorpusService.get_custom_corpus_text(Mocks.urn)
text_list: List[ReferenceableText] = CustomCorpusService.get_custom_corpus_text(Mocks.urn)
self.assertEqual(len(text_list), 0)
def test_get_pdf_html_string(self):
@@ -1034,15 +1052,15 @@ class CommonTestCase(unittest.TestCase):
self.assertEqual(len(conll), 3)
cc: CustomCorpus = CustomCorpusService.custom_corpora[-1]
urn: str = cc.corpus.source_urn + ":1.1-1.2"
conll = CustomCorpusService.get_treebank_sub_annotations(urn, [], cc)
conll: List[TokenList] = CustomCorpusService.get_treebank_sub_annotations(urn, [], cc)
self.assertEqual(len(cc.text_parts), 2)
def test_get_udpipe(self):
"""Annotates a single text with UdPipe. The beginning of the CONLL has to be left out because it contains the
randomly generated temp file path and thus cannot be predicted exactly."""
text = "Caesar fortis est. Galli moriuntur."
conll = AnnotationService.get_udpipe(text)
self.assertIn(Mocks.udpipe_string, conll)
text: str = "Caesar fortis est. Galli moriuntur."
conll_string: str = AnnotationService.get_udpipe(text)
self.assertIn(Mocks.udpipe_string, conll_string)
def test_init_custom_corpus(self):
"""Adds custom corpora to the corpus list, e.g. the PROIEL corpora."""
@@ -1094,7 +1112,7 @@ class CommonTestCase(unittest.TestCase):
""" Loads the text list for a new corpus. """
with patch.object(mcserver.app.services.corpusService.HttpCtsRetriever, 'getPassage',
return_value=Mocks.cts_passage_xml) as get_passage_mock:
text_parts: List[Tuple[str, str]] = CorpusService.load_text_list(Mocks.urn)
text_parts: List[ReferenceableText] = CorpusService.load_text_list(Mocks.urn)
self.assertEqual(len(text_parts), 2)
get_passage_mock.return_value = Mocks.cts_passage_xml_2_levels
text_parts = CorpusService.load_text_list(Mocks.urn[:-8] + "-1.1")
@@ -1103,7 +1121,7 @@ class CommonTestCase(unittest.TestCase):
text_parts = CorpusService.load_text_list(Mocks.urn[:-10] + "-3")
self.assertEqual(len(text_parts), 3)
get_passage_mock.side_effect = HTTPError()
text_parts: List[Tuple[str, str]] = CorpusService.load_text_list(Mocks.urn)
text_parts = CorpusService.load_text_list(Mocks.urn)
self.assertEqual(text_parts, [])
def test_make_docx_file(self):
@@ -90,6 +90,13 @@
</ion-grid>
</ion-row>
</ion-grid>
<ion-grid>
<ion-row>
<ion-col>
<a href="{{configMC.developerMailTo}}">{{ 'EMAIL_ERROR' | translate }}</a>
</ion-col>
</ion-row>
</ion-grid>
</ion-content>
</ion-menu>
<ion-router-outlet id="content1"></ion-router-outlet>
@@ -292,7 +292,8 @@ describe('CorpusService', () => {
corpusService.initUpdateInfo().then(() => {
updateInfoSpy.and.callThrough();
corpusService.storage.get(configMC.localStorageKeyUpdateInfo).then((jsonString: string) => {
expect(jsonString).toBeTruthy();
const updateInfo: UpdateInfo = JSON.parse(jsonString) as UpdateInfo;
expect(updateInfo.corpora).toBe(1);
const setSpy: Spy = spyOn(corpusService.storage, 'set').and.returnValue(Promise.resolve());
corpusService.initUpdateInfo().then(() => {
expect(setSpy).toHaveBeenCalledTimes(0);
@@ -35,7 +35,6 @@
</ion-row>
<ion-row>
<ion-col>
<!-- TODO: enable solution shuffling for H5P ? -->
<div class="h5p-container"></div>
</ion-col>
</ion-row>
@@ -50,7 +49,6 @@ beginning that it is going to be a download (instead of an ordinary link or clic
<a href="{{urlBase + FileType.Pdf + solutionIndicesString}}"
download>{{ 'FILE_TYPE_PDF' | translate }}</a>
</ion-col>
<!-- TODO: ADD MOODLE SUPPORT FOR MARK WORDS EXERCISES -->
<ion-col *ngIf="corpusService.exercise.type !== ExerciseType.markWords">
<a href="{{urlBase + FileType.Xml + solutionIndicesString}}" download>{{ 'FILE_TYPE_XML' |
translate }}</a>
@@ -72,6 +72,9 @@ describe('ShowTextPage', () => {
showTextPage.corpusService.annisResponse.graph_data.nodes = [{}, {}];
result = showTextPage.getWhiteSpace(0);
expect(result.length).toBe(1);
showTextPage.corpusService.annisResponse.graph_data.nodes[1].annis_tok = 'test,';
result = showTextPage.getWhiteSpace(0);
expect(result.length).toBe(1);
showTextPage.corpusService.annisResponse.graph_data.nodes[1].annis_tok = '.';
result = showTextPage.getWhiteSpace(0);
expect(result.length).toBe(0);
@@ -87,7 +87,7 @@ export class ShowTextPage implements OnInit {
getWhiteSpace(index: number): string {
if (this.corpusService.annisResponse.graph_data.nodes[index + 1]) {
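// inspect only the next token's first character: tokens that begin with punctuation (e.g. '.') get no preceding whitespace, while tokens that merely end in punctuation (e.g. 'test,') still do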
if (this.corpusService.annisResponse.graph_data.nodes[index + 1].annis_tok &&
this.corpusService.annisResponse.graph_data.nodes[index + 1].annis_tok
this.corpusService.annisResponse.graph_data.nodes[index + 1].annis_tok[0]
.search(/[.,\/#!$%\^&\*;:{}=\-_`~()]/g) >= 0) {
return '';
}