Commit ba94ae42 authored by Konstantin Schulz

vocabulary check now also considers different spelling conventions like 'u' <-> 'v'

parent 21e135cf
Pipeline #15801 failed with stages
in 3 minutes and 4 seconds
......@@ -4,7 +4,7 @@ from typing import List, Set
import conllu
from conllu import TokenList
from mcserver.app.models import Language, VocabularyCorpus, ResourceType
from mcserver.app.services import NetworkService, FileService, DatabaseService
from mcserver.app.services import NetworkService, FileService, DatabaseService, TextService
from mcserver.models_auto import Exercise, UpdateInfo
from openapi.openapi_server.models import MatchingExercise
......@@ -32,6 +32,8 @@ def get(lang: str, frequency_upper_bound: int, last_update_time: int, vocabulary
for exercise in matching_exercises:
conll: List[TokenList] = conllu.parse(exercise.conll)
lemmata: List[str] = [tok["lemma"] for sent in conll for tok in sent.tokens]
exercise.matching_degree = sum((1 if x in vocabulary_set else 0) for x in lemmata) / len(lemmata) * 100
exercise.matching_degree = \
sum((1 if TextService.check_lemma_in_vocabulary(x, vocabulary_set) else 0) for x in lemmata) / \
len(lemmata) * 100
ret_val: List[dict] = [NetworkService.serialize_exercise(x, compress=True) for x in matching_exercises]
return NetworkService.make_json_response(ret_val)
......@@ -12,21 +12,12 @@ from openapi.openapi_server.models import VocabularyForm
def add_sentence(current_lemmata: Dict[int, str], vocabulary_set: Set[str], sentences: List[Sentence],
current_sentence_id: int):
""" Adds a sentence to the response for the vocabulary check.

Collects the lemmata of the current sentence that are covered by the vocabulary and appends a new Sentence
to `sentences` (modified in place). Its matching degree is the percentage of covered lemmata, or 0 if
`current_lemmata` is empty.
"""
# NOTE(review): `matches` is assigned twice below; presumably the old and the new side of this diff hunk - confirm
matches: List[str] = [current_lemmata[i] for i in current_lemmata if is_match(current_lemmata[i], vocabulary_set)]
matches: List[str] = [current_lemmata[i] for i in current_lemmata if
TextService.check_lemma_in_vocabulary(current_lemmata[i], vocabulary_set)]
# guard against division by zero for sentences without lemmata
new_sentence_matching_degree: float = (len(matches) / len(current_lemmata) * 100) if len(current_lemmata) > 0 else 0
sentences.append(Sentence(id=current_sentence_id, matching_degree=new_sentence_matching_degree))
def check_lemma_suffix(target_lemma: str, vocabulary_set: Set[str]):
    """ Checks whether the lemma is covered by the vocabulary once its ending is swapped for an alternative one.

    For every ending in TextService.suffix_map that the lemma ends with, each alternative ending is appended
    to the remaining stem; True is returned as soon as one of these forms is in the vocabulary set.
    """
    for ending, alternatives in TextService.suffix_map.items():
        ending_length: int = len(ending)
        if target_lemma[-ending_length:] != ending:
            continue
        stem: str = target_lemma[:-ending_length]
        if any((stem + alternative) in vocabulary_set for alternative in alternatives):
            return True
    return False
def check_vocabulary(graph_data: GraphData, vocabulary_set: Set[str]) -> List[Sentence]:
""" Checks whether the lemmata of a given graph/text match a reference vocabulary. """
sentences: List[Sentence] = []
......@@ -59,26 +50,6 @@ def get(frequency_upper_bound: int, query_urn: str, vocabulary: str) -> Response
return NetworkService.make_json_response([x.to_dict() for x in sentences])
def is_match(target_lemma: str, vocabulary_set: Set[str]):
    """ Checks whether a given lemma is part of a reference vocabulary.

    The lemma counts as a match if any of the following holds (checked in this order):
    it is in the vocabulary as is; the part before its first '#' is in the vocabulary; a form with a
    swapped ending is in the vocabulary; it is a known proper noun; or replacing a key of the
    orthography map by its value yields a form from the vocabulary.
    """
    if target_lemma in vocabulary_set:
        return True
    if "#" in target_lemma and target_lemma.partition("#")[0] in vocabulary_set:
        return True
    if check_lemma_suffix(target_lemma, vocabulary_set):
        return True
    if target_lemma in TextService.proper_nouns_set:
        return True
    # doesn't check for spelling variants (minerua/minerva)
    # maybe perform a second check after the orthography check
    # TODO: ADD CASES FOR MISSING ASSIMILATION, E.G. "ADPONERE" INSTEAD OF "APPONERE"
    return any(
        pattern in target_lemma and target_lemma.replace(pattern, substitute) in vocabulary_set
        for pattern, substitute in TextService.orthography_map.items()
    )
def post(vocabulary_data: dict):
""" Indicates for each token of a corpus whether it is covered by a reference vocabulary. """
vf: VocabularyForm = VocabularyForm.from_dict(vocabulary_data)
......@@ -89,7 +60,7 @@ def post(vocabulary_data: dict):
vocabulary_set.add(char)
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=vf.query_urn, is_csm=False)
for node in ar.graph_data.nodes:
if not is_match(target_lemma=node.udep_lemma, vocabulary_set=vocabulary_set):
if not TextService.check_lemma_in_vocabulary(target_lemma=node.udep_lemma, vocabulary_set=vocabulary_set):
node.is_oov = True
ar: AnnisResponse = AnnisResponse(
solutions=[], uri="", exercise_id="", graph_data=ar.graph_data)
......
......@@ -59,15 +59,38 @@ class TextService:
"textField": "*Bilberries*, also known as *blueberries* are edible, nearly black berries found in nutrient-poor soils.<br><br>*Cloudberries* are edible orange berries similar to *raspberries* or *blackberries* found in alpine and arctic tundra. <br><br>*Redcurrants* are red translucent berries with a diameter of 8\u201310 mm, and are closely related to *blackcurrants*.",
"overallFeedback": [{"from": 0, "to": 100, "feedback": "You got @score of @total points."}]
}
orthography_map: Dict[str, str] = {"que": "", "u": "v", "U": "V", "v": "u", "V": "U"}
orthography_map: Dict[str, str] = {"que": ""}
proper_nouns_set: Set[str]
sentence_count_ranges: List[range] = [range(0, 2), range(2, 5), range(5, 10), range(10, 20), range(20, 40),
range(40, 70), range(70, 110), range(110, 160), range(160, sys.maxsize)]
stop_words_latin: Set[str] = set()
suffix_map: Dict[str, Set[str]] = {"e": {"is", "us"}, "um": {"us"}, "us": {"i"}}
u_v_map: Dict[str, str] = {"u": "v", "U": "V", "v": "u", "V": "U"}
word_count_ranges: List[range] = [range(0, 10), range(10, 50), range(50, 100), range(100, 250), range(250, 500),
range(500, 1000), range(1000, 1500), range(1500, 2000), range(2000, sys.maxsize)]
@staticmethod
def check_lemma_in_vocabulary(target_lemma: str, vocabulary_set: Set[str]) -> bool:
    """ Checks if a lemma (or any of its u/v spelling variants) is contained in a given vocabulary.

    Each variant differs from the lemma in exactly one character: a single 'u'/'v' (or 'U'/'V') is swapped
    according to TextService.u_v_map. The variants are tried first, the unchanged lemma last.
    """
    for position, character in enumerate(target_lemma):
        counterpart = TextService.u_v_map.get(character)
        if counterpart is None:
            # not a u/v character, so there is no variant for this position
            continue
        variant: str = target_lemma[:position] + counterpart + target_lemma[position + 1:]
        if TextService.is_match(variant, vocabulary_set):
            return True
    return TextService.is_match(target_lemma, vocabulary_set)
@staticmethod
def check_lemma_suffix(target_lemma: str, vocabulary_set: Set[str]):
    """ Checks whether the vocabulary set contains the lemma with a different ending.

    Every ending from TextService.suffix_map that terminates the lemma is exchanged for each of its mapped
    replacements; the first resulting form found in the vocabulary set makes the check succeed.
    """
    for old_ending, new_endings in TextService.suffix_map.items():
        cut: int = len(old_ending)
        if target_lemma[-cut:] == old_ending:
            base: str = target_lemma[:-cut]
            candidates = (base + new_ending for new_ending in new_endings)
            if any(candidate in vocabulary_set for candidate in candidates):
                return True
    return False
@staticmethod
def get_h5p_text_with_solutions(exercise: Exercise, solution_indices: List[int]) -> str:
""" Builds a string to be used in the textfield property value of a content.json file for H5P
......@@ -119,6 +142,25 @@ class TextService:
stop_words_dict: Dict[str, List[str]] = json.loads(content)
TextService.stop_words_latin = set(y for x in stop_words_dict.values() for y in x)
@staticmethod
def is_match(target_lemma: str, vocabulary_set: Set[str]) -> bool:
    """ Checks whether a given lemma is part of a reference vocabulary.

    The checks run in this order and the first success wins: exact membership; membership of the part
    before the first '#'; membership after swapping the ending (see check_lemma_suffix); membership in
    the proper nouns set; membership after replacing a key of the orthography map by its value.
    """
    if target_lemma in vocabulary_set:
        return True
    if "#" in target_lemma and target_lemma.partition("#")[0] in vocabulary_set:
        return True
    if TextService.check_lemma_suffix(target_lemma, vocabulary_set):
        return True
    if target_lemma in TextService.proper_nouns_set:
        return True
    # TODO: ADD CASES FOR MISSING ASSIMILATION, E.G. "ADPONERE" INSTEAD OF "APPONERE"
    return any(
        pattern in target_lemma and target_lemma.replace(pattern, substitute) in vocabulary_set
        for pattern, substitute in TextService.orthography_map.items()
    )
@staticmethod
def strip_whitespace(text: str) -> str:
""" Removes extra whitespace before punctuation signs, but leaves it for underscores / word gaps. """
......
......@@ -1115,6 +1115,10 @@ class CommonTestCase(unittest.TestCase):
clear_cache()
self.assertEqual(mock_get_request.call_count, 1)
def test_is_match(self):
    """ Checks that the lemma 'neque' is matched by a vocabulary that only contains 'ne'. """
    reference_vocabulary = {"ne"}
    is_covered = TextService.is_match("neque", reference_vocabulary)
    self.assertTrue(is_covered)
def test_load_text_list(self):
""" Loads the text list for a new corpus. """
with patch.object(mcserver.app.services.corpusService.HttpCtsRetriever, 'getPassage',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment