Improvements

aff598d1 · Frederik Arnold · 79d3ab45
Commit aff598d1 authored 4 years ago by Frederik Arnold
--- a/evaluation/Result.py
+++ b/evaluation/Result.py
 class Result:
-    def __init__(self, pos, name, total_matches, total_gold_items, precision, recall):
+    def __init__(self, pos, name, total_matches, total_gold_items, precision, recall, true_positives_count):
        self.pos = pos
        self.name = name
        self.total_matches = total_matches
        self.total_gold_items = total_gold_items
        self.precision = precision
        self.recall = recall
+        self.true_positives_count = true_positives_count
--- a/evaluation/Test2.py
+++ b/evaluation/Test2.py
@@ -14,7 +14,7 @@ class GoldItem:
        self.text = text

    def __str__(self):
-        return "Gold Item (" + str(self.start) + ", " + str(self.end) + ", " + self.text + ")"
+        return "Gold Item (" + str(self.start) + ":" + str(self.end) + ", " + self.text + ")"

    def __eq__(self, other):
        if not isinstance(other, GoldItem):
@@ -58,7 +58,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):

    num_gold_items = len(gold_items)

-    scientific_content = open(scientific_file, "r").read().replace('\n', ' ')
+    scientific_content = open(scientific_file, "r").read()
    sim_texter = SimTexter(5, 3)
    texts = [literature_content, scientific_content]
    similarities = sim_texter.compare(texts)
@@ -87,9 +87,13 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
    print_line('\nFalse positives:', output_file)

    for i in found_false_positives:
-        start = similarities[i][1].character_start_pos
-        end = similarities[i][1].character_end_pos
-        print_line('false positive: ' + str(i + 1) + ': ' + scientific_content[start:end], output_file)
+        start_lit = similarities[i][0].character_start_pos
+        end_lit = similarities[i][0].character_end_pos
+        start_int = similarities[i][1].character_start_pos
+        end_int = similarities[i][1].character_end_pos
+
+        print_line('false positive: ' + str(i + 1) + ': lit (' + str(start_lit) + ':' + str(end_lit) + '), int (' + str(
+            start_int) + ':' + str(end_int) + '): ' + scientific_content[start_int:end_int], output_file)

    print_line('\nFalse negatives:', output_file)

@@ -106,7 +110,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
    if num_gold_items > 0:
        recall = true_positives_count / num_gold_items

-    result = Result(pos, filename, num_matches, num_gold_items, precision, recall)
+    result = Result(pos, filename, num_matches, num_gold_items, precision, recall, true_positives_count)

    queue.put(result)

@@ -127,7 +131,7 @@ if isfile(scientific_path) and scientific_path.endswith(".txt"):
 else:
    start_time = time.time()

-    literature_content = open(literature_path, "r").read().replace('\n', ' ')
+    literature_content = open(literature_path, "r").read()
    processes = []
    queue = Queue()
    results = []
@@ -147,24 +151,29 @@ else:
            p.start()

    for process in processes:
-        result = queue.get()  # will block
+        result = queue.get()
        results.append(result)
        process.join()

-    precision_sum = 0
-    recall_sum = 0
+    total_matches = 0
+    total_gold_items = 0
+    total_true_positives = 0

    for result in results:
-        precision_sum += result.precision
-        recall_sum += result.recall
+        total_matches += result.total_matches
+        total_gold_items += result.total_gold_items
+        total_true_positives += result.true_positives_count

-        result_string = '\n(' + str(result.pos) + '/' + str(count) + ': ' + result.name + '\nTotal matches: ' + str(
+        result_string = '\n(' + str(result.pos) + '/' + str(count) + '): ' + result.name + '\nTotal matches: ' + str(
            result.total_matches) + ', total gold items: ' + str(result.total_gold_items) + '\nPrecision: ' + str(
            result.precision) + '\nRecall: ' + str(result.recall)

        print(result_string)

-    print('\nprecision overall: ' + str(precision_sum / count))
-    print('recall overall: ' + str(recall_sum / count))
+    precision = total_true_positives / total_matches
+    recall = total_true_positives / total_gold_items
+
+    print('\nprecision overall: ' + str(precision))
+    print('recall overall: ' + str(recall))

    print("\n\n--- %s seconds ---" % (time.time() - start_time))
--- a/sim/SimTexter.py
+++ b/sim/SimTexter.py
@@ -56,8 +56,12 @@ class SimTexter:
                current_target_sim = similarities[pos][1]
                next_target_sim = similarities[pos + 1][1]

-            if (0 <= next_target_sim.token_start_pos - (
-                    current_target_sim.token_start_pos + current_target_sim.token_length) <= 2):
+            if ((1 <= next_target_sim.token_start_pos - (
+                    current_target_sim.token_start_pos + current_target_sim.token_length) <= 2) and (
+                        1 <= next_target_sim.token_start_pos - (
+                        current_target_sim.token_start_pos + current_target_sim.token_length) <= 2)) or (
+                    next_target_sim.token_start_pos - (current_target_sim.token_start_pos + current_target_sim.token_length) == 0
+                    and '[...]' in input_texts[1][tokens[next_target_sim.token_start_pos-1].end_pos:tokens[next_target_sim.token_start_pos].start_pos]):

                current_match_segment = (MatchSegment(current_source_sim.text_index, current_source_sim.token_start_pos,
                                                      current_source_sim.token_length + next_source_sim.token_length,
@@ -123,7 +127,8 @@ class SimTexter:
    def clean_text(self, input_text):
        # TODO: optional machen

-        input_text = re.sub("[.?!,‚‘'»«;:/()+\\-–\\[\\]…\"_]", " ", input_text)
+        input_text = re.sub("[^a-zA-Z0-9äüöÄÜÖß ]", " ", input_text)
+        # input_text = re.sub("[.?!,‚‘'’»«<>;:/()+\\-–\\[\\]…\"_\r\n]", " ", input_text)
        input_text = re.sub("[0-9]", " ", input_text)

        return input_text.lower()
@@ -214,7 +219,7 @@ class SimTexter:
        offset_target = 0
        has_skipped = False

-        while 0 < token_pos < len(tokens):
+        while 0 <= token_pos < len(tokens):

            if token_pos < texts[target_text_index].tk_start_pos:
                if token_pos in forward_references:
@@ -331,6 +336,9 @@ class SimTexter:

    def __fuzzy_match(self, input1, input2):

+        if len(input1) < 2 or len(input2) < 2:
+            return 0
+
        ratio = fuzz.ratio(input1, input2)
        return ratio