Skip to content
Snippets Groups Projects
Commit aff598d1 authored by Frederik Arnold's avatar Frederik Arnold
Browse files

Improvements

parent 79d3ab45
No related branches found
No related tags found
No related merge requests found
class Result:
    """Evaluation figures for one processed file.

    A plain value holder: every constructor argument is stored unchanged on
    the attribute of the same name.
    """

    def __init__(self, pos, name, total_matches, total_gold_items, precision, recall,
                 true_positives_count=0):
        """Store the evaluation figures.

        Args:
            pos: Position of the file in the run (printed as ``pos/count``).
            name: Name of the processed file.
            total_matches: Number of matches found for the file.
            total_gold_items: Number of gold items for the file.
            precision: Precision computed for the file.
            recall: Recall computed for the file.
            true_positives_count: Number of true positives for the file.
                Defaults to 0 only so that six-argument callers keep working;
                pass it explicitly wherever overall precision/recall are
                derived from the summed counts.
        """
        self.pos = pos
        self.name = name
        self.total_matches = total_matches
        self.total_gold_items = total_gold_items
        self.precision = precision
        self.recall = recall
        self.true_positives_count = true_positives_count
......@@ -14,7 +14,7 @@ class GoldItem:
self.text = text
def __str__(self):
    """Return a readable description of the form ``Gold Item (start:end, text)``."""
    # Exactly one return statement: a second return placed after the first
    # could never execute, so only the "start:end" range format is kept.
    return "Gold Item (" + str(self.start) + ":" + str(self.end) + ", " + self.text + ")"
def __eq__(self, other):
if not isinstance(other, GoldItem):
......@@ -58,7 +58,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
num_gold_items = len(gold_items)
scientific_content = open(scientific_file, "r").read().replace('\n', ' ')
scientific_content = open(scientific_file, "r").read()
sim_texter = SimTexter(5, 3)
texts = [literature_content, scientific_content]
similarities = sim_texter.compare(texts)
......@@ -87,9 +87,13 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
print_line('\nFalse positives:', output_file)
for i in found_false_positives:
start = similarities[i][1].character_start_pos
end = similarities[i][1].character_end_pos
print_line('false positive: ' + str(i + 1) + ': ' + scientific_content[start:end], output_file)
start_lit = similarities[i][0].character_start_pos
end_lit = similarities[i][0].character_end_pos
start_int = similarities[i][1].character_start_pos
end_int = similarities[i][1].character_end_pos
print_line('false positive: ' + str(i + 1) + ': lit (' + str(start_lit) + ':' + str(end_lit) + '), int (' + str(
start_int) + ':' + str(end_int) + '): ' + scientific_content[start_int:end_int], output_file)
print_line('\nFalse negatives:', output_file)
......@@ -106,7 +110,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
if num_gold_items > 0:
recall = true_positives_count / num_gold_items
result = Result(pos, filename, num_matches, num_gold_items, precision, recall)
result = Result(pos, filename, num_matches, num_gold_items, precision, recall, true_positives_count)
queue.put(result)
......@@ -127,7 +131,7 @@ if isfile(scientific_path) and scientific_path.endswith(".txt"):
else:
start_time = time.time()
literature_content = open(literature_path, "r").read().replace('\n', ' ')
literature_content = open(literature_path, "r").read()
processes = []
queue = Queue()
results = []
......@@ -147,24 +151,29 @@ else:
p.start()
for process in processes:
result = queue.get() # will block
result = queue.get()
results.append(result)
process.join()
precision_sum = 0
recall_sum = 0
total_matches = 0
total_gold_items = 0
total_true_positives = 0
for result in results:
precision_sum += result.precision
recall_sum += result.recall
total_matches += result.total_matches
total_gold_items += result.total_gold_items
total_true_positives += result.true_positives_count
result_string = '\n(' + str(result.pos) + '/' + str(count) + ': ' + result.name + '\nTotal matches: ' + str(
result_string = '\n(' + str(result.pos) + '/' + str(count) + '): ' + result.name + '\nTotal matches: ' + str(
result.total_matches) + ', total gold items: ' + str(result.total_gold_items) + '\nPrecision: ' + str(
result.precision) + '\nRecall: ' + str(result.recall)
print(result_string)
print('\nprecision overall: ' + str(precision_sum / count))
print('recall overall: ' + str(recall_sum / count))
precision = total_true_positives / total_matches
recall = total_true_positives / total_gold_items
print('\nprecision overall: ' + str(precision))
print('recall overall: ' + str(recall))
print("\n\n--- %s seconds ---" % (time.time() - start_time))
......@@ -56,8 +56,12 @@ class SimTexter:
current_target_sim = similarities[pos][1]
next_target_sim = similarities[pos + 1][1]
if (0 <= next_target_sim.token_start_pos - (
current_target_sim.token_start_pos + current_target_sim.token_length) <= 2):
if ((1 <= next_target_sim.token_start_pos - (
current_target_sim.token_start_pos + current_target_sim.token_length) <= 2) and (
1 <= next_target_sim.token_start_pos - (
current_target_sim.token_start_pos + current_target_sim.token_length) <= 2)) or (
next_target_sim.token_start_pos - (current_target_sim.token_start_pos + current_target_sim.token_length) == 0
and '[...]' in input_texts[1][tokens[next_target_sim.token_start_pos-1].end_pos:tokens[next_target_sim.token_start_pos].start_pos]):
current_match_segment = (MatchSegment(current_source_sim.text_index, current_source_sim.token_start_pos,
current_source_sim.token_length + next_source_sim.token_length,
......@@ -123,7 +127,8 @@ class SimTexter:
def clean_text(self, input_text):
    """Normalise text: keep only letters and spaces, then lowercase.

    Every character that is not an ASCII letter, one of ``äüöÄÜÖß`` or a
    space (digits, punctuation, brackets, line breaks, ...) is replaced by
    exactly one space. The replacement is one-for-one, so the returned
    string has the same length as ``input_text``.

    Args:
        input_text: The text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # TODO: make this cleaning step optional
    # A single negated character class covers what used to be three passes
    # (punctuation list, non-alphanumerics, digits): anything outside the
    # allowed letters and the space becomes a space.
    input_text = re.sub("[^a-zA-ZäüöÄÜÖß ]", " ", input_text)
    return input_text.lower()
......@@ -214,7 +219,7 @@ class SimTexter:
offset_target = 0
has_skipped = False
while 0 < token_pos < len(tokens):
while 0 <= token_pos < len(tokens):
if token_pos < texts[target_text_index].tk_start_pos:
if token_pos in forward_references:
......@@ -331,6 +336,9 @@ class SimTexter:
def __fuzzy_match(self, input1, input2):
    """Return the ``fuzz.ratio`` score of the two inputs, or 0 for short inputs.

    Inputs shorter than two items are never compared; 0 is returned for them
    without calling ``fuzz.ratio``.
    """
    both_long_enough = len(input1) >= 2 and len(input2) >= 2
    if both_long_enough:
        # NOTE(review): `fuzz` is presumably the fuzzy-string-matching module
        # imported at the top of the file; confirm its score range there.
        return fuzz.ratio(input1, input2)
    return 0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment