Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Quid
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Terraform modules
Analyze
Contributor analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Schlüsselstellen
Quid
Commits
aff598d1
Commit
aff598d1
authored
4 years ago
by
Frederik Arnold
Browse files
Options
Downloads
Patches
Plain Diff
Improvements
parent
79d3ab45
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
evaluation/Result.py
+2
-1
2 additions, 1 deletion
evaluation/Result.py
evaluation/Test2.py
+24
-15
24 additions, 15 deletions
evaluation/Test2.py
sim/SimTexter.py
+12
-4
12 additions, 4 deletions
sim/SimTexter.py
with
38 additions
and
20 deletions
evaluation/Result.py
+
2
−
1
View file @
aff598d1
class
Result
:
def
__init__
(
self
,
pos
,
name
,
total_matches
,
total_gold_items
,
precision
,
recall
):
def
__init__
(
self
,
pos
,
name
,
total_matches
,
total_gold_items
,
precision
,
recall
,
true_positives_count
):
self
.
pos
=
pos
self
.
name
=
name
self
.
total_matches
=
total_matches
self
.
total_gold_items
=
total_gold_items
self
.
precision
=
precision
self
.
recall
=
recall
self
.
true_positives_count
=
true_positives_count
This diff is collapsed.
Click to expand it.
evaluation/Test2.py
+
24
−
15
View file @
aff598d1
...
...
@@ -14,7 +14,7 @@ class GoldItem:
self
.
text
=
text
def
__str__
(
self
):
return
"
Gold Item (
"
+
str
(
self
.
start
)
+
"
,
"
+
str
(
self
.
end
)
+
"
,
"
+
self
.
text
+
"
)
"
return
"
Gold Item (
"
+
str
(
self
.
start
)
+
"
:
"
+
str
(
self
.
end
)
+
"
,
"
+
self
.
text
+
"
)
"
def
__eq__
(
self
,
other
):
if
not
isinstance
(
other
,
GoldItem
):
...
...
@@ -58,7 +58,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
num_gold_items
=
len
(
gold_items
)
scientific_content
=
open
(
scientific_file
,
"
r
"
).
read
()
.
replace
(
'
\n
'
,
'
'
)
scientific_content
=
open
(
scientific_file
,
"
r
"
).
read
()
sim_texter
=
SimTexter
(
5
,
3
)
texts
=
[
literature_content
,
scientific_content
]
similarities
=
sim_texter
.
compare
(
texts
)
...
...
@@ -87,9 +87,13 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
print_line
(
'
\n
False positives:
'
,
output_file
)
for
i
in
found_false_positives
:
start
=
similarities
[
i
][
1
].
character_start_pos
end
=
similarities
[
i
][
1
].
character_end_pos
print_line
(
'
false positive:
'
+
str
(
i
+
1
)
+
'
:
'
+
scientific_content
[
start
:
end
],
output_file
)
start_lit
=
similarities
[
i
][
0
].
character_start_pos
end_lit
=
similarities
[
i
][
0
].
character_end_pos
start_int
=
similarities
[
i
][
1
].
character_start_pos
end_int
=
similarities
[
i
][
1
].
character_end_pos
print_line
(
'
false positive:
'
+
str
(
i
+
1
)
+
'
: lit (
'
+
str
(
start_lit
)
+
'
:
'
+
str
(
end_lit
)
+
'
), int (
'
+
str
(
start_int
)
+
'
:
'
+
str
(
end_int
)
+
'
):
'
+
scientific_content
[
start_int
:
end_int
],
output_file
)
print_line
(
'
\n
False negatives:
'
,
output_file
)
...
...
@@ -106,7 +110,7 @@ def process_file(queue, literature_content, scientific_file, gold_file, pos):
if
num_gold_items
>
0
:
recall
=
true_positives_count
/
num_gold_items
result
=
Result
(
pos
,
filename
,
num_matches
,
num_gold_items
,
precision
,
recall
)
result
=
Result
(
pos
,
filename
,
num_matches
,
num_gold_items
,
precision
,
recall
,
true_positives_count
)
queue
.
put
(
result
)
...
...
@@ -127,7 +131,7 @@ if isfile(scientific_path) and scientific_path.endswith(".txt"):
else
:
start_time
=
time
.
time
()
literature_content
=
open
(
literature_path
,
"
r
"
).
read
()
.
replace
(
'
\n
'
,
'
'
)
literature_content
=
open
(
literature_path
,
"
r
"
).
read
()
processes
=
[]
queue
=
Queue
()
results
=
[]
...
...
@@ -147,24 +151,29 @@ else:
p
.
start
()
for
process
in
processes
:
result
=
queue
.
get
()
# will block
result
=
queue
.
get
()
results
.
append
(
result
)
process
.
join
()
precision_sum
=
0
recall_sum
=
0
total_matches
=
0
total_gold_items
=
0
total_true_positives
=
0
for
result
in
results
:
precision_sum
+=
result
.
precision
recall_sum
+=
result
.
recall
total_matches
+=
result
.
total_matches
total_gold_items
+=
result
.
total_gold_items
total_true_positives
+=
result
.
true_positives_count
result_string
=
'
\n
(
'
+
str
(
result
.
pos
)
+
'
/
'
+
str
(
count
)
+
'
:
'
+
result
.
name
+
'
\n
Total matches:
'
+
str
(
result_string
=
'
\n
(
'
+
str
(
result
.
pos
)
+
'
/
'
+
str
(
count
)
+
'
)
:
'
+
result
.
name
+
'
\n
Total matches:
'
+
str
(
result
.
total_matches
)
+
'
, total gold items:
'
+
str
(
result
.
total_gold_items
)
+
'
\n
Precision:
'
+
str
(
result
.
precision
)
+
'
\n
Recall:
'
+
str
(
result
.
recall
)
print
(
result_string
)
print
(
'
\n
precision overall:
'
+
str
(
precision_sum
/
count
))
print
(
'
recall overall:
'
+
str
(
recall_sum
/
count
))
precision
=
total_true_positives
/
total_matches
recall
=
total_true_positives
/
total_gold_items
print
(
'
\n
precision overall:
'
+
str
(
precision
))
print
(
'
recall overall:
'
+
str
(
recall
))
print
(
"
\n\n
--- %s seconds ---
"
%
(
time
.
time
()
-
start_time
))
This diff is collapsed.
Click to expand it.
sim/SimTexter.py
+
12
−
4
View file @
aff598d1
...
...
@@ -56,8 +56,12 @@ class SimTexter:
current_target_sim
=
similarities
[
pos
][
1
]
next_target_sim
=
similarities
[
pos
+
1
][
1
]
if
(
0
<=
next_target_sim
.
token_start_pos
-
(
current_target_sim
.
token_start_pos
+
current_target_sim
.
token_length
)
<=
2
):
if
((
1
<=
next_target_sim
.
token_start_pos
-
(
current_target_sim
.
token_start_pos
+
current_target_sim
.
token_length
)
<=
2
)
and
(
1
<=
next_target_sim
.
token_start_pos
-
(
current_target_sim
.
token_start_pos
+
current_target_sim
.
token_length
)
<=
2
))
or
(
next_target_sim
.
token_start_pos
-
(
current_target_sim
.
token_start_pos
+
current_target_sim
.
token_length
)
==
0
and
'
[...]
'
in
input_texts
[
1
][
tokens
[
next_target_sim
.
token_start_pos
-
1
].
end_pos
:
tokens
[
next_target_sim
.
token_start_pos
].
start_pos
]):
current_match_segment
=
(
MatchSegment
(
current_source_sim
.
text_index
,
current_source_sim
.
token_start_pos
,
current_source_sim
.
token_length
+
next_source_sim
.
token_length
,
...
...
@@ -123,7 +127,8 @@ class SimTexter:
def
clean_text
(
self
,
input_text
):
# TODO: optional machen
input_text
=
re
.
sub
(
"
[.?!,‚‘
'
»«;:/()+
\\
-–
\\
[
\\
]…
\"
_]
"
,
"
"
,
input_text
)
input_text
=
re
.
sub
(
"
[^a-zA-Z0-9äüöÄÜÖß ]
"
,
"
"
,
input_text
)
# input_text = re.sub("[.?!,‚‘'’»«<>;:/()+\\-–\\[\\]…\"_\r\n]", " ", input_text)
input_text
=
re
.
sub
(
"
[0-9]
"
,
"
"
,
input_text
)
return
input_text
.
lower
()
...
...
@@ -214,7 +219,7 @@ class SimTexter:
offset_target
=
0
has_skipped
=
False
while
0
<
token_pos
<
len
(
tokens
):
while
0
<
=
token_pos
<
len
(
tokens
):
if
token_pos
<
texts
[
target_text_index
].
tk_start_pos
:
if
token_pos
in
forward_references
:
...
...
@@ -331,6 +336,9 @@ class SimTexter:
def
__fuzzy_match
(
self
,
input1
,
input2
):
if
len
(
input1
)
<
2
or
len
(
input2
)
<
2
:
return
0
ratio
=
fuzz
.
ratio
(
input1
,
input2
)
return
ratio
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment