# SimTexter.py (committed by Frederik Arnold)
from sim.Match import Match
from sim.MatchSegment import MatchSegment
from sim.Text import Text
import re
from sim.Token import Token
from rapidfuzz import fuzz, process
from datasketch import MinHash, MinHashLSH


# noinspection PyMethodMayBeStatic
class SimTexter:
    """Find similar passages between two texts, tolerating small spelling differences and gaps.

    Texts are normalised and tokenized, every window of ``min_match_length`` tokens is indexed
    with MinHash/LSH, and candidate positions are chained through "forward references" that
    are then verified and extended token by token using fuzzy string comparison.
    """

    def __init__(self, min_match_length, max_gap):
        """Store the matching parameters.

        Args:
            min_match_length: Number of tokens a passage must span to be reported; also the
                size of the token windows that get indexed.
            max_gap: Maximum number of source tokens that may be skipped while aligning.
        """
        self.min_match_length = min_match_length
        self.max_gap = max_gap

    def compare(self, input_texts):
        """Compare text 0 (source) with text 1 (target) and return the merged matches.

        All given texts are tokenized and indexed, but only the first two are compared.

        Args:
            input_texts: Sequence of strings; at least two entries are required.

        Returns:
            A list of ``(source_segment, target_segment)`` tuples of ``MatchSegment`` objects,
            in the order the matches were found in the source text.
        """
        texts, tokens = self.__read_input(input_texts)

        mts_tags = {}
        forward_references = {}
        lsh = MinHashLSH(threshold=0.80, num_perm=128)

        for text_index, text in enumerate(texts):
            mts_tags, forward_references, lsh = self.__make_forward_references(
                text_index, text, tokens, mts_tags, forward_references, lsh)

        similarities = self.__get_similarities(tokens, texts, 0, 1, forward_references)

        return self.__merge_neighbouring_matches(similarities, tokens, input_texts[1])

    def __merge_neighbouring_matches(self, similarities, tokens, target_text):
        """Fuse consecutive matches whose target segments are (almost) adjacent.

        Args:
            similarities: List of ``(source_segment, target_segment)`` tuples.
            tokens: The global token list.
            target_text: The raw target string; used to look for a ``[...]`` marker.

        Returns:
            A new list in which runs of neighbouring matches are replaced by one merged tuple.
        """
        merged = []
        pending = None  # merged pair that may still absorb the following matches
        total = len(similarities)
        pos = 0

        while pos < total:
            if pending is None:
                if pos + 1 >= total:
                    # Last match and nothing to merge it with.
                    merged.append(similarities[pos])
                    break
                left, right, step = similarities[pos], similarities[pos + 1], 2
            else:
                left, right, step = pending, similarities[pos], 1

            if self.__targets_are_neighbours(left[1], right[1], tokens, target_text):
                pending = (self.__join_segments(left[0], right[0]),
                           self.__join_segments(left[1], right[1]))
                pos += step

                if pos >= total:
                    merged.append(pending)
            elif pending is not None:
                # Flush the merged pair; the match at `pos` is re-examined on the next pass.
                merged.append(pending)
                pending = None
            else:
                merged.append(similarities[pos])
                pos += 1

        return merged

    def __targets_are_neighbours(self, current_target, next_target, tokens, target_text):
        """Tell whether two target segments are close enough to be merged.

        They qualify when one or two tokens lie between them, or when they are directly
        adjacent and the raw target text between them contains ``[...]``.

        NOTE(review): the original condition tested this target gap twice; the first test was
        possibly meant to look at the source segments. Only the target is checked here, which
        keeps the previous behaviour - confirm the intent.
        """
        gap = next_target.token_start_pos - (current_target.token_start_pos + current_target.token_length)

        if 1 <= gap <= 2:
            return True

        if gap != 0:
            return False

        between = target_text[tokens[next_target.token_start_pos - 1].end_pos:
                              tokens[next_target.token_start_pos].start_pos]
        return '[...]' in between

    def __join_segments(self, first, second):
        """Return one segment reaching from the start of `first` to the end of `second`.

        NOTE(review): the token length is the sum of both lengths, so tokens lying in the gap
        between the two segments are not counted (unchanged from the original code).
        """
        return MatchSegment(first.text_index, first.token_start_pos,
                            first.token_length + second.token_length,
                            first.character_start_pos, second.character_end_pos)

    def __read_input(self, input_texts):
        """Tokenize all texts into one shared token list.

        Returns:
            A tuple ``(texts, tokens)``: one ``Text`` per input holding its token range
            ``[tk_start_pos, tk_end_pos)`` within ``tokens``, and the list of all tokens.
        """
        texts = []
        tokens = []

        for input_text in input_texts:
            tk_start_pos = len(tokens)
            tokens.extend(self.tokenize_text(input_text))
            tk_end_pos = len(tokens)

            texts.append(Text('Text', len(input_text), len(input_text.split()), 'dummy',
                              tk_start_pos, tk_end_pos))

        return texts, tokens

    def tokenize_text(self, input_text):
        """Split a text into normalised ``Token`` objects.

        Each token carries the normalised word and the character offsets of the word in the
        cleaned text. Words that are empty after normalisation are dropped.
        """
        cleaned_text = self.clean_text(input_text)
        tokens = []

        for match in re.finditer(r"\S+", cleaned_text):
            word = self.__clean_word(match.group())

            if word:
                tokens.append(Token(word, match.start(), match.end()))

        return tokens

    def clean_text(self, input_text):
        """Blank out everything except ASCII letters, German umlauts/ß and spaces; lowercase.

        Every removed character (including digits and line breaks) is replaced by exactly one
        space, so the result has the same length as the input.
        """
        # TODO: make this optional
        # Digits are removed as well, so they are simply left out of the allowed set.
        return re.sub(r"[^a-zA-ZäüöÄÜÖß ]", " ", input_text).lower()

    def __clean_word(self, input_word):
        """Normalise spelling variants: transliterate ß/umlauts and map 'ey' to 'ei'."""
        # TODO: make this optional
        for old, new in (('ß', 'ss'), ('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ey', 'ei')):
            input_word = input_word.replace(old, new)

        return input_word

    def __make_forward_references(self, text_index, text, tokens, mts_tags, forward_references, lsh):
        """Index every window of ``min_match_length`` tokens of `text`.

        Windows of text 0 are inserted into the LSH index. Windows of every other text are
        looked up in it; on a hit, a forward reference is recorded from the last known position
        of the matched tag to the current position.

        Args:
            text_index: Index of `text` in the list of texts.
            text: ``Text`` whose token range is processed.
            tokens: The global token list.
            mts_tags: Dict mapping a window tag to the last position it was seen at (updated).
            forward_references: Dict mapping a token position to the next candidate position
                (updated).
            lsh: ``MinHashLSH`` index (updated for text 0, queried otherwise).

        Returns:
            The tuple ``(mts_tags, forward_references, lsh)``.
        """
        # "+ 1" so the last full window (starting at tk_end_pos - min_match_length) is indexed
        # too; without it a passage at the very end of a text could never start a match.
        window_starts = range(text.tk_start_pos, text.tk_end_pos - self.min_match_length + 1)

        for i in window_starts:
            tag = ''.join(token.text for token in tokens[i: i + self.min_match_length])

            # The MinHash is built from the set of characters of the tag.
            min_hash = MinHash(num_perm=128)

            for character in set(tag):
                min_hash.update(character.encode('utf8'))

            if text_index == 0:
                lsh.insert(tag, min_hash, False)
            else:
                candidates = lsh.query(min_hash)

                if candidates:
                    closest_match = self.__get_closest_match(candidates, tag)

                    if closest_match:
                        forward_references[mts_tags[closest_match]] = i

            mts_tags[tag] = i

        return mts_tags, forward_references, lsh

    def __get_similarities(self, tokens, texts, source_text_index, target_text_index, forward_references):
        """Walk through the source text and collect the best match for each position.

        After a match the walk continues behind the matched source tokens, otherwise at the
        next token.

        Returns:
            A list of ``(source_segment, target_segment)`` tuples of ``MatchSegment`` objects.
        """
        source_token_start_pos = texts[source_text_index].tk_start_pos
        source_token_end_pos = texts[source_text_index].tk_end_pos

        similarities = []

        while source_token_start_pos + self.min_match_length <= source_token_end_pos:
            best_match = self.__get_best_match(tokens, texts, source_text_index, target_text_index,
                                               source_token_start_pos, forward_references)

            if not best_match or best_match.source_length <= 0:
                source_token_start_pos += 1
                continue

            source_first = best_match.source_token_start_pos
            source_last = source_first + best_match.source_length - 1
            target_first = best_match.target_token_start_pos
            target_last = target_first + best_match.target_length - 1

            similarities.append((
                MatchSegment(best_match.source_text_index, source_first, best_match.source_length,
                             tokens[source_first].start_pos, tokens[source_last].end_pos),
                MatchSegment(best_match.target_text_index, target_first, best_match.target_length,
                             tokens[target_first].start_pos, tokens[target_last].end_pos)))

            source_token_start_pos += best_match.source_length

        return similarities

    def __token_text(self, tokens, pos, text):
        """Return the text of ``tokens[pos]`` if `pos` lies inside `text`, otherwise ``''``.

        The empty string never fuzzy-matches anything, so look-ahead and look-behind positions
        outside the text are simply treated as "no match" instead of raising ``IndexError``
        or reading tokens that belong to another text.
        """
        if text.tk_start_pos <= pos < text.tk_end_pos:
            return tokens[pos].text

        return ''

    def __get_best_match(self, tokens, texts, source_text_index, target_text_index, source_token_start_pos,
                         forward_references):
        """Find the longest match in the target text for the passage starting at a source position.

        Candidate target positions are reached by following `forward_references` from
        `source_token_start_pos`. Each candidate is first verified backwards over the required
        minimum number of tokens and then extended forwards as far as the tokens keep matching.

        NOTE(review): `offset_source`, `offset_target` and `has_skipped` are shared by all
        candidates of one call rather than being reset per candidate (unchanged from the
        original code) - confirm that this is intended.

        Returns:
            A ``Match`` describing the best candidate, or ``None`` if there is none.
        """
        source_text = texts[source_text_index]
        target_text = texts[target_text_index]

        best_match = None
        best_match_length = 0
        offset_source = 0
        offset_target = 0
        has_skipped = False

        token_pos = source_token_start_pos

        while 0 <= token_pos < len(tokens):
            # Where to continue after this candidate; -1 terminates the loop.
            next_token_pos = forward_references.get(token_pos, -1)

            # Positions before the target text are only stepping stones of the chain.
            if token_pos < target_text.tk_start_pos:
                token_pos = next_token_pos
                continue

            # A later candidate is only of interest if it beats the best match so far.
            min_match_length = self.min_match_length

            if best_match_length > 0:
                min_match_length = best_match_length + 1

            source_token_pos = source_token_start_pos + min_match_length - 1
            target_token_pos = token_pos + min_match_length - 1

            fits = (source_token_pos < source_text.tk_end_pos
                    and target_text.tk_end_pos > target_token_pos >= source_token_pos + min_match_length)

            if not fits:
                token_pos = next_token_pos
                continue

            # Verify the required tokens from the last one backwards.
            cnt = min_match_length

            while cnt > 0:
                target_word = tokens[target_token_pos].text

                if self.__fuzzy_match(self.__token_text(tokens, source_token_pos, source_text),
                                      target_word) > 80:
                    source_token_pos -= 1
                    target_token_pos -= 1
                    cnt -= 1
                    continue

                # Try to skip up to max_gap source tokens (never before the source text start).
                found = False

                for i in range(1, self.max_gap + 1):
                    if self.__fuzzy_match(self.__token_text(tokens, source_token_pos - i, source_text),
                                          target_word) > 80:
                        source_token_pos = source_token_pos - 1 - i
                        target_token_pos -= 1
                        cnt -= 1
                        found = True
                        break

                if not found:
                    break

            if cnt > 0:
                token_pos = next_token_pos
                continue

            # Extend the verified match forwards.
            new_match_length = min_match_length
            source_token_pos = source_token_start_pos + min_match_length
            target_token_pos = token_pos + min_match_length

            while (source_token_pos < source_text.tk_end_pos
                   and target_text.tk_end_pos > target_token_pos > source_token_pos + new_match_length):

                source_word = tokens[source_token_pos].text
                target_word = tokens[target_token_pos].text
                # '' when the following token would lie outside the respective text.
                next_source_word = self.__token_text(tokens, source_token_pos + 1, source_text)
                next_target_word = self.__token_text(tokens, target_token_pos + 1, target_text)

                if self.__fuzzy_match(source_word, target_word) > 80:
                    source_token_pos += 1
                    target_token_pos += 1
                    new_match_length += 1
                elif next_source_word and self.__fuzzy_match(source_word + next_source_word, target_word) > 80:
                    # Two source tokens correspond to one target token.
                    source_token_pos += 2
                    target_token_pos += 1
                    new_match_length += 2
                    offset_target += 1
                elif next_target_word and self.__fuzzy_match(source_word, target_word + next_target_word) > 80:
                    # One source token corresponds to two target tokens.
                    source_token_pos += 1
                    target_token_pos += 2
                    new_match_length += 2
                    offset_source += 1
                elif not has_skipped:
                    found = False

                    # Skip up to max_gap source tokens (never beyond the source text end).
                    for i in range(1, self.max_gap + 1):
                        if self.__fuzzy_match(self.__token_text(tokens, source_token_pos + i, source_text),
                                              target_word) > 80:
                            source_token_pos = source_token_pos + 1 + i
                            target_token_pos += 1
                            new_match_length = new_match_length + 1 + i
                            offset_target += i
                            found = True
                            has_skipped = True
                            break

                    # Otherwise try skipping a single target token.
                    if not found and self.__fuzzy_match(source_word, next_target_word) > 80:
                        source_token_pos += 1
                        target_token_pos += 2
                        new_match_length += 2
                        offset_source += 1
                        found = True
                        has_skipped = True

                    if not found:
                        break
                else:
                    break

            if new_match_length >= self.min_match_length and new_match_length > best_match_length:
                best_match_length = new_match_length
                best_match = Match(source_text_index, source_token_start_pos, target_text_index, token_pos,
                                   best_match_length - offset_source, best_match_length - offset_target)

            token_pos = next_token_pos

        return best_match

    def __fuzzy_match(self, input1, input2):
        """Return the ``fuzz.ratio`` of two strings, or 0 if either is shorter than two characters."""
        if len(input1) < 2 or len(input2) < 2:
            return 0

        return fuzz.ratio(input1, input2)

    def __get_closest_match(self, candidates, word):
        """Pick the candidate that best matches `word`.

        Returns:
            `word` itself if it is among the candidates, otherwise the best candidate found by
            ``process.extractOne`` with ``fuzz.ratio`` and a score cutoff of 80, or ``None``.
        """
        if not candidates:
            return None

        if word in candidates:
            return word

        best_existing_tag = process.extractOne(word, candidates, scorer=fuzz.ratio, score_cutoff=80)

        if best_existing_tag:
            return best_existing_tag[0]

        return None

    def __print_similarities(self, similarities, input_texts):
        """Debugging aid: print character offsets and content of every match in both texts."""
        literature_content = input_texts[0]
        scientific_content = input_texts[1]

        lines = []

        for similarity_literature, similarity_scientific in similarities:
            for content, segment in ((literature_content, similarity_literature),
                                     (scientific_content, similarity_scientific)):
                start = segment.character_start_pos
                end = segment.character_end_pos
                lines.append('\n' + str(start) + '\t' + str(end) + '\t' + content[start:end])

        print(''.join(lines))