Skip to content
Snippets Groups Projects
SimTexter.py 11.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • Frederik Arnold's avatar
    Frederik Arnold committed
    from sim.Match import Match
    from sim.MatchSegment import MatchSegment
    from sim.Text import Text
    import re
    from sim.Token import Token
    from rapidfuzz import fuzz, process
    from datasketch import MinHash, MinHashLSH
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
    # noinspection PyMethodMayBeStatic
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    class SimTexter:
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __init__(self, min_match_length, max_gap):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            self.min_match_length = min_match_length
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            self.max_gap = max_gap
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
        def compare(self, input_texts):
            """Find the similar passages between the first two input texts.

            Every input text is tokenized and indexed, but matches are only
            searched with text 0 as source and text 1 as target.

            :param input_texts: sequence of raw text strings.
            :return: the list produced by ``__get_similarities``: one
                ``(source MatchSegment, target MatchSegment)`` tuple per match.
            """
            texts, tokens = self.__read_input(input_texts)

            # State that is handed from one indexing pass to the next.
            mts_tags = {}
            forward_references = {}
            lsh = MinHashLSH(threshold=0.80, num_perm=128)

            for text_index, text in enumerate(texts):
                mts_tags, forward_references, lsh = self.__make_forward_references(
                    text_index, text, tokens, mts_tags, forward_references, lsh)

            return self.__get_similarities(tokens, texts, 0, 1, forward_references)
    
            # self.print_similarities(similarities, input_texts)
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __read_input(self, input_texts):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
            texts = []
            tokens = []
    
            for input_text in input_texts:
                nr_of_characters = len(input_text)
                nr_of_words = len(input_text.split())
                file_name = 'dummy'
                tk_start_pos = len(tokens)
    
                tokens.extend(self.tokenize_text(input_text))
                tk_end_pos = len(tokens)
                text = Text('Text', nr_of_characters, nr_of_words, file_name, tk_start_pos, tk_end_pos)
                texts.append(text)
    
            return texts, tokens
    
        def tokenize_text(self, input_text):
            cleaned_text = self.clean_text(input_text)
    
            tokens = []
    
            for match in re.finditer("[^\\s]+", cleaned_text):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                token = self.__clean_word(match.group())
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
                if len(token) > 0:
                    text_begin_pos = match.start()
                    text_end_pos = match.end()
    
                    tokens.append(Token(token, text_begin_pos, text_end_pos))
    
            return tokens
    
        def clean_text(self, input_text):
            # TODO: optional machen
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            input_text = re.sub("[.?!,‚‘'»«;:/()+\\-–\\[\\]…]", " ", input_text)
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            input_text = re.sub("[0-9]", " ", input_text)
    
            return input_text.lower()
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __clean_word(self, input_word):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            # TODO: optional machen
            input_word = input_word.replace('ß', 'ss')
            input_word = input_word.replace('ä', 'ae')
            input_word = input_word.replace('ö', 'oe')
            input_word = input_word.replace('ü', 'ue')
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            input_word = input_word.replace('ey', 'ei')
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            return input_word
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __make_forward_references(self, text_index, text, tokens, mts_tags, forward_references, lsh):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            text_begin_pos = text.tk_start_pos
            text_end_pos = text.tk_end_pos
    
            for i in range(text_begin_pos, text_end_pos - self.min_match_length):
                tag = ''
    
                for token in tokens[i: i + self.min_match_length]:
                    tag = tag + token.text
    
                if text_index == 0:
                    my_set = set(tag)
                    min_hash = MinHash(num_perm=128)
    
                    for d in my_set:
                        min_hash.update(d.encode('utf8'))
    
                    lsh.insert(tag, min_hash, False)
                else:
                    my_set = set(tag)
                    min_hash = MinHash(num_perm=128)
    
                    for d in my_set:
                        min_hash.update(d.encode('utf8'))
    
                    result = lsh.query(min_hash)
    
                    if result and len(result) > 0:
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                        closest_match = self.__get_closest_match(result, tag)
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                        if closest_match:
                            forward_references[mts_tags[closest_match]] = i
    
                mts_tags[tag] = i
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            return mts_tags, forward_references, lsh
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __get_similarities(self, tokens, texts, source_text_index, target_text_index, forward_references):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            source_token_start_pos = texts[source_text_index].tk_start_pos
            source_token_end_pos = texts[source_text_index].tk_end_pos
    
            similarities = []
    
            while source_token_start_pos + self.min_match_length <= source_token_end_pos:
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                best_match = self.__get_best_match(tokens, texts, source_text_index, target_text_index,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                                   source_token_start_pos, forward_references)
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                if best_match and best_match.source_length > 0:
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    source_character_start_pos = tokens[best_match.source_token_start_pos].start_pos
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    source_character_end_pos = tokens[best_match.source_token_start_pos + best_match.source_length - 1].end_pos
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    target_character_start_pos = tokens[best_match.target_token_start_pos].start_pos
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    target_character_end_pos = tokens[best_match.target_token_start_pos + best_match.target_length - 1].end_pos
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
                    similarities.append((MatchSegment(best_match.source_text_index, best_match.source_token_start_pos,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                                      best_match.source_length, source_character_start_pos,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                                      source_character_end_pos),
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                         MatchSegment(best_match.target_text_index, best_match.target_token_start_pos,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                                      best_match.target_length, target_character_start_pos,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                                      target_character_end_pos)))
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    source_token_start_pos = source_token_start_pos + best_match.source_length
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                else:
                    source_token_start_pos = source_token_start_pos + 1
    
            return similarities
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __get_best_match(self, tokens, texts, source_text_index, target_text_index, source_token_start_pos,
                             forward_references):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            best_match_length = 0
            token_pos = source_token_start_pos
            best_match = None
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            offset_source = 0
            offset_target = 0
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            has_skipped = False
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
            while 0 < token_pos < len(tokens):
    
                if token_pos < texts[target_text_index].tk_start_pos:
                    if token_pos in forward_references:
                        token_pos = forward_references[token_pos]
                    else:
                        token_pos = -1
                    continue
    
                min_match_length = self.min_match_length
    
                if best_match_length > 0:
                    min_match_length = best_match_length + 1
    
                source_token_pos = source_token_start_pos + min_match_length - 1
                target_token_pos = token_pos + min_match_length - 1
    
                if source_token_pos < texts[source_text_index].tk_end_pos and texts[
                    target_text_index].tk_end_pos > target_token_pos >= source_token_pos + min_match_length:
    
                    cnt = min_match_length
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    while cnt > 0:
                        if self.__fuzzy_match(tokens[source_token_pos].text, tokens[target_token_pos].text) > 80:
                            source_token_pos = source_token_pos - 1
                            target_token_pos = target_token_pos - 1
                            cnt = cnt - 1
                        else:
                            found = False
                            for i in range(1, self.max_gap + 1):
                                if self.__fuzzy_match(tokens[source_token_pos - i].text, tokens[target_token_pos].text) > 80:
                                    source_token_pos = source_token_pos - 1 - i
                                    target_token_pos = target_token_pos - 1
                                    cnt = cnt - 1
                                    found = True
                                    break
                            if not found:
                                break
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
                    if cnt > 0:
                        if token_pos in forward_references:
                            token_pos = forward_references[token_pos]
                        else:
                            token_pos = -1
                        continue
                else:
                    if token_pos in forward_references:
                        token_pos = forward_references[token_pos]
                    else:
                        token_pos = -1
                    continue
    
                new_match_length = min_match_length
                source_token_pos = source_token_start_pos + min_match_length
                target_token_pos = token_pos + min_match_length
    
                while source_token_pos < texts[source_text_index].tk_end_pos and texts[
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    target_text_index].tk_end_pos > target_token_pos > source_token_pos + new_match_length:
    
                    if self.__fuzzy_match(tokens[source_token_pos].text, tokens[target_token_pos].text) > 80:
                        source_token_pos = source_token_pos + 1
                        target_token_pos = target_token_pos + 1
                        new_match_length = new_match_length + 1
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    elif self.__fuzzy_match(tokens[source_token_pos].text + tokens[source_token_pos + 1].text, tokens[target_token_pos].text) > 80:
                        source_token_pos = source_token_pos + 1 + 1
                        target_token_pos = target_token_pos + 1
                        new_match_length = new_match_length + 1 + 1
                        offset_target = offset_target + 1
                    elif self.__fuzzy_match(tokens[source_token_pos].text, tokens[target_token_pos].text + tokens[target_token_pos + 1].text) > 80:
                        source_token_pos = source_token_pos + 1
                        target_token_pos = target_token_pos + 1 + 1
                        new_match_length = new_match_length + 1 + 1
                        offset_source = offset_source + 1
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                    elif not has_skipped:
                        found = False
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                        for i in range(1, self.max_gap + 1):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                            if self.__fuzzy_match(tokens[source_token_pos + i].text, tokens[target_token_pos].text) > 80:
                                source_token_pos = source_token_pos + 1 + i
                                target_token_pos = target_token_pos + 1
                                new_match_length = new_match_length + 1 + i
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                offset_target = offset_target + i
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                found = True
                                has_skipped = True
                                break
    
                        if not found:
                            break
                    else:
                        break
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
                if new_match_length >= self.min_match_length and new_match_length > best_match_length:
                    best_match_length = new_match_length
                    best_match_token_pos = token_pos
                    best_match = Match(source_text_index, source_token_start_pos, target_text_index, best_match_token_pos,
    
    Frederik Arnold's avatar
    Frederik Arnold committed
                                       best_match_length - offset_source, best_match_length - offset_target)
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
                if token_pos in forward_references:
                    token_pos = forward_references[token_pos]
                else:
                    token_pos = -1
    
            return best_match
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __fuzzy_match(self, input1, input2):
            """Return the ``fuzz.ratio`` similarity score of two strings.

            Callers in this class treat a score above 80 as a match.
            """
            return fuzz.ratio(input1, input2)
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __get_closest_match(self, candidates, word):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
    
            if not candidates or len(candidates) == 0:
                return None
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            if word in candidates:
                return word
    
            best_existing_tag = process.extractOne(word, candidates, scorer=fuzz.ratio, score_cutoff=80)
    
            if best_existing_tag:
                return best_existing_tag[0]
    
            return None
    
    
    Frederik Arnold's avatar
    Frederik Arnold committed
        def __print_similarities(self, similarities, input_texts):
    
    Frederik Arnold's avatar
    Frederik Arnold committed
            for similarity_tuple in similarities:
                similarity_literature = similarity_tuple[0]
                similarity_scientific = similarity_tuple[1]
    
                print('{0}, {1}'.format(similarity_literature, similarity_scientific))
                print(input_texts[0][similarity_literature.character_start_pos:similarity_literature.character_end_pos])