From ea17109ecd7570cc0a50fc4f55c4bb15e89cb746 Mon Sep 17 00:00:00 2001 From: Frederik Arnold <frederik.arnold@hu-berlin.de> Date: Fri, 21 Mar 2025 14:36:02 +0100 Subject: [PATCH] Refactor cli --- README.md | 5 +- indiquo/cli/IndiQuoCLI.py | 163 ++++++++++++------ ...Predictor.py => BaseCandidatePredictor.py} | 2 +- indiquo/core/BaseScenePredictor.py | 10 ++ indiquo/core/CandidatePredictor.py | 5 +- indiquo/core/CandidatePredictorDummy.py | 7 +- indiquo/core/CandidatePredictorRW.py | 15 +- indiquo/core/CandidatePredictorST.py | 4 +- indiquo/core/IndiQuo.py | 6 +- indiquo/core/IndiQuoBase.py | 1 - indiquo/core/IndiQuoSum.py | 4 - indiquo/core/ScenePredictor.py | 4 +- indiquo/core/ScenePredictorDummy.py | 14 ++ pyproject.toml | 12 +- requirements.txt | 14 +- test/TestSentenceChunker.py | 4 +- 16 files changed, 175 insertions(+), 95 deletions(-) rename indiquo/core/{BasePredictor.py => BaseCandidatePredictor.py} (85%) create mode 100644 indiquo/core/BaseScenePredictor.py create mode 100644 indiquo/core/ScenePredictorDummy.py diff --git a/README.md b/README.md index bd106ae..9d4d973 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,8 @@ This repository contains the tool `IndiQuo` for the detection of indirect quotat between dramas from [DraCor](https://dracor.org) and scholarly works which interpret the drama. ## Installation - -Checkout this repository and then run: - ~~~ -pip install -r requirements.txt +pip install indiquo ~~~ ### Dependencies diff --git a/indiquo/cli/IndiQuoCLI.py b/indiquo/cli/IndiQuoCLI.py index 748a2be..8daa9e0 100644 --- a/indiquo/cli/IndiQuoCLI.py +++ b/indiquo/cli/IndiQuoCLI.py @@ -1,3 +1,4 @@ +import argparse import logging import sys from argparse import ArgumentParser, BooleanOptionalAction @@ -10,6 +11,7 @@ from indiquo.core.CandidatePredictorDummy import CandidatePredictorDummy from indiquo.core.CandidatePredictorSum import CandidatePredictorSum from indiquo.core.IndiQuoBase import IndiQuoBase from indiquo.core.IndiQuoSum import IndiQuoSum +from indiquo.core.ScenePredictorDummy import ScenePredictorDummy from indiquo.training.scene import TrainSceneIdentification try: @@ -30,6 +32,9 @@ import csv from indiquo.training.candidate import TrainCandidateClassifier, TrainCandidateClassifierST +logger = logging.getLogger(__name__) + + def __train_candidate(train_folder_path, output_folder_path, model_name): TrainCandidateClassifier.train(train_folder_path, output_folder_path, model_name) @@ -43,7 +48,7 @@ def __train_scene(train_folder_path, output_folder_path, model_name): def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_path): - print(f'Processing {filename} ...') + logger.info(f'Processing {filename} ...') matches = indi_quo.compare(target_text) @@ -64,32 +69,44 @@ def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_p writer.writerow([m.target_start, m.target_end, speech_text, m.score, scene_predictions]) -def __run_compare(source_file_path, target_path, candidate_model_path, scene_model_path, - output_folder_path, approach, add_context, max_candidate_length, summaries_file_path): +def __run_compare(compare_approach, model_type, source_file_path, target_path, candidate_model_path, scene_model_path, + output_folder_path, add_context, max_candidate_length, summaries_file_path): drama_processor = Dramatist() drama = drama_processor.from_file(source_file_path) sentence_chunker = SentenceChunker(min_length=10, max_length=64, max_sentences=1) - if approach in ['iq', 'st', 'rw']: - if approach == 'iq': + if compare_approach == 'candidate': + if model_type == 'iq': candidate_tokenizer = AutoTokenizer.from_pretrained(candidate_model_path) candidate_model = AutoModelForSequenceClassification.from_pretrained(candidate_model_path) candidate_predictor = CandidatePredictor(candidate_tokenizer, candidate_model, sentence_chunker, add_context, max_candidate_length) - elif approach == 'st': + elif model_type == 'st': candidate_model = SentenceTransformer(candidate_model_path) candidate_predictor = CandidatePredictorST(drama, candidate_model, sentence_chunker, add_context, max_candidate_length) - elif approach == 'rw': + elif model_type == 'rw': candidate_model = SequenceTagger.load(candidate_model_path) candidate_predictor = CandidatePredictorRW(candidate_model, sentence_chunker) + indi_quo = IndiQuo(candidate_predictor, ScenePredictorDummy()) + + elif compare_approach == 'scene': + candidate_predictor = CandidatePredictorDummy(sentence_chunker) scene_model = SentenceTransformer(scene_model_path) scene_predictor = ScenePredictor(drama, scene_model, 10) - indi_quo = IndiQuo(candidate_predictor, scene_predictor) + elif compare_approach == 'full': + candidate_tokenizer = AutoTokenizer.from_pretrained(candidate_model_path) + candidate_model = AutoModelForSequenceClassification.from_pretrained(candidate_model_path) + candidate_predictor = CandidatePredictor(candidate_tokenizer, candidate_model, sentence_chunker, + add_context, max_candidate_length) + + scene_model = SentenceTransformer(scene_model_path) + scene_predictor = ScenePredictor(drama, scene_model, 10) - elif approach == 'sum': + indi_quo = IndiQuo(candidate_predictor, scene_predictor) + elif compare_approach == 'sum': summaries = [] with open(summaries_file_path, 'r') as summary_file: reader = csv.reader(summary_file, delimiter='\t') @@ -103,17 +120,8 @@ def __run_compare(source_file_path, target_path, candidate_model_path, scene_mod candidate_model = SentenceTransformer(candidate_model_path) candidate_predictor = CandidatePredictorSum(summaries, candidate_model, sentence_chunker) - indi_quo = IndiQuoSum(candidate_predictor) - elif approach == 'eval': - candidate_predictor = CandidatePredictorDummy(sentence_chunker) - scene_model = SentenceTransformer(scene_model_path) - scene_predictor = ScenePredictor(drama, scene_model, 10) - indi_quo = IndiQuo(candidate_predictor, scene_predictor) - else: - raise Exception(f'Approach {approach} is unknown') - if isfile(target_path) and target_path.endswith('.txt'): with open(target_path, 'r', encoding='utf-8') as target_file: target_file_content = target_file.read() @@ -158,9 +166,9 @@ def main(argv=None): parser_train_candidate = subparsers_train_model.add_parser('candidate', help=train_candidate_description, description=train_candidate_description) - parser_train_candidate.add_argument('train_folder_path', nargs=1, metavar='train-folder-path', + parser_train_candidate.add_argument('train_folder_path', metavar='train-folder-path', help='Path to the folder with training and validation data') - parser_train_candidate.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', + parser_train_candidate.add_argument('output_folder_path', metavar='output-folder-path', help='Path to the output folder of the trained model') parser_train_candidate.add_argument('--model', dest='model', default='deepset/gbert-large', help='Name of the model on huggingface to use as the base model for fine-tuning' @@ -170,9 +178,9 @@ def main(argv=None): parser_train_st = subparsers_train_model.add_parser('candidate_st', help=train_candidate_st_description, description=train_candidate_st_description) - parser_train_st.add_argument('train_folder_path', nargs=1, metavar='train-folder-path', + parser_train_st.add_argument('train_folder_path', metavar='train-folder-path', help='Path to the folder with training and validation data') - parser_train_st.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', + parser_train_st.add_argument('output_folder_path', metavar='output-folder-path', help='Path to the output folder of the trained model') parser_train_st.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine', help='Name of the model on huggingface to use as the base model for fine-tuning' @@ -181,9 +189,9 @@ def main(argv=None): parser_train_scene = subparsers_train_model.add_parser('scene', help=train_scene_description, description=train_scene_description) - parser_train_scene.add_argument('train_folder_path', nargs=1, metavar='train-folder-path', + parser_train_scene.add_argument('train_folder_path', metavar='train-folder-path', help='Path to the folder with training and validation data') - parser_train_scene.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', + parser_train_scene.add_argument('output_folder_path', metavar='output-folder-path', help='Path to the input folder') parser_train_scene.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine', help='Name of the model on huggingface to use as the base model for fine-tuning' @@ -192,35 +200,68 @@ def main(argv=None): parser_compare = subparsers_command.add_parser('compare', help=compare_description, description=compare_description) - parser_compare.add_argument('source_file_path', nargs=1, metavar='source-file-path', - help='Path to the source xml drama file') - parser_compare.add_argument('target_path', nargs=1, metavar='target-path', - help='Path to the target text file or folder') - parser_compare.add_argument('candidate_model_folder_path', nargs=1, metavar='candidate-model-folder-path', - help='Path to the candidate model folder') - parser_compare.add_argument('scene_model_folder_path', nargs=1, metavar='scene-model-folder-path', - help='Path to the scene model folder') - parser_compare.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', - help='The output folder path') - parser_compare.add_argument('--approach', choices=['st', 'rw', 'iq', 'sum', 'eval'], dest='approach', - default='iq', help='The approach to use for candidate prediction') - parser_compare.add_argument('--add-context', dest='add_context', default=True, + subparsers_compare_approach = parser_compare.add_subparsers(dest='compare_approach') + subparsers_compare_approach.required = True + + cp_all = argparse.ArgumentParser(add_help=False) + cp_all.add_argument('source_file_path', metavar='source-file-path', help='Path to the source xml drama file') + cp_all.add_argument('target_path', metavar='target-path', help='Path to the target text file or folder') + + cp_candidate_full = argparse.ArgumentParser(add_help=False) + cp_candidate_full.add_argument('--add-context', dest='add_context', default=True, action=BooleanOptionalAction, help='If set, candidates are embedded in context up to' 'a total length of --max-candidate-length') - parser_compare.add_argument('--max-candidate-length', dest='max_candidate_length', default=128, + cp_candidate_full.add_argument('--max-candidate-length', dest='max_candidate_length', default=128, type=int, help='Maximum length in words of a candidate (default: %(default)d)') - parser_compare.add_argument('--summaries-file-path', dest='summaries_file_path', required=False, + + cp_candidate_model = argparse.ArgumentParser(add_help=False) + cp_candidate_model.add_argument('candidate_model_folder_path', metavar='candidate-model-folder-path', + help='Path to the candidate model folder') + cp_scene_model = argparse.ArgumentParser(add_help=False) + cp_scene_model.add_argument('scene_model_folder_path', metavar='scene-model-folder-path', + help='Path to the scene model folder') + cp_output = argparse.ArgumentParser(add_help=False) + cp_output.add_argument('output_folder_path', metavar='output-folder-path', + help='The output folder path') + + parser_compare_candidate = ( + subparsers_compare_approach.add_parser('candidate', + parents=[cp_all, cp_candidate_model, cp_output, cp_candidate_full], + help='TBD', description='TBD') + + ) + parser_compare_candidate.add_argument('--model-type', choices=['st', 'rw', 'iq'], dest='model_type', + default='iq', help='The model type to use for candidate prediction') + + parser_compare_scene = ( + subparsers_compare_approach.add_parser('scene', + parents=[cp_all, cp_scene_model, cp_output], + help='TBD', description='TBD') + ) + + parser_compare_full = ( + subparsers_compare_approach.add_parser('full', + parents=[cp_all, cp_candidate_model, cp_scene_model, cp_output, cp_candidate_full], + help='TBD', description='TBD') + ) + + parser_compare_sum = ( + subparsers_compare_approach.add_parser('sum', + parents = [cp_all, cp_candidate_model, cp_output], + help='TBD', description='TBD') + ) + parser_compare_sum.add_argument('--summaries-file-path', dest='summaries_file_path', required=False, help='Path to the summaries tsv file. Only used if approach is set to \'sum\'') args = argument_parser.parse_args(argv) log_level = args.log_level - logging.getLogger().setLevel(logging.getLevelName(log_level)) + logging.basicConfig(level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') if args.command == 'train': if args.train_model == 'candidate' or args.train_model == 'candidate_st' or args.train_model == 'scene': - train_folder_path = args.train_folder_path[0] - output_folder_path = args.output_folder_path[0] + train_folder_path = args.train_folder_path + output_folder_path = args.output_folder_path model = args.model model_name_repl = model.replace('/', '') @@ -238,18 +279,32 @@ def main(argv=None): __train_scene(train_folder_path, output_folder_path, model) elif args.command == 'compare': - source_file_path = args.source_file_path[0] - target_path = args.target_path[0] - candidate_model_folder_path = args.candidate_model_folder_path[0] - scene_model_folder_path = args.scene_model_folder_path[0] - output_folder_path = args.output_folder_path[0] - approach = args.approach - add_context = args.add_context - max_candidate_length = args.max_candidate_length + source_file_path = args.source_file_path + target_path = args.target_path + output_folder_path = args.output_folder_path - summaries_file_path = None + c_appr = args.compare_approach + + candidate_model_folder_path = None + if c_appr in ['candidate', 'full', 'sum']: + candidate_model_folder_path = args.candidate_model_folder_path + + scene_model_folder_path = None + if c_appr in ['scene', 'full']: + scene_model_folder_path = args.scene_model_folder_path - if approach == 'sum': + add_context = True + max_candidate_length = 128 + if c_appr in ['candidate', 'full']: + add_context = args.add_context + max_candidate_length = args.max_candidate_length + + model_type = None + if c_appr == 'candidate': + model_type = args.model_type + + summaries_file_path = None + if c_appr == 'sum': summaries_file_path = args.summaries_file_path now = datetime.now() @@ -257,8 +312,8 @@ def main(argv=None): output_folder_path = join(output_folder_path, date_time_string) Path(output_folder_path).mkdir(parents=True, exist_ok=True) - __run_compare(source_file_path, target_path, candidate_model_folder_path, scene_model_folder_path, - output_folder_path, approach, add_context, max_candidate_length, summaries_file_path) + __run_compare(c_appr, model_type, source_file_path, target_path, candidate_model_folder_path, scene_model_folder_path, + output_folder_path, add_context, max_candidate_length, summaries_file_path) if __name__ == '__main__': diff --git a/indiquo/core/BasePredictor.py b/indiquo/core/BaseCandidatePredictor.py similarity index 85% rename from indiquo/core/BasePredictor.py rename to indiquo/core/BaseCandidatePredictor.py index 56b437d..6d5467e 100644 --- a/indiquo/core/BasePredictor.py +++ b/indiquo/core/BaseCandidatePredictor.py @@ -3,7 +3,7 @@ from typing import List from indiquo.core.Candidate import Candidate -class BasePredictor(ABC): +class BaseCandidatePredictor(ABC): @abstractmethod def get_candidates(self, target_text) -> List[Candidate]: diff --git a/indiquo/core/BaseScenePredictor.py b/indiquo/core/BaseScenePredictor.py new file mode 100644 index 0000000..9380fc9 --- /dev/null +++ b/indiquo/core/BaseScenePredictor.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod +from typing import List +from indiquo.core.ScenePrediction import ScenePrediction + + +class BaseScenePredictor(ABC): + + @abstractmethod + def predict_scene(self, text) -> List[List[ScenePrediction]]: + pass diff --git a/indiquo/core/CandidatePredictor.py b/indiquo/core/CandidatePredictor.py index 7a1ad81..6c76927 100644 --- a/indiquo/core/CandidatePredictor.py +++ b/indiquo/core/CandidatePredictor.py @@ -1,8 +1,7 @@ from typing import List import re -from dramatist.drama.Drama import Drama -from indiquo.core.BasePredictor import BasePredictor +from indiquo.core.BaseCandidatePredictor import BaseCandidatePredictor from indiquo.core.Candidate import Candidate from indiquo.core.chunker.BaseChunker import BaseChunker import torch @@ -10,7 +9,7 @@ from kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_foot # noinspection PyMethodMayBeStatic -class CandidatePredictor(BasePredictor): +class CandidatePredictor(BaseCandidatePredictor): def __init__(self, tokenizer, model, chunker: BaseChunker, add_context, max_length): self.tokenizer = tokenizer diff --git a/indiquo/core/CandidatePredictorDummy.py b/indiquo/core/CandidatePredictorDummy.py index 1c2458c..3175ba3 100644 --- a/indiquo/core/CandidatePredictorDummy.py +++ b/indiquo/core/CandidatePredictorDummy.py @@ -1,16 +1,13 @@ from typing import List -from sentence_transformers import util -from dramatist.drama.Drama import Drama -from indiquo.core.BasePredictor import BasePredictor +from indiquo.core.BaseCandidatePredictor import BaseCandidatePredictor from indiquo.core.Candidate import Candidate from indiquo.core.chunker.BaseChunker import BaseChunker from kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_footnotes -import re # noinspection PyMethodMayBeStatic -class CandidatePredictorDummy(BasePredictor): +class CandidatePredictorDummy(BaseCandidatePredictor): def __init__(self, chunker: BaseChunker): self.chunker = chunker diff --git a/indiquo/core/CandidatePredictorRW.py b/indiquo/core/CandidatePredictorRW.py index 077fab9..38ccb9f 100644 --- a/indiquo/core/CandidatePredictorRW.py +++ b/indiquo/core/CandidatePredictorRW.py @@ -1,13 +1,18 @@ -from typing import List, Optional -from flair.data import Sentence -from flair.nn import Model -from indiquo.core.BasePredictor import BasePredictor +from typing import List + +try: + from flair.data import Sentence + from flair.nn import Model +except ModuleNotFoundError: + pass + +from indiquo.core.BaseCandidatePredictor import BaseCandidatePredictor from indiquo.core.Candidate import Candidate from indiquo.core.chunker.BaseChunker import BaseChunker from kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_footnotes -class CandidatePredictorRW(BasePredictor): +class CandidatePredictorRW(BaseCandidatePredictor): def __init__(self, model: Model, chunker: BaseChunker): self.model = model diff --git a/indiquo/core/CandidatePredictorST.py b/indiquo/core/CandidatePredictorST.py index 1312778..2c5ae6f 100644 --- a/indiquo/core/CandidatePredictorST.py +++ b/indiquo/core/CandidatePredictorST.py @@ -2,7 +2,7 @@ from typing import List from sentence_transformers import util from dramatist.drama.Drama import Drama -from indiquo.core.BasePredictor import BasePredictor +from indiquo.core.BaseCandidatePredictor import BaseCandidatePredictor from indiquo.core.Candidate import Candidate from indiquo.core.chunker.BaseChunker import BaseChunker from kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_footnotes @@ -10,7 +10,7 @@ import re # noinspection PyMethodMayBeStatic -class CandidatePredictorST(BasePredictor): +class CandidatePredictorST(BaseCandidatePredictor): def __init__(self, drama: Drama, model, chunker: BaseChunker, add_context, max_length): self.drama = drama diff --git a/indiquo/core/IndiQuo.py b/indiquo/core/IndiQuo.py index 263c79b..9fd9134 100644 --- a/indiquo/core/IndiQuo.py +++ b/indiquo/core/IndiQuo.py @@ -1,16 +1,16 @@ from typing import List -from indiquo.core.BasePredictor import BasePredictor +from indiquo.core.BaseCandidatePredictor import BaseCandidatePredictor +from indiquo.core.BaseScenePredictor import BaseScenePredictor from indiquo.core.Candidate import Candidate from indiquo.core.IndiQuoBase import IndiQuoBase -from indiquo.core.ScenePredictor import ScenePredictor from indiquo.match.Match import Match # noinspection PyMethodMayBeStatic class IndiQuo(IndiQuoBase): - def __init__(self, candidate_predictor: BasePredictor, scene_predictor: ScenePredictor): + def __init__(self, candidate_predictor: BaseCandidatePredictor, scene_predictor: BaseScenePredictor): self.candidate_predictor = candidate_predictor self.scene_predictor = scene_predictor diff --git a/indiquo/core/IndiQuoBase.py b/indiquo/core/IndiQuoBase.py index 16ed50b..34fa49b 100644 --- a/indiquo/core/IndiQuoBase.py +++ b/indiquo/core/IndiQuoBase.py @@ -1,6 +1,5 @@ from abc import ABC, abstractmethod from typing import List -from indiquo.core.Candidate import Candidate from indiquo.match.Match import Match diff --git a/indiquo/core/IndiQuoSum.py b/indiquo/core/IndiQuoSum.py index cc6a7ad..3e20165 100644 --- a/indiquo/core/IndiQuoSum.py +++ b/indiquo/core/IndiQuoSum.py @@ -1,12 +1,8 @@ from typing import List -from indiquo.core.BasePredictor import BasePredictor -from indiquo.core.CandidatePredictor import CandidatePredictor -from indiquo.core.Candidate import Candidate from indiquo.core.CandidatePredictorSum import CandidatePredictorSum from indiquo.core.CandidateWithScenes import CandidateWithScenes from indiquo.core.IndiQuoBase import IndiQuoBase -from indiquo.core.ScenePredictor import ScenePredictor from indiquo.match.Match import Match diff --git a/indiquo/core/ScenePredictor.py b/indiquo/core/ScenePredictor.py index 7f9c602..299a0e9 100644 --- a/indiquo/core/ScenePredictor.py +++ b/indiquo/core/ScenePredictor.py @@ -1,10 +1,11 @@ from dramatist.drama.Drama import Drama from sentence_transformers import util +from indiquo.core.BaseScenePredictor import BaseScenePredictor from indiquo.core.ScenePrediction import ScenePrediction -class ScenePredictor: +class ScenePredictor(BaseScenePredictor): def __init__(self, drama: Drama, model, top_k): self.model = model @@ -23,6 +24,7 @@ class ScenePredictor: self.source_embeddings = model.encode(source_text_blocks, convert_to_tensor=True) + # overriding abstract method def predict_scene(self, text): if isinstance(text, str): text = [text] diff --git a/indiquo/core/ScenePredictorDummy.py b/indiquo/core/ScenePredictorDummy.py new file mode 100644 index 0000000..1790d89 --- /dev/null +++ b/indiquo/core/ScenePredictorDummy.py @@ -0,0 +1,14 @@ +from typing import List + +from indiquo.core.BaseScenePredictor import BaseScenePredictor +from indiquo.core.ScenePrediction import ScenePrediction + + +class ScenePredictorDummy(BaseScenePredictor): + + def predict_scene(self, text) -> List[List[ScenePrediction]]: + if isinstance(text, str): + text = [text] + + result = [[] for _ in range(len(text))] + return result diff --git a/pyproject.toml b/pyproject.toml index 4f8cf88..1671a4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,18 +7,24 @@ build-backend = "setuptools.build_meta" [project] name = "IndiQuo" -version = "0.0.1" +version = "0.1.0" authors = [ { name = "Frederik Arnold", email = "frederik.arnold@hu-berlin.de"} ] description = "" readme = "README.md" license = { file="LICENSE" } -requires-python = ">=3.9" +requires-python = ">=3.11" keywords = ["quotation detection", "quotation identification", "indirect citation extraction", "natural language processing", "nlp", "text reuse"] dependencies = [ - + "sentence-transformers>=3.4.1", + "dramatist>=0.0.7", + "kpcommons>=0.1.2", + "pysbd>=0.3.4", + "datasets>=3.4.1", + "evaluate>=0.4.3", + "accelerate>=1.5.2" ] classifiers = [ diff --git a/requirements.txt b/requirements.txt index b49196c..e5f77e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -sentence-transformers~=3.0.1 -dramatist~=0.0.6 -kpcommons~=0.0.3 -pysbd~=0.3.4 -datasets~=2.20.0 -evaluate~=0.4.2 -accelerate~=0.33.0 \ No newline at end of file +sentence-transformers>=3.4.1 +dramatist>=0.0.7 +kpcommons>=0.1.2 +pysbd>=0.3.4 +datasets>=3.4.1 +evaluate>=0.4.3 +accelerate>=1.5.2 \ No newline at end of file diff --git a/test/TestSentenceChunker.py b/test/TestSentenceChunker.py index 45d67ab..6d88a9c 100644 --- a/test/TestSentenceChunker.py +++ b/test/TestSentenceChunker.py @@ -21,5 +21,5 @@ class SentenceChunkerTestCase(TestCase): result = sentence_chunker.chunk(text) - self.assertEqual(2, len(result)) - self.assertEqual(24, result[0].end) + self.assertEqual(5, len(result)) + self.assertEqual(22, result[0].end) -- GitLab