From b7deb60ad550afaba6c84633b56a812b9de7839b Mon Sep 17 00:00:00 2001 From: Frederik Arnold <frederik.arnold@hu-berlin.de> Date: Fri, 26 Jul 2024 08:56:25 +0200 Subject: [PATCH] Cleanup, add readme --- LICENSE | 201 ++++++++++++++++++ README.md | 130 +++++++++++ indiquo/cli/IndiQuoCLI.py | 107 +++++----- indiquo/core/CandidatePredictor.py | 38 +--- indiquo/core/CandidatePredictorRW.py | 2 +- indiquo/core/CandidatePredictorST.py | 30 +-- indiquo/testing/TestSimilarity.py | 119 ----------- indiquo/testing/__init__.py | 0 .../training/similarity/TrainSimilarity.py | 71 ------- .../similarity/TrainSimilarityCombined.py | 72 ------- .../similarity/TrainSimilarityContrastive.py | 44 ---- .../similarity/TrainSimilarityDrama.py | 56 ----- indiquo/training/similarity/__init__.py | 0 pyproject.toml | 38 ++++ requirements.txt | 7 + 15 files changed, 443 insertions(+), 472 deletions(-) create mode 100644 LICENSE create mode 100644 README.md delete mode 100644 indiquo/testing/TestSimilarity.py delete mode 100644 indiquo/testing/__init__.py delete mode 100644 indiquo/training/similarity/TrainSimilarity.py delete mode 100644 indiquo/training/similarity/TrainSimilarityCombined.py delete mode 100644 indiquo/training/similarity/TrainSimilarityContrastive.py delete mode 100644 indiquo/training/similarity/TrainSimilarityDrama.py delete mode 100644 indiquo/training/similarity/__init__.py create mode 100644 pyproject.toml create mode 100644 requirements.txt diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..cad06b9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 Schlüsselstellen + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bd106ae --- /dev/null +++ b/README.md @@ -0,0 +1,130 @@ +# Readme +This repository contains the tool `IndiQuo` for the detection of indirect quotations (summaries and paraphrases) +between dramas from [DraCor](https://dracor.org) and scholarly works which interpret the drama. + +## Installation + +Check out this repository and then run: + +~~~ +pip install -r requirements.txt +~~~ + +### Dependencies +The dependencies to run the [Redewiedergabe Tagger](https://github.com/redewiedergabe/tagger) are not installed by +default as this can be a [tricky process](https://github.com/redewiedergabe/tagger/issues/4) and this tagger is only +used as a baseline and not for our approach and therefore not needed in most cases. 
+ +## Usage +The following sections describe how to use IndiQuo on the command line. + +### Training +The library supports training of custom models for candidate identification and scene prediction. + +#### Candidate Identification + +~~~ +indiquo train candidate +path_to_train_folder +path_to_the_output_folder +--model huggingface_model_name +~~~ + +`path_to_train_folder` has to contain two files named `train_set.tsv` and `val_set.tsv` which contain one example per +line in the form of a string and a label, tab separated, for example: + +~~~ +Some positive example 1 +Some negative example 0 +~~~ + +`huggingface_model_name` is the name of the model on Hugging Face to use for fine-tuning; `deepset/gbert-large` is used +as the default. + +#### Scene Prediction + +~~~ +indiquo train scene +path_to_train_folder +path_to_the_output_folder +--model huggingface_model_name +~~~ + +`path_to_train_folder` has to contain two files named `train_set.tsv` and `val_set.tsv` which contain one example per +line in the form of two strings, a drama excerpt and a corresponding summary, tab separated, for example: + +~~~ +Drama excerpt Summary +~~~ + +`huggingface_model_name` is the name of the model on Hugging Face to use for fine-tuning; +`deutsche-telekom/gbert-large-paraphrase-cosine` is used as the default. 
+ +### Indirect Quotation Identification + +To run `IndiQuo` with the default approach (`iq`), use the following command: + +~~~ +indiquo compare path_to_source_file path_to_target_path path_to_candidate_model_folder path_to_scene_model_folder path_to_output_folder +~~~ + +<details> +<summary>All IndiQuo command line options</summary> + +~~~ +usage: indiquo compare [-h] [--approach {st,rw,iq,sum,eval}] + [--add-context | --no-add-context] + [--max-candidate-length MAX_CANDIDATE_LENGTH] + [--summaries-file-path SUMMARIES_FILE_PATH] + source-file-path target-path + candidate-model-folder-path scene-model-folder-path + output-folder-path + +positional arguments: + source-file-path Path to the source xml drama file + target-path Path to the target text file or folder + candidate-model-folder-path + Path to the candidate model folder + scene-model-folder-path + Path to the scene model folder + output-folder-path The output folder path + +options: + -h, --help show this help message and exit + --approach {st,rw,iq,sum,eval} + The approach to use for candidate prediction + --add-context, --no-add-context + If set, candidates are embedded in context up to a + total length of --max-candidate-length (default: True) + --max-candidate-length MAX_CANDIDATE_LENGTH + Maximum length in words of a candidate (default: 128) + --summaries-file-path SUMMARIES_FILE_PATH + Path to the summaries tsv file. Only used if approach + is set to 'sum' +~~~ + +By default, the approach to use is set to `iq` which is the approach presented in the paper. The approach option can +be changed to run the base models (rw=Redewiedergabe, st=SentenceTransformer) or to use a SentenceTransformer with +summaries (sum) or only run scene prediction for evaluation purposes (eval). + +</details> + +The output folder will contain a tsv file for each txt file in the target path. The tsv files have the following +structure: + +The output will look something like this: + +~~~ +start end text score scenes +10 15 some text 0.5 1:1:0.2#2:5:0.5#... 
+~~~ + +The first three columns are the character start and end positions and the text of the quotation in the target text. The +fourth column is the probability of the positive class, i.e., the candidate is an indirect quotation. The last column +contains the top 10 source scenes separated by '#' and each part has the following structure: act:scene:probability. + +## Citation +If you use the code in repository or base your work on our code, please cite our paper: +~~~ +TBD +~~~ \ No newline at end of file diff --git a/indiquo/cli/IndiQuoCLI.py b/indiquo/cli/IndiQuoCLI.py index e27aec0..ff15a37 100644 --- a/indiquo/cli/IndiQuoCLI.py +++ b/indiquo/cli/IndiQuoCLI.py @@ -18,7 +18,6 @@ try: except ModuleNotFoundError: pass -from quid.core.Quid import Quid from dramatist.core.Dramatist import Dramatist from indiquo.core.CandidatePredictorST import CandidatePredictorST from indiquo.core.IndiQuo import IndiQuo @@ -46,22 +45,10 @@ def __train_scene(train_folder_path, output_folder_path, model_name): def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_path): print(f'Processing {filename} ...') - # quid_matches = quid.compare(drama.get_text(), target_text) - - # source_text = drama.get_text() - - # short_matches: List[MatchRef] = pro_quo_lm.compare(source_text, target_text, quid_matches) - # all_matches = short_matches - - # long_matches = Helper.remove_short_matches(quid_matches, target_text) - # all_matches.extend(long_matches) - # all_matches = Helper.remove_overlapping_matches(all_matches, target_text) - - # matches = indi_quo.compare(target_text, all_matches) matches = indi_quo.compare(target_text) - with open(join(output_folder_path, f'{filename}.tsv'), "w", encoding='utf-8') as output_file: - writer = csv.writer(output_file, delimiter="\t", lineterminator="\n") + with open(join(output_folder_path, f'{filename}.tsv'), 'w', encoding='utf-8') as output_file: + writer = csv.writer(output_file, delimiter='\t', lineterminator='\n') 
writer.writerow(['start', 'end', 'text', 'score', 'scenes']) for m in matches: @@ -78,8 +65,7 @@ def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_p def __run_compare(source_file_path, target_path, candidate_model_path, scene_model_path, - output_folder_path, approach, add_context, summaries_file_path): - + output_folder_path, approach, add_context, max_candidate_length, summaries_file_path): drama_processor = Dramatist() drama = drama_processor.from_file(source_file_path) sentence_chunker = SentenceChunker(min_length=10, max_length=64, max_sentences=1) @@ -89,10 +75,11 @@ def __run_compare(source_file_path, target_path, candidate_model_path, scene_mod candidate_tokenizer = AutoTokenizer.from_pretrained(candidate_model_path) candidate_model = AutoModelForSequenceClassification.from_pretrained(candidate_model_path) candidate_predictor = CandidatePredictor(drama, candidate_tokenizer, candidate_model, sentence_chunker, - add_context) + add_context, max_candidate_length) elif approach == 'st': candidate_model = SentenceTransformer(candidate_model_path) - candidate_predictor = CandidatePredictorST(drama, candidate_model, sentence_chunker, add_context) + candidate_predictor = CandidatePredictorST(drama, candidate_model, sentence_chunker, add_context, + max_candidate_length) elif approach == 'rw': candidate_model = SequenceTagger.load(candidate_model_path) candidate_predictor = CandidatePredictorRW(candidate_model, sentence_chunker) @@ -146,6 +133,14 @@ def __run_compare(source_file_path, target_path, candidate_model_path, scene_mod def main(argv=None): + train_description = 'This command allows the user to train their own model.' 
+ + train_candidate_description = '' + train_candidate_st_description = '' + train_scene_description = '' + + compare_description = '' + argument_parser = ArgumentParser(prog='indiquo', description='TBD') argument_parser.add_argument('--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', @@ -155,61 +150,67 @@ def main(argv=None): subparsers_command = argument_parser.add_subparsers(dest='command') subparsers_command.required = True - parser_train = subparsers_command.add_parser('train', help='', description='') + parser_train = subparsers_command.add_parser('train', help=train_description, description=train_description) subparsers_train_model = parser_train.add_subparsers(dest='train_model') subparsers_train_model.required = True - parser_train_candidate = subparsers_train_model.add_parser('candidate', help='', description='') + parser_train_candidate = subparsers_train_model.add_parser('candidate', help=train_candidate_description, + description=train_candidate_description) parser_train_candidate.add_argument('train_folder_path', nargs=1, metavar='train-folder-path', - help='Path to the ') + help='Path to the folder with training and validation data') parser_train_candidate.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', - help="Path to the input folder") + help='Path to the output folder of the trained model') parser_train_candidate.add_argument('--model', dest='model', default='deepset/gbert-large', - help="") + help='Name of the model on huggingface to use as the base model for fine-tuning' + ' (default: %(default)s)') - # TODO: rename to candidate_st or similar - parser_train_st = subparsers_train_model.add_parser('st', help='', description='') + # probably not needed as this did not perform well + parser_train_st = subparsers_train_model.add_parser('candidate_st', help=train_candidate_st_description, + description=train_candidate_st_description) parser_train_st.add_argument('train_folder_path', nargs=1, 
metavar='train-folder-path', - help='Path to the ') + help='Path to the folder with training and validation data') parser_train_st.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', - help="Path to the input folder") + help='Path to the output folder of the trained model') parser_train_st.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine', - help="") + help='Name of the model on huggingface to use as the base model for fine-tuning' + ' (default: %(default)s)') - parser_train_scene = subparsers_train_model.add_parser('scene', help='', description='') + parser_train_scene = subparsers_train_model.add_parser('scene', help=train_scene_description, + description=train_scene_description) parser_train_scene.add_argument('train_folder_path', nargs=1, metavar='train-folder-path', - help='Path to the ') + help='Path to the folder with training and validation data') parser_train_scene.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', - help="Path to the input folder") + help='Path to the input folder') parser_train_scene.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine', - help="") + help='Name of the model on huggingface to use as the base model for fine-tuning' + ' (default: %(default)s)') - parser_compare = subparsers_command.add_parser('compare', help='', description='') + parser_compare = subparsers_command.add_parser('compare', help=compare_description, + description=compare_description) - parser_compare.add_argument("source_file_path", nargs=1, metavar="source-file-path", - help="Path to the source xml file") + parser_compare.add_argument('source_file_path', nargs=1, metavar='source-file-path', + help='Path to the source xml drama file') parser_compare.add_argument('target_path', nargs=1, metavar='target-path', help='Path to the target text file or folder') parser_compare.add_argument('candidate_model_folder_path', nargs=1, 
metavar='candidate-model-folder-path', - help='Path to the similarity model folder') + help='Path to the candidate model folder') parser_compare.add_argument('scene_model_folder_path', nargs=1, metavar='scene-model-folder-path', - help='Path to the similarity model folder') - parser_compare.add_argument('--output-folder-path', dest="output_folder_path", - help="The output folder path. If this option is set the output will be saved to a file" - " created in the specified folder") - parser_compare.add_argument('--direct-quotes-path', dest='direct_quotes_path', - help='Path to the file or folder with direct quotes. If this option is not set, then' - ' ProQuoLM is used to find direct quotes.') + help='Path to the scene model folder') + parser_compare.add_argument('output_folder_path', nargs=1, metavar='output-folder-path', + help='The output folder path') parser_compare.add_argument('--approach', choices=['st', 'rw', 'iq', 'sum', 'eval'], dest='approach', - default='iq', help='TBD') + default='iq', help='The approach to use for candidate prediction') parser_compare.add_argument('--add-context', dest='add_context', default=True, - action=BooleanOptionalAction, help='') - parser_compare.add_argument("--summaries-file-path", dest="summaries_file_path", required=False, - help="Path to the summaries tsv file") + action=BooleanOptionalAction, help='If set, candidates are embedded in context up to' + 'a total length of --max-candidate-length') + parser_compare.add_argument('--max-candidate-length', dest='max_candidate_length', default=128, + type=int, help='Maximum length in words of a candidate (default: %(default)d)') + parser_compare.add_argument('--summaries-file-path', dest='summaries_file_path', required=False, + help='Path to the summaries tsv file. 
Only used if approach is set to \'sum\'') args = argument_parser.parse_args(argv) @@ -217,7 +218,7 @@ def main(argv=None): logging.getLogger().setLevel(logging.getLevelName(log_level)) if args.command == 'train': - if args.train_model == 'candidate' or args.train_model == 'st' or args.train_model == 'scene': + if args.train_model == 'candidate' or args.train_model == 'candidate_st' or args.train_model == 'scene': train_folder_path = args.train_folder_path[0] output_folder_path = args.output_folder_path[0] model = args.model @@ -231,7 +232,7 @@ def main(argv=None): if args.train_model == 'candidate': __train_candidate(train_folder_path, output_folder_path, model) - elif args.train_model == 'st': + elif args.train_model == 'candidate_st': __train_candidate_st(train_folder_path, output_folder_path, model) elif args.train_model == 'scene': __train_scene(train_folder_path, output_folder_path, model) @@ -241,10 +242,10 @@ def main(argv=None): target_path = args.target_path[0] candidate_model_folder_path = args.candidate_model_folder_path[0] scene_model_folder_path = args.scene_model_folder_path[0] - output_folder_path = args.output_folder_path - # direct_quotes_path = args.direct_quotes_path + output_folder_path = args.output_folder_path[0] approach = args.approach add_context = args.add_context + max_candidate_length = args.max_candidate_length summaries_file_path = None @@ -257,7 +258,7 @@ def main(argv=None): Path(output_folder_path).mkdir(parents=True, exist_ok=True) __run_compare(source_file_path, target_path, candidate_model_folder_path, scene_model_folder_path, - output_folder_path, approach, add_context, summaries_file_path) + output_folder_path, approach, add_context, max_candidate_length, summaries_file_path) if __name__ == '__main__': diff --git a/indiquo/core/CandidatePredictor.py b/indiquo/core/CandidatePredictor.py index 0efaf07..db123e5 100644 --- a/indiquo/core/CandidatePredictor.py +++ b/indiquo/core/CandidatePredictor.py @@ -11,10 +11,8 @@ from 
kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_foot # noinspection PyMethodMayBeStatic class CandidatePredictor(BasePredictor): - MAX_LENGTH = 256 - SIMILARITY_THRESHOLD = 0.0 - def __init__(self, drama: Drama, tokenizer, model, chunker: BaseChunker, add_context): + def __init__(self, drama: Drama, tokenizer, model, chunker: BaseChunker, add_context, max_length): self.drama = drama self.tokenizer = tokenizer self.model = model @@ -22,16 +20,7 @@ class CandidatePredictor(BasePredictor): self.all_text_blocks = [] self.source_text_blocks = [] self.add_context = add_context - - # for act_nr, act in enumerate(drama.acts): - # for scene_nr, scene in enumerate(act.scenes): - # text_blocks = scene.get_text_in_blocks(128) - # - # for tbt in text_blocks: - # self.all_text_blocks.append((act_nr, scene_nr, tbt.text)) - # self.source_text_blocks.append(tbt.text) - # - # self.source_embeddings = model.encode(self.source_text_blocks, convert_to_tensor=True) + self.max_length = max_length # overriding abstract method def get_candidates(self, target_text) -> List[Candidate]: @@ -46,22 +35,8 @@ class CandidatePredictor(BasePredictor): chunk.start = real_start chunk.end = real_end - filtered_chunks = chunks - # for chunk in chunks: - # found_match = False - # for dq in direct_quotes: - # overlap_length = Util.calculate_overlap(chunk.start, chunk.end, dq.target_span.start, - # dq.target_span.end) - # - # if overlap_length > 0: - # found_match = True - # break - # - # if not found_match: - # filtered_chunks.append(chunk) - candidates: List[Candidate] = [] - for chunk in filtered_chunks: + for chunk in chunks: if self.add_context: text = self.__add_context(chunk.text, target_text, chunk.start, chunk.end) else: @@ -80,13 +55,12 @@ class CandidatePredictor(BasePredictor): logits = self.model(**inputs).logits p = torch.nn.functional.softmax(logits, dim=1) score = float(p[0, 1]) - if score > self.SIMILARITY_THRESHOLD: - return score + return score - return None + # 
return None def __add_context(self, quote_text, text, quote_start, quote_end): - rest_len = self.MAX_LENGTH - len(quote_text.split()) + rest_len = self.max_length - len(quote_text.split()) if rest_len < 0: raise Exception('No rest len!') diff --git a/indiquo/core/CandidatePredictorRW.py b/indiquo/core/CandidatePredictorRW.py index 36d2ce2..077fab9 100644 --- a/indiquo/core/CandidatePredictorRW.py +++ b/indiquo/core/CandidatePredictorRW.py @@ -14,7 +14,7 @@ class CandidatePredictorRW(BasePredictor): self.chunker = chunker # overriding abstract method - def get_candidates(self, target_text, direct_quotes) -> List[Candidate]: + def get_candidates(self, target_text) -> List[Candidate]: fn_ranges, fn_ranges_with_offset = get_footnote_ranges(target_text) target_text_wo_fn: str = remove_footnotes(target_text) chunks = self.chunker.chunk(target_text_wo_fn) diff --git a/indiquo/core/CandidatePredictorST.py b/indiquo/core/CandidatePredictorST.py index 45a368c..416bd4e 100644 --- a/indiquo/core/CandidatePredictorST.py +++ b/indiquo/core/CandidatePredictorST.py @@ -11,16 +11,15 @@ import re # noinspection PyMethodMayBeStatic class CandidatePredictorST(BasePredictor): - MAX_LENGTH = 128 - SIMILARITY_THRESHOLD = 0.0 - def __init__(self, drama: Drama, model, chunker: BaseChunker, add_context): + def __init__(self, drama: Drama, model, chunker: BaseChunker, add_context, max_length): self.drama = drama self.model = model self.chunker = chunker self.all_text_blocks = [] self.source_text_blocks = [] self.add_context = add_context + self.max_length = max_length for act_nr, act in enumerate(drama.acts): for scene_nr, scene in enumerate(act.scenes): @@ -33,7 +32,7 @@ class CandidatePredictorST(BasePredictor): self.source_embeddings = model.encode(self.source_text_blocks, convert_to_tensor=True) # overriding abstract method - def get_candidates(self, target_text, direct_quotes) -> List[Candidate]: + def get_candidates(self, target_text) -> List[Candidate]: fn_ranges, 
fn_ranges_with_offset = get_footnote_ranges(target_text) target_text_wo_fn: str = remove_footnotes(target_text) chunks = self.chunker.chunk(target_text_wo_fn) @@ -45,22 +44,8 @@ class CandidatePredictorST(BasePredictor): chunk.start = real_start chunk.end = real_end - filtered_chunks = chunks - # for chunk in chunks: - # found_match = False - # for dq in direct_quotes: - # overlap_length = Util.calculate_overlap(chunk.start, chunk.end, dq.target_span.start, - # dq.target_span.end) - # - # if overlap_length > 0: - # found_match = True - # break - # - # if not found_match: - # filtered_chunks.append(chunk) - candidates: List[Candidate] = [] - for chunk in filtered_chunks: + for chunk in chunks: if self.add_context: text = self.__add_context(chunk.text, target_text, chunk.start, chunk.end) @@ -87,13 +72,10 @@ class CandidatePredictorST(BasePredictor): start_line, end_line = self.drama.acts[act_nr].scenes[scene_nr].get_line_range() scene_scores.append((start_line, end_line, score, text)) - if scene_scores[0][2] >= self.SIMILARITY_THRESHOLD: - return scene_scores[0][2] - - return None + return scene_scores[0][2] def __add_context(self, quote_text, text, quote_start, quote_end): - rest_len = self.MAX_LENGTH - len(quote_text.split()) + rest_len = self.max_length - len(quote_text.split()) if rest_len < 0: raise Exception('No rest len!') diff --git a/indiquo/testing/TestSimilarity.py b/indiquo/testing/TestSimilarity.py deleted file mode 100644 index d98e094..0000000 --- a/indiquo/testing/TestSimilarity.py +++ /dev/null @@ -1,119 +0,0 @@ -import csv -from argparse import ArgumentParser -from sentence_transformers import SentenceTransformer, util -import random - - -def load_test_items_scene(test_file_path): - test_items = [] - - all_validation_elements = [] - - with open(test_file_path, 'r') as test_file: - reader = csv.reader(test_file, delimiter='\t') - # skip first row (header) - next(reader, None) - - for row in reader: - all_validation_elements.append((row[0], row[1])) 
- - for pos, val_item in enumerate(all_validation_elements): - test_items.append((val_item[0], val_item[1], 1)) - poses = random.sample(range(0, len(all_validation_elements)), 2) - - if poses[0] != pos: - other = all_validation_elements[poses[0]] - else: - other = all_validation_elements[poses[1]] - - test_items.append((val_item[0], other[1], 0)) - - return test_items - - -def load_test_items_candidate(test_file_path): - test_items = [] - - with open(test_file_path, 'r') as gold_file: - reader = csv.reader(gold_file, delimiter='\t') - # skip first row (header) - next(reader, None) - - for row in reader: - text_1 = row[0] - text_2 = row[1] - label = int(row[2]) - test_items.append((text_1, text_2, label)) - - return test_items - - -def test(input_path, model_path): - test_items = load_test_items_scene(input_path) - - sentences_1 = [] - sentences_2 = [] - labels = [] - - for ti in test_items[:50]: - sentences_1.append(ti[0]) - sentences_2.append(ti[1]) - labels.append(ti[2]) - - model = SentenceTransformer(model_path) - - embeddings1 = model.encode(sentences_1, convert_to_tensor=True) - embeddings2 = model.encode(sentences_2, convert_to_tensor=True) - - cosine_scores = util.cos_sim(embeddings1, embeddings2) - - tp_cnt = 0 - fp_cnt = 0 - tn_cnt = 0 - fn_cnt = 0 - - for i in range(len(sentences_1)): - score = cosine_scores[i][i] - - if score > 0.3: - if labels[i] == 1: - tp_cnt += 1 - else: - fp_cnt += 1 - # print(f'FP: {example}, {pred}, {name}, {quote}') - else: - if labels[i] == 0: - tn_cnt += 1 - else: - fn_cnt += 1 - # print(f'FN: {example}, {pred}, {name}, {quote}') - - precision = tp_cnt / (tp_cnt + fp_cnt) - recall = tp_cnt / (tp_cnt + fn_cnt) - - f_score = 0 - if precision + recall > 0: - f_score = (2 * precision * recall) / (precision + recall) - - print(f'TP: {tp_cnt}, FP: {fp_cnt}, TN: {tn_cnt}, FN: {fn_cnt}, Precision: {precision}, Recall: {recall},' - f' F-Score: {f_score}') - - -def main(): - argument_parser = ArgumentParser() - - 
argument_parser.add_argument('--test-file-path', dest='test_file_path', - help='') - argument_parser.add_argument('--model-path', dest='model_path', - help='') - - args = argument_parser.parse_args() - test_file_path = args.test_file_path - model_path = args.model_path - # model_path = 'deutsche-telekom/gbert-large-paraphrase-cosine' - - test(test_file_path, model_path) - - -if __name__ == '__main__': - main() diff --git a/indiquo/testing/__init__.py b/indiquo/testing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/indiquo/training/similarity/TrainSimilarity.py b/indiquo/training/similarity/TrainSimilarity.py deleted file mode 100644 index e613e9b..0000000 --- a/indiquo/training/similarity/TrainSimilarity.py +++ /dev/null @@ -1,71 +0,0 @@ -import math -from argparse import ArgumentParser -from os.path import join - -from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation -from sentence_transformers.losses import BatchHardTripletLossDistanceFunction -from torch.utils.data import DataLoader -import csv - -from datetime import datetime - - -def main(): - argument_parser = ArgumentParser() - - argument_parser.add_argument('input_path', nargs=1, metavar='input-path', - help="Path to the input folder") - argument_parser.add_argument('output_path', nargs=1, metavar='output-path', - help="Path to the input folder") - argument_parser.add_argument('model', nargs=1, metavar='model', - help="") - - args = argument_parser.parse_args() - input_path = args.input_path[0] - output_path = args.output_path[0] - model_name = args.model[0] - - model_name_repl = model_name.replace('/', '') - - now = datetime.now() - date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S') - date_time_string += f'_{model_name_repl}' - output_path = join(output_path, date_time_string) - - train_examples = [] - - with open(join(input_path, 'train_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - ie 
= InputExample(texts=[row[0], row[1], row[2]]) - train_examples.append(ie) - - val_anchor = [] - val_positive = [] - val_negative = [] - with open(join(input_path, 'val_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - val_anchor.append(row[0]) - val_positive.append(row[1]) - val_negative.append(row[2]) - - model = SentenceTransformer(model_name) - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8) - - train_loss = losses.TripletLoss(model=model) - - evaluator = evaluation.TripletEvaluator(val_anchor, val_positive, val_negative) - - num_epochs = 3 - warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up - - # Tune the model - model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, - evaluator=evaluator, evaluation_steps=10000, output_path=output_path) - - -if __name__ == '__main__': - main() diff --git a/indiquo/training/similarity/TrainSimilarityCombined.py b/indiquo/training/similarity/TrainSimilarityCombined.py deleted file mode 100644 index 70fcbfd..0000000 --- a/indiquo/training/similarity/TrainSimilarityCombined.py +++ /dev/null @@ -1,72 +0,0 @@ -import math -from argparse import ArgumentParser -from os.path import join - -from sentence_transformers import SentenceTransformer, InputExample, losses -from torch.utils.data import DataLoader -import csv - -from datetime import datetime - - -def main(): - argument_parser = ArgumentParser() - - argument_parser.add_argument('candidate_train_path', nargs=1, metavar='candidate-train-path', - help="Path to the input folder") - argument_parser.add_argument('scene_train_path', nargs=1, metavar='scene-train-path', - help="Path to the input folder") - argument_parser.add_argument('output_path', nargs=1, metavar='output-path', - help="Path to the input folder") - argument_parser.add_argument('model', nargs=1, metavar='model', - help="") - - args = 
argument_parser.parse_args() - candidate_path = args.candidate_train_path[0] - scene_path = args.scene_train_path[0] - output_path = args.output_path[0] - model_name = args.model[0] - - model_name_repl = model_name.replace('/', '') - - now = datetime.now() - date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S') - date_time_string += f'_{model_name_repl}' - output_path = join(output_path, date_time_string) - - candidate_examples = [] - scene_examples = [] - - model = SentenceTransformer(model_name) - - with open(join(candidate_path, 'train_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - ie = InputExample(texts=[row[0], row[1]], label=int(row[2])) - candidate_examples.append(ie) - - candidate_train_dataloader = DataLoader(candidate_examples, shuffle=True, batch_size=16) - candidate_train_loss = losses.OnlineContrastiveLoss(model=model) - - with open(join(scene_path, 'train_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - ie = InputExample(texts=[row[0], row[1]]) - scene_examples.append(ie) - - scene_train_dataloader = DataLoader(scene_examples, shuffle=True, batch_size=16) - scene_train_loss = losses.MultipleNegativesRankingLoss(model=model) - - num_epochs = 3 - warmup_steps = math.ceil(len(scene_train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up - - # Tune the model - model.fit(train_objectives=[(candidate_train_dataloader, candidate_train_loss), - (scene_train_dataloader, scene_train_loss)], epochs=num_epochs, - warmup_steps=warmup_steps, output_path=output_path) - - -if __name__ == '__main__': - main() diff --git a/indiquo/training/similarity/TrainSimilarityContrastive.py b/indiquo/training/similarity/TrainSimilarityContrastive.py deleted file mode 100644 index 0cf4c33..0000000 --- a/indiquo/training/similarity/TrainSimilarityContrastive.py +++ /dev/null @@ -1,44 +0,0 @@ -import math -from argparse import ArgumentParser -from os.path 
import join - -from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation -from torch.utils.data import DataLoader -import csv - -from datetime import datetime - - -def train(): - - model_name_repl = model_name.replace('/', '') - - now = datetime.now() - date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S') - date_time_string += f'_{model_name_repl}' - output_path = join(output_path, date_time_string) - - train_examples = [] - - with open(join(input_path, 'train_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - ie = InputExample(texts=[row[0], row[1]], label=int(row[2])) - train_examples.append(ie) - - model = SentenceTransformer(model_name) - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) - train_loss = losses.ContrastiveLoss(model=model) - - # evaluator = evaluation.TripletEvaluator(val_anchor, val_positive, val_negative) - BinaryClassificationEvaluator - - num_epochs = 5 - warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up - model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, - output_path=output_path) - - -if __name__ == '__main__': - main() diff --git a/indiquo/training/similarity/TrainSimilarityDrama.py b/indiquo/training/similarity/TrainSimilarityDrama.py deleted file mode 100644 index 46737f9..0000000 --- a/indiquo/training/similarity/TrainSimilarityDrama.py +++ /dev/null @@ -1,56 +0,0 @@ -import math -from argparse import ArgumentParser -from os.path import join - -from sentence_transformers import SentenceTransformer, InputExample, losses -from torch.utils.data import DataLoader -import csv - -from datetime import datetime - - -def main(): - argument_parser = ArgumentParser() - - argument_parser.add_argument('input_path', nargs=1, metavar='input-path', - help="Path to the input folder") - argument_parser.add_argument('output_path', nargs=1, 
metavar='output-path', - help="Path to the input folder") - argument_parser.add_argument('model', nargs=1, metavar='model', - help="") - - args = argument_parser.parse_args() - input_path = args.input_path[0] - output_path = args.output_path[0] - model_name = args.model[0] - - model_name_repl = model_name.replace('/', '') - - now = datetime.now() - date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S') - date_time_string += f'_{model_name_repl}' - output_path = join(output_path, date_time_string) - - train_examples = [] - - with open(join(input_path, 'train_set.tsv'), 'r') as train_file: - reader = csv.reader(train_file, delimiter='\t') - - for row in reader: - ie = InputExample(texts=[row[0], row[1]]) - train_examples.append(ie) - - model = SentenceTransformer(model_name) - train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16) - train_loss = losses.MultipleNegativesRankingLoss(model=model) - - num_epochs = 3 - warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up - - # Tune the model - model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, - output_path=output_path) - - -if __name__ == '__main__': - main() diff --git a/indiquo/training/similarity/__init__.py b/indiquo/training/similarity/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4f8cf88 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" + +[project] +name = "IndiQuo" +version = "0.0.1" +authors = [ + { name = "Frederik Arnold", email = "frederik.arnold@hu-berlin.de"} +] +description = "" +readme = "README.md" +license = { file="LICENSE" } +requires-python = ">=3.9" +keywords = ["quotation detection", "quotation identification", "indirect citation extraction", + "natural language processing", "nlp", 
"text reuse"] +dependencies = [ + +] + +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] + +[tool.setuptools.packages.find] +where = ["."] +include = ["indiquo*"] + +[project.scripts] +indiquo = "indiquo.cli.IndiQuoCLI:main" + +[project.urls] +"Homepage" = "https://hu.berlin/indiquo" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b49196c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +sentence-transformers~=3.0.1 +dramatist~=0.0.6 +kpcommons~=0.0.3 +pysbd~=0.3.4 +datasets~=2.20.0 +evaluate~=0.4.2 +accelerate~=0.33.0 \ No newline at end of file -- GitLab