From b7deb60ad550afaba6c84633b56a812b9de7839b Mon Sep 17 00:00:00 2001
From: Frederik Arnold <frederik.arnold@hu-berlin.de>
Date: Fri, 26 Jul 2024 08:56:25 +0200
Subject: [PATCH] Cleanup, add readme

---
 LICENSE                                       | 201 ++++++++++++++++++
 README.md                                     | 130 +++++++++++
 indiquo/cli/IndiQuoCLI.py                     | 107 +++++-----
 indiquo/core/CandidatePredictor.py            |  38 +---
 indiquo/core/CandidatePredictorRW.py          |   2 +-
 indiquo/core/CandidatePredictorST.py          |  30 +--
 indiquo/testing/TestSimilarity.py             | 119 -----------
 indiquo/testing/__init__.py                   |   0
 .../training/similarity/TrainSimilarity.py    |  71 -------
 .../similarity/TrainSimilarityCombined.py     |  72 -------
 .../similarity/TrainSimilarityContrastive.py  |  44 ----
 .../similarity/TrainSimilarityDrama.py        |  56 -----
 indiquo/training/similarity/__init__.py       |   0
 pyproject.toml                                |  38 ++++
 requirements.txt                              |   7 +
 15 files changed, 443 insertions(+), 472 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 README.md
 delete mode 100644 indiquo/testing/TestSimilarity.py
 delete mode 100644 indiquo/testing/__init__.py
 delete mode 100644 indiquo/training/similarity/TrainSimilarity.py
 delete mode 100644 indiquo/training/similarity/TrainSimilarityCombined.py
 delete mode 100644 indiquo/training/similarity/TrainSimilarityContrastive.py
 delete mode 100644 indiquo/training/similarity/TrainSimilarityDrama.py
 delete mode 100644 indiquo/training/similarity/__init__.py
 create mode 100644 pyproject.toml
 create mode 100644 requirements.txt

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..cad06b9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 Schlüsselstellen
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bd106ae
--- /dev/null
+++ b/README.md
@@ -0,0 +1,130 @@
+# Readme
+This repository contains the tool `IndiQuo` for the detection of indirect quotations (summaries and paraphrases)
+between dramas from [DraCor](https://dracor.org) and scholarly works which interpret the drama.
+
+## Installation
+
+Check out this repository and then run:
+
+~~~
+pip install -r requirements.txt
+~~~
+
+### Dependencies
+The dependencies to run the [Redewiedergabe Tagger](https://github.com/redewiedergabe/tagger) are not installed by
+default, as this can be a [tricky process](https://github.com/redewiedergabe/tagger/issues/4). The tagger is only
+used as a baseline, not for our approach, and is therefore not needed in most cases.
+
+## Usage
+The following sections describe how to use IndiQuo on the command line.
+
+### Training
+The library supports training of custom models for candidate identification and scene prediction.
+
+#### Candidate Identification
+
+~~~
+indiquo train candidate
+path_to_train_folder
+path_to_the_output_folder
+--model huggingface_model_name
+~~~
+
+`path_to_train_folder` has to contain two files named `train_set.tsv` and `val_set.tsv` which contain one example per
+line in the form of a string and a label, tab separated, for example:
+
+~~~
+Some positive example	1
+Some negative example	0
+~~~
+
+`huggingface_model_name` is the name of the model on huggingface to use for fine-tuning; `deepset/gbert-large` is used
+as the default.
+
+#### Scene Prediction
+
+~~~
+indiquo train scene
+path_to_train_folder
+path_to_the_output_folder
+--model huggingface_model_name
+~~~
+
+`path_to_train_folder` has to contain two files named `train_set.tsv` and `val_set.tsv` which contain one example per
+line in the form of two strings, a drama excerpt and a corresponding summary, tab separated, for example:
+
+~~~
+Drama excerpt	Summary
+~~~
+
+`huggingface_model_name` is the name of the model on huggingface to use for fine-tuning;
+`deutsche-telekom/gbert-large-paraphrase-cosine` is used as the default.
+
+### Indirect Quotation Identification
+
+To run `IndiQuo` with the default approach, use the following command:
+
+~~~
+indiquo compare path_to_source_xml path_to_target_text path_to_candidate_model path_to_scene_model path_to_output_folder
+~~~
+
+<details>
+<summary>All IndiQuo command line options</summary>
+
+~~~
+usage: indiquo compare [-h] [--approach {st,rw,iq,sum,eval}]
+                       [--add-context | --no-add-context]
+                       [--max-candidate-length MAX_CANDIDATE_LENGTH]
+                       [--summaries-file-path SUMMARIES_FILE_PATH]
+                       source-file-path target-path
+                       candidate-model-folder-path scene-model-folder-path
+                       output-folder-path
+
+positional arguments:
+  source-file-path      Path to the source xml drama file
+  target-path           Path to the target text file or folder
+  candidate-model-folder-path
+                        Path to the candidate model folder
+  scene-model-folder-path
+                        Path to the scene model folder
+  output-folder-path    The output folder path
+
+options:
+  -h, --help            show this help message and exit
+  --approach {st,rw,iq,sum,eval}
+                        The approach to use for candidate prediction
+  --add-context, --no-add-context
+                        If set, candidates are embedded in context up to a
+                        total length of --max-candidate-length (default: True)
+  --max-candidate-length MAX_CANDIDATE_LENGTH
+                        Maximum length in words of a candidate (default: 128)
+  --summaries-file-path SUMMARIES_FILE_PATH
+                        Path to the summaries tsv file. Only used if approach
+                        is set to 'sum'
+~~~
+
+By default, the approach to use is set to `iq`, which is the approach presented in the paper. The approach option can
+be changed to run the baseline models (rw=Redewiedergabe, st=SentenceTransformer), to use a SentenceTransformer with
+summaries (sum), or to only run scene prediction for evaluation purposes (eval).
+
+</details>
+
+The output folder will contain a tsv file for each txt file in the target path. The tsv files have the following
+structure:
+
+The output will look something like this:
+
+~~~
+start   end text        score   scenes
+10      15  some text   0.5     1:1:0.2#2:5:0.5#...
+~~~
+
+The first three columns are the character start and end positions and the text of the quotation in the target text. The
+fourth column is the probability of the positive class, i.e., the candidate is an indirect quotation. The last column
+contains the top 10 source scenes separated by '#' and each part has the following structure: act:scene:probability. 
+
+## Citation
+If you use the code in this repository or base your work on our code, please cite our paper:
+~~~
+TBD
+~~~
\ No newline at end of file
diff --git a/indiquo/cli/IndiQuoCLI.py b/indiquo/cli/IndiQuoCLI.py
index e27aec0..ff15a37 100644
--- a/indiquo/cli/IndiQuoCLI.py
+++ b/indiquo/cli/IndiQuoCLI.py
@@ -18,7 +18,6 @@ try:
 except ModuleNotFoundError:
     pass
 
-from quid.core.Quid import Quid
 from dramatist.core.Dramatist import Dramatist
 from indiquo.core.CandidatePredictorST import CandidatePredictorST
 from indiquo.core.IndiQuo import IndiQuo
@@ -46,22 +45,10 @@ def __train_scene(train_folder_path, output_folder_path, model_name):
 def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_path):
     print(f'Processing {filename} ...')
 
-    # quid_matches = quid.compare(drama.get_text(), target_text)
-
-    # source_text = drama.get_text()
-
-    # short_matches: List[MatchRef] = pro_quo_lm.compare(source_text, target_text, quid_matches)
-    # all_matches = short_matches
-
-    # long_matches = Helper.remove_short_matches(quid_matches, target_text)
-    # all_matches.extend(long_matches)
-    # all_matches = Helper.remove_overlapping_matches(all_matches, target_text)
-
-    # matches = indi_quo.compare(target_text, all_matches)
     matches = indi_quo.compare(target_text)
 
-    with open(join(output_folder_path, f'{filename}.tsv'), "w", encoding='utf-8') as output_file:
-        writer = csv.writer(output_file, delimiter="\t", lineterminator="\n")
+    with open(join(output_folder_path, f'{filename}.tsv'), 'w', encoding='utf-8') as output_file:
+        writer = csv.writer(output_file, delimiter='\t', lineterminator='\n')
         writer.writerow(['start', 'end', 'text', 'score', 'scenes'])
 
         for m in matches:
@@ -78,8 +65,7 @@ def __process_file(indi_quo: IndiQuoBase, filename, target_text, output_folder_p
 
 
 def __run_compare(source_file_path, target_path, candidate_model_path, scene_model_path,
-                  output_folder_path, approach, add_context, summaries_file_path):
-
+                  output_folder_path, approach, add_context, max_candidate_length, summaries_file_path):
     drama_processor = Dramatist()
     drama = drama_processor.from_file(source_file_path)
     sentence_chunker = SentenceChunker(min_length=10, max_length=64, max_sentences=1)
@@ -89,10 +75,11 @@ def __run_compare(source_file_path, target_path, candidate_model_path, scene_mod
             candidate_tokenizer = AutoTokenizer.from_pretrained(candidate_model_path)
             candidate_model = AutoModelForSequenceClassification.from_pretrained(candidate_model_path)
             candidate_predictor = CandidatePredictor(drama, candidate_tokenizer, candidate_model, sentence_chunker,
-                                                     add_context)
+                                                     add_context, max_candidate_length)
         elif approach == 'st':
             candidate_model = SentenceTransformer(candidate_model_path)
-            candidate_predictor = CandidatePredictorST(drama, candidate_model, sentence_chunker, add_context)
+            candidate_predictor = CandidatePredictorST(drama, candidate_model, sentence_chunker, add_context,
+                                                       max_candidate_length)
         elif approach == 'rw':
             candidate_model = SequenceTagger.load(candidate_model_path)
             candidate_predictor = CandidatePredictorRW(candidate_model, sentence_chunker)
@@ -146,6 +133,14 @@ def __run_compare(source_file_path, target_path, candidate_model_path, scene_mod
 
 
 def main(argv=None):
+    train_description = 'This command allows the user to train their own model.'
+
+    train_candidate_description = ''
+    train_candidate_st_description = ''
+    train_scene_description = ''
+
+    compare_description = ''
+
     argument_parser = ArgumentParser(prog='indiquo', description='TBD')
 
     argument_parser.add_argument('--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
@@ -155,61 +150,67 @@ def main(argv=None):
     subparsers_command = argument_parser.add_subparsers(dest='command')
     subparsers_command.required = True
 
-    parser_train = subparsers_command.add_parser('train', help='', description='')
+    parser_train = subparsers_command.add_parser('train', help=train_description, description=train_description)
 
     subparsers_train_model = parser_train.add_subparsers(dest='train_model')
     subparsers_train_model.required = True
 
-    parser_train_candidate = subparsers_train_model.add_parser('candidate', help='', description='')
+    parser_train_candidate = subparsers_train_model.add_parser('candidate', help=train_candidate_description,
+                                                               description=train_candidate_description)
 
     parser_train_candidate.add_argument('train_folder_path', nargs=1, metavar='train-folder-path',
-                                        help='Path to the ')
+                                        help='Path to the folder with training and validation data')
     parser_train_candidate.add_argument('output_folder_path', nargs=1, metavar='output-folder-path',
-                                        help="Path to the input folder")
+                                        help='Path to the output folder of the trained model')
     parser_train_candidate.add_argument('--model', dest='model', default='deepset/gbert-large',
-                                        help="")
+                                        help='Name of the model on huggingface to use as the base model for fine-tuning'
+                                             ' (default: %(default)s)')
 
-    # TODO: rename to candidate_st or similar
-    parser_train_st = subparsers_train_model.add_parser('st', help='', description='')
+    # probably not needed as this did not perform well
+    parser_train_st = subparsers_train_model.add_parser('candidate_st', help=train_candidate_st_description,
+                                                        description=train_candidate_st_description)
 
     parser_train_st.add_argument('train_folder_path', nargs=1, metavar='train-folder-path',
-                                        help='Path to the ')
+                                 help='Path to the folder with training and validation data')
     parser_train_st.add_argument('output_folder_path', nargs=1, metavar='output-folder-path',
-                                        help="Path to the input folder")
+                                 help='Path to the output folder of the trained model')
     parser_train_st.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine',
-                                        help="")
+                                 help='Name of the model on huggingface to use as the base model for fine-tuning'
+                                      ' (default: %(default)s)')
 
-    parser_train_scene = subparsers_train_model.add_parser('scene', help='', description='')
+    parser_train_scene = subparsers_train_model.add_parser('scene', help=train_scene_description,
+                                                           description=train_scene_description)
 
     parser_train_scene.add_argument('train_folder_path', nargs=1, metavar='train-folder-path',
-                                 help='Path to the ')
+                                    help='Path to the folder with training and validation data')
     parser_train_scene.add_argument('output_folder_path', nargs=1, metavar='output-folder-path',
-                                 help="Path to the input folder")
+                                    help='Path to the input folder')
     parser_train_scene.add_argument('--model', dest='model', default='deutsche-telekom/gbert-large-paraphrase-cosine',
-                                 help="")
+                                    help='Name of the model on huggingface to use as the base model for fine-tuning'
+                                         ' (default: %(default)s)')
 
-    parser_compare = subparsers_command.add_parser('compare', help='', description='')
+    parser_compare = subparsers_command.add_parser('compare', help=compare_description,
+                                                   description=compare_description)
 
-    parser_compare.add_argument("source_file_path", nargs=1, metavar="source-file-path",
-                                help="Path to the source xml file")
+    parser_compare.add_argument('source_file_path', nargs=1, metavar='source-file-path',
+                                help='Path to the source xml drama file')
     parser_compare.add_argument('target_path', nargs=1, metavar='target-path',
                                 help='Path to the target text file or folder')
     parser_compare.add_argument('candidate_model_folder_path', nargs=1, metavar='candidate-model-folder-path',
-                                help='Path to the similarity model folder')
+                                help='Path to the candidate model folder')
     parser_compare.add_argument('scene_model_folder_path', nargs=1, metavar='scene-model-folder-path',
-                                help='Path to the similarity model folder')
-    parser_compare.add_argument('--output-folder-path', dest="output_folder_path",
-                                help="The output folder path. If this option is set the output will be saved to a file"
-                                     " created in the specified folder")
-    parser_compare.add_argument('--direct-quotes-path', dest='direct_quotes_path',
-                                help='Path to the file or folder with direct quotes. If this option is not set, then'
-                                     ' ProQuoLM is used to find direct quotes.')
+                                help='Path to the scene model folder')
+    parser_compare.add_argument('output_folder_path', nargs=1, metavar='output-folder-path',
+                                help='The output folder path')
     parser_compare.add_argument('--approach', choices=['st', 'rw', 'iq', 'sum', 'eval'], dest='approach',
-                                default='iq', help='TBD')
+                                default='iq', help='The approach to use for candidate prediction')
     parser_compare.add_argument('--add-context', dest='add_context', default=True,
-                                action=BooleanOptionalAction, help='')
-    parser_compare.add_argument("--summaries-file-path", dest="summaries_file_path", required=False,
-                                help="Path to the summaries tsv file")
+                                action=BooleanOptionalAction, help='If set, candidates are embedded in context up to'
+                                                                   'a total length of --max-candidate-length')
+    parser_compare.add_argument('--max-candidate-length', dest='max_candidate_length', default=128,
+                                type=int, help='Maximum length in words of a candidate (default: %(default)d)')
+    parser_compare.add_argument('--summaries-file-path', dest='summaries_file_path', required=False,
+                                help='Path to the summaries tsv file. Only used if approach is set to \'sum\'')
 
     args = argument_parser.parse_args(argv)
 
@@ -217,7 +218,7 @@ def main(argv=None):
     logging.getLogger().setLevel(logging.getLevelName(log_level))
 
     if args.command == 'train':
-        if args.train_model == 'candidate' or args.train_model == 'st' or args.train_model == 'scene':
+        if args.train_model == 'candidate' or args.train_model == 'candidate_st' or args.train_model == 'scene':
             train_folder_path = args.train_folder_path[0]
             output_folder_path = args.output_folder_path[0]
             model = args.model
@@ -231,7 +232,7 @@ def main(argv=None):
 
             if args.train_model == 'candidate':
                 __train_candidate(train_folder_path, output_folder_path, model)
-            elif args.train_model == 'st':
+            elif args.train_model == 'candidate_st':
                 __train_candidate_st(train_folder_path, output_folder_path, model)
             elif args.train_model == 'scene':
                 __train_scene(train_folder_path, output_folder_path, model)
@@ -241,10 +242,10 @@ def main(argv=None):
         target_path = args.target_path[0]
         candidate_model_folder_path = args.candidate_model_folder_path[0]
         scene_model_folder_path = args.scene_model_folder_path[0]
-        output_folder_path = args.output_folder_path
-        # direct_quotes_path = args.direct_quotes_path
+        output_folder_path = args.output_folder_path[0]
         approach = args.approach
         add_context = args.add_context
+        max_candidate_length = args.max_candidate_length
 
         summaries_file_path = None
 
@@ -257,7 +258,7 @@ def main(argv=None):
         Path(output_folder_path).mkdir(parents=True, exist_ok=True)
 
         __run_compare(source_file_path, target_path, candidate_model_folder_path, scene_model_folder_path,
-                      output_folder_path, approach, add_context, summaries_file_path)
+                      output_folder_path, approach, add_context, max_candidate_length, summaries_file_path)
 
 
 if __name__ == '__main__':
diff --git a/indiquo/core/CandidatePredictor.py b/indiquo/core/CandidatePredictor.py
index 0efaf07..db123e5 100644
--- a/indiquo/core/CandidatePredictor.py
+++ b/indiquo/core/CandidatePredictor.py
@@ -11,10 +11,8 @@ from kpcommons.Footnote import map_to_real_pos, get_footnote_ranges, remove_foot
 
 # noinspection PyMethodMayBeStatic
 class CandidatePredictor(BasePredictor):
-    MAX_LENGTH = 256
-    SIMILARITY_THRESHOLD = 0.0
 
-    def __init__(self, drama: Drama, tokenizer, model, chunker: BaseChunker, add_context):
+    def __init__(self, drama: Drama, tokenizer, model, chunker: BaseChunker, add_context, max_length):
         self.drama = drama
         self.tokenizer = tokenizer
         self.model = model
@@ -22,16 +20,7 @@ class CandidatePredictor(BasePredictor):
         self.all_text_blocks = []
         self.source_text_blocks = []
         self.add_context = add_context
-
-        # for act_nr, act in enumerate(drama.acts):
-        #     for scene_nr, scene in enumerate(act.scenes):
-        #         text_blocks = scene.get_text_in_blocks(128)
-        #
-        #         for tbt in text_blocks:
-        #             self.all_text_blocks.append((act_nr, scene_nr, tbt.text))
-        #             self.source_text_blocks.append(tbt.text)
-        #
-        # self.source_embeddings = model.encode(self.source_text_blocks, convert_to_tensor=True)
+        self.max_length = max_length
 
     # overriding abstract method
     def get_candidates(self, target_text) -> List[Candidate]:
@@ -46,22 +35,8 @@ class CandidatePredictor(BasePredictor):
             chunk.start = real_start
             chunk.end = real_end
 
-        filtered_chunks = chunks
-        # for chunk in chunks:
-        #     found_match = False
-        #     for dq in direct_quotes:
-        #         overlap_length = Util.calculate_overlap(chunk.start, chunk.end, dq.target_span.start,
-        #                                                 dq.target_span.end)
-        #
-        #         if overlap_length > 0:
-        #             found_match = True
-        #             break
-        #
-        #     if not found_match:
-        #         filtered_chunks.append(chunk)
-
         candidates: List[Candidate] = []
-        for chunk in filtered_chunks:
+        for chunk in chunks:
             if self.add_context:
                 text = self.__add_context(chunk.text, target_text, chunk.start, chunk.end)
             else:
@@ -80,13 +55,12 @@ class CandidatePredictor(BasePredictor):
             logits = self.model(**inputs).logits
             p = torch.nn.functional.softmax(logits, dim=1)
             score = float(p[0, 1])
-            if score > self.SIMILARITY_THRESHOLD:
-                return score
+            return score
 
-        return None
+        # No score threshold is applied any more; the raw score is returned above.
 
     def __add_context(self, quote_text, text, quote_start, quote_end):
-        rest_len = self.MAX_LENGTH - len(quote_text.split())
+        rest_len = self.max_length - len(quote_text.split())
 
         if rest_len < 0:
             raise Exception('No rest len!')
diff --git a/indiquo/core/CandidatePredictorRW.py b/indiquo/core/CandidatePredictorRW.py
index 36d2ce2..077fab9 100644
--- a/indiquo/core/CandidatePredictorRW.py
+++ b/indiquo/core/CandidatePredictorRW.py
@@ -14,7 +14,7 @@ class CandidatePredictorRW(BasePredictor):
         self.chunker = chunker
 
     # overriding abstract method
-    def get_candidates(self, target_text, direct_quotes) -> List[Candidate]:
+    def get_candidates(self, target_text) -> List[Candidate]:
         fn_ranges, fn_ranges_with_offset = get_footnote_ranges(target_text)
         target_text_wo_fn: str = remove_footnotes(target_text)
         chunks = self.chunker.chunk(target_text_wo_fn)
diff --git a/indiquo/core/CandidatePredictorST.py b/indiquo/core/CandidatePredictorST.py
index 45a368c..416bd4e 100644
--- a/indiquo/core/CandidatePredictorST.py
+++ b/indiquo/core/CandidatePredictorST.py
@@ -11,16 +11,15 @@ import re
 
 # noinspection PyMethodMayBeStatic
 class CandidatePredictorST(BasePredictor):
-    MAX_LENGTH = 128
-    SIMILARITY_THRESHOLD = 0.0
 
-    def __init__(self, drama: Drama, model, chunker: BaseChunker, add_context):
+    def __init__(self, drama: Drama, model, chunker: BaseChunker, add_context, max_length):
         self.drama = drama
         self.model = model
         self.chunker = chunker
         self.all_text_blocks = []
         self.source_text_blocks = []
         self.add_context = add_context
+        self.max_length = max_length
 
         for act_nr, act in enumerate(drama.acts):
             for scene_nr, scene in enumerate(act.scenes):
@@ -33,7 +32,7 @@ class CandidatePredictorST(BasePredictor):
         self.source_embeddings = model.encode(self.source_text_blocks, convert_to_tensor=True)
 
     # overriding abstract method
-    def get_candidates(self, target_text, direct_quotes) -> List[Candidate]:
+    def get_candidates(self, target_text) -> List[Candidate]:
         fn_ranges, fn_ranges_with_offset = get_footnote_ranges(target_text)
         target_text_wo_fn: str = remove_footnotes(target_text)
         chunks = self.chunker.chunk(target_text_wo_fn)
@@ -45,22 +44,8 @@ class CandidatePredictorST(BasePredictor):
             chunk.start = real_start
             chunk.end = real_end
 
-        filtered_chunks = chunks
-        # for chunk in chunks:
-        #     found_match = False
-        #     for dq in direct_quotes:
-        #         overlap_length = Util.calculate_overlap(chunk.start, chunk.end, dq.target_span.start,
-        #                                                 dq.target_span.end)
-        #
-        #         if overlap_length > 0:
-        #             found_match = True
-        #             break
-        #
-        #     if not found_match:
-        #         filtered_chunks.append(chunk)
-
         candidates: List[Candidate] = []
-        for chunk in filtered_chunks:
+        for chunk in chunks:
 
             if self.add_context:
                 text = self.__add_context(chunk.text, target_text, chunk.start, chunk.end)
@@ -87,13 +72,10 @@ class CandidatePredictorST(BasePredictor):
             start_line, end_line = self.drama.acts[act_nr].scenes[scene_nr].get_line_range()
             scene_scores.append((start_line, end_line, score, text))
 
-        if scene_scores[0][2] >= self.SIMILARITY_THRESHOLD:
-            return scene_scores[0][2]
-
-        return None
+        return scene_scores[0][2]
 
     def __add_context(self, quote_text, text, quote_start, quote_end):
-        rest_len = self.MAX_LENGTH - len(quote_text.split())
+        rest_len = self.max_length - len(quote_text.split())
 
         if rest_len < 0:
             raise Exception('No rest len!')
diff --git a/indiquo/testing/TestSimilarity.py b/indiquo/testing/TestSimilarity.py
deleted file mode 100644
index d98e094..0000000
--- a/indiquo/testing/TestSimilarity.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import csv
-from argparse import ArgumentParser
-from sentence_transformers import SentenceTransformer, util
-import random
-
-
-def load_test_items_scene(test_file_path):
-    test_items = []
-
-    all_validation_elements = []
-
-    with open(test_file_path, 'r') as test_file:
-        reader = csv.reader(test_file, delimiter='\t')
-        # skip first row (header)
-        next(reader, None)
-
-        for row in reader:
-            all_validation_elements.append((row[0], row[1]))
-
-    for pos, val_item in enumerate(all_validation_elements):
-        test_items.append((val_item[0], val_item[1], 1))
-        poses = random.sample(range(0, len(all_validation_elements)), 2)
-
-        if poses[0] != pos:
-            other = all_validation_elements[poses[0]]
-        else:
-            other = all_validation_elements[poses[1]]
-
-        test_items.append((val_item[0], other[1], 0))
-
-    return test_items
-
-
-def load_test_items_candidate(test_file_path):
-    test_items = []
-
-    with open(test_file_path, 'r') as gold_file:
-        reader = csv.reader(gold_file, delimiter='\t')
-        # skip first row (header)
-        next(reader, None)
-
-        for row in reader:
-            text_1 = row[0]
-            text_2 = row[1]
-            label = int(row[2])
-            test_items.append((text_1, text_2, label))
-
-    return test_items
-
-
-def test(input_path, model_path):
-    test_items = load_test_items_scene(input_path)
-
-    sentences_1 = []
-    sentences_2 = []
-    labels = []
-
-    for ti in test_items[:50]:
-        sentences_1.append(ti[0])
-        sentences_2.append(ti[1])
-        labels.append(ti[2])
-
-    model = SentenceTransformer(model_path)
-
-    embeddings1 = model.encode(sentences_1, convert_to_tensor=True)
-    embeddings2 = model.encode(sentences_2, convert_to_tensor=True)
-
-    cosine_scores = util.cos_sim(embeddings1, embeddings2)
-
-    tp_cnt = 0
-    fp_cnt = 0
-    tn_cnt = 0
-    fn_cnt = 0
-
-    for i in range(len(sentences_1)):
-        score = cosine_scores[i][i]
-
-        if score > 0.3:
-            if labels[i] == 1:
-                tp_cnt += 1
-            else:
-                fp_cnt += 1
-                # print(f'FP: {example}, {pred}, {name}, {quote}')
-        else:
-            if labels[i] == 0:
-                tn_cnt += 1
-            else:
-                fn_cnt += 1
-                # print(f'FN: {example}, {pred}, {name}, {quote}')
-
-    precision = tp_cnt / (tp_cnt + fp_cnt)
-    recall = tp_cnt / (tp_cnt + fn_cnt)
-
-    f_score = 0
-    if precision + recall > 0:
-        f_score = (2 * precision * recall) / (precision + recall)
-
-    print(f'TP: {tp_cnt}, FP: {fp_cnt}, TN: {tn_cnt}, FN: {fn_cnt}, Precision: {precision}, Recall: {recall},'
-          f' F-Score: {f_score}')
-
-
-def main():
-    argument_parser = ArgumentParser()
-
-    argument_parser.add_argument('--test-file-path', dest='test_file_path',
-                                 help='')
-    argument_parser.add_argument('--model-path', dest='model_path',
-                                 help='')
-
-    args = argument_parser.parse_args()
-    test_file_path = args.test_file_path
-    model_path = args.model_path
-    # model_path = 'deutsche-telekom/gbert-large-paraphrase-cosine'
-
-    test(test_file_path, model_path)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/indiquo/testing/__init__.py b/indiquo/testing/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/indiquo/training/similarity/TrainSimilarity.py b/indiquo/training/similarity/TrainSimilarity.py
deleted file mode 100644
index e613e9b..0000000
--- a/indiquo/training/similarity/TrainSimilarity.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import math
-from argparse import ArgumentParser
-from os.path import join
-
-from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
-from sentence_transformers.losses import BatchHardTripletLossDistanceFunction
-from torch.utils.data import DataLoader
-import csv
-
-from datetime import datetime
-
-
-def main():
-    argument_parser = ArgumentParser()
-
-    argument_parser.add_argument('input_path', nargs=1, metavar='input-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('output_path', nargs=1, metavar='output-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('model', nargs=1, metavar='model',
-                                 help="")
-
-    args = argument_parser.parse_args()
-    input_path = args.input_path[0]
-    output_path = args.output_path[0]
-    model_name = args.model[0]
-
-    model_name_repl = model_name.replace('/', '')
-
-    now = datetime.now()
-    date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S')
-    date_time_string += f'_{model_name_repl}'
-    output_path = join(output_path, date_time_string)
-
-    train_examples = []
-
-    with open(join(input_path, 'train_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            ie = InputExample(texts=[row[0], row[1], row[2]])
-            train_examples.append(ie)
-
-    val_anchor = []
-    val_positive = []
-    val_negative = []
-    with open(join(input_path, 'val_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            val_anchor.append(row[0])
-            val_positive.append(row[1])
-            val_negative.append(row[2])
-
-    model = SentenceTransformer(model_name)
-    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
-
-    train_loss = losses.TripletLoss(model=model)
-
-    evaluator = evaluation.TripletEvaluator(val_anchor, val_positive, val_negative)
-
-    num_epochs = 3
-    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
-
-    # Tune the model
-    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps,
-              evaluator=evaluator, evaluation_steps=10000, output_path=output_path)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/indiquo/training/similarity/TrainSimilarityCombined.py b/indiquo/training/similarity/TrainSimilarityCombined.py
deleted file mode 100644
index 70fcbfd..0000000
--- a/indiquo/training/similarity/TrainSimilarityCombined.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import math
-from argparse import ArgumentParser
-from os.path import join
-
-from sentence_transformers import SentenceTransformer, InputExample, losses
-from torch.utils.data import DataLoader
-import csv
-
-from datetime import datetime
-
-
-def main():
-    argument_parser = ArgumentParser()
-
-    argument_parser.add_argument('candidate_train_path', nargs=1, metavar='candidate-train-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('scene_train_path', nargs=1, metavar='scene-train-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('output_path', nargs=1, metavar='output-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('model', nargs=1, metavar='model',
-                                 help="")
-
-    args = argument_parser.parse_args()
-    candidate_path = args.candidate_train_path[0]
-    scene_path = args.scene_train_path[0]
-    output_path = args.output_path[0]
-    model_name = args.model[0]
-
-    model_name_repl = model_name.replace('/', '')
-
-    now = datetime.now()
-    date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S')
-    date_time_string += f'_{model_name_repl}'
-    output_path = join(output_path, date_time_string)
-
-    candidate_examples = []
-    scene_examples = []
-
-    model = SentenceTransformer(model_name)
-
-    with open(join(candidate_path, 'train_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            ie = InputExample(texts=[row[0], row[1]], label=int(row[2]))
-            candidate_examples.append(ie)
-
-    candidate_train_dataloader = DataLoader(candidate_examples, shuffle=True, batch_size=16)
-    candidate_train_loss = losses.OnlineContrastiveLoss(model=model)
-
-    with open(join(scene_path, 'train_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            ie = InputExample(texts=[row[0], row[1]])
-            scene_examples.append(ie)
-
-    scene_train_dataloader = DataLoader(scene_examples, shuffle=True, batch_size=16)
-    scene_train_loss = losses.MultipleNegativesRankingLoss(model=model)
-
-    num_epochs = 3
-    warmup_steps = math.ceil(len(scene_train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
-
-    # Tune the model
-    model.fit(train_objectives=[(candidate_train_dataloader, candidate_train_loss),
-                                (scene_train_dataloader, scene_train_loss)], epochs=num_epochs,
-              warmup_steps=warmup_steps, output_path=output_path)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/indiquo/training/similarity/TrainSimilarityContrastive.py b/indiquo/training/similarity/TrainSimilarityContrastive.py
deleted file mode 100644
index 0cf4c33..0000000
--- a/indiquo/training/similarity/TrainSimilarityContrastive.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import math
-from argparse import ArgumentParser
-from os.path import join
-
-from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
-from torch.utils.data import DataLoader
-import csv
-
-from datetime import datetime
-
-
-def train():
-
-    model_name_repl = model_name.replace('/', '')
-
-    now = datetime.now()
-    date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S')
-    date_time_string += f'_{model_name_repl}'
-    output_path = join(output_path, date_time_string)
-
-    train_examples = []
-
-    with open(join(input_path, 'train_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            ie = InputExample(texts=[row[0], row[1]], label=int(row[2]))
-            train_examples.append(ie)
-
-    model = SentenceTransformer(model_name)
-    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
-    train_loss = losses.ContrastiveLoss(model=model)
-
-    # evaluator = evaluation.TripletEvaluator(val_anchor, val_positive, val_negative)
-    BinaryClassificationEvaluator
-
-    num_epochs = 5
-    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
-    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps,
-              output_path=output_path)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/indiquo/training/similarity/TrainSimilarityDrama.py b/indiquo/training/similarity/TrainSimilarityDrama.py
deleted file mode 100644
index 46737f9..0000000
--- a/indiquo/training/similarity/TrainSimilarityDrama.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import math
-from argparse import ArgumentParser
-from os.path import join
-
-from sentence_transformers import SentenceTransformer, InputExample, losses
-from torch.utils.data import DataLoader
-import csv
-
-from datetime import datetime
-
-
-def main():
-    argument_parser = ArgumentParser()
-
-    argument_parser.add_argument('input_path', nargs=1, metavar='input-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('output_path', nargs=1, metavar='output-path',
-                                 help="Path to the input folder")
-    argument_parser.add_argument('model', nargs=1, metavar='model',
-                                 help="")
-
-    args = argument_parser.parse_args()
-    input_path = args.input_path[0]
-    output_path = args.output_path[0]
-    model_name = args.model[0]
-
-    model_name_repl = model_name.replace('/', '')
-
-    now = datetime.now()
-    date_time_string = now.strftime('%Y_%m_%d_%H_%M_%S')
-    date_time_string += f'_{model_name_repl}'
-    output_path = join(output_path, date_time_string)
-
-    train_examples = []
-
-    with open(join(input_path, 'train_set.tsv'), 'r') as train_file:
-        reader = csv.reader(train_file, delimiter='\t')
-
-        for row in reader:
-            ie = InputExample(texts=[row[0], row[1]])
-            train_examples.append(ie)
-
-    model = SentenceTransformer(model_name)
-    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
-    train_loss = losses.MultipleNegativesRankingLoss(model=model)
-
-    num_epochs = 3
-    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
-
-    # Tune the model
-    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps,
-              output_path=output_path)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/indiquo/training/similarity/__init__.py b/indiquo/training/similarity/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4f8cf88
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "IndiQuo"
+version = "0.0.1"
+authors = [
+    { name = "Frederik Arnold", email = "frederik.arnold@hu-berlin.de"}
+]
+description = ""
+readme = "README.md"
+license = { file="LICENSE" }
+requires-python = ">=3.9"
+keywords = ["quotation detection", "quotation identification", "indirect citation extraction",
+    "natural language processing", "nlp", "text reuse"]
+dependencies = [
+
+]
+
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["indiquo*"]
+
+[project.scripts]
+indiquo = "indiquo.cli.IndiQuoCLI:main"
+
+[project.urls]
+"Homepage" = "https://hu.berlin/indiquo"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b49196c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+sentence-transformers~=3.0.1
+dramatist~=0.0.6
+kpcommons~=0.0.3
+pysbd~=0.3.4
+datasets~=2.20.0
+evaluate~=0.4.2
+accelerate~=0.33.0
\ No newline at end of file
-- 
GitLab