Update documentation

8ab86434 · Frederik Arnold · 5d1c406d · 8ab86434 · 8ab86434 · 8ab86434
Commit 8ab86434 authored 6 days ago by Frederik Arnold
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ as the default.
 #### Scene Prediction

 ~~~
-indiquo train candidate
+indiquo train scene
 path_to_train_folder
 path_to_the_output_folder
 hugginface_model_name
@@ -59,51 +59,42 @@ Drama excerpt	Summary

 ### Indirect Quotation Identification

-To run `IndiQuo` with the default model, use the following command:
+To run `IndiQuo` inference with the default models, use the following command:

 ~~~
-indiquo compare path_to_source_text path_to_target_text --text --output-type text
+indiquo compare full path_to_drama_xml path_to_target_text output_path
 ~~~

 <details>
 <summary>All IndiQuo command line options</summary>

 ~~~
-usage: indiquo compare [-h] [--approach {st,rw,iq,sum,eval}]
-                       [--add-context | --no-add-context]
-                       [--max-candidate-length MAX_CANDIDATE_LENGTH]
-                       [--summaries-file-path SUMMARIES_FILE_PATH]
-                       source-file-path target-path
-                       candidate-model-folder-path scene-model-folder-path
-                       output-folder-path
+usage: indiquo compare full [-h] [--add-context | --no-add-context]
+                            [--max-candidate-length MAX_CANDIDATE_LENGTH]
+                            source-file-path target-path candidate-model
+                            scene-model output-folder-path
+
+Identify candidates and corresponding scenes.

 positional arguments:
  source-file-path      Path to the source xml drama file
  target-path           Path to the target text file or folder
-  candidate-model-folder-path
-                        Path to the candidate model folder
-  scene-model-folder-path
-                        Path to the scene model folder
-  output-folder-path    The output folder path
+  candidate-model       Name of the model to load from Hugging Face or path to
+                        the model folder (default: Fredr0id/indiquo-
+                        candidate).
+  scene-model           Name of the model to load from Hugging Face or path to
+                        the model folder (default: Fredr0id/indiquo-scene).
+  output-folder-path    The output folder path.

 options:
  -h, --help            show this help message and exit
-  --approach {st,rw,iq,sum,eval}
-                        The approach to use for candidate prediction
  --add-context, --no-add-context
-                        If set, candidates are embedded in context up toa
-                        total length of --max-candidate-length (default: True)
+                        If set, candidates are embedded in context up to a
+                        total length of --max-candidate-length
  --max-candidate-length MAX_CANDIDATE_LENGTH
                        Maximum length in words of a candidate (default: 128)
-  --summaries-file-path SUMMARIES_FILE_PATH
-                        Path to the summaries tsv file. Only used if approach
-                        is set to 'sum'
 ~~~

-By default, the approach to use is set to `iq` which is the approach presented in the paper. The approach option can
-be changed to run the base models (rw=rederwiedergabe, st=SentenceTransformer) or to use a SentenceTransformer with
-summaries (sum) or only run scene prediction for evaluation purposes (eval).
-
 </details>

 The output folder will contain a tsv file for each txt file in the target path. The tsv files have the following
@@ -120,6 +111,13 @@ The first three columns are the character start and end positions and the text o
 fourth column is the probability of the positive class, i.e., the candidate is an indirect quotation. The last column
 contains the top 10 source scenes separated by '#' and each part has the following structure: act:scene:probability. 

+### Baselines and reproduction
+
+It is possible to only run the candidate classification step with the command `compare candidate`. With the option
+`--model-type` it is possible to run the base models (rw=rederwiedergabe, st=SentenceTransformer).
+
+With the command `compare sum` a SentenceTransformer with summaries can be used.
+
 ## Citation
 If you use the code in repository or base your work on our code, please cite our paper:
 ~~~

--- a/indiquo/cli/IndiQuoCLI.py
+++ b/indiquo/cli/IndiQuoCLI.py
@@ -143,15 +143,24 @@ def __run_compare(compare_approach: str, model_type: str, source_file_path: str,


 def main(argv=None):
-    train_description = 'This command allows the user to train their own model.'
+    indiquo_description = ("IndiQuo is a tool for the detection of indirect quotations (summaries and paraphrases)"
+                           " between dramas and scholarly works.")

-    train_candidate_description = ''
-    train_candidate_st_description = ''
-    train_scene_description = ''
+    train_description = 'This command allows the user to train a custom model.'

-    compare_description = ''
+    train_candidate_description = 'Train a custom candidate classification model.'
+    train_candidate_st_description = 'Train a custom SentenceTransformer model.'
+    train_scene_description = 'Train a custom scene identification model.'

-    argument_parser = ArgumentParser(prog='indiquo', description='TBD')
+    compare_description = ('This command allows the user to run inference and execute different functionality which is'
+                           ' specified in a subcommand.')
+
+    compare_candidate_description = 'Identify candidates for indirect quotations.'
+    compare_scene_description = 'Identify the scene of indirect quotations.'
+    compare_full_description = 'Identify candidates and corresponding scenes.'
+    compare_sum_description = 'Use summaries to identify indirect quotations.'
+
+    argument_parser = ArgumentParser(prog='indiquo', description=indiquo_description)

    argument_parser.add_argument('--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                                                           'CRITICAL'],
@@ -212,48 +221,48 @@ def main(argv=None):
    cp_candidate_full = argparse.ArgumentParser(add_help=False)
    cp_candidate_full.add_argument('--add-context', dest='add_context', default=True,
                                action=BooleanOptionalAction, help='If set, candidates are embedded in context up to'
-                                                                   'a total length of --max-candidate-length')
+                                                                   ' a total length of --max-candidate-length')
    cp_candidate_full.add_argument('--max-candidate-length', dest='max_candidate_length', default=128,
                                type=int, help='Maximum length in words of a candidate (default: %(default)d)')

    cp_candidate_model = argparse.ArgumentParser(add_help=False)
-    cp_candidate_model.add_argument('candidate_model_folder_path', metavar='candidate-model-folder-path',
-                                help='Path to the candidate model folder')
+    cp_candidate_model.add_argument('candidate_model', metavar='candidate-model', default='Fredr0id/indiquo-candidate',
+                                help='Name of the model to load from Hugging Face or path to the model folder (default: %(default)s).')
    cp_scene_model = argparse.ArgumentParser(add_help=False)
-    cp_scene_model.add_argument('scene_model_folder_path', metavar='scene-model-folder-path',
-                                help='Path to the scene model folder')
+    cp_scene_model.add_argument('scene_model', metavar='scene-model', default='Fredr0id/indiquo-scene',
+                                help='Name of the model to load from Hugging Face or path to the model folder (default: %(default)s).')
    cp_output = argparse.ArgumentParser(add_help=False)
    cp_output.add_argument('output_folder_path', metavar='output-folder-path',
-                                help='The output folder path')
+                                help='The output folder path.')

    parser_compare_candidate = (
        subparsers_compare_approach.add_parser('candidate',
                                               parents=[cp_all, cp_candidate_model, cp_output, cp_candidate_full],
-                                               help='TBD', description='TBD')
+                                               help=compare_candidate_description, description=compare_candidate_description)

    )
    parser_compare_candidate.add_argument('--model-type', choices=['st', 'rw', 'iq'], dest='model_type',
-                              default='iq', help='The model type to use for candidate prediction')
+                              default='iq', help='The model type to use for candidate prediction.')

    parser_compare_scene = (
        subparsers_compare_approach.add_parser('scene',
                                               parents=[cp_all, cp_scene_model, cp_output],
-                                               help='TBD', description='TBD')
+                                               help=compare_scene_description, description=compare_scene_description)
    )

    parser_compare_full = (
        subparsers_compare_approach.add_parser('full',
                                               parents=[cp_all, cp_candidate_model, cp_scene_model, cp_output, cp_candidate_full],
-                                               help='TBD', description='TBD')
+                                               help=compare_full_description, description=compare_full_description)
    )

    parser_compare_sum = (
        subparsers_compare_approach.add_parser('sum',
                                               parents = [cp_all, cp_candidate_model, cp_output],
-                                               help='TBD', description='TBD')
+                                               help=compare_sum_description, description=compare_sum_description)
    )
-    parser_compare_sum.add_argument('--summaries-file-path', dest='summaries_file_path', required=False,
-                                help='Path to the summaries tsv file. Only used if approach is set to \'sum\'')
+    parser_compare_sum.add_argument('--summaries-file-path', dest='summaries_file_path', required=True,
+                                help='Path to the summaries tsv file.')

    args = argument_parser.parse_args(argv)


--- a/indiquo/core/CandidatePredictorST.py
+++ b/indiquo/core/CandidatePredictorST.py
@@ -14,7 +14,6 @@ import re
 class CandidatePredictorST(BaseCandidatePredictor):

    def __init__(self, drama: Drama, model: SentenceTransformer, chunker: BaseChunker, add_context: bool, max_length: int):
-        self.drama = drama
        self.model = model
        self.chunker = chunker
        self.all_text_blocks = []

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ version = "0.1.0"
 authors = [
    { name = "Frederik Arnold", email = "frederik.arnold@hu-berlin.de"}
 ]
-description = ""
+description = "IndiQuo is a tool for the detection of indirect quotations (summaries and paraphrases)."
 readme = "README.md"
 license = { file="LICENSE" }
 requires-python = ">=3.11"