Commit 150d280e authored by Konstantin Schulz's avatar Konstantin Schulz
Browse files

merge source code for frontend and backend

parent 5f8b59f7

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.
- test
stage: test
- docker-compose build
- docker-compose run --rm mcserver bash -c "source ../venv/bin/activate && coverage run --rcfile=.coveragerc && coverage combine && coverage report -m"
coverage: '/^TOTAL.+?(\d+\%)$/'
- python
- pip
- flask
08d2554 (HEAD -> develop, origin/master, origin/develop, origin/HEAD, master) put test Mocks in their own file
5ed1fc3 improved performance of frequency analysis and introduced new unit tests
36bd443 CI: combined badge tasks
151073f CI: updated regex for coverage parsing
8ca7f32 CI: added installation of dependencies
fc520c6 updated CI to work with gitlab-runner
564875d updated CI config
66c787a fixed python version for CI
c0bde97 fixed pip installation on CI server
723c6ea reintroducing CI
f98c223 included text complexity information into the rawTextAPI, while leaving it also as its own, independent API for target querying of certain measures
836c747 added frequency analysis API to provide more support for matching exercises
9d31295 excluded cached json files from source code repository
ad9dd0a Merge branch 'develop' of into develop
3ae5638 optimized database access to prevent idle transactions
993698f Lexical density measure added to the text complexity api
bf45368 Small code fixing on the Text Complexity API
e8f1ea9 fixed a bug in the subgraph API
fe83d3d mark words exercises now have the maximum number of correct solutions in their description
23adc81 raw text basis for CONLLU annotation data is now uniform (whitespace always stripped)
560a5c3 fixed POS counting measure in text complexity
aa5bc49 Merge branch 'develop' of into develop
e046ed2 extended subgraph API for more flexible retrieval of smaller graphs from a larger one
857f1fa fixing code to integrate text complexity api
af033c2 started integrating text complexity with main application
2796afd Merge branch 'develop' of into develop
f7643c5 simplified and cleaner database access, session are now ended on quitting the application
228de51 Merge branch 'master' of
b7a7169 AQL queries
b60ca29 Merge branch 'develop'
314e2f0 fix corrupted exercise data
f4341c8 Merge branch 'develop'
9539ad3 fixed logging and the response type of the vocabulary check API
f382908 Merge branch 'develop'
136b80a adjust rawTextAPI to the new AnnisResponse model
bf33414 Merge branch 'develop'
9c9a059 exercise API now supports querying of single old exercises, in conjunction with an exercise repository (exerciseList API)
89e3352 Merge branch 'develop'
4c42c43 fixed URNs for vocabulary check / subgraph data
dcc8bfd Merge branch 'develop'
5d3102b file API can now generate PDF and DOCX from HTML
b10fdf6 Merge branch 'develop'
f96aeaa refactoring of all imports so the backend can be used without docker more easily
a8245ce Merge branch 'develop'
185c432 reff and texts/annotations are now more precisely identifiable by CTS URN
923f648 Merge branch 'develop'
c06a1ae static exercise API now also gives URN and lemmata, but crawls only solutions instead of the whole exercise
cea2076 Merge branch 'develop'
63ffe39 added file export (PDF, DOCX) for the mark words exercise
33a627b Merge branch 'develop'
2facd85 Merge branch 'develop' of into develop
589f9b6 introducing new exercise type: Mark Words
f7c5c1a Merge branch 'develop' of into develop
7b758c0 Text Complexity Revision and Expansion
7098da5 Merge branch 'develop'
e5f68e8 introducing the Static Exercises API which offers deep links to exercises for various Latin words
e42626f Merge branch 'develop'
0c46b8d fixed logging for docker
544f3fa Merge branch 'develop'
9e547b7 added whitespace before cloze exercise gaps
02e86b3 updated unit tests and fixed a minor issue in the text complexity API
3b0f797 added text complexity initialization
7a86145 Merge branch 'develop' of into develop
039f30a Introducing the Text Complexity API
4c18575 Merge branch 'develop'
fed041a fixed file permissions for learning results
4570e88 Merge branch 'develop'
5311d44 fixes for file API (request parsing locations) and file directories
34dbe7a Merge branch 'develop'
ff83e2f fix for the arguments of the file API
6b15a21 Merge branch 'develop'
fffe4d5 file API now accepts POST requests to save learning results
04695b1 Merge branch 'develop'
c6188b5 added more file permissions to the temporary export files so Docker can handle them more easily
904374e Merge branch 'develop'
6d9911f added UDPipe binary for windows
0cca892 Merge branch 'develop'
50247f1 fixed a bug where exercises for textbooks with vocabulary filters were not loaded due to a mismatch in the solution architecture
9fe94a4 Merge branch 'develop'
796f307 introduced recovery after database connection failures, e.g. due to broken SSL connections
5718f7d Merge branch 'develop'
2bdb366 fix PyCharm debugging for Docker
88d6ae3 Merge branch 'develop'
4f0e4a8 Merge branch 'develop' of into develop
be0005f introduced tags for Moodle XML and fixed database migration with Docker
0c9ca73 Merge branch 'develop' into 'master'
42cdc17 Proper Nouns are now recognized as a match in the Vocabulary API
cb811be KWIC API now builds SVG data from ANNIS graphs
f3d49ed fix a bug in the valid reff cache where it would use the raw CTS URN as a file name instead of a disk URN
7e9eb60 improved docker setup by separating the Corpus Storage Manager from the main application
39dbb37 introducing Docker and a fix to CSM initialization
4d5adf4 improved CSM initialization
628b727 small fix to custom corpus handling
8600b3f introducing H5P API and KWIC API
3c865a5 introducing vocabulary checks for the PROIEL treebank
040a9ba improved vocabulary check
a64226f exported files now do not have additional whitespace before punctuation signs anymore
76c65c5 small fix to vocabulary API where errors occurred in custom corpora if the desired sentence range was bigger than the actual sub-graph
d385130 file API now supports restricting solutions to certain indices (e.g. only known vocabulary)
601c4dc fix to config checks
06ab42e rollback to threads instead of processes
a2d9d7b further fix for migration script
51b9a7f further fixes for migration script
75da7d7 quick fix for the migration script
1e4c1ee hotfix for custom corpus processing
f0054e8 quick fix for selecting sentence sub-graphs of custom corpora
3c2846d small fix for exercise export of custom corpora
7ac8ce6 improved corpus update mechanism
288fbcb added the PROIEL treebank texts and annotations to the corpus
49ee733 fixed graph cache for sub-corpora
ae43147 fixed text generation for sub-graphs
98570dc improved corpus retrieval performance (more caching)
9d8f8d0 rawTextAPI now supports getting smaller parts of CTS corpora, even if those parts do not match the usual citation boundaries
d8854b5 small fixes to CSM detection and the respective corpus handling
5508209 fixed corpus access from inside and outside of CSM
6200323 quick fix for vocabulary filter, added VIVA vocabulary
bc01cfd introducing the vocabulary filter
f005634 fix for missing cache directory
1bf0011 small fix for the custom corpus initialization
23503d2 small fix to file write operations
2e96c92 corpora and validReff are now cached on disk
4aefe3c ANNIS now delivers arbitrary numbers of results all in one go
289e06f fixed root dependency search and text annotations consistency
0398d7b small fix for adding corpora
f4e3678 exercises now support multiple search values for the same phenomenon
bcaa586 Merge branch 'master' into develop
9ecfe75 added sudo and pwd to CI script
764c9ca added tags to the CI config
f7ae850 introducing CI
da03e6c added the matching exercise type and added CONLL annotations to the exercise database model
f13bece backup save
456937f fixed upgrade script for the database
7bacd0a exercises can now be exported to .DOCX, database needs to be adapted for that
f429b7c added query feature for dependencies and lemma search
c580e06 exercises now support AQL queries for all the POS tags according to the UD schema
f1864a0 database initialization is now exclusively performed by the Corpus Storage Manager thread
4b97dca major restructuring to make the app distributable as a wheel with setuptools; VScode config may need some small fixes to file paths
1a70b54 update interval for corpus age check is now customizable
f795634 trying to prevent flush errors during corpus list updates
2094195 database access is now (partially) handled by multithreading instead of multiprocessing, because HTTP requests do not seem to work in a multiprocessing environment
671c5e6 fixed flush errors in the corpus service when accessing the database
9bc2a8a fixed I/O error for trying to print in background processes
ec4362c fixed concurrency issues with the corpus service and its database access
018ba3e improved error handling for database errors due to concurrency
8bb5556 small fix to the citation level handling
d237c48 the validReff API now sends only the directly relevant citations (without nesting)
5147860 Merge branch 'master' into develop
4172267 added support for the VIVA textbook, and non-CTS corpora in general
4aafba5 Less verbose logging to keep the logfile smaller
1ef726e quick fix for the file API
c7d6ba4 (origin/feature/graphannis-exercise) Merge branch 'master' into develop
174a1af fixed logging
8828c8a Merge branch 'develop' of into develop
1782b1f corpus storage manager now runs in an external process so it can always serve the same instance
6c6dec1 Set venv environment for VS Code
175bec4 Set venv environment for VS Code
47c800b replaced ujson package with rapidjson
70fae52 improved speed of JSON serialization during exercise generation
770e117 Merge branch 'master' into develop
7e62225 fixed recursion error in the exercise API
207a0bf Remove the app.db
e126770 improved file handling, is now cross-platform
4a7f096 changed gunicorn async workers from aiohttp to eventlet
815b9fa changed gunicorn config to use async workers
6c9457f trying to fix intermittent CORS issues
9e74d09 fixed request args for file api
89f6780 fixed requirements
50131b6 fixed PDF export in file API
9b48359 fixed gitignore for files
b1f6a22 Revert "added PDF export to the file API"
98ce220 added PDF export to the file API
7e9ea0a improved temp file handling and added general feedback parameter exercise generation
8e1da84 fixed CORS issues, improved XML for exercise generation (more customization possible)
2dc4abb fixed CORS for file API
63a66d6 modified exercise API to contain url for file download, added Exercise table in the database, added file download API
f1b40ec reinforced CORS for post requests to the exercise API
6c231ff fixed ordering links in the exercise API
c1494c7 tidy up unit tests
99fa4de the application now always uses the same instance of the graph store object, with a customizable location on disk
3afe1b6 fixed request parsing and added unit tests for exercise API
4117c7f Merge branch 'feature/graphannis-exercise'
d4a7586 added annotation service
40a8ed4 exercise API now looks for raw text in the body
251be8c update
c6779f7 reformatted blank lines
02ef657 requirements now include conllu and networkx
9715b9f updated readme
eca859f Merge branch 'feature/graphannis-exercise' of into feature/graphannis-exercise
8ea5597 Merge branch 'master' of
78664ca updated git clone url
22bec95 remember solution
07f5e76 add also the dependency edges as pointing relations
79220ce Use "title" instead of textname
d8f8e2b Replace the token value and remove the lemma for matches
e7c0f5e Return the subgraph as node link JSON data
7518e2d Map all fields, not only pos and lemma
9b7a8ca Clean up current state by putting some functionality into extra functions
098bf5a Map tokens with their lemma and pos
40bd75c Parse CONLL with
bc8959c Begin to parse CONLL
70909ca Nicer debugger params
156759b Add VS Code debug config
79bd944 Add some example code
7477937 Ignore app.db and .env file
437325b fixed new raw text API
bacb56e gunicorn now has autoreload on source code change
7628b5c added raw text retrieval API
5fd6e36 updated readme & config
9afc9df updated readme
9d78b1d updated readme
b2455c9 added mock data for exercise generation
dca331d updated config handling
ad8b2a2 fixed log file pointer
2c261d8 added new logging mechanism for gunicorn
cec641c removed print statement from production context
0bcd268 relocated database initialization so it only happens when the main script is called directly
987da6c added columns for citation levels
e4dfe64 added migration script
ca937ca added citation level foreign keys in corpus table
c4458de added citationlevel table
d9afc39 added code documentation
d21a1bb increased gunicorn worker count
14da52a added logging to gunicorn
abb17a6 added gunicorn config
df78de8 updated instructions and corpus URN
a129d74 added some debug info
6cd0741 moved host IP handling to .env
7feb49c fixed host name handling
770d1cb moved host name to .env
ad9e612 updated git URL
7baee25 added timestamp to logging
b8e6869 fixed error logging
719d6f9 added blinker dependency
7333350 added exception handler
89fdb4a added logging
73e64ef changed API response to "real" JSON wrapped in a Flask Response
afe070d now allowing CORS
9a28f6d initial commit with updated readme
98f8826 update
8f76619 Merge branch 'master' of
9761ec3 update
1ee1f9b update
4a8bad0 update
eac85ec update
a16b70a update
b68889f update
0f33d9b update
f1bd5ec update
f22d2eb Merge branch 'master' of
37a8545 update
ae1f9ad .env deleted online with Bitbucket
2dd17f6 update
38fbeb3 update
6cc7a5c update
2821544 env
6fbbc1e rewrite
1a169c5 update
60696ed update
922102e upd
d1d37bb Update
33e3c66 Add new file
# Do not use an Alpine-based image: it is too cumbersome to install all the components needed to build the C/C++
# dependencies listed in requirements.txt.
FROM python:3.8
RUN useradd -ms /bin/bash mc
WORKDIR /home/mc
RUN python -m venv venv
# Update and install in a single layer so that a cached "apt-get update" layer is never reused with a changed
# package list.
RUN apt-get update && apt-get install -y openssh-server nano
# Copy only the requirements first so the dependency layer stays cached while the source code changes.
COPY mcserver/requirements.txt requirements.txt
RUN venv/bin/pip install --no-cache-dir --default-timeout=120 -r requirements.txt
# Prepare SSH access to the container.
RUN mkdir /var/run/sshd
# NOTE(review): root login with the fixed password "root" is enabled here; make sure the SSH port is never
# exposed outside of a local debugging setup.
RUN echo 'root:root' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
# SSH login fix: make pam_loginuid optional, otherwise the user is kicked off right after logging in.
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
ENV NOTVISIBLE "in users profile"
RUN echo "export VISIBLE=now" >> /etc/profile
RUN /usr/bin/ssh-keygen -A
CMD ["/usr/sbin/sshd", "-D"]
COPY . mc_backend
WORKDIR /home/mc/mc_backend
Copyright (c) 2018 The Python Packaging Authority
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
\ No newline at end of file
[![pipeline status](](
[![coverage report](](
# Installation instructions
## Docker
1. Install Docker and Docker-Compose.
2. Clone the repo:
`git clone`
3. Move to the newly created folder:
`cd mc_backend`
4. Run `docker-compose build`
5. Run `docker-compose up -d`
## No Docker
1. Set up a PostgreSQL database manually. If necessary, adjust the URI in your .env file.
2. Run `python` and `python` as separate processes.
## Endpoints
The default starting point for the API will be at http://localhost:5000/mc/api/v1.0/corpora .
# Configuration
For general configuration, use the file "mcserver/config.py".
To customize sensitive parts of your configuration, create a ".env" file in the directory "mcserver". A basic .env file may look like this:
\# Change this to "production" for public use
\# for Windows, use instead
# Dependencies
To update outdated dependencies, find the relevant ones by running: `pip list -o`
Then, for each of the listed dependencies, run: `pip install -U <DEPENDENCY_NAME>`
Or combine both commands in one line: `pip list -o --format=freeze | grep -v '^\-e' | cut -d = -f 1 | xargs -n1 pip install -U`
# Database
To autogenerate a new migration script, start the Docker container with the database and run: `flask db migrate`.
To migrate the database to a newer version manually, run: `flask db upgrade`
To migrate the database to an older version manually, run: `flask db downgrade`
If it does nothing or fails, make sure that the environment variable FLASK_APP is set correctly.
# Debugging
## Access to the Docker container
Use `docker-compose down` to stop and remove the currently running containers.
Use `ssh root@localhost -p 8022 -o "UserKnownHostsFile /dev/null"` to connect to the container via SSH. Password is "root".
Alternatively, get the container ID via `docker ps` and connect via `docker exec -it CONTAINER_ID bash`. Or, for root access, use: `docker exec -u 0 -it CONTAINER_ID bash`
To snapshot a running container, use `docker commit CONTAINER_ID`. It returns a snapshot ID, which you can access via `docker run -it SNAPSHOT_ID`.
# Testing
To check the coverage of the current tests, run
`coverage run --rcfile=.coveragerc && coverage combine && coverage report -m`.
# Documentation
## Changelog
To update the changelog, use: `git log --oneline --decorate --color > CHANGELOG`
from flask import Flask
from mcserver import get_app, get_cfg
# Module-level application object; it is created as soon as this module is imported.
app: Flask = get_app()

if __name__ == "__main__":
    # NOTE(review): in the reviewed copy this statement was truncated to ", port=...", which is a syntax error.
    # The call target and the host argument are reconstructed on the assumption that the configuration exposes
    # HOST_IP next to HOST_PORT - confirm against the original file.
    app.run(host=get_cfg().HOST_IP, port=get_cfg().HOST_PORT, use_reloader=False)
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~
"""The main application: Machina Callida.
It is a server-side backend for retrieving Latin texts and
generating language exercises for them."""
import sys
from typing import Type
from flask import Flask
from import create_csm_app
from mcserver.config import Config, ProductionConfig, DevelopmentConfig, TestingConfig
def get_app() -> Flask:
    """Create the Corpus Storage Manager Flask app for the currently selected configuration."""
    active_cfg = get_cfg()
    return create_csm_app(active_cfg)
def get_cfg() -> Type[Config]:
    """Pick the configuration class for this run.

    Production wins whenever Config.IS_PRODUCTION is set. Otherwise the testing configuration is chosen if the
    first command line argument equals Config.TEST_FLAG, and the development configuration in all other cases.
    """
    if Config.IS_PRODUCTION:
        return ProductionConfig
    cli_args = sys.argv
    if len(cli_args) > 1 and cli_args[1] == Config.TEST_FLAG:
        return TestingConfig
    return DevelopmentConfig
def run_app() -> None:
    """Start the app on the configured host IP and Corpus Storage Manager port, with the reloader switched off."""
    active_cfg: Type[Config] = get_cfg()
    csm_app = get_app()
    csm_app.run(
        host=active_cfg.HOST_IP,
        port=active_cfg.CORPUS_STORAGE_MANAGER_PORT,
        use_reloader=False,
    )
if __name__ == "__main__":
    from csm import get_app, get_cfg

    # reloader has to be disabled because of a bug with Flask and multiprocessing
    csm_app = get_app()
    launch_cfg = get_cfg()
    csm_app.run(host=launch_cfg.HOST_IP, port=launch_cfg.HOST_PORT, use_reloader=False)
from typing import Type
from flask import Flask
from graphannis.cs import CorpusStorageManager
from mcserver import Config
from import init_app_common, init_logging
def create_csm_app(cfg: Type[Config] = Config):
"""Creates a new Flask app that represents a Corpus Storage Manager."""
# Build the app through the shared initializer, flagged as the CSM variant via is_csm=True.
app_csm: Flask = init_app_common(cfg=cfg, is_csm=True)
# NOTE(review): the module path of this import is missing in this copy of the file, and `bp` is not used
# anywhere below - presumably a blueprint registration line was lost as well; restore both from the repository.
from import bp
# Logging is always set up with Config.LOG_PATH_CSM, independent of the cfg class passed in.
init_logging(app_csm, Config.LOG_PATH_CSM)
return app_csm
"""The API blueprint. Register it on the main application to enable the REST API for text retrieval."""
from flask import Blueprint
from flask_restful import Api
from mcserver import Config
# Blueprint named "api" and the Flask-RESTful Api object that wraps it; the resources below are attached to `api`.
bp = Blueprint("api", __name__)
api = Api(bp)
# NOTE(review): the module paths of the following imports are missing in this copy of the file. The imports are
# placed after `bp`/`api` are created - presumably to avoid a circular import; confirm before reordering.
from import AnnisFindAPI
from import CorpusStorageManagerAPI
from import FrequencyAPI
from import SubgraphAPI
from import TextComplexityAPI
# Attach each resource class to its URI from Config under an explicit endpoint name.
api.add_resource(AnnisFindAPI, Config.SERVER_URI_ANNIS_FIND, endpoint="find")
api.add_resource(CorpusStorageManagerAPI, Config.SERVER_URI_CSM, endpoint="csm")
api.add_resource(FrequencyAPI, Config.SERVER_URI_FREQUENCY, endpoint="frequency")
api.add_resource(SubgraphAPI, Config.SERVER_URI_CSM_SUBGRAPH, endpoint="subgraph")
api.add_resource(TextComplexityAPI, Config.SERVER_URI_TEXT_COMPLEXITY, endpoint='textcomplexity')
import flask
from flask_restful import Resource, reqparse
from import NetworkService, CorpusService, AnnotationService
class AnnisFindAPI(Resource):
    """REST resource that returns ANNIS matches for an AQL query on a text identified by a CTS URN."""

    def __init__(self):
        """Create the request parser and declare the "aql" and "urn" form arguments on it."""
        self.reqparse = parser = reqparse.RequestParser()
        parser.add_argument("aql", type=str, required=True, location="form", help="No AQL provided")
        parser.add_argument("urn", type=str, required=True, default="", location="form", help="No URN provided")
        super().__init__()

    def get(self):
        """Returns matches from ANNIS for a given CTS URN and AQL."""
        # both values are read directly from the query string of the request
        query: dict = flask.request.args
        matches = CorpusService.find_matches(query["urn"], query["aql"], is_csm=True)
        return NetworkService.make_json_response(matches)
import json
from json import JSONDecodeError
from typing import Dict, List
import flask
from conllu import TokenList
from flask_restful import Resource, reqparse, abort
from import ExerciseType, Phenomenon, AnnisResponse
from import CorpusService, NetworkService
class CorpusStorageManagerAPI(Resource):
"""Represents an API for the Corpus Storage Manager.
It manages the database and everything corpus-related."""
def __init__(self):
self.reqparse = reqparse.RequestParser()
self.reqparse.add_argument("title", type=str, required=True, location="data", help="No title provided")
self.reqparse.add_argument("annotations", required=True, location="data",
help="No annotations provided")
self.reqparse.add_argument("aqls", required=True, location="data", help="No AQLs provided",
self.reqparse.add_argument("exercise_type", type=str, required=True, location="data",
help="No exercise type provided")
self.reqparse.add_argument("search_phenomena", type=str, required=False, location="data",
help="No search phenomena provided")
self.reqparse.add_argument("urn", type=str, required=False, help="No text identifier provided")
super(CorpusStorageManagerAPI, self).__init__()
def get(self):
""" Returns graph data for a given CTS URN. """
# get request arguments
args: Dict = flask.request.args
cts_urn: str = args["urn"]
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=cts_urn, is_csm=True)
if not ar.nodes:
return NetworkService.make_json_response(ar.__dict__)
def post(self):
"""Given the relevant corpus data, gives back search results as graph data."""
args: dict = {}
args = json.loads("utf-8"))
except JSONDecodeError:
title: str = args["title"]
annotations_or_urn: str = args["annotations"]
aqls: List[str] = args["aqls"]
exercise_type: ExerciseType = ExerciseType[args["exercise_type"]]