Commit 2840244d authored by Konstantin Schulz

Corpus Storage Manager and MCserver are now merged into a single Flask application

parent ba94ae42
Pipeline #15837 failed with stages
in 2 minutes and 8 seconds
docker-compose build
docker-compose run --rm --entrypoint="npm run test-ci" mc_frontend >ci_frontend.log
docker-compose run --rm --entrypoint="npm run test-ci" mc_frontend > ci_frontend.log
docker-compose run --rm --entrypoint="./coverage_backend.sh" mcserver > ci_backend.log
./coverage_ci.sh
cat coverage.log
version: '3.7'
services:
csm:
depends_on:
- mcserver
version: '3.7'
services:
mcserver:
depends_on:
- csm
version: '3.7'
services:
csm:
build:
context: ./mc_backend
dockerfile: Dockerfile
command: /home/mc/venv/bin/gunicorn -c csm/gunicorn_config.py run_csm:app
depends_on:
- db
environment:
- FLASK_APP=run_csm.py
- IS_THIS_A_DOCKER_CONTAINER=Yes
- PYTHONPATH=/home/mc
ports:
- "6555:6555"
restart: always
stdin_open: true
db:
image: postgres
environment:
......
# Installing the backend via command line
1. Set up a PostgreSQL database manually (https://www.postgresql.org/download/). If necessary, adjust the URI in your .env file located at `mcserver/.env`.
2. Run `pip install -r requirements.txt`.
3. Run `python app.py` and `python run_csm.py` as separate processes.
3. Run `python app.py`.
## Endpoints
The default starting point for the API will be at http://localhost:5000/mc/api/v1.0/corpora .
......
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~
*.pyc
*.log*
*.env
*.db
*.db-journal
*.coverage
/env
"""The main application: Machina Callida.
It is a server-side backend for retrieving Latin texts and
generating language exercises for them."""
import sys
from typing import Type
from flask import Flask
from csm.app import create_csm_app
from mcserver.config import Config, ProductionConfig, DevelopmentConfig, TestingConfig
def get_app() -> Flask:
    """Build the Corpus Storage Manager Flask app for the configuration chosen by get_cfg()."""
    selected_cfg: Type[Config] = get_cfg()
    return create_csm_app(selected_cfg)
def get_cfg() -> Type[Config]:
    """Select the configuration class for this process.

    ProductionConfig is returned whenever Config.IS_PRODUCTION is truthy.
    Otherwise TestingConfig is returned if the first command line argument
    equals Config.TEST_FLAG, and DevelopmentConfig in every other case.
    """
    if Config.IS_PRODUCTION:
        return ProductionConfig
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == Config.TEST_FLAG:
        return TestingConfig
    return DevelopmentConfig
def run_app() -> None:
    """Start the Corpus Storage Manager app on the configured CSM host and port."""
    cfg: Type[Config] = get_cfg()
    csm_app: Flask = get_app()
    # use_reloader=False: the reloader is switched off on purpose (see the remark at the script entry point)
    csm_app.run(host=cfg.HOST_IP_CSM, port=cfg.CORPUS_STORAGE_MANAGER_PORT, use_reloader=False)
if __name__ == "__main__":
    # Script entry point. run_app() starts the server with use_reloader=False:
    # the reloader has to be disabled because of a bug with Flask and multiprocessing.
    run_app()
from csm import get_app, get_cfg
# Start the Corpus Storage Manager app with Flask's built-in server; the reloader is switched off.
# NOTE(review): this binds to HOST_PORT, whereas the run_app() helper defined alongside get_app()
# binds to CORPUS_STORAGE_MANAGER_PORT — confirm which port is intended here.
get_app().run(host=get_cfg().HOST_IP_CSM, port=get_cfg().HOST_PORT, use_reloader=False)
from typing import Type
from flask import Flask
from graphannis.cs import CorpusStorageManager
from mcserver import Config
from mcserver.app import init_app_common, init_logging
def create_csm_app(cfg: Type[Config] = Config) -> Flask:
    """Creates a new Flask app that represents a Corpus Storage Manager.

    Arguments:
    cfg -- the configuration class handed to the common app initialization

    Returns the Flask app with the CSM API blueprint registered and logging set up.
    """
    # the storage manager instance is stored on the shared Config class, not on cfg
    graph_dir = Config.GRAPH_DATABASE_DIR
    Config.CORPUS_STORAGE_MANAGER = CorpusStorageManager(graph_dir)
    flask_app: Flask = init_app_common(cfg=cfg, is_csm=True)
    # imported only now, after the app exists (presumably to avoid a circular import — TODO confirm)
    from csm.app.api import bp as api_blueprint
    flask_app.register_blueprint(api_blueprint)
    init_logging(flask_app, Config.LOG_PATH_CSM)
    return flask_app
"""The API blueprint. Register it on the main application to enable the REST API for text retrieval."""
from flask import Blueprint
from flask_restful import Api
from mcserver import Config
# Blueprint that carries the flask-restful resources of the Corpus Storage Manager.
bp = Blueprint("api", __name__)
api = Api(bp)
# The imports below deliberately come after bp/api have been created
# (presumably because the imported modules depend on this package — TODO confirm).
# frequencyAPI and textcomplexityAPI are plain modules; their get/post functions are
# referenced by operationId in the CSM OpenAPI specification rather than registered here.
from . import frequencyAPI, textcomplexityAPI
from csm.app.api.annisFindAPI import AnnisFindAPI
from csm.app.api.corpusStorageManagerAPI import CorpusStorageManagerAPI
from csm.app.api.subgraphAPI import SubgraphAPI
# Bind each Resource class to its URI from Config; the endpoint names must be unique per blueprint.
api.add_resource(AnnisFindAPI, Config.SERVER_URI_ANNIS_FIND, endpoint="find")
api.add_resource(CorpusStorageManagerAPI, Config.SERVER_URI_CSM, endpoint="csm")
api.add_resource(SubgraphAPI, Config.SERVER_URI_CSM_SUBGRAPH, endpoint="subgraph")
import flask
from flask_restful import Resource
from flask_restful.reqparse import RequestParser
from mcserver.app.services import NetworkService, CorpusService
class AnnisFindAPI(Resource):
    """REST resource that looks up ANNIS matches for a text passage and an AQL query."""

    def __init__(self):
        """Prepare the request parser with the "aql" and "urn" arguments."""
        # NOTE(review): get() reads flask.request.args directly and never consults this parser.
        parser: RequestParser = NetworkService.base_request_parser.copy()
        parser.add_argument("aql", type=str, required=True, location="form", help="No AQL provided")
        parser.add_argument("urn", type=str, required=True, default="", location="form", help="No URN provided")
        self.reqparse: RequestParser = parser
        super().__init__()

    def get(self):
        """ Returns matches from ANNIS for a given CTS URN and AQL. """
        # both values come from the query string of the request
        query: dict = flask.request.args
        matches = CorpusService.find_matches(query["urn"], query["aql"], is_csm=True)
        return NetworkService.make_json_response(matches)
import json
from json import JSONDecodeError
from typing import Dict, List
import flask
from conllu import TokenList
from flask_restful import Resource, abort
from flask_restful.reqparse import RequestParser
from mcserver.app.models import ExerciseType, Phenomenon, AnnisResponse
from mcserver.app.services import CorpusService, NetworkService
class CorpusStorageManagerAPI(Resource):
    """Represents an API for the Corpus Storage Manager.
    It manages the database and everything corpus-related."""

    def __init__(self):
        """Set up the request parser describing the arguments this resource knows about."""
        self.reqparse: RequestParser = NetworkService.base_request_parser.copy()
        self.reqparse.add_argument("title", type=str, required=True, location="data", help="No title provided")
        self.reqparse.add_argument("annotations", required=True, location="data",
                                   help="No annotations provided")
        self.reqparse.add_argument("aqls", required=True, location="data", help="No AQLs provided",
                                   action="append")
        self.reqparse.add_argument("exercise_type", type=str, required=True, location="data",
                                   help="No exercise type provided")
        self.reqparse.add_argument("search_phenomena", type=str, required=False, location="data",
                                   help="No search phenomena provided")
        self.reqparse.add_argument("urn", type=str, required=False, help="No text identifier provided")
        super(CorpusStorageManagerAPI, self).__init__()

    def get(self):
        """ Returns graph data for a given CTS URN.

        Responds with 404 if the retrieved graph contains no nodes."""
        # get request arguments
        args: Dict = flask.request.args
        cts_urn: str = args["urn"]
        ar: AnnisResponse = CorpusService.get_corpus(cts_urn=cts_urn, is_csm=True)
        if not ar.graph_data.nodes:
            abort(404)
        return NetworkService.make_json_response(ar.to_dict())

    def post(self):
        """Given the relevant corpus data, gives back search results as graph data.

        The request body has to be a JSON object with the keys "title", "annotations",
        "aqls" and "exercise_type"; "search_phenomena" is optional and defaults to an
        empty list. Malformed requests are answered with 400 instead of an unhandled error."""
        args: dict = {}
        try:
            args = json.loads(flask.request.data.decode("utf-8"))
        except (JSONDecodeError, UnicodeDecodeError):
            abort(400)
        # valid JSON that is not an object (e.g. a list or a number) cannot carry the required keys
        if not isinstance(args, dict):
            abort(400)
        try:
            title: str = args["title"]
            annotations_or_urn: str = args["annotations"]
            aqls: List[str] = args["aqls"]
            # an unknown or unhashable exercise type name is a client error as well
            exercise_type: ExerciseType = ExerciseType[args["exercise_type"]]
        except (KeyError, TypeError):
            abort(400)
        try:
            search_phenomena: List[Phenomenon] = [getattr(Phenomenon(), x.upper()) for x in
                                                  args.get("search_phenomena") or []]
        except (AttributeError, TypeError):
            # unknown phenomenon name or a value that is not a list of strings
            abort(400)
        conll: List[TokenList] = CorpusService.get_annotations_from_string(annotations_or_urn)
        ret_val: dict = CorpusService.process_corpus_data(title, conll, aqls, exercise_type, search_phenomena)
        # serialize the results to json
        return NetworkService.make_json_response(ret_val)
from typing import List, Dict, Set
from mcserver.app.models import Phenomenon, FrequencyItem
from mcserver.app.services import NetworkService, CorpusService, AnnotationService
def get(urn: str):
    """ Returns the results of a frequency query from ANNIS for a given CTS URN.

    Values of all phenomena except lemma and dependency are translated from the
    abbreviations found by ANNIS to the names used in our own model; values
    without a known translation become None."""
    items: List[FrequencyItem] = CorpusService.get_frequency_analysis(urn, is_csm=True)
    # these phenomena keep the values exactly as ANNIS reported them
    untranslated: Set[Phenomenon] = {Phenomenon.LEMMA, Phenomenon.DEPENDENCY}
    for item in items:
        for idx, raw_value in enumerate(item.values):
            phenomenon = item.phenomena[idx]
            if phenomenon in untranslated:
                continue
            mapping: Dict[str, List[str]] = AnnotationService.phenomenon_map[phenomenon]
            # first model value whose abbreviation list contains the raw ANNIS value
            item.values[idx] = next(
                (name for name, abbreviations in mapping.items() if raw_value in abbreviations), None)
    return NetworkService.make_json_response([item.to_dict() for item in items])
import json
from typing import Dict, List
import flask
from flask_restful import Resource
from flask_restful.reqparse import RequestParser
from mcserver.app.models import ExerciseData, GraphData, Solution, AnnisResponse, make_solution_element_from_salt_id
from mcserver.app.services import CorpusService, AnnotationService, NetworkService
class SubgraphAPI(Resource):
    """REST resource that returns subgraphs (matches plus surrounding context) for a text passage."""

    def __init__(self):
        """Set up the request parser describing the arguments this resource knows about."""
        self.reqparse: RequestParser = NetworkService.base_request_parser.copy()
        self.reqparse.add_argument("aqls", required=False, location="data", help="No AQLs provided", action="append")
        self.reqparse.add_argument("ctx_left", type=str, required=False, default="", location="data",
                                   help="No left context provided")
        self.reqparse.add_argument("ctx_right", type=str, required=False, default="", location="data",
                                   help="No right context provided")
        self.reqparse.add_argument("node_ids", type=str, required=False, location="data", help="No node IDs provided")
        self.reqparse.add_argument("urn", type=str, required=False, default="", location="data", help="No URN provided")
        super(SubgraphAPI, self).__init__()

    def get(self):
        """ Returns subgraph data for a given CTS URN and AQL.

        Reads "aqls", "urn", "ctx_left" and "ctx_right" from the query string.
        Context sizes that are not integers are answered with 400."""
        args: Dict = flask.request.args
        aql: str = str(args['aqls'])
        urn: str = args["urn"]
        try:
            ctx_left: int = int(args["ctx_left"])
            ctx_right: int = int(args["ctx_right"])
        except ValueError:
            # a non-numeric context size is a client error, not a server failure
            flask.abort(400)
        ar: AnnisResponse = CorpusService.get_subgraph(urn, aql, ctx_left, ctx_right, is_csm=True)
        return NetworkService.make_json_response(ar.to_dict())

    def post(self):
        """ Returns one serialized exercise data item per match for a given CTS URN and list of AQLs.

        The request body has to be a JSON object with the keys "urn", "aqls", "ctx_left"
        and "ctx_right". Malformed requests are answered with 400, in line with how
        CorpusStorageManagerAPI.post treats an undecodable body."""
        # get request arguments
        try:
            args: Dict = json.loads(flask.request.data.decode("utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            flask.abort(400)
        if not isinstance(args, dict):
            flask.abort(400)
        try:
            cts_urn: str = args["urn"]
            aqls: List[str] = args["aqls"]
            ctx_left: int = int(args["ctx_left"])
            ctx_right: int = int(args["ctx_right"])
        except (KeyError, TypeError, ValueError):
            # missing field or a context size that cannot be read as an integer
            flask.abort(400)
        disk_urn: str = AnnotationService.get_disk_urn(cts_urn)
        exercise_data_list: List[ExerciseData] = []
        for aql in aqls:
            node_ids: List[str] = CorpusService.find_matches(cts_urn, aql, is_csm=True)
            for node_id in node_ids:
                # every match gets its own subgraph, with the match itself as the only solution
                gd: GraphData = AnnotationService.get_single_subgraph(
                    disk_urn, [node_id], ctx_left, ctx_right, is_csm=True)
                exercise_data_list.append(ExerciseData(
                    graph=gd, uri="", solutions=[Solution(target=make_solution_element_from_salt_id(node_id))]))
        ret_val: List[dict] = [x.serialize() for x in exercise_data_list]
        return NetworkService.make_json_response(ret_val)
import rapidjson as json
from mcserver.app.models import AnnisResponse, TextComplexity
from mcserver.app.services import NetworkService, CorpusService, TextComplexityService
from openapi.openapi_server.models import TextComplexityForm
def post(complexity_data: dict):
    """Returns text complexity measures for the text described by the given form data.

    If the form already carries a serialized ANNIS response, that one is used;
    otherwise the corpus is retrieved for the URN given in the form."""
    form: TextComplexityForm = TextComplexityForm.from_dict(complexity_data)
    if form.annis_response:
        annis_response: AnnisResponse = AnnisResponse.from_dict(json.loads(form.annis_response))
    else:
        annis_response = CorpusService.get_corpus(form.urn, is_csm=True)
    complexity: TextComplexity = TextComplexityService.text_complexity(
        form.measure, form.urn, True, annis_response.graph_data)
    return NetworkService.make_json_response(complexity.to_dict())
openapi: "3.0.0"
info:
title: Machina Callida Backend REST API (Corpus Storage Manager)
version: "1.0"
servers:
- url: http://localhost:6555/mc/api/v1.0
paths:
/frequency:
get:
summary: Returns results for a frequency query from ANNIS for a given CTS URN.
operationId: csm.app.api.frequencyAPI.get
responses:
200:
description: Frequency analysis, i.e. a list of frequency items.
content:
application/json:
schema:
type: array
description: List of items with frequency data for linguistic phenomena.
items:
$ref: "../openapi_models.yaml#/components/schemas/FrequencyItem"
parameters:
- $ref: '../openapi_models.yaml#/components/parameters/UrnParam'
/textcomplexity:
post:
summary: Gives users measures of text complexity for a given text.
operationId: csm.app.api.textcomplexityAPI.post
responses:
200:
description: Text complexity measures for a given text.
content:
application/json:
schema:
$ref: '../openapi_models.yaml#/components/schemas/TextComplexity'
requestBody:
required: true
content:
application/x-www-form-urlencoded:
schema:
$ref: '../openapi_models.yaml#/components/schemas/TextComplexityForm'
"""Configuration for the gunicorn server"""
from mcserver import Config
# Address gunicorn listens on: the CSM host IP and port taken from the shared Config.
bind = "{0}:{1}".format(Config.HOST_IP_CSM, Config.CORPUS_STORAGE_MANAGER_PORT)
# NOTE(review): `debug` may no longer be a recognized gunicorn setting — confirm it still has an effect.
debug = False
# NOTE(review): `reload` restarts workers when code changes; presumably meant for development — confirm for production.
reload = True
# Worker timeout in seconds (one hour); presumably generous because corpus operations can run long — TODO confirm.
timeout = 3600
# A single worker process.
workers = 1
......@@ -7,7 +7,6 @@ from threading import Thread
from time import strftime
from typing import Type
import connexion
import flask
import open_alchemy
import prance
from connexion import FlaskApp
......@@ -15,6 +14,7 @@ from flask import Flask, got_request_exception, request, Response, send_from_dir
from flask_cors import CORS
from flask_migrate import Migrate
from flask_sqlalchemy import SQLAlchemy
from graphannis.cs import CorpusStorageManager
from open_alchemy import init_yaml
from mcserver.config import Config
......@@ -47,6 +47,7 @@ def create_app(cfg: Type[Config] = Config) -> Flask:
# use local postgres database for migrations
if len(sys.argv) > 2 and sys.argv[2] == Config.FLASK_MIGRATE:
cfg.SQLALCHEMY_DATABASE_URI = Config.DATABASE_URL_LOCAL
Config.CORPUS_STORAGE_MANAGER = CorpusStorageManager(Config.GRAPH_DATABASE_DIR)
app: Flask = init_app_common(cfg=cfg)
from mcserver.app.services import bp as services_bp
app.register_blueprint(services_bp)
......@@ -68,19 +69,17 @@ def full_init(app: Flask, cfg: Type[Config] = Config) -> None:
from mcserver.app.services.corpusService import CorpusService
CorpusService.init_corpora()
from mcserver.app.services import ExerciseService
ExerciseService.update_exercises(is_csm=True)
ExerciseService.update_exercises()
if not cfg.TESTING:
CorpusService.init_graphannis_logging()
start_updater(app)
def init_app_common(cfg: Type[Config] = Config, is_csm: bool = False) -> Flask:
def init_app_common(cfg: Type[Config] = Config) -> Flask:
""" Initializes common Flask parts, e.g. CORS, configuration, database, migrations and custom corpora."""
spec_dir: str = Config.CSM_DIRECTORY if is_csm else Config.MC_SERVER_DIRECTORY
connexion_app: FlaskApp = connexion.FlaskApp(
__name__, port=(cfg.CORPUS_STORAGE_MANAGER_PORT if is_csm else cfg.HOST_PORT), specification_dir=spec_dir)
spec_path: str = Config.API_SPEC_CSM_FILE_PATH if is_csm else Config.API_SPEC_MCSERVER_FILE_PATH
parser = prance.ResolvingParser(spec_path, lazy=True, strict=False) # str(Path(spec_path).absolute())
__name__, port=cfg.HOST_PORT, specification_dir=Config.MC_SERVER_DIRECTORY)
parser = prance.ResolvingParser(Config.API_SPEC_MCSERVER_FILE_PATH, lazy=True, strict=False)
parser.parse()
connexion_app.add_api(parser.specification)
apply_event_handlers(connexion_app)
......@@ -91,15 +90,13 @@ def init_app_common(cfg: Type[Config] = Config, is_csm: bool = False) -> Flask:
app.app_context().push()
db.init_app(app)
migrate.init_app(app, db)
if is_csm or cfg.TESTING:
db.create_all()
if is_csm:
from mcserver.app.services.databaseService import DatabaseService
DatabaseService.init_db_alembic()
db.create_all()
from mcserver.app.services.databaseService import DatabaseService
DatabaseService.init_db_alembic()
from mcserver.app.services.textService import TextService
TextService.init_proper_nouns_list()
TextService.init_stop_words_latin()
if is_csm:
if not Config.TESTING:
full_init(app, cfg)
return app
......@@ -118,7 +115,7 @@ def init_logging(app: Flask, log_file_path: str):
app.logger.warning(f"Accessing database at: {database_uri}")
def log_exception(sender_app: Flask, exception, **extra):
def log_exception(sender_app: Flask, exception: Exception, **extra):
"""Logs errors that occur while the Flask app is working.
Arguments:
......@@ -126,7 +123,7 @@ def log_exception(sender_app: Flask, exception, **extra):
exception -- the exception to be logged
**extra -- any additional arguments
"""
sender_app.logger.info(f"ERROR for {flask.request.url}")
sender_app.logger.info(f"ERROR for {request.url}")
def start_updater(app: Flask) -> Thread:
......
......@@ -4,6 +4,7 @@ import connexion
import rapidjson as json
from typing import List, Dict, Union
import requests
from conllu import TokenList
from connexion.lifecycle import ConnexionResponse
from flask import Response
from mcserver.app import db
......@@ -33,7 +34,7 @@ def get(eid: str) -> Union[Response, ConnexionResponse]:
exercise: TExercise = DatabaseService.query(Exercise, filter_by=dict(eid=eid), first=True)
if not exercise:
return connexion.problem(404, Config.ERROR_TITLE_NOT_FOUND, Config.ERROR_MESSAGE_EXERCISE_NOT_FOUND)
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=exercise.urn, is_csm=False)
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=exercise.urn)
if not ar.graph_data.nodes:
return connexion.problem(404, Config.ERROR_TITLE_NOT_FOUND, Config.ERROR_MESSAGE_CORPUS_NOT_FOUND)
exercise.last_access_time = datetime.utcnow().timestamp()
......@@ -47,17 +48,10 @@ def get(eid: str) -> Union[Response, ConnexionResponse]:
def get_graph_data(title: str, conll_string_or_urn: str, aqls: List[str], exercise_type: ExerciseType,
search_phenomena: List[Phenomenon]):
search_phenomena: List[Phenomenon]) -> dict:
"""Sends annotated text data or a URN to the Corpus Storage Manager in order to get a graph."""
url: str = f"{Config.INTERNET_PROTOCOL}{Config.HOST_IP_CSM}:{Config.CORPUS_STORAGE_MANAGER_PORT}"
data: str = json.dumps(
dict(title=title, annotations=conll_string_or_urn, aqls=aqls, exercise_type=exercise_type.name,
search_phenomena=search_phenomena))
response: requests.Response = requests.post(url, data=data)
try:
return json.loads(response.text)
except ValueError:
raise
conll: List[TokenList] = CorpusService.get_annotations_from_string(conll_string_or_urn)
return CorpusService.process_corpus_data(title, conll, aqls, exercise_type, search_phenomena)
def make_new_exercise(conll: str, correct_feedback: str, exercise_type: str, general_feedback: str,
......@@ -97,7 +91,7 @@ def map_exercise_data_to_database(exercise_data: ExerciseData, exercise_type: st
solutions: List[Solution] = adjust_solutions(exercise_data=exercise_data, solutions=solutions,
exercise_type=exercise_type)
quiz_solutions: str = json.dumps([x.to_dict() for x in solutions])
tc: TextComplexity = TextComplexityService.text_complexity(TextComplexityMeasure.all.name, urn, False,
tc: TextComplexity = TextComplexityService.text_complexity(TextComplexityMeasure.all.name, urn,
exercise_data.graph)
new_exercise: Exercise = ExerciseMC.from_dict(
conll=conll, correct_feedback=correct_feedback, eid=xml_guid, exercise_type=exercise_type,
......@@ -128,7 +122,7 @@ def post(exercise_data: dict) -> Union[Response, ConnexionResponse]:
search_values_list]
# if there is custom text instead of a URN, immediately annotate it
conll_string_or_urn: str = ef.urn if CorpusService.is_urn(ef.urn) else AnnotationService.get_udpipe(
CorpusService.get_raw_text(ef.urn, False))
CorpusService.get_raw_text(ef.urn))
try:
# construct graph from CONLL data
response: dict = get_graph_data(title=ef.urn, conll_string_or_urn=conll_string_or_urn, aqls=aqls,
......
import requests
import rapidjson as json
from mcserver import Config
from mcserver.app.services import NetworkService
from typing import List, Set, Dict
from mcserver.app.services import NetworkService, CorpusService, AnnotationService
from openapi.openapi_server.models import FrequencyItem, Phenomenon
def get(urn: str):
""" Returns results for a frequency query from ANNIS for a given CTS URN and AQL. """
url: str = f"{Config.INTERNET_PROTOCOL}{Config.HOST_IP_CSM}:{Config.CORPUS_STORAGE_MANAGER_PORT}" + \
Config.SERVER_URI_FREQUENCY
response: requests.Response = requests.get(url, params=dict(urn=urn))
return NetworkService.make_json_response(json.loads(response.text))
fa: List[FrequencyItem] = CorpusService.get_frequency_analysis(urn)
# map the abbreviated values found by ANNIS to our own model
skip_set: Set[Phenomenon] = {Phenomenon.LEMMA, Phenomenon.DEPENDENCY}
for fi in fa:
for i in range(len(fi.values)):
if fi.phenomena[i] in skip_set:
continue
value_map: Dict[str, List[str]] = AnnotationService.phenomenon_map[fi.phenomena[i]]
fi.values[i] = next((x for x in value_map if fi.values[i] in value_map[x]), None)
return NetworkService.make_json_response([x.to_dict() for x in fa])
......@@ -6,15 +6,13 @@ from collections import OrderedDict
from sys import platform
from tempfile import mkstemp
from typing import List, Dict
import requests
from bs4 import BeautifulSoup, ResultSet, Tag
from conllu import TokenList
from flask import Response
from mcserver.app.models import ExerciseType, ExerciseData, LinkMC, NodeMC
from mcserver.app.services import AnnotationService, NetworkService
from mcserver.app.models import ExerciseType, ExerciseData, LinkMC, NodeMC, make_solution_element_from_salt_id
from mcserver.app.services import AnnotationService, NetworkService, CorpusService
from mcserver.config import Config
from openapi.openapi_server.models import KwicForm
from openapi.openapi_server.models import KwicForm, GraphData, Solution
def post(kwic_data: dict) -> Response:
......@@ -23,12 +21,15 @@ def post(kwic_data: dict) -> Response:
kwic_form: KwicForm = KwicForm.from_dict(kwic_data)
search_values_list: List[str] = json.loads(kwic_form.search_values)
aqls: List[str] = AnnotationService.map_search_values_to_aql(search_values_list, ExerciseType.kwic)
url: str = f"{Config.INTERNET_PROTOCOL}{Config.HOST_IP_CSM}:{Config.CORPUS_STORAGE_MANAGER_PORT}{Config.SERVER_URI_CSM_SUBGRAPH}"
data: str = json.dumps(
dict(urn=kwic_data["urn"], aqls=aqls, ctx_left=str(kwic_form.ctx_left), ctx_right=str(kwic_form.ctx_right)))
response: requests.Response = requests.post(url, data=data)
response_content: List[dict] = json.loads(response.text)
exercise_data_list: List[ExerciseData] = [ExerciseData(json_dict=x) for x in response_content]
disk_urn: str = AnnotationService.get_disk_urn(kwic_form.urn)
exercise_data_list: List[ExerciseData] = []
for aql in aqls:
node_ids: List[str] = CorpusService.find_matches(kwic_form.urn, aql)
for node_id in node_ids:
gd: GraphData = AnnotationService.get_single_subgraph(
disk_urn, [node_id], kwic_form.ctx_left, kwic_form.ctx_right)
exercise_data_list.append(ExerciseData(
graph=gd, uri="", solutions=[Solution(target=make_solution_element_from_salt_id(node_id))]))
ret_val: str = ""
for i in range(len(exercise_data_list)):
ret_val += handle_exercise_data(exercise_data_list[i], kwic_form.ctx_left, kwic_form.ctx_right)
......
......@@ -11,9 +11,9 @@ from mcserver.app.services import CorpusService, NetworkService, TextComplexityS
def get(urn: str) -> Union[Response, ConnexionResponse]:
"""Provides the raw text for a requested text passage."""
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=urn, is_csm=False)
ar: AnnisResponse = CorpusService.get_corpus(cts_urn=urn)