|
|
LAUDATIO requires metadata for the corpus, its documents and annotations. For each component, you need to upload a TEI XML metadata file. LAUDATIO uses a TEI customization for each component, which is based on a [Metamodel for Corpus Metadata](https://doi.org/10.18452/19407). It defines which metadata needs to be recorded for each version of a corpus. The TEI customization is published on Zenodo (https://zenodo.org/record/2543455#.XVui7ntCSMo).
|
|
|
|
|
|
# TEI Customization
|
|
|
The TEI customization uses the teiHeader for realizing the metadata. For the three customizations, a basic teiHeader structure contains fileDesc, titleStmt, publicationStmt, sourceDesc, encodingDesc, revisionDesc.
|
|
|
|
|
|
## Corpus metadata
|
|
|
The TEI XML file for corpus metadata provides information about the corpus title, corpus editors, annotators and researchers involved in processing the data (infrastructure task), and project contexts. It further refers to all documents and lists annotations, including values and descriptions (guidelines), for each corpus format.
|
|
|
|
|
|
titleStmt contains:
|
|
|
|
|
|
```xml
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
|
<TEI xmlns="http://www.tei-c.org/ns/1.0">
|
|
|
<teiHeader type="CorpusHeader">
|
|
|
<fileDesc>
|
|
|
<titleStmt>
|
|
|
<title>XYZ</title>
|
|
|
<!-- add a title -->
|
|
|
<editor n="1" role="CorpusEditor"><!-- add more editors if necessary -->
|
|
|
<persName><!-- optional but recommended: add authority references such as ORCIDs as attributes,
|
|
|
add @key and @ref, e.g.: key="orcid" ref="https://orcid.org/1234-1234-1234-1234" -->
|
|
|
<forename>Jane</forename>
|
|
|
<surname>Doe</surname>
|
|
|
</persName>
|
|
|
<affiliation>
|
|
|
<orgName type="Department">Department of Linguistics</orgName>
|
|
|
<orgName type="Institution">XYZ</orgName><!-- e.g. university -->
|
|
|
</affiliation>
|
|
|
</editor>
|
|
|
<author n="1" role="Annotator"><!-- add more annotators if necessary, count in attribute @n -->
|
|
|
<persName>
|
|
|
<forename>John</forename>
|
|
|
<surname>Doe</surname>
|
|
|
</persName>
|
|
|
<affiliation>
|
|
|
<orgName type="Department">Department of History</orgName>
|
|
|
<orgName type="Institution">University</orgName><!-- e.g. university -->
|
|
|
</affiliation>
|
|
|
</author>
|
|
|
<respStmt>
|
|
|
<resp>Metadata</resp>
|
|
|
<persName><!-- add more if necessary -->
|
|
|
<forename>John</forename>
|
|
|
<surname>Doe</surname>
|
|
|
</persName>
|
|
|
<orgName type="Department">Department of History</orgName>
|
|
|
<orgName type="Institution">University</orgName>
|
|
|
<!-- e.g. university -->
|
|
|
</respStmt>
|
|
|
</titleStmt>
|
|
|
<!-- ... -->
|
|
|
</fileDesc>
|
|
|
</teiHeader>
|
|
|
</TEI>
|
|
|
|
|
|
```
|
|
|
|
|
|
fileDesc contains metadata that describes the number of tokens, the publication context, the corpus licence and a list of documents:
|
|
|
|
|
|
```xml
|
|
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
|
<TEI xmlns="http://www.tei-c.org/ns/1.0">
|
|
|
<teiHeader type="CorpusHeader">
|
|
|
<fileDesc>
|
|
|
<titleStmt/>
|
|
|
<extent type="Tokens">123456789</extent>
|
|
|
<publicationStmt>
|
|
|
<authority>Hamburg University</authority>
|
|
|
<!-- e.g. your university -->
|
|
|
<idno>xyz</idno>
|
|
|
<!-- add identifiers if available -->
|
|
|
<availability status="free">
|
|
|
<licence target="http://creativecommons.org/licenses/by/4.0/"/>
|
|
|
<!-- e.g. http://creativecommons.org/licenses/by/4.0/ -->
|
|
|
<p>The corpus is published under a CC-BY 4.0 licence.</p>
|
|
|
<!-- prose description of the licence -->
|
|
|
</availability>
|
|
|
<date type="CorpusRelease" when="2019">First complete corpus release.</date>
|
|
|
<!-- short description of the release type -->
|
|
|
</publicationStmt>
|
|
|
<sourceDesc>
|
|
|
<list type="CorpusDocument">
|
|
|
<!-- each document header contains an ID in <fileDesc xml:id="document1">, list the references here -->
|
|
|
<item corresp="Print1" n="1"/>
|
|
|
<item corresp="Print2" n="2"/>
|
|
|
</list>
|
|
|
</sourceDesc>
|
|
|
</fileDesc>
|
|
|
<!-- ... -->
|
|
|
</teiHeader>
|
|
|
</TEI>
|
|
|
```
|
|
|
|
|
|
profileDesc contains metadata about languages in the documents of the corpus:
|
|
|
|
|
|
```xml
|
|
|
<profileDesc>
|
|
|
<langUsage>
|
|
|
<language ident="de" style="Language">Early New High German</language>
|
|
|
<language ident="de" style="LanguageArea">Southern dialects</language>
|
|
|
<language ident="de" style="LanguageType">Bavarian</language>
|
|
|
</langUsage>
|
|
|
</profileDesc>
|
|
|
```
|
|
|
|
|
|
encodingDesc contains annotation keys and values, including descriptions (guidelines), for each corpus format; revisionDesc contains metadata about corpus revisions:
|
|
|
|
|
|
```xml
|
|
|
<?xml version='1.0' encoding='utf-8'?>
|
|
|
<?xml-model href="corpus.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
|
|
|
<TEI xmlns="http://www.tei-c.org/ns/1.0">
|
|
|
<!-- we use "xyz" to indicate where you can add prose text or values; replace xyz where you would like to fill in something, and remove xyz in any other case -->
|
|
|
<teiHeader type="CorpusHeader">
|
|
|
<fileDesc/>
|
|
|
<profileDesc/>
|
|
|
<encodingDesc n="1">
|
|
|
<!-- each <encodingDesc> describes the annotations that are released in one format of the corpus; add more <encodingDesc> elements if the corpus has more than one format -->
|
|
|
<appInfo>
|
|
|
<application ident="EXMARaLDA" version="3.0">
|
|
|
<label>EXMARaLDA XML for Partitur editor.</label>
|
|
|
</application>
|
|
|
</appInfo>
|
|
|
<projectDesc>
|
|
|
<p>
|
|
|
<ref target="https://www.xyz.de"/>Data annotation was carried out in our project:
|
|
|
project description. used </p>
|
|
|
</projectDesc>
|
|
|
<editorialDecl>
|
|
|
<segmentation>
|
|
|
<p>Annotation 'dipl' has an independent segmentation. Every other annotation is
|
|
|
based on the segmentation of 'dipl'.</p>
|
|
|
</segmentation>
|
|
|
<normalization>
|
|
|
<p>No normalization is applied.</p>
|
|
|
</normalization>
|
|
|
</editorialDecl>
|
|
|
<tagsDecl>
|
|
|
<namespace name="dipl" rend="Transcription" xml:id="d">
|
|
|
<tagUsage gi="String">Diplomatic, character-based transcription.</tagUsage>
|
|
|
</namespace>
|
|
|
<namespace name="POS" rend="Lexical" xml:id="pos">
|
|
|
<tagUsage gi="DET">Determiner.</tagUsage>
|
|
|
<tagUsage gi="N">Noun.</tagUsage>
|
|
|
<tagUsage gi="V">Verb.</tagUsage>
|
|
|
<tagUsage gi="P">Punctuation.</tagUsage>
|
|
|
<tagUsage gi="PRON">Pronoun.</tagUsage>
|
|
|
</namespace>
|
|
|
</tagsDecl>
|
|
|
</encodingDesc>
|
|
|
<revisionDesc>
|
|
|
<change n="1.0" type="CorpusRelease" when="2019" who="xyz">xyz</change>
|
|
|
</revisionDesc>
|
|
|
</teiHeader>
|
|
|
<text/>
|
|
|
</TEI>
|
|
|
```
|
|
|
|
|
|
|
|
|
|