Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CoNLL-U language support #3810

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions components.json
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@
"alias": "conc",
"owner": "jasontatton"
},
"conllu": {
"title": "CoNLL-U",
"owner": "Querela"
},
"csp": {
"title": "Content-Security-Policy",
"owner": "ScottHelme"
Expand Down
162 changes: 162 additions & 0 deletions components/prism-conllu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
(function (Prism) {

// Grammar for CoNLL-U files: comment lines, blank-line sentence separators and
// tab-separated word lines.
Prism.languages.conllu = {
	// Comment lines. The pattern is anchored to the start of a line (`^` with
	// the `m` flag) so that a "#" inside a word line -- for example as the FORM
	// of a token -- is not highlighted as a comment. `.` (rather than `[^\n]`)
	// keeps a trailing "\r" out of the match, so the `$`-anchored metadata
	// value below still matches on CRLF input.
	comment: {
		pattern: /^#.*/m,
		inside: {
			// `key = value` metadata such as `# sent_id = 2`
			metadata: {
				pattern: /\w+\s*=\s*.*/,
				inside: {
					key: {
						pattern: /\w+(?=\s*=)/,
						alias: 'property',
					},
					value: {
						// the lookbehind group swallows "=" and the whitespace around it
						pattern: /(\s*=\s*)\S.*$/,
						lookbehind: true,
						alias: 'string',
					},
					operator: /=/,
				},
			},
			punctuation: /^#/,
		},
	},
	// Separator between two sentence blocks (an empty line). The first line
	// break is consumed by the lookbehind group and the second is only looked
	// ahead at, so the resulting token has empty content.
	// The pattern contains no `.`, so no `s` (dotAll) flag is needed.
	'sentence-separator': {
		pattern: /(\r?\n)(?=\r?\n)/,
		lookbehind: true,
	},
	// Word lines
	token: {
		pattern: /.+/,
		inside: {
			// plain index ("3"), multiword range ("3-4") or empty node ("5.1")
			id: {
				pattern: /^\d+(?:[.-]\d+)?/,
				alias: 'number',
			},
			// form / lemma / upos / xpos / feats / head / deprel / deps / misc
			// Every field is first emitted as a generic "value" token (the
			// leading tab is dropped via lookbehind); the `after-tokenize` hook
			// in this file then renames each one according to its column.
			value: {
				pattern: /^(\t)[^\t]*(?=\t|$)/,
				lookbehind: true,
			},
		},
	},
};

// Feature name: capitalised identifier with an optional lower-case layer in
// square brackets, e.g. "Case" or "Number[psor]".
const featKeyExp = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/;
// Feature value: deliberately permissive, everything up to the end of the feature.
const featValueExp = /.+/;

// Grammar for "|"-separated `Key=Value` lists (used for the FEATS and MISC columns).
const featsGrammar = {
	punctuation: /\|/,
	feature: {
		pattern: new RegExp(`^${featKeyExp.source}=.*$`),
		inside: {
			key: {
				// Anchored to the start of the feature: without `^`, a value that
				// itself contains "Name=" (e.g. "Ref=Abc=x") would produce a second,
				// bogus key token out of the value text.
				pattern: new RegExp(`^${featKeyExp.source}(?==)`),
				alias: 'property',
			},
			value: [
				{
					// yes/no values (any letter case) are shown as booleans
					pattern: /(=)(?:yes|no)$/i,
					lookbehind: true,
					alias: 'boolean',
				},
				{
					pattern: new RegExp(`(=)${featValueExp.source}$`),
					lookbehind: true,
					alias: 'string',
				},
			],
			operator: /=/,
		},
	},
};

// Strict shape of a dependency relation label: a lower-case base relation with
// optional ":"-separated subtypes, one of which may contain non-ASCII letters.
// Not referenced by the grammar below (which accepts any relation text); kept
// as a reference. The `u` flag is required -- without it `\p{..}` is not a
// Unicode property escape but a run of literal characters.
const relationExp = /^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$/u;

// Grammar for the DEPS column: "|"-separated `head:relation` pairs.
const depsGrammar = {
	punctuation: /\|/,
	dep: {
		pattern: /^\S+$/,
		inside: {
			head: {
				// Anchored to the start of the pair and accepting a decimal part, so
				// an empty-node head such as "5.1:conj" becomes one "5.1" token
				// instead of a stray "5." followed by head "1".
				pattern: /^\d+(?:\.\d+)?(?=:)/,
				alias: 'number',
			},
			punctuation: /^:/,
			relation: {
				pattern: /.+/, // everything that is left after head and ":"
				alias: 'symbol',
			},
		},
	},
};

// Post-processing hook that gives every generic "value" field of a word line
// its column-specific role. The three tables below are parallel: index i
// describes the i-th "value" token of a row (i.e. the i-th column after the ID).
const entryTypes = ['form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'];
// additional alias per column (null = none)
const entryTypesAlias = [null, null, 'symbol', 'symbol', null, 'number', 'symbol', null, null];
// nested grammar per column (null = the content stays a plain string)
const entryTypeInside = [null, null, null, null, featsGrammar, null, null, depsGrammar, featsGrammar];

// Renames one "value" token to the role of the given column and, for columns
// with a nested grammar, tokenizes its content. Columns beyond the known nine
// are left untouched. Expects `field.alias` to already be an array.
const applyColumnRole = function (field, column) {
	if (column >= entryTypes.length) {
		return;
	}

	const extraAlias = entryTypesAlias[column];
	const nestedGrammar = entryTypeInside[column];

	// keep the former type ("value") as an alias, then switch to the column role
	field.alias.push(field.type);
	field.type = entryTypes[column];

	if (extraAlias !== null) {
		field.alias.push(extraAlias);
	} else if (nestedGrammar === null) {
		// plain columns without an alias of their own are shown as strings
		field.alias.push('string');
	}

	// "_" (unspecified) is never handed to the nested grammar
	if (nestedGrammar !== null && field.content !== '_') {
		field.content = Prism.tokenize(field.content, nestedGrammar);
	}
};

Prism.hooks.add('after-tokenize', function (env) {
	if (env.language !== 'conllu') {
		return;
	}

	for (const row of env.tokens) {
		// only word lines are processed; comments and sentence separators are skipped
		if (row.type !== 'token') {
			continue;
		}

		let column = 0;
		for (const field of row.content) {
			// skip the plain strings between fields and anything that is not a "value" (e.g. the id)
			if (typeof field === 'string' || field?.type !== 'value') {
				continue;
			}

			// normalise the alias to an array so further aliases can be appended
			if (field.alias === undefined) {
				field.alias = [];
			} else if (typeof field.alias === 'string') {
				field.alias = [field.alias];
			}

			// mark the "_" placeholder
			if (field.content === '_') {
				field.alias.push('unspecified');
			}

			applyColumnRole(field, column);
			column += 1;
		}
	}
});

// just to have the classes listed on /faq.html#how-do-i-know-which-tokens-i-can-style-for
// insert dummy rules that do not match anything
// TODO: unsure about possible performance hit? - there should not be anything left to match but regex matching steps increase linearly with input string length ...
// for (let index = 0; index < entryTypes.length; index++) {
// const entryType = entryTypes[index];
// const entryTypeAlias = entryTypesAlias[index];
// const name = 'value.' + entryType + (entryTypeAlias !== null ? '.' + entryTypeAlias : '');
// // use some invalid pattern
// Prism.languages.conllu.token.inside[name] = /\b\B/;
// }

}(Prism));
119 changes: 119 additions & 0 deletions examples/prism-conllu.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<p>Full details can be found at <a href="https://universaldependencies.org/format.html" target="_blank">Universal Dependencies - Format</a>.</p>

<h2>Comments</h2>

<pre><code># sent_id = 2
# text = I have no clue.
# or a simple string</code></pre>

<h2>Full Example</h2>

<pre><code># sent_id = 2
# text = I have no clue.
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
3 no no DET DT PronType=Neg 4 det _ _
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _</code></pre>

<h2>Words, Tokens and Empty Nodes</h2>

<pre><code>1-2 vámonos _
1 vamos ir
2 nos nosotros
3-4 al _
3 a a
4 el el
5 mar mar</code></pre>

<pre><code>1 Sue Sue
2 likes like
3 coffee coffee
4 and and
5 Bill Bill
5.1 likes like
6 tea tea</code></pre>

<pre><code>1 nosotros nosotros
2 vamos ir
3-4 al _
3 a a
4 el el
5 mar mar
6 y y
7 vosotros vosotros
7.1 vais ir
8-9 al _
8 a a
9 el el
10 parque parque</code></pre>

<h2>Morphological Annotation</h2>

<pre><code>1 Då då ADV AB _
2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act
3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing
4 elva elva NUM RG.NOM Case=Nom|NumType=Card
5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
6 . . PUNCT DL.MAD _</code></pre>

<h2>Syntactic Annotation</h2>

<pre><code>1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root
3 and and CCONJ CC _ 4 cc 4:cc
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj
6 . . PUNCT . _ 2 punct 2:punct</code></pre>

<h2>Untokenized Text</h2>

<pre><code># text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“).
# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”).
1 Er er PRON … _
2 arbeitet arbeiten VERB … _
3-4 fürs _ _ … _
3 für für ADP … _
4 das der DET … _
5 FBI FBI PROPN … _
6 ( ( PUNCT … SpaceAfter=No
7 deutsch deutsch ADV … _
8 etwa etwa ADV … SpaceAfter=No
9 : : PUNCT … _
10 „ „ PUNCT … SpaceAfter=No
11 Bundesamt Bundesamt NOUN … _
12 für für ADP … _
13 Ermittlung Ermittlung NOUN … SpaceAfter=No
14 “ “ PUNCT … SpaceAfter=No
15 ) ) PUNCT … SpaceAfter=No
16 . . PUNCT … _</code></pre>

<h2>Sentence Boundaries and Comments</h2>

<pre><code># sent_id = 1
# text = They buy and sell books.
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
3 and and CCONJ CC _ 4 cc 4:cc _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
6 . . PUNCT . _ 2 punct 2:punct _

# sent_id = 2
# text = I have no clue.
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
3 no no DET DT PronType=Neg 4 det _ _
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _

# sent_id = panc0.s4
# text = तत् यथानुश्रूयते।
# translit = tat yathānuśrūyate.
# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
# text_en = This is what is heard.
1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it
2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No
2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how
3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=.</code></pre>
97 changes: 97 additions & 0 deletions tests/languages/conllu/index_feature.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
1-2 vámonos _
1 vamos ir
2 nos nosotros
3-4 al _
3 a a
4 el el
5 mar mar

1 Sue Sue
2 likes like
3 coffee coffee
4 and and
5 Bill Bill
5.1 likes like
6 tea tea

----------------------------------------------------

[
["token", [
["id", "1-2"],
["form", "vámonos"],
["lemma", "_"]
]],
["token", [
["id", "1"],
["form", "vamos"],
["lemma", "ir"]
]],
["token", [
["id", "2"],
["form", "nos"],
["lemma", "nosotros"]
]],
["token", [
["id", "3-4"],
["form", "al"],
["lemma", "_"]
]],
["token", [
["id", "3"],
["form", "a"],
["lemma", "a"]
]],
["token", [
["id", "4"],
["form", "el"],
["lemma", "el"]
]],
["token", [
["id", "5"],
["form", "mar"],
["lemma", "mar"]
]],
["sentence-separator", ""],
["token", [
["id", "1"],
["form", "Sue"],
["lemma", "Sue"]
]],
["token", [
["id", "2"],
["form", "likes"],
["lemma", "like"]
]],
["token", [
["id", "3"],
["form", "coffee"],
["lemma", "coffee"]
]],
["token", [
["id", "4"],
["form", "and"],
["lemma", "and"]
]],
["token", [
["id", "5"],
["form", "Bill"],
["lemma", "Bill"]
]],
["token", [
["id", "5.1"],
["form", "likes"],
["lemma", "like"]
]],
["token", [
["id", "6"],
["form", "tea"],
["lemma", "tea"]
]]
]

----------------------------------------------------

Testing indexing schemes.

https://universaldependencies.org/format.html
Loading
Loading