Skip to content

Commit

Permalink
Add CoNLL-U language support, see #3790
Browse files Browse the repository at this point in the history
  • Loading branch information
Querela committed Jul 11, 2024
1 parent 59e5a34 commit 768c8f1
Show file tree
Hide file tree
Showing 9 changed files with 1,561 additions and 0 deletions.
4 changes: 4 additions & 0 deletions components.json
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@
"alias": "conc",
"owner": "jasontatton"
},
"conllu": {
"title": "CoNLL-U",
"owner": "Querela"
},
"csp": {
"title": "Content-Security-Policy",
"owner": "ScottHelme"
Expand Down
162 changes: 162 additions & 0 deletions components/prism-conllu.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
(function (Prism) {

/**
 * Prism grammar for CoNLL-U annotated text.
 *
 * Three kinds of lines are recognised, in this order:
 *   1. comment lines beginning with `#` (optionally carrying `key = value` metadata),
 *   2. the empty line separating two sentence blocks,
 *   3. word lines: an ID followed by tab-separated fields.
 *
 * All fields after the ID are emitted as generic `value` tokens here; the
 * `after-tokenize` hook registered further down in this file re-labels them
 * by column position.
 */
Prism.languages.conllu = {
	// Comment lines. The pattern is anchored to the start of a line (`^` with
	// the `m` flag) so that a `#` occurring inside a word line, e.g. as the
	// FORM of a hashtag token, is not mistaken for the start of a comment.
	comment: {
		pattern: /^#[^\n]*/m,
		inside: {
			// `key = value` metadata such as `# sent_id = 2`
			metadata: {
				pattern: /(?:\w+)\s*=\s*.*/,
				inside: {
					key: {
						pattern: /\w+(?=\s*=)/,
						alias: 'property',
					},
					value: {
						// the `=` and surrounding whitespace are matched as lookbehind
						// so only the value text itself is highlighted
						pattern: /(\s*=\s*)\S.*$/,
						lookbehind: true,
						alias: 'string',
					},
					operator: /=/,
				},
			},
			// the leading `#` of the comment line
			punctuation: /^#/,
		},
	},
	// Separator between two sentence blocks: the zero-width position between
	// two consecutive line breaks (the first line break is the lookbehind
	// group, the second is only looked ahead at).
	'sentence-separator': {
		pattern: /(\r?\n)(?=\r?\n)/,
		lookbehind: true,
	},
	// Word lines: everything left on a line after comments were taken out.
	token: {
		pattern: /.+/,
		inside: {
			// integer ID, range `1-2` (multiword token) or decimal `5.1` (empty node)
			id: {
				pattern: /^\d+(?:[.-]\d+)?/,
				alias: 'number',
			},
			// form / lemma / upos / xpos / feats / head / deprel / deps / misc
			// The leading tab is consumed as lookbehind so that consecutive
			// fields are matched one after another without text in between.
			value: {
				pattern: /^(\t)[^\t]*(?=\t|$)/,
				lookbehind: true,
			},
		},
	},
};

// Feature name: capitalised identifier with an optional lower-case layer in
// square brackets, e.g. `Number` or `Number[psor]`.
const featKeyExp = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/;
// Feature value: deliberately permissive, anything non-empty is accepted
// (rather than the stricter /[A-Z0-9][A-Za-z0-9]*/).
const featValueExp = /.+/;

// `yes` / `no` values (case-insensitive) are highlighted as booleans ...
const featBooleanValue = {
	pattern: /(=)(?:yes|no)$/i,
	lookbehind: true,
	alias: 'boolean',
};
// ... every other value is highlighted as a string.
const featStringValue = {
	pattern: new RegExp(`(=)${featValueExp.source}$`),
	lookbehind: true,
	alias: 'string',
};

// Inner grammar for `|`-separated lists of `Key=Value` pairs.
// `punctuation` comes first so the list is split at every `|` before the
// fully anchored `feature` pattern is tried on each remaining piece.
const featsGrammar = {
	punctuation: /\|/,
	feature: {
		pattern: new RegExp(`^${featKeyExp.source}=.*$`),
		inside: {
			key: {
				pattern: new RegExp(`${featKeyExp.source}(?==)`),
				alias: 'property',
			},
			value: [featBooleanValue, featStringValue],
			operator: /=/,
		},
	},
};

// Reference pattern for a dependency relation label with optional subtypes,
// e.g. `nsubj`, `nmod:poss` or `obl:auf_grund`. The `u` flag is required:
// without it `\p{...}` is not a Unicode property escape and only matches the
// literal characters between the brackets.
// It is not referenced by the grammar below, which accepts any text as the
// relation; it is kept as documentation of the expected format.
const relationExp = /^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$/u;

// Inner grammar for `|`-separated lists of `head:relation` pairs.
// `punctuation` comes first so the list is split at every `|` before the
// fully anchored `dep` pattern is tried on each remaining piece.
const depsGrammar = {
	punctuation: /\|/,
	dep: {
		pattern: /^\S+$/,
		inside: {
			// Head ID at the start of the pair. The optional decimal part keeps
			// IDs such as `5.1` in one piece; anchoring prevents digits inside
			// the relation label from being picked up as a head.
			head: {
				pattern: /^\d+(?:\.\d+)?(?=:)/,
				alias: 'number',
			},
			// the `:` between head and relation (at the start of what is left
			// once the head has been taken out)
			punctuation: /^:/,
			relation: {
				pattern: /.+/, // capture everything that remains
				alias: 'symbol',
			},
		},
	},
};

// Per-column lookup tables for the fields that follow the ID, in column order.
// The three arrays are indexed in parallel:
//   entryTypes      - token type assigned to the field
//   entryTypesAlias - extra alias for the field, or null
//   entryTypeInside - grammar used to tokenize the field's content, or null
const entryTypes = ['form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'];
const entryTypesAlias = [null, null, 'symbol', 'symbol', null, 'number', 'symbol', null, null];
const entryTypeInside = [null, null, null, null, featsGrammar, null, null, depsGrammar, featsGrammar];

/**
 * Turns a generic `value` token into the column-specific token for the given
 * position. The token is modified in place.
 *
 * @param {object} field a token of type `value` taken from a word line
 * @param {number} position zero-based index of the field among the line's `value` tokens
 */
function assignFieldRole(field, position) {
	// make sure aliases are held in an array so more can be appended
	if (field.alias === undefined) {
		field.alias = [];
	} else if (typeof field.alias === 'string') {
		field.alias = [field.alias];
	}

	// "_" marks a value as not specified and gets its own class
	const isUnspecified = field.content === '_';
	if (isUnspecified) {
		field.alias.push('unspecified');
	}

	// columns beyond the known ones are left as plain values
	if (position >= entryTypes.length) {
		return;
	}

	const extraAlias = entryTypesAlias[position];
	const innerGrammar = entryTypeInside[position];

	// keep the previous type ("value") as an alias, then switch to the column name
	field.alias.push(field.type);
	field.type = entryTypes[position];

	if (extraAlias !== null) {
		field.alias.push(extraAlias);
	} else if (innerGrammar === null) {
		// plain text columns without inner processing are shown as strings
		field.alias.push('string');
	}

	// structured columns get tokenized further, unless they are unspecified
	if (!isUnspecified && innerGrammar !== null) {
		field.content = Prism.tokenize(field.content, innerGrammar);
	}
}

// hook to assign roles to value fields
Prism.hooks.add('after-tokenize', function (env) {
	if (env.language !== 'conllu') {
		return;
	}

	for (const row of env.tokens) {
		// only word lines carry fields; comments and sentence separators are skipped
		if (row.type !== 'token') {
			continue;
		}

		let position = 0;
		for (const field of row.content) {
			// skip text between tokens as well as anything that is not a field (e.g. the id)
			if (typeof field === 'string' || field?.type !== 'value') {
				continue;
			}

			assignFieldRole(field, position);
			position += 1;
		}
	}
});

// just to have the classes listed on /faq.html#how-do-i-know-which-tokens-i-can-style-for
// insert dummy rules that do not match anything
// TODO: unsure about possible performance hit? - there should not be anything left to match but regex matching steps increase linearly with input string length ...
// for (let index = 0; index < entryTypes.length; index++) {
// const entryType = entryTypes[index];
// const entryTypeAlias = entryTypesAlias[index];
// const name = 'value.' + entryType + (entryTypeAlias !== null ? '.' + entryTypeAlias : '');
// // use some invalid pattern
// Prism.languages.conllu.token.inside[name] = /\b\B/;
// }

}(Prism));
119 changes: 119 additions & 0 deletions examples/prism-conllu.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<p>Full details can be found at <a href="https://universaldependencies.org/format.html" target="_blank">Universal Dependencies - Format</a>.</p>

<h2>Comments</h2>

<pre><code># sent_id = 2
# text = I have no clue.
# or a simple string</code></pre>

<h2>Full Example</h2>

<pre><code># sent_id = 2
# text = I have no clue.
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
3 no no DET DT PronType=Neg 4 det _ _
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _</code></pre>

<h2>Words, Tokens and Empty Nodes</h2>

<pre><code>1-2 vámonos _
1 vamos ir
2 nos nosotros
3-4 al _
3 a a
4 el el
5 mar mar</code></pre>

<pre><code>1 Sue Sue
2 likes like
3 coffee coffee
4 and and
5 Bill Bill
5.1 likes like
6 tea tea</code></pre>

<pre><code>1 nosotros nosotros
2 vamos ir
3-4 al _
3 a a
4 el el
5 mar mar
6 y y
7 vosotros vosotros
7.1 vais ir
8-9 al _
8 a a
9 el el
10 parque parque</code></pre>

<h2>Morphological Annotation</h2>

<pre><code>1 Då då ADV AB _
2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act
3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing
4 elva elva NUM RG.NOM Case=Nom|NumType=Card
5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
6 . . PUNCT DL.MAD _</code></pre>

<h2>Syntactic Annotation</h2>

<pre><code>1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root
3 and and CCONJ CC _ 4 cc 4:cc
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj
6 . . PUNCT . _ 2 punct 2:punct</code></pre>

<h2>Untokenized Text</h2>

<pre><code># text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“).
# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”).
1 Er er PRON … _
2 arbeitet arbeiten VERB … _
3-4 fürs _ _ … _
3 für für ADP … _
4 das der DET … _
5 FBI FBI PROPN … _
6 ( ( PUNCT … SpaceAfter=No
7 deutsch deutsch ADV … _
8 etwa etwa ADV … SpaceAfter=No
9 : : PUNCT … _
10 „ „ PUNCT … SpaceAfter=No
11 Bundesamt Bundesamt NOUN … _
12 für für ADP … _
13 Ermittlung Ermittlung NOUN … SpaceAfter=No
14 “ “ PUNCT … SpaceAfter=No
15 ) ) PUNCT … SpaceAfter=No
16 . . PUNCT … _</code></pre>

<h2>Sentence Boundaries and Comments</h2>

<pre><code># sent_id = 1
# text = They buy and sell books.
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
3 and and CCONJ CC _ 4 cc 4:cc _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
6 . . PUNCT . _ 2 punct 2:punct _

# sent_id = 2
# text = I have no clue.
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
3 no no DET DT PronType=Neg 4 det _ _
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _

# sent_id = panc0.s4
# text = तत् यथानुश्रूयते।
# translit = tat yathānuśrūyate.
# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
# text_en = This is what is heard.
1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it
2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No
2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how
3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=.</code></pre>
97 changes: 97 additions & 0 deletions tests/languages/conllu/index_feature.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
1-2 vámonos _
1 vamos ir
2 nos nosotros
3-4 al _
3 a a
4 el el
5 mar mar

1 Sue Sue
2 likes like
3 coffee coffee
4 and and
5 Bill Bill
5.1 likes like
6 tea tea

----------------------------------------------------

[
["token", [
["id", "1-2"],
["form", "vámonos"],
["lemma", "_"]
]],
["token", [
["id", "1"],
["form", "vamos"],
["lemma", "ir"]
]],
["token", [
["id", "2"],
["form", "nos"],
["lemma", "nosotros"]
]],
["token", [
["id", "3-4"],
["form", "al"],
["lemma", "_"]
]],
["token", [
["id", "3"],
["form", "a"],
["lemma", "a"]
]],
["token", [
["id", "4"],
["form", "el"],
["lemma", "el"]
]],
["token", [
["id", "5"],
["form", "mar"],
["lemma", "mar"]
]],
["sentence-separator", ""],
["token", [
["id", "1"],
["form", "Sue"],
["lemma", "Sue"]
]],
["token", [
["id", "2"],
["form", "likes"],
["lemma", "like"]
]],
["token", [
["id", "3"],
["form", "coffee"],
["lemma", "coffee"]
]],
["token", [
["id", "4"],
["form", "and"],
["lemma", "and"]
]],
["token", [
["id", "5"],
["form", "Bill"],
["lemma", "Bill"]
]],
["token", [
["id", "5.1"],
["form", "likes"],
["lemma", "like"]
]],
["token", [
["id", "6"],
["form", "tea"],
["lemma", "tea"]
]]
]

----------------------------------------------------

Testing indexing schemes.

https://universaldependencies.org/format.html
Loading

0 comments on commit 768c8f1

Please sign in to comment.