diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt index b1f008d..50cddb5 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt @@ -1,310 +1,342 @@ -package org.ivdnt.galahad.port.tei.export - -import org.ivdnt.galahad.app.report.Report -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.port.folia.export.deepcopy -import org.ivdnt.galahad.port.xml.getPlainTextContent -import org.ivdnt.galahad.util.insertFirst -import org.ivdnt.galahad.util.containedIn -import org.ivdnt.galahad.util.matchesUpTo -import org.w3c.dom.Document -import org.w3c.dom.Element -import org.w3c.dom.Node -import java.util.* -import kotlin.collections.ArrayList -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.WordForm - -fun HashSet.contains(s: String?, ignoreCase: Boolean = false): Boolean { - return any { it.equals(s, ignoreCase) } -} - -fun Node.wrapChildrenIn(newNode: Node) { - val children = this.childNodes.deepcopy() - for (child in children) { - newNode.appendChild(child) - } - // this.childnodes.length is now zero - newNode.parentNode.insertBefore(this, newNode) - this.appendChild(newNode) -} - -open class TEITextMerger( - var node: Node, - var offset: Int, - var document: Document, - val wordFormIter: ListIterator, - private val deleteList: ArrayList, - val layer: Layer, - val format: DocumentFormat, -) { - var newWTag: Element? = null - - open fun merge() { - if (node.nodeType == Node.TEXT_NODE) { - handeTextNode() - } else if (node.nodeType == Node.ELEMENT_NODE) { - handleElementNode() - } - } - - private fun handeTextNode() { - // Check whether there are wordforms to add - val wordFromsToAdd = getWordFormsToAdd().sortedBy { it.offset } - if (wordFromsToAdd.isNotEmpty()) { - // add the wordforms, mark this node for deletion - markForDeletion(node) - var previousEndOffset = offset - for (wf in wordFromsToAdd) { - if (wf.offset < offset) { - previousEndOffset = updateEndOffset(wf, previousEndOffset) - continue // It has been merged with a previous . - } - addWordForm(previousEndOffset, wf) - previousEndOffset = wf.endOffset - } - addTrailingChars(wordFromsToAdd) - } - } - - protected fun markForDeletion(n: Node): Boolean { - // If not contained in w, delete it. - val shouldDelete = !n.containedIn("w") - if (shouldDelete) deleteList.add(n) - return shouldDelete - } - - private fun addTrailingChars(wordFromsToAdd: List) { - var startIndex: Int = wordFromsToAdd.last().endOffset - offset - if (startIndex > node.textContent.length) { - startIndex = node.textContent.length - } - if (node.nodeType == Node.TEXT_NODE) { - val trailingText = node.textContent.substring(startIndex, node.textContent.length) - if (trailingText.isNotEmpty()) { - val trailingChars = document.createTextNode(trailingText) - node.parentNode.insertBefore(trailingChars, node) - } - } - } - - private fun updateEndOffset(wf: WordForm, previousEndOffset: Int): Int { - val words = node.textContent.split(Regex("""\s+""")) - if (words[0].isNotEmpty()) { - var i = 0 - var startIndex: Int - do { - startIndex = wf.literal.lastIndexOf(words[0].substring(0, words[0].length - i)) - i++ - } while (startIndex == -1) - return previousEndOffset + wf.literal.length - startIndex - } - return previousEndOffset - } - - protected open fun addWordForm(previousEndOffset: Int, wf: WordForm) { - // add leading characters - val leadingCharsText = node.textContent.substring(previousEndOffset - offset, wf.offset - offset) - if (leadingCharsText.isNotEmpty()) { - val leadingCharsNode = document.createTextNode(leadingCharsText) - node.parentNode.insertBefore(leadingCharsNode, node) - } - - // add tag or tag, depending on pos-tag - newWTag = createWTag(wf) - var endOffset: Int = wf.endOffset - offset - if (endOffset > node.textContent.length) { - endOffset = node.textContent.length - } - - // If our current text matches only partially, we will have to merge nodes. - val baseText = node.textContent.substring(wf.offset - offset, endOffset) - if (wf.literal == baseText) { - val newTextNode: Node = document.createTextNode(wf.literal) - node.parentNode.insertBefore(newWTag, node) - newWTag!!.insertFirst(newTextNode) - } else { - mergeTextNodes(baseText, newWTag!!, wf) - } - } - - private fun mergeTextNodes(baseText: String, wTag: Element, wf: WordForm) { - // Create a newTextNode, because node is in deleteList. - val newTextNode: Node = document.createTextNode(baseText) - node.parentNode.replaceChild(wTag, node) - // node is now dangling, so give it a textable parent. - val tmp = document.createElement("w") - tmp.appendChild(node) - // Now it is safe to overwrite node. - node = newTextNode - wTag.appendChild(node) - // Start merging the with what comes next. - mergeWTagWithSiblings(wTag, wf) - } - - private fun mergeWTagWithSiblings(wTag: Element, wf: WordForm) { - var wTag: Element = wTag - do { - // Go up in the tree until we have a nextSibling. - while (wTag.nextSibling == null) { - //wTag.wrapChildrenIn(wTag.parentNode) - wTag = moveWTagUp(wTag) - } - // Text contents. - val sibText = wTag.nextSibling.getPlainTextContent() - val plainText = wTag.getPlainTextContent() - var wText = "" - for (i in plainText.length-1 downTo 0) { - val tmp = plainText.substring(i) - if (!wf.literal.contains(tmp)) break - wText = tmp - } - - val stillNeeded = wf.literal.substring(wText.length) - // Determine up to where the sibText matches stillNeeded. - val matchingIndex = sibText.matchesUpTo(stillNeeded) - // Append the part that matches - if (sibText.length == matchingIndex) wTag.appendChild(wTag.nextSibling) - else { - val sibClone = wTag.nextSibling.cloneNode(true) - val sibTextToMatch = sibText.substring(0, matchingIndex) - val textToMatch = wText + sibTextToMatch - while (wText + sibClone.getPlainTextContent() != textToMatch) { - // get last element - var lastChild = sibClone - while (lastChild.lastChild != null) { - lastChild = lastChild.lastChild - } - // cut or delete - if (lastChild.textContent.isNotEmpty()) { - val matchesUpTo: Int = lastChild.textContent.matchesUpTo(sibTextToMatch) - lastChild.textContent = lastChild.textContent.substring(0, matchesUpTo) - } else { - lastChild.parentNode.removeChild(lastChild) - } - } - wTag.appendChild(sibClone) - } - // Continue while the literal is still only partially found - } while (!wTag.textContent.contains(wf.literal)) - } - - protected open fun moveWTagUp(wTag: Element): Element { - // TODO this still breaks in some cases. - wTag.wrapChildrenIn(wTag.parentNode) - return wTag - } - - protected open fun createWTag(wf: WordForm): Element { - val termToAdd = layer.termForWordForm(wf) - val wTag = if (layer.tagset.punctuationTags.contains(termToAdd.pos)) { - val n = document.createElement("pc") - n - } else { - val n = document.createElement("w") - n.setAttribute("lemma", termToAdd.lemma) - n - } - // Both and have a pos. - wTag.setAttribute(posType(), termToAdd.pos) - return wTag - } - - private fun posType(): String { - // For now always write pos to the @pos attribute. - // Even for legacy formats, because we want to update to TEIp5. - return "pos" - } - - protected fun getWordFormForOffsetOrNull(): WordForm? { - while(wordFormIter.hasNext()) { - val wf = wordFormIter.next() - if (wf.offset == offset) { - return wf - } else if (wf.offset > offset) { - // overstepped - wordFormIter.previous() - break - } - } - return null - } - - protected open fun handleElementNode() { - val element = node as Element - if (element.tagName == "w" || element.tagName == "pc") { - var wordFormToAdd: WordForm? = getWordFormForOffsetOrNull() - if (wordFormToAdd != null) { - // remove all whitespace within a -tag (although this rarely occurs anyway). - val sourceLiteral = node.getPlainTextContent().replace(Regex("""\s"""), "") - if (wordFormToAdd.literal == sourceLiteral) { - // This is a simple case since the tokenization matches - mergeWTag(wordFormToAdd, element) - } else { - // Tokenization mismatch, report it - Report.spottedIncompatibleTokenization( - wordFormToAdd, WordForm( - node.textContent, offset, node.textContent.length, - node.attributes?.getNamedItem("xml:id")?.textContent ?: "" - ) - ) - - // Best effort to fix it - if (wordFormToAdd.length > node.getPlainTextContent().length) { - // Add the term to this node, - // The excess length is ignored, which might result in the following token missing it's match - mergeWTag(wordFormToAdd, element) - } else { // wordFormToAdd.length < node.textContent.length - // This is a tricky case, we might want to split up the node - // Will leave this case for now - Report.tokenMissingAnnotation(node.textContent, offset) - } - } - } else { - // This is strange, we would expect an annotation, report it - Report.tokenMissingAnnotation(node.textContent, offset) - } - } - } - - private fun mergeWTag(wordFormToAdd: WordForm, element: Element) { - val termToAdd = layer.termForWordForm(wordFormToAdd) - // tags do not have a lemma. - if (element.tagName == "w") { - element.setAttribute("lemma", termToAdd.lemma) - } - element.setAttribute(posType(), termToAdd.pos) - element.removeAttribute("type") // Update legacy formats to TEI p5 - } - - private fun getWordFormsToAdd(): List { - val string = node.textContent // TODO or getplaintextcontent() ??? - val textEndOffset = offset + string.length - val result = mutableListOf() - // Go to the previous, if there is any. - // To fix scenarios like: abcdef ghi, when parsing the text node 'def ghi'. - if (wordFormIter.hasPrevious()) { - val prev = wordFormIter.previous() - if (endOfTermWithinText(prev, textEndOffset)) { - result.add(prev) - } - wordFormIter.next() - } - while (wordFormIter.hasNext()) { - val it = wordFormIter.next() - // Note: 'start'. The end might be in a following node. - val startOfTermWithinText = (it.offset >= offset) && (it.offset < textEndOffset) - if (startOfTermWithinText || endOfTermWithinText(it, textEndOffset)) { - result.add(it) - } else { - wordFormIter.previous() - break - } - } - return result - } - - private fun endOfTermWithinText(it: WordForm, textEndOffset: Int): Boolean { - return (it.endOffset > offset) && (it.endOffset <= textEndOffset) - } +package org.ivdnt.galahad.port.tei.export + +import org.ivdnt.galahad.app.report.Report +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.port.folia.export.deepcopy +import org.ivdnt.galahad.port.xml.getPlainTextContent +import org.w3c.dom.Document +import org.w3c.dom.Element +import org.w3c.dom.Node +import java.util.* +import kotlin.collections.ArrayList +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.WordForm +import org.ivdnt.galahad.util.* + +fun HashSet.contains(s: String?, ignoreCase: Boolean = false): Boolean { + return any { it.equals(s, ignoreCase) } +} + +open class TEITextMerger( + var node: Node, + var offset: Int, + var document: Document, + val wordFormIter: ListIterator, + private val deleteList: ArrayList, + val layer: Layer, + val format: DocumentFormat, +) { + var newWTag: Element? = null + + open fun merge() { + if (node.nodeType == Node.TEXT_NODE) { + handeTextNode() + } else if (node.nodeType == Node.ELEMENT_NODE) { + handleElementNode() + } + } + + private fun handeTextNode() { + // Check whether there are wordforms to add + val wordFromsToAdd = getWordFormsToAdd().sortedBy { it.offset } + if (wordFromsToAdd.isNotEmpty()) { + // add the wordforms, mark this node for deletion + markForDeletion(node) + var previousEndOffset = offset + for (wf in wordFromsToAdd) { + if (wf.offset < offset) { + previousEndOffset = updateEndOffset(wf, previousEndOffset) + continue // It has been merged with a previous . + } + addWordForm(previousEndOffset, wf) + previousEndOffset = wf.endOffset + } + addTrailingChars(wordFromsToAdd) + } + } + + protected fun markForDeletion(n: Node): Boolean { + // If not contained in w, delete it. + val shouldDelete = !n.containedIn("w") + if (shouldDelete) deleteList.add(n) + return shouldDelete + } + + private fun addTrailingChars(wordFromsToAdd: List) { + var startIndex: Int = wordFromsToAdd.last().endOffset - offset + if (startIndex > node.textContent.length) { + startIndex = node.textContent.length + } + if (node.nodeType == Node.TEXT_NODE) { + val trailingText = node.textContent.substring(startIndex, node.textContent.length) + if (trailingText.isNotBlank()) { + val trailingChars = document.createTextNode(trailingText) + node.parentNode.insertBefore(trailingChars, node) + } else if (trailingText.isBlank() && trailingText.isNotEmpty()) { + // Remove any empty nodes after trailing text has been removed. + // Example scenario:

example

+ // After constructing the , the space before the
remains. So we remove it and the parent (the ). + if (node.getPlainTextContent() == node.parentNode.getPlainTextContent()) { + deleteList.add(node.parentNode) + } + } + } + } + + private fun updateEndOffset(wf: WordForm, previousEndOffset: Int): Int { + val words = node.textContent.split(Regex("""\s+""")) + if (words[0].isNotEmpty()) { + var i = 0 + var startIndex: Int + do { + startIndex = wf.literal.lastIndexOf(words[0].substring(0, words[0].length - i)) + i++ + } while (startIndex == -1) + return previousEndOffset + wf.literal.length - startIndex + } + return previousEndOffset + } + + protected open fun addWordForm(previousEndOffset: Int, wf: WordForm) { + // add leading characters + val leadingCharsText = node.textContent.substring(previousEndOffset - offset, wf.offset - offset) + if (leadingCharsText.isNotEmpty()) { + val leadingCharsNode = document.createTextNode(leadingCharsText) + node.parentNode.insertBefore(leadingCharsNode, node) + } + + // add tag or tag, depending on pos-tag + newWTag = createWTag(wf) + var endOffset: Int = wf.endOffset - offset + if (endOffset > node.textContent.length) { + endOffset = node.textContent.length + } + + // If our current text matches only partially, we will have to merge nodes. + val baseText = node.textContent.substring(wf.offset - offset, endOffset) + if (wf.literal == baseText) { + val newTextNode: Node = document.createTextNode(wf.literal) + node.parentNode.insertBefore(newWTag, node) + newWTag!!.insertFirst(newTextNode) + } else { + mergeTextNodes(baseText, newWTag!!, wf) + } + } + + private fun mergeTextNodes(baseText: String, wTag: Element, wf: WordForm) { + // Create a newTextNode, because node is in deleteList. + val newTextNode: Node = document.createTextNode(baseText) + node.parentNode.replaceChild(wTag, node) + // node is now dangling, so give it a textable parent. + val tmp = document.createElement("w") + tmp.appendChild(node) + // Now it is safe to overwrite node. + node = newTextNode + wTag.appendChild(node) + // Start merging the with what comes next. + mergeWTagWithSiblings(wTag, wf) + } + + private fun mergeWTagWithSiblings(wTag: Element, wf: WordForm) { + var wTag: Element = wTag + do { + // Go up in the tree until we have a nextSibling. + while (wTag.nextSibling == null) { + //wTag.wrapChildrenIn(wTag.parentNode) + wTag = moveWTagUp(wTag) + } + // Text contents. + val sibText = wTag.nextSibling.getPlainTextContent() + val plainText = wTag.getPlainTextContent() + var wText = "" + for (i in plainText.length-1 downTo 0) { + val tmp = plainText.substring(i) + if (!wf.literal.contains(tmp)) break + wText = tmp + } + + val stillNeeded = wf.literal.substring(wText.length) + // Determine up to where the sibText matches stillNeeded. + val matchingIndex = sibText.matchesUpTo(stillNeeded) + // Append the part that matches + if (sibText.length == matchingIndex) { + // The whole sibling text matches: e.g. vanden + wTag.appendChild(wTag.nextSibling) + } + else { // The sibling text matches partially: e.g. vanden graal + val sibClone = wTag.nextSibling.cloneNode(true) + val clean = wTag.nextSibling.cloneNode(false) + if (clean.nodeType == Node.TEXT_NODE) { + clean.textContent = "" + } + val sibTextToMatch = sibText.substring(0, matchingIndex) + treeTraversal(sibClone, sibTextToMatch, clean, clean) + wTag.appendChild(clean) + } + // Continue while the literal is still only partially found + } while (!wTag.textContent.contains(wf.literal)) + } + + private fun treeTraversal(node: Node, textToMatch: String, clean: Node, cleanIndex: Node): Boolean { + // leaf node action: either add the text as a whole, or add a part + if (node.nodeType == Node.TEXT_NODE) { + val nodeText = node.getPlainTextContent() + if (nodeText.isNotEmpty()) { // some text: e.g. vanden graal + val correctedTextToMatch = textToMatch.substring(clean.getPlainTextContent().length) + val matchesUpTo: Int = nodeText.matchesUpTo(correctedTextToMatch) + cleanIndex.textContent = nodeText.substring(0, matchesUpTo) + } else { + // Nothing to do. Remember: [node] is already cloned to [clean]. + } + // Are we done now? + if (clean.getPlainTextContent() == textToMatch) { + return true + } + } + // Not a leaf node, recurse. + else { + for (i in 0 until node.childNodes.length) { + val child = node.childNodes.item(i) + val cleanChild = child.cloneNode(false) + if (cleanChild.nodeType == Node.TEXT_NODE) { + cleanChild.textContent = "" + } + cleanIndex.appendChild(cleanChild) + // And if at any point the text matches, we return. + if (treeTraversal(child, textToMatch, clean, cleanChild)) return true + } + } + // We are in a leaf node and we are not yet done. + return false + } + + protected open fun moveWTagUp(wTag: Element): Element { + val refToParent = wTag.parentNode + + val clonedParent = wTag.parentNode.cloneNode(false) + val children = wTag.childNodes.deepcopy() + for (child in children) { + clonedParent.appendChild(child) + } + wTag.appendChild(clonedParent) + wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode) + + if (refToParent.childOrNull("w", recurse=true) == null) { + deleteList.add(refToParent) + } + return wTag + } + + protected open fun createWTag(wf: WordForm): Element { + val termToAdd = layer.termForWordForm(wf) + val wTag = if (layer.tagset.punctuationTags.contains(termToAdd.pos)) { + val n = document.createElement("pc") + n + } else { + val n = document.createElement("w") + n.setAttribute("lemma", termToAdd.lemma) + n + } + // Both and have a pos. + wTag.setAttribute(posType(), termToAdd.pos) + return wTag + } + + private fun posType(): String { + // For now always write pos to the @pos attribute. + // Even for legacy formats, because we want to update to TEIp5. + return "pos" + } + + protected fun getWordFormForOffsetOrNull(): WordForm? { + while(wordFormIter.hasNext()) { + val wf = wordFormIter.next() + if (wf.offset == offset) { + return wf + } else if (wf.offset > offset) { + // overstepped + wordFormIter.previous() + break + } + } + return null + } + + protected open fun handleElementNode() { + val element = node as Element + if (element.tagName == "w" || element.tagName == "pc") { + var wordFormToAdd: WordForm? = getWordFormForOffsetOrNull() + if (wordFormToAdd != null) { + // remove all whitespace within a -tag (although this rarely occurs anyway). + val sourceLiteral = node.getPlainTextContent().replace(Regex("""\s"""), "") + if (wordFormToAdd.literal == sourceLiteral) { + // This is a simple case since the tokenization matches + mergeWTag(wordFormToAdd, element) + } else { + // Tokenization mismatch, report it + Report.spottedIncompatibleTokenization( + wordFormToAdd, WordForm( + node.textContent, offset, node.textContent.length, + node.attributes?.getNamedItem("xml:id")?.textContent ?: "" + ) + ) + + // Best effort to fix it + if (wordFormToAdd.length > node.getPlainTextContent().length) { + // Add the term to this node, + // The excess length is ignored, which might result in the following token missing it's match + mergeWTag(wordFormToAdd, element) + } else { // wordFormToAdd.length < node.textContent.length + // This is a tricky case, we might want to split up the node + // Will leave this case for now + Report.tokenMissingAnnotation(node.textContent, offset) + } + } + } else { + // This is strange, we would expect an annotation, report it + Report.tokenMissingAnnotation(node.textContent, offset) + } + } + } + + private fun mergeWTag(wordFormToAdd: WordForm, element: Element) { + val termToAdd = layer.termForWordForm(wordFormToAdd) + // tags do not have a lemma. + if (element.tagName == "w") { + element.setAttribute("lemma", termToAdd.lemma) + } + element.setAttribute(posType(), termToAdd.pos) + element.removeAttribute("type") // Update legacy formats to TEI p5 + } + + private fun getWordFormsToAdd(): List { + val string = node.textContent + val textEndOffset = offset + string.length + val result = mutableListOf() + // Go to the previous, if there is any. + // To fix scenarios like: abcdef ghi, when parsing the text node 'def ghi'. + if (wordFormIter.hasPrevious()) { + val prev = wordFormIter.previous() + if (endOfTermWithinText(prev, textEndOffset)) { + result.add(prev) + } + wordFormIter.next() + } + while (wordFormIter.hasNext()) { + val it = wordFormIter.next() + // Note: 'start'. The end might be in a following node. + val startOfTermWithinText = (it.offset >= offset) && (it.offset < textEndOffset) + if (startOfTermWithinText || endOfTermWithinText(it, textEndOffset)) { + result.add(it) + } else { + wordFormIter.previous() + break + } + } + return result + } + + private fun endOfTermWithinText(it: WordForm, textEndOffset: Int): Boolean { + return (it.endOffset > offset) && (it.endOffset <= textEndOffset) + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/util/NodeExtensions.kt b/server/src/main/kotlin/org/ivdnt/galahad/util/NodeExtensions.kt index 97859a3..bacb055 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/util/NodeExtensions.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/util/NodeExtensions.kt @@ -38,12 +38,16 @@ fun Node.nextNonTextSibling(): Node? { } /** Looks for the first child node, 1 deep, or null. */ -fun Node.childOrNull(childTag: String): Node? { +fun Node.childOrNull(childTag: String, recurse: Boolean = false): Node? { for (i in 0 until this.childNodes.length) { if (this.childNodes.item(i).nodeType == Node.ELEMENT_NODE) { if ((this.childNodes.item(i) as Element).tagName == childTag) { return this.childNodes.item(i) } + val childReturn = this.childNodes.item(i).childOrNull(childTag, recurse) + if (childReturn != null) { + return childReturn + } } } return null diff --git a/server/src/test/resources/tei/twine/merged-output.xml b/server/src/test/resources/tei/twine/merged-output.xml index b383f69..0c6893b 100644 --- a/server/src/test/resources/tei/twine/merged-output.xml +++ b/server/src/test/resources/tei/twine/merged-output.xml @@ -1,151 +1,135 @@ - - - - - - pie-tdn-all - - - TDN-Core - - - testCorpus - - - 872e7126-848d-4603-a57d-bc8abb889b96 - - - 2024-03-12 10:34:09 - - - testUser - - - 814bd005-ac80-4ee9-96fc-8582438714e1 - - - twine.input - - - - - -

net was ik naar school heen en terug wezen lopen enzo

- -. -. -. -scholen -scholen - - scholen -scho len -scho -len - -

electu

arien zijnde

- -

electu

arien

- - electu

arien

- - electu

arien

- -

testen

- -

testen woord

- -

cba fedabc defghi jklmno

-

abc def ghijklmno pqrstu xyzabc def

-

To obey or not to be,
that is the one question that yremained.

- - -

abc

-

abcdef

-

abcdef

- -

abcdef

-

abcdefghi

-

defghijkl

-

abcdefghijkl

- -

abcdef

-

abcdefghi

-

abcdefghi

-

abcdefghijkl

- -

cbaabcdef

-

cbaabcdefghi

-

abccbadefghi

-

abccbadefghijkl

- - -

die droecheit - rebellen.

- -

scholen.

- -

wilt verwittigen. Eersame vrome

- -

test

- -

Copie Wert belast ende bevolen

- -

andere officieren. Concludeert daeromme

- -

na de Ordonnantie en Placaaten van de Politie, haare af kundinge

- - -

Hoe Walewein Lancelote bescudde en enen camp voor hem vacht.

- -

ter putientie? van mijn heer

- -

alhier noe hoechnoedich

- -

met noch dlik? uuyt desen

- -

hopluyden oftmedende knechten

- -

naemen van de Keyserlicke Majesteyt de justicie om vergiffenisse.

- - -

nyet en es, maer der selver

- -

ik loop naar school

- -

ik loop naar huis

- -

stadt Sutphen Receptae 16 Martii

- -

frunden Receptae 16 Martii

- -

mir das ring, das sonst

- -

test-en

- -

genige voirschreven dan dat oick die fiandt ahm gesyn sich versammelen und sines arlisztes gewelttig moth sijn, vifftig oders

- - -

blievet froeme dat l g i in gesontheid

- -

wij in dese weszhae zeer verfallene und geoppressirde nots ke ghiene provisi van holt bij

- -

gunstige here, wij hebben hierbevoerens hadden

- -

An den scholtis to Zutphen of sijnen statholder Erentfeste

- -

welcker voirscreuen LVj LXj rydders die

- -

desse breiff. vermeldet Soe is

- - -

vyfftich auerkomen ende met

- -

pater toe Aelsum (bij Akkrum), ende

- -

Mijnen gans goetwijlleghen dijens[t] na allen vermoeghen altijt tibi honorem *  . Eerbare,

- - - -
+ + + twine.inputlinguistic annotation by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)Instituut voor de Nederlandse TaalDutch Language InstituteTEI merged by GaLAHaD (https://portal.clarin.ivdnt.org/galahad)Instituut voor de Nederlandse TaalDutch Language Institute!Needs to be filled in!twine.input__UUID_IGNORED_BY_TEST___teitestCorpussource namehttp://source.urltwine.inputinlineTDN-CoreThe file was automatically annotated within the platform GaLAHaD, which is a central hub for enriching historical Dutch.TEI xmlautomatically annotatedDutchtrue + + +

net was iknaar school heen en terug wezen lopen enzo

+ +. +. +. +scholen +scholen + + scholen +scho len +scho +len + +

electu

arien zijnde

+ +

electu

arien

+ + electu

arien

+ + electu

arien

+ +

testen

+ +

testen woord

+ +

testen

+

woord testen

+ +

woordtesten woord

+

woord testen woord

+ +

testen

+

testen

+ + +

cba fedabcdefghi jklmno

+

abcdef ghijklmnopqrstu xyzabc def

+

To obey or not to be,
that is theonequestion that yremained.

+ + +

abc

+

abcdef

+

abcdef

+ +

abcdef

+

abcdefghi

+

defghijkl

+

abcdefghijkl

+ +

abcdef

+

abcdefghi

+

abcdefghi

+

abcdefghijkl

+ +

cbaabcdef

+

cbaabcdefghi

+

abccbadefghi

+

abccbadefghijkl

+ + +

die droecheit + rebellen.

+ +

scholen.

+ +

wilt verwittigen. Eersame vrome

+ +

test

+ +

Copie Wert belast ende bevolen

+ +

andere officieren. Concludeert daeromme

+ +

na deOrdonnantie en Placaaten van de Politie, haare af kundinge

+ + +

Hoe Walewein Lancelote bescudde en enen camp voor hem vacht.

+ +

ter putientie? van mijn heer

+ +

alhiernoe hoechnoedich

+ +

met nochdlik? uuyt desen

+ +

hopluydenoftmedende knechten

+ +

naemen van de Keyserlicke Majesteyt de justicie om vergiffenisse.

+ + +

nyet en es, maer der selver

+ +

ik loopnaar school

+ +

ik loop naarhuis

+ +

stadt Sutphen Receptae 16 Martii

+ +

frunden Receptae 16 Martii

+ +

mir dasring, das sonst

+ +

test-en

+ +

genige voirschreven dan dat oick die fiandt ahm gesyn sich versammelen und sines arlisztes gewelttig moth sijn, vifftigoders

+ + +

blievet froeme dat l g i in gesontheid

+ +

wijin dese weszhae zeer verfallene und geoppressirde notske ghiene provisi van holt bij

+ +

gunstige here, wijhebben hierbevoerens hadden

+ +

An den scholtis to Zutphen of sijnen statholder Erentfeste

+ +

welcker voirscreuen LVjLXj rydders die

+ +

desse breiff.vermeldet Soe is

+ + +

vyfftichauerkomen ende met

+ +

pater toeAelsum(bij Akkrum), ende

+ +

Mijnen gans goetwijlleghen dijens[t] na allen vermoeghen altijt tibi honorem * . Eerbare,

+ +

ghemeynten vandenlande van breuites vitammagistrum

+ + +
\ No newline at end of file diff --git a/server/src/test/resources/tei/twine/pie-tdn.tsv b/server/src/test/resources/tei/twine/pie-tdn.tsv index e2f94ba..40c9fce 100644 --- a/server/src/test/resources/tei/twine/pie-tdn.tsv +++ b/server/src/test/resources/tei/twine/pie-tdn.tsv @@ -27,9 +27,20 @@ electu electus VRB(finiteness=fin,tense=pres)+PD(type=pers,position=free) arien arie NOU-C(number=pl) electu electus VRB(finiteness=fin,tense=pres)+PD(type=pers,position=free) arien arie NOU-C(number=pl) -testen te+esten PD(type=d-p,subtype=art,position=prenom)+NOU-C(number=sg) -testen te+eest NUM(type=ord,position=prenom,representation=let) +testen test NOU-C(number=pl) +testen test NOU-C(number=pl) woord woord NOU-C(number=sg) +testen test NOU-C(number=pl) +woord woord NOU-C(number=sg) +testen test NOU-C(number=pl) +woord woord NOU-C(number=sg) +testen test NOU-C(number=pl) +woord woord NOU-C(number=sg) +woord woord NOU-C(number=sg) +testen test NOU-C(number=pl) +woord woord NOU-C(number=sg) +testen test NOU-C(number=pl) +testen test NOU-C(number=pl) cba kabel AA(degree=pos,position=free) fedabc fedacht NOU-C(number=sg) defghi defigen NOU-C(number=sg) @@ -272,3 +283,10 @@ honorem hoorn NOU-P . LET Eerbare eerbaar AA(degree=pos,position=free) , LET +ghemeynten gemeente NOU-C(number=pl) +vanden van+de ADP(type=pre)+PD(type=d-p,subtype=art,position=prenom) +lande land NOU-C(number=sg) +van van ADP(type=pre) +breuites Breuites NOU-P +vitam vitam RES(type=for) +magistrum magistrum RES(type=for) diff --git a/server/src/test/resources/tei/twine/plaintext.txt b/server/src/test/resources/tei/twine/plaintext.txt index 25f5aa9..dea8fec 100644 --- a/server/src/test/resources/tei/twine/plaintext.txt +++ b/server/src/test/resources/tei/twine/plaintext.txt @@ -26,6 +26,18 @@ testen testen woord +testen + +woord testen + +woord testen woord + +woord testen woord + +testen + + testen + cba fedabc defghi jklmno abc def ghijklmno pqrstu xyzabc def @@ -133,3 +145,5 @@ pater toe Aelsum (bij Akkrum), ende Mijnen gans goetwijlleghen dijens[t] na allen vermoeghen altijt tibi honorem *  . Eerbare, +ghemeynten vanden lande van breuites vitam magistrum + diff --git a/server/src/test/resources/tei/twine/twine.input.xml b/server/src/test/resources/tei/twine/twine.input.xml index e1fd6c3..b3f72ad 100644 --- a/server/src/test/resources/tei/twine/twine.input.xml +++ b/server/src/test/resources/tei/twine/twine.input.xml @@ -38,6 +38,16 @@ len

testen woord

+

testen

+

woord testen

+ +

woord testen woord

+

woord testen woord

+ +

testen

+

testen

+ +

cba fedabc defghi jklmno

abc def ghijklmno pqrstu xyzabc def @@ -158,6 +168,7 @@ school

Mijnen gans goetwijlleghen dijens[t] na allen vermoeghen altijt tibi honorem *  . Eerbare,

+

ghemeynten vanden lande van breuites vitam magistrum