From 8fed93bae161506b3775c40ce85115dab4628e12 Mon Sep 17 00:00:00 2001
From: Vincent Prins
Date: Thu, 20 Jun 2024 12:54:23 +0200
Subject: [PATCH] Fix NULL appearing in exported files as lemma/pos
Now we export empty strings instead. See INL/Cobaltje#134
---
.../org/ivdnt/galahad/data/layer/Term.kt | 212 +++++++------
.../port/folia/export/FoliaTextMerger.kt | 300 +++++++++---------
.../folia/export/LayerToFoliaConverter.kt | 162 +++++-----
.../port/naf/export/LayerToNAFConverter.kt | 4 +-
.../port/tei/export/LayerToTEIConverter.kt | 4 +-
.../galahad/port/tei/export/TEITextMerger.kt | 8 +-
.../port/tsv/export/LayerToTSVConverter.kt | 4 +-
.../galahad/port/tsv/export/TSVLayerMerger.kt | 124 ++++----
.../galahad/data/document/DocumentTest.kt | 2 +-
.../all-formats/input/input.folia.xml | 16 +-
.../resources/all-formats/input/input.tsv | 16 +-
.../output/from-Conllu-to-Folia.folia.xml | 18 +-
.../all-formats/output/from-Conllu-to-Tsv.tsv | 16 +-
.../all-formats/output/from-Folia-to-Tsv.tsv | 16 +-
.../output/from-Naf-to-Folia.folia.xml | 18 +-
.../all-formats/output/from-Naf-to-Tsv.tsv | 16 +-
.../output/from-TeiP5-to-Folia.folia.xml | 18 +-
.../all-formats/output/from-TeiP5-to-Tsv.tsv | 16 +-
.../output/from-Tsv-to-Folia.folia.xml | 18 +-
.../output/from-Txt-to-Folia.folia.xml | 18 +-
.../all-formats/output/from-Txt-to-Tsv.tsv | 16 +-
.../folia/twine/merged-output.folia.xml | 2 +-
22 files changed, 517 insertions(+), 507 deletions(-)
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt
index 2aeddd8..e786b86 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt
@@ -1,102 +1,112 @@
-package org.ivdnt.galahad.data.layer
-
-import com.fasterxml.jackson.annotation.JsonIgnore
-import com.fasterxml.jackson.annotation.JsonProperty
-
-/** Avoid empty strings in the CSV representation. */
-fun Term.toNonEmptyPair(): Pair {
- return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA)
-}
-
-/**
- * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm].
- * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used.
- * Lemma and pos can be null.
- */
-data class Term(
- @JsonProperty("lemma") val lemma: String?,
- @JsonProperty("pos") val pos: String?,
- @JsonProperty("targets") val targets: MutableList,
-) {
- /** Whether the lemma is not null. */
- @get:JsonIgnore
- val hasLemma: Boolean = lemma != null
-
- /** Whether the pos is not null. */
- @get:JsonIgnore
- val hasPOS: Boolean = pos != null
-
- @get:JsonIgnore
- val posHeadGroupOrDefault
- get() = posHeadGroup ?: NO_POS
-
- @get:JsonIgnore
- val lemmaOrDefault
- get() = lemma ?: NO_LEMMA
-
- /** Whether this term refers to multiple [WordForm]. */
- @get:JsonIgnore
- val isMultiTarget = targets.size > 1
-
- /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */
- @get:JsonIgnore
- val posHead: String? = posToPosHead(pos)
-
- @get:JsonIgnore
- val isMultiPos: Boolean = pos?.contains("+") ?: false
-
- /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */
- @get:JsonIgnore
- val posHeadGroup: String? = run {
- // Split on +
- if (!isMultiPos) return@run posHead
- val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+")
- result
- }
-
-
-
- /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */
- @get:JsonIgnore
- val posFeatures: String?
- get() {
- if (pos == null) return null
- val featureStart: Int = pos?.indexOf('(') ?: -1
- val featureEnd: Int = pos?.indexOf(')') ?: -1
- return if (featureStart != -1 && featureEnd != -1) {
- return pos!!.slice(featureStart + 1 until featureEnd)
- } else null
- }
-
- /** Offset of the first [WordForm] in [targets].*/
- @get:JsonIgnore
- val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1
-
- /** String constructed from all the [WordForm] in [targets]. */
- @get:JsonIgnore
- val literals: String
- get() = targets.joinToString(" ") { it.literal }
-
- companion object {
- const val NO_POS = "NO_POS"
- const val NO_LEMMA = "NO_LEMMA"
- val EMPTY = Term(null, null, mutableListOf())
- private fun posToPosHead(pos: String?): String? {
- return if (pos == null) {
- null
- } else if (pos.contains('(')) {
- // pos contains a non-letter non-digit character
- val headEnd = pos.indexOf('(')
- val head = pos.slice(0 until headEnd)
- if (head.isEmpty()) {
- pos // pos is non-empty and starts with a non-letter character, e.g.: _
- } else {
- head
- }
- } else {
- // pos is 0 or more letters only
- pos
- }
- }
- }
+package org.ivdnt.galahad.data.layer
+
+import com.fasterxml.jackson.annotation.JsonIgnore
+import com.fasterxml.jackson.annotation.JsonProperty
+
+/** Avoid empty strings in the CSV representation. */
+fun Term.toNonEmptyPair(): Pair {
+ return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA)
+}
+
+/**
+ * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm].
+ * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used.
+ * Lemma and pos can be null.
+ */
+data class Term(
+ @JsonProperty("lemma") val lemma: String?,
+ @JsonProperty("pos") val pos: String?,
+ @JsonProperty("targets") val targets: MutableList,
+) {
+ /** Whether the lemma is not null. */
+ @get:JsonIgnore
+ val hasLemma: Boolean = lemma != null
+
+ /** Whether the pos is not null. */
+ @get:JsonIgnore
+ val hasPOS: Boolean = pos != null
+
+ @get:JsonIgnore
+ val posHeadGroupOrDefault
+ get() = posHeadGroup ?: NO_POS
+
+ @get:JsonIgnore
+ val lemmaOrDefault
+ get() = lemma ?: NO_LEMMA
+
+ @get:JsonIgnore
+ val lemmaOrEmpty
+ get() = lemma ?: ""
+
+ @get:JsonIgnore
+ val posOrEmpty
+ get() = pos ?: ""
+
+ /** Whether this term refers to multiple [WordForm]. */
+ @get:JsonIgnore
+ val isMultiTarget = targets.size > 1
+
+ /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */
+ @get:JsonIgnore
+ val posHead: String? = posToPosHead(pos)
+
+ @get:JsonIgnore
+ val isMultiPos: Boolean = pos?.contains("+") ?: false
+
+ /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */
+ @get:JsonIgnore
+ val posHeadGroup: String? = run {
+ // Split on +
+ if (!isMultiPos) return@run posHead
+ val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+")
+ result
+ }
+
+ @get:JsonIgnore
+ val posHeadGroupOrEmpty
+ get() = posHeadGroup ?: ""
+
+ /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */
+ @get:JsonIgnore
+ val posFeatures: String?
+ get() {
+ if (pos == null) return null
+ val featureStart: Int = pos.indexOf('(') ?: -1
+ val featureEnd: Int = pos.indexOf(')') ?: -1
+ return if (featureStart != -1 && featureEnd != -1) {
+ return pos.slice(featureStart + 1 until featureEnd)
+ } else null
+ }
+
+ /** Offset of the first [WordForm] in [targets].*/
+ @get:JsonIgnore
+ val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1
+
+ /** String constructed from all the [WordForm] in [targets]. */
+ @get:JsonIgnore
+ val literals: String
+ get() = targets.joinToString(" ") { it.literal }
+
+ companion object {
+ const val NO_POS = "NO_POS"
+ const val NO_LEMMA = "NO_LEMMA"
+ val EMPTY = Term(null, null, mutableListOf())
+ private fun posToPosHead(pos: String?): String? {
+ return if (pos == null) {
+ null
+ } else if (pos.contains('(')) {
+ // pos contains a non-letter non-digit character
+ val headEnd = pos.indexOf('(')
+ val head = pos.slice(0 until headEnd)
+ if (head.isEmpty()) {
+ pos // pos is non-empty and starts with a non-letter character, e.g.: _
+ } else {
+ head
+ }
+ } else {
+ // pos is 0 or more letters only
+ pos
+ }
+ }
+ }
}
\ No newline at end of file
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt
index 7e8da15..f826434 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt
@@ -1,151 +1,151 @@
-package org.ivdnt.galahad.port.folia.export
-
-import org.ivdnt.galahad.data.document.DocumentFormat
-import org.ivdnt.galahad.data.layer.Layer
-import org.ivdnt.galahad.data.layer.Term
-import org.ivdnt.galahad.data.layer.WordForm
-import org.ivdnt.galahad.port.tei.export.TEITextMerger
-import org.ivdnt.galahad.util.insertFirst
-import org.ivdnt.galahad.port.xml.reparseText
-import org.ivdnt.galahad.port.xml.tagName
-import org.ivdnt.galahad.util.childOrNull
-import org.ivdnt.galahad.util.insertAfter
-import org.w3c.dom.Document
-import org.w3c.dom.Element
-import org.w3c.dom.Node
-import org.w3c.dom.NodeList
-
-fun NodeList.deepcopy(): ArrayList {
- val copy = ArrayList()
- for (i in 0 until this.length) {
- copy.add(this.item(i))
- }
- return copy
-}
-
-class FoliaTextMerger(
- node: Node, offset: Int, document: Document, wordFormIter: ListIterator,
- deleteList: ArrayList, layer: Layer
-) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) {
-
- override fun merge() {
- if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) {
- // We are going to add nodes when exporting, but we don't want to iterate over them
- // So better store references to the current nodes and use them
- val parent = node
- val oldChildNodes = node.childNodes.deepcopy()
- var endsWithSpace = true
- for (child in oldChildNodes) {
- node = child
- val text = reparseText(child.textContent)
-
- if (node.nodeType == Node.TEXT_NODE) {
- child.textContent = text
- }
- // TODO reparseText overwrites embedded t-styles
-
- // never set the offset of more than one space.
- if (endsWithSpace && text.startsWith(" ")) {
- offset -= 1
- }
- merge()
-
- // Keep track of the ending space
- if (text.isNotEmpty() && text.endsWith(" "))
- endsWithSpace = true
- else if (text.isNotEmpty())
- endsWithSpace = false
-
- if (child.nodeType == Node.TEXT_NODE)
- offset += text.length
- }
- // Remove parent and transfer children.
- if(markForDeletion(parent)) {
- var last = parent
- for (i in parent.childNodes.length - 1 downTo 0) {
- val c = parent.childNodes.item(i)
- parent.parentNode.insertBefore(c, last)
- last = c
- }
- }
-
- } else {
- super.merge()
- }
- }
-
- override fun createWTag(wf: WordForm): Element {
- val wTag = node.parentNode.cloneNode(false)
- return wTag as Element
- }
-
- override fun addWordForm(previousEndOffset: Int, wf: WordForm) {
- super.addWordForm(previousEndOffset, wf)
- // For Folia, newWTag is actually a or tag.
- var tTag: Node = newWTag!!
- // Make sure tTag points to a . For e.g. a , grab the first parent.
- var parent =
- if (tTag.parentNode.tagName() == "t") tTag.parentNode
- else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself.
- while (tTag.tagName() != "t") {
- val clone = parent.cloneNode(false)
- tTag.parentNode.replaceChild(clone,tTag)
- clone.insertFirst(tTag)
- // Ready for next iter.
- parent = parent.parentNode
- tTag = clone
- }
- // Create the which will contain the
- val wTag = document.createElement("w")
- val term = layer.termForWordForm(wf)
- wTag.addTerm(term)
- // Contain it.
- tTag.parentNode.replaceChild(wTag,tTag)
- wTag.insertFirst(tTag)
- }
-
- override fun handleElementNode() {
- val element = node as Element
- if (element.tagName != "w") return
-
- val wordFormToAdd = getWordFormForOffsetOrNull() ?: return
- val term = layer.termForWordForm(wordFormToAdd)
- element.addTerm(term)
- }
-
- override fun moveWTagUp(wTag: Element): Element {
- wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode)
- val clone = wTag.parentNode.cloneNode(false)
- wTag.parentNode.replaceChild(clone, wTag)
- clone.appendChild(wTag)
- newWTag = clone as Element
- return clone
- }
-
- private fun Element.addTerm(term: Term) {
- this.addTermFeature("lemma", term.lemma.toString())
- this.addTermFeature("pos", term.pos.toString(), term.posHeadGroup)
- }
-
- private fun Element.addTermFeature(name: String, value: String, head: String? = null) {
- /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags
- * uncomment this. For now, multiple annotation layers are okay in the export.
- // Find the child elements of [name] and delete them
- val children = this.childNodes.deepcopy()
- for (child in children) {
- if (child.tagName() == name) {
- this.removeChild(child)
- }
- }*/
-
- // Create a new child element of [name]
- val child = this.ownerDocument.createElement(name)
- child.setAttribute("class", value)
- // For PoS
- if (head != null) child.setAttribute("head", head)
- // Folia metadata.
- child.setAttribute("processor", layer.name)
- child.setAttribute("set", layer.name)
- this.appendChild(child)
- }
+package org.ivdnt.galahad.port.folia.export
+
+import org.ivdnt.galahad.data.document.DocumentFormat
+import org.ivdnt.galahad.data.layer.Layer
+import org.ivdnt.galahad.data.layer.Term
+import org.ivdnt.galahad.data.layer.WordForm
+import org.ivdnt.galahad.port.tei.export.TEITextMerger
+import org.ivdnt.galahad.util.insertFirst
+import org.ivdnt.galahad.port.xml.reparseText
+import org.ivdnt.galahad.port.xml.tagName
+import org.ivdnt.galahad.util.childOrNull
+import org.ivdnt.galahad.util.insertAfter
+import org.w3c.dom.Document
+import org.w3c.dom.Element
+import org.w3c.dom.Node
+import org.w3c.dom.NodeList
+
+fun NodeList.deepcopy(): ArrayList {
+ val copy = ArrayList()
+ for (i in 0 until this.length) {
+ copy.add(this.item(i))
+ }
+ return copy
+}
+
+class FoliaTextMerger(
+ node: Node, offset: Int, document: Document, wordFormIter: ListIterator,
+ deleteList: ArrayList, layer: Layer
+) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) {
+
+ override fun merge() {
+ if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) {
+ // We are going to add nodes when exporting, but we don't want to iterate over them
+ // So better store references to the current nodes and use them
+ val parent = node
+ val oldChildNodes = node.childNodes.deepcopy()
+ var endsWithSpace = true
+ for (child in oldChildNodes) {
+ node = child
+ val text = reparseText(child.textContent)
+
+ if (node.nodeType == Node.TEXT_NODE) {
+ child.textContent = text
+ }
+ // TODO reparseText overwrites embedded t-styles
+
+ // never set the offset of more than one space.
+ if (endsWithSpace && text.startsWith(" ")) {
+ offset -= 1
+ }
+ merge()
+
+ // Keep track of the ending space
+ if (text.isNotEmpty() && text.endsWith(" "))
+ endsWithSpace = true
+ else if (text.isNotEmpty())
+ endsWithSpace = false
+
+ if (child.nodeType == Node.TEXT_NODE)
+ offset += text.length
+ }
+ // Remove parent and transfer children.
+ if(markForDeletion(parent)) {
+ var last = parent
+ for (i in parent.childNodes.length - 1 downTo 0) {
+ val c = parent.childNodes.item(i)
+ parent.parentNode.insertBefore(c, last)
+ last = c
+ }
+ }
+
+ } else {
+ super.merge()
+ }
+ }
+
+ override fun createWTag(wf: WordForm): Element {
+ val wTag = node.parentNode.cloneNode(false)
+ return wTag as Element
+ }
+
+ override fun addWordForm(previousEndOffset: Int, wf: WordForm) {
+ super.addWordForm(previousEndOffset, wf)
+ // For Folia, newWTag is actually a or tag.
+ var tTag: Node = newWTag!!
+ // Make sure tTag points to a . For e.g. a , grab the first parent.
+ var parent =
+ if (tTag.parentNode.tagName() == "t") tTag.parentNode
+ else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself.
+ while (tTag.tagName() != "t") {
+ val clone = parent.cloneNode(false)
+ tTag.parentNode.replaceChild(clone,tTag)
+ clone.insertFirst(tTag)
+ // Ready for next iter.
+ parent = parent.parentNode
+ tTag = clone
+ }
+ // Create the which will contain the
+ val wTag = document.createElement("w")
+ val term = layer.termForWordForm(wf)
+ wTag.addTerm(term)
+ // Contain it.
+ tTag.parentNode.replaceChild(wTag,tTag)
+ wTag.insertFirst(tTag)
+ }
+
+ override fun handleElementNode() {
+ val element = node as Element
+ if (element.tagName != "w") return
+
+ val wordFormToAdd = getWordFormForOffsetOrNull() ?: return
+ val term = layer.termForWordForm(wordFormToAdd)
+ element.addTerm(term)
+ }
+
+ override fun moveWTagUp(wTag: Element): Element {
+ wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode)
+ val clone = wTag.parentNode.cloneNode(false)
+ wTag.parentNode.replaceChild(clone, wTag)
+ clone.appendChild(wTag)
+ newWTag = clone as Element
+ return clone
+ }
+
+ private fun Element.addTerm(term: Term) {
+ this.addTermFeature("lemma", term.lemmaOrEmpty)
+ this.addTermFeature("pos", term.posOrEmpty, term.posHeadGroupOrEmpty)
+ }
+
+ private fun Element.addTermFeature(name: String, value: String, head: String? = null) {
+ /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags
+ * uncomment this. For now, multiple annotation layers are okay in the export.
+ // Find the child elements of [name] and delete them
+ val children = this.childNodes.deepcopy()
+ for (child in children) {
+ if (child.tagName() == name) {
+ this.removeChild(child)
+ }
+ }*/
+
+ // Create a new child element of [name]
+ val child = this.ownerDocument.createElement(name)
+ child.setAttribute("class", value)
+ // For PoS
+ if (head != null) child.setAttribute("head", head)
+ // Folia metadata.
+ child.setAttribute("processor", layer.name)
+ child.setAttribute("set", layer.name)
+ this.appendChild(child)
+ }
}
\ No newline at end of file
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt
index 5eecf59..777de28 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt
@@ -1,82 +1,82 @@
-package org.ivdnt.galahad.port.folia.export
-
-import org.ivdnt.galahad.data.document.DocumentFormat
-import org.ivdnt.galahad.data.layer.Term
-import org.ivdnt.galahad.port.DocumentTransformMetadata
-import org.ivdnt.galahad.port.LayerConverter
-import org.ivdnt.galahad.port.LayerTransformer
-import org.ivdnt.galahad.util.XMLWriter
-import org.ivdnt.galahad.util.escapeXML
-import org.ivdnt.galahad.util.toValidXmlId
-import java.io.OutputStream
-
-class LayerToFoliaConverter (
- transformMetadata: DocumentTransformMetadata,
-) : LayerConverter, LayerTransformer( transformMetadata ) {
-
- override val format: DocumentFormat
- get() = DocumentFormat.Folia
-
- val id: String
- get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId()
-
- override fun convert(outputStream: OutputStream) {
- val taggerName = tagger.id
- val writer = XMLWriter(outputStream)
- // XML Header
- writer.writeLineRaw("")
- writer.openTag("")
- // Metadata
- writeMetadata(writer, taggerName)
- // Textbody
- writeTextBody(writer, taggerName)
- }
-
- private fun writeMetadata(writer: XMLWriter, taggerName: String) {
- writer.openTag("")
- // Annotations
- writer.openTag("")
- writer.writeLine("")
- writer.writeLine("")
- writer.writeLine("")
- writer.writeLine("")
- for (annotation in setOf("lemma", "pos")) {
- writer.openTag("<$annotation-annotation set=\"${taggerName}\">")
- writer.writeLine("")
- writer.closeTag("$annotation-annotation>")
- }
- writer.closeTag("")
- // Provenance
- writer.openTag("")
- writer.writeLine(
- ""
- )
- writer.closeTag("")
- writer.closeTag("")
- }
-
- private fun writeTextBody(writer: XMLWriter, taggerName: String) {
- writer.openTag("")
- writer.openTag("")
- for ((index, term) in this.result.terms.withIndex()) {
- // Single W
- writeSingleW(writer, index, term, taggerName)
- }
- writer.closeTag("
")
- writer.closeTag("")
- writer.closeTag("")
- }
-
- private fun writeSingleW(
- writer: XMLWriter, index: Int, term: Term,
- taggerName: String,
- ) {
- writer.openTag("")
- writer.writeLine("${term.targets[0].literal.escapeXML()}")
- writer.writeLine("")
- writer.writeLine(
- ""
- )
- writer.closeTag("")
- }
+package org.ivdnt.galahad.port.folia.export
+
+import org.ivdnt.galahad.data.document.DocumentFormat
+import org.ivdnt.galahad.data.layer.Term
+import org.ivdnt.galahad.port.DocumentTransformMetadata
+import org.ivdnt.galahad.port.LayerConverter
+import org.ivdnt.galahad.port.LayerTransformer
+import org.ivdnt.galahad.util.XMLWriter
+import org.ivdnt.galahad.util.escapeXML
+import org.ivdnt.galahad.util.toValidXmlId
+import java.io.OutputStream
+
+class LayerToFoliaConverter (
+ transformMetadata: DocumentTransformMetadata,
+) : LayerConverter, LayerTransformer( transformMetadata ) {
+
+ override val format: DocumentFormat
+ get() = DocumentFormat.Folia
+
+ val id: String
+ get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId()
+
+ override fun convert(outputStream: OutputStream) {
+ val taggerName = tagger.id
+ val writer = XMLWriter(outputStream)
+ // XML Header
+ writer.writeLineRaw("")
+ writer.openTag("")
+ // Metadata
+ writeMetadata(writer, taggerName)
+ // Textbody
+ writeTextBody(writer, taggerName)
+ }
+
+ private fun writeMetadata(writer: XMLWriter, taggerName: String) {
+ writer.openTag("")
+ // Annotations
+ writer.openTag("")
+ writer.writeLine("")
+ writer.writeLine("")
+ writer.writeLine("")
+ writer.writeLine("")
+ for (annotation in setOf("lemma", "pos")) {
+ writer.openTag("<$annotation-annotation set=\"${taggerName}\">")
+ writer.writeLine("")
+ writer.closeTag("$annotation-annotation>")
+ }
+ writer.closeTag("")
+ // Provenance
+ writer.openTag("")
+ writer.writeLine(
+ ""
+ )
+ writer.closeTag("")
+ writer.closeTag("")
+ }
+
+ private fun writeTextBody(writer: XMLWriter, taggerName: String) {
+ writer.openTag("")
+ writer.openTag("")
+ for ((index, term) in this.result.terms.withIndex()) {
+ // Single W
+ writeSingleW(writer, index, term, taggerName)
+ }
+ writer.closeTag("
")
+ writer.closeTag("")
+ writer.closeTag("")
+ }
+
+ private fun writeSingleW(
+ writer: XMLWriter, index: Int, term: Term,
+ taggerName: String,
+ ) {
+ writer.openTag("")
+ writer.writeLine("${term.targets[0].literal.escapeXML()}")
+ writer.writeLine("")
+ writer.writeLine(
+ ""
+ )
+ writer.closeTag("")
+ }
}
\ No newline at end of file
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt
index 0906d6b..4ff753b 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt
@@ -57,8 +57,8 @@ class LayerToNAFConverter (
val xterm = xmlDoc.createElement("term")
terms.appendChild(xterm)
xterm.setAttribute("id", "t$index")
- xterm.setAttribute("lemma", term.lemma)
- xterm.setAttribute("pos", term.pos)
+ xterm.setAttribute("lemma", term.lemmaOrEmpty)
+ xterm.setAttribute("pos", term.posOrEmpty)
val xspan = xmlDoc.createElement("span")
xterm.appendChild( xspan )
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt
index 118b8f3..109c26d 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt
@@ -112,8 +112,8 @@ class LayerToTEIConverter(
writer.writeRaw("${getLiteral()}")
} else {
// If it is not punctuation, safely assume it can be interpreted as
- val lemma = term.lemma?.escapeXML()
- val pos = term.pos?.escapeXML()
+ val lemma = term.lemmaOrEmpty.escapeXML()
+ val pos = term.posOrEmpty.escapeXML()
writer.writeRaw("${getLiteral()}")
}
}
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt
index 50cddb5..e4b061c 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt
@@ -234,11 +234,11 @@ open class TEITextMerger(
n
} else {
val n = document.createElement("w")
- n.setAttribute("lemma", termToAdd.lemma)
+ n.setAttribute("lemma", termToAdd.lemmaOrEmpty)
n
}
// Both and have a pos.
- wTag.setAttribute(posType(), termToAdd.pos)
+ wTag.setAttribute(posType(), termToAdd.posOrEmpty)
return wTag
}
@@ -303,9 +303,9 @@ open class TEITextMerger(
val termToAdd = layer.termForWordForm(wordFormToAdd)
// tags do not have a lemma.
if (element.tagName == "w") {
- element.setAttribute("lemma", termToAdd.lemma)
+ element.setAttribute("lemma", termToAdd.lemmaOrEmpty)
}
- element.setAttribute(posType(), termToAdd.pos)
+ element.setAttribute(posType(), termToAdd.posOrEmpty)
element.removeAttribute("type") // Update legacy formats to TEI p5
}
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt
index cb87b06..36a278a 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt
@@ -18,8 +18,8 @@ class LayerToTSVConverter(
outputStream.write("word\tlemma\tpos\n".encodeToByteArray()) // 'word' is the blacklab default
// Body
result.terms.forEach {
- // Note that this might have weird result for multi-wordform tokens
- outputStream.write("${it.literals}\t${it.lemma}\t${it.pos}\n".encodeToByteArray())
+ // Explicitly non-null.
+ outputStream.write("${it.literals}\t${it.lemmaOrEmpty}\t${it.posOrEmpty}\n".encodeToByteArray())
}
}
}
\ No newline at end of file
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt
index a268c3b..24666d0 100644
--- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt
+++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt
@@ -1,63 +1,63 @@
-package org.ivdnt.galahad.port.tsv.export
-
-import org.ivdnt.galahad.data.layer.Layer
-import org.ivdnt.galahad.port.DocumentTransformMetadata
-import org.ivdnt.galahad.port.LayerMerger
-import org.ivdnt.galahad.port.LayerTransformer
-import org.ivdnt.galahad.port.tsv.TSVFile
-import java.io.File
-import kotlin.io.path.createTempDirectory
-
-/**
- * Do not call directly. Use [TSVFile.merge] instead.
- */
-internal open class TSVLayerMerger(
- open val sourceFile: TSVFile,
- transformMetadata: DocumentTransformMetadata,
-) : LayerMerger, LayerTransformer(transformMetadata) {
- val layer = transformMetadata.layer
- val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name)
- protected open val hasHeader: Boolean = true
- /**
- * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile.
- * Read in per line, split on tabs, swap out pos & lemma and commit to new file
- */
- override fun merge(): TSVFile {
- sourceFile.parse() // parse the sourceFile if needed.
- parseByLine()
- return TSVFile(outFile)
- }
-
- protected fun parseByLine() {
- var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account.
- sourceFile.file.inputStream().bufferedReader().forEachLine { line ->
- if (termIndex == -1) {
- // Copy header to output & continue
- outFile.appendText(line + "\n")
- termIndex++
- } else {
- val columns = line.split("\t").toMutableList()
- if (columns.size >= 3) {
- // Swap out pos & lemma, keep the rest.
- replaceColumns(columns, layer, termIndex)
- outFile.appendText(columns.joinToString("\t") + "\n")
- termIndex++
- } else {
- // Output whatever was on that line. Presumably whitespace.
- outFile.appendText(line + "\n")
- }
- }
- }
- }
-
- /*
- * Replace the PoS and lemma values in their previously indexed columns.
- */
- protected open fun replaceColumns(
- columns: MutableList, layer: Layer,
- termIndex: Int,
- ) {
- columns[sourceFile.posIndex!!] = layer.terms[termIndex].pos.toString()
- columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemma.toString()
- }
+package org.ivdnt.galahad.port.tsv.export
+
+import org.ivdnt.galahad.data.layer.Layer
+import org.ivdnt.galahad.port.DocumentTransformMetadata
+import org.ivdnt.galahad.port.LayerMerger
+import org.ivdnt.galahad.port.LayerTransformer
+import org.ivdnt.galahad.port.tsv.TSVFile
+import java.io.File
+import kotlin.io.path.createTempDirectory
+
+/**
+ * Do not call directly. Use [TSVFile.merge] instead.
+ */
+internal open class TSVLayerMerger(
+ open val sourceFile: TSVFile,
+ transformMetadata: DocumentTransformMetadata,
+) : LayerMerger, LayerTransformer(transformMetadata) {
+ val layer = transformMetadata.layer
+ val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name)
+ protected open val hasHeader: Boolean = true
+ /**
+ * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile.
+ * Read in per line, split on tabs, swap out pos & lemma and commit to new file
+ */
+ override fun merge(): TSVFile {
+ sourceFile.parse() // parse the sourceFile if needed.
+ parseByLine()
+ return TSVFile(outFile)
+ }
+
+ protected fun parseByLine() {
+ var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account.
+ sourceFile.file.inputStream().bufferedReader().forEachLine { line ->
+ if (termIndex == -1) {
+ // Copy header to output & continue
+ outFile.appendText(line + "\n")
+ termIndex++
+ } else {
+ val columns = line.split("\t").toMutableList()
+ if (columns.size >= 3) {
+ // Swap out pos & lemma, keep the rest.
+ replaceColumns(columns, layer, termIndex)
+ outFile.appendText(columns.joinToString("\t") + "\n")
+ termIndex++
+ } else {
+ // Output whatever was on that line. Presumably whitespace.
+ outFile.appendText(line + "\n")
+ }
+ }
+ }
+ }
+
+ /*
+ * Replace the PoS and lemma values in their previously indexed columns.
+ */
+ protected open fun replaceColumns(
+ columns: MutableList, layer: Layer,
+ termIndex: Int,
+ ) {
+ columns[sourceFile.posIndex!!] = layer.terms[termIndex].posOrEmpty
+ columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemmaOrEmpty
+ }
}
\ No newline at end of file
diff --git a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt
index 766e5c4..d9006b1 100644
--- a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt
+++ b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt
@@ -107,7 +107,7 @@ class DocumentTest {
val result: File = doc.generateAs(formatTo, meta)
val expected: File = Resource.get("all-formats/output/from-$formatFrom-to-$formatTo.${formatTo.extension}")
val test = TestResult(expected.readText(), result.readText())
- test.ignoreDate().ignoreUUID().result()
+ test.ignoreDate().ignoreUUID().ignoreTrailingWhiteSpaces().result()
}
}
}
diff --git a/server/src/test/resources/all-formats/input/input.folia.xml b/server/src/test/resources/all-formats/input/input.folia.xml
index b5d232c..dc0cbf3 100644
--- a/server/src/test/resources/all-formats/input/input.folia.xml
+++ b/server/src/test/resources/all-formats/input/input.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,12 +116,12 @@
?
-
+
"
-
+
diff --git a/server/src/test/resources/all-formats/input/input.tsv b/server/src/test/resources/all-formats/input/input.tsv
index 10def0d..38fa3a7 100644
--- a/server/src/test/resources/all-formats/input/input.tsv
+++ b/server/src/test/resources/all-formats/input/input.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
\ No newline at end of file
+? LET
+" LET
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml
index 155bcef..95a563f 100644
--- a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml
+++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,14 +116,14 @@
?
-
+
"
-
+
-
+
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv
index d75894e..3764c09 100644
--- a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv
+++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
+? LET
+" LET
diff --git a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv
index d75894e..3764c09 100644
--- a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv
+++ b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
+? LET
+" LET
diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml
index 162375f..54f8f29 100644
--- a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml
+++ b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,14 +116,14 @@
?
-
+
"
-
+
-
+
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv
index d75894e..3764c09 100644
--- a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv
+++ b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
+? LET
+" LET
diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml
index 63ffb00..6f28cee 100644
--- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml
+++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,14 +116,14 @@
?
-
+
"
-
+
-
+
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv
index d75894e..3764c09 100644
--- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv
+++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
+? LET
+" LET
diff --git a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml
index 155bcef..95a563f 100644
--- a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml
+++ b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,14 +116,14 @@
?
-
+
"
-
+
-
+
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml
index 155bcef..95a563f 100644
--- a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml
+++ b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml
@@ -36,7 +36,7 @@
,
-
+
@@ -51,17 +51,17 @@
.
-
+
.
-
+
.
-
+
@@ -86,12 +86,12 @@
:
-
+
"
-
+
@@ -116,14 +116,14 @@
?
-
+
"
-
+
-
+
\ No newline at end of file
diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv
index d75894e..3764c09 100644
--- a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv
+++ b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv
@@ -2,21 +2,21 @@ word lemma pos
Hebban hebben VRB(finiteness=fin,tense=past)
olla olle RES(type=for)
vogala vogala NOU-C(number=sg)
-, null LET
+, LET
nestas nestatis RES(type=for)
hagunnan haguna NOU-C(number=sg)
-. null LET
-. null LET
-. null LET
+. LET
+. LET
+. LET
Hinase hinas NOU-P
ic ik PD(type=pers,position=free)
ende en CONJ(type=coor)
thu huben ADV(type=reg)
-: null LET
-" null LET
+: LET
+" LET
uuat uiteen PD(type=w-p,position=free)
unbidan unibent CONJ(type=sub)
wi wij PD(type=pers,position=free)
nu nu ADV(type=reg)
-? null LET
-" null LET
+? LET
+" LET
diff --git a/server/src/test/resources/folia/twine/merged-output.folia.xml b/server/src/test/resources/folia/twine/merged-output.folia.xml
index a95f5ff..331bd13 100644
--- a/server/src/test/resources/folia/twine/merged-output.folia.xml
+++ b/server/src/test/resources/folia/twine/merged-output.folia.xml
@@ -23,7 +23,7 @@
- To obey or not to be,
that is theonequestion that yremained.
+ To obey or not to be,
that is theonequestion that yremained.