From 8fed93bae161506b3775c40ce85115dab4628e12 Mon Sep 17 00:00:00 2001 From: Vincent Prins Date: Thu, 20 Jun 2024 12:54:23 +0200 Subject: [PATCH] Fix NULL appearing in exported files as lemma/pos Now we export empty strings instead. See INL/Cobaltje#134 --- .../org/ivdnt/galahad/data/layer/Term.kt | 212 +++++++------ .../port/folia/export/FoliaTextMerger.kt | 300 +++++++++--------- .../folia/export/LayerToFoliaConverter.kt | 162 +++++----- .../port/naf/export/LayerToNAFConverter.kt | 4 +- .../port/tei/export/LayerToTEIConverter.kt | 4 +- .../galahad/port/tei/export/TEITextMerger.kt | 8 +- .../port/tsv/export/LayerToTSVConverter.kt | 4 +- .../galahad/port/tsv/export/TSVLayerMerger.kt | 124 ++++---- .../galahad/data/document/DocumentTest.kt | 2 +- .../all-formats/input/input.folia.xml | 16 +- .../resources/all-formats/input/input.tsv | 16 +- .../output/from-Conllu-to-Folia.folia.xml | 18 +- .../all-formats/output/from-Conllu-to-Tsv.tsv | 16 +- .../all-formats/output/from-Folia-to-Tsv.tsv | 16 +- .../output/from-Naf-to-Folia.folia.xml | 18 +- .../all-formats/output/from-Naf-to-Tsv.tsv | 16 +- .../output/from-TeiP5-to-Folia.folia.xml | 18 +- .../all-formats/output/from-TeiP5-to-Tsv.tsv | 16 +- .../output/from-Tsv-to-Folia.folia.xml | 18 +- .../output/from-Txt-to-Folia.folia.xml | 18 +- .../all-formats/output/from-Txt-to-Tsv.tsv | 16 +- .../folia/twine/merged-output.folia.xml | 2 +- 22 files changed, 517 insertions(+), 507 deletions(-) diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt index 2aeddd8..e786b86 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt @@ -1,102 +1,112 @@ -package org.ivdnt.galahad.data.layer - -import com.fasterxml.jackson.annotation.JsonIgnore -import com.fasterxml.jackson.annotation.JsonProperty - -/** Avoid empty strings in the CSV representation. */ -fun Term.toNonEmptyPair(): Pair { - return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA) -} - -/** - * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm]. - * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used. - * Lemma and pos can be null. - */ -data class Term( - @JsonProperty("lemma") val lemma: String?, - @JsonProperty("pos") val pos: String?, - @JsonProperty("targets") val targets: MutableList, -) { - /** Whether the lemma is not null. */ - @get:JsonIgnore - val hasLemma: Boolean = lemma != null - - /** Whether the pos is not null. */ - @get:JsonIgnore - val hasPOS: Boolean = pos != null - - @get:JsonIgnore - val posHeadGroupOrDefault - get() = posHeadGroup ?: NO_POS - - @get:JsonIgnore - val lemmaOrDefault - get() = lemma ?: NO_LEMMA - - /** Whether this term refers to multiple [WordForm]. */ - @get:JsonIgnore - val isMultiTarget = targets.size > 1 - - /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */ - @get:JsonIgnore - val posHead: String? = posToPosHead(pos) - - @get:JsonIgnore - val isMultiPos: Boolean = pos?.contains("+") ?: false - - /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */ - @get:JsonIgnore - val posHeadGroup: String? = run { - // Split on + - if (!isMultiPos) return@run posHead - val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+") - result - } - - - - /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */ - @get:JsonIgnore - val posFeatures: String? - get() { - if (pos == null) return null - val featureStart: Int = pos?.indexOf('(') ?: -1 - val featureEnd: Int = pos?.indexOf(')') ?: -1 - return if (featureStart != -1 && featureEnd != -1) { - return pos!!.slice(featureStart + 1 until featureEnd) - } else null - } - - /** Offset of the first [WordForm] in [targets].*/ - @get:JsonIgnore - val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1 - - /** String constructed from all the [WordForm] in [targets]. */ - @get:JsonIgnore - val literals: String - get() = targets.joinToString(" ") { it.literal } - - companion object { - const val NO_POS = "NO_POS" - const val NO_LEMMA = "NO_LEMMA" - val EMPTY = Term(null, null, mutableListOf()) - private fun posToPosHead(pos: String?): String? { - return if (pos == null) { - null - } else if (pos.contains('(')) { - // pos contains a non-letter non-digit character - val headEnd = pos.indexOf('(') - val head = pos.slice(0 until headEnd) - if (head.isEmpty()) { - pos // pos is non-empty and starts with a non-letter character, e.g.: _ - } else { - head - } - } else { - // pos is 0 or more letters only - pos - } - } - } +package org.ivdnt.galahad.data.layer + +import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.annotation.JsonProperty + +/** Avoid empty strings in the CSV representation. */ +fun Term.toNonEmptyPair(): Pair { + return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA) +} + +/** + * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm]. + * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used. + * Lemma and pos can be null. + */ +data class Term( + @JsonProperty("lemma") val lemma: String?, + @JsonProperty("pos") val pos: String?, + @JsonProperty("targets") val targets: MutableList, +) { + /** Whether the lemma is not null. */ + @get:JsonIgnore + val hasLemma: Boolean = lemma != null + + /** Whether the pos is not null. */ + @get:JsonIgnore + val hasPOS: Boolean = pos != null + + @get:JsonIgnore + val posHeadGroupOrDefault + get() = posHeadGroup ?: NO_POS + + @get:JsonIgnore + val lemmaOrDefault + get() = lemma ?: NO_LEMMA + + @get:JsonIgnore + val lemmaOrEmpty + get() = lemma ?: "" + + @get:JsonIgnore + val posOrEmpty + get() = pos ?: "" + + /** Whether this term refers to multiple [WordForm]. */ + @get:JsonIgnore + val isMultiTarget = targets.size > 1 + + /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */ + @get:JsonIgnore + val posHead: String? = posToPosHead(pos) + + @get:JsonIgnore + val isMultiPos: Boolean = pos?.contains("+") ?: false + + /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */ + @get:JsonIgnore + val posHeadGroup: String? = run { + // Split on + + if (!isMultiPos) return@run posHead + val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+") + result + } + + @get:JsonIgnore + val posHeadGroupOrEmpty + get() = posHeadGroup ?: "" + + /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */ + @get:JsonIgnore + val posFeatures: String? + get() { + if (pos == null) return null + val featureStart: Int = pos.indexOf('(') ?: -1 + val featureEnd: Int = pos.indexOf(')') ?: -1 + return if (featureStart != -1 && featureEnd != -1) { + return pos.slice(featureStart + 1 until featureEnd) + } else null + } + + /** Offset of the first [WordForm] in [targets].*/ + @get:JsonIgnore + val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1 + + /** String constructed from all the [WordForm] in [targets]. */ + @get:JsonIgnore + val literals: String + get() = targets.joinToString(" ") { it.literal } + + companion object { + const val NO_POS = "NO_POS" + const val NO_LEMMA = "NO_LEMMA" + val EMPTY = Term(null, null, mutableListOf()) + private fun posToPosHead(pos: String?): String? { + return if (pos == null) { + null + } else if (pos.contains('(')) { + // pos contains a non-letter non-digit character + val headEnd = pos.indexOf('(') + val head = pos.slice(0 until headEnd) + if (head.isEmpty()) { + pos // pos is non-empty and starts with a non-letter character, e.g.: _ + } else { + head + } + } else { + // pos is 0 or more letters only + pos + } + } + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt index 7e8da15..f826434 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt @@ -1,151 +1,151 @@ -package org.ivdnt.galahad.port.folia.export - -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.port.tei.export.TEITextMerger -import org.ivdnt.galahad.util.insertFirst -import org.ivdnt.galahad.port.xml.reparseText -import org.ivdnt.galahad.port.xml.tagName -import org.ivdnt.galahad.util.childOrNull -import org.ivdnt.galahad.util.insertAfter -import org.w3c.dom.Document -import org.w3c.dom.Element -import org.w3c.dom.Node -import org.w3c.dom.NodeList - -fun NodeList.deepcopy(): ArrayList { - val copy = ArrayList() - for (i in 0 until this.length) { - copy.add(this.item(i)) - } - return copy -} - -class FoliaTextMerger( - node: Node, offset: Int, document: Document, wordFormIter: ListIterator, - deleteList: ArrayList, layer: Layer -) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) { - - override fun merge() { - if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) { - // We are going to add nodes when exporting, but we don't want to iterate over them - // So better store references to the current nodes and use them - val parent = node - val oldChildNodes = node.childNodes.deepcopy() - var endsWithSpace = true - for (child in oldChildNodes) { - node = child - val text = reparseText(child.textContent) - - if (node.nodeType == Node.TEXT_NODE) { - child.textContent = text - } - // TODO reparseText overwrites embedded t-styles - - // never set the offset of more than one space. - if (endsWithSpace && text.startsWith(" ")) { - offset -= 1 - } - merge() - - // Keep track of the ending space - if (text.isNotEmpty() && text.endsWith(" ")) - endsWithSpace = true - else if (text.isNotEmpty()) - endsWithSpace = false - - if (child.nodeType == Node.TEXT_NODE) - offset += text.length - } - // Remove parent and transfer children. - if(markForDeletion(parent)) { - var last = parent - for (i in parent.childNodes.length - 1 downTo 0) { - val c = parent.childNodes.item(i) - parent.parentNode.insertBefore(c, last) - last = c - } - } - - } else { - super.merge() - } - } - - override fun createWTag(wf: WordForm): Element { - val wTag = node.parentNode.cloneNode(false) - return wTag as Element - } - - override fun addWordForm(previousEndOffset: Int, wf: WordForm) { - super.addWordForm(previousEndOffset, wf) - // For Folia, newWTag is actually a or tag. - var tTag: Node = newWTag!! - // Make sure tTag points to a . For e.g. a , grab the first parent. - var parent = - if (tTag.parentNode.tagName() == "t") tTag.parentNode - else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself. - while (tTag.tagName() != "t") { - val clone = parent.cloneNode(false) - tTag.parentNode.replaceChild(clone,tTag) - clone.insertFirst(tTag) - // Ready for next iter. - parent = parent.parentNode - tTag = clone - } - // Create the which will contain the - val wTag = document.createElement("w") - val term = layer.termForWordForm(wf) - wTag.addTerm(term) - // Contain it. - tTag.parentNode.replaceChild(wTag,tTag) - wTag.insertFirst(tTag) - } - - override fun handleElementNode() { - val element = node as Element - if (element.tagName != "w") return - - val wordFormToAdd = getWordFormForOffsetOrNull() ?: return - val term = layer.termForWordForm(wordFormToAdd) - element.addTerm(term) - } - - override fun moveWTagUp(wTag: Element): Element { - wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode) - val clone = wTag.parentNode.cloneNode(false) - wTag.parentNode.replaceChild(clone, wTag) - clone.appendChild(wTag) - newWTag = clone as Element - return clone - } - - private fun Element.addTerm(term: Term) { - this.addTermFeature("lemma", term.lemma.toString()) - this.addTermFeature("pos", term.pos.toString(), term.posHeadGroup) - } - - private fun Element.addTermFeature(name: String, value: String, head: String? = null) { - /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags - * uncomment this. For now, multiple annotation layers are okay in the export. - // Find the child elements of [name] and delete them - val children = this.childNodes.deepcopy() - for (child in children) { - if (child.tagName() == name) { - this.removeChild(child) - } - }*/ - - // Create a new child element of [name] - val child = this.ownerDocument.createElement(name) - child.setAttribute("class", value) - // For PoS - if (head != null) child.setAttribute("head", head) - // Folia metadata. - child.setAttribute("processor", layer.name) - child.setAttribute("set", layer.name) - this.appendChild(child) - } +package org.ivdnt.galahad.port.folia.export + +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.data.layer.WordForm +import org.ivdnt.galahad.port.tei.export.TEITextMerger +import org.ivdnt.galahad.util.insertFirst +import org.ivdnt.galahad.port.xml.reparseText +import org.ivdnt.galahad.port.xml.tagName +import org.ivdnt.galahad.util.childOrNull +import org.ivdnt.galahad.util.insertAfter +import org.w3c.dom.Document +import org.w3c.dom.Element +import org.w3c.dom.Node +import org.w3c.dom.NodeList + +fun NodeList.deepcopy(): ArrayList { + val copy = ArrayList() + for (i in 0 until this.length) { + copy.add(this.item(i)) + } + return copy +} + +class FoliaTextMerger( + node: Node, offset: Int, document: Document, wordFormIter: ListIterator, + deleteList: ArrayList, layer: Layer +) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) { + + override fun merge() { + if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) { + // We are going to add nodes when exporting, but we don't want to iterate over them + // So better store references to the current nodes and use them + val parent = node + val oldChildNodes = node.childNodes.deepcopy() + var endsWithSpace = true + for (child in oldChildNodes) { + node = child + val text = reparseText(child.textContent) + + if (node.nodeType == Node.TEXT_NODE) { + child.textContent = text + } + // TODO reparseText overwrites embedded t-styles + + // never set the offset of more than one space. + if (endsWithSpace && text.startsWith(" ")) { + offset -= 1 + } + merge() + + // Keep track of the ending space + if (text.isNotEmpty() && text.endsWith(" ")) + endsWithSpace = true + else if (text.isNotEmpty()) + endsWithSpace = false + + if (child.nodeType == Node.TEXT_NODE) + offset += text.length + } + // Remove parent and transfer children. + if(markForDeletion(parent)) { + var last = parent + for (i in parent.childNodes.length - 1 downTo 0) { + val c = parent.childNodes.item(i) + parent.parentNode.insertBefore(c, last) + last = c + } + } + + } else { + super.merge() + } + } + + override fun createWTag(wf: WordForm): Element { + val wTag = node.parentNode.cloneNode(false) + return wTag as Element + } + + override fun addWordForm(previousEndOffset: Int, wf: WordForm) { + super.addWordForm(previousEndOffset, wf) + // For Folia, newWTag is actually a or tag. + var tTag: Node = newWTag!! + // Make sure tTag points to a . For e.g. a , grab the first parent. + var parent = + if (tTag.parentNode.tagName() == "t") tTag.parentNode + else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself. + while (tTag.tagName() != "t") { + val clone = parent.cloneNode(false) + tTag.parentNode.replaceChild(clone,tTag) + clone.insertFirst(tTag) + // Ready for next iter. + parent = parent.parentNode + tTag = clone + } + // Create the which will contain the + val wTag = document.createElement("w") + val term = layer.termForWordForm(wf) + wTag.addTerm(term) + // Contain it. + tTag.parentNode.replaceChild(wTag,tTag) + wTag.insertFirst(tTag) + } + + override fun handleElementNode() { + val element = node as Element + if (element.tagName != "w") return + + val wordFormToAdd = getWordFormForOffsetOrNull() ?: return + val term = layer.termForWordForm(wordFormToAdd) + element.addTerm(term) + } + + override fun moveWTagUp(wTag: Element): Element { + wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode) + val clone = wTag.parentNode.cloneNode(false) + wTag.parentNode.replaceChild(clone, wTag) + clone.appendChild(wTag) + newWTag = clone as Element + return clone + } + + private fun Element.addTerm(term: Term) { + this.addTermFeature("lemma", term.lemmaOrEmpty) + this.addTermFeature("pos", term.posOrEmpty, term.posHeadGroupOrEmpty) + } + + private fun Element.addTermFeature(name: String, value: String, head: String? = null) { + /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags + * uncomment this. For now, multiple annotation layers are okay in the export. + // Find the child elements of [name] and delete them + val children = this.childNodes.deepcopy() + for (child in children) { + if (child.tagName() == name) { + this.removeChild(child) + } + }*/ + + // Create a new child element of [name] + val child = this.ownerDocument.createElement(name) + child.setAttribute("class", value) + // For PoS + if (head != null) child.setAttribute("head", head) + // Folia metadata. + child.setAttribute("processor", layer.name) + child.setAttribute("set", layer.name) + this.appendChild(child) + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt index 5eecf59..777de28 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt @@ -1,82 +1,82 @@ -package org.ivdnt.galahad.port.folia.export - -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.port.DocumentTransformMetadata -import org.ivdnt.galahad.port.LayerConverter -import org.ivdnt.galahad.port.LayerTransformer -import org.ivdnt.galahad.util.XMLWriter -import org.ivdnt.galahad.util.escapeXML -import org.ivdnt.galahad.util.toValidXmlId -import java.io.OutputStream - -class LayerToFoliaConverter ( - transformMetadata: DocumentTransformMetadata, -) : LayerConverter, LayerTransformer( transformMetadata ) { - - override val format: DocumentFormat - get() = DocumentFormat.Folia - - val id: String - get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId() - - override fun convert(outputStream: OutputStream) { - val taggerName = tagger.id - val writer = XMLWriter(outputStream) - // XML Header - writer.writeLineRaw("") - writer.openTag("") - // Metadata - writeMetadata(writer, taggerName) - // Textbody - writeTextBody(writer, taggerName) - } - - private fun writeMetadata(writer: XMLWriter, taggerName: String) { - writer.openTag("") - // Annotations - writer.openTag("") - writer.writeLine("") - writer.writeLine("") - writer.writeLine("") - writer.writeLine("") - for (annotation in setOf("lemma", "pos")) { - writer.openTag("<$annotation-annotation set=\"${taggerName}\">") - writer.writeLine("") - writer.closeTag("") - } - writer.closeTag("") - // Provenance - writer.openTag("") - writer.writeLine( - "" - ) - writer.closeTag("") - writer.closeTag("") - } - - private fun writeTextBody(writer: XMLWriter, taggerName: String) { - writer.openTag("") - writer.openTag("

") - for ((index, term) in this.result.terms.withIndex()) { - // Single W - writeSingleW(writer, index, term, taggerName) - } - writer.closeTag("

") - writer.closeTag("
") - writer.closeTag("
") - } - - private fun writeSingleW( - writer: XMLWriter, index: Int, term: Term, - taggerName: String, - ) { - writer.openTag("") - writer.writeLine("${term.targets[0].literal.escapeXML()}") - writer.writeLine("") - writer.writeLine( - "" - ) - writer.closeTag("") - } +package org.ivdnt.galahad.port.folia.export + +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.LayerConverter +import org.ivdnt.galahad.port.LayerTransformer +import org.ivdnt.galahad.util.XMLWriter +import org.ivdnt.galahad.util.escapeXML +import org.ivdnt.galahad.util.toValidXmlId +import java.io.OutputStream + +class LayerToFoliaConverter ( + transformMetadata: DocumentTransformMetadata, +) : LayerConverter, LayerTransformer( transformMetadata ) { + + override val format: DocumentFormat + get() = DocumentFormat.Folia + + val id: String + get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId() + + override fun convert(outputStream: OutputStream) { + val taggerName = tagger.id + val writer = XMLWriter(outputStream) + // XML Header + writer.writeLineRaw("") + writer.openTag("") + // Metadata + writeMetadata(writer, taggerName) + // Textbody + writeTextBody(writer, taggerName) + } + + private fun writeMetadata(writer: XMLWriter, taggerName: String) { + writer.openTag("") + // Annotations + writer.openTag("") + writer.writeLine("") + writer.writeLine("") + writer.writeLine("") + writer.writeLine("") + for (annotation in setOf("lemma", "pos")) { + writer.openTag("<$annotation-annotation set=\"${taggerName}\">") + writer.writeLine("") + writer.closeTag("") + } + writer.closeTag("") + // Provenance + writer.openTag("") + writer.writeLine( + "" + ) + writer.closeTag("") + writer.closeTag("") + } + + private fun writeTextBody(writer: XMLWriter, taggerName: String) { + writer.openTag("") + writer.openTag("

") + for ((index, term) in this.result.terms.withIndex()) { + // Single W + writeSingleW(writer, index, term, taggerName) + } + writer.closeTag("

") + writer.closeTag("
") + writer.closeTag("
") + } + + private fun writeSingleW( + writer: XMLWriter, index: Int, term: Term, + taggerName: String, + ) { + writer.openTag("") + writer.writeLine("${term.targets[0].literal.escapeXML()}") + writer.writeLine("") + writer.writeLine( + "" + ) + writer.closeTag("") + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt index 0906d6b..4ff753b 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt @@ -57,8 +57,8 @@ class LayerToNAFConverter ( val xterm = xmlDoc.createElement("term") terms.appendChild(xterm) xterm.setAttribute("id", "t$index") - xterm.setAttribute("lemma", term.lemma) - xterm.setAttribute("pos", term.pos) + xterm.setAttribute("lemma", term.lemmaOrEmpty) + xterm.setAttribute("pos", term.posOrEmpty) val xspan = xmlDoc.createElement("span") xterm.appendChild( xspan ) diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt index 118b8f3..109c26d 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt @@ -112,8 +112,8 @@ class LayerToTEIConverter( writer.writeRaw("${getLiteral()}") } else { // If it is not punctuation, safely assume it can be interpreted as - val lemma = term.lemma?.escapeXML() - val pos = term.pos?.escapeXML() + val lemma = term.lemmaOrEmpty.escapeXML() + val pos = term.posOrEmpty.escapeXML() writer.writeRaw("${getLiteral()}") } } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt index 50cddb5..e4b061c 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt @@ -234,11 +234,11 @@ open class TEITextMerger( n } else { val n = document.createElement("w") - n.setAttribute("lemma", termToAdd.lemma) + n.setAttribute("lemma", termToAdd.lemmaOrEmpty) n } // Both and have a pos. - wTag.setAttribute(posType(), termToAdd.pos) + wTag.setAttribute(posType(), termToAdd.posOrEmpty) return wTag } @@ -303,9 +303,9 @@ open class TEITextMerger( val termToAdd = layer.termForWordForm(wordFormToAdd) // tags do not have a lemma. if (element.tagName == "w") { - element.setAttribute("lemma", termToAdd.lemma) + element.setAttribute("lemma", termToAdd.lemmaOrEmpty) } - element.setAttribute(posType(), termToAdd.pos) + element.setAttribute(posType(), termToAdd.posOrEmpty) element.removeAttribute("type") // Update legacy formats to TEI p5 } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt index cb87b06..36a278a 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt @@ -18,8 +18,8 @@ class LayerToTSVConverter( outputStream.write("word\tlemma\tpos\n".encodeToByteArray()) // 'word' is the blacklab default // Body result.terms.forEach { - // Note that this might have weird result for multi-wordform tokens - outputStream.write("${it.literals}\t${it.lemma}\t${it.pos}\n".encodeToByteArray()) + // Explicitly non-null. + outputStream.write("${it.literals}\t${it.lemmaOrEmpty}\t${it.posOrEmpty}\n".encodeToByteArray()) } } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt index a268c3b..24666d0 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt @@ -1,63 +1,63 @@ -package org.ivdnt.galahad.port.tsv.export - -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.port.DocumentTransformMetadata -import org.ivdnt.galahad.port.LayerMerger -import org.ivdnt.galahad.port.LayerTransformer -import org.ivdnt.galahad.port.tsv.TSVFile -import java.io.File -import kotlin.io.path.createTempDirectory - -/** - * Do not call directly. Use [TSVFile.merge] instead. - */ -internal open class TSVLayerMerger( - open val sourceFile: TSVFile, - transformMetadata: DocumentTransformMetadata, -) : LayerMerger, LayerTransformer(transformMetadata) { - val layer = transformMetadata.layer - val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name) - protected open val hasHeader: Boolean = true - /** - * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile. - * Read in per line, split on tabs, swap out pos & lemma and commit to new file - */ - override fun merge(): TSVFile { - sourceFile.parse() // parse the sourceFile if needed. - parseByLine() - return TSVFile(outFile) - } - - protected fun parseByLine() { - var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account. - sourceFile.file.inputStream().bufferedReader().forEachLine { line -> - if (termIndex == -1) { - // Copy header to output & continue - outFile.appendText(line + "\n") - termIndex++ - } else { - val columns = line.split("\t").toMutableList() - if (columns.size >= 3) { - // Swap out pos & lemma, keep the rest. - replaceColumns(columns, layer, termIndex) - outFile.appendText(columns.joinToString("\t") + "\n") - termIndex++ - } else { - // Output whatever was on that line. Presumably whitespace. - outFile.appendText(line + "\n") - } - } - } - } - - /* - * Replace the PoS and lemma values in their previously indexed columns. - */ - protected open fun replaceColumns( - columns: MutableList, layer: Layer, - termIndex: Int, - ) { - columns[sourceFile.posIndex!!] = layer.terms[termIndex].pos.toString() - columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemma.toString() - } +package org.ivdnt.galahad.port.tsv.export + +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.LayerMerger +import org.ivdnt.galahad.port.LayerTransformer +import org.ivdnt.galahad.port.tsv.TSVFile +import java.io.File +import kotlin.io.path.createTempDirectory + +/** + * Do not call directly. Use [TSVFile.merge] instead. + */ +internal open class TSVLayerMerger( + open val sourceFile: TSVFile, + transformMetadata: DocumentTransformMetadata, +) : LayerMerger, LayerTransformer(transformMetadata) { + val layer = transformMetadata.layer + val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name) + protected open val hasHeader: Boolean = true + /** + * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile. + * Read in per line, split on tabs, swap out pos & lemma and commit to new file + */ + override fun merge(): TSVFile { + sourceFile.parse() // parse the sourceFile if needed. + parseByLine() + return TSVFile(outFile) + } + + protected fun parseByLine() { + var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account. + sourceFile.file.inputStream().bufferedReader().forEachLine { line -> + if (termIndex == -1) { + // Copy header to output & continue + outFile.appendText(line + "\n") + termIndex++ + } else { + val columns = line.split("\t").toMutableList() + if (columns.size >= 3) { + // Swap out pos & lemma, keep the rest. + replaceColumns(columns, layer, termIndex) + outFile.appendText(columns.joinToString("\t") + "\n") + termIndex++ + } else { + // Output whatever was on that line. Presumably whitespace. + outFile.appendText(line + "\n") + } + } + } + } + + /* + * Replace the PoS and lemma values in their previously indexed columns. + */ + protected open fun replaceColumns( + columns: MutableList, layer: Layer, + termIndex: Int, + ) { + columns[sourceFile.posIndex!!] = layer.terms[termIndex].posOrEmpty + columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemmaOrEmpty + } } \ No newline at end of file diff --git a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt index 766e5c4..d9006b1 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt @@ -107,7 +107,7 @@ class DocumentTest { val result: File = doc.generateAs(formatTo, meta) val expected: File = Resource.get("all-formats/output/from-$formatFrom-to-$formatTo.${formatTo.extension}") val test = TestResult(expected.readText(), result.readText()) - test.ignoreDate().ignoreUUID().result() + test.ignoreDate().ignoreUUID().ignoreTrailingWhiteSpaces().result() } } } diff --git a/server/src/test/resources/all-formats/input/input.folia.xml b/server/src/test/resources/all-formats/input/input.folia.xml index b5d232c..dc0cbf3 100644 --- a/server/src/test/resources/all-formats/input/input.folia.xml +++ b/server/src/test/resources/all-formats/input/input.folia.xml @@ -36,7 +36,7 @@ , - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,12 +116,12 @@ ? - + " - +

diff --git a/server/src/test/resources/all-formats/input/input.tsv b/server/src/test/resources/all-formats/input/input.tsv index 10def0d..38fa3a7 100644 --- a/server/src/test/resources/all-formats/input/input.tsv +++ b/server/src/test/resources/all-formats/input/input.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET \ No newline at end of file +? LET +" LET \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml index 162375f..54f8f29 100644 --- a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml index 63ffb00..6f28cee 100644 --- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/folia/twine/merged-output.folia.xml b/server/src/test/resources/folia/twine/merged-output.folia.xml index a95f5ff..331bd13 100644 --- a/server/src/test/resources/folia/twine/merged-output.folia.xml +++ b/server/src/test/resources/folia/twine/merged-output.folia.xml @@ -23,7 +23,7 @@

- To obey or not to be,
that is theonequestion that yremained. + To obey or not to be,
that is theonequestion that yremained.