diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt index de5826d..dfe92a2 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt @@ -1,10 +1,10 @@ package org.ivdnt.galahad.port.xml import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.Term import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.port.BLFXML import org.ivdnt.galahad.util.getXmlBuilder import org.w3c.dom.Document @@ -33,8 +33,8 @@ fun Node.tagName(): String? { } /** - * Should the text text inside this node be interpreted as source text? - * Asssumes we are already inside of a text container e.g. + * Should the text inside this node be interpreted as source text? + * Assumes we are already inside a text container e.g. */ private fun Node.isTextable(): Boolean { if( this.tagName() == "note" && this.attributes.getNamedItem("type")?.textContent == "editorial" ) { @@ -210,7 +210,7 @@ class BLFXMLParser ( - private fun addPlaintext( literal: String ) { + private fun addPlaintext(literal: String) { plainTextOutputStream.write( literal.toByteArray() ) offset += literal.length @@ -287,7 +287,11 @@ class BLFXMLParser ( val wordForm = WordForm( literal, offset, literal.length, id ?: "no-id" ) sourceLayer.wordForms.add( wordForm ) - addPlaintext(literal) + var text = literal.trim() + if (node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)) { + text = " $text" + } + addPlaintext(text) val term = Term(lem, pos, mutableListOf(wordForm)) sourceLayer.terms.add( term ) diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt index 895bc71..0c6daae 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt @@ -22,6 +22,9 @@ internal class TEIExportTest { @Test fun `Merge pie-tdn result with heavily twined tei`() { + val file = TEIFile(Resource.get("tei/twine/twine.input.xml")) + assertPlainText("tei/twine", file) + val plaintext: String = Resource.get("tei/twine/plaintext.txt").readText() val layer = LayerBuilder() .loadLayerFromTSV("tei/twine/pie-tdn.tsv", plaintext) @@ -37,6 +40,9 @@ internal class TEIExportTest { @Test fun `Merge a pie-tdn layer with a tei file that only contains plaintext`() { + val file = TEIFile(Resource.get("tei/brieven/input.tei.xml")) + assertPlainText("tei/brieven", file) + val plaintext: String = Resource.get("tei/brieven/plaintext.txt").readText() val layer = LayerBuilder() .loadLayerFromTSV("tei/brieven/pie.tsv", plaintext) diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt index bb441c7..7ef9429 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt @@ -13,7 +13,7 @@ internal class TEIImportTest { @Test fun `Multiple text elements`() { val teiFile = TEIFile(Resource.get("tei/dummies/multipletextelements.xml")) - assertEquals("text1\ntext2text3", teiFile.plainTextReader().readText().trim()) + assertEquals("text1\ntext2 text3", teiFile.plainTextReader().readText().trim()) } @Test @@ -40,6 +40,13 @@ internal class TEIImportTest { // Has no source layer assertPlainText("tei/brieven", file) } + + @Test + fun `Import TEI with w-tags without spaces in between`() { + val file = TEIFile(Resource.get("tei/nospaces/input.tei.xml")) + assertEquals("a a a", file.plainTextReader().readText().trim()) + + } } @Nested diff --git a/server/src/test/resources/tei/nospaces/input.tei.xml b/server/src/test/resources/tei/nospaces/input.tei.xml new file mode 100644 index 0000000..4ac729d --- /dev/null +++ b/server/src/test/resources/tei/nospaces/input.tei.xml @@ -0,0 +1,7 @@ + + +

+ aaa +

+
+
\ No newline at end of file