Skip to content

Commit

Permalink
Add spaces between <w> tags if not present
Browse files Browse the repository at this point in the history
  • Loading branch information
PrinsINT committed Jun 21, 2024
1 parent 18ae80e commit 3cff7bf
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package org.ivdnt.galahad.port.xml

import org.ivdnt.galahad.data.document.DocumentFormat
import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME
import org.ivdnt.galahad.data.layer.Layer
import org.ivdnt.galahad.data.layer.Term
import org.ivdnt.galahad.data.layer.WordForm
import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME
import org.ivdnt.galahad.port.BLFXML
import org.ivdnt.galahad.util.getXmlBuilder
import org.w3c.dom.Document
Expand Down Expand Up @@ -33,8 +33,8 @@ fun Node.tagName(): String? {
}

/**
* Should the text text inside this node be interpreted as source text?
* Asssumes we are already inside of a text container e.g. <text>
* Should the text inside this node be interpreted as source text?
* Assumes we are already inside a text container e.g. <text>
*/
private fun Node.isTextable(): Boolean {
if( this.tagName() == "note" && this.attributes.getNamedItem("type")?.textContent == "editorial" ) {
Expand Down Expand Up @@ -210,7 +210,7 @@ class BLFXMLParser (



private fun addPlaintext( literal: String ) {
private fun addPlaintext(literal: String) {
plainTextOutputStream.write( literal.toByteArray() )
offset += literal.length

Expand Down Expand Up @@ -287,7 +287,11 @@ class BLFXMLParser (

val wordForm = WordForm( literal, offset, literal.length, id ?: "no-id" )
sourceLayer.wordForms.add( wordForm )
addPlaintext(literal)
var text = literal.trim()
if (node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)) {
text = " $text"
}
addPlaintext(text)

val term = Term(lem, pos, mutableListOf(wordForm))
sourceLayer.terms.add( term )
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ internal class TEIExportTest {

@Test
fun `Merge pie-tdn result with heavily twined tei`() {
val file = TEIFile(Resource.get("tei/twine/twine.input.xml"))
assertPlainText("tei/twine", file)

val plaintext: String = Resource.get("tei/twine/plaintext.txt").readText()
val layer = LayerBuilder()
.loadLayerFromTSV("tei/twine/pie-tdn.tsv", plaintext)
Expand All @@ -37,6 +40,9 @@ internal class TEIExportTest {

@Test
fun `Merge a pie-tdn layer with a tei file that only contains plaintext`() {
val file = TEIFile(Resource.get("tei/brieven/input.tei.xml"))
assertPlainText("tei/brieven", file)

val plaintext: String = Resource.get("tei/brieven/plaintext.txt").readText()
val layer = LayerBuilder()
.loadLayerFromTSV("tei/brieven/pie.tsv", plaintext)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ internal class TEIImportTest {
@Test
fun `Multiple text elements`() {
// Reads the plain text extracted from the multi-<text> dummy TEI resource and compares it, trimmed, to the expected string.
val teiFile = TEIFile(Resource.get("tei/dummies/multipletextelements.xml"))
// NOTE(review): this rendering shows two contradictory expectations back to back; presumably this first one
// (no space between "text2" and "text3") is the line the commit removed — confirm against the real file.
assertEquals("text1\ntext2text3", teiFile.plainTextReader().readText().trim())
// Presumably the replacement expectation: "text2" and "text3" are now separated by a single space.
assertEquals("text1\ntext2 text3", teiFile.plainTextReader().readText().trim())
}

@Test
Expand All @@ -40,6 +40,13 @@ internal class TEIImportTest {
// Has no source layer
assertPlainText("tei/brieven", file)
}

@Test
fun `Import TEI with w-tags without spaces in between`() {
    // The fixture holds three <w>a</w> elements with no whitespace between them;
    // the extracted plain text must still separate the words with single spaces.
    val plaintext = TEIFile(Resource.get("tei/nospaces/input.tei.xml"))
        .plainTextReader()
        .readText()
        .trim()
    assertEquals("a a a", plaintext)
}
}

@Nested
Expand Down
7 changes: 7 additions & 0 deletions server/src/test/resources/tei/nospaces/input.tei.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<TEI>
<text>
<p>
<w>a</w><w>a</w><w>a</w>
</p>
</text>
</TEI>

0 comments on commit 3cff7bf

Please sign in to comment.