Also match <w>-tags when merging if a single PUNCT differs

And write the correct offset to the sourcelayer now that we add spaces between <w>-tags
INL · Jun 24, 2024 · d776e7c · d776e7c
1 parent d4fcaff
commit d776e7c
Show file tree

Hide file tree

Showing 4 changed files with 175 additions and 163 deletions.
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt b/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt
@@ -18,8 +18,8 @@ class Report : Logging {
             // println("Spotted incompatible tokenization for \"${wf.literal}\" at offset ${wf.offset}")
             // Now we do nothing, but it is good to centrally register this
             logger().warn( "REPORT: Spotted incompatible tokenization for wordforms \n" +
-                    "    - $wf1 \n" +
-                    "    - $wf2"
+                    "    - ${wf1.literal} \n" +
+                    "    - ${wf2.literal}"
             )
         }
 

diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt
@@ -1,151 +1,153 @@
-package org.ivdnt.galahad.evaluation.comparison
-
-import com.fasterxml.jackson.annotation.JsonIgnore
-import org.ivdnt.galahad.data.layer.Layer
-import org.ivdnt.galahad.data.layer.Term
-
-fun ListIterator<Term>.nextOrNull(): Term? {
-    val iter = iterator()
-    return if (iter.hasNext()) iter.next() else null
-}
-
-// Some hardcoded punctuation
-val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"")
-
-/**
- * Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma).
- * When pos filters are provide, only match [TermComparison] of that pos.
- */
-class LayerComparison(
-    private val hypothesisLayer: Layer,
-    private val referenceLayer: Layer,
-    private val layerFilter: LayerFilter? = null,
-) {
-    @JsonIgnore
-    val matches: MutableList<TermComparison> = ArrayList()
-
-    @JsonIgnore
-    val referenceTermsWithoutMatches: MutableList<Term> = ArrayList()
-
-    @JsonIgnore
-    val hypothesisTermsWithoutMatches: MutableList<Term> = ArrayList()
-    @JsonIgnore
-    private val hypoIter: ListIterator<Term> = iterForTermsInLayer(hypothesisLayer)
-    @JsonIgnore
-    private val refIter: ListIterator<Term> = iterForTermsInLayer(referenceLayer)
-    @JsonIgnore
-    private var currentHypoTerm: Term? = Term.EMPTY
-    @JsonIgnore
-    private var currentRefTerm: Term? = Term.EMPTY
-
-    init {
-        if (refIter.hasNext() && hypoIter.hasNext()) {
-            compare()
-        } else {
-            hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms)
-            referenceTermsWithoutMatches.addAll(referenceLayer.terms)
-        }
-    }
-
-    /** Iterate through the terms of both layers simultaneously and compare them. */
-    private fun compare() {
-        // First terms
-        nextHypo()
-        nextRef()
-        // While there are next terms
-        while (currentHypoTerm != null && currentRefTerm != null) {
-            val comp = TermComparison(hypoTerm = currentHypoTerm!!, refTerm = currentRefTerm!!)
-            compareTerm(comp)
-        }
-        // One of the two could be non-null. These are not included in the remaining refIter.
-        currentHypoTerm?.let(::hypoNoMatch)
-        currentRefTerm?.let(::refNoMatch)
-        // The remaining terms have no matches
-        hypoIter.forEachRemaining(::hypoNoMatch)
-        refIter.forEachRemaining(::refNoMatch)
-    }
-
-    private fun compareTerm(comp: TermComparison) {
-        // Act on the comparison
-        if (comp.fullOverlap) {
-            fullMatch(comp)
-        } else {
-            // Unequal first offset
-            if (comp.hypoTerm.firstOffset < comp.refTerm.firstOffset) {
-                hypoNoMatch()
-            } else if (comp.hypoTerm.firstOffset > comp.refTerm.firstOffset) {
-                refNoMatch()
-            }
-            // Equal first offset but no match.
-            // Try to truncate either terms to see if the last char is punctuation.
-            else if (symmetricTruncatedPcMatch(comp)) {
-                // If so, still match it.
-                fullMatch(comp)
-            } else {
-                hypoNoMatch()
-                refNoMatch()
-            }
-        }
-    }
-
-    private fun fullMatch(termComparison: TermComparison) {
-        if (layerFilter?.filter(termComparison) != false) {
-            matches.add(termComparison)
-        }
-        nextHypo()
-        nextRef()
-    }
-
-    private fun hypoNoMatch() {
-        hypoNoMatch(currentHypoTerm!!)
-        nextHypo()
-    }
-
-    private fun hypoNoMatch(t: Term) {
-        // Note how layerFilter can be null, and both null and true != false.
-        if (layerFilter?.hypoTermFilter?.filter(t) != false) {
-            hypothesisTermsWithoutMatches.add(t)
-        }
-    }
-
-    private fun refNoMatch() {
-        refNoMatch(currentRefTerm!!)
-        nextRef()
-    }
-
-    private fun refNoMatch(t: Term) {
-        if (layerFilter?.refTermFilter?.filter(t) != false) {
-            referenceTermsWithoutMatches.add(t)
-        }
-    }
-
-    private fun nextHypo() {
-        currentHypoTerm = hypoIter.nextOrNull()
-    }
-
-    private fun nextRef() {
-        currentRefTerm = refIter.nextOrNull()
-    }
-
-    /** Iterate through the terms of the layer sorted on offset. */
-    private fun iterForTermsInLayer(layer: Layer): ListIterator<Term> {
-        return layer.terms
-            // Terms can only be a match if their first offset is the same
-            .sortedBy { it.firstOffset }.listIterator()
-    }
-
-    private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean {
-        val aStr: String = comp.hypoTerm.literals
-        val bStr: String = comp.refTerm.literals
-        return truncatedPcMatch(aStr, bStr) || truncatedPcMatch(bStr, aStr)
-    }
-
-    private fun truncatedPcMatch(aStr: String, bStr: String): Boolean {
-        if (PUNCTUATION.contains(aStr.last().toString())) {
-            if (aStr.slice(0 until aStr.lastIndex) == bStr) {
-                return true
-            }
-        }
-        return false
-    }
+package org.ivdnt.galahad.evaluation.comparison
+
+import com.fasterxml.jackson.annotation.JsonIgnore
+import org.ivdnt.galahad.data.layer.Layer
+import org.ivdnt.galahad.data.layer.Term
+
+/** Returns the next [Term] and advances this iterator, or `null` once it is exhausted. */
+fun ListIterator<Term>.nextOrNull(): Term? {
+    return if (hasNext()) next() else null
+}
+
+// Some hardcoded punctuation
+val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"")
+
+/**
+ * Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma).
+ * When pos filters are provided, only match [TermComparison] of that pos.
+ */
+class LayerComparison(
+    private val hypothesisLayer: Layer,
+    private val referenceLayer: Layer,
+    private val layerFilter: LayerFilter? = null,
+) {
+    @JsonIgnore
+    val matches: MutableList<TermComparison> = ArrayList()
+
+    @JsonIgnore
+    val referenceTermsWithoutMatches: MutableList<Term> = ArrayList()
+
+    @JsonIgnore
+    val hypothesisTermsWithoutMatches: MutableList<Term> = ArrayList()
+    @JsonIgnore
+    private val hypoIter: ListIterator<Term> = iterForTermsInLayer(hypothesisLayer)
+    @JsonIgnore
+    private val refIter: ListIterator<Term> = iterForTermsInLayer(referenceLayer)
+    @JsonIgnore
+    private var currentHypoTerm: Term? = Term.EMPTY
+    @JsonIgnore
+    private var currentRefTerm: Term? = Term.EMPTY
+
+    init {
+        if (refIter.hasNext() && hypoIter.hasNext()) {
+            compare()
+        } else {
+            hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms)
+            referenceTermsWithoutMatches.addAll(referenceLayer.terms)
+        }
+    }
+
+    /** Iterate through the terms of both layers simultaneously and compare them. */
+    private fun compare() {
+        // First terms
+        nextHypo()
+        nextRef()
+        // While there are next terms
+        while (currentHypoTerm != null && currentRefTerm != null) {
+            val comp = TermComparison(hypoTerm = currentHypoTerm!!, refTerm = currentRefTerm!!)
+            compareTerm(comp)
+        }
+        // One of the two could be non-null. These are not included in the remaining refIter.
+        currentHypoTerm?.let(::hypoNoMatch)
+        currentRefTerm?.let(::refNoMatch)
+        // The remaining terms have no matches
+        hypoIter.forEachRemaining(::hypoNoMatch)
+        refIter.forEachRemaining(::refNoMatch)
+    }
+
+    private fun compareTerm(comp: TermComparison) {
+        // Act on the comparison
+        if (comp.fullOverlap) {
+            fullMatch(comp)
+        } else {
+            // Unequal first offset
+            if (comp.hypoTerm.firstOffset < comp.refTerm.firstOffset) {
+                hypoNoMatch()
+            } else if (comp.hypoTerm.firstOffset > comp.refTerm.firstOffset) {
+                refNoMatch()
+            }
+            // Equal first offset but no match.
+            // Try to truncate either terms to see if the last char is punctuation.
+            else if (symmetricTruncatedPcMatch(comp)) {
+                // If so, still match it.
+                fullMatch(comp)
+            } else {
+                hypoNoMatch()
+                refNoMatch()
+            }
+        }
+    }
+
+    private fun fullMatch(termComparison: TermComparison) {
+        if (layerFilter?.filter(termComparison) != false) {
+            matches.add(termComparison)
+        }
+        nextHypo()
+        nextRef()
+    }
+
+    private fun hypoNoMatch() {
+        hypoNoMatch(currentHypoTerm!!)
+        nextHypo()
+    }
+
+    private fun hypoNoMatch(t: Term) {
+        // Note how layerFilter can be null, and both null and true != false.
+        if (layerFilter?.hypoTermFilter?.filter(t) != false) {
+            hypothesisTermsWithoutMatches.add(t)
+        }
+    }
+
+    private fun refNoMatch() {
+        refNoMatch(currentRefTerm!!)
+        nextRef()
+    }
+
+    private fun refNoMatch(t: Term) {
+        if (layerFilter?.refTermFilter?.filter(t) != false) {
+            referenceTermsWithoutMatches.add(t)
+        }
+    }
+
+    private fun nextHypo() {
+        currentHypoTerm = hypoIter.nextOrNull()
+    }
+
+    private fun nextRef() {
+        currentRefTerm = refIter.nextOrNull()
+    }
+
+    /** Iterate through the terms of the layer sorted on offset. */
+    private fun iterForTermsInLayer(layer: Layer): ListIterator<Term> {
+        return layer.terms
+            // Terms can only be a match if their first offset is the same
+            .sortedBy { it.firstOffset }.listIterator()
+    }
+
+    private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean {
+        val aStr: String = comp.hypoTerm.literals
+        val bStr: String = comp.refTerm.literals
+        return truncatedPcMatch(aStr, bStr) || truncatedPcMatch(bStr, aStr)
+    }
+
+    companion object {
+        /**
+         * Whether [aStr] is [bStr] followed by exactly one trailing [PUNCTUATION] character (e.g. "word." vs "word").
+         * An empty [aStr] has no last character and therefore never matches, instead of throwing.
+         */
+        fun truncatedPcMatch(aStr: String, bStr: String): Boolean {
+            val lastChar = aStr.lastOrNull() ?: return false
+            return lastChar.toString() in PUNCTUATION && aStr.dropLast(1) == bStr
+        }
+    }
 }
diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt
@@ -4,6 +4,7 @@ import org.ivdnt.galahad.app.report.Report
 import org.ivdnt.galahad.data.document.DocumentFormat
 import org.ivdnt.galahad.data.layer.Layer
 import org.ivdnt.galahad.data.layer.WordForm
+import org.ivdnt.galahad.evaluation.comparison.LayerComparison.Companion.truncatedPcMatch
 import org.ivdnt.galahad.port.folia.export.deepcopy
 import org.ivdnt.galahad.port.xml.getPlainTextContent
 import org.ivdnt.galahad.util.*
@@ -267,8 +268,9 @@ open class TEITextMerger(
             if (wordFormToAdd != null) {
                 // remove all whitespace within a <w>-tag (although this rarely occurs anyway).
                 val sourceLiteral = node.getPlainTextContent().replace(Regex("""\s"""), "")
-                if (wordFormToAdd.literal == sourceLiteral) {
-                    // This is a simple case since the tokenization matches
+                if (wordFormToAdd.literal == sourceLiteral // This is a simple case since the tokenization matches
+                    || truncatedPcMatch(sourceLiteral, wordFormToAdd.literal) // Also match with single punctuation (e.g. word. -> word)
+                ) {
                     mergeWTag(wordFormToAdd, element)
                 } else {
                     // Tokenization mismatch, report it

diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt
@@ -277,24 +277,32 @@ class BLFXMLParser (
     }
 
     private fun handleWordOrPunctNode( node: Node ) {
-        // custom node handling
-        nodeHandler(node, offset, xmlDocument)
+        // Handle cases like <w>a</w><w>b</w> -> "a b" (add space in plaintext)
+        val needsSpacing = node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)
+        val spaceOffset = if (needsSpacing) 1 else 0
+        val trueWordOffset = offset + spaceOffset
 
+        // Handle merging
+        nodeHandler(node, trueWordOffset, xmlDocument)
+
+        // Extraction
         val literal = literalExtractor(node).trim() // wordPathExpression.evaluate( node )
         val lem = lemmaExtractor(node) // lemPathExpression.evaluate( node )
         val pos = posExtractor(node) // posPathExpression.evaluate( node )
         val id = idExtractor(node)
 
-        val wordForm = WordForm( literal, offset, literal.length, id ?: "no-id" )
-        sourceLayer.wordForms.add( wordForm )
+        // Add the word to the source layer
+        val wordForm = WordForm(literal, trueWordOffset, literal.length, id ?: "no-id" )
+        val term = Term(lem, pos, mutableListOf(wordForm))
+        sourceLayer.wordForms.add(wordForm)
+        sourceLayer.terms.add(term)
+
+        // Add the word to the plaintext
         var text = literal.trim()
-        if (node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)) {
-           text = " $text"
+        if (needsSpacing) {
+            text = " $text"
         }
         addPlaintext(text)
-
-        val term = Term(lem, pos, mutableListOf(wordForm))
-        sourceLayer.terms.add( term )
     }
 
     fun xmlToString(pretty: Boolean): String {