Skip to content

Commit

Permalink
Also match <w>-tags when merging if a single PUNCT differs
Browse files Browse the repository at this point in the history
And write the correct offset to the sourcelayer now that we add spaces between <w>-tags
  • Loading branch information
PrinsINT committed Jun 24, 2024
1 parent d4fcaff commit d776e7c
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 163 deletions.
4 changes: 2 additions & 2 deletions server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ class Report : Logging {
// println("Spotted incompatible tokenization for \"${wf.literal}\" at offset ${wf.offset}")
// Now we do nothing, but it is good to centrally register this
logger().warn( "REPORT: Spotted incompatible tokenization for wordforms \n" +
" - $wf1 \n" +
" - $wf2"
" - ${wf1.literal} \n" +
" - ${wf2.literal}"
)
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,151 +1,153 @@
package org.ivdnt.galahad.evaluation.comparison

import com.fasterxml.jackson.annotation.JsonIgnore
import org.ivdnt.galahad.data.layer.Layer
import org.ivdnt.galahad.data.layer.Term

/**
 * Advance this iterator and return the [Term] it yields, or `null` when no element is left.
 */
fun ListIterator<Term>.nextOrNull(): Term? =
    if (hasNext()) next() else null

// Some hardcoded punctuation.
// Every entry is a one-character string; truncatedPcMatch compares the last character of a literal against this list.
val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"")

/**
* Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma).
* When pos filters are provided, only match [TermComparison] of that pos.
*/
class LayerComparison(
private val hypothesisLayer: Layer,
private val referenceLayer: Layer,
private val layerFilter: LayerFilter? = null,
) {
@JsonIgnore
val matches: MutableList<TermComparison> = ArrayList()

@JsonIgnore
val referenceTermsWithoutMatches: MutableList<Term> = ArrayList()

@JsonIgnore
val hypothesisTermsWithoutMatches: MutableList<Term> = ArrayList()
@JsonIgnore
private val hypoIter: ListIterator<Term> = iterForTermsInLayer(hypothesisLayer)
@JsonIgnore
private val refIter: ListIterator<Term> = iterForTermsInLayer(referenceLayer)
@JsonIgnore
private var currentHypoTerm: Term? = Term.EMPTY
@JsonIgnore
private var currentRefTerm: Term? = Term.EMPTY

init {
    // A term-by-term comparison is only possible when both layers contain at least one term.
    val bothLayersHaveTerms = refIter.hasNext() && hypoIter.hasNext()
    if (!bothLayersHaveTerms) {
        // Nothing can be matched, so every term of either layer ends up in the "without matches" lists.
        // NOTE(review): unlike hypoNoMatch/refNoMatch, this path does not consult layerFilter — confirm this is intended.
        hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms)
        referenceTermsWithoutMatches.addAll(referenceLayer.terms)
    } else {
        compare()
    }
}

/**
 * Walk the terms of both layers in lockstep, comparing the current hypothesis term with the current
 * reference term until one side runs out; everything left over is registered as unmatched.
 */
private fun compare() {
    // Load the first term of each layer.
    nextHypo()
    nextRef()
    // Every compareTerm call advances at least one of the two current terms.
    while (true) {
        val hypo = currentHypoTerm ?: break
        val ref = currentRefTerm ?: break
        compareTerm(TermComparison(hypoTerm = hypo, refTerm = ref))
    }
    // A still-pending current term was already taken from its iterator, so it is handled separately.
    currentHypoTerm?.let { hypoNoMatch(it) }
    currentRefTerm?.let { refNoMatch(it) }
    // Whatever the iterators still hold has no counterpart either.
    hypoIter.forEachRemaining { hypoNoMatch(it) }
    refIter.forEachRemaining { refNoMatch(it) }
}

/**
 * Decide what to do with one hypothesis/reference pair and advance the iteration accordingly.
 */
private fun compareTerm(comp: TermComparison) {
    when {
        // Both terms overlap fully: a regular match.
        comp.fullOverlap -> fullMatch(comp)
        // Unequal first offsets: the term that starts earlier is dropped as unmatched.
        comp.hypoTerm.firstOffset < comp.refTerm.firstOffset -> hypoNoMatch()
        comp.hypoTerm.firstOffset > comp.refTerm.firstOffset -> refNoMatch()
        // Equal first offset without full overlap: still a match when the terms only differ
        // by one trailing punctuation character.
        symmetricTruncatedPcMatch(comp) -> fullMatch(comp)
        // Otherwise neither term has a counterpart.
        else -> {
            hypoNoMatch()
            refNoMatch()
        }
    }
}

/**
 * Record [termComparison] as a match unless the layer filter rejects it, then advance both layers.
 */
private fun fullMatch(termComparison: TermComparison) {
    // Without a layerFilter the safe call yields null, which does not count as a rejection.
    val rejected = layerFilter?.filter(termComparison) == false
    if (!rejected) {
        matches.add(termComparison)
    }
    nextHypo()
    nextRef()
}

/** Register the current hypothesis term as unmatched and move on to the next hypothesis term. */
private fun hypoNoMatch() {
    val unmatched: Term = currentHypoTerm!!
    hypoNoMatch(unmatched)
    nextHypo()
}

/** Add [t] to the unmatched hypothesis terms, unless the hypothesis term filter rejects it. */
private fun hypoNoMatch(t: Term) {
    // When layerFilter (or its hypoTermFilter) is absent the chain yields null; null and true both count as accepted.
    val accepted = layerFilter?.hypoTermFilter?.filter(t) != false
    if (accepted) {
        hypothesisTermsWithoutMatches.add(t)
    }
}

/** Register the current reference term as unmatched and move on to the next reference term. */
private fun refNoMatch() {
    val unmatched: Term = currentRefTerm!!
    refNoMatch(unmatched)
    nextRef()
}

/** Add [t] to the unmatched reference terms, unless the reference term filter rejects it. */
private fun refNoMatch(t: Term) {
    // When layerFilter (or its refTermFilter) is absent the chain yields null; null and true both count as accepted.
    val accepted = layerFilter?.refTermFilter?.filter(t) != false
    if (accepted) {
        referenceTermsWithoutMatches.add(t)
    }
}

/** Make the next hypothesis term current, or `null` once the hypothesis iterator is exhausted. */
private fun nextHypo() {
    currentHypoTerm = if (hypoIter.hasNext()) hypoIter.next() else null
}

/** Make the next reference term current, or `null` once the reference iterator is exhausted. */
private fun nextRef() {
    currentRefTerm = if (refIter.hasNext()) refIter.next() else null
}

/**
 * Build an iterator over the terms of [layer], ordered by their first offset.
 * The ordering matters because terms can only be a match if their first offset is the same.
 */
private fun iterForTermsInLayer(layer: Layer): ListIterator<Term> {
    val termsByOffset = layer.terms.sortedBy { term -> term.firstOffset }
    return termsByOffset.listIterator()
}

/**
 * Check in both directions whether one term's literals equal the other's literals
 * plus a single trailing punctuation character.
 */
private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean {
    val hypoText: String = comp.hypoTerm.literals
    val refText: String = comp.refTerm.literals
    if (truncatedPcMatch(hypoText, refText)) {
        return true
    }
    return truncatedPcMatch(refText, hypoText)
}

/**
 * Check whether [aStr] equals [bStr] followed by exactly one trailing character from [PUNCTUATION]
 * (e.g. `"word."` versus `"word"`).
 *
 * An empty [aStr] has no last character to inspect and simply yields `false`
 * (calling `last()` on it would throw a NoSuchElementException).
 *
 * @param aStr the candidate that may carry one extra trailing punctuation character.
 * @param bStr the string [aStr] must equal once that character is removed.
 * @return `true` when removing one trailing punctuation character from [aStr] gives [bStr].
 */
private fun truncatedPcMatch(aStr: String, bStr: String): Boolean {
    val lastChar = aStr.lastOrNull() ?: return false
    return lastChar.toString() in PUNCTUATION && aStr.dropLast(1) == bStr
}
package org.ivdnt.galahad.evaluation.comparison

import com.fasterxml.jackson.annotation.JsonIgnore
import org.ivdnt.galahad.data.layer.Layer
import org.ivdnt.galahad.data.layer.Term

/**
 * Advance this iterator and return the [Term] it yields, or `null` when no element is left.
 */
fun ListIterator<Term>.nextOrNull(): Term? =
    if (hasNext()) next() else null

// Some hardcoded punctuation.
// Every entry is a one-character string; truncatedPcMatch compares the last character of a literal against this list.
val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"")

/**
 * Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma).
 * When pos filters are provided, only match [TermComparison] of that pos.
 *
 * The comparison runs once, in the initializer. Afterwards the outcome can be read from [matches],
 * [hypothesisTermsWithoutMatches] and [referenceTermsWithoutMatches].
 *
 * @param hypothesisLayer the layer supplying the hypothesis terms.
 * @param referenceLayer the layer supplying the reference terms.
 * @param layerFilter optional filter; when `null`, every match and every unmatched term is kept.
 */
class LayerComparison(
    private val hypothesisLayer: Layer,
    private val referenceLayer: Layer,
    private val layerFilter: LayerFilter? = null,
) {
    /** Term pairs that were matched and not rejected by [layerFilter]. */
    @JsonIgnore
    val matches: MutableList<TermComparison> = ArrayList()

    /** Reference terms for which no hypothesis term was matched. */
    @JsonIgnore
    val referenceTermsWithoutMatches: MutableList<Term> = ArrayList()

    /** Hypothesis terms for which no reference term was matched. */
    @JsonIgnore
    val hypothesisTermsWithoutMatches: MutableList<Term> = ArrayList()

    @JsonIgnore
    private val hypoIter: ListIterator<Term> = iterForTermsInLayer(hypothesisLayer)

    @JsonIgnore
    private val refIter: ListIterator<Term> = iterForTermsInLayer(referenceLayer)

    @JsonIgnore
    private var currentHypoTerm: Term? = Term.EMPTY

    @JsonIgnore
    private var currentRefTerm: Term? = Term.EMPTY

    init {
        if (refIter.hasNext() && hypoIter.hasNext()) {
            compare()
        } else {
            // At least one layer has no terms, so nothing can be matched.
            // NOTE(review): unlike hypoNoMatch/refNoMatch, this path does not consult layerFilter — confirm this is intended.
            hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms)
            referenceTermsWithoutMatches.addAll(referenceLayer.terms)
        }
    }

    /** Iterate through the terms of both layers simultaneously and compare them. */
    private fun compare() {
        // First terms
        nextHypo()
        nextRef()
        // While both layers still have a current term. Every compareTerm call advances at least one of them.
        while (true) {
            val hypo = currentHypoTerm ?: break
            val ref = currentRefTerm ?: break
            compareTerm(TermComparison(hypoTerm = hypo, refTerm = ref))
        }
        // One of the two could be non-null. It was already taken from its iterator, so handle it separately.
        currentHypoTerm?.let { hypoNoMatch(it) }
        currentRefTerm?.let { refNoMatch(it) }
        // The remaining terms have no matches
        hypoIter.forEachRemaining { hypoNoMatch(it) }
        refIter.forEachRemaining { refNoMatch(it) }
    }

    /** Decide what to do with one hypothesis/reference pair and advance the iteration accordingly. */
    private fun compareTerm(comp: TermComparison) {
        when {
            comp.fullOverlap -> fullMatch(comp)
            // Unequal first offset: the term that starts earlier has no match.
            comp.hypoTerm.firstOffset < comp.refTerm.firstOffset -> hypoNoMatch()
            comp.hypoTerm.firstOffset > comp.refTerm.firstOffset -> refNoMatch()
            // Equal first offset but no full overlap.
            // Try to truncate either term to see if the last char is punctuation; if so, still match it.
            symmetricTruncatedPcMatch(comp) -> fullMatch(comp)
            else -> {
                hypoNoMatch()
                refNoMatch()
            }
        }
    }

    /** Record [termComparison] as a match unless [layerFilter] rejects it, then advance both layers. */
    private fun fullMatch(termComparison: TermComparison) {
        // Note how layerFilter can be null, and both null and true != false.
        if (layerFilter?.filter(termComparison) != false) {
            matches.add(termComparison)
        }
        nextHypo()
        nextRef()
    }

    /** Register the current hypothesis term as unmatched and advance the hypothesis layer. */
    private fun hypoNoMatch() {
        hypoNoMatch(checkNotNull(currentHypoTerm) { "hypoNoMatch() called without a current hypothesis term" })
        nextHypo()
    }

    /** Add [t] to [hypothesisTermsWithoutMatches] unless the hypothesis term filter rejects it. */
    private fun hypoNoMatch(t: Term) {
        // Note how layerFilter can be null, and both null and true != false.
        if (layerFilter?.hypoTermFilter?.filter(t) != false) {
            hypothesisTermsWithoutMatches.add(t)
        }
    }

    /** Register the current reference term as unmatched and advance the reference layer. */
    private fun refNoMatch() {
        refNoMatch(checkNotNull(currentRefTerm) { "refNoMatch() called without a current reference term" })
        nextRef()
    }

    /** Add [t] to [referenceTermsWithoutMatches] unless the reference term filter rejects it. */
    private fun refNoMatch(t: Term) {
        if (layerFilter?.refTermFilter?.filter(t) != false) {
            referenceTermsWithoutMatches.add(t)
        }
    }

    /** Make the next hypothesis term current, or `null` when there is none left. */
    private fun nextHypo() {
        currentHypoTerm = hypoIter.nextOrNull()
    }

    /** Make the next reference term current, or `null` when there is none left. */
    private fun nextRef() {
        currentRefTerm = refIter.nextOrNull()
    }

    /** Iterate through the terms of the layer sorted on offset. */
    private fun iterForTermsInLayer(layer: Layer): ListIterator<Term> {
        return layer.terms
            // Terms can only be a match if their first offset is the same
            .sortedBy { it.firstOffset }
            .listIterator()
    }

    /** Check in both directions whether the terms only differ by one trailing punctuation character. */
    private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean {
        val aStr: String = comp.hypoTerm.literals
        val bStr: String = comp.refTerm.literals
        return truncatedPcMatch(aStr, bStr) || truncatedPcMatch(bStr, aStr)
    }

    companion object {
        /**
         * Check whether [aStr] equals [bStr] followed by exactly one trailing character from [PUNCTUATION]
         * (e.g. `"word."` versus `"word"`).
         *
         * An empty [aStr] has no last character to inspect and simply yields `false`
         * (calling `last()` on it would throw a NoSuchElementException).
         *
         * @param aStr the candidate that may carry one extra trailing punctuation character.
         * @param bStr the string [aStr] must equal once that character is removed.
         * @return `true` when removing one trailing punctuation character from [aStr] gives [bStr].
         */
        fun truncatedPcMatch(aStr: String, bStr: String): Boolean {
            val lastChar = aStr.lastOrNull() ?: return false
            return lastChar.toString() in PUNCTUATION && aStr.dropLast(1) == bStr
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import org.ivdnt.galahad.app.report.Report
import org.ivdnt.galahad.data.document.DocumentFormat
import org.ivdnt.galahad.data.layer.Layer
import org.ivdnt.galahad.data.layer.WordForm
import org.ivdnt.galahad.evaluation.comparison.LayerComparison.Companion.truncatedPcMatch
import org.ivdnt.galahad.port.folia.export.deepcopy
import org.ivdnt.galahad.port.xml.getPlainTextContent
import org.ivdnt.galahad.util.*
Expand Down Expand Up @@ -267,8 +268,9 @@ open class TEITextMerger(
if (wordFormToAdd != null) {
// remove all whitespace within a <w>-tag (although this rarely occurs anyway).
val sourceLiteral = node.getPlainTextContent().replace(Regex("""\s"""), "")
if (wordFormToAdd.literal == sourceLiteral) {
// This is a simple case since the tokenization matches
if (wordFormToAdd.literal == sourceLiteral // This is a simple case since the tokenization matches
|| truncatedPcMatch(sourceLiteral, wordFormToAdd.literal) // Also match with single punctuation (e.g. word. -> word)
) {
mergeWTag(wordFormToAdd, element)
} else {
// Tokenization mismatch, report it
Expand Down
26 changes: 17 additions & 9 deletions server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -277,24 +277,32 @@ class BLFXMLParser (
}

private fun handleWordOrPunctNode( node: Node ) {
// custom node handling
nodeHandler(node, offset, xmlDocument)
// Handle cases like <w>a</w><w>b</w> -> "a b" (add space in plaintext)
val needsSpacing = node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)
val spaceOffset = if (needsSpacing) 1 else 0
val trueWordOffset = offset + spaceOffset

// Handle merging
nodeHandler(node, trueWordOffset, xmlDocument)

// Extraction
val literal = literalExtractor(node).trim() // wordPathExpression.evaluate( node )
val lem = lemmaExtractor(node) // lemPathExpression.evaluate( node )
val pos = posExtractor(node) // posPathExpression.evaluate( node )
val id = idExtractor(node)

val wordForm = WordForm( literal, offset, literal.length, id ?: "no-id" )
sourceLayer.wordForms.add( wordForm )
// Add the word to the source layer
val wordForm = WordForm(literal, trueWordOffset, literal.length, id ?: "no-id" )
val term = Term(lem, pos, mutableListOf(wordForm))
sourceLayer.wordForms.add(wordForm)
sourceLayer.terms.add(term)

// Add the word to the plaintext
var text = literal.trim()
if (node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail)) {
text = " $text"
if (needsSpacing) {
text = " $text"
}
addPlaintext(text)

val term = Term(lem, pos, mutableListOf(wordForm))
sourceLayer.terms.add( term )
}

fun xmlToString(pretty: Boolean): String {
Expand Down

0 comments on commit d776e7c

Please sign in to comment.