diff --git a/client/src/components/GTabs.vue b/client/src/components/GTabs.vue index 3e15896..2e2d840 100644 --- a/client/src/components/GTabs.vue +++ b/client/src/components/GTabs.vue @@ -1,325 +1,325 @@ - - - - - - \ No newline at end of file diff --git a/client/src/components/input/UploadDocuments.vue b/client/src/components/input/UploadDocuments.vue index 5ff21ea..6c8d9a7 100644 --- a/client/src/components/input/UploadDocuments.vue +++ b/client/src/components/input/UploadDocuments.vue @@ -1,107 +1,151 @@ - - - - - \ No newline at end of file diff --git a/client/src/components/modals/GModal.vue b/client/src/components/modals/GModal.vue index c8645ff..70a6465 100644 --- a/client/src/components/modals/GModal.vue +++ b/client/src/components/modals/GModal.vue @@ -1,108 +1,109 @@ - - - - - \ No newline at end of file diff --git a/client/src/components/tables/GTable.vue b/client/src/components/tables/GTable.vue index 2378169..3b70fb5 100644 --- a/client/src/components/tables/GTable.vue +++ b/client/src/components/tables/GTable.vue @@ -375,7 +375,7 @@ table.loading .loading-symbol { table .loading-symbol { transition: opacity 2s ease, visibility 2s ease; opacity: 0; - z-index: 3; + z-index: 1; position: absolute; top: 50%; left: 50%; diff --git a/client/src/views/HomeView.vue b/client/src/views/HomeView.vue index d400797..24af793 100644 --- a/client/src/views/HomeView.vue +++ b/client/src/views/HomeView.vue @@ -1,107 +1,111 @@ - - - - - + + + + + diff --git a/client/src/views/help/subviews/GeneralView.vue b/client/src/views/help/subviews/GeneralView.vue index 0c566c9..f3aeeac 100644 --- a/client/src/views/help/subviews/GeneralView.vue +++ b/client/src/views/help/subviews/GeneralView.vue @@ -132,8 +132,9 @@

For tool developers, the docker-based application architecture ensures easy contribution of tools to the platform. The application and taggers are hosted by the INT and accessible with any CLARIN-account. There is - also the option to self-host an instance using the publicly available docker images from the INT docker hub or - the open source code available on GitHub. + also the option to self-host an instance using the publicly available docker images from the + INT Docker Hub or the open source code + available on GitHub.

diff --git a/codemeta-harvest.json b/codemeta-harvest.json index f9b0f51..7da42b6 100644 --- a/codemeta-harvest.json +++ b/codemeta-harvest.json @@ -1,389 +1,388 @@ -{ - "@context": "https://w3id.org/codemeta/3.0", - "@type": "SoftwareSourceCode", - "dateCreated": "2024-05-31", - "datePublished": "2024-05-31", - "applicationCategory": [ - "https://w3id.org/nwo-research-fields#Linguistics", - "https://w3id.org/nwo-research-fields#ComputationalLinguisticsAndPhilology", - "https://w3id.org/nwo-research-fields#ArtificialIntelligenceExpertSystems", - "https://w3id.org/nwo-research-fields#SoftwareForHumanities", - "https://w3id.org/nwo-research-fields#TextualAndLinguisticCorpora", - "https://vocabs.dariah.eu/tadirah/enriching", - "https://vocabs.dariah.eu/tadirah/posTagging", - "https://vocabs.dariah.eu/tadirah/tagging", - "https://vocabs.dariah.eu/tadirah/annotating", - "https://vocabs.dariah.eu/tadirah/merging", - "https://vocabs.dariah.eu/tadirah/converting", - "https://vocabs.dariah.eu/tadirah/lemmatizing", - "https://vocabs.dariah.eu/tadirah/machineLearning", - "https://vocabs.dariah.eu/tadirah/comparing", - "https://vocabs.dariah.eu/tadirah/analyzing" - ], - "author": [ - { - "@type": "Person", - "@id": "http://orcid.org/0009-0006-9941-9582", - "email": "vincent.prins@ivdnt.org", - "familyName": "Prins", - "givenName": "Vincent" - }, - { - "@type": "Person", - "familyName": "Brouwer", - "givenName": "Tim" - } - ], - "contributor": [ - "http://orcid.org/0009-0006-9941-9582", - { - "@type": "Person", - "familyName": "Brouwer", - "givenName": "Tim" - } - ], - "maintainer": "http://orcid.org/0009-0006-9941-9582", - "codeRepository": "git+https://github.com/INL/galahad.git", - "description": "GaLAHaD (Generating Linguistic Annotations for Historical Dutch) allows linguists to compare taggers, tag their own corpora, evaluate the results and export their tagged documents.", - "downloadUrl": "https://github.com/INL/galahad", - "license": "http://spdx.org/licenses/Apache-2.0", - "name": "GaLAHaD", - "identifier": "galahad", - "funding": [ - { - "@type": "Grant", - "name": "CLARIAH-PLUS (NWO grant 184.034.023)", - "funder": { - "@type": "Organization", - "name": "NWO", - "url": "https://www.nwo.nl" - } - } - ], - "operatingSystem": "Linux", - "programmingLanguage": [ - "Kotlin", - "Javascript", - "Typescript" - ], - "runtimePlatform": [ - "JVM", - "Node" - ], - "codemeta:contIntegration": { - "id": "https://github.com/INL/galahad/actions" - }, - "continuousIntegration": "https://github.com/INL/galahad/actions", - "developmentStatus": [ - "https://www.repostatus.org/#active", - "https://w3id.org/research-technology-readiness-levels#Level6LatePrototype" - ], - "producer": { - "@type": "Organization", - "name": "Instituut voor de Nederlandse taal", - "@id": "https://www.ivdnt.org", - "url": "https://www.ivdnt.org" - }, - "issueTracker": "https://github.com/INL/galahad/issues", - "readme": "https://github.com/INL/Galahad/blob/release/readme.md", - "releaseNotes": "https://github.com/INL/Galahad/releases", - "softwareHelp": [ - { - "@id": "https://portal.clarin.ivdnt.org/galahad/help", - "@type": "WebSite", - "name": "GaLAHad Help", - "url": "https://portal.clarin.ivdnt.org/galahad/help" - } - ], - "targetProduct": [ - { - "@type": "WebApplication", - "name": "GaLAHaD", - "provider": "https://www.ivdnt.org", - "url": "https://portal.clarin.ivdnt.org/galahad" - }, - { - "@type": "WebAPI", - "name": "GaLAHaD API", - "provider": "https://www.ivdnt.org", - "endpointUrl": { - "@type": "EntryPoint", - "url": "https://portal.clarin.ivdnt.org/galahad/api", - "contentType": "application/json" - }, - "endpointDescription": { - "@type": "CreativeWork", - "encodingFormat": "application/json", - "url": "https://portal.clarin.ivdnt.org/galahad/api/v3/api-docs" - }, - "documentation": "https://portal.clarin.ivdnt.org/galahad/api/swagger-ui/index.html", - "version": "1.0.0", - "consumesData": [ - { - "@type": "TextDigitalDocument", - "encodingFormat": "text/plain" - }, - { - "@type": "TextDigitalDocument", - "encodingFormat": "application/folia+xml" - }, - { - "@type": "TextDigitalDocument", - "encodingFormat": "application/tei+xml" - }, - { - "@type": "TextDigitalDocument", - "encodingFormat": "text/tab-separated-values" - }, - { - "@type": "TextDigitalDocument", - "encodingFormat": "https://github.com/newsreader/NAF" - }, - { - "@type": "TextDigitalDocument", - "encodingFormat": "https://universaldependencies.org/format.html" - } - ], - "producesData": [ - { - "@type": "CreativeWork", - "encodingFormat": "application/zip" - } - ] - }, - { - "@type": "ServerApplication", - "name": "GaLAHaD proxy" - }, - { - "@type": "SoftwareImage", - "name": "GaLAHaD client Docker image" - }, - { - "@type": "SoftwareImage", - "name": "GaLAHaD server Docker image" - }, - { - "@type": "SoftwareImage", - "name": "GaLAHaD proxy Docker image" - } - ], - "softwareRequirements": [ - { - "@type": "SoftwareApplication", - "identifier": "@rollup/plugin-yaml", - "name": "@rollup/plugin-yaml", - "version": "^4.0.1" - }, - { - "@type": "SoftwareApplication", - "identifier": "@types/jest", - "name": "@types/jest", - "version": "^28.1.6" - }, - { - "@type": "SoftwareApplication", - "identifier": "@types/js-yaml", - "name": "@types/js-yaml", - "version": "^4.0.5" - }, - { - "@type": "SoftwareApplication", - "identifier": "@types/uuid", - "name": "@types/uuid", - "version": "^8.3.4" - }, - { - "@type": "SoftwareApplication", - "identifier": "@typescript-eslint/eslint-plugin", - "name": "@typescript-eslint/eslint-plugin", - "version": "^5.30.7" - }, - { - "@type": "SoftwareApplication", - "identifier": "@typescript-eslint/parser", - "name": "@typescript-eslint/parser", - "version": "^5.30.7" - }, - { - "@type": "SoftwareApplication", - "identifier": "@vitejs/plugin-vue", - "name": "@vitejs/plugin-vue", - "version": "^5.0.4" - }, - { - "@type": "SoftwareApplication", - "identifier": "@vue/eslint-config-typescript", - "name": "@vue/eslint-config-typescript", - "version": "^11.0.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "axios", - "name": "axios", - "version": "^1.6.2" - }, - { - "@type": "SoftwareApplication", - "identifier": "buffer", - "name": "buffer", - "version": "^6.0.3" - }, - { - "@type": "SoftwareApplication", - "identifier": "content-disposition", - "name": "content-disposition", - "version": "^0.5.4" - }, - { - "@type": "SoftwareApplication", - "identifier": "eslint", - "name": "eslint", - "version": "^8.20.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "eslint-plugin-vue", - "name": "eslint-plugin-vue", - "version": "^9.2.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "js-yaml", - "name": "js-yaml", - "version": "^4.1.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "json-loader", - "name": "json-loader", - "version": "^0.5.7" - }, - { - "@type": "SoftwareApplication", - "identifier": "mutationobserver-shim", - "name": "mutationobserver-shim", - "version": "^0.3.7" - }, - { - "@type": "SoftwareApplication", - "identifier": "node-sass", - "name": "node-sass", - "version": "^9.0.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "pinia", - "name": "pinia", - "version": "^2.0.16" - }, - { - "@type": "SoftwareApplication", - "identifier": "safe-buffer", - "name": "safe-buffer", - "version": "^5.2.1" - }, - { - "@type": "SoftwareApplication", - "identifier": "sass", - "name": "sass", - "version": "^1.53.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "typescript", - "name": "typescript", - "version": "^4.7.4" - }, - { - "@type": "SoftwareApplication", - "identifier": "uuid", - "name": "uuid", - "version": "^8.3.2" - }, - { - "@type": "SoftwareApplication", - "identifier": "vite", - "name": "vite", - "version": "^5.1.7" - }, - { - "@type": "SoftwareApplication", - "identifier": "vue", - "name": "vue", - "version": "^3.2.37" - }, - { - "@type": "SoftwareApplication", - "identifier": "vue-router", - "name": "vue-router", - "version": "^4.1.2" - }, - { - "@type": "SoftwareApplication", - "identifier": "vue-slider-component", - "name": "vue-slider-component", - "version": "^4.1.0-beta.1" - }, - { - "@type": "SoftwareApplication", - "identifier": "com.beust.klaxon", - "name": "klaxon", - "version": "5.6" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.jetbrains.kotlin.kotlin-reflect", - "name": "kotlin-reflect", - "version": "1.9.22" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.jetbrains.kotlin.kotlin-stdlib", - "name": "kotlin-stdlib", - "version": "1.9.22" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.jetbrains.kotlinx.kotlinx-coroutines-core-jvm", - "name": "kotlinx-coroutines-core-jvm", - "version": "1.7.3" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.jetbrains.kotlinx.kotlinx-serialization-json-jvm", - "name": "kotlinx-serialization-json-jvm", - "version": "1.6.3" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.apache.logging.log4j.log4j-api-kotlin", - "name": "log4j-api-kotlin", - "version": "1.2.0" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.yaml.snakeyaml", - "name": "snakeyaml", - "version": "2.2" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.springframework.boot.spring-boot-devtools", - "name": "spring-boot-devtools", - "version": "3.2.3" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.springframework.boot.spring-boot-starter-web", - "name": "spring-boot-starter-web", - "version": "3.2.4" - }, - { - "@type": "SoftwareApplication", - "identifier": "org.springdoc.springdoc-openapi-starter-webmvc-ui", - "name": "springdoc-openapi-starter-webmvc-ui", - "version": "2.5.0" - } - ] +{ + "@context": "https://w3id.org/codemeta/3.0", + "@type": "SoftwareSourceCode", + "dateCreated": "2024-05-31", + "datePublished": "2024-05-31", + "applicationCategory": [ + "https://w3id.org/nwo-research-fields#Linguistics", + "https://w3id.org/nwo-research-fields#ComputationalLinguisticsAndPhilology", + "https://w3id.org/nwo-research-fields#ArtificialIntelligenceExpertSystems", + "https://w3id.org/nwo-research-fields#SoftwareForHumanities", + "https://w3id.org/nwo-research-fields#TextualAndLinguisticCorpora", + "https://vocabs.dariah.eu/tadirah/enriching", + "https://vocabs.dariah.eu/tadirah/posTagging", + "https://vocabs.dariah.eu/tadirah/tagging", + "https://vocabs.dariah.eu/tadirah/annotating", + "https://vocabs.dariah.eu/tadirah/merging", + "https://vocabs.dariah.eu/tadirah/converting", + "https://vocabs.dariah.eu/tadirah/lemmatizing", + "https://vocabs.dariah.eu/tadirah/machineLearning", + "https://vocabs.dariah.eu/tadirah/comparing", + "https://vocabs.dariah.eu/tadirah/analyzing" + ], + "author": [ + { + "@type": "Person", + "@id": "http://orcid.org/0009-0006-9941-9582", + "email": "vincent.prins@ivdnt.org", + "familyName": "Prins", + "givenName": "Vincent" + }, + { + "@type": "Person", + "familyName": "Brouwer", + "givenName": "Tim" + } + ], + "contributor": [ + "http://orcid.org/0009-0006-9941-9582", + { + "@type": "Person", + "familyName": "Brouwer", + "givenName": "Tim" + } + ], + "maintainer": "http://orcid.org/0009-0006-9941-9582", + "codeRepository": "git+https://github.com/INL/galahad.git", + "description": "GaLAHaD (Generating Linguistic Annotations for Historical Dutch) allows linguists to compare taggers, tag their own corpora, evaluate the results and export their tagged documents.", + "downloadUrl": "https://github.com/INL/galahad", + "license": "http://spdx.org/licenses/Apache-2.0", + "name": "GaLAHaD", + "identifier": "galahad", + "funding": [ + { + "@type": "Grant", + "name": "CLARIAH-PLUS (NWO grant 184.034.023)", + "funder": { + "@type": "Organization", + "name": "NWO", + "url": "https://www.nwo.nl" + } + } + ], + "operatingSystem": "Linux", + "programmingLanguage": [ + "Kotlin", + "Javascript", + "Typescript" + ], + "runtimePlatform": [ + "JVM", + "Node" + ], + "codemeta:contIntegration": { + "id": "https://github.com/INL/galahad/actions" + }, + "continuousIntegration": "https://github.com/INL/galahad/actions", + "developmentStatus": [ + "https://www.repostatus.org/#active", + "https://w3id.org/research-technology-readiness-levels#Level6LatePrototype" + ], + "producer": { + "@type": "Organization", + "name": "Instituut voor de Nederlandse taal", + "@id": "https://www.ivdnt.org", + "url": "https://www.ivdnt.org" + }, + "issueTracker": "https://github.com/INL/galahad/issues", + "readme": "https://github.com/INL/Galahad/blob/release/readme.md", + "releaseNotes": "https://github.com/INL/Galahad/releases", + "softwareHelp": [ + { + "@id": "https://portal.clarin.ivdnt.org/galahad/help", + "@type": "WebSite", + "name": "GaLAHaD Help", + "url": "https://portal.clarin.ivdnt.org/galahad/help" + } + ], + "targetProduct": [ + { + "@type": "WebApplication", + "name": "GaLAHaD", + "provider": "https://www.ivdnt.org", + "url": "https://portal.clarin.ivdnt.org/galahad" + }, + { + "@type": "WebAPI", + "name": "GaLAHaD API", + "provider": "https://www.ivdnt.org", + "endpointUrl": { + "@type": "EntryPoint", + "url": "https://portal.clarin.ivdnt.org/galahad/api", + "contentType": "application/json" + }, + "endpointDescription": { + "@type": "CreativeWork", + "encodingFormat": "application/json", + "url": "https://portal.clarin.ivdnt.org/galahad/api/v3/api-docs" + }, + "documentation": "https://portal.clarin.ivdnt.org/galahad/api/swagger-ui/index.html", + "consumesData": [ + { + "@type": "TextDigitalDocument", + "encodingFormat": "text/plain" + }, + { + "@type": "TextDigitalDocument", + "encodingFormat": "application/folia+xml" + }, + { + "@type": "TextDigitalDocument", + "encodingFormat": "application/tei+xml" + }, + { + "@type": "TextDigitalDocument", + "encodingFormat": "text/tab-separated-values" + }, + { + "@type": "TextDigitalDocument", + "encodingFormat": "https://github.com/newsreader/NAF" + }, + { + "@type": "TextDigitalDocument", + "encodingFormat": "https://universaldependencies.org/format.html" + } + ], + "producesData": [ + { + "@type": "CreativeWork", + "encodingFormat": "application/zip" + } + ] + }, + { + "@type": "ServerApplication", + "name": "GaLAHaD proxy" + }, + { + "@type": "SoftwareImage", + "name": "GaLAHaD client Docker image" + }, + { + "@type": "SoftwareImage", + "name": "GaLAHaD server Docker image" + }, + { + "@type": "SoftwareImage", + "name": "GaLAHaD proxy Docker image" + } + ], + "softwareRequirements": [ + { + "@type": "SoftwareApplication", + "identifier": "@rollup/plugin-yaml", + "name": "@rollup/plugin-yaml", + "version": "^4.0.1" + }, + { + "@type": "SoftwareApplication", + "identifier": "@types/jest", + "name": "@types/jest", + "version": "^28.1.6" + }, + { + "@type": "SoftwareApplication", + "identifier": "@types/js-yaml", + "name": "@types/js-yaml", + "version": "^4.0.5" + }, + { + "@type": "SoftwareApplication", + "identifier": "@types/uuid", + "name": "@types/uuid", + "version": "^8.3.4" + }, + { + "@type": "SoftwareApplication", + "identifier": "@typescript-eslint/eslint-plugin", + "name": "@typescript-eslint/eslint-plugin", + "version": "^5.30.7" + }, + { + "@type": "SoftwareApplication", + "identifier": "@typescript-eslint/parser", + "name": "@typescript-eslint/parser", + "version": "^5.30.7" + }, + { + "@type": "SoftwareApplication", + "identifier": "@vitejs/plugin-vue", + "name": "@vitejs/plugin-vue", + "version": "^5.0.4" + }, + { + "@type": "SoftwareApplication", + "identifier": "@vue/eslint-config-typescript", + "name": "@vue/eslint-config-typescript", + "version": "^11.0.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "axios", + "name": "axios", + "version": "^1.6.2" + }, + { + "@type": "SoftwareApplication", + "identifier": "buffer", + "name": "buffer", + "version": "^6.0.3" + }, + { + "@type": "SoftwareApplication", + "identifier": "content-disposition", + "name": "content-disposition", + "version": "^0.5.4" + }, + { + "@type": "SoftwareApplication", + "identifier": "eslint", + "name": "eslint", + "version": "^8.20.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "eslint-plugin-vue", + "name": "eslint-plugin-vue", + "version": "^9.2.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "js-yaml", + "name": "js-yaml", + "version": "^4.1.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "json-loader", + "name": "json-loader", + "version": "^0.5.7" + }, + { + "@type": "SoftwareApplication", + "identifier": "mutationobserver-shim", + "name": "mutationobserver-shim", + "version": "^0.3.7" + }, + { + "@type": "SoftwareApplication", + "identifier": "node-sass", + "name": "node-sass", + "version": "^9.0.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "pinia", + "name": "pinia", + "version": "^2.0.16" + }, + { + "@type": "SoftwareApplication", + "identifier": "safe-buffer", + "name": "safe-buffer", + "version": "^5.2.1" + }, + { + "@type": "SoftwareApplication", + "identifier": "sass", + "name": "sass", + "version": "^1.53.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "typescript", + "name": "typescript", + "version": "^4.7.4" + }, + { + "@type": "SoftwareApplication", + "identifier": "uuid", + "name": "uuid", + "version": "^8.3.2" + }, + { + "@type": "SoftwareApplication", + "identifier": "vite", + "name": "vite", + "version": "^5.1.7" + }, + { + "@type": "SoftwareApplication", + "identifier": "vue", + "name": "vue", + "version": "^3.2.37" + }, + { + "@type": "SoftwareApplication", + "identifier": "vue-router", + "name": "vue-router", + "version": "^4.1.2" + }, + { + "@type": "SoftwareApplication", + "identifier": "vue-slider-component", + "name": "vue-slider-component", + "version": "^4.1.0-beta.1" + }, + { + "@type": "SoftwareApplication", + "identifier": "com.beust.klaxon", + "name": "klaxon", + "version": "5.6" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.jetbrains.kotlin.kotlin-reflect", + "name": "kotlin-reflect", + "version": "1.9.22" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.jetbrains.kotlin.kotlin-stdlib", + "name": "kotlin-stdlib", + "version": "1.9.22" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.jetbrains.kotlinx.kotlinx-coroutines-core-jvm", + "name": "kotlinx-coroutines-core-jvm", + "version": "1.7.3" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.jetbrains.kotlinx.kotlinx-serialization-json-jvm", + "name": "kotlinx-serialization-json-jvm", + "version": "1.6.3" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.apache.logging.log4j.log4j-api-kotlin", + "name": "log4j-api-kotlin", + "version": "1.2.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.yaml.snakeyaml", + "name": "snakeyaml", + "version": "2.2" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.springframework.boot.spring-boot-devtools", + "name": "spring-boot-devtools", + "version": "3.2.3" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.springframework.boot.spring-boot-starter-web", + "name": "spring-boot-starter-web", + "version": "3.2.4" + }, + { + "@type": "SoftwareApplication", + "identifier": "org.springdoc.springdoc-openapi-starter-webmvc-ui", + "name": "springdoc-openapi-starter-webmvc-ui", + "version": "2.5.0" + } + ] } \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index a80002d..a06c0fe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,73 +1,79 @@ -# Docker compose file for galahad client, server and proxy -services: - client: - image: instituutnederlandsetaal/galahad-client:${APP_VERSION} - restart: unless-stopped - networks: - - proxy_network - deploy: - resources: - limits: - cpus: '0.50' - memory: 512M - reservations: - cpus: '0.25' - memory: 128M - server: - image: instituutnederlandsetaal/galahad-server:${APP_VERSION} - restart: unless-stopped - volumes: - - user-corpora-volume:/data/corpora/custom - - preset-corpora-volume:/data/corpora/presets - - formats-volume:/data/formats - - taggers-volume:/data/taggers - - tagsets-volume:/data/tagsets - - admins-volume:/data/admins - networks: - - proxy_network - - taggers - deploy: - resources: - limits: - cpus: '2.00' - memory: 8G # should be higher that the java heap size - reservations: - cpus: '1.00' - memory: 2G - environment: # Otherwise container doesn't restart on OutOfMemory - - "JAVA_OPTS=-XX:+ExitOnOutOfMemoryError" - # Simple nginx reverse proxy to combine front- and backend - proxy: - image: instituutnederlandsetaal/galahad-proxy:${APP_VERSION} - depends_on: - - client - - server - ports: - - "80:80" - restart: unless-stopped - networks: - - proxy_network - deploy: - resources: - limits: - cpus: '0.50' - memory: 256M - reservations: - cpus: '0.25' - memory: 128M - -networks: - front: - driver: bridge - proxy_network: - driver: bridge - taggers: - external: true - name: $taggers_network -volumes: - user-corpora-volume: - preset-corpora-volume: - formats-volume: - taggers-volume: - tagsets-volume: - admins-volume: +# Docker compose file for galahad client, server and proxy +services: + client: + image: instituutnederlandsetaal/galahad-client:${APP_VERSION} + build: + context: client + restart: unless-stopped + networks: + - proxy_network + deploy: + resources: + limits: + cpus: '0.50' + memory: 512M + reservations: + cpus: '0.25' + memory: 128M + server: + image: instituutnederlandsetaal/galahad-server:${APP_VERSION} + build: + context: server + restart: unless-stopped + volumes: + - user-corpora-volume:/data/corpora/custom + - preset-corpora-volume:/data/corpora/presets + - formats-volume:/data/formats + - taggers-volume:/data/taggers + - tagsets-volume:/data/tagsets + - admins-volume:/data/admins + networks: + - proxy_network + - taggers + deploy: + resources: + limits: + cpus: '2.00' + memory: 8G # should be higher that the java heap size + reservations: + cpus: '1.00' + memory: 2G + environment: # Otherwise container doesn't restart on OutOfMemory + - "JAVA_OPTS=-XX:+ExitOnOutOfMemoryError" + # Simple nginx reverse proxy to combine front- and backend + proxy: + image: instituutnederlandsetaal/galahad-proxy:${APP_VERSION} + build: + context: proxy + depends_on: + - client + - server + ports: + - "80:80" + restart: unless-stopped + networks: + - proxy_network + deploy: + resources: + limits: + cpus: '0.50' + memory: 256M + reservations: + cpus: '0.25' + memory: 128M + +networks: + front: + driver: bridge + proxy_network: + driver: bridge + taggers: + external: true + name: $taggers_network +volumes: + user-corpora-volume: + preset-corpora-volume: + formats-volume: + taggers-volume: + tagsets-volume: + admins-volume: diff --git a/server/src/main/kotlin/org/ivdnt/galahad/FileBackedValue.kt b/server/src/main/kotlin/org/ivdnt/galahad/FileBackedValue.kt index 791b1e8..8dccae2 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/FileBackedValue.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/FileBackedValue.kt @@ -4,14 +4,8 @@ import com.fasterxml.jackson.core.type.TypeReference import com.fasterxml.jackson.databind.ObjectMapper import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.runBlocking -import kotlinx.coroutines.withContext import org.apache.logging.log4j.kotlin.Logging -import java.io.* -import java.nio.ByteBuffer -import java.nio.channels.FileChannel -import java.nio.channels.FileLock -import java.nio.channels.OverlappingFileLockException -import java.nio.file.StandardOpenOption +import java.io.File val mapper: ObjectMapper by lazy { ObjectMapper() } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/app/GalahadApplication.kt b/server/src/main/kotlin/org/ivdnt/galahad/app/GalahadApplication.kt index ed9d009..b00c1e3 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/app/GalahadApplication.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/app/GalahadApplication.kt @@ -1,214 +1,212 @@ -package org.ivdnt.galahad.app - -import com.fasterxml.jackson.annotation.JsonProperty -import jakarta.servlet.http.HttpServletRequest -import org.apache.logging.log4j.kotlin.Logging -import org.apache.tomcat.util.http.fileupload.FileUploadException -import org.springframework.beans.factory.annotation.Autowired -import org.springframework.boot.autoconfigure.SpringBootApplication -import org.springframework.boot.context.properties.ConfigurationProperties -import org.springframework.boot.runApplication -import org.springframework.boot.web.servlet.error.ErrorController -import org.springframework.context.annotation.Bean -import org.springframework.context.annotation.ComponentScan -import org.springframework.context.annotation.Configuration -import org.springframework.http.HttpStatus -import org.springframework.http.ResponseEntity -import org.springframework.web.bind.annotation.CrossOrigin -import org.springframework.web.bind.annotation.GetMapping -import org.springframework.web.bind.annotation.RequestMapping -import org.springframework.web.bind.annotation.RestController -import org.springframework.web.filter.CommonsRequestLoggingFilter -import org.springframework.web.servlet.config.annotation.ViewControllerRegistry -import org.springframework.web.servlet.config.annotation.WebMvcConfigurer -import java.io.File -import java.io.IOException -import java.math.BigDecimal -import java.net.URI -import java.util.concurrent.TimeUnit -import java.util.regex.Matcher -import java.util.regex.Pattern - -// This is a possibly incomplete list of all the endpoints -// For a complete overview better go to -// SWAGGER_API_URL -const val BASE_URL = "/" -const val SWAGGER_API_URL = "/swagger-ui/index.html" - -const val TAGSETS_URL = "/tagsets" -const val BENCHMARKS_URL = "/benchmarks" -const val VERSION_URL = "/version" - -const val TAGGERS_URL = "/taggers" -const val TAGGER_URL = "$TAGGERS_URL/{tagger}" -const val TAGGER_HEALTH_URL = "$TAGGER_URL/health" - -const val ASSAYS_URL = "/assays" - -const val INTERNAL_JOBS_URL = "/internal/jobs" -const val INTERNAL_JOBS_RESULT_URL = "$INTERNAL_JOBS_URL/result" -const val INTERNAL_JOBS_ERROR_URL = "$INTERNAL_JOBS_URL/error" - -const val CORPORA_URL = "/corpora" -const val PUBLIC_CORPORA_URL = "/public_corpora" -const val DATASETS_CORPORA_URL = "/datasets_corpora" -const val CORPUS_URL = "$CORPORA_URL/{corpus}" - -const val JOBS_URL = "$CORPUS_URL/jobs" -const val JOB_URL = "$JOBS_URL/{job}" -const val JOB_DOCUMENT_URL = "$JOB_URL/documents/{document}" - -const val EVALUATION_URL = "$JOB_URL/evaluation" -const val ASSAY_URL = "$EVALUATION_URL/assay" -const val DISTRIBUTION_URL = "$EVALUATION_URL/distribution" -const val METRICS_URL = "$EVALUATION_URL/metrics" -const val METRICS_CSV_URL = "$METRICS_URL/download" -const val CONFUSION_URL = "$EVALUATION_URL/confusion" -const val CONFUSION_CSV_URL = "$CONFUSION_URL/download" -const val EVALUATION_CSV_URL = "$EVALUATION_URL/download" - -const val DOCUMENTS_URL = "$CORPUS_URL/documents" -const val DOCUMENT_URL = "$DOCUMENTS_URL/{document}" -const val DOCUMENT_RAW_FILE_URL = "$DOCUMENT_URL/raw" // returns the blob of the raw document - -var application_profile: String = System.getenv("spring.profiles.active") ?: "prod" -fun String.runCommand(workingDir: File, timeout: Long = 60): String? { - try { - val parts = this.split("\\s".toRegex()) - val proc = ProcessBuilder(*parts.toTypedArray()) - .directory(workingDir) - .inheritIO() - .start() - - proc.waitFor(timeout, TimeUnit.MINUTES) - return proc.inputStream.bufferedReader().readText() - } catch(e: IOException) { - e.printStackTrace() - return null - } -} - -@Configuration -@ConfigurationProperties(prefix = "") -class Config { - - lateinit var workDir: String - - @Bean - fun getWorkingDirectory(): File { - return File( workDir ) - } - -} - -@ComponentScan("org.ivdnt.galahad") -@SpringBootApplication -class GalahadApplication - -fun main(args: Array) { - runApplication(*args) -} - -@RestController -class ApplicationController : ErrorController, Logging { - - @Autowired - private val request: HttpServletRequest? = null - - @GetMapping( BASE_URL ) - @CrossOrigin - fun getApplication(): ResponseEntity { - // Since we have nothing to show at this URL, we redirect to the API UI instead - logger.info( "Get root" ) - return ResponseEntity.status(HttpStatus.FOUND).location(URI.create(request?.contextPath + SWAGGER_API_URL)).build() - } - - @GetMapping( "/user" ) - @CrossOrigin - fun getUser(): User { - logger.info( "Get user" ) - return User.getUserFromRequestOrThrow( request ) - } - - @GetMapping( BENCHMARKS_URL ) - @CrossOrigin - fun getBenchmarks(): ByteArray? { - logger.info( "Get benchmarks" ) - val file = File( "benchmarks.yml" ) - return if( file.exists() ) file.readBytes() else null - } - - @GetMapping( VERSION_URL ) - @CrossOrigin - fun getVersion(): ByteArray { - logger.info( "Get version" ) - return this::class.java.classLoader.getResourceAsStream("version.yml")!!.readBytes() - } - - - data class ErrorResponse( - @JsonProperty val statusCode: HttpStatus, - @JsonProperty val message: String, - ) - - @RequestMapping("/error") -// @ResponseBody - @CrossOrigin - fun handleError(request: HttpServletRequest): ErrorResponse { - val statusCode = HttpStatus.valueOf( request.getAttribute("jakarta.servlet.error.status_code") as Int? ?: 500 ) - val exception = request.getAttribute("jakarta.servlet.error.exception") as Exception? - if (exception?.cause is FileUploadException) { - // The error message is good as is. No need to wrap it - return ErrorResponse( statusCode, (exception.cause as FileUploadException).message ?: "exception inception" ) - } - return ErrorResponse( statusCode, "${if (exception == null) "N/A" else exception.message}") - } - -} - -@Configuration -@ConfigurationProperties(prefix = "spring.servlet.multipart") -class MultipartConfig { - - lateinit var maxFileSize: String - lateinit var maxRequestSize: String - - val maxFilesSizeAsBytes: Long - get() { - return toBytes(maxFileSize) - } - - companion object { - fun toBytes(filesize: String?): Long { - var returnValue: Long = -1 - val patt: Pattern = Pattern.compile("([\\d.]+)([GMK]B)", Pattern.CASE_INSENSITIVE) - val matcher: Matcher = patt.matcher(filesize) - val powerMap: MutableMap = HashMap() - powerMap["GB"] = 3 - powerMap["MB"] = 2 - powerMap["KB"] = 1 - if (matcher.find()) { - val number: String = matcher.group(1) - val pow = powerMap[matcher.group(2).uppercase()]!! - var bytes = BigDecimal(number) - bytes = bytes.multiply(BigDecimal.valueOf(1024).pow(pow)) - returnValue = bytes.longValueExact() - } - return returnValue - } - } -} - -@Configuration -class RequestLoggingFilterConfig { - @Bean - fun logFilter(): CommonsRequestLoggingFilter { - val filter = CommonsRequestLoggingFilter() - filter.setIncludeQueryString(true) - filter.setIncludePayload(true) - filter.setMaxPayloadLength(10000) - filter.setIncludeHeaders(true) - filter.setAfterMessagePrefix("REQUEST DATA : ") - return filter - } +package org.ivdnt.galahad.app + +import com.fasterxml.jackson.annotation.JsonProperty +import jakarta.servlet.http.HttpServletRequest +import org.apache.logging.log4j.kotlin.Logging +import org.apache.tomcat.util.http.fileupload.FileUploadException +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.boot.autoconfigure.SpringBootApplication +import org.springframework.boot.context.properties.ConfigurationProperties +import org.springframework.boot.runApplication +import org.springframework.boot.web.servlet.error.ErrorController +import org.springframework.context.annotation.Bean +import org.springframework.context.annotation.ComponentScan +import org.springframework.context.annotation.Configuration +import org.springframework.http.HttpStatus +import org.springframework.http.ResponseEntity +import org.springframework.web.bind.annotation.CrossOrigin +import org.springframework.web.bind.annotation.GetMapping +import org.springframework.web.bind.annotation.RequestMapping +import org.springframework.web.bind.annotation.RestController +import org.springframework.web.filter.CommonsRequestLoggingFilter +import java.io.File +import java.io.IOException +import java.math.BigDecimal +import java.net.URI +import java.util.concurrent.TimeUnit +import java.util.regex.Matcher +import java.util.regex.Pattern + +// This is a possibly incomplete list of all the endpoints +// For a complete overview better go to +// SWAGGER_API_URL +const val BASE_URL = "/" +const val SWAGGER_API_URL = "/swagger-ui/index.html" + +const val TAGSETS_URL = "/tagsets" +const val BENCHMARKS_URL = "/benchmarks" +const val VERSION_URL = "/version" + +const val TAGGERS_URL = "/taggers" +const val TAGGER_URL = "$TAGGERS_URL/{tagger}" +const val TAGGER_HEALTH_URL = "$TAGGER_URL/health" + +const val ASSAYS_URL = "/assays" + +const val INTERNAL_JOBS_URL = "/internal/jobs" +const val INTERNAL_JOBS_RESULT_URL = "$INTERNAL_JOBS_URL/result" +const val INTERNAL_JOBS_ERROR_URL = "$INTERNAL_JOBS_URL/error" + +const val CORPORA_URL = "/corpora" +const val PUBLIC_CORPORA_URL = "/public_corpora" +const val DATASETS_CORPORA_URL = "/datasets_corpora" +const val CORPUS_URL = "$CORPORA_URL/{corpus}" + +const val JOBS_URL = "$CORPUS_URL/jobs" +const val JOB_URL = "$JOBS_URL/{job}" +const val JOB_DOCUMENT_URL = "$JOB_URL/documents/{document}" + +const val EVALUATION_URL = "$JOB_URL/evaluation" +const val ASSAY_URL = "$EVALUATION_URL/assay" +const val DISTRIBUTION_URL = "$EVALUATION_URL/distribution" +const val METRICS_URL = "$EVALUATION_URL/metrics" +const val METRICS_CSV_URL = "$METRICS_URL/download" +const val CONFUSION_URL = "$EVALUATION_URL/confusion" +const val CONFUSION_CSV_URL = "$CONFUSION_URL/download" +const val EVALUATION_CSV_URL = "$EVALUATION_URL/download" + +const val DOCUMENTS_URL = "$CORPUS_URL/documents" +const val DOCUMENT_URL = "$DOCUMENTS_URL/{document}" +const val DOCUMENT_RAW_FILE_URL = "$DOCUMENT_URL/raw" // returns the blob of the raw document + +var application_profile: String = System.getenv("spring.profiles.active") ?: "prod" +fun String.runCommand(workingDir: File, timeout: Long = 60): String? { + try { + val parts = this.split("\\s".toRegex()) + val proc = ProcessBuilder(*parts.toTypedArray()) + .directory(workingDir) + .inheritIO() + .start() + + proc.waitFor(timeout, TimeUnit.MINUTES) + return proc.inputStream.bufferedReader().readText() + } catch(e: IOException) { + e.printStackTrace() + return null + } +} + +@Configuration +@ConfigurationProperties(prefix = "") +class Config { + + lateinit var workDir: String + + @Bean + fun getWorkingDirectory(): File { + return File( workDir ) + } + +} + +@ComponentScan("org.ivdnt.galahad") +@SpringBootApplication +class GalahadApplication + +fun main(args: Array) { + runApplication(*args) +} + +@RestController +class ApplicationController : ErrorController, Logging { + + @Autowired + private val request: HttpServletRequest? = null + + @GetMapping( BASE_URL ) + @CrossOrigin + fun getApplication(): ResponseEntity { + // Since we have nothing to show at this URL, we redirect to the API UI instead + logger.info( "Get root" ) + return ResponseEntity.status(HttpStatus.FOUND).location(URI.create(request?.contextPath + SWAGGER_API_URL)).build() + } + + @GetMapping( "/user" ) + @CrossOrigin + fun getUser(): User { + logger.info( "Get user" ) + return User.getUserFromRequestOrThrow( request ) + } + + @GetMapping( BENCHMARKS_URL ) + @CrossOrigin + fun getBenchmarks(): ByteArray? { + logger.info( "Get benchmarks" ) + val file = File( "benchmarks.yml" ) + return if( file.exists() ) file.readBytes() else null + } + + @GetMapping( VERSION_URL ) + @CrossOrigin + fun getVersion(): ByteArray { + logger.info( "Get version" ) + return this::class.java.classLoader.getResourceAsStream("version.yml")!!.readBytes() + } + + + data class ErrorResponse( + @JsonProperty val statusCode: HttpStatus, + @JsonProperty val message: String, + ) + + @RequestMapping("/error") +// @ResponseBody + @CrossOrigin + fun handleError(request: HttpServletRequest): ErrorResponse { + val statusCode = HttpStatus.valueOf( request.getAttribute("jakarta.servlet.error.status_code") as Int? ?: 500 ) + val exception = request.getAttribute("jakarta.servlet.error.exception") as Exception? + if (exception?.cause is FileUploadException) { + // The error message is good as is. No need to wrap it + return ErrorResponse( statusCode, (exception.cause as FileUploadException).message ?: "exception inception" ) + } + return ErrorResponse( statusCode, "${if (exception == null) "N/A" else exception.message}") + } + +} + +@Configuration +@ConfigurationProperties(prefix = "spring.servlet.multipart") +class MultipartConfig { + + lateinit var maxFileSize: String + lateinit var maxRequestSize: String + + val maxFilesSizeAsBytes: Long + get() { + return toBytes(maxFileSize) + } + + companion object { + fun toBytes(filesize: String?): Long { + var returnValue: Long = -1 + val patt: Pattern = Pattern.compile("([\\d.]+)([GMK]B)", Pattern.CASE_INSENSITIVE) + val matcher: Matcher = patt.matcher(filesize) + val powerMap: MutableMap = HashMap() + powerMap["GB"] = 3 + powerMap["MB"] = 2 + powerMap["KB"] = 1 + if (matcher.find()) { + val number: String = matcher.group(1) + val pow = powerMap[matcher.group(2).uppercase()]!! + var bytes = BigDecimal(number) + bytes = bytes.multiply(BigDecimal.valueOf(1024).pow(pow)) + returnValue = bytes.longValueExact() + } + return returnValue + } + } +} + +@Configuration +class RequestLoggingFilterConfig { + @Bean + fun logFilter(): CommonsRequestLoggingFilter { + val filter = CommonsRequestLoggingFilter() + filter.setIncludeQueryString(true) + filter.setIncludePayload(true) + filter.setMaxPayloadLength(10000) + filter.setIncludeHeaders(true) + filter.setAfterMessagePrefix("REQUEST DATA : ") + return filter + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt b/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt index 08a64e3..6329c88 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/app/report/Report.kt @@ -2,8 +2,8 @@ package org.ivdnt.galahad.app.report import org.apache.logging.log4j.kotlin.Logging import org.apache.logging.log4j.kotlin.logger -import org.ivdnt.galahad.data.layer.WordForm import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.data.layer.WordForm class Report : Logging { @@ -18,8 +18,8 @@ class Report : Logging { // println("Spotted incompatible tokenization for \"${wf.literal}\" at offset ${wf.offset}") // Now we do nothing, but it is good to centrally register this logger().warn( "REPORT: Spotted incompatible tokenization for wordforms \n" + - " - $wf1 \n" + - " - $wf2" + " - ${wf1.literal} \n" + + " - ${wf2.literal}" ) } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/CorporaController.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/CorporaController.kt index 10ee3a7..f7d6427 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/CorporaController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/CorporaController.kt @@ -25,7 +25,7 @@ class CorporaController( @Autowired private val response: HttpServletResponse? = null private fun File.corpus(): Corpus { - return Corpus(this, User.getUserFromRequestOrThrow(request)) + return Corpus(this) } private fun assertCorpusNameValidOrThrow(corpus: String) { diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/ExportController.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/ExportController.kt index 8cc80b2..4bca5e2 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/ExportController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/ExportController.kt @@ -1,159 +1,161 @@ -package org.ivdnt.galahad.data - -import jakarta.servlet.http.HttpServletRequest -import jakarta.servlet.http.HttpServletResponse -import org.apache.logging.log4j.kotlin.Logging -import org.ivdnt.galahad.app.JOB_DOCUMENT_URL -import org.ivdnt.galahad.app.JOB_URL -import org.ivdnt.galahad.app.User -import org.ivdnt.galahad.data.document.Document -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.port.CorpusTransformMetadata -import org.ivdnt.galahad.port.DocumentTransformMetadata -import org.ivdnt.galahad.port.InternalFile -import org.ivdnt.galahad.util.setContentDisposition -import org.springframework.beans.factory.annotation.Autowired -import org.springframework.web.bind.annotation.* -import java.io.File -import java.util.* - -@RestController -class ExportController( - val corpora: CorporaController, -) : Logging { - - @Autowired - private val request: HttpServletRequest? = null - - @Autowired - private val response: HttpServletResponse? = null - - private fun getCorpusTransformMetadata(corpusID: UUID, jobName: String): CorpusTransformMetadata { - // Exporting documents requires you to have write access. - val corpus = corpora.getWriteAccessOrThrow(corpusID, request) - val job = corpus.jobs.readOrThrow(jobName) - return CorpusTransformMetadata( - corpus = corpus, job = job, user = User.getUserFromRequestOrThrow(request) - ) - } - - private fun getDocumentTransformMetadata( - corpus: UUID, - job: String, - document: String, - ): DocumentTransformMetadata { - return getCorpusTransformMetadata(corpus, job).documentMetadata(document) - } - - @GetMapping("$JOB_URL/export/convert") - @CrossOrigin - @ResponseBody - fun convertAndExportJob( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam("format") formatName: String, - @RequestParam("posHeadOnly") posHeadOnly: Boolean = false, - ) { - return exportCorpusJobInFormat(corpus, job, formatName, shouldMerge = false, posHeadOnly) - } - - @GetMapping("$JOB_URL/export/merge") - @CrossOrigin - @ResponseBody - fun mergeAndExportJob( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam("format") - filterFormat: String, // Needed when raw formats are mixed, since BlackLab only accepts 1 type per corpus - @RequestParam("posHeadOnly") - posHeadOnly: Boolean, - ) { - return exportCorpusJobInFormat(corpus, job, filterFormat, shouldMerge = true, posHeadOnly) - } - - fun exportCorpusJobInFormat(corpus: UUID, job: String, formatName: String, shouldMerge: Boolean, posHeadOnly: Boolean) { - val format = DocumentFormat.fromString(formatName) - val ctm = getCorpusTransformMetadata(corpus, job) - setZipResponseHeader(ctm) - ctm.corpus.getZipped(ctm, formatMapper = { - try { - // Document conversions. - val dtm = ctm.documentMetadata(it.name) - return@getZipped if (shouldMerge && mergeFormatMatches(it, format)) { - logger.info("Merging ${it.name} of format ${it.format}") - mergeAndExportDocument(dtm, posHeadOnly).file - } else { - logger.info("Converting ${it.name} of format ${it.format} to $format") - convertAndExportDocument(dtm, format, posHeadOnly) - } - } catch (e: Exception) { - throw Exception("Could not convert file ${it.name} to format ${format}. ${e.message}.") - } - }, filter = { - // Filter out untagged documents. - document -> - ctm.documentMetadata(document.name).layer != Layer.EMPTY - }, outputStream = response?.outputStream) - } - - private fun mergeFormatMatches( - it: Document, format: DocumentFormat, - ): Boolean { - var otherFormat = it.format - // Overwrite the format for legacy formats that can in fact be merged. - if (otherFormat == DocumentFormat.TeiP5Legacy) { - otherFormat = DocumentFormat.TeiP5 - } - return otherFormat == format - } - - @GetMapping("$JOB_DOCUMENT_URL/export/convert") - @CrossOrigin - @ResponseBody - fun convertAndExportDocument( - @PathVariable corpus: UUID, - @PathVariable job: String, - @PathVariable document: String, - @RequestParam("format") formatName: String, - @RequestParam("posHeadOnly") posHeadOnly: Boolean, - ): ByteArray? { - val format = DocumentFormat.fromString(formatName) - val dtm = getDocumentTransformMetadata(corpus, job, document) - // TODO("set headers") - return convertAndExportDocument(dtm, format, posHeadOnly).readBytes() - } - - fun convertAndExportDocument(dtm: DocumentTransformMetadata, format: DocumentFormat, posHeadOnly: Boolean): File { - if (posHeadOnly) { - dtm.convertLayerToPosHead() - } - return dtm.document.generateAs(format, dtm) - } - - @GetMapping("$JOB_DOCUMENT_URL/export/merge") - @CrossOrigin - @ResponseBody - fun mergeAndExportDocument( - @PathVariable corpus: UUID, - @PathVariable job: String, - @PathVariable document: String, - @RequestParam("posHeadOnly") posHeadOnly: Boolean, - ): ByteArray? { - val dtm = getDocumentTransformMetadata(corpus, job, document) - // TODO("set headers") - return mergeAndExportDocument(dtm, posHeadOnly).file.readBytes() - } - - fun mergeAndExportDocument(dtm: DocumentTransformMetadata, posHeadOnly: Boolean): InternalFile { - if (posHeadOnly) { - dtm.convertLayerToPosHead() - } - return dtm.document.merge(dtm) - } - - private fun setZipResponseHeader(ctm: CorpusTransformMetadata) { - response!!.contentType = "application/zip" - response.setContentDisposition(ctm.corpus.metadata.expensiveGet().name + ".zip") - } -} +package org.ivdnt.galahad.data + +import jakarta.servlet.http.HttpServletRequest +import jakarta.servlet.http.HttpServletResponse +import org.apache.logging.log4j.kotlin.Logging +import org.ivdnt.galahad.app.JOB_DOCUMENT_URL +import org.ivdnt.galahad.app.JOB_URL +import org.ivdnt.galahad.app.User +import org.ivdnt.galahad.data.document.Document +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.port.CorpusTransformMetadata +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.InternalFile +import org.ivdnt.galahad.util.setContentDisposition +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.web.bind.annotation.* +import java.io.File +import java.util.* + +@RestController +class ExportController( + val corpora: CorporaController, +) : Logging { + + @Autowired + private val request: HttpServletRequest? = null + + @Autowired + private val response: HttpServletResponse? = null + + private fun getCorpusTransformMetadata(corpusID: UUID, jobName: String, formatName: DocumentFormat): CorpusTransformMetadata { + // Exporting documents requires you to have write access. + val corpus = corpora.getWriteAccessOrThrow(corpusID, request) + val job = corpus.jobs.readOrThrow(jobName) + return CorpusTransformMetadata( + corpus = corpus, job = job, user = User.getUserFromRequestOrThrow(request), targetFormat = formatName + ) + } + + private fun getDocumentTransformMetadata( + corpus: UUID, + job: String, + document: String, + format: DocumentFormat, + ): DocumentTransformMetadata { + return getCorpusTransformMetadata(corpus, job, format).documentMetadata(document) + } + + @GetMapping("$JOB_URL/export/convert") + @CrossOrigin + @ResponseBody + fun convertAndExportJob( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam("format") formatName: String, + @RequestParam("posHeadOnly") posHeadOnly: Boolean = false, + ) { + return exportCorpusJobInFormat(corpus, job, formatName, shouldMerge = false, posHeadOnly) + } + + @GetMapping("$JOB_URL/export/merge") + @CrossOrigin + @ResponseBody + fun mergeAndExportJob( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam("format") + filterFormat: String, // Needed when raw formats are mixed, since BlackLab only accepts 1 type per corpus + @RequestParam("posHeadOnly") + posHeadOnly: Boolean, + ) { + return exportCorpusJobInFormat(corpus, job, filterFormat, shouldMerge = true, posHeadOnly) + } + + fun exportCorpusJobInFormat(corpus: UUID, job: String, formatName: String, shouldMerge: Boolean, posHeadOnly: Boolean) { + val format = DocumentFormat.fromString(formatName) + val ctm = getCorpusTransformMetadata(corpus, job, format) + setZipResponseHeader(ctm) + ctm.corpus.getZipped(ctm, formatMapper = { + try { + // Document conversions. + val dtm = ctm.documentMetadata(it.name) + return@getZipped if (shouldMerge && mergeFormatMatches(it, format)) { + logger.info("Merging ${it.name} of format ${it.format}") + mergeAndExportDocument(dtm, posHeadOnly).file + } else { + logger.info("Converting ${it.name} of format ${it.format} to $format") + convertAndExportDocument(dtm, format, posHeadOnly) + } + } catch (e: Exception) { + throw Exception("Could not convert file ${it.name} to format ${format}. ${e.message}.") + } + }, filter = { + // Filter out untagged documents. + document -> + ctm.documentMetadata(document.name).layer != Layer.EMPTY + }, outputStream = response?.outputStream) + } + + private fun mergeFormatMatches( + it: Document, format: DocumentFormat, + ): Boolean { + var otherFormat = it.format + // Overwrite the format for legacy formats that can in fact be merged. + if (otherFormat == DocumentFormat.TeiP5Legacy) { + otherFormat = DocumentFormat.TeiP5 + } + return otherFormat == format + } + + @GetMapping("$JOB_DOCUMENT_URL/export/convert") + @CrossOrigin + @ResponseBody + fun convertAndExportDocument( + @PathVariable corpus: UUID, + @PathVariable job: String, + @PathVariable document: String, + @RequestParam("format") formatName: String, + @RequestParam("posHeadOnly") posHeadOnly: Boolean, + ): ByteArray? { + val format = DocumentFormat.fromString(formatName) + val dtm = getDocumentTransformMetadata(corpus, job, document, format) + // TODO("set headers") + return convertAndExportDocument(dtm, format, posHeadOnly).readBytes() + } + + fun convertAndExportDocument(dtm: DocumentTransformMetadata, format: DocumentFormat, posHeadOnly: Boolean): File { + if (posHeadOnly) { + dtm.convertLayerToPosHead() + } + return dtm.document.generateAs(format, dtm) + } + + @GetMapping("$JOB_DOCUMENT_URL/export/merge") + @CrossOrigin + @ResponseBody + fun mergeAndExportDocument( + @PathVariable corpus: UUID, + @PathVariable job: String, + @PathVariable document: String, + @RequestParam("posHeadOnly") posHeadOnly: Boolean, + ): ByteArray? { + val doc = corpora.getWriteAccessOrThrow(corpus, request).documents.readOrThrow(document) + val dtm = getDocumentTransformMetadata(corpus, job, document, doc.format) + // TODO("set headers") + return mergeAndExportDocument(dtm, posHeadOnly).file.readBytes() + } + + fun mergeAndExportDocument(dtm: DocumentTransformMetadata, posHeadOnly: Boolean): InternalFile { + if (posHeadOnly) { + dtm.convertLayerToPosHead() + } + return dtm.document.merge(dtm) + } + + private fun setZipResponseHeader(ctm: CorpusTransformMetadata) { + response!!.contentType = "application/zip" + response.setContentDisposition(ctm.corpus.metadata.expensiveGet().name + ".zip") + } +} diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/corpus/Corpus.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/corpus/Corpus.kt index 382cd55..ba70eb3 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/corpus/Corpus.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/corpus/Corpus.kt @@ -1,207 +1,206 @@ -package org.ivdnt.galahad.data.corpus - -import org.ivdnt.galahad.BaseFileSystemStore -import org.ivdnt.galahad.FileBackedCache -import org.ivdnt.galahad.FileBackedValue -import org.ivdnt.galahad.app.ExpensiveGettable -import org.ivdnt.galahad.app.User -import org.ivdnt.galahad.app.executeAndLogTime -import org.ivdnt.galahad.data.document.Document -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.document.Documents -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME -import org.ivdnt.galahad.jobs.Jobs -import org.ivdnt.galahad.port.CmdiMetadata -import org.ivdnt.galahad.port.CorpusTransformMetadata -import org.ivdnt.galahad.taggers.Taggers -import org.ivdnt.galahad.util.createZipFile -import java.io.File -import java.io.OutputStream -import java.nio.file.Files -import java.util.* -import kotlin.io.path.createTempDirectory - -/** - * A corpus is a collection of documents, metadata and jobs, saved to a folder. The folder contents are: - * - * - documents/: a folder containing all documents in the corpus. Represented by [Documents]. - * - jobs/: a folder containing all jobs that were active at some point in the corpus. The sourceLayer is one of them. Represented by [Jobs]. - * - metadata: a cache file storing [MutableCorpusMetadata] about the corpus. - * - metadata.cache: a cache file storing [CorpusMetadata] about the corpus. - * - * A Corpus has an owner, who can add collaborators and viewers. - * Collaborators have read and write access (and can add viewers and collaborators). - * Viewers have read access. - * Admins have access to all corpora with read and write access. - */ -class Corpus( - workDirectory: File, - user: User, -) : BaseFileSystemStore(workDirectory) { - - val documents = Documents(workDirectory.resolve("documents")) - - // Make sure this is initialized before accessing metadata - private val fileBackedMetadata: FileBackedValue - get() = FileBackedValue(workDirectory.resolve("metadata"), MutableCorpusMetadata.initValue()) - - /** - * Convenient access to [MutableCorpusMetadata] without the need to get the expensive [CorpusMetadata] - * When uploading docs, for example, all we need to know is if the user has permission. - */ - var mutableCorpusMetadata: MutableCorpusMetadata - get() = fileBackedMetadata.read() - private set(value) = fileBackedMetadata.modify { value } - - private val metadataCache = object : FileBackedCache( - file = getMetadataFile(), initValue = CorpusMetadata() - ) { - override fun isValid(lastModified: Long): Boolean { - return lastModified >= fileBackedMetadata.lastModified - } - - override fun set(): CorpusMetadata { - return CorpusMetadata( - // Mutable fields - owner = mutableCorpusMetadata.owner, - name = mutableCorpusMetadata.name, - eraTo = mutableCorpusMetadata.eraTo, - eraFrom = mutableCorpusMetadata.eraFrom, - tagset = mutableCorpusMetadata.tagset, - dataset = mutableCorpusMetadata.isDataset, - public = mutableCorpusMetadata.isDataset, // Note that we set isPublic the same as isDataset. - collaborators = mutableCorpusMetadata.collaborators ?: setOf(), - viewers = mutableCorpusMetadata.viewers ?: setOf(), - sourceName = mutableCorpusMetadata.sourceName, - sourceURL = mutableCorpusMetadata.sourceURL, - // Immutable/calculated fields - uuid = UUID.fromString(workDirectory.name), - activeJobs = jobs.readAll().filter { it.isActive }.size, - numDocs = documents.readAll().size, - sizeInBytes = workDirectory.walkTopDown().filter { it.isFile }.map { it.length() }.sum(), // expensive - lastModified = System.currentTimeMillis(), - ) - } - } - - private fun getMetadataFile() = workDirectory.resolve("metadata.cache") - - /** Invalidate cache when new documents are uploaded or job activity changes */ - fun invalidateCache() { - getMetadataFile().delete() - } - - val metadata: ExpensiveGettable = object : ExpensiveGettable { - override fun expensiveGet() = metadataCache.get() - } - - val sourceTagger: ExpensiveGettable = object : ExpensiveGettable { - override fun expensiveGet(): Taggers.Summary { - val metadata = metadata.expensiveGet() - return Taggers.Summary( - id = SOURCE_LAYER_NAME, - description = "uploaded annotations", - tagset = metadata.tagset, - eraFrom = metadata.eraFrom, - eraTo = metadata.eraTo, - produces = setOf("TODO"), - ) - } - } - - // Note: this is somewhat inefficient, since have to get the sourceTagger, even though we might not use it. - val jobs get() = Jobs(workDirectory.resolve("jobs"), this) - - fun delete() { - workDirectory.deleteRecursively() - } - - /** - * Overwrite the [CorpusMetadata] in [metadata] with [newMeta], - * except for the owner, which should be grabbed from the existing [metadata]. - * - * If a user appears multiple times in the permission hierarchy, only the upper level remains. - */ - fun updateMetadata(newMeta: MutableCorpusMetadata, user: User): ExpensiveGettable { - if (!mutableCorpusMetadata.isPublic && newMeta.isPublic) { - // Corpus is being set to public - if (!mutableCorpusMetadata.canMakePublic(user)) { - throw Exception("Unauthorized") - } - } - if (mutableCorpusMetadata.collaborators != newMeta.collaborators || mutableCorpusMetadata.viewers != newMeta.viewers) { - // Collaborators have changed - if (!mutableCorpusMetadata.canAddNewUsers(user) && mutableCorpusMetadata.owner != "") { - throw Exception("Unauthorized") - } - } - // If mutableCorpusMetadata.owner is "", we are working with the InitValue of FileBackedValue, - // so the updateMetadata call is initializing the corpus. - val owner = if (mutableCorpusMetadata.owner == "") user.id else mutableCorpusMetadata.owner - // Overwrite the owner with the original, so collaborators can't change it. - newMeta.owner = owner - - // Trim textual intputs - newMeta.name = newMeta.name.trim() - newMeta.sourceName = newMeta.sourceName?.trim() - newMeta.tagset = newMeta.tagset?.trim() - newMeta.collaborators = newMeta.collaborators?.map { it.trim() }?.toSet() - newMeta.viewers = newMeta.viewers?.map { it.trim() }?.toSet() - - // merge isPublic and isDataset - newMeta.isPublic = newMeta.isDataset - - // Remove owner from list of collaborators & viewers - newMeta.collaborators = newMeta.collaborators?.filter { it != owner }?.toSet() - newMeta.viewers = newMeta.viewers?.filter { it != owner }?.toSet() - // Remove collaborators from list of viewers - if (newMeta.collaborators != null) newMeta.viewers = newMeta.viewers?.filter { - !newMeta.collaborators!!.contains(it) - }?.toSet() - - mutableCorpusMetadata = newMeta - return metadata - } - - fun removeAsViewer(user: User) { - fileBackedMetadata.modify { - val viewers = it.viewers?.toMutableSet() - viewers?.removeIf { i -> i == user.id } - it.viewers = viewers - it - } - } - - fun removeAsCollaborator(user: User) { - fileBackedMetadata.modify { - val collaborators = it.collaborators?.toMutableSet() - collaborators?.removeIf { i -> i == user.id } - it.collaborators = collaborators - it - } - } - - /** - * Maps all [Document] found in [Documents] to the desired [DocumentFormat] and zips them. [formatMapper] should perform the mapping. - */ - fun getZipped( - ctm: CorpusTransformMetadata, - formatMapper: (Document) -> File, - filter: (Document) -> Boolean, - outputStream: OutputStream? = null, - ): File { - val name = metadata.expensiveGet().name - var zipFile: File? = null - val documents = documents.readAll().filter(filter) - executeAndLogTime("Generating $name zip") { - val convertedDocs = documents.asSequence().map(formatMapper) - val docsToCmdi = documents.asSequence().map { CmdiMetadata(ctm.documentMetadata(it.name)).file } - val cmdiZip = createZipFile(docsToCmdi, includeCMDI = true) - // rename the cmdiZip to "metadata" - val dest = File(createTempDirectory("metadata").toFile(), "metadata.zip") - Files.move(cmdiZip.toPath(), dest.toPath()) - zipFile = createZipFile(convertedDocs + dest, outputStream) - } - return zipFile!! - } +package org.ivdnt.galahad.data.corpus + +import org.ivdnt.galahad.BaseFileSystemStore +import org.ivdnt.galahad.FileBackedCache +import org.ivdnt.galahad.FileBackedValue +import org.ivdnt.galahad.app.ExpensiveGettable +import org.ivdnt.galahad.app.User +import org.ivdnt.galahad.app.executeAndLogTime +import org.ivdnt.galahad.data.document.Document +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.document.Documents +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME +import org.ivdnt.galahad.jobs.Jobs +import org.ivdnt.galahad.port.CmdiMetadata +import org.ivdnt.galahad.port.CorpusTransformMetadata +import org.ivdnt.galahad.taggers.Taggers +import org.ivdnt.galahad.util.createZipFile +import java.io.File +import java.io.OutputStream +import java.nio.file.Files +import java.util.* +import kotlin.io.path.createTempDirectory + +/** + * A corpus is a collection of documents, metadata and jobs, saved to a folder. The folder contents are: + * + * - documents/: a folder containing all documents in the corpus. Represented by [Documents]. + * - jobs/: a folder containing all jobs that were active at some point in the corpus. The sourceLayer is one of them. Represented by [Jobs]. + * - metadata: a cache file storing [MutableCorpusMetadata] about the corpus. + * - metadata.cache: a cache file storing [CorpusMetadata] about the corpus. + * + * A Corpus has an owner, who can add collaborators and viewers. + * Collaborators have read and write access (and can add viewers and collaborators). + * Viewers have read access. + * Admins have access to all corpora with read and write access. + */ +class Corpus( + workDirectory: File, +) : BaseFileSystemStore(workDirectory) { + + val documents = Documents(workDirectory.resolve("documents")) + + // Make sure this is initialized before accessing metadata + private val fileBackedMetadata: FileBackedValue + get() = FileBackedValue(workDirectory.resolve("metadata"), MutableCorpusMetadata.initValue()) + + /** + * Convenient access to [MutableCorpusMetadata] without the need to get the expensive [CorpusMetadata] + * When uploading docs, for example, all we need to know is if the user has permission. + */ + var mutableCorpusMetadata: MutableCorpusMetadata + get() = fileBackedMetadata.read() + private set(value) = fileBackedMetadata.modify { value } + + private val metadataCache = object : FileBackedCache( + file = getMetadataFile(), initValue = CorpusMetadata() + ) { + override fun isValid(lastModified: Long): Boolean { + return lastModified >= fileBackedMetadata.lastModified + } + + override fun set(): CorpusMetadata { + return CorpusMetadata( + // Mutable fields + owner = mutableCorpusMetadata.owner, + name = mutableCorpusMetadata.name, + eraTo = mutableCorpusMetadata.eraTo, + eraFrom = mutableCorpusMetadata.eraFrom, + tagset = mutableCorpusMetadata.tagset, + dataset = mutableCorpusMetadata.isDataset, + public = mutableCorpusMetadata.isDataset, // Note that we set isPublic the same as isDataset. + collaborators = mutableCorpusMetadata.collaborators ?: setOf(), + viewers = mutableCorpusMetadata.viewers ?: setOf(), + sourceName = mutableCorpusMetadata.sourceName, + sourceURL = mutableCorpusMetadata.sourceURL, + // Immutable/calculated fields + uuid = UUID.fromString(workDirectory.name), + activeJobs = jobs.readAll().filter { it.isActive }.size, + numDocs = documents.readAll().size, + sizeInBytes = workDirectory.walkTopDown().filter { it.isFile }.map { it.length() }.sum(), // expensive + lastModified = System.currentTimeMillis(), + ) + } + } + + private fun getMetadataFile() = workDirectory.resolve("metadata.cache") + + /** Invalidate cache when new documents are uploaded or job activity changes */ + fun invalidateCache() { + getMetadataFile().delete() + } + + val metadata: ExpensiveGettable = object : ExpensiveGettable { + override fun expensiveGet() = metadataCache.get() + } + + val sourceTagger: ExpensiveGettable = object : ExpensiveGettable { + override fun expensiveGet(): Taggers.Summary { + val metadata = metadata.expensiveGet() + return Taggers.Summary( + id = SOURCE_LAYER_NAME, + description = "uploaded annotations", + tagset = metadata.tagset, + eraFrom = metadata.eraFrom, + eraTo = metadata.eraTo, + produces = setOf("TODO"), + ) + } + } + + // Note: this is somewhat inefficient, since have to get the sourceTagger, even though we might not use it. + val jobs get() = Jobs(workDirectory.resolve("jobs"), this) + + fun delete() { + workDirectory.deleteRecursively() + } + + /** + * Overwrite the [CorpusMetadata] in [metadata] with [newMeta], + * except for the owner, which should be grabbed from the existing [metadata]. + * + * If a user appears multiple times in the permission hierarchy, only the upper level remains. + */ + fun updateMetadata(newMeta: MutableCorpusMetadata, user: User): ExpensiveGettable { + if (!mutableCorpusMetadata.isPublic && newMeta.isPublic) { + // Corpus is being set to public + if (!mutableCorpusMetadata.canMakePublic(user)) { + throw Exception("Unauthorized") + } + } + if (mutableCorpusMetadata.collaborators != newMeta.collaborators || mutableCorpusMetadata.viewers != newMeta.viewers) { + // Collaborators have changed + if (!mutableCorpusMetadata.canAddNewUsers(user) && mutableCorpusMetadata.owner != "") { + throw Exception("Unauthorized") + } + } + // If mutableCorpusMetadata.owner is "", we are working with the InitValue of FileBackedValue, + // so the updateMetadata call is initializing the corpus. + val owner = if (mutableCorpusMetadata.owner == "") user.id else mutableCorpusMetadata.owner + // Overwrite the owner with the original, so collaborators can't change it. + newMeta.owner = owner + + // Trim textual intputs + newMeta.name = newMeta.name.trim() + newMeta.sourceName = newMeta.sourceName?.trim() + newMeta.tagset = newMeta.tagset?.trim() + newMeta.collaborators = newMeta.collaborators?.map { it.trim() }?.toSet() + newMeta.viewers = newMeta.viewers?.map { it.trim() }?.toSet() + + // merge isPublic and isDataset + newMeta.isPublic = newMeta.isDataset + + // Remove owner from list of collaborators & viewers + newMeta.collaborators = newMeta.collaborators?.filter { it != owner }?.toSet() + newMeta.viewers = newMeta.viewers?.filter { it != owner }?.toSet() + // Remove collaborators from list of viewers + if (newMeta.collaborators != null) newMeta.viewers = newMeta.viewers?.filter { + !newMeta.collaborators!!.contains(it) + }?.toSet() + + mutableCorpusMetadata = newMeta + return metadata + } + + fun removeAsViewer(user: User) { + fileBackedMetadata.modify { + val viewers = it.viewers?.toMutableSet() + viewers?.removeIf { i -> i == user.id } + it.viewers = viewers + it + } + } + + fun removeAsCollaborator(user: User) { + fileBackedMetadata.modify { + val collaborators = it.collaborators?.toMutableSet() + collaborators?.removeIf { i -> i == user.id } + it.collaborators = collaborators + it + } + } + + /** + * Maps all [Document] found in [Documents] to the desired [DocumentFormat] and zips them. [formatMapper] should perform the mapping. + */ + fun getZipped( + ctm: CorpusTransformMetadata, + formatMapper: (Document) -> File, + filter: (Document) -> Boolean, + outputStream: OutputStream? = null, + ): File { + val name = metadata.expensiveGet().name + var zipFile: File? = null + val documents = documents.readAll().filter(filter) + executeAndLogTime("Generating $name zip") { + val convertedDocs = documents.asSequence().map(formatMapper) + val docsToCmdi = documents.asSequence().map { CmdiMetadata(ctm.documentMetadata(it.name)).file } + val cmdiZip = createZipFile(docsToCmdi, includeCMDI = true) + // rename the cmdiZip to "metadata" + val dest = File(createTempDirectory("metadata").toFile(), "metadata.zip") + Files.move(cmdiZip.toPath(), dest.toPath()) + zipFile = createZipFile(convertedDocs + dest, outputStream) + } + return zipFile!! + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/document/Document.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/document/Document.kt index 10b8112..b6ec005 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/document/Document.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/document/Document.kt @@ -5,7 +5,10 @@ import org.ivdnt.galahad.FileBackedCache import org.ivdnt.galahad.FileBackedValue import org.ivdnt.galahad.app.ExpensiveGettable import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.port.* +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.InternalFile +import org.ivdnt.galahad.port.PlainTextableFile +import org.ivdnt.galahad.port.SourceLayerableFile import org.ivdnt.galahad.port.conllu.export.LayerToConlluConverter import org.ivdnt.galahad.port.folia.export.LayerToFoliaConverter import org.ivdnt.galahad.port.naf.export.LayerToNAFConverter diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/document/Documents.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/document/Documents.kt index 680b66d..bccfe58 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/document/Documents.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/document/Documents.kt @@ -3,7 +3,6 @@ package org.ivdnt.galahad.data.document import org.ivdnt.galahad.BaseFileSystemStore import org.ivdnt.galahad.app.CRUDSet import org.ivdnt.galahad.data.DocumentWriteType -import org.ivdnt.galahad.data.DocumentsController import java.io.File /** diff --git a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt index 2aeddd8..66e63ad 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/data/layer/Term.kt @@ -1,102 +1,112 @@ -package org.ivdnt.galahad.data.layer - -import com.fasterxml.jackson.annotation.JsonIgnore -import com.fasterxml.jackson.annotation.JsonProperty - -/** Avoid empty strings in the CSV representation. */ -fun Term.toNonEmptyPair(): Pair { - return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA) -} - -/** - * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm]. - * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used. - * Lemma and pos can be null. - */ -data class Term( - @JsonProperty("lemma") val lemma: String?, - @JsonProperty("pos") val pos: String?, - @JsonProperty("targets") val targets: MutableList, -) { - /** Whether the lemma is not null. */ - @get:JsonIgnore - val hasLemma: Boolean = lemma != null - - /** Whether the pos is not null. */ - @get:JsonIgnore - val hasPOS: Boolean = pos != null - - @get:JsonIgnore - val posHeadGroupOrDefault - get() = posHeadGroup ?: NO_POS - - @get:JsonIgnore - val lemmaOrDefault - get() = lemma ?: NO_LEMMA - - /** Whether this term refers to multiple [WordForm]. */ - @get:JsonIgnore - val isMultiTarget = targets.size > 1 - - /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */ - @get:JsonIgnore - val posHead: String? = posToPosHead(pos) - - @get:JsonIgnore - val isMultiPos: Boolean = pos?.contains("+") ?: false - - /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */ - @get:JsonIgnore - val posHeadGroup: String? = run { - // Split on + - if (!isMultiPos) return@run posHead - val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+") - result - } - - - - /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */ - @get:JsonIgnore - val posFeatures: String? - get() { - if (pos == null) return null - val featureStart: Int = pos?.indexOf('(') ?: -1 - val featureEnd: Int = pos?.indexOf(')') ?: -1 - return if (featureStart != -1 && featureEnd != -1) { - return pos!!.slice(featureStart + 1 until featureEnd) - } else null - } - - /** Offset of the first [WordForm] in [targets].*/ - @get:JsonIgnore - val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1 - - /** String constructed from all the [WordForm] in [targets]. */ - @get:JsonIgnore - val literals: String - get() = targets.joinToString(" ") { it.literal } - - companion object { - const val NO_POS = "NO_POS" - const val NO_LEMMA = "NO_LEMMA" - val EMPTY = Term(null, null, mutableListOf()) - private fun posToPosHead(pos: String?): String? { - return if (pos == null) { - null - } else if (pos.contains('(')) { - // pos contains a non-letter non-digit character - val headEnd = pos.indexOf('(') - val head = pos.slice(0 until headEnd) - if (head.isEmpty()) { - pos // pos is non-empty and starts with a non-letter character, e.g.: _ - } else { - head - } - } else { - // pos is 0 or more letters only - pos - } - } - } +package org.ivdnt.galahad.data.layer + +import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.annotation.JsonProperty + +/** Avoid empty strings in the CSV representation. */ +fun Term.toNonEmptyPair(): Pair { + return (this.pos ?: Term.NO_POS) to (this.lemma ?: Term.NO_LEMMA) +} + +/** + * A term in a [Layer]. A term has a [lemma], a [pos] and refers to one or multiple [WordForm]. + * Referring to multiple [WordForm] is used to represent multi-word terms, although it is currently not used. + * Lemma and pos can be null. + */ +data class Term( + @JsonProperty("lemma") val lemma: String?, + @JsonProperty("pos") val pos: String?, + @JsonProperty("targets") val targets: MutableList, +) { + /** Whether the lemma is not null. */ + @get:JsonIgnore + val hasLemma: Boolean = lemma != null + + /** Whether the pos is not null. */ + @get:JsonIgnore + val hasPOS: Boolean = pos != null + + @get:JsonIgnore + val posHeadGroupOrDefault + get() = posHeadGroup ?: NO_POS + + @get:JsonIgnore + val lemmaOrDefault + get() = lemma ?: NO_LEMMA + + @get:JsonIgnore + val lemmaOrEmpty + get() = lemma ?: "" + + @get:JsonIgnore + val posOrEmpty + get() = pos ?: "" + + /** Whether this term refers to multiple [WordForm]. */ + @get:JsonIgnore + val isMultiTarget = targets.size > 1 + + /** The head of the first [pos]. E.g. "PD" for "PD(type=art)+NOU(num=sg)". */ + @get:JsonIgnore + val posHead: String? = posToPosHead(pos) + + @get:JsonIgnore + val isMultiPos: Boolean = pos?.contains("+") ?: false + + /** The head of all [pos]. E.g. "PD+NOU" for "PD(type=art)+NOU(num=sg)". */ + @get:JsonIgnore + val posHeadGroup: String? = run { + // Split on + + if (!isMultiPos) return@run posHead + val result: String? = pos?.split("+")?.mapNotNull { posToPosHead(it) }?.joinToString("+") + result + } + + @get:JsonIgnore + val posHeadGroupOrEmpty + get() = posHeadGroup ?: "" + + /** The features of [pos]. E.g. "num=sg" for "NOU(num=sg)". Does not support multi-pos. */ + @get:JsonIgnore + val posFeatures: String? + get() { + if (pos == null) return null + val featureStart: Int = pos.indexOf('(') + val featureEnd: Int = pos.indexOf(')') + return if (featureStart != -1 && featureEnd != -1) { + return pos.slice(featureStart + 1 until featureEnd) + } else null + } + + /** Offset of the first [WordForm] in [targets].*/ + @get:JsonIgnore + val firstOffset get() = targets.minOfOrNull { it.offset } ?: -1 + + /** String constructed from all the [WordForm] in [targets]. */ + @get:JsonIgnore + val literals: String + get() = targets.joinToString(" ") { it.literal } + + companion object { + const val NO_POS = "NO_POS" + const val NO_LEMMA = "NO_LEMMA" + val EMPTY = Term(null, null, mutableListOf()) + private fun posToPosHead(pos: String?): String? { + return if (pos == null) { + null + } else if (pos.contains('(')) { + // pos contains a non-letter non-digit character + val headEnd = pos.indexOf('(') + val head = pos.slice(0 until headEnd) + if (head.isEmpty()) { + pos // pos is non-empty and starts with a non-letter character, e.g.: _ + } else { + head + } + } else { + // pos is 0 or more letters only + pos + } + } + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/EvaluationController.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/EvaluationController.kt index 1eb858f..eed3d90 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/EvaluationController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/EvaluationController.kt @@ -1,256 +1,256 @@ -package org.ivdnt.galahad.evaluation - -import jakarta.servlet.http.HttpServletRequest -import jakarta.servlet.http.HttpServletResponse -import org.apache.logging.log4j.kotlin.Logging -import org.ivdnt.galahad.app.* -import org.ivdnt.galahad.data.CorporaController -import org.ivdnt.galahad.data.corpus.CorpusMetadata -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME -import org.ivdnt.galahad.evaluation.comparison.ConfusionLayerFilter -import org.ivdnt.galahad.evaluation.comparison.PosLemmaTermFilter -import org.ivdnt.galahad.evaluation.comparison.MetricsLayerFilter -import org.ivdnt.galahad.evaluation.confusion.Confusion -import org.ivdnt.galahad.evaluation.confusion.CorpusConfusion -import org.ivdnt.galahad.evaluation.distribution.CorpusDistribution -import org.ivdnt.galahad.evaluation.metrics.CorpusMetrics -import org.ivdnt.galahad.evaluation.metrics.MetricsSettings -import org.ivdnt.galahad.evaluation.metrics.METRIC_TYPES -import org.ivdnt.galahad.evaluation.metrics.PosByPosMetricsSettings -import org.ivdnt.galahad.port.csv.CSVFile -import org.ivdnt.galahad.util.createZipFile -import org.ivdnt.galahad.util.setContentDisposition -import org.springframework.beans.factory.annotation.Autowired -import org.springframework.web.bind.annotation.* -import java.io.File -import java.time.LocalDateTime -import java.time.format.DateTimeFormatter -import java.util.* -import kotlin.io.path.createTempDirectory - -const val DISTRIBUTION_MAX_SIZE = 1000 - -@RestController -class EvaluationController( - val corpora: CorporaController, -) : Logging { - - @Autowired - private val request: HttpServletRequest? = null - - @Autowired - private val response: HttpServletResponse? = null - - @GetMapping(DISTRIBUTION_URL) - @CrossOrigin - fun getDistribution( - @PathVariable corpus: UUID, - @PathVariable job: String, - ): CorpusDistribution { - logger.info("Get distribution for hypothesis layer $job in $corpus") - return CorpusDistribution( - corpora.getReadAccessOrThrow(corpus, request), - job - ).trim(DISTRIBUTION_MAX_SIZE) as CorpusDistribution - } - - @GetMapping(METRICS_URL) - @CrossOrigin - fun getMetrics( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam reference: String?, - ): CorpusMetrics { - logger.info("Get metrics for reference layer $reference and hypothesis layer $job in $corpus") - - val cm = CorpusMetrics( - corpora.getReadAccessOrThrow(corpus, request), - settings = METRIC_TYPES, - hypothesis = job, - reference = if (reference.isNullOrBlank()) SOURCE_LAYER_NAME else reference - ) - return cm - } - - @GetMapping(METRICS_CSV_URL) - @CrossOrigin - fun getMetricsSamples( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam reference: String, - @RequestParam setting: String, - @RequestParam("class") classType: String, - @RequestParam group: String? - ): ByteArray { - val setting = METRIC_TYPES.first { it.id == setting } - - val layerFilter: MetricsLayerFilter? = getLayerFilter(group, setting) - - val cm = CorpusMetrics( - corpus = corpora.getReadAccessOrThrow(corpus, request), - hypothesis = job, - reference = reference, - layerFilter = layerFilter, - truncate = false, - settings = listOf(setting), - ) - val mt = cm.metricTypes.values.first() - - if (group != null) { - val fileName = "metrics-$reference-$job-${classType}-${group}.csv" - val csv = mt.samplesToCsv(group, classType) - return samplesToZip(corpus, job, reference, csv, fileName) - } else { - val fileName = "metrics-$reference-$job-${mt.setting.id}-${classType}.csv" - val csv = mt.samplesToCsv(classType) - return samplesToZip(corpus, job, reference, csv, fileName) - } - } - - private fun getLayerFilter(group: String?, setting: MetricsSettings): MetricsLayerFilter? { - val layerFilter: MetricsLayerFilter? - if (group != null) { - val hypoFilter: PosLemmaTermFilter - val refFilter: PosLemmaTermFilter - if (setting is PosByPosMetricsSettings) { - hypoFilter = PosLemmaTermFilter(group, null) - refFilter = PosLemmaTermFilter(group, null) - } else { - hypoFilter = PosLemmaTermFilter(null, group) - refFilter = PosLemmaTermFilter(null, group) - } - layerFilter = MetricsLayerFilter(hypoFilter, refFilter) - } else { - layerFilter = null - } - return layerFilter - } - - fun samplesToZip( - corpus: UUID, - job: String, - reference: String?, - csvBody: String, - fileName: String - ): ByteArray { - // Create csv file. - val dir: File = createTempDirectory("samples").toFile() - val file = CSVFile(dir.resolve(fileName)) - file.appendText(CSVFile.toCSVHeader(listOf("token","$reference lemma","$reference pos","$job lemma","$job pos"))) - file.appendText(csvBody) - // Write metadata & create zip - val metadata = writeMetadataToDir(corpus, job, reference, dir) - val zipFile = createZipFile(dir.listFiles()!!.asSequence()) - // Configure response for zip. - response!!.contentType = "application/zip" - response.setContentDisposition(metadata.name + "-evaluation.zip") - // zip the directory - return zipFile.readBytes() - } - - @GetMapping(CONFUSION_URL) - @CrossOrigin - fun getConfusion( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam reference: String?, - ): Confusion { - logger.info("Get confusion for reference layer $reference and hypothesis layer $job in $corpus") - return CorpusConfusion( - corpora.getReadAccessOrThrow(corpus, request), - hypothesis = job, - reference = if (reference.isNullOrBlank()) SOURCE_LAYER_NAME else reference, - ) - } - - @GetMapping(CONFUSION_CSV_URL) - @CrossOrigin - fun getConfusionSamples( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam reference: String, - @RequestParam hypoPosFilter: String?, - @RequestParam refPosFilter: String?, - ): ByteArray { - val cc = CorpusConfusion( - corpus = corpora.getReadAccessOrThrow(corpus, request), - hypothesis = job, - reference = reference, - layerFilter = ConfusionLayerFilter( - PosLemmaTermFilter(hypoPosFilter, null), - PosLemmaTermFilter(refPosFilter, null) - ) - ) - val fileName = "confusion-${refPosFilter}-${hypoPosFilter}.csv" - val csv = cc.samplesToCSV() - return samplesToZip(corpus, job, reference, csv, fileName) - } - - @GetMapping(EVALUATION_CSV_URL) - @ResponseBody - @CrossOrigin - fun download( - @PathVariable corpus: UUID, - @PathVariable job: String, - @RequestParam reference: String?, - @RequestParam hypothesisPos: String?, - @RequestParam referencePos: String?, - ): ByteArray { - if (reference != null && hypothesisPos != null && referencePos != null) - return getConfusionSamples(corpus, job, reference, hypothesisPos, referencePos) - - return executeAndLogTime("GetEvaluationCSVs") { - val dir: File = createTempDirectory("evaluation").toFile() - createDistributionCsv(dir, corpus, job) - if (reference != null) { - createMetricsCsv(dir, corpus, job, reference) - createConfusionCsv(dir, corpus, job, reference) - } - val metadata = writeMetadataToDir(corpus, job, reference, dir) - response!!.contentType = "application/zip" - response.setContentDisposition(metadata.name + "-evaluation.zip") - - // zip the directory - val zipFile = createZipFile(dir.listFiles()!!.asSequence()) - zipFile.readBytes() - } - } - - private fun createConfusionCsv(dir: File, corpus: UUID, job: String, reference: String?) { - val file = CSVFile(dir.resolve("confusion.csv")) - file.appendText(getConfusion(corpus, job, reference).countsToCSV()) - } - - private fun createMetricsCsv(dir: File, corpus: UUID, job: String, reference: String?) { - val metrics = getMetrics(corpus, job = job, reference = reference) - val globFile = CSVFile(dir.resolve("metrics-global.csv")) - globFile.appendText(metrics.toGlobalCsv()) - - metrics.metricTypes.values.forEach { mt -> - val file = CSVFile(dir.resolve("metrics-${mt.setting.id}.csv")) - file.appendText(mt.toGroupedCsv()) - } - } - - private fun createDistributionCsv(dir: File, corpus: UUID, job: String) { - val file = CSVFile(dir.resolve("distribution.csv")) - file.appendText(getDistribution(corpus, job).toCSV()) - } - - private fun writeMetadataToDir( - corpus: UUID, job: String, reference: String?, dir: File, - ): CorpusMetadata { - val corpus = corpora.getReadAccessOrThrow(corpus, request) - val metadata = corpus.metadata.expensiveGet() - - val metadataFile = File(dir.resolve("metadata.txt").toURI()) - metadataFile.appendText("Evaluation generated by Galahad\n") - metadataFile.appendText("${LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}\n") - metadataFile.appendText("Corpus: ${metadata.name}\n") - metadataFile.appendText("Documents: ${metadata.numDocs}\n") - metadataFile.appendText("Era: ${metadata.eraFrom}-${metadata.eraTo}\n") - metadataFile.appendText("Hypothesis: $job\n") - if (reference != null) metadataFile.appendText("Reference: $reference\n") - return metadata - } +package org.ivdnt.galahad.evaluation + +import jakarta.servlet.http.HttpServletRequest +import jakarta.servlet.http.HttpServletResponse +import org.apache.logging.log4j.kotlin.Logging +import org.ivdnt.galahad.app.* +import org.ivdnt.galahad.data.CorporaController +import org.ivdnt.galahad.data.corpus.CorpusMetadata +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME +import org.ivdnt.galahad.evaluation.comparison.ConfusionLayerFilter +import org.ivdnt.galahad.evaluation.comparison.MetricsLayerFilter +import org.ivdnt.galahad.evaluation.comparison.PosLemmaTermFilter +import org.ivdnt.galahad.evaluation.confusion.Confusion +import org.ivdnt.galahad.evaluation.confusion.CorpusConfusion +import org.ivdnt.galahad.evaluation.distribution.CorpusDistribution +import org.ivdnt.galahad.evaluation.metrics.CorpusMetrics +import org.ivdnt.galahad.evaluation.metrics.METRIC_TYPES +import org.ivdnt.galahad.evaluation.metrics.MetricsSettings +import org.ivdnt.galahad.evaluation.metrics.PosByPosMetricsSettings +import org.ivdnt.galahad.port.csv.CSVFile +import org.ivdnt.galahad.util.createZipFile +import org.ivdnt.galahad.util.setContentDisposition +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.web.bind.annotation.* +import java.io.File +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import java.util.* +import kotlin.io.path.createTempDirectory + +const val DISTRIBUTION_MAX_SIZE = 1000 + +@RestController +class EvaluationController( + val corpora: CorporaController, +) : Logging { + + @Autowired + private val request: HttpServletRequest? = null + + @Autowired + private val response: HttpServletResponse? = null + + @GetMapping(DISTRIBUTION_URL) + @CrossOrigin + fun getDistribution( + @PathVariable corpus: UUID, + @PathVariable job: String, + ): CorpusDistribution { + logger.info("Get distribution for hypothesis layer $job in $corpus") + return CorpusDistribution( + corpora.getReadAccessOrThrow(corpus, request), + job + ).trim(DISTRIBUTION_MAX_SIZE) as CorpusDistribution + } + + @GetMapping(METRICS_URL) + @CrossOrigin + fun getMetrics( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam reference: String?, + ): CorpusMetrics { + logger.info("Get metrics for reference layer $reference and hypothesis layer $job in $corpus") + + val cm = CorpusMetrics( + corpora.getReadAccessOrThrow(corpus, request), + settings = METRIC_TYPES, + hypothesis = job, + reference = if (reference.isNullOrBlank()) SOURCE_LAYER_NAME else reference + ) + return cm + } + + @GetMapping(METRICS_CSV_URL) + @CrossOrigin + fun getMetricsSamples( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam reference: String, + @RequestParam setting: String, + @RequestParam("class") classType: String, + @RequestParam group: String? + ): ByteArray { + val setting = METRIC_TYPES.first { it.id == setting } + + val layerFilter: MetricsLayerFilter? = getLayerFilter(group, setting) + + val cm = CorpusMetrics( + corpus = corpora.getReadAccessOrThrow(corpus, request), + hypothesis = job, + reference = reference, + layerFilter = layerFilter, + truncate = false, + settings = listOf(setting), + ) + val mt = cm.metricTypes.values.first() + + if (group != null) { + val fileName = "metrics-$reference-$job-${classType}-${group}.csv" + val csv = mt.samplesToCsv(group, classType) + return samplesToZip(corpus, job, reference, csv, fileName) + } else { + val fileName = "metrics-$reference-$job-${mt.setting.id}-${classType}.csv" + val csv = mt.samplesToCsv(classType) + return samplesToZip(corpus, job, reference, csv, fileName) + } + } + + private fun getLayerFilter(group: String?, setting: MetricsSettings): MetricsLayerFilter? { + val layerFilter: MetricsLayerFilter? + if (group != null) { + val hypoFilter: PosLemmaTermFilter + val refFilter: PosLemmaTermFilter + if (setting is PosByPosMetricsSettings) { + hypoFilter = PosLemmaTermFilter(group, null) + refFilter = PosLemmaTermFilter(group, null) + } else { + hypoFilter = PosLemmaTermFilter(null, group) + refFilter = PosLemmaTermFilter(null, group) + } + layerFilter = MetricsLayerFilter(hypoFilter, refFilter) + } else { + layerFilter = null + } + return layerFilter + } + + fun samplesToZip( + corpus: UUID, + job: String, + reference: String?, + csvBody: String, + fileName: String + ): ByteArray { + // Create csv file. + val dir: File = createTempDirectory("samples").toFile() + val file = CSVFile(dir.resolve(fileName)) + file.appendText(CSVFile.toCSVHeader(listOf("token","$reference lemma","$reference pos","$job lemma","$job pos"))) + file.appendText(csvBody) + // Write metadata & create zip + val metadata = writeMetadataToDir(corpus, job, reference, dir) + val zipFile = createZipFile(dir.listFiles()!!.asSequence()) + // Configure response for zip. + response!!.contentType = "application/zip" + response.setContentDisposition(metadata.name + "-evaluation.zip") + // zip the directory + return zipFile.readBytes() + } + + @GetMapping(CONFUSION_URL) + @CrossOrigin + fun getConfusion( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam reference: String?, + ): Confusion { + logger.info("Get confusion for reference layer $reference and hypothesis layer $job in $corpus") + return CorpusConfusion( + corpora.getReadAccessOrThrow(corpus, request), + hypothesis = job, + reference = if (reference.isNullOrBlank()) SOURCE_LAYER_NAME else reference, + ) + } + + @GetMapping(CONFUSION_CSV_URL) + @CrossOrigin + fun getConfusionSamples( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam reference: String, + @RequestParam hypoPosFilter: String?, + @RequestParam refPosFilter: String?, + ): ByteArray { + val cc = CorpusConfusion( + corpus = corpora.getReadAccessOrThrow(corpus, request), + hypothesis = job, + reference = reference, + layerFilter = ConfusionLayerFilter( + PosLemmaTermFilter(hypoPosFilter, null), + PosLemmaTermFilter(refPosFilter, null) + ) + ) + val fileName = "confusion-${refPosFilter}-${hypoPosFilter}.csv" + val csv = cc.samplesToCSV() + return samplesToZip(corpus, job, reference, csv, fileName) + } + + @GetMapping(EVALUATION_CSV_URL) + @ResponseBody + @CrossOrigin + fun download( + @PathVariable corpus: UUID, + @PathVariable job: String, + @RequestParam reference: String?, + @RequestParam hypothesisPos: String?, + @RequestParam referencePos: String?, + ): ByteArray { + if (reference != null && hypothesisPos != null && referencePos != null) + return getConfusionSamples(corpus, job, reference, hypothesisPos, referencePos) + + return executeAndLogTime("GetEvaluationCSVs") { + val dir: File = createTempDirectory("evaluation").toFile() + createDistributionCsv(dir, corpus, job) + if (reference != null) { + createMetricsCsv(dir, corpus, job, reference) + createConfusionCsv(dir, corpus, job, reference) + } + val metadata = writeMetadataToDir(corpus, job, reference, dir) + response!!.contentType = "application/zip" + response.setContentDisposition(metadata.name + "-evaluation.zip") + + // zip the directory + val zipFile = createZipFile(dir.listFiles()!!.asSequence()) + zipFile.readBytes() + } + } + + private fun createConfusionCsv(dir: File, corpus: UUID, job: String, reference: String?) { + val file = CSVFile(dir.resolve("confusion.csv")) + file.appendText(getConfusion(corpus, job, reference).countsToCSV()) + } + + private fun createMetricsCsv(dir: File, corpus: UUID, job: String, reference: String?) { + val metrics = getMetrics(corpus, job = job, reference = reference) + val globFile = CSVFile(dir.resolve("metrics-global.csv")) + globFile.appendText(metrics.toGlobalCsv()) + + metrics.metricTypes.values.forEach { mt -> + val file = CSVFile(dir.resolve("metrics-${mt.setting.id}.csv")) + file.appendText(mt.toGroupedCsv()) + } + } + + private fun createDistributionCsv(dir: File, corpus: UUID, job: String) { + val file = CSVFile(dir.resolve("distribution.csv")) + file.appendText(getDistribution(corpus, job).toCSV()) + } + + private fun writeMetadataToDir( + corpus: UUID, job: String, reference: String?, dir: File, + ): CorpusMetadata { + val corpus = corpora.getReadAccessOrThrow(corpus, request) + val metadata = corpus.metadata.expensiveGet() + + val metadataFile = File(dir.resolve("metadata.txt").toURI()) + metadataFile.appendText("Evaluation generated by Galahad\n") + metadataFile.appendText("${LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"))}\n") + metadataFile.appendText("Corpus: ${metadata.name}\n") + metadataFile.appendText("Documents: ${metadata.numDocs}\n") + metadataFile.appendText("Era: ${metadata.eraFrom}-${metadata.eraTo}\n") + metadataFile.appendText("Hypothesis: $job\n") + if (reference != null) metadataFile.appendText("Reference: $reference\n") + return metadata + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/assays/AssaysController.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/assays/AssaysController.kt index 4fdfe1e..59a1ba8 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/assays/AssaysController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/assays/AssaysController.kt @@ -1,106 +1,105 @@ -package org.ivdnt.galahad.evaluation.assays - -import jakarta.servlet.http.HttpServletRequest -import jakarta.servlet.http.HttpServletResponse -import org.apache.logging.log4j.kotlin.Logging -import org.ivdnt.galahad.FileBackedCache -import org.ivdnt.galahad.app.ASSAYS_URL -import org.ivdnt.galahad.app.ASSAY_URL -import org.ivdnt.galahad.data.CorporaController -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME -import org.ivdnt.galahad.evaluation.metrics.FlatMetricType -import org.ivdnt.galahad.evaluation.metrics.FlatMetricTypeAssay -import org.springframework.beans.factory.annotation.Autowired -import org.springframework.web.bind.annotation.* -import java.util.* -import kotlin.collections.HashMap - -/** - * { - * "dataset-1": { - * "posByPos": { - * "tagger-1": { - * "micro": { ... }, "macro": { ... } - * }, - * "tagger-2": { ... }, - * }, - * "lemmaByLemma": { ... }, - * }, - * "dataset-2": { ... }, - * } - */ -typealias AssaysMatrix = Map> -typealias MutableAssaysMatrix = MutableMap>> - -@RestController -class AssaysController( - val corpora: CorporaController, -) : Logging { - - @Autowired - private val request: HttpServletRequest? = null - - @Autowired - private val response: HttpServletResponse? = null - - /** - * A matrix of 'tagger' -> 'dataset' -> 'FlatMetric' -> 'scores per category', - * for all datasets corpora that have been tagged with at least one tagger, excluding the sourceLayer. - */ - val assaysMatrix = object : FileBackedCache(corpora.assaysFile, HashMap()) { - override fun isValid(lastModified: Long): Boolean { - return corpora.datasets.firstOrNull { it.lastModified > lastModified } == null - TODO("Maybe just check the validity of the other assays?") - } - - override fun set(): AssaysMatrix { - // tagger -> dataset -> assay - val assaysMatrix: MutableAssaysMatrix = HashMap() - // For all datasets - corpora.datasets.forEach { dataset -> - // For all jobs in the dataset - dataset.jobs.readAll() - // Skip the source layer - .filter { it.name != SOURCE_LAYER_NAME } - // Add the assay to the matrix - .forEach { job -> - val meta = dataset.metadata.expensiveGet() - // Initialize the dataset row if needed - if (assaysMatrix[meta.name] == null) { - assaysMatrix[meta.name] = HashMap() - } - val assay = getAssay(meta.uuid, job.name) - assay?.forEach { - assaysMatrix[meta.name]?.putIfAbsent(it.key, HashMap()) - assaysMatrix[meta.name]?.get(it.key)?.put(job.name, it.value) - } - } - } - return assaysMatrix - } - } - - /** - * Get the assay for a single job in a specific corpus. Also used to construct [assaysMatrix]. - */ - @GetMapping(ASSAY_URL) - @CrossOrigin - fun getAssay( - @PathVariable corpus: UUID, - @PathVariable job: String, - ): FlatMetricTypeAssay? { - // The assay is some simple, preferably plain numerical, value that indicates preformance - // Since the exact requirements for the definition of an assays might still change - // we don't provide a solid definition, but instead it is defined ad hoc here - return corpora.getReadAccessOrThrow(corpus, request).jobs.readOrNull(job)?.assay?.get() - } - - /** - * Return [assaysMatrix]. - */ - @RequestMapping(value = [ASSAYS_URL], method = [RequestMethod.GET], produces = ["application/json"]) - @CrossOrigin - fun getAssays(): AssaysMatrix { - return assaysMatrix.get() - } +package org.ivdnt.galahad.evaluation.assays + +import jakarta.servlet.http.HttpServletRequest +import jakarta.servlet.http.HttpServletResponse +import org.apache.logging.log4j.kotlin.Logging +import org.ivdnt.galahad.FileBackedCache +import org.ivdnt.galahad.app.ASSAYS_URL +import org.ivdnt.galahad.app.ASSAY_URL +import org.ivdnt.galahad.data.CorporaController +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME +import org.ivdnt.galahad.evaluation.metrics.FlatMetricType +import org.ivdnt.galahad.evaluation.metrics.FlatMetricTypeAssay +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.web.bind.annotation.* +import java.util.* + +/** + * { + * "dataset-1": { + * "posByPos": { + * "tagger-1": { + * "micro": { ... }, "macro": { ... } + * }, + * "tagger-2": { ... }, + * }, + * "lemmaByLemma": { ... }, + * }, + * "dataset-2": { ... }, + * } + */ +typealias AssaysMatrix = Map> +typealias MutableAssaysMatrix = MutableMap>> + +@RestController +class AssaysController( + val corpora: CorporaController, +) : Logging { + + @Autowired + private val request: HttpServletRequest? = null + + @Autowired + private val response: HttpServletResponse? = null + + /** + * A matrix of 'tagger' -> 'dataset' -> 'FlatMetric' -> 'scores per category', + * for all datasets corpora that have been tagged with at least one tagger, excluding the sourceLayer. + */ + val assaysMatrix = object : FileBackedCache(corpora.assaysFile, HashMap()) { + override fun isValid(lastModified: Long): Boolean { + return corpora.datasets.firstOrNull { it.lastModified > lastModified } == null + TODO("Maybe just check the validity of the other assays?") + } + + override fun set(): AssaysMatrix { + // tagger -> dataset -> assay + val assaysMatrix: MutableAssaysMatrix = HashMap() + // For all datasets + corpora.datasets.forEach { dataset -> + // For all jobs in the dataset + dataset.jobs.readAll() + // Skip the source layer + .filter { it.name != SOURCE_LAYER_NAME } + // Add the assay to the matrix + .forEach { job -> + val meta = dataset.metadata.expensiveGet() + // Initialize the dataset row if needed + if (assaysMatrix[meta.name] == null) { + assaysMatrix[meta.name] = HashMap() + } + val assay = getAssay(meta.uuid, job.name) + assay?.forEach { + assaysMatrix[meta.name]?.putIfAbsent(it.key, HashMap()) + assaysMatrix[meta.name]?.get(it.key)?.put(job.name, it.value) + } + } + } + return assaysMatrix + } + } + + /** + * Get the assay for a single job in a specific corpus. Also used to construct [assaysMatrix]. + */ + @GetMapping(ASSAY_URL) + @CrossOrigin + fun getAssay( + @PathVariable corpus: UUID, + @PathVariable job: String, + ): FlatMetricTypeAssay? { + // The assay is some simple, preferably plain numerical, value that indicates preformance + // Since the exact requirements for the definition of an assays might still change + // we don't provide a solid definition, but instead it is defined ad hoc here + return corpora.getReadAccessOrThrow(corpus, request).jobs.readOrNull(job)?.assay?.get() + } + + /** + * Return [assaysMatrix]. + */ + @RequestMapping(value = [ASSAYS_URL], method = [RequestMethod.GET], produces = ["application/json"]) + @CrossOrigin + fun getAssays(): AssaysMatrix { + return assaysMatrix.get() + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt index e02745c..b0b174c 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/LayerComparison.kt @@ -1,151 +1,156 @@ -package org.ivdnt.galahad.evaluation.comparison - -import com.fasterxml.jackson.annotation.JsonIgnore -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.data.layer.Term - -fun ListIterator.nextOrNull(): Term? { - val iter = iterator() - return if (iter.hasNext()) iter.next() else null -} - -// Some hardcoded punctuation -val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"") - -/** - * Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma). - * When pos filters are provide, only match [TermComparison] of that pos. - */ -class LayerComparison( - private val hypothesisLayer: Layer, - private val referenceLayer: Layer, - private val layerFilter: LayerFilter? = null, -) { - @JsonIgnore - val matches: MutableList = ArrayList() - - @JsonIgnore - val referenceTermsWithoutMatches: MutableList = ArrayList() - - @JsonIgnore - val hypothesisTermsWithoutMatches: MutableList = ArrayList() - @JsonIgnore - private val hypoIter: ListIterator = iterForTermsInLayer(hypothesisLayer) - @JsonIgnore - private val refIter: ListIterator = iterForTermsInLayer(referenceLayer) - @JsonIgnore - private var currentHypoTerm: Term? = Term.EMPTY - @JsonIgnore - private var currentRefTerm: Term? = Term.EMPTY - - init { - if (refIter.hasNext() && hypoIter.hasNext()) { - compare() - } else { - hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms) - referenceTermsWithoutMatches.addAll(referenceLayer.terms) - } - } - - /** Iterate through the terms of both layers simultaneously and compare them. */ - private fun compare() { - // First terms - nextHypo() - nextRef() - // While there are next terms - while (currentHypoTerm != null && currentRefTerm != null) { - val comp = TermComparison(hypoTerm = currentHypoTerm!!, refTerm = currentRefTerm!!) - compareTerm(comp) - } - // One of the two could be non-null. These are not included in the remaining refIter. - currentHypoTerm?.let(::hypoNoMatch) - currentRefTerm?.let(::refNoMatch) - // The remaining terms have no matches - hypoIter.forEachRemaining(::hypoNoMatch) - refIter.forEachRemaining(::refNoMatch) - } - - private fun compareTerm(comp: TermComparison) { - // Act on the comparison - if (comp.fullOverlap) { - fullMatch(comp) - } else { - // Unequal first offset - if (comp.hypoTerm.firstOffset < comp.refTerm.firstOffset) { - hypoNoMatch() - } else if (comp.hypoTerm.firstOffset > comp.refTerm.firstOffset) { - refNoMatch() - } - // Equal first offset but no match. - // Try to truncate either terms to see if the last char is punctuation. - else if (symmetricTruncatedPcMatch(comp)) { - // If so, still match it. - fullMatch(comp) - } else { - hypoNoMatch() - refNoMatch() - } - } - } - - private fun fullMatch(termComparison: TermComparison) { - if (layerFilter?.filter(termComparison) != false) { - matches.add(termComparison) - } - nextHypo() - nextRef() - } - - private fun hypoNoMatch() { - hypoNoMatch(currentHypoTerm!!) - nextHypo() - } - - private fun hypoNoMatch(t: Term) { - // Note how layerFilter can be null, and both null and true != false. - if (layerFilter?.hypoTermFilter?.filter(t) != false) { - hypothesisTermsWithoutMatches.add(t) - } - } - - private fun refNoMatch() { - refNoMatch(currentRefTerm!!) - nextRef() - } - - private fun refNoMatch(t: Term) { - if (layerFilter?.refTermFilter?.filter(t) != false) { - referenceTermsWithoutMatches.add(t) - } - } - - private fun nextHypo() { - currentHypoTerm = hypoIter.nextOrNull() - } - - private fun nextRef() { - currentRefTerm = refIter.nextOrNull() - } - - /** Iterate through the terms of the layer sorted on offset. */ - private fun iterForTermsInLayer(layer: Layer): ListIterator { - return layer.terms - // Terms can only be a match if their first offset is the same - .sortedBy { it.firstOffset }.listIterator() - } - - private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean { - val aStr: String = comp.hypoTerm.literals - val bStr: String = comp.refTerm.literals - return truncatedPcMatch(aStr, bStr) || truncatedPcMatch(bStr, aStr) - } - - private fun truncatedPcMatch(aStr: String, bStr: String): Boolean { - if (PUNCTUATION.contains(aStr.last().toString())) { - if (aStr.slice(0 until aStr.lastIndex) == bStr) { - return true - } - } - return false - } +package org.ivdnt.galahad.evaluation.comparison + +import com.fasterxml.jackson.annotation.JsonIgnore +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.Term + +fun ListIterator.nextOrNull(): Term? { + val iter = iterator() + return if (iter.hasNext()) iter.next() else null +} + +// Some hardcoded punctuation +val PUNCTUATION = listOf(",", ".", "?", "!", ":", ";", ")", "(", "'", "\"") + +/** + * Match the [Layer.terms] of two layers based on their position (offset and length), not on their actual value (literal/pos/lemma). + * When pos filters are provide, only match [TermComparison] of that pos. + */ +class LayerComparison( + private val hypothesisLayer: Layer, + private val referenceLayer: Layer, + private val layerFilter: LayerFilter? = null, +) { + @JsonIgnore + val matches: MutableList = ArrayList() + + @JsonIgnore + val referenceTermsWithoutMatches: MutableList = ArrayList() + + @JsonIgnore + val hypothesisTermsWithoutMatches: MutableList = ArrayList() + @JsonIgnore + private val hypoIter: ListIterator = iterForTermsInLayer(hypothesisLayer) + @JsonIgnore + private val refIter: ListIterator = iterForTermsInLayer(referenceLayer) + @JsonIgnore + private var currentHypoTerm: Term? = Term.EMPTY + @JsonIgnore + private var currentRefTerm: Term? = Term.EMPTY + + init { + if (refIter.hasNext() && hypoIter.hasNext()) { + compare() + } else { + hypothesisTermsWithoutMatches.addAll(hypothesisLayer.terms) + referenceTermsWithoutMatches.addAll(referenceLayer.terms) + } + } + + /** Iterate through the terms of both layers simultaneously and compare them. */ + private fun compare() { + // First terms + nextHypo() + nextRef() + // While there are next terms + while (currentHypoTerm != null && currentRefTerm != null) { + val comp = TermComparison(hypoTerm = currentHypoTerm!!, refTerm = currentRefTerm!!) + compareTerm(comp) + } + // One of the two could be non-null. These are not included in the remaining refIter. + currentHypoTerm?.let(::hypoNoMatch) + currentRefTerm?.let(::refNoMatch) + // The remaining terms have no matches + hypoIter.forEachRemaining(::hypoNoMatch) + refIter.forEachRemaining(::refNoMatch) + } + + private fun compareTerm(comp: TermComparison) { + // Act on the comparison + if (comp.fullOverlap) { + fullMatch(comp) + } else { + // Unequal first offset + if (comp.hypoTerm.firstOffset < comp.refTerm.firstOffset) { + hypoNoMatch() + } else if (comp.hypoTerm.firstOffset > comp.refTerm.firstOffset) { + refNoMatch() + } + // Equal first offset but no match. + // Try to truncate either terms to see if the last char is punctuation. + else if (symmetricTruncatedPcMatch(comp)) { + // If so, still match it. + fullMatch(comp) + } else { + hypoNoMatch() + refNoMatch() + } + } + } + + private fun fullMatch(termComparison: TermComparison) { + if (layerFilter?.filter(termComparison) != false) { + matches.add(termComparison) + } + nextHypo() + nextRef() + } + + private fun hypoNoMatch() { + hypoNoMatch(currentHypoTerm!!) + nextHypo() + } + + private fun hypoNoMatch(t: Term) { + // Note how layerFilter can be null, and both null and true != false. + if (layerFilter?.hypoTermFilter?.filter(t) != false) { + hypothesisTermsWithoutMatches.add(t) + } + } + + private fun refNoMatch() { + refNoMatch(currentRefTerm!!) + nextRef() + } + + private fun refNoMatch(t: Term) { + if (layerFilter?.refTermFilter?.filter(t) != false) { + referenceTermsWithoutMatches.add(t) + } + } + + private fun nextHypo() { + currentHypoTerm = hypoIter.nextOrNull() + } + + private fun nextRef() { + currentRefTerm = refIter.nextOrNull() + } + + /** Iterate through the terms of the layer sorted on offset. */ + private fun iterForTermsInLayer(layer: Layer): ListIterator { + return layer.terms + // Terms can only be a match if their first offset is the same + .sortedBy { it.firstOffset }.listIterator() + } + + private fun symmetricTruncatedPcMatch(comp: TermComparison): Boolean { + val aStr: String = comp.hypoTerm.literals + val bStr: String = comp.refTerm.literals + return truncatedPcMatch(aStr, bStr) || truncatedPcMatch(bStr, aStr) + } + + companion object { + fun truncatedPcMatch(aStr: String, bStr: String): Boolean { + if (aStr.isEmpty() || bStr.isEmpty()) { + return false + } + if (PUNCTUATION.contains(aStr.last().toString())) { + if (aStr.slice(0 until aStr.lastIndex) == bStr) { + return true + } + } + return false + } + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermComparison.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermComparison.kt index 9811ec7..534ee40 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermComparison.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermComparison.kt @@ -1,93 +1,93 @@ -package org.ivdnt.galahad.evaluation.comparison - -import com.fasterxml.jackson.annotation.JsonIgnore -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.data.layer.WordForm - -fun symmetricDifference( - set1: Set, - set2: Set, - equals: (T, T) -> Boolean, -): Set { - - val mset1: MutableSet = HashSet(set1) - val mset2: MutableSet = HashSet(set2) - - for (v1 in set1) { - for (v2 in set2) { - if (equals(v1, v2)) { - mset1.remove(v1) - mset2.remove(v2) - } - } - } - - return mset1 union mset2 -} - -data class TermComparison( - val hypoTerm: Term, // Hypothesis - val refTerm: Term, // True reference -) { - /** Full overlap dependent on the word forms. Overlap of position, not lemma/pos. */ - @get:JsonIgnore - val fullOverlap: Boolean - get() = symmetricDifference(hypoTerm.targets.toSet(), refTerm.targets.toSet(), - equals = { wf1: WordForm, wf2: WordForm -> - WordFormComparison(wf1, wf2).fullOverlap - }).isEmpty() - - /** Partial overlap dependent on the word forms. Overlap of position, not lemma/pos. */ - // Currently not used. - @get:JsonIgnore - val partialOverlap: Boolean - get() { - hypoTerm.targets.forEach { target1 -> - refTerm.targets.forEach { target2 -> - if (WordFormComparison(target1, target2).partialOverlap) { - return true - } - } - } - return false - } - - @get:JsonIgnore - val equalPosLemma: Boolean - get() { - return (equalLemma) && (equalPOS) - } - - /** Whether the lemma is equal. When the reference lemma is empty or null, any hypothesis lemma is fine. */ - @get:JsonIgnore - val equalLemma: Boolean - get() { - if (refTerm.lemma == null) return true - if (refTerm.lemma!!.isEmpty()) return true - if (hypoTerm.lemma == null) return false - return hypoTerm.lemma!!.equals(refTerm.lemma!!, true) - } - - /** Whether the pos is equal. When the reference pos is empty or null, any hypothesis pos is fine. */ - @get:JsonIgnore - val equalPOS: Boolean - get() { - if (refTerm.pos == null) return true - if (refTerm.pos!!.isEmpty()) return true - if (hypoTerm.pos == null) return false - return hypoTerm.pos.equals(refTerm.pos, true) - } - - @get:JsonIgnore - val equalGroupPosHead: Boolean - get() { - if (refTerm.posHeadGroup == null) return true - if (refTerm.posHeadGroup!!.isEmpty()) return true - if (hypoTerm.posHeadGroup == null) return false - return hypoTerm.posHeadGroup.equals(refTerm.posHeadGroup, true) - } - - companion object { - const val MISSING_MATCH = "Missing match" - } +package org.ivdnt.galahad.evaluation.comparison + +import com.fasterxml.jackson.annotation.JsonIgnore +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.data.layer.WordForm + +fun symmetricDifference( + set1: Set, + set2: Set, + equals: (T, T) -> Boolean, +): Set { + + val mset1: MutableSet = HashSet(set1) + val mset2: MutableSet = HashSet(set2) + + for (v1 in set1) { + for (v2 in set2) { + if (equals(v1, v2)) { + mset1.remove(v1) + mset2.remove(v2) + } + } + } + + return mset1 union mset2 +} + +data class TermComparison( + val hypoTerm: Term, // Hypothesis + val refTerm: Term, // True reference +) { + /** Full overlap dependent on the word forms. Overlap of position, not lemma/pos. */ + @get:JsonIgnore + val fullOverlap: Boolean + get() = symmetricDifference(hypoTerm.targets.toSet(), refTerm.targets.toSet(), + equals = { wf1: WordForm, wf2: WordForm -> + WordFormComparison(wf1, wf2).fullOverlap + }).isEmpty() + + /** Partial overlap dependent on the word forms. Overlap of position, not lemma/pos. */ + // Currently not used. + @get:JsonIgnore + val partialOverlap: Boolean + get() { + hypoTerm.targets.forEach { target1 -> + refTerm.targets.forEach { target2 -> + if (WordFormComparison(target1, target2).partialOverlap) { + return true + } + } + } + return false + } + + @get:JsonIgnore + val equalPosLemma: Boolean + get() { + return (equalLemma) && (equalPOS) + } + + /** Whether the lemma is equal. When the reference lemma is empty or null, any hypothesis lemma is fine. */ + @get:JsonIgnore + val equalLemma: Boolean + get() { + if (refTerm.lemma == null) return true + if (refTerm.lemma.isEmpty()) return true + if (hypoTerm.lemma == null) return false + return hypoTerm.lemma.equals(refTerm.lemma, true) + } + + /** Whether the pos is equal. When the reference pos is empty or null, any hypothesis pos is fine. */ + @get:JsonIgnore + val equalPOS: Boolean + get() { + if (refTerm.pos == null) return true + if (refTerm.pos.isEmpty()) return true + if (hypoTerm.pos == null) return false + return hypoTerm.pos.equals(refTerm.pos, true) + } + + @get:JsonIgnore + val equalGroupPosHead: Boolean + get() { + if (refTerm.posHeadGroup == null) return true + if (refTerm.posHeadGroup!!.isEmpty()) return true + if (hypoTerm.posHeadGroup == null) return false + return hypoTerm.posHeadGroup.equals(refTerm.posHeadGroup, true) + } + + companion object { + const val MISSING_MATCH = "Missing match" + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermFilter.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermFilter.kt index c0b729e..4bf7265 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermFilter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/comparison/TermFilter.kt @@ -13,9 +13,9 @@ class PosLemmaTermFilter ( val posHeadGroup: String? = null, val lemma: String? = null, ) : TermFilter { - val multiplePosFilter: (Term) -> Boolean = { t: Term -> t.isMultiPos } - val otherPosFilter: (Term) -> Boolean = { t: Term -> t.pos?.matches(OTHER_POS_REGEX.toRegex()) ?: false} - val singlePosFilter: (Term) -> Boolean = { t: Term -> t.posHeadGroupOrDefault == posHeadGroup } + private val multiplePosFilter: (Term) -> Boolean = { t: Term -> t.isMultiPos } + private val otherPosFilter: (Term) -> Boolean = { t: Term -> t.pos?.contains(Regex(OTHER_POS_REGEX)) ?: false} + private val singlePosFilter: (Term) -> Boolean = { t: Term -> t.posHeadGroupOrDefault == posHeadGroup } val posFilter: (Term) -> Boolean val lemmaFilter: (Term) -> Boolean diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/Confusion.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/Confusion.kt index 256c353..fab0380 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/Confusion.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/confusion/Confusion.kt @@ -9,7 +9,7 @@ import org.ivdnt.galahad.port.csv.CSVFile const val MULTIPLE_POS = "MULTIPLE" const val OTHER_POS = "OTHER" -const val OTHER_POS_REGEX = """^[^A-Z]""" +const val OTHER_POS_REGEX = """^[^a-zA-Z]""" /** * Generic class for the part of speech confusion of a corpus or document. @@ -111,8 +111,8 @@ open class Confusion(private val truncate: Boolean = true): CsvSampleExporter { pos1.contains('+') -> add(MULTIPLE_POS, pos2, evaluationEntry) pos2.contains('+') -> add(pos1, MULTIPLE_POS, evaluationEntry) // Non-alphabetical pos are mapped to a single category "other" - pos1.matches(OTHER_POS_REGEX.toRegex()) -> add(OTHER_POS, pos2, evaluationEntry) - pos2.matches(OTHER_POS_REGEX.toRegex()) -> add(pos1, OTHER_POS, evaluationEntry) + pos1.contains(Regex(OTHER_POS_REGEX)) -> add(OTHER_POS, pos2, evaluationEntry) + pos2.contains(Regex(OTHER_POS_REGEX)) -> add(pos1, OTHER_POS, evaluationEntry) // Otherwise a simple merge else -> matrix.merge(Pair(pos1, pos2), evaluationEntry) { a, b -> EvaluationEntry.add(a, b, truncate) } } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/Distribution.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/Distribution.kt index 390912b..9a959bb 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/Distribution.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/distribution/Distribution.kt @@ -1,90 +1,89 @@ -package org.ivdnt.galahad.evaluation.distribution - -import com.fasterxml.jackson.annotation.JsonIgnore -import org.ivdnt.galahad.data.layer.Term - -/** - * Generic class for frequency distributions of terms in a corpus or document. - * The idea is to sum up the distribution as we go through the terms one by one using [add]. - */ -open class Distribution { - - /** - * (lem, pos) -> (count, literal[]) - */ - @JsonIgnore - val distributionMap: MutableMap, Pair> = HashMap() - - var isTrimmed = false - var coveredChars = 0 - var coveredAlphabeticChars = 0 - var totalChars = 0 - var totalAlphabeticChars = 0 - - /** - * Is serialized and send through API, so it is in fact used. - */ - @Suppress("unused") - val distribution: Set - get() = distributionMap.entries.map { - DistributionRow( - it.key.first, - it.key.second, - it.value.first, - it.value.second - ) - }.toSet() - - fun add(term: Term) { - val literal: String = term.literals - coveredChars += literal.length - coveredAlphabeticChars += literal.count { char -> char.isLetter() } - add( - lemma = term.lemma ?: Term.NO_LEMMA, - pos = term.posHeadGroup ?: Term.NO_POS, - count = 1, - literals = LiteralsEntry(mapOf(term.literals to 1)) - ) - } - - private fun add(lemma: String, pos: String, count: Int, literals: LiteralsEntry) { - distributionMap.merge( - Pair(lemma, pos), Pair(count, literals) - ) { p1, p2 -> Pair(p1.first + p2.first, p1.second.add(p2.second)) } - } - - fun add(other: Distribution) { - coveredChars += other.coveredChars - coveredAlphabeticChars += other.coveredAlphabeticChars - totalChars += other.totalChars - totalAlphabeticChars += other.totalAlphabeticChars - other.distributionMap.forEach { - add(it.key.first, it.key.second, it.value.first, it.value.second) - } - } - - fun toCSV(): String { - var csv: String = DistributionRow.getCsvHeader() - getSorted().forEach { csv += it.toCSVRecord() } - return csv - } - - @JsonIgnore - fun getSorted(): Set { - return distributionMap.toList() - // Sort on count (descending, hence minus), then on lemma. - // Note that 'it' is of type Pair< Pair , Pair > - .sortedWith(compareBy({ -it.second.first }, { it.first.first.lowercase() })).toMap() - // Map to serializable rows. - .entries.map { DistributionRow(it.key.first, it.key.second, it.value.first, it.value.second) }.toSet() - } - - fun trim(maxSize: Int, minVal: Int = 2): Distribution { - if (distributionMap.size > maxSize) { - isTrimmed = true - distributionMap.entries.removeIf { it.value.first < minVal } - trim(maxSize, minVal + 1) // recursion ftw - } - return this - } +package org.ivdnt.galahad.evaluation.distribution + +import com.fasterxml.jackson.annotation.JsonIgnore +import org.ivdnt.galahad.data.layer.Term + +/** + * Generic class for frequency distributions of terms in a corpus or document. + * The idea is to sum up the distribution as we go through the terms one by one using [add]. + */ +open class Distribution { + + /** + * (lem, pos) -> (count, literal[]) + */ + @JsonIgnore + val distributionMap: MutableMap, Pair> = HashMap() + + var isTrimmed = false + var coveredChars = 0 + var coveredAlphabeticChars = 0 + var totalChars = 0 + var totalAlphabeticChars = 0 + + /** + * Is serialized and send through API, so it is in fact used. + */ + val distribution: Set + get() = distributionMap.entries.map { + DistributionRow( + it.key.first, + it.key.second, + it.value.first, + it.value.second + ) + }.toSet() + + fun add(term: Term) { + val literal: String = term.literals + coveredChars += literal.length + coveredAlphabeticChars += literal.count { char -> char.isLetter() } + add( + lemma = term.lemma ?: Term.NO_LEMMA, + pos = term.posHeadGroup ?: Term.NO_POS, + count = 1, + literals = LiteralsEntry(mapOf(term.literals to 1)) + ) + } + + private fun add(lemma: String, pos: String, count: Int, literals: LiteralsEntry) { + distributionMap.merge( + Pair(lemma, pos), Pair(count, literals) + ) { p1, p2 -> Pair(p1.first + p2.first, p1.second.add(p2.second)) } + } + + fun add(other: Distribution) { + coveredChars += other.coveredChars + coveredAlphabeticChars += other.coveredAlphabeticChars + totalChars += other.totalChars + totalAlphabeticChars += other.totalAlphabeticChars + other.distributionMap.forEach { + add(it.key.first, it.key.second, it.value.first, it.value.second) + } + } + + fun toCSV(): String { + var csv: String = DistributionRow.getCsvHeader() + getSorted().forEach { csv += it.toCSVRecord() } + return csv + } + + @JsonIgnore + fun getSorted(): Set { + return distributionMap.toList() + // Sort on count (descending, hence minus), then on lemma. + // Note that 'it' is of type Pair< Pair , Pair > + .sortedWith(compareBy({ -it.second.first }, { it.first.first.lowercase() })).toMap() + // Map to serializable rows. + .entries.map { DistributionRow(it.key.first, it.key.second, it.value.first, it.value.second) }.toSet() + } + + fun trim(maxSize: Int, minVal: Int = 2): Distribution { + if (distributionMap.size > maxSize) { + isTrimmed = true + distributionMap.entries.removeIf { it.value.first < minVal } + trim(maxSize, minVal + 1) // recursion ftw + } + return this + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metric.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metric.kt index 1435ae5..dadde11 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metric.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/Metric.kt @@ -1,164 +1,166 @@ -package org.ivdnt.galahad.evaluation.metrics - -import com.fasterxml.jackson.annotation.JsonIgnore -import com.fasterxml.jackson.annotation.JsonProperty -import org.ivdnt.galahad.evaluation.EvaluationEntry -import org.ivdnt.galahad.port.csv.CSVFile -import org.ivdnt.galahad.port.csv.CSVHeader -import org.ivdnt.galahad.port.csv.CSVRecord -import org.ivdnt.galahad.util.toFixed - -/** - * The number of [Term]s between two [Layer]s that are equal, different or not present in the reference and hypothesis layers. - * Based on lemma and part of speech. - * - * Note that normal [Metric]s always contain samples, which are cheap to get when we are calculating them anyway, - * However for downstream applications like aggregates, we may want to omit the samples - * Therefore we can use this utility class - */ - -data class ClassificationMetrics( - val precision: Float = 0f, - val recall: Float = 0f, - val f1: Float = 0f, - val accuracy: Float = 0f, -) { - operator fun plus(other: ClassificationMetrics): ClassificationMetrics = ClassificationMetrics( - precision + other.precision, - recall + other.recall, - f1 + other.f1, - accuracy + other.accuracy, - ) - - operator fun div(divisor: Int): ClassificationMetrics = this * (1.0f / divisor) - - operator fun times(factor: Float): ClassificationMetrics = ClassificationMetrics( - precision * factor, - recall * factor, - f1 * factor, - accuracy * factor, - ) - - companion object { - fun calculate(cls: ClassificationClasses): ClassificationMetrics { - return calculate(cls.flat) - } - - fun calculate(cls: FlatClassificationClasses, micro: Boolean = false): ClassificationMetrics { - val tp = cls.truePositive.toFloat() - val fp = cls.falsePositive.toFloat() - val fn = cls.falseNegative.toFloat() - // When calculating micro-accuracy, tp and fp are the same, so don't double count. - val total = if(micro) tp + fp else cls.count.toFloat() - return calculate(tp, fp, fn, total) - } - - private fun calculate(tp: Float, fp: Float, fn: Float, total: Float): ClassificationMetrics { - fun notNaN(value: Float): Float = if (value.isNaN()) 0.0f else value - - val precision = notNaN(tp / (tp + fp)) - val recall = notNaN(tp / (tp + fn)) - - return ClassificationMetrics( - precision = precision, - recall = recall, - f1 = notNaN(2.0f * (precision * recall) / (precision + recall)), - accuracy = notNaN(tp / (total)), - ) - } - } -} - -/** Use for micro-averaging */ -open class FlatClassificationClasses( - var truePositive: Int = 0, - var falsePositive: Int = 0, - var falseNegative: Int = 0, - // Similar to falseNegative. - var noMatch: Int = 0, - var count: Int = 0, -) { - operator fun plus(other: FlatClassificationClasses) = FlatClassificationClasses( - truePositive + other.truePositive, - falsePositive + other.falsePositive, - falseNegative + other.falseNegative, - noMatch + other.noMatch, - count + other.count, - ) -} - -open class ClassificationClasses( - var truePositive: EvaluationEntry = EvaluationEntry(), - var falsePositive: EvaluationEntry = EvaluationEntry(), - var falseNegative: EvaluationEntry = EvaluationEntry(), - // Similar to falseNegative. - var noMatch: EvaluationEntry = EvaluationEntry(), - /** sample count without duplicates, for calculating accuracy. */ - @JsonIgnore var count: Int = 1, -) { - open val classCount: Int - get() = truePositive.count + falsePositive.count + falseNegative.count + noMatch.count - - fun add(other: ClassificationClasses, truncate: Boolean = true): ClassificationClasses { - truePositive = EvaluationEntry.add(truePositive, other.truePositive, truncate) - falsePositive = EvaluationEntry.add(falsePositive, other.falsePositive, truncate) - falseNegative = EvaluationEntry.add(falseNegative, other.falseNegative, truncate) - noMatch = EvaluationEntry.add(noMatch, other.noMatch, truncate) - count += other.count - return this - } - - @get:JsonIgnore - open val flat: FlatClassificationClasses - get() = FlatClassificationClasses( - truePositive = truePositive.count, - falsePositive = falsePositive.count, - falseNegative = falseNegative.count, - noMatch = noMatch.count, - count = count - ) -} - -data class Metric( - @JsonProperty("name") val name: String, - @JsonProperty("classes") var cls: ClassificationClasses = ClassificationClasses(), -) { - @get:JsonProperty("metrics") - val clsMetrics - get() = ClassificationMetrics.calculate(cls) - - fun add(other: Metric, truncate: Boolean): Metric { - cls.add(other.cls, truncate) - return this - } - - fun toCSVRecord(): CSVRecord { - return CSVFile.toCSVRecord(listOf( - name, - clsMetrics.precision.toFixed(), - clsMetrics.recall.toFixed(), - clsMetrics.f1.toFixed(), - cls.classCount, - cls.truePositive.count, - cls.falsePositive.count, - cls.falseNegative.count, - cls.noMatch.count, - )) - } - - companion object { - fun getCsvHeader(): CSVHeader { - return CSVFile.toCSVHeader(listOf( - "grouped by", - "precision", - "recall", - "f1", - "count", - "true positive count", - "false positive count", - "false negative count", - "no match count") - ) - } - } +package org.ivdnt.galahad.evaluation.metrics + +import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.annotation.JsonProperty +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.evaluation.EvaluationEntry +import org.ivdnt.galahad.port.csv.CSVFile +import org.ivdnt.galahad.port.csv.CSVHeader +import org.ivdnt.galahad.port.csv.CSVRecord +import org.ivdnt.galahad.util.toFixed + +/** + * The number of [Term]s between two [Layer]s that are equal, different or not present in the reference and hypothesis layers. + * Based on lemma and part of speech. + * + * Note that normal [Metric]s always contain samples, which are cheap to get when we are calculating them anyway, + * However for downstream applications like aggregates, we may want to omit the samples + * Therefore we can use this utility class + */ + +data class ClassificationMetrics( + val precision: Float = 0f, + val recall: Float = 0f, + val f1: Float = 0f, + val accuracy: Float = 0f, +) { + operator fun plus(other: ClassificationMetrics): ClassificationMetrics = ClassificationMetrics( + precision + other.precision, + recall + other.recall, + f1 + other.f1, + accuracy + other.accuracy, + ) + + operator fun div(divisor: Int): ClassificationMetrics = this * (1.0f / divisor) + + operator fun times(factor: Float): ClassificationMetrics = ClassificationMetrics( + precision * factor, + recall * factor, + f1 * factor, + accuracy * factor, + ) + + companion object { + fun calculate(cls: ClassificationClasses): ClassificationMetrics { + return calculate(cls.flat) + } + + fun calculate(cls: FlatClassificationClasses, micro: Boolean = false): ClassificationMetrics { + val tp = cls.truePositive.toFloat() + val fp = cls.falsePositive.toFloat() + val fn = cls.falseNegative.toFloat() + // When calculating micro-accuracy, tp and fp are the same, so don't double count. + val total = if(micro) tp + fp else cls.count.toFloat() + return calculate(tp, fp, fn, total) + } + + private fun calculate(tp: Float, fp: Float, fn: Float, total: Float): ClassificationMetrics { + fun notNaN(value: Float): Float = if (value.isNaN()) 0.0f else value + + val precision = notNaN(tp / (tp + fp)) + val recall = notNaN(tp / (tp + fn)) + + return ClassificationMetrics( + precision = precision, + recall = recall, + f1 = notNaN(2.0f * (precision * recall) / (precision + recall)), + accuracy = notNaN(tp / (total)), + ) + } + } +} + +/** Use for micro-averaging */ +open class FlatClassificationClasses( + var truePositive: Int = 0, + var falsePositive: Int = 0, + var falseNegative: Int = 0, + // Similar to falseNegative. + var noMatch: Int = 0, + var count: Int = 0, +) { + operator fun plus(other: FlatClassificationClasses) = FlatClassificationClasses( + truePositive + other.truePositive, + falsePositive + other.falsePositive, + falseNegative + other.falseNegative, + noMatch + other.noMatch, + count + other.count, + ) +} + +open class ClassificationClasses( + var truePositive: EvaluationEntry = EvaluationEntry(), + var falsePositive: EvaluationEntry = EvaluationEntry(), + var falseNegative: EvaluationEntry = EvaluationEntry(), + // Similar to falseNegative. + var noMatch: EvaluationEntry = EvaluationEntry(), + /** sample count without duplicates, for calculating accuracy. */ + @JsonIgnore var count: Int = 1, +) { + open val classCount: Int + get() = truePositive.count + falsePositive.count + falseNegative.count + noMatch.count + + fun add(other: ClassificationClasses, truncate: Boolean = true): ClassificationClasses { + truePositive = EvaluationEntry.add(truePositive, other.truePositive, truncate) + falsePositive = EvaluationEntry.add(falsePositive, other.falsePositive, truncate) + falseNegative = EvaluationEntry.add(falseNegative, other.falseNegative, truncate) + noMatch = EvaluationEntry.add(noMatch, other.noMatch, truncate) + count += other.count + return this + } + + @get:JsonIgnore + open val flat: FlatClassificationClasses + get() = FlatClassificationClasses( + truePositive = truePositive.count, + falsePositive = falsePositive.count, + falseNegative = falseNegative.count, + noMatch = noMatch.count, + count = count + ) +} + +data class Metric( + @JsonProperty("name") val name: String, + @JsonProperty("classes") var cls: ClassificationClasses = ClassificationClasses(), +) { + @get:JsonProperty("metrics") + val clsMetrics + get() = ClassificationMetrics.calculate(cls) + + fun add(other: Metric, truncate: Boolean): Metric { + cls.add(other.cls, truncate) + return this + } + + fun toCSVRecord(): CSVRecord { + return CSVFile.toCSVRecord(listOf( + name, + clsMetrics.precision.toFixed(), + clsMetrics.recall.toFixed(), + clsMetrics.f1.toFixed(), + cls.classCount, + cls.truePositive.count, + cls.falsePositive.count, + cls.falseNegative.count, + cls.noMatch.count, + )) + } + + companion object { + fun getCsvHeader(): CSVHeader { + return CSVFile.toCSVHeader(listOf( + "grouped by", + "precision", + "recall", + "f1", + "count", + "true positive count", + "false positive count", + "false negative count", + "no match count") + ) + } + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/MetricsType.kt b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/MetricsType.kt index 5633b8d..705d92c 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/MetricsType.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/evaluation/metrics/MetricsType.kt @@ -1,177 +1,176 @@ -package org.ivdnt.galahad.evaluation.metrics - -import com.fasterxml.jackson.annotation.JsonIgnore -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.evaluation.CsvSampleExporter -import org.ivdnt.galahad.evaluation.EvaluationEntry -import org.ivdnt.galahad.evaluation.comparison.TermComparison -import org.ivdnt.galahad.port.csv.CSVFile -import org.ivdnt.galahad.port.csv.CSVHeader -import org.ivdnt.galahad.util.toFixed - -typealias FlatMetricTypeAssay = Map -class FlatMetricType( - val micro: ClassificationMetrics = ClassificationMetrics(), - val macro: ClassificationMetrics = ClassificationMetrics(), -) - -class MetricsType( - val setting: MetricsSettings, - @JsonIgnore var truncate: Boolean = true -) : CsvSampleExporter { - @JsonIgnore - var map: MutableMap = HashMap() - - /** Metrics separated per POS. */ - val grouped: Set - get() { - return if (map.size > TRUNCATE) { - // sort mt.value.map on mt.value.map["someKey"].cls.classCount, take the first TRUNCATE elements, and then map - map.entries.asSequence() - .sortedByDescending { it.value.cls.classCount }.take(TRUNCATE) - .associateBy({ it.key }, { it.value }).values.toSet() - } else { - map.values.toSet() - } - } - - val classes: ClassificationClasses - get() = map.values.map { it.cls }.toMutableList().apply{ this.add(0, ClassificationClasses(count=0)) }.reduce { a, b -> a.add(b, truncate) }.apply { falsePositive = EvaluationEntry() } - - val macro: ClassificationMetrics - get() { - if (map.isEmpty()) { - return ClassificationMetrics() - } - return map.values.map { it.clsMetrics }.reduce { a, b -> a + b } / map.size - } - - val micro: ClassificationMetrics - get() { - if (map.isEmpty()) { - return ClassificationMetrics() - } - return ClassificationMetrics.calculate(map.values.map { it.cls.flat }.reduce { a, b -> a + b }, micro = true) - } - - fun toGlobalCsv(): String { - // Expensive calculations. - val microMetrics = micro - val macroMetrics = macro - - return CSVFile.toCSVRecord(listOf( - setting.annotation, - setting.group, - macroMetrics.precision.toFixed(), - macroMetrics.recall.toFixed(), - macroMetrics.f1.toFixed(), - microMetrics.accuracy.toFixed(), - classes.classCount, - classes.truePositive.count, - classes.falseNegative.count, - classes.noMatch.count, - )) - } - - fun toFlat(): FlatMetricType { - return FlatMetricType(micro, macro) - } - - fun toGroupedCsv(): String { - var csv = Metric.getCsvHeader() - grouped.sortedBy { it.name }.forEach{ csv += it.toCSVRecord() } - return csv - } - - // Cumulative addition functions. - private fun add(metric: Metric) { - map.merge(metric.name, metric) { m1, m2 -> m1.add(m2, truncate) } - } - - fun add(other: MetricsType) { - other.map.values.toSet().forEach(this::add) - } - - fun add(comp: TermComparison) { - if (!setting.filterBy(comp)) { - return - } - - if (comp.hypoTerm == Term.EMPTY) { - add ( - Metric( - name = setting.groupBy(comp.refTerm), - cls = ClassificationClasses( - noMatch = EvaluationEntry(1, mutableListOf(comp)), - count = 0 - ) - ) - ) - } - - // One of these two will be empty, we don't know which. - val (trues, falses) = truesFalses(comp, setting::termsEqual) - val cls = ClassificationClasses( - truePositive = trues, - falseNegative = falses, - ) - add( - Metric( - name = setting.groupBy(comp.refTerm), - cls = cls - ) - ) - if (falses.count != 0) { - // This term is also be someone else's false positive, so switch around. - val cls2 = ClassificationClasses( - falsePositive = EvaluationEntry(count = falses.samples.size, falses.samples.toMutableList()), - count = if (setting.groupBy(comp.hypoTerm) == setting.groupBy(comp.refTerm)) 0 else 1 - ) - add( - Metric( - name = setting.groupBy(comp.hypoTerm), // Terms are switched, so hypo. - cls = cls2 - ) - ) - } - } - - private fun truesFalses(comp: TermComparison, cond: (TermComparison) -> Boolean): Pair { - val trues = if (cond(comp)) { - EvaluationEntry(1, mutableListOf(comp)) - } else { - EvaluationEntry() - } - val falses = if (!cond(comp)) { - EvaluationEntry(1, mutableListOf(comp)) - } else { - EvaluationEntry() - } - return Pair(trues, falses) - } - - fun samplesToCsv(group: String, classType: String): String { - return when (classType) { - "truePositive" -> samplesToCSV(map[group]?.cls?.truePositive?.samples) - "falsePositive" -> samplesToCSV(map[group]?.cls?.falsePositive?.samples) - "falseNegative" -> samplesToCSV(map[group]?.cls?.falseNegative?.samples) - "noMatch" -> samplesToCSV(map[group]?.cls?.noMatch?.samples) - else -> "" - } - } - - fun samplesToCsv(classType: String): String { - return when (classType) { - "truePositive" -> samplesToCSV(classes.truePositive.samples) - "falseNegative" -> samplesToCSV(classes.falseNegative.samples) - "noMatch" -> samplesToCSV(classes.noMatch.samples) - else -> "" - } - } - - - override fun samplesToCSV(): String { - val cls = listOf(classes.falsePositive, classes.falseNegative, classes.truePositive) - return samplesToCSV(cls.flatMap { it.samples }) - } +package org.ivdnt.galahad.evaluation.metrics + +import com.fasterxml.jackson.annotation.JsonIgnore +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.evaluation.CsvSampleExporter +import org.ivdnt.galahad.evaluation.EvaluationEntry +import org.ivdnt.galahad.evaluation.comparison.TermComparison +import org.ivdnt.galahad.port.csv.CSVFile +import org.ivdnt.galahad.util.toFixed + +typealias FlatMetricTypeAssay = Map +class FlatMetricType( + val micro: ClassificationMetrics = ClassificationMetrics(), + val macro: ClassificationMetrics = ClassificationMetrics(), +) + +class MetricsType( + val setting: MetricsSettings, + @JsonIgnore var truncate: Boolean = true +) : CsvSampleExporter { + @JsonIgnore + var map: MutableMap = HashMap() + + /** Metrics separated per POS. */ + val grouped: Set + get() { + return if (map.size > TRUNCATE) { + // sort mt.value.map on mt.value.map["someKey"].cls.classCount, take the first TRUNCATE elements, and then map + map.entries.asSequence() + .sortedByDescending { it.value.cls.classCount }.take(TRUNCATE) + .associateBy({ it.key }, { it.value }).values.toSet() + } else { + map.values.toSet() + } + } + + val classes: ClassificationClasses + get() = map.values.map { it.cls }.toMutableList().apply{ this.add(0, ClassificationClasses(count=0)) }.reduce { a, b -> a.add(b, truncate) }.apply { falsePositive = EvaluationEntry() } + + val macro: ClassificationMetrics + get() { + if (map.isEmpty()) { + return ClassificationMetrics() + } + return map.values.map { it.clsMetrics }.reduce { a, b -> a + b } / map.size + } + + val micro: ClassificationMetrics + get() { + if (map.isEmpty()) { + return ClassificationMetrics() + } + return ClassificationMetrics.calculate(map.values.map { it.cls.flat }.reduce { a, b -> a + b }, micro = true) + } + + fun toGlobalCsv(): String { + // Expensive calculations. + val microMetrics = micro + val macroMetrics = macro + + return CSVFile.toCSVRecord(listOf( + setting.annotation, + setting.group, + macroMetrics.precision.toFixed(), + macroMetrics.recall.toFixed(), + macroMetrics.f1.toFixed(), + microMetrics.accuracy.toFixed(), + classes.classCount, + classes.truePositive.count, + classes.falseNegative.count, + classes.noMatch.count, + )) + } + + fun toFlat(): FlatMetricType { + return FlatMetricType(micro, macro) + } + + fun toGroupedCsv(): String { + var csv = Metric.getCsvHeader() + grouped.sortedBy { it.name }.forEach{ csv += it.toCSVRecord() } + return csv + } + + // Cumulative addition functions. + private fun add(metric: Metric) { + map.merge(metric.name, metric) { m1, m2 -> m1.add(m2, truncate) } + } + + fun add(other: MetricsType) { + other.map.values.toSet().forEach(this::add) + } + + fun add(comp: TermComparison) { + if (!setting.filterBy(comp)) { + return + } + + if (comp.hypoTerm == Term.EMPTY) { + add ( + Metric( + name = setting.groupBy(comp.refTerm), + cls = ClassificationClasses( + noMatch = EvaluationEntry(1, mutableListOf(comp)), + count = 0 + ) + ) + ) + } + + // One of these two will be empty, we don't know which. + val (trues, falses) = truesFalses(comp, setting::termsEqual) + val cls = ClassificationClasses( + truePositive = trues, + falseNegative = falses, + ) + add( + Metric( + name = setting.groupBy(comp.refTerm), + cls = cls + ) + ) + if (falses.count != 0) { + // This term is also be someone else's false positive, so switch around. + val cls2 = ClassificationClasses( + falsePositive = EvaluationEntry(count = falses.samples.size, falses.samples.toMutableList()), + count = if (setting.groupBy(comp.hypoTerm) == setting.groupBy(comp.refTerm)) 0 else 1 + ) + add( + Metric( + name = setting.groupBy(comp.hypoTerm), // Terms are switched, so hypo. + cls = cls2 + ) + ) + } + } + + private fun truesFalses(comp: TermComparison, cond: (TermComparison) -> Boolean): Pair { + val trues = if (cond(comp)) { + EvaluationEntry(1, mutableListOf(comp)) + } else { + EvaluationEntry() + } + val falses = if (!cond(comp)) { + EvaluationEntry(1, mutableListOf(comp)) + } else { + EvaluationEntry() + } + return Pair(trues, falses) + } + + fun samplesToCsv(group: String, classType: String): String { + return when (classType) { + "truePositive" -> samplesToCSV(map[group]?.cls?.truePositive?.samples) + "falsePositive" -> samplesToCSV(map[group]?.cls?.falsePositive?.samples) + "falseNegative" -> samplesToCSV(map[group]?.cls?.falseNegative?.samples) + "noMatch" -> samplesToCSV(map[group]?.cls?.noMatch?.samples) + else -> "" + } + } + + fun samplesToCsv(classType: String): String { + return when (classType) { + "truePositive" -> samplesToCSV(classes.truePositive.samples) + "falseNegative" -> samplesToCSV(classes.falseNegative.samples) + "noMatch" -> samplesToCSV(classes.noMatch.samples) + else -> "" + } + } + + + override fun samplesToCSV(): String { + val cls = listOf(classes.falsePositive, classes.falseNegative, classes.truePositive) + return samplesToCSV(cls.flatMap { it.samples }) + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/jobs/DocumentJob.kt b/server/src/main/kotlin/org/ivdnt/galahad/jobs/DocumentJob.kt index 046d4b8..6455ebd 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/jobs/DocumentJob.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/jobs/DocumentJob.kt @@ -7,6 +7,7 @@ import org.ivdnt.galahad.FileBackedValue import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.LayerPreview import org.ivdnt.galahad.data.layer.LayerSummary +import org.ivdnt.galahad.jobs.DocumentJob.DocumentProcessingStatus import org.ivdnt.galahad.tagset.Tagset import java.io.File import java.util.* diff --git a/server/src/main/kotlin/org/ivdnt/galahad/jobs/InternalJobController.kt b/server/src/main/kotlin/org/ivdnt/galahad/jobs/InternalJobController.kt index 0fc5429..ff93245 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/jobs/InternalJobController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/jobs/InternalJobController.kt @@ -1,17 +1,19 @@ package org.ivdnt.galahad.jobs import org.apache.logging.log4j.kotlin.Logging -import org.ivdnt.galahad.app.* +import org.ivdnt.galahad.app.Config +import org.ivdnt.galahad.app.INTERNAL_JOBS_ERROR_URL +import org.ivdnt.galahad.app.INTERNAL_JOBS_RESULT_URL import org.ivdnt.galahad.data.CorporaController import org.ivdnt.galahad.data.document.Document -import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.port.SourceLayerableFile +import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.port.InternalFile +import org.ivdnt.galahad.port.SourceLayerableFile import org.ivdnt.galahad.port.tsv.TSVFile import org.ivdnt.galahad.taggers.Taggers -import org.ivdnt.galahad.tagset.TagsetStore import org.ivdnt.galahad.tagset.Tagset +import org.ivdnt.galahad.tagset.TagsetStore import org.springframework.web.bind.annotation.* import org.springframework.web.multipart.MultipartFile import java.io.File diff --git a/server/src/main/kotlin/org/ivdnt/galahad/jobs/JobsController.kt b/server/src/main/kotlin/org/ivdnt/galahad/jobs/JobsController.kt index f0d46a2..4f56cb7 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/jobs/JobsController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/jobs/JobsController.kt @@ -1,15 +1,13 @@ package org.ivdnt.galahad.jobs -import com.fasterxml.jackson.annotation.JsonProperty +import jakarta.servlet.http.HttpServletRequest +import jakarta.servlet.http.HttpServletResponse import org.ivdnt.galahad.app.JOBS_URL import org.ivdnt.galahad.app.JOB_URL import org.ivdnt.galahad.data.CorporaController import org.springframework.beans.factory.annotation.Autowired import org.springframework.web.bind.annotation.* import java.util.* -import jakarta.servlet.http.HttpServletRequest -import jakarta.servlet.http.HttpServletResponse -import org.ivdnt.galahad.app.TAGGERS_URL @RestController class JobsController( diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/CmdiMetadata.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/CmdiMetadata.kt index 120c843..13dc261 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/CmdiMetadata.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/CmdiMetadata.kt @@ -1,60 +1,96 @@ -package org.ivdnt.galahad.port - -import org.ivdnt.galahad.data.corpus.CorpusMetadata -import org.ivdnt.galahad.util.escapeXML -import org.ivdnt.galahad.util.toNonEmptyString -import java.io.File -import java.text.SimpleDateFormat -import java.util.* -import kotlin.io.path.createTempDirectory - -class CmdiMetadata(transformMetadata: DocumentTransformMetadata) : LayerTransformer(transformMetadata) { - companion object { - val tmp_dir: File = createTempDirectory("cmdi").toFile() - } - - val file: File - - init { - var template = this::class.java.classLoader.getResource("CMDI-template.xml")!!.readText() - val corpusMetadata: CorpusMetadata = this.transformMetadata.corpus.metadata.expensiveGet() - val docTitle = document.getUploadedRawFile().nameWithoutExtension - - // Current year, month and day, zero-padded - val year = SimpleDateFormat("yyyy").format(Date()) - val month = SimpleDateFormat("MM").format(Date()) - val day = SimpleDateFormat("dd").format(Date()) - val date = "$year-$month-$day" - - // Retrieve GaLAHaD version from the same version.yml used in the client about page. - val versionStream = this::class.java.classLoader.getResource("version.yml")!!.openStream() - val versionProperties = Properties() - versionProperties.load(versionStream) - val galahadVersion = versionProperties.getProperty("VERSION") - - val replacements = mapOf( - "CORPUS_NAME" to corpusMetadata.name, - "DATE" to date, - "YEAR" to year, - "MONTH" to month, - "DAY" to day, - "PID" to document.uuid.toString(), - "GALAHAD_VERSION" to galahadVersion, - "TITLE" to docTitle, - "SOURCE_NAME" to corpusMetadata.sourceName.toNonEmptyString("!No source name defined!"), - "SOURCE_URL" to corpusMetadata.sourceURL.toNonEmptyString("!No source URL defined!"), - "ERA_FROM" to corpusMetadata.eraFrom.toString(), - "ERA_TO" to corpusMetadata.eraTo.toString(), - "TAGSET" to tagger.tagset.toNonEmptyString("!No tagset defined!"), - "FORMAT" to document.format.identifier, - "TAGGER_NAME" to tagger.id, - "TAGGER_VERSION" to tagger.version, - "TAGGER_URL" to tagger.model.href, - ) - for ((key, value) in replacements) { - template = template.replace(key, value.escapeXML()) - } - file = tmp_dir.resolve("CMDI-$docTitle.xml") - file.writeText(template) - } +package org.ivdnt.galahad.port + +import org.ivdnt.galahad.data.corpus.CorpusMetadata +import org.ivdnt.galahad.tagset.TagsetStore +import org.ivdnt.galahad.util.escapeXML +import org.ivdnt.galahad.util.getXmlBuilder +import org.ivdnt.galahad.util.toNonEmptyString +import org.w3c.dom.Node +import java.io.File +import java.text.SimpleDateFormat +import java.util.* +import javax.xml.transform.Transformer +import javax.xml.transform.TransformerFactory +import javax.xml.transform.dom.DOMSource +import javax.xml.transform.stream.StreamResult +import javax.xml.xpath.XPathConstants +import javax.xml.xpath.XPathFactory +import kotlin.io.path.createTempDirectory + +/** Constructs a CMDI file for exported documents. */ +class CmdiMetadata(transformMetadata: DocumentTransformMetadata) : LayerTransformer(transformMetadata) { + + companion object { + private val tmp_dir: File = createTempDirectory("cmdi").toFile() + } + + /** We need tagsets to go from tagger.tagset to tagset.fullName */ + private val tagsets = TagsetStore() + + // Some vals for repeated access. + private val docTitle = document.getUploadedRawFile().nameWithoutExtension + private val corpusMetadata: CorpusMetadata = transformMetadata.corpus.metadata.expensiveGet() + private val format = transformMetadata.targetFormat.identifier + + /** After initialization this file will contain the CMDI */ + val file: File + + init { + // Load CMDI template + val cmdiTemplate = this::class.java.classLoader.getResourceAsStream("CMDI-template.xml") + val xmlDoc = getXmlBuilder().parse(cmdiTemplate) + + val replacements: Map, String> = getReplacements() + // Replace them + for ((keys, value) in replacements) { + val xpath = XPathFactory.newInstance().newXPath() + for (key in keys) { + val expr = xpath.compile("CMD//$key") + val node = expr.evaluate(xmlDoc, XPathConstants.NODE) as Node + node.textContent = value.escapeXML() + } + } + // Write to disk + file = tmp_dir.resolve("CMDI-$docTitle.xml") + val tf: Transformer = TransformerFactory.newInstance().newTransformer() + tf.transform(DOMSource(xmlDoc), StreamResult(file.outputStream())) + } + + private fun getReplacements(): Map, String> { + // Current year, month and day, zero-padded + val now = Date() + val year = SimpleDateFormat("yyyy").format(now) + val month = SimpleDateFormat("MM").format(now) + val day = SimpleDateFormat("dd").format(now) + val date = "$year-$month-$day" + + // Retrieve GaLAHaD version from the same version.yml used in the client about page. + val versionStream = this::class.java.classLoader.getResource("version.yml")!!.openStream() + val versionProperties = Properties() + versionProperties.load(versionStream) + val galahadVersion = versionProperties.getProperty("GITHUB_REF_NAME") + + // Define replacements + return mapOf( + listOf("MdCollectionDisplayName", "corpusName") to corpusMetadata.name, + listOf("MdCreationDate") to date, + listOf("Annotation//yearFrom", "Annotation//yearTo") to year, + listOf("Annotation//monthFrom", "Annotation//monthTo") to "--$month", + listOf("Annotation//dayFrom", "Annotation//dayTo") to "---$day", + listOf("ResourceRef") to "https://resolver.ivdnt.org/${document.uuid}", + listOf("GaLAHaDPersistentIdentifier") to "${document.uuid}_tei", + listOf("conversionDescription") to "exported to $format by GaLAHaD", + listOf("Conversion_GaLAHaD//toolVersion") to galahadVersion, + listOf("sourceID") to docTitle, + listOf("sourceCollection") to corpusMetadata.sourceName.toNonEmptyString("!No source name defined!"), + listOf("sourceCollectionURI") to corpusMetadata.sourceURL.toNonEmptyString("!No source URL defined!"), + listOf("Source_GaLAHaD//yearFrom") to corpusMetadata.eraFrom.toString(), + listOf("Source_GaLAHaD//yearTo") to corpusMetadata.eraTo.toString(), + listOf("annotationSet") to tagsets.getOrNull(tagger.tagset)?.longName.toNonEmptyString("!No tagset defined!"), + listOf("annotationFormat") to format, + listOf("Annotation//toolName") to tagger.id, + listOf("Annotation//toolVersion") to tagger.version, + listOf("Annotation//toolURI") to tagger.model.href, + ) + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/TransformMetadata.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/TransformMetadata.kt index 4c4951a..c719716 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/TransformMetadata.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/TransformMetadata.kt @@ -1,47 +1,51 @@ -package org.ivdnt.galahad.port - -import org.ivdnt.galahad.data.corpus.Corpus -import org.ivdnt.galahad.app.User -import org.ivdnt.galahad.data.document.Document -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.jobs.Job - -open class CorpusTransformMetadata( - val corpus: Corpus, - val job: Job, - val user: User -) { - - fun documentMetadata( document: String ): DocumentTransformMetadata { - return DocumentTransformMetadata( - corpus = corpus, - job = job, - document = corpus.documents.readOrThrow( document ), - user = user - ) - } -} - -class DocumentTransformMetadata( - val corpus: Corpus, - val job: Job, - val document: Document, - val user: User -) { - - val layer: Layer = job.document(document.name).result - - val plainText: String - get() = document.plaintext - - fun convertLayerToPosHead() { - for (i in layer.terms.indices) { - val t = layer.terms[i] - layer.terms[i] = Term( - lemma = t.lemma, - pos = t.posHeadGroup, - targets = t.targets) - } - } -} +package org.ivdnt.galahad.port + +import org.ivdnt.galahad.app.User +import org.ivdnt.galahad.data.corpus.Corpus +import org.ivdnt.galahad.data.document.Document +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.jobs.Job + +open class CorpusTransformMetadata( + val corpus: Corpus, + val job: Job, + val user: User, + val targetFormat: DocumentFormat, +) { + + fun documentMetadata( document: String ): DocumentTransformMetadata { + return DocumentTransformMetadata( + corpus = corpus, + job = job, + document = corpus.documents.readOrThrow( document ), + user = user, + targetFormat = targetFormat, + ) + } +} + +class DocumentTransformMetadata( + val corpus: Corpus, + val job: Job, + val document: Document, + val user: User, + val targetFormat: DocumentFormat +) { + + val layer: Layer = job.document(document.name).result + + val plainText: String + get() = document.plaintext + + fun convertLayerToPosHead() { + for (i in layer.terms.indices) { + val t = layer.terms[i] + layer.terms[i] = Term( + lemma = t.lemma, + pos = t.posHeadGroup, + targets = t.targets) + } + } +} diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/csv/CSVFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/csv/CSVFile.kt index 6615ea3..7a68c0a 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/csv/CSVFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/csv/CSVFile.kt @@ -10,7 +10,7 @@ class CSVFile( ) : File(path.toURI()) { init { - this.appendText(getExcelCompatibilityHeader()) + this.appendText(excelCompatibilityHeader) } /** Append Excel compatible text. */ @@ -33,12 +33,13 @@ class CSVFile( // This is sad and I don't know of a clean solution short of having a configuration option 'excel-compatibility' // We will just except this loss and accommodate Excel/Windows user who might be less technically skilled // and hope that the Linux crowd will be able to handle this slightly weird header themselves - fun getExcelCompatibilityHeader(): String { - // Force Excel to read the csv as UTF16LE. Needed to render e.g. 'Ć¼'. - // https://en.wikipedia.org/wiki/Byte_order_mark - val bom = '\uFEFF' - return "${bom}sep=,\n" - } + val excelCompatibilityHeader: String + get() { + // Force Excel to read the csv as UTF16LE. Needed to render e.g. 'Ć¼'. + // https://en.wikipedia.org/wiki/Byte_order_mark + val bom = '\uFEFF' + return "${bom}sep=,\n" + } fun toCSVHeader(headers: List): CSVHeader { // This is just an alias diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/FoliaReader.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/FoliaReader.kt index 4b9a4d7..eda7688 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/FoliaReader.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/FoliaReader.kt @@ -1,9 +1,9 @@ package org.ivdnt.galahad.port.folia +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.Term import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.port.folia.export.deepcopy import org.ivdnt.galahad.port.xml.reparseText import org.ivdnt.galahad.port.xml.tagName @@ -18,8 +18,7 @@ class FoliaReader( val file: File, val nodeHandler: (node: Node, offset: Int, document: Document) -> Unit, ) { - - val xmlDoc = getXmlBuilder().parse(file) + val xmlDoc: Document = getXmlBuilder().parse(file) val plainTextBuilder: StringBuilder = StringBuilder() val sourceLayer = Layer(SOURCE_LAYER_NAME) @@ -83,9 +82,9 @@ class FoliaReader( } "s" -> { - addNonFloatingString("\n") + nonFloatingNL() recurse() - addNonFloatingString("\n") + nonFloatingNL() previousWasW = false } "p" -> { @@ -103,18 +102,15 @@ class FoliaReader( } /** Adds a newline if the last character exists and is a non newline.*/ - private fun addNonFloatingString(str: String) { - if (plainTextBuilder.isNotEmpty() && !plainTextBuilder.endsWith(str)) { - plainTextBuilder.append(str) + private fun nonFloatingNL() { + if (plainTextBuilder.isNotEmpty() && !plainTextBuilder.endsWith("\n")) { + plainTextBuilder.append("\n") } } private fun nonFloatingDoubleNL() { - if (plainTextBuilder.isEmpty()) - return - else if (!plainTextBuilder.endsWith("\n")) { - plainTextBuilder.append("\n\n") - } else if (!plainTextBuilder.endsWith("\n\n")) { + nonFloatingNL() + if (plainTextBuilder.isNotEmpty() && !plainTextBuilder.endsWith("\n\n")) { plainTextBuilder.append("\n") } } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt index 7e8da15..9125da0 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/FoliaTextMerger.kt @@ -1,151 +1,150 @@ -package org.ivdnt.galahad.port.folia.export - -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.port.tei.export.TEITextMerger -import org.ivdnt.galahad.util.insertFirst -import org.ivdnt.galahad.port.xml.reparseText -import org.ivdnt.galahad.port.xml.tagName -import org.ivdnt.galahad.util.childOrNull -import org.ivdnt.galahad.util.insertAfter -import org.w3c.dom.Document -import org.w3c.dom.Element -import org.w3c.dom.Node -import org.w3c.dom.NodeList - -fun NodeList.deepcopy(): ArrayList { - val copy = ArrayList() - for (i in 0 until this.length) { - copy.add(this.item(i)) - } - return copy -} - -class FoliaTextMerger( - node: Node, offset: Int, document: Document, wordFormIter: ListIterator, - deleteList: ArrayList, layer: Layer -) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) { - - override fun merge() { - if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) { - // We are going to add nodes when exporting, but we don't want to iterate over them - // So better store references to the current nodes and use them - val parent = node - val oldChildNodes = node.childNodes.deepcopy() - var endsWithSpace = true - for (child in oldChildNodes) { - node = child - val text = reparseText(child.textContent) - - if (node.nodeType == Node.TEXT_NODE) { - child.textContent = text - } - // TODO reparseText overwrites embedded t-styles - - // never set the offset of more than one space. - if (endsWithSpace && text.startsWith(" ")) { - offset -= 1 - } - merge() - - // Keep track of the ending space - if (text.isNotEmpty() && text.endsWith(" ")) - endsWithSpace = true - else if (text.isNotEmpty()) - endsWithSpace = false - - if (child.nodeType == Node.TEXT_NODE) - offset += text.length - } - // Remove parent and transfer children. - if(markForDeletion(parent)) { - var last = parent - for (i in parent.childNodes.length - 1 downTo 0) { - val c = parent.childNodes.item(i) - parent.parentNode.insertBefore(c, last) - last = c - } - } - - } else { - super.merge() - } - } - - override fun createWTag(wf: WordForm): Element { - val wTag = node.parentNode.cloneNode(false) - return wTag as Element - } - - override fun addWordForm(previousEndOffset: Int, wf: WordForm) { - super.addWordForm(previousEndOffset, wf) - // For Folia, newWTag is actually a or tag. - var tTag: Node = newWTag!! - // Make sure tTag points to a . For e.g. a , grab the first parent. - var parent = - if (tTag.parentNode.tagName() == "t") tTag.parentNode - else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself. - while (tTag.tagName() != "t") { - val clone = parent.cloneNode(false) - tTag.parentNode.replaceChild(clone,tTag) - clone.insertFirst(tTag) - // Ready for next iter. - parent = parent.parentNode - tTag = clone - } - // Create the which will contain the - val wTag = document.createElement("w") - val term = layer.termForWordForm(wf) - wTag.addTerm(term) - // Contain it. - tTag.parentNode.replaceChild(wTag,tTag) - wTag.insertFirst(tTag) - } - - override fun handleElementNode() { - val element = node as Element - if (element.tagName != "w") return - - val wordFormToAdd = getWordFormForOffsetOrNull() ?: return - val term = layer.termForWordForm(wordFormToAdd) - element.addTerm(term) - } - - override fun moveWTagUp(wTag: Element): Element { - wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode) - val clone = wTag.parentNode.cloneNode(false) - wTag.parentNode.replaceChild(clone, wTag) - clone.appendChild(wTag) - newWTag = clone as Element - return clone - } - - private fun Element.addTerm(term: Term) { - this.addTermFeature("lemma", term.lemma.toString()) - this.addTermFeature("pos", term.pos.toString(), term.posHeadGroup) - } - - private fun Element.addTermFeature(name: String, value: String, head: String? = null) { - /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags - * uncomment this. For now, multiple annotation layers are okay in the export. - // Find the child elements of [name] and delete them - val children = this.childNodes.deepcopy() - for (child in children) { - if (child.tagName() == name) { - this.removeChild(child) - } - }*/ - - // Create a new child element of [name] - val child = this.ownerDocument.createElement(name) - child.setAttribute("class", value) - // For PoS - if (head != null) child.setAttribute("head", head) - // Folia metadata. - child.setAttribute("processor", layer.name) - child.setAttribute("set", layer.name) - this.appendChild(child) - } +package org.ivdnt.galahad.port.folia.export + +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.data.layer.WordForm +import org.ivdnt.galahad.port.tei.export.TEITextMerger +import org.ivdnt.galahad.port.xml.reparseText +import org.ivdnt.galahad.port.xml.tagName +import org.ivdnt.galahad.util.insertAfter +import org.ivdnt.galahad.util.insertFirst +import org.w3c.dom.Document +import org.w3c.dom.Element +import org.w3c.dom.Node +import org.w3c.dom.NodeList + +fun NodeList.deepcopy(): ArrayList { + val copy = ArrayList() + for (i in 0 until this.length) { + copy.add(this.item(i)) + } + return copy +} + +class FoliaTextMerger( + node: Node, offset: Int, document: Document, wordFormIter: ListIterator, + deleteList: ArrayList, layer: Layer +) : TEITextMerger(node, offset, document, wordFormIter, deleteList, layer, DocumentFormat.Folia) { + + override fun merge() { + if (node.tagName() == "t" || node.tagName()?.startsWith("t-") == true) { + // We are going to add nodes when exporting, but we don't want to iterate over them + // So better store references to the current nodes and use them + val parent = node + val oldChildNodes = node.childNodes.deepcopy() + var endsWithSpace = true + for (child in oldChildNodes) { + node = child + val text = reparseText(child.textContent) + + if (node.nodeType == Node.TEXT_NODE) { + child.textContent = text + } + // TODO reparseText overwrites embedded t-styles + + // never set the offset of more than one space. + if (endsWithSpace && text.startsWith(" ")) { + offset -= 1 + } + merge() + + // Keep track of the ending space + if (text.isNotEmpty() && text.endsWith(" ")) + endsWithSpace = true + else if (text.isNotEmpty()) + endsWithSpace = false + + if (child.nodeType == Node.TEXT_NODE) + offset += text.length + } + // Remove parent and transfer children. + if(markForDeletion(parent)) { + var last = parent + for (i in parent.childNodes.length - 1 downTo 0) { + val c = parent.childNodes.item(i) + parent.parentNode.insertBefore(c, last) + last = c + } + } + + } else { + super.merge() + } + } + + override fun createWTag(wf: WordForm): Element { + val wTag = node.parentNode.cloneNode(false) + return wTag as Element + } + + override fun addWordForm(previousEndOffset: Int, wf: WordForm) { + super.addWordForm(previousEndOffset, wf) + // For Folia, newWTag is actually a or tag. + var tTag: Node = newWTag!! + // Make sure tTag points to a . For e.g. a , grab the first parent. + var parent = + if (tTag.parentNode.tagName() == "t") tTag.parentNode + else tTag.parentNode.parentNode // First iteration looks at grandparent, because t-style copied itself. + while (tTag.tagName() != "t") { + val clone = parent.cloneNode(false) + tTag.parentNode.replaceChild(clone,tTag) + clone.insertFirst(tTag) + // Ready for next iter. + parent = parent.parentNode + tTag = clone + } + // Create the which will contain the + val wTag = document.createElement("w") + val term = layer.termForWordForm(wf) + wTag.addTerm(term) + // Contain it. + tTag.parentNode.replaceChild(wTag,tTag) + wTag.insertFirst(tTag) + } + + override fun handleElementNode() { + val element = node as Element + if (element.tagName != "w") return + + val wordFormToAdd = getWordFormForOffsetOrNull() ?: return + val term = layer.termForWordForm(wordFormToAdd) + element.addTerm(term) + } + + override fun moveWTagUp(wTag: Element): Element { + wTag.parentNode.parentNode.insertAfter(wTag, wTag.parentNode) + val clone = wTag.parentNode.cloneNode(false) + wTag.parentNode.replaceChild(clone, wTag) + clone.appendChild(wTag) + newWTag = clone as Element + return clone + } + + private fun Element.addTerm(term: Term) { + this.addTermFeature("lemma", term.lemmaOrEmpty) + this.addTermFeature("pos", term.posOrEmpty, term.posHeadGroupOrEmpty) + } + + private fun Element.addTermFeature(name: String, value: String, head: String? = null) { + /* If at some point we want to remove existing annotations layers (pos & lemma) in folia tags + * uncomment this. For now, multiple annotation layers are okay in the export. + // Find the child elements of [name] and delete them + val children = this.childNodes.deepcopy() + for (child in children) { + if (child.tagName() == name) { + this.removeChild(child) + } + }*/ + + // Create a new child element of [name] + val child = this.ownerDocument.createElement(name) + child.setAttribute("class", value) + // For PoS + if (head != null) child.setAttribute("head", head) + // Folia metadata. + child.setAttribute("processor", layer.name) + child.setAttribute("set", layer.name) + this.appendChild(child) + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt index 5eecf59..777de28 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/folia/export/LayerToFoliaConverter.kt @@ -1,82 +1,82 @@ -package org.ivdnt.galahad.port.folia.export - -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Term -import org.ivdnt.galahad.port.DocumentTransformMetadata -import org.ivdnt.galahad.port.LayerConverter -import org.ivdnt.galahad.port.LayerTransformer -import org.ivdnt.galahad.util.XMLWriter -import org.ivdnt.galahad.util.escapeXML -import org.ivdnt.galahad.util.toValidXmlId -import java.io.OutputStream - -class LayerToFoliaConverter ( - transformMetadata: DocumentTransformMetadata, -) : LayerConverter, LayerTransformer( transformMetadata ) { - - override val format: DocumentFormat - get() = DocumentFormat.Folia - - val id: String - get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId() - - override fun convert(outputStream: OutputStream) { - val taggerName = tagger.id - val writer = XMLWriter(outputStream) - // XML Header - writer.writeLineRaw("") - writer.openTag("") - // Metadata - writeMetadata(writer, taggerName) - // Textbody - writeTextBody(writer, taggerName) - } - - private fun writeMetadata(writer: XMLWriter, taggerName: String) { - writer.openTag("") - // Annotations - writer.openTag("") - writer.writeLine("") - writer.writeLine("") - writer.writeLine("") - writer.writeLine("") - for (annotation in setOf("lemma", "pos")) { - writer.openTag("<$annotation-annotation set=\"${taggerName}\">") - writer.writeLine("") - writer.closeTag("") - } - writer.closeTag("") - // Provenance - writer.openTag("") - writer.writeLine( - "" - ) - writer.closeTag("") - writer.closeTag("") - } - - private fun writeTextBody(writer: XMLWriter, taggerName: String) { - writer.openTag("") - writer.openTag("

") - for ((index, term) in this.result.terms.withIndex()) { - // Single W - writeSingleW(writer, index, term, taggerName) - } - writer.closeTag("

") - writer.closeTag("
") - writer.closeTag("
") - } - - private fun writeSingleW( - writer: XMLWriter, index: Int, term: Term, - taggerName: String, - ) { - writer.openTag("") - writer.writeLine("${term.targets[0].literal.escapeXML()}") - writer.writeLine("") - writer.writeLine( - "" - ) - writer.closeTag("") - } +package org.ivdnt.galahad.port.folia.export + +import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.layer.Term +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.LayerConverter +import org.ivdnt.galahad.port.LayerTransformer +import org.ivdnt.galahad.util.XMLWriter +import org.ivdnt.galahad.util.escapeXML +import org.ivdnt.galahad.util.toValidXmlId +import java.io.OutputStream + +class LayerToFoliaConverter ( + transformMetadata: DocumentTransformMetadata, +) : LayerConverter, LayerTransformer( transformMetadata ) { + + override val format: DocumentFormat + get() = DocumentFormat.Folia + + val id: String + get() = document.getUploadedRawFile().nameWithoutExtension.toValidXmlId() + + override fun convert(outputStream: OutputStream) { + val taggerName = tagger.id + val writer = XMLWriter(outputStream) + // XML Header + writer.writeLineRaw("") + writer.openTag("") + // Metadata + writeMetadata(writer, taggerName) + // Textbody + writeTextBody(writer, taggerName) + } + + private fun writeMetadata(writer: XMLWriter, taggerName: String) { + writer.openTag("") + // Annotations + writer.openTag("") + writer.writeLine("") + writer.writeLine("") + writer.writeLine("") + writer.writeLine("") + for (annotation in setOf("lemma", "pos")) { + writer.openTag("<$annotation-annotation set=\"${taggerName}\">") + writer.writeLine("") + writer.closeTag("") + } + writer.closeTag("") + // Provenance + writer.openTag("") + writer.writeLine( + "" + ) + writer.closeTag("") + writer.closeTag("") + } + + private fun writeTextBody(writer: XMLWriter, taggerName: String) { + writer.openTag("") + writer.openTag("

") + for ((index, term) in this.result.terms.withIndex()) { + // Single W + writeSingleW(writer, index, term, taggerName) + } + writer.closeTag("

") + writer.closeTag("
") + writer.closeTag("
") + } + + private fun writeSingleW( + writer: XMLWriter, index: Int, term: Term, + taggerName: String, + ) { + writer.openTag("") + writer.writeLine("${term.targets[0].literal.escapeXML()}") + writer.writeLine("") + writer.writeLine( + "" + ) + writer.closeTag("") + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/NAFFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/NAFFile.kt index 37063bf..f287c73 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/NAFFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/NAFFile.kt @@ -1,10 +1,10 @@ package org.ivdnt.galahad.port.naf import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.Term import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.port.DocumentTransformMetadata import org.ivdnt.galahad.port.PlainTextableFile import org.ivdnt.galahad.port.SourceLayerableFile @@ -26,11 +26,11 @@ class NAFFile ( val xmlDoc: Document = getXmlBuilder().parse(file) - val xPathfactory = XPathFactory.newInstance() - val xpath = xPathfactory.newXPath() - val expr = xpath.compile("/NAF/raw") - val wfExpr = xpath.compile("/NAF/text/wf") - val termExpr = xpath.compile("/NAF/terms/term") + private val xPathfactory = XPathFactory.newInstance() + private val xpath = xPathfactory.newXPath() + private val expr = xpath.compile("/NAF/raw") + private val wfExpr = xpath.compile("/NAF/text/wf") + private val termExpr = xpath.compile("/NAF/terms/term") override fun plainTextReader(): StringReader { return (expr.evaluate(xmlDoc, XPathConstants.NODE ) as Node).textContent.reader() diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt index 0906d6b..4ff753b 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/naf/export/LayerToNAFConverter.kt @@ -57,8 +57,8 @@ class LayerToNAFConverter ( val xterm = xmlDoc.createElement("term") terms.appendChild(xterm) xterm.setAttribute("id", "t$index") - xterm.setAttribute("lemma", term.lemma) - xterm.setAttribute("pos", term.pos) + xterm.setAttribute("lemma", term.lemmaOrEmpty) + xterm.setAttribute("pos", term.posOrEmpty) val xspan = xmlDoc.createElement("span") xterm.appendChild( xspan ) diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/TEIFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/TEIFile.kt index b601164..4edc2b3 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/TEIFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/TEIFile.kt @@ -1,9 +1,9 @@ package org.ivdnt.galahad.port.tei import org.ivdnt.galahad.app.executeAndLogTime -import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.data.document.FormatInducer +import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.port.DocumentTransformMetadata import org.ivdnt.galahad.port.tei.export.TEILayerMerger import org.ivdnt.galahad.port.xml.AnnotatedFile diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt index 118b8f3..109c26d 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/LayerToTEIConverter.kt @@ -112,8 +112,8 @@ class LayerToTEIConverter( writer.writeRaw("${getLiteral()}") } else { // If it is not punctuation, safely assume it can be interpreted as - val lemma = term.lemma?.escapeXML() - val pos = term.pos?.escapeXML() + val lemma = term.lemmaOrEmpty.escapeXML() + val pos = term.posOrEmpty.escapeXML() writer.writeRaw("${getLiteral()}") } } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEIMetadata.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEIMetadata.kt index 3d6af72..456c9cf 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEIMetadata.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEIMetadata.kt @@ -5,15 +5,11 @@ import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.port.LayerTransformer import org.ivdnt.galahad.port.xml.XMLMetadata import org.ivdnt.galahad.util.childOrNull -import org.ivdnt.galahad.util.getXmlBuilder import org.ivdnt.galahad.util.toNonEmptyString import org.w3c.dom.Document import org.w3c.dom.Element import org.w3c.dom.Node -import org.xml.sax.InputSource -import java.io.ByteArrayOutputStream import java.io.OutputStream -import java.io.StringReader import java.util.* import javax.xml.transform.OutputKeys import javax.xml.transform.Transformer diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt index 50cddb5..37f03e5 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tei/export/TEITextMerger.kt @@ -1,17 +1,16 @@ package org.ivdnt.galahad.port.tei.export import org.ivdnt.galahad.app.report.Report +import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.data.layer.WordForm +import org.ivdnt.galahad.evaluation.comparison.LayerComparison.Companion.truncatedPcMatch import org.ivdnt.galahad.port.folia.export.deepcopy import org.ivdnt.galahad.port.xml.getPlainTextContent +import org.ivdnt.galahad.util.* import org.w3c.dom.Document import org.w3c.dom.Element import org.w3c.dom.Node -import java.util.* -import kotlin.collections.ArrayList -import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.util.* fun HashSet.contains(s: String?, ignoreCase: Boolean = false): Boolean { return any { it.equals(s, ignoreCase) } @@ -234,11 +233,11 @@ open class TEITextMerger( n } else { val n = document.createElement("w") - n.setAttribute("lemma", termToAdd.lemma) + n.setAttribute("lemma", termToAdd.lemmaOrEmpty) n } // Both and have a pos. - wTag.setAttribute(posType(), termToAdd.pos) + wTag.setAttribute(posType(), termToAdd.posOrEmpty) return wTag } @@ -269,8 +268,9 @@ open class TEITextMerger( if (wordFormToAdd != null) { // remove all whitespace within a -tag (although this rarely occurs anyway). val sourceLiteral = node.getPlainTextContent().replace(Regex("""\s"""), "") - if (wordFormToAdd.literal == sourceLiteral) { - // This is a simple case since the tokenization matches + if (wordFormToAdd.literal == sourceLiteral // This is a simple case since the tokenization matches + || truncatedPcMatch(sourceLiteral, wordFormToAdd.literal) // Also match with single punctuation (e.g. word. -> word) + ) { mergeWTag(wordFormToAdd, element) } else { // Tokenization mismatch, report it @@ -303,9 +303,9 @@ open class TEITextMerger( val termToAdd = layer.termForWordForm(wordFormToAdd) // tags do not have a lemma. if (element.tagName == "w") { - element.setAttribute("lemma", termToAdd.lemma) + element.setAttribute("lemma", termToAdd.lemmaOrEmpty) } - element.setAttribute(posType(), termToAdd.pos) + element.setAttribute(posType(), termToAdd.posOrEmpty) element.removeAttribute("type") // Update legacy formats to TEI p5 } diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/TSVFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/TSVFile.kt index 36d13fd..674f1ae 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/TSVFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/TSVFile.kt @@ -1,9 +1,12 @@ package org.ivdnt.galahad.port.tsv import org.ivdnt.galahad.data.document.DocumentFormat -import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME -import org.ivdnt.galahad.port.* +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.InternalFile +import org.ivdnt.galahad.port.PlainTextableFile +import org.ivdnt.galahad.port.SourceLayerableFile import org.ivdnt.galahad.port.conllu.ConlluFile import org.ivdnt.galahad.port.tsv.export.TSVLayerMerger import java.io.File diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt index cb87b06..36a278a 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/LayerToTSVConverter.kt @@ -18,8 +18,8 @@ class LayerToTSVConverter( outputStream.write("word\tlemma\tpos\n".encodeToByteArray()) // 'word' is the blacklab default // Body result.terms.forEach { - // Note that this might have weird result for multi-wordform tokens - outputStream.write("${it.literals}\t${it.lemma}\t${it.pos}\n".encodeToByteArray()) + // Explicitly non-null. + outputStream.write("${it.literals}\t${it.lemmaOrEmpty}\t${it.posOrEmpty}\n".encodeToByteArray()) } } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt index a268c3b..24666d0 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/tsv/export/TSVLayerMerger.kt @@ -1,63 +1,63 @@ -package org.ivdnt.galahad.port.tsv.export - -import org.ivdnt.galahad.data.layer.Layer -import org.ivdnt.galahad.port.DocumentTransformMetadata -import org.ivdnt.galahad.port.LayerMerger -import org.ivdnt.galahad.port.LayerTransformer -import org.ivdnt.galahad.port.tsv.TSVFile -import java.io.File -import kotlin.io.path.createTempDirectory - -/** - * Do not call directly. Use [TSVFile.merge] instead. - */ -internal open class TSVLayerMerger( - open val sourceFile: TSVFile, - transformMetadata: DocumentTransformMetadata, -) : LayerMerger, LayerTransformer(transformMetadata) { - val layer = transformMetadata.layer - val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name) - protected open val hasHeader: Boolean = true - /** - * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile. - * Read in per line, split on tabs, swap out pos & lemma and commit to new file - */ - override fun merge(): TSVFile { - sourceFile.parse() // parse the sourceFile if needed. - parseByLine() - return TSVFile(outFile) - } - - protected fun parseByLine() { - var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account. - sourceFile.file.inputStream().bufferedReader().forEachLine { line -> - if (termIndex == -1) { - // Copy header to output & continue - outFile.appendText(line + "\n") - termIndex++ - } else { - val columns = line.split("\t").toMutableList() - if (columns.size >= 3) { - // Swap out pos & lemma, keep the rest. - replaceColumns(columns, layer, termIndex) - outFile.appendText(columns.joinToString("\t") + "\n") - termIndex++ - } else { - // Output whatever was on that line. Presumably whitespace. - outFile.appendText(line + "\n") - } - } - } - } - - /* - * Replace the PoS and lemma values in their previously indexed columns. - */ - protected open fun replaceColumns( - columns: MutableList, layer: Layer, - termIndex: Int, - ) { - columns[sourceFile.posIndex!!] = layer.terms[termIndex].pos.toString() - columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemma.toString() - } +package org.ivdnt.galahad.port.tsv.export + +import org.ivdnt.galahad.data.layer.Layer +import org.ivdnt.galahad.port.DocumentTransformMetadata +import org.ivdnt.galahad.port.LayerMerger +import org.ivdnt.galahad.port.LayerTransformer +import org.ivdnt.galahad.port.tsv.TSVFile +import java.io.File +import kotlin.io.path.createTempDirectory + +/** + * Do not call directly. Use [TSVFile.merge] instead. + */ +internal open class TSVLayerMerger( + open val sourceFile: TSVFile, + transformMetadata: DocumentTransformMetadata, +) : LayerMerger, LayerTransformer(transformMetadata) { + val layer = transformMetadata.layer + val outFile: File = createTempDirectory("teimerge").toFile().resolve(transformMetadata.document.name) + protected open val hasHeader: Boolean = true + /** + * Merge uploaded raw file with tagger layer. Headers indices are already determined by TSVFile. + * Read in per line, split on tabs, swap out pos & lemma and commit to new file + */ + override fun merge(): TSVFile { + sourceFile.parse() // parse the sourceFile if needed. + parseByLine() + return TSVFile(outFile) + } + + protected fun parseByLine() { + var termIndex = if (hasHeader) -1 else 0 // Start at -1 to take the header into account. + sourceFile.file.inputStream().bufferedReader().forEachLine { line -> + if (termIndex == -1) { + // Copy header to output & continue + outFile.appendText(line + "\n") + termIndex++ + } else { + val columns = line.split("\t").toMutableList() + if (columns.size >= 3) { + // Swap out pos & lemma, keep the rest. + replaceColumns(columns, layer, termIndex) + outFile.appendText(columns.joinToString("\t") + "\n") + termIndex++ + } else { + // Output whatever was on that line. Presumably whitespace. + outFile.appendText(line + "\n") + } + } + } + } + + /* + * Replace the PoS and lemma values in their previously indexed columns. + */ + protected open fun replaceColumns( + columns: MutableList, layer: Layer, + termIndex: Int, + ) { + columns[sourceFile.posIndex!!] = layer.terms[termIndex].posOrEmpty + columns[sourceFile.lemmaIndex!!] = layer.terms[termIndex].lemmaOrEmpty + } } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt index de5826d..9f084a1 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/BLFXMLParser.kt @@ -1,10 +1,10 @@ package org.ivdnt.galahad.port.xml import org.ivdnt.galahad.data.document.DocumentFormat +import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.Term import org.ivdnt.galahad.data.layer.WordForm -import org.ivdnt.galahad.data.document.SOURCE_LAYER_NAME import org.ivdnt.galahad.port.BLFXML import org.ivdnt.galahad.util.getXmlBuilder import org.w3c.dom.Document @@ -33,8 +33,8 @@ fun Node.tagName(): String? { } /** - * Should the text text inside this node be interpreted as source text? - * Asssumes we are already inside of a text container e.g. + * Should the text inside this node be interpreted as source text? + * Assumes we are already inside a text container e.g. */ private fun Node.isTextable(): Boolean { if( this.tagName() == "note" && this.attributes.getNamedItem("type")?.textContent == "editorial" ) { @@ -210,7 +210,7 @@ class BLFXMLParser ( - private fun addPlaintext( literal: String ) { + private fun addPlaintext(literal: String) { plainTextOutputStream.write( literal.toByteArray() ) offset += literal.length @@ -277,20 +277,32 @@ class BLFXMLParser ( } private fun handleWordOrPunctNode( node: Node ) { - // custom node handling - nodeHandler(node, offset, xmlDocument) + // Handle cases like ab -> "a b" (add space in plaintext) + val needsSpacing = node.tagName() == "w" && plaintextTail.isNotBlank() && !Regex("""\s$""").containsMatchIn(plaintextTail) + val spaceOffset = if (needsSpacing) 1 else 0 + val trueWordOffset = offset + spaceOffset + + // Handle merging + nodeHandler(node, trueWordOffset, xmlDocument) + // Extraction val literal = literalExtractor(node).trim() // wordPathExpression.evaluate( node ) val lem = lemmaExtractor(node) // lemPathExpression.evaluate( node ) val pos = posExtractor(node) // posPathExpression.evaluate( node ) val id = idExtractor(node) - val wordForm = WordForm( literal, offset, literal.length, id ?: "no-id" ) - sourceLayer.wordForms.add( wordForm ) - addPlaintext(literal) - + // Add the word to the source layer + val wordForm = WordForm(literal, trueWordOffset, literal.length, id ?: "no-id" ) val term = Term(lem, pos, mutableListOf(wordForm)) - sourceLayer.terms.add( term ) + sourceLayer.wordForms.add(wordForm) + sourceLayer.terms.add(term) + + // Add the word to the plaintext + var text = literal.trim() + if (needsSpacing) { + text = " $text" + } + addPlaintext(text) } fun xmlToString(pretty: Boolean): String { diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLFile.kt index 8c4711a..1615223 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLFile.kt @@ -1,9 +1,7 @@ package org.ivdnt.galahad.port.xml import org.ivdnt.galahad.port.InternalFile -import org.xml.sax.Attributes import java.io.File -import java.util.* abstract class XMLFile( final override val file: File, diff --git a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLMetadata.kt b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLMetadata.kt index c73c98b..085cc85 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLMetadata.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/port/xml/XMLMetadata.kt @@ -29,7 +29,7 @@ open class XMLMetadata( } /** - * Add a tag to [this] with [name], [textContent], and optional [attrValue] and [targetAttr]. + * Add a tag to [this] with [name], [textContent], and optional [attrValue] * Defaults to writing attribute @type. */ protected fun Node.createChild( diff --git a/server/src/main/kotlin/org/ivdnt/galahad/taggers/TaggersController.kt b/server/src/main/kotlin/org/ivdnt/galahad/taggers/TaggersController.kt index a90dbb6..28dbf6a 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/taggers/TaggersController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/taggers/TaggersController.kt @@ -4,25 +4,25 @@ import com.beust.klaxon.JsonObject import com.beust.klaxon.Parser import com.beust.klaxon.Parser.Companion.default import com.fasterxml.jackson.annotation.JsonProperty +import jakarta.servlet.http.HttpServletRequest +import jakarta.servlet.http.HttpServletResponse import org.apache.logging.log4j.kotlin.Logging import org.ivdnt.galahad.app.TAGGERS_URL import org.ivdnt.galahad.app.TAGGER_HEALTH_URL import org.ivdnt.galahad.app.TAGGER_URL import org.springframework.beans.factory.annotation.Autowired +import org.springframework.http.HttpMethod import org.springframework.web.bind.annotation.CrossOrigin import org.springframework.web.bind.annotation.GetMapping import org.springframework.web.bind.annotation.PathVariable import org.springframework.web.bind.annotation.RestController +import org.springframework.web.client.RestTemplate +import org.springframework.web.util.UriComponentsBuilder import java.net.URI +import java.net.URL import java.net.http.HttpClient import java.net.http.HttpRequest import java.net.http.HttpResponse -import jakarta.servlet.http.HttpServletRequest -import jakarta.servlet.http.HttpServletResponse -import org.springframework.http.HttpMethod -import org.springframework.web.client.RestTemplate -import org.springframework.web.util.UriComponentsBuilder -import java.net.URL @RestController class TaggersController : Logging { diff --git a/server/src/main/kotlin/org/ivdnt/galahad/tagset/TagsetController.kt b/server/src/main/kotlin/org/ivdnt/galahad/tagset/TagsetController.kt index dc0d232..bade02a 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/tagset/TagsetController.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/tagset/TagsetController.kt @@ -3,7 +3,10 @@ package org.ivdnt.galahad.tagset import org.apache.logging.log4j.kotlin.Logging import org.ivdnt.galahad.app.TAGSETS_URL import org.springframework.http.HttpStatus -import org.springframework.web.bind.annotation.* +import org.springframework.web.bind.annotation.CrossOrigin +import org.springframework.web.bind.annotation.GetMapping +import org.springframework.web.bind.annotation.PathVariable +import org.springframework.web.bind.annotation.RestController import org.springframework.web.server.ResponseStatusException @RestController diff --git a/server/src/main/kotlin/org/ivdnt/galahad/util/ResourceUtil.kt b/server/src/main/kotlin/org/ivdnt/galahad/util/ResourceUtil.kt index d94624e..4c8644c 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/util/ResourceUtil.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/util/ResourceUtil.kt @@ -1,6 +1,7 @@ -package org.ivdnt.galahad.util - -import java.io.InputStream - -fun getResourceStream(path: String): InputStream? = - object {}.javaClass.classLoader.getResourceAsStream(path) +package org.ivdnt.galahad.util + +import java.io.InputStream + +/** Get a resource from src/main/resources. */ +fun getResourceStream(path: String): InputStream? = + object {}.javaClass.classLoader.getResourceAsStream(path) diff --git a/server/src/main/kotlin/org/ivdnt/galahad/util/StringExtensions.kt b/server/src/main/kotlin/org/ivdnt/galahad/util/StringExtensions.kt index 0bcd00a..6a52a48 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/util/StringExtensions.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/util/StringExtensions.kt @@ -9,6 +9,7 @@ fun String.matchesUpTo(textToMatch: String): Int { return matchingIndex } +/** Same as toString, unless the caller is null or the resulting string is empty: then we use [default]. */ fun Any?.toNonEmptyString(default: String): String { return if (this == null || this.toString().isEmpty()) default else this.toString() } \ No newline at end of file diff --git a/server/src/main/kotlin/org/ivdnt/galahad/util/UTF8Util.kt b/server/src/main/kotlin/org/ivdnt/galahad/util/UTF8Util.kt index 8bb3d69..cbdd4d7 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/util/UTF8Util.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/util/UTF8Util.kt @@ -1,7 +1,7 @@ package org.ivdnt.galahad.util -import java.net.URLEncoder import jakarta.servlet.http.HttpServletResponse +import java.net.URLEncoder /** * A valid filename for windows and linux. Exceptions like COM1 still exist. diff --git a/server/src/main/kotlin/org/ivdnt/galahad/util/ZipFile.kt b/server/src/main/kotlin/org/ivdnt/galahad/util/ZipFile.kt index 9209b64..9e80cce 100644 --- a/server/src/main/kotlin/org/ivdnt/galahad/util/ZipFile.kt +++ b/server/src/main/kotlin/org/ivdnt/galahad/util/ZipFile.kt @@ -1,6 +1,9 @@ package org.ivdnt.galahad.util -import java.io.* +import java.io.BufferedOutputStream +import java.io.File +import java.io.FileOutputStream +import java.io.OutputStream import java.util.zip.ZipEntry import java.util.zip.ZipOutputStream diff --git a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt index 766e5c4..6bad97a 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/data/document/DocumentTest.kt @@ -91,7 +91,7 @@ class DocumentTest { // Convert to each other format for (formatTo in DocumentFormat.entries) { val meta = DocumentTransformMetadata( - corpus, job, doc, User("testUser") + corpus, job, doc, User("testUser"), formatTo ) when (formatTo) { // Skip the unsupported @@ -107,7 +107,7 @@ class DocumentTest { val result: File = doc.generateAs(formatTo, meta) val expected: File = Resource.get("all-formats/output/from-$formatFrom-to-$formatTo.${formatTo.extension}") val test = TestResult(expected.readText(), result.readText()) - test.ignoreDate().ignoreUUID().result() + test.ignoreDate().ignoreUUID().ignoreTrailingWhiteSpaces().result() } } } diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/Util.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/Util.kt index b219f24..670934d 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/Util.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/Util.kt @@ -9,6 +9,7 @@ import org.ivdnt.galahad.app.User import org.ivdnt.galahad.data.corpus.Corpus import org.ivdnt.galahad.data.corpus.MutableCorpusMetadata import org.ivdnt.galahad.data.document.Document +import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.data.layer.Term import org.ivdnt.galahad.data.layer.WordForm @@ -51,7 +52,7 @@ fun getJsonMapper(): ObjectMapper { fun createCorpus(workdir: File? = null, isDataset: Boolean = false, isAdmin: Boolean = false): Corpus { val parent = workdir ?: createTempDirectory().toFile() - val corpus = Corpus(parent.resolve(UUID.randomUUID().toString()), User("you")) + val corpus = Corpus(parent.resolve(UUID.randomUUID().toString())) corpus.updateMetadata( MutableCorpusMetadata( "you", @@ -189,16 +190,16 @@ class DocTestBuilder( /** The file extension is relevant, otherwise conversion will fail */ fun getDummyTransformMetadata( layer: Layer, - ext: String? = null, + format: DocumentFormat, file: File? = null, ): DocumentTransformMetadata { - val file = file ?: createTempDirectory().toFile().resolve("dummy.$ext") + val file = file ?: createTempDirectory().toFile().resolve("dummy.${format.extension}") file.createNewFile() val docName = corpus.documents.create(file) val job = corpus.jobs.createOrThrow(TestConfig.TAGGER_NAME) job.document(docName).setResult(layer) return DocumentTransformMetadata( - corpus, job, corpus.documents.readOrThrow(docName), User("testUser") + corpus, job, corpus.documents.readOrThrow(docName), User("testUser"), format ) } @@ -206,7 +207,7 @@ class DocTestBuilder( fun convertToTSV(layer: Layer): TestResult { val exporter = LayerToTSVConverter( - getDummyTransformMetadata(layer, "tsv") + getDummyTransformMetadata(layer, DocumentFormat.Tsv) ) val result = exporter.convertToFileNamed("test") return got(result.readText()) @@ -217,7 +218,7 @@ class DocTestBuilder( } fun mergeTSV(file: File, layer: Layer): TestResult { - val transformMetadata = getDummyTransformMetadata(layer, file = file) + val transformMetadata = getDummyTransformMetadata(layer, DocumentFormat.Tsv, file) val result: TSVFile = TSVFile(file).merge(transformMetadata) return got(result.file.readText()) } @@ -226,7 +227,7 @@ class DocTestBuilder( fun convertToConllu(layer: Layer): TestResult { val exporter = LayerToConlluConverter( - getDummyTransformMetadata(layer, "conllu") + getDummyTransformMetadata(layer, DocumentFormat.Conllu) ) val result = exporter.convertToFileNamed("test") return got(result.readText()) @@ -237,7 +238,7 @@ class DocTestBuilder( } fun mergeConllu(file: File, layer: Layer): TestResult { - val transformMetadata = getDummyTransformMetadata(layer, file = file) + val transformMetadata = getDummyTransformMetadata(layer, DocumentFormat.Conllu, file) val result: ConlluFile = ConlluFile(file).merge(transformMetadata) return got(result.file.readText()) } @@ -246,7 +247,7 @@ class DocTestBuilder( fun convertToNaf(file: File, layer: Layer): TestResult { val exporter = LayerToNAFConverter( - getDummyTransformMetadata(layer, file = file) + getDummyTransformMetadata(layer, DocumentFormat.Naf, file) ) val result = exporter.convertToFileNamed("test") return got(result.readText()) @@ -256,28 +257,26 @@ class DocTestBuilder( fun convertToFolia(file: File, layer: Layer): TestResult { val exporter = LayerToFoliaConverter( - getDummyTransformMetadata(layer, file = file) + getDummyTransformMetadata(layer, DocumentFormat.Folia, file) ) val result = exporter.convertToFileNamed("test") return got(result.readText()) } fun mergeFolia(file: File, layer: Layer): TestResult { - val transformMetadata = getDummyTransformMetadata(layer, file = file) + val transformMetadata = getDummyTransformMetadata(layer, DocumentFormat.Folia, file) val result: FoliaFile = FoliaFile(file).merge(transformMetadata) return got(result.file.readText()) } // TEI - fun convertToTEI(teiFile: File, layer: Layer): TestResult { - val docName = corpus.documents.create(teiFile) + fun convertToTEI(file: File, layer: Layer): TestResult { + val docName = corpus.documents.create(file) val job = corpus.jobs.createOrThrow(TestConfig.TAGGER_NAME) job.document(docName).setResult(layer) val exporter = LayerToTEIConverter( - DocumentTransformMetadata( - corpus, job, corpus.documents.readOrThrow(docName), User("testUser") - ) + getDummyTransformMetadata(layer, DocumentFormat.TeiP5, file) ) val result = exporter.convertToFileNamed("tst") @@ -289,7 +288,7 @@ class DocTestBuilder( } fun mergeTEI(file: File, layer: Layer): TestResult { - val transformMetadata = getDummyTransformMetadata(layer, file = file) + val transformMetadata = getDummyTransformMetadata(layer, DocumentFormat.TeiP5, file) val result: TEIFile = TEIFile(file).merge(transformMetadata) return got(result.file.readText()) } diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/naf/NafExportTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/naf/NafExportTest.kt index e99a21e..9637f40 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/naf/NafExportTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/naf/NafExportTest.kt @@ -1,6 +1,7 @@ package org.ivdnt.galahad.port.naf import org.ivdnt.galahad.data.corpus.Corpus +import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.port.DocTest import org.ivdnt.galahad.port.LayerBuilder import org.ivdnt.galahad.port.Resource @@ -38,7 +39,7 @@ class NafExportTest { @Test fun `Merge throws`() { val layer = LayerBuilder().loadDummies(1).build() - val meta = DocTest.builder(corpus).getDummyTransformMetadata(layer, "txt") + val meta = DocTest.builder(corpus).getDummyTransformMetadata(layer, DocumentFormat.Txt) assertThrows(Exception::class.java) { Resource.getDoc("naf/import/input.naf.xml").merge(meta) } diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/plain/PlainFileTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/plain/PlainFileTest.kt index a49a159..0d471d2 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/plain/PlainFileTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/plain/PlainFileTest.kt @@ -1,6 +1,7 @@ package org.ivdnt.galahad.port.plain import org.ivdnt.galahad.data.corpus.Corpus +import org.ivdnt.galahad.data.document.DocumentFormat import org.ivdnt.galahad.data.layer.Layer import org.ivdnt.galahad.port.DocTestBuilder import org.ivdnt.galahad.port.Resource @@ -30,7 +31,7 @@ internal class PlainFileTest { val builder = DocTestBuilder(corpus) val file = PlainFile(Resource.get("txt/input.txt")) assertThrows(Exception::class.java) { - file.merge(builder.getDummyTransformMetadata(Layer.EMPTY, "txt")) + file.merge(builder.getDummyTransformMetadata(Layer.EMPTY, DocumentFormat.Txt)) } } } \ No newline at end of file diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt index d729d06..0c6daae 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIExportTest.kt @@ -22,6 +22,9 @@ internal class TEIExportTest { @Test fun `Merge pie-tdn result with heavily twined tei`() { + val file = TEIFile(Resource.get("tei/twine/twine.input.xml")) + assertPlainText("tei/twine", file) + val plaintext: String = Resource.get("tei/twine/plaintext.txt").readText() val layer = LayerBuilder() .loadLayerFromTSV("tei/twine/pie-tdn.tsv", plaintext) @@ -37,6 +40,9 @@ internal class TEIExportTest { @Test fun `Merge a pie-tdn layer with a tei file that only contains plaintext`() { + val file = TEIFile(Resource.get("tei/brieven/input.tei.xml")) + assertPlainText("tei/brieven", file) + val plaintext: String = Resource.get("tei/brieven/plaintext.txt").readText() val layer = LayerBuilder() .loadLayerFromTSV("tei/brieven/pie.tsv", plaintext) @@ -152,7 +158,8 @@ internal class TEIExportTest { corpus = corpus, job = corpus.jobs.readOrThrow(jobName), document = corpus.documents.readOrThrow(docName), - user = User("test-user") + user = User("test-user"), + targetFormat = DocumentFormat.TeiP5 ) ) } @@ -170,7 +177,8 @@ internal class TEIExportTest { corpus = corpus, job = corpus.jobs.readOrThrow(jobName), document = corpus.documents.readOrThrow(teiUploadedFileName), - user = User("test-user") + user = User("test-user"), + targetFormat = DocumentFormat.TeiP5 ) ) } diff --git a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt index bb441c7..7ef9429 100644 --- a/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt +++ b/server/src/test/kotlin/org/ivdnt/galahad/port/tei/TEIImportTest.kt @@ -13,7 +13,7 @@ internal class TEIImportTest { @Test fun `Multiple text elements`() { val teiFile = TEIFile(Resource.get("tei/dummies/multipletextelements.xml")) - assertEquals("text1\ntext2text3", teiFile.plainTextReader().readText().trim()) + assertEquals("text1\ntext2 text3", teiFile.plainTextReader().readText().trim()) } @Test @@ -40,6 +40,13 @@ internal class TEIImportTest { // Has no source layer assertPlainText("tei/brieven", file) } + + @Test + fun `Import TEI with w-tags without spaces in between`() { + val file = TEIFile(Resource.get("tei/nospaces/input.tei.xml")) + assertEquals("a a a", file.plainTextReader().readText().trim()) + + } } @Nested diff --git a/server/src/test/resources/all-formats/input/input.folia.xml b/server/src/test/resources/all-formats/input/input.folia.xml index b5d232c..dc0cbf3 100644 --- a/server/src/test/resources/all-formats/input/input.folia.xml +++ b/server/src/test/resources/all-formats/input/input.folia.xml @@ -36,7 +36,7 @@ , - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,12 +116,12 @@ ? - + " - +

diff --git a/server/src/test/resources/all-formats/input/input.tsv b/server/src/test/resources/all-formats/input/input.tsv index 10def0d..38fa3a7 100644 --- a/server/src/test/resources/all-formats/input/input.tsv +++ b/server/src/test/resources/all-formats/input/input.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET \ No newline at end of file +? LET +" LET \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Conllu-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Folia-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml index 162375f..54f8f29 100644 --- a/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Naf-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Naf-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml index 63ffb00..6f28cee 100644 --- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-TeiP5-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Tsv-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml index 155bcef..95a563f 100644 --- a/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml +++ b/server/src/test/resources/all-formats/output/from-Txt-to-Folia.folia.xml @@ -36,7 +36,7 @@
, - + @@ -51,17 +51,17 @@ . - + . - + . - + @@ -86,12 +86,12 @@ : - + " - + @@ -116,14 +116,14 @@ ? - + " - +

- + \ No newline at end of file diff --git a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv index d75894e..3764c09 100644 --- a/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv +++ b/server/src/test/resources/all-formats/output/from-Txt-to-Tsv.tsv @@ -2,21 +2,21 @@ word lemma pos Hebban hebben VRB(finiteness=fin,tense=past) olla olle RES(type=for) vogala vogala NOU-C(number=sg) -, null LET +, LET nestas nestatis RES(type=for) hagunnan haguna NOU-C(number=sg) -. null LET -. null LET -. null LET +. LET +. LET +. LET Hinase hinas NOU-P ic ik PD(type=pers,position=free) ende en CONJ(type=coor) thu huben ADV(type=reg) -: null LET -" null LET +: LET +" LET uuat uiteen PD(type=w-p,position=free) unbidan unibent CONJ(type=sub) wi wij PD(type=pers,position=free) nu nu ADV(type=reg) -? null LET -" null LET +? LET +" LET diff --git a/server/src/test/resources/folia/twine/merged-output.folia.xml b/server/src/test/resources/folia/twine/merged-output.folia.xml index a95f5ff..331bd13 100644 --- a/server/src/test/resources/folia/twine/merged-output.folia.xml +++ b/server/src/test/resources/folia/twine/merged-output.folia.xml @@ -23,7 +23,7 @@

- To obey or not to be,
that is theonequestion that yremained. + To obey or not to be,
that is theonequestion that yremained.

diff --git a/server/src/test/resources/tei/nospaces/input.tei.xml b/server/src/test/resources/tei/nospaces/input.tei.xml new file mode 100644 index 0000000..4ac729d --- /dev/null +++ b/server/src/test/resources/tei/nospaces/input.tei.xml @@ -0,0 +1,7 @@ + + +

+ aaa +

+ + \ No newline at end of file