Release 1.0.0

INL · Jun 4, 2024 · 3dbc5fa
2 parents 35171aa + 27c4142
commit 3dbc5fa
Show file tree

Hide file tree

Showing 22 changed files with 784 additions and 15 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "galahad-corpus-data"]
 	path = galahad-corpus-data
 	url = https://github.com/INL/galahad-corpus-data
+[submodule "galahad-taggers-dockerized"]
+	path = galahad-taggers-dockerized
+	url = https://github.com/INL/galahad-taggers-dockerized
diff --git a/Readme.md b/Readme.md
@@ -1,4 +1,4 @@
-# galahad-train-battery (0.9.1)
+# galahad-train-battery (1.0.0)
 Python program for training linguistic annotation taggers based on a configuration file and list of datasets. It prepares the resulting trained models for dockerization and adds relevant metadata. It is tagger software agnostic as long as a simple Python shell is built around it.
 
 ### GaLAHaD-related Repositories

diff --git a/codemeta.json b/codemeta.json
@@ -1,8 +1,8 @@
 {
     "@context": "https://w3id.org/codemeta/3.0",
     "@type": "SoftwareSourceCode",
-    "version": "0.9.1",
-    "dateModified": "2024-05-31",
+    "version": "1.0.0",
+    "dateModified": "2024-06-04",
     "dateCreated": "2024-05-31",
     "datePublished": "2024-05-31",
     "applicationCategory": [

diff --git a/configs/pie/tdn/config.json b/configs/pie/TDN-1400-1600/config.json
rename from configs/pie/tdn/config.json
rename to configs/pie/TDN-1400-1600/config.json
diff --git a/configs/pie/TDN-1400-1600/datasets.json b/configs/pie/TDN-1400-1600/datasets.json
@@ -0,0 +1,14 @@
+{
+    "name": "1400-1600",
+    "datasets": [
+        "dbnl-excerpts-15",
+        "dbnl-excerpts-16",
+        "dictionary-quotations-15",
+        "dictionary-quotations-16",
+        "clvn"
+    ],
+    "tagset": "TDN-Core",
+    "version": "1.0",
+    "eraFrom": "1400",
+    "eraTo": "1600"
+}
diff --git a/configs/pie/TDN-1600-1900/config.json b/configs/pie/TDN-1600-1900/config.json
@@ -0,0 +1,113 @@
+{
+    "verbose": true,
+    "report_freq": 10000,
+    "modelname": "DEFINE_IN_ENVIRONMENT",
+    "modelpath": "DEFINE_IN_ENVIRONMENT",
+    "input_path": "DEFINE_IN_ENVIRONMENT",
+    "dev_path": "DEFINE_IN_ENVIRONMENT",
+    "test_path": "",
+    "breakline_ref": "pos",
+    "breakline_data": ".$",
+    "max_sent_len": 35,
+    "max_sents": 1000000,
+    "word_max_size": 50000,
+    "word_min_freq": 1,
+    "word_lower": false,
+    "char_max_size": 500,
+    "char_min_freq": 1,
+    "char_lower": false,
+    "char_eos": true,
+    "char_bos": true,
+    "utfnorm": false,
+    "utfnorm_type": "NFKD",
+    "drop_diacritics": false,
+    "header": false,
+    "sep": "\t",
+    "tasks_order": [
+        "pos",
+        "lemma"
+    ],
+    "tasks": [
+        {
+            "name": "lemma",
+            "level": "char",
+            "decoder": "attentional",
+            "context": "sentence",
+            "layer": -1,
+            "settings": {
+                "bos": true,
+                "eos": true,
+                "lower": true,
+                "target": "lemma"
+            },
+            "target": true,
+            "default": "copy",
+            "read_only": false
+        },
+        {
+            "name": "pos",
+            "level": "token",
+            "decoder": "linear",
+            "context": "sentence",
+            "layer": -1,
+            "settings": {
+                "lower": false,
+                "target": "pos"
+            },
+            "target": false,
+            "default": "copy",
+            "read_only": false
+        }
+    ],
+    "task_defaults": {
+        "level": "token",
+        "layer": -1,
+        "decoder": "linear",
+        "context": "sentence"
+    },
+    "patience": 5,
+    "factor": 0.5,
+    "threshold": 0,
+    "min_weight": 0,
+    "include_lm": true,
+    "lm_shared_softmax": true,
+    "lm_schedule": {
+        "patience": 2,
+        "factor": 0.5,
+        "weight": 0.2,
+        "mode": "min"
+    },
+    "buffer_size": 10000,
+    "cache_dataset": false,
+    "minimize_pad": false,
+    "epochs": 100,
+    "batch_size": 25,
+    "shuffle": true,
+    "device": "cuda",
+    "run_test": false,
+    "pretrain_embeddings": false,
+    "load_pretrained_embeddings": "",
+    "load_pretrained_encoder": "",
+    "freeze_embeddings": false,
+    "dropout": 0.25,
+    "word_dropout": 0,
+    "optimizer": "Adam",
+    "clip_norm": 5.0,
+    "lr": 0.001,
+    "lr_factor": 0.75,
+    "min_lr": 0.000001,
+    "lr_patience": 2,
+    "checks_per_epoch": 1,
+    "wemb_dim": 0,
+    "cemb_dim": 300,
+    "cemb_type": "rnn",
+    "custom_cemb_cell": false,
+    "cemb_layers": 2,
+    "merge_type": "concat",
+    "scorer": "general",
+    "linear_layers": 1,
+    "hidden_size": 150,
+    "num_layers": 1,
+    "cell": "GRU",
+    "init_rnn": "default"
+}
diff --git a/configs/pie/TDN-1600-1900/datasets.json b/configs/pie/TDN-1600-1900/datasets.json
@@ -0,0 +1,17 @@
+{
+    "name": "1600-1900",
+    "datasets": [
+        "dbnl-excerpts-17",
+        "dbnl-excerpts-18",
+        "dbnl-excerpts-19",
+        "dictionary-quotations-17",
+        "dictionary-quotations-18",
+        "dictionary-quotations-19",
+        "couranten",
+        "letters-as-loot"
+    ],
+    "tagset": "TDN-Core",
+    "version": "1.0",
+    "eraFrom": "1600",
+    "eraTo": "1900"
+}
diff --git a/configs/pie/TDN-ALL/config.json b/configs/pie/TDN-ALL/config.json
@@ -0,0 +1,113 @@
+{
+    "verbose": true,
+    "report_freq": 10000,
+    "modelname": "DEFINE_IN_ENVIRONMENT",
+    "modelpath": "DEFINE_IN_ENVIRONMENT",
+    "input_path": "DEFINE_IN_ENVIRONMENT",
+    "dev_path": "DEFINE_IN_ENVIRONMENT",
+    "test_path": "",
+    "breakline_ref": "pos",
+    "breakline_data": ".$",
+    "max_sent_len": 35,
+    "max_sents": 1000000,
+    "word_max_size": 50000,
+    "word_min_freq": 1,
+    "word_lower": false,
+    "char_max_size": 500,
+    "char_min_freq": 1,
+    "char_lower": false,
+    "char_eos": true,
+    "char_bos": true,
+    "utfnorm": false,
+    "utfnorm_type": "NFKD",
+    "drop_diacritics": false,
+    "header": false,
+    "sep": "\t",
+    "tasks_order": [
+        "pos",
+        "lemma"
+    ],
+    "tasks": [
+        {
+            "name": "lemma",
+            "level": "char",
+            "decoder": "attentional",
+            "context": "sentence",
+            "layer": -1,
+            "settings": {
+                "bos": true,
+                "eos": true,
+                "lower": true,
+                "target": "lemma"
+            },
+            "target": true,
+            "default": "copy",
+            "read_only": false
+        },
+        {
+            "name": "pos",
+            "level": "token",
+            "decoder": "linear",
+            "context": "sentence",
+            "layer": -1,
+            "settings": {
+                "lower": false,
+                "target": "pos"
+            },
+            "target": false,
+            "default": "copy",
+            "read_only": false
+        }
+    ],
+    "task_defaults": {
+        "level": "token",
+        "layer": -1,
+        "decoder": "linear",
+        "context": "sentence"
+    },
+    "patience": 5,
+    "factor": 0.5,
+    "threshold": 0,
+    "min_weight": 0,
+    "include_lm": true,
+    "lm_shared_softmax": true,
+    "lm_schedule": {
+        "patience": 2,
+        "factor": 0.5,
+        "weight": 0.2,
+        "mode": "min"
+    },
+    "buffer_size": 10000,
+    "cache_dataset": false,
+    "minimize_pad": false,
+    "epochs": 100,
+    "batch_size": 25,
+    "shuffle": true,
+    "device": "cuda",
+    "run_test": false,
+    "pretrain_embeddings": false,
+    "load_pretrained_embeddings": "",
+    "load_pretrained_encoder": "",
+    "freeze_embeddings": false,
+    "dropout": 0.25,
+    "word_dropout": 0,
+    "optimizer": "Adam",
+    "clip_norm": 5.0,
+    "lr": 0.001,
+    "lr_factor": 0.75,
+    "min_lr": 0.000001,
+    "lr_patience": 2,
+    "checks_per_epoch": 1,
+    "wemb_dim": 0,
+    "cemb_dim": 300,
+    "cemb_type": "rnn",
+    "custom_cemb_cell": false,
+    "cemb_layers": 2,
+    "merge_type": "concat",
+    "scorer": "general",
+    "linear_layers": 1,
+    "hidden_size": 150,
+    "num_layers": 1,
+    "cell": "GRU",
+    "init_rnn": "default"
+}
diff --git a/configs/pie/TDN-ALL/datasets.json b/configs/pie/TDN-ALL/datasets.json
@@ -0,0 +1,22 @@
+{
+    "name": "ALL",
+    "datasets": [
+        "dbnl-excerpts-15",
+        "dbnl-excerpts-16",
+        "dbnl-excerpts-17",
+        "dbnl-excerpts-18",
+        "dbnl-excerpts-19",
+        "dictionary-quotations-15",
+        "dictionary-quotations-16",
+        "dictionary-quotations-17",
+        "dictionary-quotations-18",
+        "dictionary-quotations-19",
+        "clvn",
+        "couranten",
+        "letters-as-loot"
+    ],
+    "tagset": "TDN-Core",
+    "version": "1.0",
+    "eraFrom": "1400",
+    "eraTo": "1900"
+}