Merge pull request #449 from empiriker/master

Use linkage model in Russian Wiktionary
tatuylonen · Jan 5, 2024
2 parents: c0c0c17 + 529b763
commit 40cd4a8
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 34 deletions.
diff --git a/src/wiktextract/extractor/ru/linkage.py b/src/wiktextract/extractor/ru/linkage.py
@@ -1,6 +1,6 @@
 from wikitextprocessor import NodeKind, WikiNode
 
-from wiktextract.extractor.ru.models import WordEntry
+from wiktextract.extractor.ru.models import Linkage, WordEntry
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -20,4 +20,4 @@ def extract_linkages(
     for link_node in level_node.find_child_recursively(NodeKind.LINK):
         word = clean_node(wxr, {}, link_node).strip()
         if word:
-            getattr(word_entry, linkage_type).append(word)
+            getattr(word_entry, linkage_type).append(Linkage(word=word))
diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py
@@ -21,6 +21,10 @@ class Translation(BaseModelWrap):
     )
 
 
+class Linkage(BaseModelWrap):
+    word: str = ""
+
+
 class Sound(BaseModelWrap):
     ipa: Optional[str] = Field(
         default=None, description="International Phonetic Alphabet"
@@ -34,7 +38,7 @@ class Sound(BaseModelWrap):
     tags: Optional[list[str]] = Field(
         default=[], description="Specifying the variant of the pronunciation"
     )
-    homophones: Optional[list[str]] = Field(
+    homophones: list[Linkage] = Field(
         default=[], description="Words with same pronunciation"
     )
 
@@ -118,34 +122,19 @@ class WordEntry(BaseModelWrap):
     sounds: Optional[list[Sound]] = []
     senses: Optional[list[Sense]] = []
     translations: Optional[list[Translation]] = []
-
-    antonyms: Optional[list[str]] = Field(
-        default=[], description="List of antonyms"
-    )
-    anagrams: Optional[list[str]] = Field(
-        default=[], description="List of anagrams"
-    )
-    variants: Optional[list[str]] = Field(
-        default=[], description="List of variants"
-    )
-    hypernyms: Optional[list[str]] = Field(
+    antonyms: list[Linkage] = Field(default=[], description="List of antonyms")
+    anagrams: list[Linkage] = Field(default=[], description="List of anagrams")
+    variants: list[Linkage] = Field(default=[], description="List of variants")
+    hypernyms: list[Linkage] = Field(
         default=[], description="List of hypernyms"
     )
-    hyponyms: Optional[list[str]] = Field(
-        default=[], description="List of hyponyms"
-    )
-    derived: Optional[list[str]] = Field(
+    hyponyms: list[Linkage] = Field(default=[], description="List of hyponyms")
+    derived: list[Linkage] = Field(
         default=[], description="List of derived terms"
     )
-    meronyms: Optional[list[str]] = Field(
-        default=[], description="List of meronyms"
-    )
-    synonyms: Optional[list[str]] = Field(
-        default=[], description="List of synonyms"
-    )
-    coordinate_terms: Optional[list[str]] = Field(
+    meronyms: list[Linkage] = Field(default=[], description="List of meronyms")
+    synonyms: list[Linkage] = Field(default=[], description="List of synonyms")
+    coordinate_terms: list[Linkage] = Field(
         default=[], description="List of coordinate terms"
     )
-    holonyms: Optional[list[str]] = Field(
-        default=[], description="List of holonyms"
-    )
+    holonyms: list[Linkage] = Field(default=[], description="List of holonyms")
diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py
@@ -4,7 +4,7 @@
 from wikitextprocessor import NodeKind
 from wikitextprocessor.parser import LevelNode, WikiNode, WikiNodeChildrenList
 
-from wiktextract.extractor.ru.models import Sound, WordEntry
+from wiktextract.extractor.ru.models import Linkage, Sound, WordEntry
 from wiktextract.extractor.share import create_audio_url_dict
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
@@ -208,7 +208,9 @@ def extract_homophones(
     template_params: dict[str, WikiNode],
 ):
     homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
-    homophones = [h.strip() for h in homophones_raw.split(",") if h.strip()]
+    homophones = [
+        Linkage(word=h.strip()) for h in homophones_raw.split(",") if h.strip()
+    ]
     if homophones:
         if isinstance(sounds, list):
             for sound in sounds:

diff --git a/tests/test_ru_pronunciation.py b/tests/test_ru_pronunciation.py
@@ -69,7 +69,7 @@ def test_process_transcription_template(self):
                 "expected": {
                     "ipa": "vot",
                     "audio": "Ru-вот.ogg",
-                    "homophones": ["вод"],
+                    "homophones": [{"word": "вод"}],
                 },
             },
         ]
@@ -103,7 +103,7 @@ def test_process_transcriptions_template(self):
                 "expected": [
                     {
                         "ipa": "bɐˈlʲit",
-                        "homophones": ["болит"],
+                        "homophones": [{"word": "болит"}],
                         "tags": ["singular"],
                     },
                     {
@@ -154,7 +154,7 @@ def test_process_transcription_ru_template_2(self):
                 {
                     "ipa": "vot",
                     "audio": "Ru-вот.ogg",
-                    "homophones": ["вод"],
+                    "homophones": [{"word": "вод"}],
                 }
             ],
         )
@@ -194,7 +194,7 @@ def test_process_transcriptions_ru_template_2(self):
             [
                 {
                     "ipa": "bɐˈlʲit",
-                    "homophones": ["болит"],
+                    "homophones": [{"word": "болит"}],
                     "tags": ["singular"],
                 },
                 {