Skip to content

Commit

Permalink
Merge pull request #449 from empiriker/master
Browse files Browse the repository at this point in the history
Use linkage model in Russian Wiktionary
  • Loading branch information
xxyzz committed Jan 5, 2024
2 parents c0c0c17 + 529b763 commit 40cd4a8
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 34 deletions.
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/ru/linkage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from wikitextprocessor import NodeKind, WikiNode

from wiktextract.extractor.ru.models import WordEntry
from wiktextract.extractor.ru.models import Linkage, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -20,4 +20,4 @@ def extract_linkages(
for link_node in level_node.find_child_recursively(NodeKind.LINK):
word = clean_node(wxr, {}, link_node).strip()
if word:
getattr(word_entry, linkage_type).append(word)
getattr(word_entry, linkage_type).append(Linkage(word=word))
41 changes: 15 additions & 26 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ class Translation(BaseModelWrap):
)


# Model for one linked term. Instances are built as Linkage(word=...) and are
# the element type of the list[Linkage] fields (homophones, antonyms,
# synonyms, ...), replacing the bare strings used previously.
class Linkage(BaseModelWrap):
# Text of the linked word; defaults to the empty string.
word: str = ""


class Sound(BaseModelWrap):
ipa: Optional[str] = Field(
default=None, description="International Phonetic Alphabet"
Expand All @@ -34,7 +38,7 @@ class Sound(BaseModelWrap):
tags: Optional[list[str]] = Field(
default=[], description="Specifying the variant of the pronunciation"
)
homophones: Optional[list[str]] = Field(
homophones: list[Linkage] = Field(
default=[], description="Words with same pronunciation"
)

Expand Down Expand Up @@ -118,34 +122,19 @@ class WordEntry(BaseModelWrap):
sounds: Optional[list[Sound]] = []
senses: Optional[list[Sense]] = []
translations: Optional[list[Translation]] = []

antonyms: Optional[list[str]] = Field(
default=[], description="List of antonyms"
)
anagrams: Optional[list[str]] = Field(
default=[], description="List of anagrams"
)
variants: Optional[list[str]] = Field(
default=[], description="List of variants"
)
hypernyms: Optional[list[str]] = Field(
antonyms: list[Linkage] = Field(default=[], description="List of antonyms")
anagrams: list[Linkage] = Field(default=[], description="List of anagrams")
variants: list[Linkage] = Field(default=[], description="List of variants")
hypernyms: list[Linkage] = Field(
default=[], description="List of hypernyms"
)
hyponyms: Optional[list[str]] = Field(
default=[], description="List of hyponyms"
)
derived: Optional[list[str]] = Field(
hyponyms: list[Linkage] = Field(default=[], description="List of hyponyms")
derived: list[Linkage] = Field(
default=[], description="List of derived terms"
)
meronyms: Optional[list[str]] = Field(
default=[], description="List of meronyms"
)
synonyms: Optional[list[str]] = Field(
default=[], description="List of synonyms"
)
coordinate_terms: Optional[list[str]] = Field(
meronyms: list[Linkage] = Field(default=[], description="List of meronyms")
synonyms: list[Linkage] = Field(default=[], description="List of synonyms")
coordinate_terms: list[Linkage] = Field(
default=[], description="List of coordinate terms"
)
holonyms: Optional[list[str]] = Field(
default=[], description="List of holonyms"
)
holonyms: list[Linkage] = Field(default=[], description="List of holonyms")
6 changes: 4 additions & 2 deletions src/wiktextract/extractor/ru/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from wikitextprocessor import NodeKind
from wikitextprocessor.parser import LevelNode, WikiNode, WikiNodeChildrenList

from wiktextract.extractor.ru.models import Sound, WordEntry
from wiktextract.extractor.ru.models import Linkage, Sound, WordEntry
from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down Expand Up @@ -208,7 +208,9 @@ def extract_homophones(
template_params: dict[str, WikiNode],
):
homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", ""))
homophones = [h.strip() for h in homophones_raw.split(",") if h.strip()]
homophones = [
Linkage(word=h.strip()) for h in homophones_raw.split(",") if h.strip()
]
if homophones:
if isinstance(sounds, list):
for sound in sounds:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_ru_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_process_transcription_template(self):
"expected": {
"ipa": "vot",
"audio": "Ru-вот.ogg",
"homophones": ["вод"],
"homophones": [{"word": "вод"}],
},
},
]
Expand Down Expand Up @@ -103,7 +103,7 @@ def test_process_transcriptions_template(self):
"expected": [
{
"ipa": "bɐˈlʲit",
"homophones": ["болит"],
"homophones": [{"word": "болит"}],
"tags": ["singular"],
},
{
Expand Down Expand Up @@ -154,7 +154,7 @@ def test_process_transcription_ru_template_2(self):
{
"ipa": "vot",
"audio": "Ru-вот.ogg",
"homophones": ["вод"],
"homophones": [{"word": "вод"}],
}
],
)
Expand Down Expand Up @@ -194,7 +194,7 @@ def test_process_transcriptions_ru_template_2(self):
[
{
"ipa": "bɐˈlʲit",
"homophones": ["болит"],
"homophones": [{"word": "болит"}],
"tags": ["singular"],
},
{
Expand Down

0 comments on commit 40cd4a8

Please sign in to comment.