Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix most flake8 warnings #13

Merged
merged 1 commit into from
Sep 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@

# -- Project information -----------------------------------------------------

project = u'LaoNLP'
copyright = u'2020 - 2021, Wannaphong Phatthiyaphaibun'
author = u'Wannaphong Phatthiyaphaibun'
project = 'LaoNLP'
copyright = '2020 - 2021, Wannaphong Phatthiyaphaibun'
author = 'Wannaphong Phatthiyaphaibun'

# The short X.Y version
version = u''
version = ''
# The full version, including alpha/beta/rc tags
release = u''
release = ''


# -- General configuration ---------------------------------------------------
Expand Down Expand Up @@ -133,8 +133,8 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'SphinxGitHubActionTest.tex', u'Sphinx GitHub Action Test Documentation',
u'Sean Zheng', 'manual'),
(master_doc, 'SphinxGitHubActionTest.tex', 'Sphinx GitHub Action Test Documentation',
'Sean Zheng', 'manual'),
]


Expand All @@ -143,7 +143,7 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'sphinxgithubactiontest', u'Sphinx GitHub Action Test Documentation',
(master_doc, 'sphinxgithubactiontest', 'Sphinx GitHub Action Test Documentation',
[author], 1)
]

Expand All @@ -154,7 +154,7 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'SphinxGitHubActionTest', u'Sphinx GitHub Action Test Documentation',
(master_doc, 'SphinxGitHubActionTest', 'Sphinx GitHub Action Test Documentation',
author, 'SphinxGitHubActionTest', 'One line description of project.',
'Miscellaneous'),
]
Expand Down Expand Up @@ -185,4 +185,4 @@
'special-members': '__init__',
'undoc-members': True,
'exclude-members': '__weakref__'
}
}
36 changes: 18 additions & 18 deletions laonlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,33 +14,33 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from laonlp.tokenize import *
from laonlp.corpus import *
from laonlp.transliterate import *
from laonlp.tag import pos_tag
from laonlp.tokenize import *
from laonlp.transliterate import *
from laonlp.util import *

TONE_MARKS = "່້"+"໊໋"
CONSONANTS = "ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮ"
VOWELS_COMBINING = "ັ"+"ິີ"+"ຶືຸ"+"ູົໍ"
VOWELS = "ະັາ"+"ຳິີ"+"ຶືຸ"+"ູົຼ"+"ຽເແ"+"ໂໃໄ"+"ໍ"
NUMBERS = "໑໒໓໔໕໖໗໘໙໐" # 1234567890
NUMBERS = "໑໒໓໔໕໖໗໘໙໐" # 1234567890
CANCELLATION_MARK = "\u0ECC"
# This is Obsolete consonants.
# You can read at https://en.wikipedia.org/wiki/Lao_script
lao_obsolete_consonants_mapping_thai = {
"ຆ":"ฆ", # PALI GHA
"ຉ":"ฉ", # PALI CHA
"ຌ":"ฌ", # PALI JHA
"ຎ":"ญ", # PALI NYA
"ຏ":"ฏ", # PALI TTA
"ຐ":"ฐ", # PALI TTHA
"ຑ":"ฑ", # PALI DDA
"ຒ":"ฒ", # PALI DDHA
"ຓ":"ณ", # PALI NNA
"ຘ":"ธ", # PALI DHA
"ຠ":"ภ", # PALI BHA
"ຨ":"ศ", # SANSKRIT SHA
"ຩ":"ษ", # SANSKRIT SSA
"ຬ":"ฬ", # PALI LLA
}
"ຆ": "ฆ", # PALI GHA
"ຉ": "ฉ", # PALI CHA
"ຌ": "ฌ", # PALI JHA
"ຎ": "ญ", # PALI NYA
"ຏ": "ฏ", # PALI TTA
"ຐ": "ฐ", # PALI TTHA
"ຑ": "ฑ", # PALI DDA
"ຒ": "ฒ", # PALI DDHA
"ຓ": "ณ", # PALI NNA
"ຘ": "ธ", # PALI DHA
"ຠ": "ภ", # PALI BHA
"ຨ": "ศ", # SANSKRIT SHA
"ຩ": "ษ", # SANSKRIT SSA
"ຬ": "ฬ", # PALI LLA
}
2 changes: 1 addition & 1 deletion laonlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@
"lao_wiktionarydict",
"get_path_corpus",
"lao_stopwords"
]
]
2 changes: 1 addition & 1 deletion laonlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@


def get_path_corpus(file):
    """Return the absolute path of *file* inside the package's corpus directory.

    :param str file: corpus file name (e.g. ``"Lao-Dictionary.txt"``)
    :return: full path built from the laonlp package location
    """
    corpus_dir = os.path.join(laonlp_path, "corpus")
    return os.path.join(corpus_dir, file)
10 changes: 5 additions & 5 deletions laonlp/corpus/lao_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def lao_dictionary() -> List[str]:
"""
path = get_path_corpus("Lao-Dictionary.txt")
with open(path, "r", encoding="utf-8-sig") as f:
return [i.strip() for i in f.readlines() if i[0]!="#"]
return [i.strip() for i in f.readlines() if i[0] != "#"]


def lao_spellcheckdict() -> List[str]:
Expand All @@ -34,13 +34,13 @@ def lao_spellcheckdict() -> List[str]:
"""
path = get_path_corpus("lo_spellcheck_dict.txt")
with open(path, "r", encoding="utf-8-sig") as f:
return [i.strip() for i in f.readlines() if i[0]!="#"]
return [i.strip() for i in f.readlines() if i[0] != "#"]


def lao_wannaphongdict() -> List[str]:
    """Return the word list from the ``lao-wannaphong.txt`` corpus file.

    Comment lines (those whose first character is ``#``) are skipped;
    every remaining line is stripped of surrounding whitespace.
    """
    corpus_file = get_path_corpus("lao-wannaphong.txt")
    with open(corpus_file, "r", encoding="utf-8-sig") as fp:
        return [line.strip() for line in fp if line[0] != "#"]


def lao_wiktionarydict() -> List[str]:
Expand All @@ -49,7 +49,7 @@ def lao_wiktionarydict() -> List[str]:
"""
path = get_path_corpus("wiktionary-20210720.txt")
with open(path, "r", encoding="utf-8-sig") as f:
return [i.strip() for i in f.readlines() if i[0]!="#"]
return [i.strip() for i in f.readlines() if i[0] != "#"]


def lao_words() -> List[str]:
Expand All @@ -68,5 +68,5 @@ def lao_stopwords() -> FrozenSet[str]:
path = get_path_corpus("stopwords_lao.txt")
with open(path, "r", encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()
lines = [line.strip() for line in lines if line.startswith("#") == False]
lines = [line.strip() for line in lines if line.startswith("#") is False]
return frozenset(filter(None, lines))
13 changes: 6 additions & 7 deletions laonlp/corpus/mopt_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,38 +17,37 @@
import csv
from collections import defaultdict

from laonlp.corpus import laonlp_path
from laonlp.corpus.core import get_path_corpus
corpus_path = get_path_corpus("lao-eng-dictionary.csv")
list_data=[]
with open(corpus_path,encoding="utf-8-sig") as csvfile:
list_data = []
with open(corpus_path, encoding="utf-8-sig") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
list_data.append(row)


def get_lao_eng()->dict:
def get_lao_eng() -> dict:
    """Build a Lao -> English lookup from the loaded dictionary rows.

    :return: mapping of each Lao word to the list of its English entries
    """
    mapping = defaultdict(list)
    for row in list_data:
        mapping[row['LaoWord']].append(row['English'])
    return mapping


def get_eng_lao()->dict:
def get_eng_lao() -> dict:
    """Build an English -> Lao lookup from the loaded dictionary rows.

    :return: mapping of each English word to the list of its Lao entries
    """
    mapping = defaultdict(list)
    for row in list_data:
        mapping[row['English']].append(row['LaoWord'])
    return mapping


def get_pronunciation()->dict:
def get_pronunciation() -> dict:
    """Build a Lao word -> pronunciation lookup from the dictionary rows.

    :return: mapping of each Lao word to the list of its pronunciations
    """
    mapping = defaultdict(list)
    for row in list_data:
        mapping[row['LaoWord']].append(row['Pronunciation'])
    return mapping


def get_type()->dict:
def get_type() -> dict:
_w = defaultdict(list)
for i in list_data:
_w[i['LaoWord']].append(i['Type'])
Expand Down
5 changes: 3 additions & 2 deletions laonlp/translate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
]
from laonlp.translate.mopt_dict import dictionary

def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos")->list:

def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos") -> list:
"""
Word dictionary

Expand All @@ -29,4 +30,4 @@ def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos")->
:return: return word
:rtype: str
"""
return dictionary(word, src, target)
return dictionary(word, src, target)
6 changes: 2 additions & 4 deletions laonlp/translate/mopt_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import List

from laonlp.corpus import mopt_dict


def dictionary(word: str, src: str, target: str)->list:
def dictionary(word: str, src: str, target: str) -> list:
if src == "lao" and target == "eng":
_temp = mopt_dict.get_lao_eng()
if word not in list(_temp.keys()):
Expand All @@ -31,4 +29,4 @@ def dictionary(word: str, src: str, target: str)->list:
return None
return _temp[word]
else:
return word
return word
2 changes: 1 addition & 1 deletion laonlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
)
from laonlp.util.lao import (
remove_tone_mark
)
)
6 changes: 4 additions & 2 deletions laonlp/util/digitconv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
"ສູນ"
]
_dict_lao_arabic = {
i:j for i,j in zip(list(NUMBERS), list(_arabic_numerals))
i: j for i, j in zip(list(NUMBERS), list(_arabic_numerals))
}
_dict_arabic_lao = {
i:j for i,j in zip(list(_arabic_numerals), list(NUMBERS))
i: j for i, j in zip(list(_arabic_numerals), list(NUMBERS))
}
_lao_arabic_table = str.maketrans(_dict_lao_arabic)
_arabic_lao_table = str.maketrans(_dict_arabic_lao)
Expand All @@ -48,6 +48,7 @@ def lao_digit_to_arabic_digit(text: str) -> str:
"""
return text.translate(_lao_arabic_table)


def arabic_digit_to_lao_digit(text: str) -> str:
"""
Arabic digit to Lao digit
Expand All @@ -58,6 +59,7 @@ def arabic_digit_to_lao_digit(text: str) -> str:
"""
return text.translate(_arabic_lao_table)


def number2lao(numbers: int):
"""
Numbers to La opronunciation
Expand Down
4 changes: 2 additions & 2 deletions laonlp/util/lao.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
limitations under the License.
"""
TONE_MARKS = "່້"+"໊໋"
_tone_mark = str.maketrans({i:None for i in TONE_MARKS})
_tone_mark = str.maketrans({i: None for i in TONE_MARKS})


def remove_tone_mark(text: str) -> str:
Expand All @@ -26,4 +26,4 @@ def remove_tone_mark(text: str) -> str:
:return: returns a lao text without tone mark.
:rtype: str
"""
return text.translate(_tone_mark)
return text.translate(_tone_mark)
19 changes: 10 additions & 9 deletions laonlp/word_vector/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
except ModuleNotFoundError:
raise ModuleNotFoundError('Word vector functionalities require huggingface_hub which is not currently installed. Please try installing the package via "pip install huggingface_hub".')


class Word2Vec:
"""
Word2Vec
"""
def __init__(self, model: str, corpus: str="oscar"):
def __init__(self, model: str, corpus: str = "oscar"):
"""
:param str model: model name (cbow or skip-gram)
:param str corpus: corpus name (oscar)
Expand All @@ -39,27 +40,27 @@ def __init__(self, model: str, corpus: str="oscar"):
if self.corpus not in ["oscar"]:
raise NotImplementedError("LaoNLP doesn't support %s corpus." % self.corpus)
self.load_model(self.model)

def load_model(self, model: str):
"""
Load Word2Vec model

:param str model: model name (cbow or skip-gram)
"""
if model=="cbow":
if model == "cbow":
self.model_path = hf_hub_download(repo_id="wannaphong/Lao-Word-Embedding", filename="lao_oscar_cbow_model.bin")
elif model=="skip-gram":
elif model == "skip-gram":
self.model_path = hf_hub_download(repo_id="wannaphong/Lao-Word-Embedding", filename="lao_oscar_skipgram_model.bin")
else:
raise NotImplementedError("LaoNLP doesn't support %s model." % model)
self.model_wav2vec = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(self.model_path, binary=True, encoding='utf-8-sig', unicode_errors='ignore')

def get_model(self):
"""
Get gensim.models.keyedvectors.KeyedVectors class
"""
return self.model_wav2vec

def doesnt_match(self, words: List[str]) -> str:
"""
Get donesn't match
Expand All @@ -70,12 +71,12 @@ def doesnt_match(self, words: List[str]) -> str:
:rtype: str
"""
return self.model_wav2vec.doesnt_match(words)

def most_similar_cosmul(self, positive: List[str], negative: List[str]):
return self.model_wav2vec.most_similar_cosmul(
positive=positive, negative=negative
)

def similarity(self, word1: str, word2: str) -> float:
"""
Find similarity between word pairs.
Expand All @@ -86,4 +87,4 @@ def similarity(self, word1: str, word2: str) -> float:
:return: return similarity
:rtype: float
"""
return self.model_wav2vec.similarity(word1, word2)
return self.model_wav2vec.similarity(word1, word2)
2 changes: 1 addition & 1 deletion tests/test_translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@

class TestTagPackage(unittest.TestCase):
    """Smoke test for the translate package's dictionary lookup."""

    def test_word_dictionary(self):
        # A known English word should produce a non-None lookup result.
        self.assertIsNotNone(word_dictionary("cat", "en", "lao"))
4 changes: 3 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ def test_lao_digit_to_arabic_digit(self):
lao_digit_to_arabic_digit("໑໒໓໔໕໖໗໘໙໐"),
'1234567890'
)

def test_arabic_digit_to_lao_digit(self):
self.assertEqual(
arabic_digit_to_lao_digit('1234567890'),
"໑໒໓໔໕໖໗໘໙໐"
)

def test_remove_tone_mark(self):
self.assertEqual(
remove_tone_mark("ຜູ້"),
'ຜູ'
)
)
Loading
Loading