From 0c851bdfbc707cc4ccb13fe2390cc9b2af080205 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sat, 23 Sep 2023 15:34:26 +0800 Subject: [PATCH] Fix most flake8 warnings --- docs/conf.py | 20 +++++++++---------- laonlp/__init__.py | 36 +++++++++++++++++----------------- laonlp/corpus/__init__.py | 2 +- laonlp/corpus/core.py | 2 +- laonlp/corpus/lao_words.py | 10 +++++----- laonlp/corpus/mopt_dict.py | 13 ++++++------ laonlp/translate/__init__.py | 5 +++-- laonlp/translate/mopt_dict.py | 6 ++---- laonlp/util/__init__.py | 2 +- laonlp/util/digitconv.py | 6 ++++-- laonlp/util/lao.py | 4 ++-- laonlp/word_vector/word2vec.py | 19 +++++++++--------- tests/test_translate.py | 2 +- tests/test_util.py | 4 +++- tests/test_word_vector.py | 4 ++-- 15 files changed, 69 insertions(+), 66 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index aafa296..fa7d29f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,14 +19,14 @@ # -- Project information ----------------------------------------------------- -project = u'LaoNLP' -copyright = u'2020 - 2021, Wannaphong Phatthiyaphaibun' -author = u'Wannaphong Phatthiyaphaibun' +project = 'LaoNLP' +copyright = '2020 - 2021, Wannaphong Phatthiyaphaibun' +author = 'Wannaphong Phatthiyaphaibun' # The short X.Y version -version = u'' +version = '' # The full version, including alpha/beta/rc tags -release = u'' +release = '' # -- General configuration --------------------------------------------------- @@ -133,8 +133,8 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'SphinxGitHubActionTest.tex', u'Sphinx GitHub Action Test Documentation', - u'Sean Zheng', 'manual'), + (master_doc, 'SphinxGitHubActionTest.tex', 'Sphinx GitHub Action Test Documentation', + 'Sean Zheng', 'manual'), ] @@ -143,7 +143,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'sphinxgithubactiontest', u'Sphinx GitHub Action Test Documentation', + (master_doc, 'sphinxgithubactiontest', 'Sphinx GitHub Action Test Documentation', [author], 1) ] @@ -154,7 +154,7 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'SphinxGitHubActionTest', u'Sphinx GitHub Action Test Documentation', + (master_doc, 'SphinxGitHubActionTest', 'Sphinx GitHub Action Test Documentation', author, 'SphinxGitHubActionTest', 'One line description of project.', 'Miscellaneous'), ] @@ -185,4 +185,4 @@ 'special-members': '__init__', 'undoc-members': True, 'exclude-members': '__weakref__' -} \ No newline at end of file +} diff --git a/laonlp/__init__.py b/laonlp/__init__.py index cfb154d..b242c1e 100644 --- a/laonlp/__init__.py +++ b/laonlp/__init__.py @@ -14,33 +14,33 @@ See the License for the specific language governing permissions and limitations under the License. """ -from laonlp.tokenize import * from laonlp.corpus import * -from laonlp.transliterate import * from laonlp.tag import pos_tag +from laonlp.tokenize import * +from laonlp.transliterate import * from laonlp.util import * TONE_MARKS = "່້"+"໊໋" CONSONANTS = "ກຂຄງຈສຊຍດຕຖທນບປຜຝພຟມຢຣລວຫອຮ" VOWELS_COMBINING = "ັ"+"ິີ"+"ຶືຸ"+"ູົໍ" VOWELS = "ະັາ"+"ຳິີ"+"ຶືຸ"+"ູົຼ"+"ຽເແ"+"ໂໃໄ"+"ໍ" -NUMBERS = "໑໒໓໔໕໖໗໘໙໐" # 1234567890 +NUMBERS = "໑໒໓໔໕໖໗໘໙໐" # 1234567890 CANCELLATION_MARK = "\u0ECC" # This is Obsolete consonants. # You can read at https://en.wikipedia.org/wiki/Lao_script lao_obsolete_consonants_mapping_thai = { - "ຆ":"ฆ", # PALI GHA - "ຉ":"ฉ", # PALI CHA - "ຌ":"ฌ", # PALI JHA - "ຎ":"ญ", # PALI NYA - "ຏ":"ฏ", # PALI TTA - "ຐ":"ฐ", # PALI TTHA - "ຑ":"ฑ", # PALI DDA - "ຒ":"ฒ", # PALI DDHA - "ຓ":"ณ", # PALI NNA - "ຘ":"ธ", # PALI DHA - "ຠ":"ภ", # PALI BHA - "ຨ":"ศ", # SANSKRIT SHA - "ຩ":"ษ", # SANSKRIT SSA - "ຬ":"ฬ", # PALI LLA -} \ No newline at end of file + "ຆ": "ฆ", # PALI GHA + "ຉ": "ฉ", # PALI CHA + "ຌ": "ฌ", # PALI JHA + "ຎ": "ญ", # PALI NYA + "ຏ": "ฏ", # PALI TTA + "ຐ": "ฐ", # PALI TTHA + "ຑ": "ฑ", # PALI DDA + "ຒ": "ฒ", # PALI DDHA + "ຓ": "ณ", # PALI NNA + "ຘ": "ธ", # PALI DHA + "ຠ": "ภ", # PALI BHA + "ຨ": "ศ", # SANSKRIT SHA + "ຩ": "ษ", # SANSKRIT SSA + "ຬ": "ฬ", # PALI LLA +} diff --git a/laonlp/corpus/__init__.py b/laonlp/corpus/__init__.py index 4243707..04d6b55 100644 --- a/laonlp/corpus/__init__.py +++ b/laonlp/corpus/__init__.py @@ -30,4 +30,4 @@ "lao_wiktionarydict", "get_path_corpus", "lao_stopwords" -] \ No newline at end of file +] diff --git a/laonlp/corpus/core.py b/laonlp/corpus/core.py index 2f8198f..ab91e3a 100644 --- a/laonlp/corpus/core.py +++ b/laonlp/corpus/core.py @@ -19,4 +19,4 @@ def get_path_corpus(file): - return os.path.join(laonlp_path, "corpus", file) \ No newline at end of file + return os.path.join(laonlp_path, "corpus", file) diff --git a/laonlp/corpus/lao_words.py b/laonlp/corpus/lao_words.py index 7771977..ae9080d 100644 --- a/laonlp/corpus/lao_words.py +++ b/laonlp/corpus/lao_words.py @@ -25,7 +25,7 @@ def lao_dictionary() -> List[str]: """ path = get_path_corpus("Lao-Dictionary.txt") with open(path, "r", encoding="utf-8-sig") as f: - return [i.strip() for i in f.readlines() if i[0]!="#"] + return [i.strip() for i in f.readlines() if i[0] != "#"] def lao_spellcheckdict() -> List[str]: @@ -34,13 +34,13 @@ def lao_spellcheckdict() -> List[str]: """ path = get_path_corpus("lo_spellcheck_dict.txt") with open(path, "r", encoding="utf-8-sig") as f: - return [i.strip() for i in f.readlines() if i[0]!="#"] + return [i.strip() for i in f.readlines() if i[0] != "#"] def lao_wannaphongdict() -> List[str]: path = get_path_corpus("lao-wannaphong.txt") with open(path, "r", encoding="utf-8-sig") as f: - return [i.strip() for i in f.readlines() if i[0]!="#"] + return [i.strip() for i in f.readlines() if i[0] != "#"] def lao_wiktionarydict() -> List[str]: @@ -49,7 +49,7 @@ def lao_wiktionarydict() -> List[str]: """ path = get_path_corpus("wiktionary-20210720.txt") with open(path, "r", encoding="utf-8-sig") as f: - return [i.strip() for i in f.readlines() if i[0]!="#"] + return [i.strip() for i in f.readlines() if i[0] != "#"] def lao_words() -> List[str]: @@ -68,5 +68,5 @@ def lao_stopwords() -> FrozenSet[str]: path = get_path_corpus("stopwords_lao.txt") with open(path, "r", encoding="utf-8-sig") as fh: lines = fh.read().splitlines() - lines = [line.strip() for line in lines if line.startswith("#") == False] + lines = [line.strip() for line in lines if line.startswith("#") is False] return frozenset(filter(None, lines)) diff --git a/laonlp/corpus/mopt_dict.py b/laonlp/corpus/mopt_dict.py index 758cffd..c98e672 100644 --- a/laonlp/corpus/mopt_dict.py +++ b/laonlp/corpus/mopt_dict.py @@ -17,38 +17,37 @@ import csv from collections import defaultdict -from laonlp.corpus import laonlp_path from laonlp.corpus.core import get_path_corpus corpus_path = get_path_corpus("lao-eng-dictionary.csv") -list_data=[] -with open(corpus_path,encoding="utf-8-sig") as csvfile: +list_data = [] +with open(corpus_path, encoding="utf-8-sig") as csvfile: reader = csv.DictReader(csvfile) for row in reader: list_data.append(row) -def get_lao_eng()->dict: +def get_lao_eng() -> dict: _w = defaultdict(list) for i in list_data: _w[i['LaoWord']].append(i['English']) return _w -def get_eng_lao()->dict: +def get_eng_lao() -> dict: _w = defaultdict(list) for i in list_data: _w[i['English']].append(i['LaoWord']) return _w -def get_pronunciation()->dict: +def get_pronunciation() -> dict: _w = defaultdict(list) for i in list_data: _w[i['LaoWord']].append(i['Pronunciation']) return _w -def get_type()->dict: +def get_type() -> dict: _w = defaultdict(list) for i in list_data: _w[i['LaoWord']].append(i['Type']) diff --git a/laonlp/translate/__init__.py b/laonlp/translate/__init__.py index 15165dc..20daed8 100644 --- a/laonlp/translate/__init__.py +++ b/laonlp/translate/__init__.py @@ -19,7 +19,8 @@ ] from laonlp.translate.mopt_dict import dictionary -def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos")->list: + +def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos") -> list: """ Word dictionary @@ -29,4 +30,4 @@ def word_dictionary(word: str, src: str, target: str, name: str = "mopt_laos")-> :return: return word :rtype: str """ - return dictionary(word, src, target) \ No newline at end of file + return dictionary(word, src, target) diff --git a/laonlp/translate/mopt_dict.py b/laonlp/translate/mopt_dict.py index 693c59b..2393f72 100644 --- a/laonlp/translate/mopt_dict.py +++ b/laonlp/translate/mopt_dict.py @@ -14,12 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. """ -from typing import List - from laonlp.corpus import mopt_dict -def dictionary(word: str, src: str, target: str)->list: +def dictionary(word: str, src: str, target: str) -> list: if src == "lao" and target == "eng": _temp = mopt_dict.get_lao_eng() if word not in list(_temp.keys()): @@ -31,4 +29,4 @@ def dictionary(word: str, src: str, target: str)->list: return None return _temp[word] else: - return word \ No newline at end of file + return word diff --git a/laonlp/util/__init__.py b/laonlp/util/__init__.py index 43905b9..74abdda 100644 --- a/laonlp/util/__init__.py +++ b/laonlp/util/__init__.py @@ -25,4 +25,4 @@ ) from laonlp.util.lao import ( remove_tone_mark -) \ No newline at end of file +) diff --git a/laonlp/util/digitconv.py b/laonlp/util/digitconv.py index 87d7c55..be49262 100644 --- a/laonlp/util/digitconv.py +++ b/laonlp/util/digitconv.py @@ -29,10 +29,10 @@ "ສູນ" ] _dict_lao_arabic = { - i:j for i,j in zip(list(NUMBERS), list(_arabic_numerals)) + i: j for i, j in zip(list(NUMBERS), list(_arabic_numerals)) } _dict_arabic_lao = { - i:j for i,j in zip(list(_arabic_numerals), list(NUMBERS)) + i: j for i, j in zip(list(_arabic_numerals), list(NUMBERS)) } _lao_arabic_table = str.maketrans(_dict_lao_arabic) _arabic_lao_table = str.maketrans(_dict_arabic_lao) @@ -48,6 +48,7 @@ def lao_digit_to_arabic_digit(text: str) -> str: """ return text.translate(_lao_arabic_table) + def arabic_digit_to_lao_digit(text: str) -> str: """ Arabic digit to Lao digit @@ -58,6 +59,7 @@ def arabic_digit_to_lao_digit(text: str) -> str: """ return text.translate(_arabic_lao_table) + def number2lao(numbers: int): """ Numbers to La opronunciation diff --git a/laonlp/util/lao.py b/laonlp/util/lao.py index 651214a..7a31776 100644 --- a/laonlp/util/lao.py +++ b/laonlp/util/lao.py @@ -15,7 +15,7 @@ limitations under the License. """ TONE_MARKS = "່້"+"໊໋" -_tone_mark = str.maketrans({i:None for i in TONE_MARKS}) +_tone_mark = str.maketrans({i: None for i in TONE_MARKS}) def remove_tone_mark(text: str) -> str: @@ -26,4 +26,4 @@ def remove_tone_mark(text: str) -> str: :return: returns a lao text without tone mark. :rtype: str """ - return text.translate(_tone_mark) \ No newline at end of file + return text.translate(_tone_mark) diff --git a/laonlp/word_vector/word2vec.py b/laonlp/word_vector/word2vec.py index 4bb16da..fdb6aa5 100644 --- a/laonlp/word_vector/word2vec.py +++ b/laonlp/word_vector/word2vec.py @@ -25,11 +25,12 @@ except ModuleNotFoundError: raise ModuleNotFoundError('Word vector functionalities require huggingface_hub which is not currently installed. Please try installing the package via "pip install huggingface_hub".') + class Word2Vec: """ Word2Vec """ - def __init__(self, model: str, corpus: str="oscar"): + def __init__(self, model: str, corpus: str = "oscar"): """ :param str model: model name (cbow or skip-gram) :param str corpus: corpus name (oscar) @@ -39,27 +40,27 @@ def __init__(self, model: str, corpus: str="oscar"): if self.corpus not in ["oscar"]: raise NotImplementedError("LaoNLP doesn't support %s corpus." % self.corpus) self.load_model(self.model) - + def load_model(self, model: str): """ Load Word2Vec model :param str model: model name (cbow or skip-gram) """ - if model=="cbow": + if model == "cbow": self.model_path = hf_hub_download(repo_id="wannaphong/Lao-Word-Embedding", filename="lao_oscar_cbow_model.bin") - elif model=="skip-gram": + elif model == "skip-gram": self.model_path = hf_hub_download(repo_id="wannaphong/Lao-Word-Embedding", filename="lao_oscar_skipgram_model.bin") else: raise NotImplementedError("LaoNLP doesn't support %s model." % model) self.model_wav2vec = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(self.model_path, binary=True, encoding='utf-8-sig', unicode_errors='ignore') - + def get_model(self): """ Get gensim.models.keyedvectors.KeyedVectors class """ return self.model_wav2vec - + def doesnt_match(self, words: List[str]) -> str: """ Get donesn't match @@ -70,12 +71,12 @@ def doesnt_match(self, words: List[str]) -> str: :rtype: str """ return self.model_wav2vec.doesnt_match(words) - + def most_similar_cosmul(self, positive: List[str], negative: List[str]): return self.model_wav2vec.most_similar_cosmul( positive=positive, negative=negative ) - + def similarity(self, word1: str, word2: str) -> float: """ Find similarity between word pairs. @@ -86,4 +87,4 @@ def similarity(self, word1: str, word2: str) -> float: :return: return similarity :rtype: float """ - return self.model_wav2vec.similarity(word1, word2) \ No newline at end of file + return self.model_wav2vec.similarity(word1, word2) diff --git a/tests/test_translate.py b/tests/test_translate.py index 3ce9ad6..39d2ba5 100644 --- a/tests/test_translate.py +++ b/tests/test_translate.py @@ -6,4 +6,4 @@ class TestTagPackage(unittest.TestCase): def test_word_dictionary(self): - self.assertIsNotNone(word_dictionary("cat","en","lao")) \ No newline at end of file + self.assertIsNotNone(word_dictionary("cat", "en", "lao")) diff --git a/tests/test_util.py b/tests/test_util.py index 63618e5..e66215f 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -10,13 +10,15 @@ def test_lao_digit_to_arabic_digit(self): lao_digit_to_arabic_digit("໑໒໓໔໕໖໗໘໙໐"), '1234567890' ) + def test_arabic_digit_to_lao_digit(self): self.assertEqual( arabic_digit_to_lao_digit('1234567890'), "໑໒໓໔໕໖໗໘໙໐" ) + def test_remove_tone_mark(self): self.assertEqual( remove_tone_mark("ຜູ້"), 'ຜູ' - ) \ No newline at end of file + ) diff --git a/tests/test_word_vector.py b/tests/test_word_vector.py index 616d920..788ad45 100644 --- a/tests/test_word_vector.py +++ b/tests/test_word_vector.py @@ -7,6 +7,6 @@ class TestTagPackage(unittest.TestCase): def test_word2vec(self): _m1 = Word2Vec(model="cbow") - self.assertIsNotNone(_m1.similarity("ແປດ","ແພະ")) + self.assertIsNotNone(_m1.similarity("ແປດ", "ແພະ")) _m2 = Word2Vec(model="skip-gram") - self.assertIsNotNone(_m2.similarity("ແປດ","ແພະ")) \ No newline at end of file + self.assertIsNotNone(_m2.similarity("ແປດ", "ແພະ"))