diff --git a/README.md b/README.md index cfc81ca..42e574d 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +
+

+ + + +

+
+ # safetext Rule-based profanity checking tool for English and Turkish. diff --git a/safetext/__init__.py b/safetext/__init__.py index b32f3f0..acba445 100644 --- a/safetext/__init__.py +++ b/safetext/__init__.py @@ -1,10 +1,6 @@ -from safetext.utils import detect_language_from_srt, detect_language_from_text +import os -from .languages.de import GermanProfanityChecker -from .languages.en import EnglishProfanityChecker -from .languages.es import SpanishProfanityChecker -from .languages.pt import PortugueseProfanityChecker -from .languages.tr import TurkishProfanityChecker +from safetext.utils import detect_language_from_srt, detect_language_from_text __version__ = "0.0.4" @@ -17,20 +13,17 @@ def __init__(self, language="en"): if language is not None: self.set_language(language) - def set_language(self, language): + def set_language(self, language: str): + """Sets the language of the profanity checker.""" + words_file_path = self._get_words_filepath(language) + if not os.path.exists(words_file_path): + raise ValueError(f"No profanity word list found for language '{language}'.") + self.language = language - if language == "en": - self.checker = EnglishProfanityChecker() - elif language == "tr": - self.checker = TurkishProfanityChecker() - elif language == "es": - self.checker = SpanishProfanityChecker() - elif language == "de": - self.checker = GermanProfanityChecker() - elif language == "pt": - self.checker = PortugueseProfanityChecker() - else: - raise ValueError("Language not supported") + self.checker = ProfanityChecker(language) + + def _get_words_filepath(self, language: str) -> str: + return os.path.join(os.path.dirname(__file__), f"languages/{language}/words.txt") def set_language_from_text(self, text): """ @@ -76,7 +69,7 @@ def check_profanity(self, text): - end: The end index of the profanity word in the text. 
""" if self.checker is None: - raise ValueError("Language not set") + self._auto_set_language(text) return self.checker.check(text) def censor_profanity(self, text): @@ -90,5 +83,90 @@ def censor_profanity(self, text): str: The censored text. The profanity words are replaced with asterisks. """ if self.checker is None: - raise ValueError("Language not set") + self._auto_set_language(text) return self.checker.censor(text) + + def _auto_set_language(self, text: str): + detected_language = detect_language_from_text(text) + self.set_language(detected_language) + + +class ProfanityChecker: + """Base class for profanity checkers.""" + + def __init__(self, language): + self.language = language + + @property + def words_filepath(self): + """Get the filepath for the profanity words file.""" + import pathlib + + return f"{pathlib.Path(__file__).parent.resolve()}/languages/{self.language}/words.txt" + + @property + def profanity_words(self): + """Get the profanity words for the language.""" + if not hasattr(self, "_profanity_words"): + self._profanity_words = self._read_words(self.words_filepath) + + return self._profanity_words + + def _check(self, text): + """Check the text for profanity.""" + # Split the text into a list of words + words = text.split() + + # Initialize a list to store the indices of profanity words + profanity_infos = [] + + for i, word in enumerate(words): + if word.lower() in self.profanity_words: + start_index = sum(len(w) + 1 for w in words[:i]) # +1 to account for space between words + end_index = start_index + len(word) + profanity_info = { + "word": word, + "index": i + 1, + "start": start_index, + "end": end_index, + } + profanity_infos.append(profanity_info) + + return profanity_infos + + def _read_words(self, filepath): + """Read the profanity words from the given file.""" + with open(filepath, encoding="utf8") as f: + profanity_words = f.read().splitlines() + + return profanity_words + + def _preprocess(self, text): + """Preprocess the text 
before checking for profanity.""" + return text + + def check(self, text): + """ + Check the text for profanity. + + Args: + text (str): The text to check for profanity. + + Returns: + list: A list of profanity infos. Each profanity info is a dict with the following keys: + - word: The profanity word. + - index: The index of the profanity word in the text. + - start: The start index of the profanity word in the text. + - end: The end index of the profanity word in the text. + """ + return self._check(self._preprocess(text)) + + def censor(self, text): + """Censor the text.""" + detected_profanities = self.check(text) + for profanity in reversed(detected_profanities): + start_index = profanity["start"] + end_index = profanity["end"] + text = text[:start_index] + "***" + text[end_index:] + + return text diff --git a/safetext/languages/__init__.py b/safetext/languages/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/safetext/languages/base.py b/safetext/languages/base.py deleted file mode 100644 index 7c5b1f7..0000000 --- a/safetext/languages/base.py +++ /dev/null @@ -1,79 +0,0 @@ -class BaseProfanityChecker: - """Base class for profanity checkers.""" - - def __init__(self, language): - self.language = language - - @property - def words_filepath(self): - """Get the filepath for the profanity words file.""" - import pathlib - - return f"{pathlib.Path(__file__).parent.resolve()}/{self.language}/words.txt" - - @property - def profanity_words(self): - """Get the profanity words for the language.""" - if not hasattr(self, "_profanity_words"): - self._profanity_words = self._read_words(self.words_filepath) - - return self._profanity_words - - def _check(self, text): - """Check the text for profanity.""" - # Split the text into a list of words - words = text.split() - - # Initialize a list to store the indices of profanity words - profanity_infos = [] - - for i, word in enumerate(words): - if word.lower() in self.profanity_words: - start_index = sum(len(w) +
1 for w in words[:i]) # +1 to account for space between words - end_index = start_index + len(word) - profanity_info = { - "word": word, - "index": i + 1, - "start": start_index, - "end": end_index, - } - profanity_infos.append(profanity_info) - - return profanity_infos - - def _read_words(self, filepath): - """Read the profanity words from the given file.""" - with open(filepath, encoding="utf8") as f: - profanity_words = f.read().splitlines() - - return profanity_words - - def _preprocess(self, text): - """Preprocess the text before checking for profanity.""" - return text - - def check(self, text): - """ - Check the text for profanity. - - Args: - text (str): The text to check for profanity. - - Returns: - list: A list of profanity infos. Each profanity info is a dict with the following keys: - - word: The profanity word. - - index: The index of the profanity word in the text. - - start: The start index of the profanity word in the text. - - end: The end index of the profanity word in the text. 
- """ - return self._check(self._preprocess(text)) - - def censor(self, text): - """Censor the text.""" - detected_profanities = self.check(text) - for profanity in detected_profanities: - start_index = profanity["start"] - end_index = profanity["end"] - text = text.replace(text[start_index:end_index], "***") - - return text diff --git a/safetext/languages/de/__init__.py b/safetext/languages/de/__init__.py deleted file mode 100644 index 293338d..0000000 --- a/safetext/languages/de/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from safetext.languages.base import BaseProfanityChecker - - -class GermanProfanityChecker(BaseProfanityChecker): - """German profanity checker.""" - - def __init__(self): - super().__init__(language="de") diff --git a/safetext/languages/en/__init__.py b/safetext/languages/en/__init__.py deleted file mode 100644 index f440887..0000000 --- a/safetext/languages/en/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from safetext.languages.base import BaseProfanityChecker - - -class EnglishProfanityChecker(BaseProfanityChecker): - """English profanity checker.""" - - def __init__(self): - super().__init__(language="en") diff --git a/safetext/languages/es/__init__.py b/safetext/languages/es/__init__.py deleted file mode 100644 index 17b28c0..0000000 --- a/safetext/languages/es/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from safetext.languages.base import BaseProfanityChecker - - -class SpanishProfanityChecker(BaseProfanityChecker): - """Spanish profanity checker.""" - - def __init__(self): - super().__init__(language="es") diff --git a/safetext/languages/pt/__init__.py b/safetext/languages/pt/__init__.py deleted file mode 100644 index 0fb5c42..0000000 --- a/safetext/languages/pt/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from safetext.languages.base import BaseProfanityChecker - - -class PortugueseProfanityChecker(BaseProfanityChecker): - """Portuguese profanity checker.""" - - def __init__(self): - super().__init__(language="pt") diff --git 
a/safetext/languages/tr/__init__.py b/safetext/languages/tr/__init__.py deleted file mode 100644 index 070005e..0000000 --- a/safetext/languages/tr/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from safetext.languages.base import BaseProfanityChecker - - -class TurkishProfanityChecker(BaseProfanityChecker): - """Turkish profanity checker.""" - - def __init__(self): - super().__init__(language="tr") diff --git a/safetext/utils.py b/safetext/utils.py index 2844026..0e05694 100644 --- a/safetext/utils.py +++ b/safetext/utils.py @@ -1,29 +1,62 @@ +import os +from typing import List + +import pysrt from lingua import Language, LanguageDetectorBuilder -LANGUAGE_TO_CODE = { - Language.ENGLISH: "en", - Language.TURKISH: "tr", - Language.GERMAN: "de", - Language.FRENCH: "fr", - Language.SPANISH: "es", -} -LANGUAGES = [Language.ENGLISH, Language.TURKISH, Language.GERMAN, Language.FRENCH, Language.SPANISH] -DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build() + +def available_languages() -> List[Language]: + """ + Scans the 'languages' directory to identify available languages based on directory names. + + Returns: + List[Language]: A list of available languages as Language enum values. + """ + current_file_directory = os.path.dirname(__file__) + + languages_path = os.path.join(current_file_directory, "languages") + + all_items_in_languages_dir = os.listdir(languages_path) + + available_lang_codes = [] + + for item in all_items_in_languages_dir: + item_full_path = os.path.join(languages_path, item) + + if os.path.isdir(item_full_path): + available_lang_codes.append(item) + + available_langs = [] + for lang in Language: + if lang.iso_code_639_1.name.lower() in available_lang_codes: # Correctly access the ISO 639-1 code + available_langs.append(lang) + + return available_langs + + +def initialize_detector() -> LanguageDetectorBuilder: + """ + Dynamically initializes the language detector based on the available languages. 
+ + Returns: + LanguageDetectorBuilder: An initialized language detector. + """ + return LanguageDetectorBuilder.from_languages(*available_languages()).build() def detect_language_from_text(text: str) -> str: """ - Detects the language of the given text. + Detects the language of the given text using the dynamically initialized language detector. Args: text (str): The text to detect the language of. Returns: - str: The language code of the detected language. - (e.g. "en", "tr") + str: The ISO 639-1 language code of the detected language. """ - result = DETECTOR.detect_language_of(text) - return LANGUAGE_TO_CODE[result] + DETECTOR = initialize_detector() + detected_language = DETECTOR.detect_language_of(text) + return detected_language.iso_code_639_1.name.lower() # IsoCode639_1 def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str: @@ -38,8 +71,6 @@ def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str: str: The language code of the detected language. (e.g. "en", "tr") """ - import pysrt - subs = pysrt.open(srt_file, encoding="utf-8") text = " ".join([sub.text_without_tags.replace("\n", " ") for sub in subs[:use_first_n_subs]])