diff --git a/README.md b/README.md
index cfc81ca..42e574d 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,14 @@
+
+
# safetext
Rule-based profanity checking tool for English and Turkish.
diff --git a/safetext/__init__.py b/safetext/__init__.py
index b32f3f0..acba445 100644
--- a/safetext/__init__.py
+++ b/safetext/__init__.py
@@ -1,10 +1,6 @@
-from safetext.utils import detect_language_from_srt, detect_language_from_text
+import os
-from .languages.de import GermanProfanityChecker
-from .languages.en import EnglishProfanityChecker
-from .languages.es import SpanishProfanityChecker
-from .languages.pt import PortugueseProfanityChecker
-from .languages.tr import TurkishProfanityChecker
+from safetext.utils import detect_language_from_srt, detect_language_from_text
__version__ = "0.0.4"
@@ -17,20 +13,17 @@ def __init__(self, language="en"):
if language is not None:
self.set_language(language)
- def set_language(self, language):
+ def set_language(self, language: str):
+ """Sets the language of the profanity checker."""
+ words_file_path = self._get_words_filepath(language)
+ if not os.path.exists(words_file_path):
+ raise ValueError(f"No profanity word list found for language '{language}'.")
+
self.language = language
- if language == "en":
- self.checker = EnglishProfanityChecker()
- elif language == "tr":
- self.checker = TurkishProfanityChecker()
- elif language == "es":
- self.checker = SpanishProfanityChecker()
- elif language == "de":
- self.checker = GermanProfanityChecker()
- elif language == "pt":
- self.checker = PortugueseProfanityChecker()
- else:
- raise ValueError("Language not supported")
+ self.checker = ProfanityChecker(language)
+
+ def _get_words_filepath(self, language: str) -> str:
+ return os.path.join(os.path.dirname(__file__), f"languages/{language}/words.txt")
def set_language_from_text(self, text):
"""
@@ -76,7 +69,7 @@ def check_profanity(self, text):
- end: The end index of the profanity word in the text.
"""
if self.checker is None:
- raise ValueError("Language not set")
+ self._auto_set_language(text)
return self.checker.check(text)
def censor_profanity(self, text):
@@ -90,5 +83,90 @@ def censor_profanity(self, text):
str: The censored text. The profanity words are replaced with asterisks.
"""
if self.checker is None:
- raise ValueError("Language not set")
+ self._auto_set_language(text)
return self.checker.censor(text)
+
+ def _auto_set_language(self, text: str):
+ detected_language = detect_language_from_text(text)
+ self.set_language(detected_language)
+
+
+class ProfanityChecker:
+ """Base class for profanity checkers."""
+
+ def __init__(self, language):
+ self.language = language
+
+ @property
+ def words_filepath(self):
+ """Get the filepath for the profanity words file."""
+ import pathlib
+
+ return f"{pathlib.Path(__file__).parent.resolve()}/languages/{self.language}/words.txt"
+
+ @property
+ def profanity_words(self):
+ """Get the profanity words for the language."""
+ if not hasattr(self, "_profanity_words"):
+ self._profanity_words = self._read_words(self.words_filepath)
+
+ return self._profanity_words
+
+ def _check(self, text):
+ """Check the text for profanity."""
+ # Split the text into a list of words
+ words = text.split()
+
+ # Initialize a list to store the indices of profanity words
+ profanity_infos = []
+
+ for i, word in enumerate(words):
+ if word.lower() in self.profanity_words:
+ start_index = sum(len(w) + 1 for w in words[:i]) # +1 to account for space between words
+ end_index = start_index + len(word)
+ profanity_info = {
+ "word": word,
+ "index": i + 1,
+ "start": start_index,
+ "end": end_index,
+ }
+ profanity_infos.append(profanity_info)
+
+ return profanity_infos
+
+ def _read_words(self, filepath):
+ """Read the profanity words from the given file."""
+ with open(filepath, encoding="utf8") as f:
+ profanity_words = f.read().splitlines()
+
+ return profanity_words
+
+ def _preprocess(self, text):
+ """Preprocess the text before checking for profanity."""
+ return text
+
+ def check(self, text):
+ """
+ Check the text for profanity.
+
+ Args:
+ text (str): The text to check for profanity.
+
+ Returns:
+ list: A list of profanity infos. Each profanity info is a dict with the following keys:
+ - word: The profanity word.
+ - index: The index of the profanity word in the text.
+ - start: The start index of the profanity word in the text.
+ - end: The end index of the profanity word in the text.
+ """
+ return self._check(self._preprocess(text))
+
+    def censor(self, text):
+        """Censor the text by replacing each detected profanity span with asterisks."""
+        detected_profanities = self.check(text)
+        # Splice by index in reverse order: str.replace would hit ALL occurrences of the
+        # substring, and replacing left-to-right would invalidate the later start/end offsets.
+        for profanity in reversed(detected_profanities):
+            start_index = profanity["start"]
+            end_index = profanity["end"]
+            text = text[:start_index] + "***" + text[end_index:]
diff --git a/safetext/languages/__init__.py b/safetext/languages/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/safetext/languages/base.py b/safetext/languages/base.py
deleted file mode 100644
index 7c5b1f7..0000000
--- a/safetext/languages/base.py
+++ /dev/null
@@ -1,79 +0,0 @@
-class BaseProfanityChecker:
- """Base class for profanity checkers."""
-
- def __init__(self, language):
- self.language = language
-
- @property
- def words_filepath(self):
- """Get the filepath for the profanity words file."""
- import pathlib
-
- return f"{pathlib.Path(__file__).parent.resolve()}/{self.language}/words.txt"
-
- @property
- def profanity_words(self):
- """Get the profanity words for the language."""
- if not hasattr(self, "_profanity_words"):
- self._profanity_words = self._read_words(self.words_filepath)
-
- return self._profanity_words
-
- def _check(self, text):
- """Check the text for profanity."""
- # Split the text into a list of words
- words = text.split()
-
- # Initialize a list to store the indices of profanity words
- profanity_infos = []
-
- for i, word in enumerate(words):
- if word.lower() in self.profanity_words:
- start_index = sum(len(w) + 1 for w in words[:i]) # +1 to account for space between words
- end_index = start_index + len(word)
- profanity_info = {
- "word": word,
- "index": i + 1,
- "start": start_index,
- "end": end_index,
- }
- profanity_infos.append(profanity_info)
-
- return profanity_infos
-
- def _read_words(self, filepath):
- """Read the profanity words from the given file."""
- with open(filepath, encoding="utf8") as f:
- profanity_words = f.read().splitlines()
-
- return profanity_words
-
- def _preprocess(self, text):
- """Preprocess the text before checking for profanity."""
- return text
-
- def check(self, text):
- """
- Check the text for profanity.
-
- Args:
- text (str): The text to check for profanity.
-
- Returns:
- list: A list of profanity infos. Each profanity info is a dict with the following keys:
- - word: The profanity word.
- - index: The index of the profanity word in the text.
- - start: The start index of the profanity word in the text.
- - end: The end index of the profanity word in the text.
- """
- return self._check(self._preprocess(text))
-
- def censor(self, text):
- """Censor the text."""
- detected_profanities = self.check(text)
- for profanity in detected_profanities:
- start_index = profanity["start"]
- end_index = profanity["end"]
- text = text.replace(text[start_index:end_index], "***")
-
- return text
diff --git a/safetext/languages/de/__init__.py b/safetext/languages/de/__init__.py
deleted file mode 100644
index 293338d..0000000
--- a/safetext/languages/de/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from safetext.languages.base import BaseProfanityChecker
-
-
-class GermanProfanityChecker(BaseProfanityChecker):
- """German profanity checker."""
-
- def __init__(self):
- super().__init__(language="de")
diff --git a/safetext/languages/en/__init__.py b/safetext/languages/en/__init__.py
deleted file mode 100644
index f440887..0000000
--- a/safetext/languages/en/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from safetext.languages.base import BaseProfanityChecker
-
-
-class EnglishProfanityChecker(BaseProfanityChecker):
- """English profanity checker."""
-
- def __init__(self):
- super().__init__(language="en")
diff --git a/safetext/languages/es/__init__.py b/safetext/languages/es/__init__.py
deleted file mode 100644
index 17b28c0..0000000
--- a/safetext/languages/es/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from safetext.languages.base import BaseProfanityChecker
-
-
-class SpanishProfanityChecker(BaseProfanityChecker):
- """Spanish profanity checker."""
-
- def __init__(self):
- super().__init__(language="es")
diff --git a/safetext/languages/pt/__init__.py b/safetext/languages/pt/__init__.py
deleted file mode 100644
index 0fb5c42..0000000
--- a/safetext/languages/pt/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from safetext.languages.base import BaseProfanityChecker
-
-
-class PortugueseProfanityChecker(BaseProfanityChecker):
- """Portuguese profanity checker."""
-
- def __init__(self):
- super().__init__(language="pt")
diff --git a/safetext/languages/tr/__init__.py b/safetext/languages/tr/__init__.py
deleted file mode 100644
index 070005e..0000000
--- a/safetext/languages/tr/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from safetext.languages.base import BaseProfanityChecker
-
-
-class TurkishProfanityChecker(BaseProfanityChecker):
- """Turkish profanity checker."""
-
- def __init__(self):
- super().__init__(language="tr")
diff --git a/safetext/utils.py b/safetext/utils.py
index 2844026..0e05694 100644
--- a/safetext/utils.py
+++ b/safetext/utils.py
@@ -1,29 +1,62 @@
+import os
+from typing import List
+
+import pysrt
from lingua import Language, LanguageDetectorBuilder
-LANGUAGE_TO_CODE = {
- Language.ENGLISH: "en",
- Language.TURKISH: "tr",
- Language.GERMAN: "de",
- Language.FRENCH: "fr",
- Language.SPANISH: "es",
-}
-LANGUAGES = [Language.ENGLISH, Language.TURKISH, Language.GERMAN, Language.FRENCH, Language.SPANISH]
-DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build()
+
+def available_languages() -> List[Language]:
+ """
+ Scans the 'languages' directory to identify available languages based on directory names.
+
+ Returns:
+ List[Language]: A list of available languages as Language enum values.
+ """
+ current_file_directory = os.path.dirname(__file__)
+
+ languages_path = os.path.join(current_file_directory, "languages")
+
+ all_items_in_languages_dir = os.listdir(languages_path)
+
+ available_lang_codes = []
+
+ for item in all_items_in_languages_dir:
+ item_full_path = os.path.join(languages_path, item)
+
+ if os.path.isdir(item_full_path):
+ available_lang_codes.append(item)
+
+ available_langs = []
+ for lang in Language:
+        if lang.iso_code_639_1.name.lower() in available_lang_codes:  # match directory names against ISO 639-1 codes (e.g. "en")
+ available_langs.append(lang)
+
+ return available_langs
+
+
+def initialize_detector():
+    """
+    Dynamically initializes the language detector based on the available languages.
+
+    Returns:
+        LanguageDetector: An initialized language detector for the available languages.
+    """
+    return LanguageDetectorBuilder.from_languages(*available_languages()).build()
def detect_language_from_text(text: str) -> str:
"""
- Detects the language of the given text.
+ Detects the language of the given text using the dynamically initialized language detector.
Args:
text (str): The text to detect the language of.
Returns:
- str: The language code of the detected language.
- (e.g. "en", "tr")
+ str: The ISO 639-1 language code of the detected language.
"""
- result = DETECTOR.detect_language_of(text)
- return LANGUAGE_TO_CODE[result]
+    detector = initialize_detector()
+    detected_language = detector.detect_language_of(text)
+    return detected_language.iso_code_639_1.name.lower()  # ISO 639-1 code, e.g. "en"
def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
@@ -38,8 +71,6 @@ def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
str: The language code of the detected language.
(e.g. "en", "tr")
"""
- import pysrt
-
subs = pysrt.open(srt_file, encoding="utf-8")
text = " ".join([sub.text_without_tags.replace("\n", " ") for sub in subs[:use_first_n_subs]])