safevideo · SeeknnDestroy · Dec 21, 2023 · Dec 20, 2023 · Dec 20, 2023 · Dec 20, 2023
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,4 @@
 lingua-language-detector==1.3.1
 pysrt
+requests
+python-dotenv
diff --git a/safetext/__init__.py b/safetext/__init__.py
@@ -1,19 +1,39 @@
+import logging
 import os
 import re
-from typing import Dict, List
+from typing import Dict, List, Optional
+
+import requests
 
 from safetext.utils import detect_language_from_srt, detect_language_from_text
 
-__version__ = "0.0.5"
+__version__ = "0.0.6"
 
 
 class SafeText:
+    """A class to provide text analysis for profanity detection using the built-in ProfanityChecker and
+    optionally validating the results against the ModerateContentAPI.
+    """
+
+    def __init__(self, language: str = "en", validate_profanity: bool = False):
+        """
+        Initializes the SafeText with a specified language and validation option.
 
-    def __init__(self, language="en"):
+        Args:
+            language (str): The language code for the profanity list. (ISO 639-1)
+            validate_profanity (bool): Flag to enable validation of profanity detection results
+                                       against ModerateContentAPI when using ProfanityChecker.
+        """
         self.language = language
         self.checker = None
         if language is not None:
             self.set_language(language)
+        self.moderate_content_api_key = os.getenv('MODERATE_CONTENT_API_KEY')
+        self.validate_profanity = validate_profanity
+
+        if validate_profanity and not self.moderate_content_api_key:
+            raise ValueError(
+                "MODERATE_CONTENT_API_KEY key must set as an environment variable for validation.")
 
     def set_language(self, language: str):
         """Sets the language of the profanity checker."""
@@ -56,9 +76,11 @@ def set_language_from_srt(self, srt_file, use_first_n_subs=10):
         language = detect_language_from_srt(srt_file, use_first_n_subs)
         self.set_language(language)
 
-    def check_profanity(self, text):
+    def check_profanity(self, text: str):
         """
-        Checks the given text for profanity.
+        Checks the given text for profanity using the selected method. If validation is enabled, it logs the
+        probable missing bad words and false positives between the results of ProfanityChecker and
+        ModerateContentAPI.
 
         Args:
             text (str): The text to check for profanity.
@@ -70,9 +92,12 @@ def check_profanity(self, text):
                 - start: The start index of the profanity word in the text.
                 - end: The end index of the profanity word in the text.
         """
-        if self.checker is None:
-            self._auto_set_language(text)
-        return self.checker.check(text)
+        checker_results = self.checker.check(text)
+        if self.validate_profanity:
+            checker_bad_words = [profanity["word"] for profanity in checker_results]
+            self._validate_profanity(text, checker_bad_words)
+
+        return checker_results
 
     def censor_profanity(self, text):
         """
@@ -88,9 +113,48 @@ def censor_profanity(self, text):
             self._auto_set_language(text)
         return self.checker.censor(text)
 
-    def _auto_set_language(self, text: str):
-        detected_language = detect_language_from_text(text)
-        self.set_language(detected_language)
+    def get_bad_words(self, text: str = None, profanity_results: Optional[List[Dict]] = None) -> List[str]:
+        """
+        Retrieves a list of bad words found in the given text or from provided profanity results.
+
+        Args:
+            text (str, optional): The text to scan for profanities.
+            profanity_results (Optional[List[Dict]], optional): Pre-calculated profanity results.
+
+        Returns:
+            List[str]: A list of bad words detected in the text.
+        """
+        return self.checker.get_bad_words(text, profanity_results)
+
+    def _auto_set_language(self, text):
+        """
+        Detects the language of the given text and sets the language of the profanity checker.
+
+        Args:
+            text (str): The text to detect the language of.
+        """
+        language = detect_language_from_text(text)
+        self.set_language(language)
+
+    def _validate_profanity(self, text: str, checker_bad_words: list):
+        """
+        Validates the profanity detection results of ProfanityChecker against the ModerateContentAPI.
+
+        Args:
+            text (str): The text that was checked.
+            checker_bad_words (list): The list of bad words detected by ProfanityChecker.
+        """
+        api_bad_words = ModerateContentAPI(self.moderate_content_api_key).get_bad_words(text)
+
+        missing_words = set(api_bad_words) - set(checker_bad_words)
+        false_positives = set(checker_bad_words) - set(api_bad_words)
+
+        if missing_words:
+            logging.info(f"Possible missing bad words: {missing_words}")
+        if false_positives:
+            logging.info(f"Possible false detected words: {false_positives}")
+        if not missing_words and not false_positives:
+            logging.info("All good for validation!")
 
 
 class ProfanityChecker:
@@ -189,17 +253,29 @@ def _find_profanity_phrase(self, profanity: str, lower_text: str, profanity_info
             })
             start = lower_text.find(profanity, end)
 
-    def check(self, text: str) -> List[Dict]:
+    def get_bad_words(self, text: str = None, profanity_results: Optional[List[Dict]] = None) -> List[str]:
         """
-        Checks the given text for profanity.
+        Retrieves a list of bad words found in the given text or from provided profanity results.
 
         Args:
-            text (str): The text to check for profanity.
+            text (str, optional): The text to scan for profanities.
+            profanity_results (Optional[List[Dict]], optional): Pre-calculated profanity results from
+            ProfanityChecker.check() method.
 
         Returns:
-            List[Dict]: A list of dictionaries, each containing information about a found profanity.
+            List[str]: A list of bad words detected in the text.
         """
-        return self._find_profanities(text)
+        if text is None and profanity_results is None:
+            raise ValueError("Either text or profanity_results must be provided.")
+
+        if profanity_results is None:
+            profanity_results = self.check(text)
+
+        bad_words = []
+        for profanity in profanity_results:
+            bad_words.append(profanity["word"])
+
+        return bad_words
 
     def censor(self, text: str) -> str:
         """
@@ -217,3 +293,108 @@ def censor(self, text: str) -> str:
             end_index = profanity["end"]
             text = text[:start_index] + '*' * (end_index - start_index) + text[end_index:]
         return text
+
+    def check(self, text: str) -> List[Dict]:
+        """
+        Checks the given text for profanity.
+
+        Args:
+            text (str): The text to check for profanity.
+
+        Returns:
+            List[Dict]: A list of dictionaries, each containing information about a found profanity.
+        """
+        return self._find_profanities(text)
+
+
+class ModerateContentAPI:
+    """
+    A class to interact with the Moderate Content API for profanity detection.
+
+    This class facilitates the detection of bad words in text using the
+    Moderate Content API. It allows for fetching a list of bad words detected
+    in a given text.
+
+    Attributes:
+        api_key (str): The API key for accessing the Moderate Content API.
+    """
+
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initializes the ModerateContentAPI with an optional API key.
+
+        Args:
+            api_key (str, optional): The API key for the Moderate Content API.
+                                     If not provided, it will look for an API key
+                                     in the MODERATE_CONTENT_API_KEY environment variable.
+        """
+        self.api_key = api_key or os.getenv('MODERATE_CONTENT_API_KEY')
+        if not self.api_key:
+            raise ValueError("API key must be provided or set as an environment variable.")
+
+    def _request_api(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> Dict:
+        """
+        Makes a request to the Moderate Content API and returns the response.
+
+        Args:
+            text (str): The text to analyze for bad words.
+            exclude (str, optional): A comma-delimited list of words to exclude from checking.
+            replace (str, optional): A string of characters to replace bad words with.
+
+        Returns:
+            Dict: A dictionary containing the API response.
+        """
+        api_url = "https://api.moderatecontent.com/text/"
+        params = {'key': self.api_key, 'msg': text, 'exclude': exclude, 'replace': replace}
+        try:
+            response = requests.get(api_url, params=params)
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            # Log the exception details here
+            raise ConnectionError("Failed to connect to the Moderate Content API.") from e
+
+    def get_bad_words(self,
+                      text: str,
+                      exclude: Optional[str] = None,
+                      replace: Optional[str] = None) -> List[str]:
+        """
+        Analyzes the given text and returns a list of bad words found.
+
+        Args:
+            text (str): The text to analyze for bad words.
+            exclude (str, optional): A comma-delimited list of words to exclude from checking.
+            replace (str, optional): A string of characters to replace bad words with.
+
+        Returns:
+            List[str]: A list of bad words detected in the text.
+        """
+        response = self._request_api(text, exclude, replace)
+        return response.get('bad_words', [])
+
+    def censor(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> str:
+        """
+        Analyzes the given text and returns a censored version of it.
+
+        Args:
+            text (str): The text to analyze for bad words.
+            exclude (str, optional): A comma-delimited list of words to exclude from checking.
+            replace (str, optional): A string of characters to replace bad words with.
+
+        Returns:
+            str: The censored text with bad words replaced by asterisks.
+        """
+        response = self._request_api(text, exclude, replace)
+        return response.get('clean', '')
+
+    def check(self, text: str):
+        """
+        Checks the given text for profanity.
+
+        Args:
+            text (str): The text to check for profanity.
+
+        Returns:
+            List[str]: A list of bad words detected in the text.
+        """
+        return self.get_bad_words(text)
diff --git a/safetext/languages/en/words.txt b/safetext/languages/en/words.txt
@@ -46,7 +46,6 @@ ball licking
 ball sack
 ball sucking
 ballbag
-balls
 ballsack
 bangbros
 bangbus
@@ -118,7 +117,6 @@ bukkake
 bulldyke
 bullet vibe
 bullshit
-bum
 bung hole
 bunghole
 bunny fucker
@@ -502,6 +500,7 @@ mo-fo
 mof0
 mofo
 mong
+moron
 mothafuck
 mothafucka
 mothafuckas
@@ -668,7 +667,6 @@ rimming
 rosy palm
 rosy palm and her 5 sisters
 rusty trombone
-s hit
 s&m
 s.o.b.
 s_h_i_t