Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement validate_profanity feature via external api #38

Merged
merged 13 commits into from
Dec 21, 2023
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
lingua-language-detector==1.3.1
pysrt
requests
python-dotenv
213 changes: 197 additions & 16 deletions safetext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,39 @@
import logging
import os
import re
from typing import Dict, List
from typing import Dict, List, Optional

import requests

from safetext.utils import detect_language_from_srt, detect_language_from_text

# Package version. Only the most recent value is kept; an earlier assignment
# would be overwritten immediately and has no effect.
__version__ = "0.0.6"


class SafeText:
"""A class to provide text analysis for profanity detection using the built-in ProfanityChecker and
optionally validating the results against the ModerateContentAPI.
"""

def __init__(self, language: str = "en", validate_profanity: bool = False):
"""
Initializes the SafeText with a specified language and validation option.

def __init__(self, language="en"):
Args:
language (str): The language code for the profanity list. (ISO 639-1)
validate_profanity (bool): Flag to enable validation of profanity detection results
against ModerateContentAPI when using ProfanityChecker.
"""
self.language = language
self.checker = None
if language is not None:
self.set_language(language)
self.moderate_content_api_key = os.getenv('MODERATE_CONTENT_API_KEY')
self.validate_profanity = validate_profanity

if validate_profanity and not self.moderate_content_api_key:
raise ValueError(
"MODERATE_CONTENT_API_KEY key must set as an environment variable for validation.")

def set_language(self, language: str):
"""Sets the language of the profanity checker."""
Expand Down Expand Up @@ -56,9 +76,11 @@ def set_language_from_srt(self, srt_file, use_first_n_subs=10):
language = detect_language_from_srt(srt_file, use_first_n_subs)
self.set_language(language)

def check_profanity(self, text: str) -> List[Dict]:
    """
    Checks the given text for profanity using the configured checker. If validation is enabled,
    it logs the probable missing bad words and false positives between the results of
    ProfanityChecker and ModerateContentAPI.

    Args:
        text (str): The text to check for profanity.

    Returns:
        List[Dict]: The checker's results, one dictionary per detected profanity. Each entry
        includes at least:
            - word: The detected profanity word.
            - start: The start index of the profanity word in the text.
            - end: The end index of the profanity word in the text.
    """
    # No checker yet (e.g. constructed with language=None): detect the language from the text.
    if self.checker is None:
        self._auto_set_language(text)

    checker_results = self.checker.check(text)

    # Validation only logs discrepancies; the checker's own results are always what is returned.
    if self.validate_profanity:
        checker_bad_words = [profanity["word"] for profanity in checker_results]
        self._validate_profanity(text, checker_bad_words)

    return checker_results

def censor_profanity(self, text):
"""
Expand All @@ -88,9 +113,48 @@ def censor_profanity(self, text):
self._auto_set_language(text)
return self.checker.censor(text)

def _auto_set_language(self, text: str):
    """Detect the language of `text` and configure the profanity checker for it."""
    self.set_language(detect_language_from_text(text))
def get_bad_words(self, text: Optional[str] = None,
                  profanity_results: Optional[List[Dict]] = None) -> List[str]:
    """
    Retrieves a list of bad words found in the given text or from provided profanity results.

    Args:
        text (Optional[str]): The text to scan for profanities.
        profanity_results (Optional[List[Dict]]): Pre-calculated profanity results; each
            entry must carry a "word" key.

    Returns:
        List[str]: A list of bad words detected in the text.

    Raises:
        ValueError: If neither `text` nor `profanity_results` is provided and no checker
            has been configured yet.
    """
    if self.checker is None:
        if text is not None:
            # Same lazy language detection the other text-based entry points perform.
            self._auto_set_language(text)
        elif profanity_results is not None:
            # Pre-computed results need no checker; just extract the words.
            return [profanity["word"] for profanity in profanity_results]
        else:
            raise ValueError("Either text or profanity_results must be provided.")

    return self.checker.get_bad_words(text, profanity_results)

def _auto_set_language(self, text):
    """
    Configures the profanity checker for the language detected in the given text.

    Args:
        text (str): The text whose language should be detected.
    """
    detected = detect_language_from_text(text)
    self.set_language(detected)

def _validate_profanity(self, text: str, checker_bad_words: list):
    """
    Compares ProfanityChecker's detections with those of the ModerateContentAPI and logs
    the differences. Nothing is returned; the outcome is reported via `logging.info`.

    Args:
        text (str): The text that was checked.
        checker_bad_words (list): The list of bad words detected by ProfanityChecker.
    """
    client = ModerateContentAPI(self.moderate_content_api_key)
    api_words = set(client.get_bad_words(text))
    local_words = set(checker_bad_words)

    # Words only the API reported vs. words only the local checker reported.
    missing_words = api_words - local_words
    false_positives = local_words - api_words

    if not (missing_words or false_positives):
        logging.info("All good for validation!")
        return

    if missing_words:
        logging.info(f"Possible missing bad words: {missing_words}")
    if false_positives:
        logging.info(f"Possible false detected words: {false_positives}")


class ProfanityChecker:
Expand Down Expand Up @@ -189,17 +253,29 @@ def _find_profanity_phrase(self, profanity: str, lower_text: str, profanity_info
})
start = lower_text.find(profanity, end)

def get_bad_words(self, text: Optional[str] = None,
                  profanity_results: Optional[List[Dict]] = None) -> List[str]:
    """
    Retrieves a list of bad words found in the given text or from provided profanity results.

    Args:
        text (Optional[str]): The text to scan for profanities. Only used when
            `profanity_results` is not supplied.
        profanity_results (Optional[List[Dict]]): Pre-calculated profanity results from
            ProfanityChecker.check() method; each entry must carry a "word" key.

    Returns:
        List[str]: A list of bad words detected in the text, in result order.

    Raises:
        ValueError: If neither `text` nor `profanity_results` is provided.
    """
    if text is None and profanity_results is None:
        raise ValueError("Either text or profanity_results must be provided.")

    # Pre-computed results take precedence so the text is not scanned a second time.
    if profanity_results is None:
        profanity_results = self.check(text)

    return [profanity["word"] for profanity in profanity_results]

def censor(self, text: str) -> str:
"""
Expand All @@ -217,3 +293,108 @@ def censor(self, text: str) -> str:
end_index = profanity["end"]
text = text[:start_index] + '*' * (end_index - start_index) + text[end_index:]
return text

def check(self, text: str) -> List[Dict]:
    """
    Scans the given text for profanity.

    Thin public wrapper that delegates to `_find_profanities`.

    Args:
        text (str): The text to scan.

    Returns:
        List[Dict]: One dictionary per profanity found, as produced by `_find_profanities`.
    """
    found = self._find_profanities(text)
    return found


class ModerateContentAPI:
    """
    A class to interact with the Moderate Content API for profanity detection.

    This class facilitates the detection of bad words in text using the
    Moderate Content API. It allows for fetching a list of bad words detected
    in a given text, or a censored copy of the text.

    Attributes:
        api_key (str): The API key for accessing the Moderate Content API.
        timeout (Optional[float]): Seconds to wait for the API before giving up.
    """

    # Endpoint used for every text-moderation request.
    API_URL = "https://api.moderatecontent.com/text/"
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    DEFAULT_TIMEOUT = 10.0

    def __init__(self, api_key: Optional[str] = None, timeout: Optional[float] = DEFAULT_TIMEOUT):
        """
        Initializes the ModerateContentAPI with an optional API key.

        Args:
            api_key (str, optional): The API key for the Moderate Content API.
                                     If not provided, it will look for an API key
                                     in the MODERATE_CONTENT_API_KEY environment variable.
            timeout (float, optional): Timeout in seconds passed to each HTTP request.
                                       Defaults to DEFAULT_TIMEOUT.

        Raises:
            ValueError: If no API key is given and the environment variable is unset or empty.
        """
        self.api_key = api_key or os.getenv('MODERATE_CONTENT_API_KEY')
        if not self.api_key:
            raise ValueError("API key must be provided or set as an environment variable.")
        self.timeout = timeout

    def _request_api(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> Dict:
        """
        Makes a request to the Moderate Content API and returns the response.

        Args:
            text (str): The text to analyze for bad words.
            exclude (str, optional): A comma-delimited list of words to exclude from checking.
            replace (str, optional): A string of characters to replace bad words with.

        Returns:
            Dict: A dictionary containing the API response.

        Raises:
            ConnectionError: If the request fails, times out, returns an HTTP error status,
                or the response body is not valid JSON.
        """
        params = {'key': self.api_key, 'msg': text, 'exclude': exclude, 'replace': replace}
        try:
            response = requests.get(self.API_URL, params=params, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions where the decode
            # error is not a RequestException. Only the exception type is logged because the
            # exception text can embed the request URL, which carries the API key.
            logging.error("Moderate Content API request failed (%s).", type(e).__name__)
            raise ConnectionError("Failed to connect to the Moderate Content API.") from e

    def get_bad_words(self,
                      text: str,
                      exclude: Optional[str] = None,
                      replace: Optional[str] = None) -> List[str]:
        """
        Analyzes the given text and returns a list of bad words found.

        Args:
            text (str): The text to analyze for bad words.
            exclude (str, optional): A comma-delimited list of words to exclude from checking.
            replace (str, optional): A string of characters to replace bad words with.

        Returns:
            List[str]: The value of the response's 'bad_words' field, or an empty list
            when the field is absent.
        """
        response = self._request_api(text, exclude, replace)
        return response.get('bad_words', [])

    def censor(self, text: str, exclude: Optional[str] = None, replace: Optional[str] = None) -> str:
        """
        Analyzes the given text and returns a censored version of it.

        Args:
            text (str): The text to analyze for bad words.
            exclude (str, optional): A comma-delimited list of words to exclude from checking.
            replace (str, optional): A string of characters to replace bad words with.

        Returns:
            str: The value of the response's 'clean' field, or an empty string when the
            field is absent.
        """
        response = self._request_api(text, exclude, replace)
        return response.get('clean', '')

    def check(self, text: str) -> List[str]:
        """
        Checks the given text for profanity.

        Args:
            text (str): The text to check for profanity.

        Returns:
            List[str]: A list of bad words detected in the text.
        """
        return self.get_bad_words(text)
4 changes: 1 addition & 3 deletions safetext/languages/en/words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ ball licking
ball sack
ball sucking
ballbag
balls
ballsack
bangbros
bangbus
Expand Down Expand Up @@ -118,7 +117,6 @@ bukkake
bulldyke
bullet vibe
bullshit
bum
bung hole
bunghole
bunny fucker
Expand Down Expand Up @@ -502,6 +500,7 @@ mo-fo
mof0
mofo
mong
moron
mothafuck
mothafucka
mothafuckas
Expand Down Expand Up @@ -668,7 +667,6 @@ rimming
rosy palm
rosy palm and her 5 sisters
rusty trombone
s hit
s&m
s.o.b.
s_h_i_t
Expand Down