Skip to content

Commit

Permalink
fix: clean titles (#273)
Browse files Browse the repository at this point in the history
Signed-off-by: Eiko Wagenknecht <git@eiko-wagenknecht.de>
  • Loading branch information
eikowagenknecht committed Oct 24, 2023
1 parent 049cc6e commit d25912a
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 45 deletions.
20 changes: 18 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,31 @@
"version": "0.2.0",
"configurations": [
{
"name": "Debug Lootscraper",
"name": "Lootscraper",
"type": "python",
"request": "launch",
"module": "lootscraper",
"args": [],
"console": "integratedTerminal"
},
{
"name": "Python: Debug Unit Tests",
"name": "Cleanup Tool",
"type": "python",
"request": "launch",
"code": "import lootscraper.tools; lootscraper.tools.cleanup()",
"args": [],
"console": "integratedTerminal"
},
{
"name": "Refresh Tool",
"type": "python",
"request": "launch",
"code": "import lootscraper.tools; lootscraper.tools.refresh()",
"args": [],
"console": "integratedTerminal"
},
{
"name": "Unit Tests",
"type": "python",
"request": "launch",
"purpose": [
Expand Down
32 changes: 24 additions & 8 deletions config.default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,30 @@ db_echo = false
web_timeout_seconds = 5

[scraper]
# Possible Values: ["AMAZON", "APPLE", "EPIC", "GOG", "GOOGLE", "HUMBLE", "ITCH", "STEAM", "UBISOFT"]
offer_sources = []
# Possible Values: ["GAME", "LOOT"]
offer_types = []
# Possible Values: ["ALWAYS", "TEMPORARY", "CLAIMABLE"]
offer_durations = []
# Possible Values: ["STEAM", "IGDB"]
info_sources = []
offer_sources = [
# "AMAZON",
# "APPLE",
# "EPIC",
# "GOG",
# "GOOGLE",
# "HUMBLE",
# "ITCH",
# "STEAM",
# "UBISOFT",
]
offer_types = [
# "GAME",
# "LOOT",
]
offer_durations = [
# "ALWAYS",
# "TEMPORARY",
# "CLAIMABLE",
]
info_sources = [
# "STEAM",
# "IGDB",
]

[actions]
scrape_info = true
Expand Down
5 changes: 1 addition & 4 deletions src/lootscraper/scraper/amazon_games.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from lootscraper.database import Offer
from lootscraper.scraper.amazon_base import AmazonBaseScraper, AmazonRawOffer
from lootscraper.scraper.scraper_base import OfferHandler, RawOffer, Scraper
from lootscraper.utils import clean_game_title

if TYPE_CHECKING:
from playwright.async_api import Locator, Page
Expand Down Expand Up @@ -50,8 +49,6 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
"title": raw_offer.title,
}

probable_game_name = clean_game_title(raw_offer.title)

# Date
# This is a bit more complicated as only the relative end is
# displayed ("Ends in ..."). So we have to guess the real date:
Expand Down Expand Up @@ -130,7 +127,7 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
duration=AmazonGamesScraper.get_duration(),
type=AmazonGamesScraper.get_type(),
title=raw_offer.title,
probable_game_name=probable_game_name,
probable_game_name=raw_offer.title,
seen_last=datetime.now(timezone.utc),
valid_to=end_date,
rawtext=rawtext,
Expand Down
3 changes: 1 addition & 2 deletions src/lootscraper/scraper/amazon_loot.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
if raw_offer.game_title is None:
raise ValueError("No game title found.")

probable_game_name = raw_offer.game_title
title = f"{raw_offer.game_title}: {raw_offer.title}"

# Date
Expand Down Expand Up @@ -152,7 +151,7 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
duration=AmazonLootScraper.get_duration(),
type=AmazonLootScraper.get_type(),
title=title,
probable_game_name=probable_game_name,
probable_game_name=raw_offer.game_title,
seen_last=datetime.now(timezone.utc),
valid_to=end_date,
rawtext=rawtext,
Expand Down
33 changes: 30 additions & 3 deletions src/lootscraper/scraper/scraper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from lootscraper.browser import get_new_page
from lootscraper.common import Category, OfferDuration, OfferType, Source
from lootscraper.config import Config
from lootscraper.utils import clean_game_title, clean_loot_title, clean_title

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable
Expand Down Expand Up @@ -57,7 +58,7 @@ async def scrape(self) -> list[Offer]:
if len(filtered_offers) > 0:
logger.info(f"Found {len(filtered_offers)} offers: {titles}.")
elif self.offers_expected():
logger.error("Found no offers, even though there hould be at least one.")
logger.error("Found no offers, even though there should be at least one.")
else:
logger.info("No offers found.")
return filtered_offers
Expand Down Expand Up @@ -180,7 +181,28 @@ async def read_offers(self) -> list[Offer]:
def clean_offers(self, offers: list[Offer]) -> list[Offer]:
"""Clean offer title etc."""
for offer in offers:
offer.title = offer.title.replace("\n", "").strip()
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)

if title_new != offer.title:
offer.title = title_new

if offer.probable_game_name is not None:
offer.probable_game_name = clean_game_title(
offer.probable_game_name,
)

if offer.url is not None:
offer.url = offer.url.replace("\n", "").strip()
if offer.img_url is not None:
Expand Down Expand Up @@ -255,7 +277,6 @@ def is_demo(title: str) -> bool:
@staticmethod
def is_prerelease(title: str) -> bool:
"""Check if the given title is an alpha or beta version."""
# Check for demo in title
# Catches titles like
# - "Alpha: Title"
# - "Title (Alpha)"
Expand All @@ -275,6 +296,12 @@ def is_prerelease(title: str) -> bool:
re.IGNORECASE,
):
return True
if re.search(
r"^[\W]?early access[\W]|\Wearly access\W?((.*version.*)|(\(.*\)))?$",
title,
re.IGNORECASE,
):
return True
return False

@staticmethod
Expand Down
10 changes: 7 additions & 3 deletions src/lootscraper/scraper/steam_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
from typing import TYPE_CHECKING

from lootscraper.browser import get_new_page
from lootscraper.common import OfferDuration, Source
from lootscraper.common import OfferDuration, OfferType, Source
from lootscraper.database import Offer
from lootscraper.scraper.info_steam import skip_age_verification
from lootscraper.scraper.scraper_base import OfferHandler, RawOffer, Scraper
from lootscraper.utils import clean_title
from lootscraper.utils import clean_combined_title, clean_game_title

if TYPE_CHECKING:
from playwright.async_api import Locator, Page
Expand Down Expand Up @@ -129,7 +129,11 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
except ValueError:
logger.warning(f"Couldn't parse date {maybe_date}.")

probable_game_name = clean_title(raw_offer.title, self.get_type())
probable_game_name = (
clean_game_title(raw_offer.title)
if self.get_type() == OfferType.GAME
else clean_combined_title(raw_offer.title)[0]
)

return Offer(
source=self.get_source(),
Expand Down
31 changes: 30 additions & 1 deletion src/lootscraper/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo
from lootscraper.processing import add_game_info
from lootscraper.scraper.scraper_base import Scraper
from lootscraper.utils import (
clean_game_title,
clean_loot_title,
clean_title,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -69,14 +74,38 @@ def fix_offer_titles(session: Session) -> None:
"""Trim offer titles and remove line breaks."""
offer: Offer
for offer in session.query(Offer):
title_new = offer.title.replace("\n", " ").strip()
if offer.rawtext is None:
continue

try:
raw_title = offer.rawtext["gametitle"]
title_new = (
clean_game_title(raw_title)
+ " - "
+ clean_loot_title(offer.rawtext["title"])
)
except KeyError:
raw_title = offer.rawtext["title"]
title_new = clean_title(raw_title, offer.type)

if title_new != offer.title:
log(
f"Cleaning up title for offer {offer.id}. "
f"Old: {offer.title}, new: {title_new}.",
)
offer.title = title_new

if offer.probable_game_name is not None:
new_name = clean_game_title(
offer.probable_game_name,
)
if new_name != offer.probable_game_name:
log(
f"Cleaning up probable game name for offer {offer.id}. "
f"Old: {offer.probable_game_name}, new: {new_name}.",
)
offer.probable_game_name = new_name

session.commit()


Expand Down
80 changes: 65 additions & 15 deletions src/lootscraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,33 @@ def clean_nones(value: dict[str, Any]) -> dict[str, Any]:


def clean_title(title: str, type_: OfferType) -> str:
"""Cleans the title of an offer. This is different for games and loot.
For games, we remove some common parts of the title that are not needed.
"""
if type_ == OfferType.GAME:
return clean_game_title(title)

if type_ == OfferType.LOOT:
return clean_loot_title(title)
# The second element is the full offer title
return clean_combined_title(title)[1]

raise ValueError(f"Unknown type {type_}")


def clean_game_title(title: str) -> str:
return (
title.removesuffix(" on Origin")
title.replace("\n", "")
.replace(" - ", ": ")
.replace(" : ", ": ")
.strip()
.removeprefix("[VIP]")
.removeprefix("[ VIP ]")
.removesuffix(" on Origin")
.removesuffix(" Game of the Year Edition Deluxe")
.removesuffix(" Game of the Year Edition")
.removesuffix(" Definitive Edition")
.removesuffix(" Deluxe Edition")
.removesuffix(" (Mobile)")
.strip()
.removesuffix(":")
.removesuffix("-")
Expand All @@ -90,11 +102,23 @@ def clean_game_title(title: str) -> str:


def clean_loot_title(title: str) -> str:
    """Normalize the title of a loot offer.

    Line breaks are dropped, the separators " - " and " : " are unified
    to ": ", surrounding whitespace is trimmed and a single trailing ":"
    followed by a single trailing "-" is removed (in that order).
    """
    cleaned = title.replace("\n", "")

    # Unify the separator styles; the order matters for overlapping matches.
    for separator in (" - ", " : "):
        cleaned = cleaned.replace(separator, ": ")

    cleaned = cleaned.strip()

    # Drop a dangling separator left at the end of the title.
    for dangling in (":", "-"):
        cleaned = cleaned.removesuffix(dangling)

    return cleaned.strip()


def clean_combined_title(title: str) -> tuple[str, str]:
"""
Clean the loot title.
Clean the combined title.
Unfortunately Amazon loot offers come in free text format, so we
need to do some manual matching.
Unfortunately loot offers come in free text format, so we need to do some
manual matching.
Most of the time, it is the part before the first ": ", e.g.
"Lords Mobile: Warlord Pack"
Expand Down Expand Up @@ -125,27 +149,53 @@ def clean_loot_title(title: str) -> str:
in the name)
4. By the ": " pattern (TITLE: LOOT)
"""
probable_game_name: str | None = None
probable_game_name: str = ""
probable_loot_name: str = ""

match = re.compile(r"(.*) — .*: .*").match(title)
title = title.replace("\n", " ").strip()

# Special Steam format (TITLE — LOOT: LOOTDETAIL)
match = re.compile(r"(.*) — (.*: .*)").match(title)
if match and match.group(1):
probable_game_name = match.group(1)
if probable_game_name is None:
probable_loot_name = match.group(2)
# By the second colon (TITLE: TITLEDETAIL: LOOTDETAIL)
if not probable_game_name:
# Replace some very special characters that Steam uses sometimes
title = title.replace(":", ": ").replace(" — ", ": ").replace(" - ", ": ") # noqa
title_parts: list[str] = title.split(": ")
if probable_game_name is None and len(title_parts) >= 3:
if not probable_game_name and len(title_parts) >= 3:
probable_game_name = ": ".join(title_parts[:-1])
if probable_game_name is None:
match = re.compile(r"Get .* in (.*)").match(title)
probable_loot_name = title_parts[-1]
# By the "Get ... in [Game]" pattern (to catch games with a colon in the name)
if not probable_game_name:
match = re.compile(r"Get (.*) in (.*)").match(title)
if match and match.group(1):
probable_game_name = match.group(1)
if probable_game_name is None and len(title_parts) == 2:
probable_game_name = match.group(2)
probable_loot_name = match.group(1)
# By the ": " pattern (TITLE: LOOT)
if not probable_game_name and len(title_parts) == 2:
probable_game_name = ": ".join(title_parts[:-1])
if probable_game_name is None:
probable_loot_name = title_parts[-1]
# If we still don't have a game name, we just use the whole title
if not probable_game_name:
probable_game_name = title

return clean_game_title(probable_game_name)
probable_game_name = clean_game_title(probable_game_name)

# Capitalize first letter
probable_loot_name = probable_loot_name.strip()
probable_loot_name = probable_loot_name[:1].upper() + probable_loot_name[1:]

# Return the cleaned game and loot name. For clarity, we will use
# the format "Game - Loot" for the offer title.

resulting_offer_title = probable_game_name
if probable_loot_name:
resulting_offer_title += f" - {probable_loot_name}"

# Return both the cleaned game name and the resulting offer title
return (probable_game_name, resulting_offer_title)


def calc_real_valid_to(
Expand Down
Loading

0 comments on commit d25912a

Please sign in to comment.