Skip to content

Commit

Permalink
fix: correctly identify demos (#271)
Browse files Browse the repository at this point in the history
Signed-off-by: Eiko Wagenknecht <git@eiko-wagenknecht.de>
  • Loading branch information
eikowagenknecht committed Oct 23, 2023
1 parent d356ad0 commit c962437
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 11 deletions.
16 changes: 14 additions & 2 deletions src/lootscraper/scraper/scraper_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ async def scrape(self) -> list[Offer]:
f"/ {self.get_duration().value}.",
)
offers = await self.read_offers()
offers = self.clean_offers(offers)
unique_offers = self.deduplicate_offers(offers)
categorized_offers = self.categorize_offers(unique_offers)
filtered_offers = self.clean_offers(categorized_offers)
filtered_offers = self.filter_for_valid_offers(categorized_offers)

titles = ", ".join([offer.title for offer in filtered_offers])
if len(filtered_offers) > 0:
Expand Down Expand Up @@ -176,6 +177,17 @@ async def read_offers(self) -> list[Offer]:

return offers

def clean_offers(self, offers: list[Offer]) -> list[Offer]:
    """
    Normalize the text fields of the given offers in place.

    Titles get every line break replaced by a single space and surrounding
    whitespace trimmed. This is the same rule ``fix_offer_titles`` in
    ``lootscraper.tools`` applies to stored offers, so both code paths
    produce the same title for the same raw text. URLs and image URLs get
    line breaks removed and surrounding whitespace trimmed; ``None`` values
    are left untouched.

    Args:
        offers: The offers to clean. The objects are modified in place.

    Returns:
        The same list that was passed in.
    """
    for offer in offers:
        # A space (not an empty string) keeps words on adjacent lines apart:
        # "Game\nDemo" becomes "Game Demo" instead of "GameDemo".
        offer.title = offer.title.replace("\n", " ").strip()
        # Whitespace is never valid inside a URL, so line breaks are dropped.
        if offer.url is not None:
            offer.url = offer.url.replace("\n", "").strip()
        if offer.img_url is not None:
            offer.img_url = offer.img_url.replace("\n", "").strip()

    return offers

def categorize_offers(self, offers: list[Offer]) -> list[Offer]:
"""Categorize offers by title (demo, etc.)."""
for offer in offers:
Expand Down Expand Up @@ -205,7 +217,7 @@ def deduplicate_offers(self, offers: list[Offer]) -> list[Offer]:

return new_offers

def clean_offers(self, offers: list[Offer]) -> list[Offer]:
def filter_for_valid_offers(self, offers: list[Offer]) -> list[Offer]:
"""Only keep valid offers."""
return list(
filter(
Expand Down
93 changes: 84 additions & 9 deletions src/lootscraper/tools.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
"""Contains tools that can be run from the command line."""

import asyncio
import logging

from playwright.async_api import BrowserContext
from sqlalchemy.orm import Session

from lootscraper.browser import get_browser_context
from lootscraper.common import Category
from lootscraper.common import Category, OfferDuration
from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo
from lootscraper.processing import add_game_info
from lootscraper.scraper.scraper_base import Scraper

logger = logging.getLogger(__name__)


def log(msg: str) -> None:
    """
    Report a message both on the console and through the module logger.

    Args:
        msg: The text to emit. It is printed first, then logged at INFO level.
    """
    # Printing is deliberate here, hence the lint suppression.
    print(msg)  # noqa
    logger.info(msg)


async def refresh_all_games(session: Session, context: BrowserContext) -> None:
"""
Drop all games from the database and re-add them, scraping all
Expand All @@ -25,10 +34,10 @@ async def refresh_all_games(session: Session, context: BrowserContext) -> None:

all_offers = session.query(Offer).all()

logger.info("Gathering new information")
log("Gathering new information")
offer: Offer
for offer in all_offers:
logger.info(f"Adding game info for offer {offer.id}.")
log(f"Adding game info for offer {offer.id}.")
await add_game_info(offer, session, context)

session.commit()
Expand All @@ -39,7 +48,7 @@ def delete_invalid_offers(session: Session) -> None:
offer: Offer
for offer in session.query(Offer):
if offer.category in [Category.DEMO, Category.PRERELEASE]:
logger.info(f"Deleting invalid offer {offer.id}.")
log(f"Deleting invalid offer {offer.id}.")
session.delete(offer)

session.commit()
Expand All @@ -50,19 +59,73 @@ def fix_image_nones(session: Session) -> None:
offer: Offer
for offer in session.query(Offer):
if offer.img_url in ("", "None"):
logger.info(f"Cleaning up empty image URL for offer {offer.id}.")
log(f"Cleaning up empty image URL for offer {offer.id}.")
offer.img_url = None

session.commit()


def fix_offer_titles(session: Session) -> None:
    """
    Normalize the titles of all stored offers.

    Every line break in a title is replaced by a space and surrounding
    whitespace is stripped. Each title that changes is reported via ``log``.
    All changes are committed together at the end.

    Args:
        session: The database session used to query and update the offers.
    """
    stored_offer: Offer
    for stored_offer in session.query(Offer):
        cleaned = stored_offer.title.replace("\n", " ").strip()
        if cleaned == stored_offer.title:
            # Already clean: nothing to report or write.
            continue

        log(
            f"Cleaning up title for offer {stored_offer.id}. "
            f"Old: {stored_offer.title}, new: {cleaned}.",
        )
        stored_offer.title = cleaned

    session.commit()


def fix_offer_categories(session: Session) -> None:
    """
    Re-derive category and duration of all stored offers.

    For each offer only the first matching rule is applied:

    1. ``Scraper.is_demo(title)`` -> category ``Category.DEMO``
    2. ``Scraper.is_prerelease(title)`` -> category ``Category.PRERELEASE``
    3. ``Scraper.is_fake_always(valid_to)`` -> duration ``OfferDuration.ALWAYS``

    A value is only written (and reported via ``log``) when it differs from
    the stored one. All changes are committed together at the end.

    Args:
        session: The database session used to query and update the offers.
    """
    stored_offer: Offer
    for stored_offer in session.query(Offer):
        if Scraper.is_demo(stored_offer.title):
            if stored_offer.category != Category.DEMO:
                log(
                    f"Cleaning up category for offer {stored_offer.id}. "
                    f"Old: {stored_offer.category}, new: {Category.DEMO}.",
                )
                stored_offer.category = Category.DEMO
        elif Scraper.is_prerelease(stored_offer.title):
            if stored_offer.category != Category.PRERELEASE:
                log(
                    f"Cleaning up category for offer {stored_offer.id}. "
                    f"Old: {stored_offer.category}, new: {Category.PRERELEASE}.",
                )
                stored_offer.category = Category.PRERELEASE
        elif Scraper.is_fake_always(stored_offer.valid_to):
            # The duration is only looked at for offers that matched neither
            # title rule above.
            if stored_offer.duration != OfferDuration.ALWAYS:
                log(
                    f"Cleaning up duration for offer {stored_offer.id}. "
                    f"Old: {stored_offer.duration}, new: {OfferDuration.ALWAYS}.",
                )
                stored_offer.duration = OfferDuration.ALWAYS

    session.commit()


async def run_cleanup() -> None:
    """
    Clean common problems in the stored offers.

    Opens the loot database and runs the maintenance steps one after
    another: delete invalid offers, reset empty image URLs, normalize
    titles and re-derive categories. Each step receives a session obtained
    from ``db.Session()``.
    """
    # ``log`` already forwards to the module logger, so one call is enough.
    log("Running cleanup")
    with LootDatabase(echo=False) as db:
        delete_invalid_offers(db.Session())
        fix_image_nones(db.Session())
        # Titles are normalized before categories are re-derived because the
        # demo / prerelease checks work on the title.
        fix_offer_titles(db.Session())
        fix_offer_categories(db.Session())
        # NOTE(review): offers that only become DEMO / PRERELEASE in the
        # step above are not removed until the next run, because
        # delete_invalid_offers has already run. Confirm this is intended.


async def run_refresh() -> None:
    """
    Refresh all game data.

    Opens the loot database and a browser context, then hands a database
    session and that context to ``refresh_all_games``.
    """
    log("Running refresh")
    with LootDatabase(echo=False) as database:
        async with get_browser_context() as browser_context:
            db_session = database.Session()
            await refresh_all_games(db_session, browser_context)

Expand All @@ -71,7 +134,19 @@ def cleanup() -> None:
"""
Wrap cleanup functions synchronously.
Run this with `python -c 'import app.tools; app.tools.cleanup()'` for now.
TODO: Add an admin command for this (telegram).
Run this with `python -c 'import lootscraper.tools; lootscraper.tools.cleanup()'`
for now.
"""
# TODO: Add an admin command for this (telegram).
asyncio.run(run_cleanup())


def refresh() -> None:
    """
    Wrap the refresh of all game data synchronously.

    Runs the ``run_refresh`` coroutine to completion via ``asyncio.run``.
    Run this with `python -c 'import lootscraper.tools; lootscraper.tools.refresh()'`
    for now.
    """
    # TODO: Add an admin command for this (telegram).
    asyncio.run(run_refresh())

0 comments on commit c962437

Please sign in to comment.