diff --git a/src/lootscraper/scraper/scraper_base.py b/src/lootscraper/scraper/scraper_base.py index c49a5d1..1b72790 100644 --- a/src/lootscraper/scraper/scraper_base.py +++ b/src/lootscraper/scraper/scraper_base.py @@ -48,9 +48,10 @@ async def scrape(self) -> list[Offer]: f"/ {self.get_duration().value}.", ) offers = await self.read_offers() + offers = self.clean_offers(offers) unique_offers = self.deduplicate_offers(offers) categorized_offers = self.categorize_offers(unique_offers) - filtered_offers = self.clean_offers(categorized_offers) + filtered_offers = self.filter_for_valid_offers(categorized_offers) titles = ", ".join([offer.title for offer in filtered_offers]) if len(filtered_offers) > 0: @@ -176,6 +177,17 @@ async def read_offers(self) -> list[Offer]: return offers + def clean_offers(self, offers: list[Offer]) -> list[Offer]: + """Clean offer title etc.""" + for offer in offers: + offer.title = offer.title.replace("\n", "").strip() + if offer.url is not None: + offer.url = offer.url.replace("\n", "").strip() + if offer.img_url is not None: + offer.img_url = offer.img_url.replace("\n", "").strip() + + return offers + def categorize_offers(self, offers: list[Offer]) -> list[Offer]: """Categorize offers by title (demo, etc.).""" for offer in offers: @@ -205,7 +217,7 @@ def deduplicate_offers(self, offers: list[Offer]) -> list[Offer]: return new_offers - def clean_offers(self, offers: list[Offer]) -> list[Offer]: + def filter_for_valid_offers(self, offers: list[Offer]) -> list[Offer]: """Only keep valid offers.""" return list( filter( diff --git a/src/lootscraper/tools.py b/src/lootscraper/tools.py index c4fb4d5..4ee8fe4 100644 --- a/src/lootscraper/tools.py +++ b/src/lootscraper/tools.py @@ -1,3 +1,5 @@ +"""Contains tools that can be run from the command line.""" + import asyncio import logging @@ -5,13 +7,20 @@ from sqlalchemy.orm import Session from lootscraper.browser import get_browser_context -from lootscraper.common import Category +from 
lootscraper.common import Category, OfferDuration from lootscraper.database import Game, IgdbInfo, LootDatabase, Offer, SteamInfo from lootscraper.processing import add_game_info +from lootscraper.scraper.scraper_base import Scraper logger = logging.getLogger(__name__) +def log(msg: str) -> None: + """Log a message to the console and the logger.""" + print(msg) # noqa + logger.info(msg) + + async def refresh_all_games(session: Session, context: BrowserContext) -> None: """ Drop all games from the database and re-add them, scraping all @@ -25,10 +34,10 @@ async def refresh_all_games(session: Session, context: BrowserContext) -> None: all_offers = session.query(Offer).all() - logger.info("Gathering new information") + log("Gathering new information") offer: Offer for offer in all_offers: - logger.info(f"Adding game info for offer {offer.id}.") + log(f"Adding game info for offer {offer.id}.") await add_game_info(offer, session, context) session.commit() @@ -39,7 +48,7 @@ def delete_invalid_offers(session: Session) -> None: offer: Offer for offer in session.query(Offer): if offer.category in [Category.DEMO, Category.PRERELEASE]: - logger.info(f"Deleting invalid offer {offer.id}.") + log(f"Deleting invalid offer {offer.id}.") session.delete(offer) session.commit() @@ -50,19 +59,73 @@ def fix_image_nones(session: Session) -> None: offer: Offer for offer in session.query(Offer): if offer.img_url in ("", "None"): - logger.info(f"Cleaning up empty image URL for offer {offer.id}.") + log(f"Cleaning up empty image URL for offer {offer.id}.") offer.img_url = None session.commit() +def fix_offer_titles(session: Session) -> None: + """Trim offer titles and remove line breaks.""" + offer: Offer + for offer in session.query(Offer): + title_new = offer.title.replace("\n", " ").strip() + if title_new != offer.title: + log( + f"Cleaning up title for offer {offer.id}. 
" + f"Old: {offer.title}, new: {title_new}.", + ) + offer.title = title_new + + session.commit() + + +def fix_offer_categories(session: Session) -> None: + """Fix offer categories (demo etc.).""" + offer: Offer + for offer in session.query(Offer): + if Scraper.is_demo(offer.title): + if offer.category != Category.DEMO: + log( + f"Cleaning up category for offer {offer.id}. " + f"Old: {offer.category}, new: {Category.DEMO}.", + ) + offer.category = Category.DEMO + continue + if Scraper.is_prerelease(offer.title): + if offer.category != Category.PRERELEASE: + log( + f"Cleaning up category for offer {offer.id}. " + f"Old: {offer.category}, new: {Category.PRERELEASE}.", + ) + offer.category = Category.PRERELEASE + continue + if Scraper.is_fake_always(offer.valid_to): + if offer.duration != OfferDuration.ALWAYS: + log( + f"Cleaning up duration for offer {offer.id}. " + f"Old: {offer.duration}, new: {OfferDuration.ALWAYS}.", + ) + offer.duration = OfferDuration.ALWAYS + continue + + session.commit() + + async def run_cleanup() -> None: - """Run cleanup functions.""" - logger.info("Running cleanup") + """Clean common problems.""" + log("Running cleanup") with LootDatabase(echo=False) as db: delete_invalid_offers(db.Session()) fix_image_nones(db.Session()) + fix_offer_titles(db.Session()) + fix_offer_categories(db.Session()) + +async def run_refresh() -> None: + """Refresh all game data.""" + log("Running refresh") + with LootDatabase(echo=False) as db: async with get_browser_context() as context: await refresh_all_games(db.Session(), context) @@ -71,7 +134,19 @@ def cleanup() -> None: """ Wrap cleanup functions synchronously. - Run this with `python -c 'import app.tools; app.tools.cleanup()'` for now. - TODO: Add an admin command for this (telegram). + Run this with `python -c 'import lootscraper.tools; lootscraper.tools.cleanup()'` + for now. """ + # TODO: Add an admin command for this (telegram). 
asyncio.run(run_cleanup()) + + +def refresh() -> None: +    """ +    Wrap refresh functions synchronously. + +    Run this with `python -c 'import lootscraper.tools; lootscraper.tools.refresh()'` +    for now. +    """ +    # TODO: Add an admin command for this (telegram). +    asyncio.run(run_refresh())