Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update amazon page layout #297

Merged
merged 1 commit into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 26 additions & 14 deletions src/lootscraper/scraper/amazon_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from typing import TYPE_CHECKING

import schedule
from playwright.async_api import Error, Locator
from playwright.async_api import Error, Locator, TimeoutError

from lootscraper.browser import get_new_page
from lootscraper.common import OfferDuration, Source
from lootscraper.scraper.scraper_base import RawOffer, Scraper

Expand All @@ -32,7 +33,7 @@ def get_duration() -> OfferDuration:

@staticmethod
def get_schedule() -> list[schedule.Job]:
return [schedule.every(30).minutes]
return [schedule.every(60).minutes]

def offers_expected(self) -> bool:
    """Return ``True`` unconditionally.

    NOTE(review): judging by the name, this signals that the scraper
    should always find offers; confirm against the base class contract.
    """
    return True
Expand All @@ -58,12 +59,6 @@ async def read_base_raw_offer(
if title is None:
raise ValueError("Couldn't find title.")

valid_to = await element.locator(
".item-card__availability-date p",
).text_content()
if valid_to is None:
raise ValueError(f"Couldn't find valid to for {title}.")

img_url = await element.locator(
'[data-a-target="card-image"] img',
).get_attribute("src")
Expand All @@ -73,19 +68,36 @@ async def read_base_raw_offer(
url = BASE_URL

try:
path = await element.locator(
'[data-a-target="learn-more-card"]',
).get_attribute("href", timeout=500)
path = await element.get_attribute("href", timeout=500)
if path is not None and not path.startswith("http"):
url += path
except Error:
# Some offers are claimed on site and don't have a specific path.
# That's fine.
pass
raise ValueError(f"Couldn't find detail page for {title}.") from None

try:
valid_to = await self.read_date_from_details_page(url)
except TimeoutError:
# Some offers just have no date. That's fine.
valid_to = None

return AmazonRawOffer(
title=title,
valid_to=valid_to,
url=url,
img_url=img_url,
)

async def read_date_from_details_page(
    self,
    url: str,
) -> str:
    """Load ``url`` in a new page and return its availability date text.

    Args:
        url: Address of the page to open.

    Returns:
        The text content of the element matching
        ``.availability-date span:nth-child(2)``, exactly as found on the
        page (no parsing or trimming happens here).

    Raises:
        ValueError: If the matched element yields no text content.

    Errors raised by the browser calls themselves are not caught here and
    propagate to the caller.
    """
    async with get_new_page(self.context) as details_page:
        # Navigation is capped at 30 seconds (timeout is in milliseconds).
        await details_page.goto(url, timeout=30000)

        date_locator = details_page.locator(
            ".availability-date span:nth-child(2)",
        )
        date_text = await date_locator.text_content()
        if date_text is not None:
            return date_text

        raise ValueError("Couldn't find date.")
55 changes: 30 additions & 25 deletions src/lootscraper/scraper/amazon_games.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from datetime import date, datetime, time, timedelta, timezone
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING

from lootscraper.common import OfferType
Expand All @@ -25,7 +25,7 @@ def get_offer_handlers(self, page: Page) -> list[OfferHandler]:
OfferHandler(
page.locator(
'[data-a-target="offer-list-FGWP_FULL"] '
'[data-a-target="item-card"]',
" .item-card__action > a:first-child",
),
self.read_raw_offer,
self.normalize_offer,
Expand All @@ -35,10 +35,32 @@ def get_offer_handlers(self, page: Page) -> list[OfferHandler]:
@staticmethod
async def _scroll_carousel_to_end(page: Page, max_clicks: int = 10) -> None:
    """Click the carousel's "next" arrow until it reports as disabled.

    Args:
        page: The page that contains the offer carousel.
        max_clicks: Upper bound on the number of clicks, so that a carousel
            whose arrow never becomes disabled cannot loop forever.
    """
    # The locator is lazy and re-resolved on every call, so it only needs
    # to be created once instead of once per iteration.
    next_button = page.locator(
        '[data-a-target="grid-carousel-next-arrow-container"]',
    )

    for _ in range(max_clicks):
        if await next_button.is_disabled():
            break

        await next_button.click()

async def page_loaded_hook(self, page: Page) -> None:
    """Prepare a freshly loaded page so that all offers are present.

    Scrolls the ``root`` element to the bottom and then pages through the
    carousel to its end.

    Args:
        page: The page that has just been loaded.
    """
    await Scraper.scroll_element_to_bottom(page, "root")

    # Scroll through the carousel to load all offers
    await self._scroll_carousel_to_end(page)

async def read_raw_offer(
    self,
    element: Locator,
) -> AmazonRawOffer:
    """Read the raw offer data for a single offer element.

    Args:
        element: Locator pointing at the offer to read.

    Returns:
        The raw offer produced by ``read_base_raw_offer`` for ``element``.
    """
    # Scroll the carousel to the right again, in case the position was lost
    await self._scroll_carousel_to_end(element.page)

    return await self.read_base_raw_offer(element)

def normalize_offer(self, raw_offer: RawOffer) -> Offer:
Expand Down Expand Up @@ -80,45 +102,28 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
if raw_offer.valid_to:
logger.debug(f"Found date: {raw_offer.valid_to} for {raw_offer.title}")
try:
raw_date = raw_offer.valid_to.removeprefix("Ends ").lower()
if raw_date == "today":
raw_date = raw_offer.valid_to.removeprefix("Ends ")
if raw_date.lower() == "today":
parsed_date = datetime.now(tz=timezone.utc).replace(
hour=0,
minute=0,
second=0,
)
elif raw_date == "tomorrow":
elif raw_date.lower() == "tomorrow":
parsed_date = datetime.now(tz=timezone.utc).replace(
hour=0,
minute=0,
second=0,
) + timedelta(days=1)
else:
parsed_date = datetime.now(tz=timezone.utc).replace(
parsed_date = datetime.strptime(raw_date, "%b %d, %Y").replace(
tzinfo=timezone.utc,
hour=0,
minute=0,
second=0,
) + timedelta(days=int(raw_date.split(" ")[1]))

# Correct the year
guessed_end_date = date(
datetime.now(tz=timezone.utc).date().year,
parsed_date.month,
parsed_date.day,
)
yesterday = datetime.now(tz=timezone.utc).date() - timedelta(days=1)
if guessed_end_date < yesterday:
guessed_end_date = guessed_end_date.replace(
year=guessed_end_date.year + 1,
)

# Add 1 day because of the notation
# ("Ends today" means "Ends at 00:00:00 the next day")
end_date = datetime.combine(
guessed_end_date + timedelta(days=1),
time.min,
tzinfo=timezone.utc,
)
end_date = parsed_date
except (ValueError, IndexError):
logger.warning(f"Date parsing failed for {raw_offer.title}")

Expand Down
33 changes: 8 additions & 25 deletions src/lootscraper/scraper/amazon_loot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta, timezone
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING

from lootscraper.common import OfferType
Expand Down Expand Up @@ -31,7 +31,7 @@ def get_offer_handlers(self, page: Page) -> list[OfferHandler]:
OfferHandler(
page.locator(
'[data-a-target="offer-list-IN_GAME_LOOT"] '
'[data-a-target="item-card"]',
" .item-card__action > a:first-child",
),
self.read_raw_offer,
self.normalize_offer,
Expand Down Expand Up @@ -104,45 +104,28 @@ def normalize_offer(self, raw_offer: RawOffer) -> Offer:
if raw_offer.valid_to:
logger.debug(f"Found date: {raw_offer.valid_to} for {raw_offer.title}")
try:
raw_date = raw_offer.valid_to.removeprefix("Ends ").lower()
if raw_date == "today":
raw_date = raw_offer.valid_to.removeprefix("Ends ")
if raw_date.lower() == "today":
parsed_date = datetime.now(tz=timezone.utc).replace(
hour=0,
minute=0,
second=0,
)
elif raw_date == "tomorrow":
elif raw_date.lower() == "tomorrow":
parsed_date = datetime.now(tz=timezone.utc).replace(
hour=0,
minute=0,
second=0,
) + timedelta(days=1)
else:
parsed_date = datetime.now(tz=timezone.utc).replace(
parsed_date = datetime.strptime(raw_date, "%b %d, %Y").replace(
tzinfo=timezone.utc,
hour=0,
minute=0,
second=0,
) + timedelta(days=int(raw_date.split(" ")[1]))

# Correct the year
guessed_end_date = date(
datetime.now(tz=timezone.utc).date().year,
parsed_date.month,
parsed_date.day,
)
yesterday = datetime.now(tz=timezone.utc).date() - timedelta(days=1)
if guessed_end_date < yesterday:
guessed_end_date = guessed_end_date.replace(
year=guessed_end_date.year + 1,
)

# Add 1 day because of the notation
# ("Ends today" means "Ends at 00:00:00 the next day")
end_date = datetime.combine(
guessed_end_date + timedelta(days=1),
time.min,
tzinfo=timezone.utc,
)
end_date = parsed_date
except (ValueError, IndexError):
logger.warning(f"Date parsing failed for {raw_offer.title}")

Expand Down
Loading