diff --git a/banz_scraper.py b/banz_scraper.py
index 3e253b14..f2020803 100755
--- a/banz_scraper.py
+++ b/banz_scraper.py
@@ -23,25 +23,31 @@
 import re
 import json
 
-import lxml.html
+from bs4 import BeautifulSoup
 import requests
+import requests.cookies
+
+from typing import List, Tuple
+
+from requests.models import Response
 
 
 class BAnzScraper:
-    BASE_URL = 'https://www.bundesanzeiger.de/ebanzwww/wexsservlet?'
-    BASE = 'page.navid=to_official_part&global_data.designmode=eb'
-    YEAR = ('page.navid=official_starttoofficial_start_changeyear'
-            '&genericsearch_param.year=%s&genericsearch_param.edition='
-            '&genericsearch_param.sort_type=')
-    LIST = ('genericsearch_param.edition=%s&genericsearch_param.sort_type='
-            '&%%28page.navid%%3Dofficial_starttoofficial_start_update%%29='
-            'Veröffentlichungen+anzeigen')
+    BASE_URL = 'https://www.bundesanzeiger.de/pub/de/'
+    SET_YEAR_URL_PART: str
 
     MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
               'August', 'September', 'Oktober', 'November', 'Dezember']
 
+    SESSION_COOKIES = requests.cookies.RequestsCookieJar()
+
     def get(self, url):
-        return requests.get(url)
+        return requests.get(url, cookies=self.SESSION_COOKIES)
+
+    def post(self, *args, **kwargs) -> Response:
+        return requests.post(*args, **kwargs, cookies=self.SESSION_COOKIES, headers={
+            "Referer": "https://www.bundesanzeiger.de/"
+        })
 
     def scrape(self, low=0, high=10000):
         collection = {}
@@ -55,67 +61,91 @@ def scrape(self, low=0, high=10000):
             collection.update(self.get_items(year, date))
         return collection
 
-    def get_years(self):
-        url = self.BASE_URL + self.BASE
+    def get_years(self) -> List[int]:
+        url = self.BASE_URL + "amtlicher-teil"
         response = self.get(url)
+        # keep the session cookie from the first response; get() and post()
+        # send it with every subsequent request
+        self.SESSION_COOKIES = response.cookies
         years = []
-        root = lxml.html.fromstring(response.text)
-        selector = '#td_sub_menu_v li'
-        for li in root.cssselect(selector):
+
+        root = BeautifulSoup(response.text, features="lxml")
+        year_form = root.find("div", class_="pager_release_year_container").find("form")
+        self.SET_YEAR_URL_PART = year_form["action"]
+
+        # the year dropdown is rendered with the generated id "id5"
+        year_menu = root.find(id="id5")
+
+        for option in year_menu.find_all("option"):
             try:
-                year = int(li.text_content())
+                year = int(option.string)
             except ValueError:
                 continue
             years.append(year)
+
         return years
 
-    def get_dates(self, year):
-        url = self.BASE_URL + self.YEAR % year
-        response = self.get(url)
+    def get_dates(self, year) -> List[Tuple[str, str]]:
+        set_year_url = self.BASE_URL + self.SET_YEAR_URL_PART.replace("./", "")
+        response = self.post(set_year_url, data={"year": year})
+
         dates = []
-        root = lxml.html.fromstring(response.text)
-        selector = 'select[name="genericsearch_param.edition"] option'
-        for option in root.cssselect(selector):
-            dates.append(
-                (option.attrib['value'], option.text_content().strip()))
+        root = BeautifulSoup(response.text, features="lxml")
+
+        # the edition dropdown is rendered with the generated id "id6"
+        date_menu = root.find(id="id6")
+        for option in date_menu.find_all("option"):
+            dates.append((option["value"], option.string.strip()))
+
         return dates
 
-    def get_items(self, year, date):
-        url = self.BASE_URL + self.LIST % date[0]
-        response = self.get(url)
+    def get_items(self, year, date: Tuple[str, str]):
+        set_date_url = self.BASE_URL + f"amtlicher-teil?year={year}&edition={date[0]}"
+        response = self.get(set_date_url)
+
         items = {}
-        root = lxml.html.fromstring(response.text)
-        selector = 'table[summary="Trefferliste"] tr'
-        for tr in root.cssselect(selector):
-            tds = tr.cssselect('td')
-            if len(tds) != 3:
+        root = BeautifulSoup(response.text, features="lxml")
+
+        results = root.find(class_="result_container")
+        rows = results.find_all(class_="row")
+
+        for row in rows:
+            # the "sticky-top" row is the column header of the list, skip it
+            if "sticky-top" in row["class"]:
                 continue
-            public_body = tds[0].text_content().strip()
-            link = tds[1].cssselect('a')[0]
-            additional = []
-            for c in tds[1].getchildren()[1:]:
-                if c.tail is not None and c.tail.strip():
-                    additional.append(c.tail.strip())
-            orig_date = None
-            for a in additional:
-                match = re.search(r'[Vv]om (\d+)\. (\w+) (\d{4})', a, re.U)
-                if match is not None:
-                    day = int(match.group(1))
-                    month = self.MONTHS.index(match.group(2)) + 1
-                    year = int(match.group(3))
-                    orig_date = f'{day:02}.{month:02}.{year}'
-                    break
-            name = link.text_content()[1:]
-            name = re.sub(r'\s+', ' ', name)
-            ident = tds[2].text_content().strip()
+
+            spans = row.find_all("span")
+            title_result = row.find(class_="title_result")
+
+            orig_date = ""
+            match = re.search(r'[Vv]om: (\d+)\. ([\wä]+) (\d{4})', str(title_result), re.U)
+            if match:
+                day = int(match.group(1))
+                month = self.MONTHS.index(match.group(2)) + 1
+                orig_year = int(match.group(3))
+                orig_date = f'{day:02}.{month:02}.{orig_year}'
+
+            name = spans[0].string
+            public_body: str
+            if spans[1].string:
+                public_body = spans[1].string
+            else:
+                public_body = spans[1].contents[1]  # skip the leading <br> tag
+
+            ident: str
+            if spans[2].string:
+                ident = spans[2].string
+            else:
+                ident = spans[2].contents[1]  # skip the leading <br> tag
+
             items[ident] = {
                 'ident': ident,
                 'public_body': public_body,
                 'name': name,
                 'date': date[1],
                 'original_date': orig_date,
-                'additional': additional
+                'additional': []  # TODO
             }
         return items
 
 
@@ -131,7 +161,7 @@ def main(arguments):
         data = json.load(f)
     data.update(banz.scrape(minyear, maxyear))
     with open(arguments['<filename>'], 'w') as f:
-        json.dump(data, f)
+        json.dump(data, f, indent=4)
 
 
 if __name__ == '__main__':
diff --git a/requirements.txt b/requirements.txt
index d8fb88d8..b81a7c01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ lxml==4.6.3
 cssselect==1.1.0
 requests==2.25.1
 docopt==0.6.2
+beautifulsoup4==4.9.3
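
Not part of the patch: a minimal smoke-test sketch for the updated scraper, assuming banz_scraper.py is importable from the working directory; the year range and the output filename here are arbitrary examples.

    import json

    from banz_scraper import BAnzScraper

    scraper = BAnzScraper()
    # scrape(low, high) limits the run to years within [low, high]
    data = scraper.scrape(2021, 2021)
    with open('banz-2021.json', 'w') as f:
        json.dump(data, f, indent=4)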