Reimplement some parts of banz_scraper to make it work again
jbruechert committed Mar 27, 2021
1 parent 1b16453 commit d44ff34
Showing 2 changed files with 80 additions and 51 deletions.
130 changes: 79 additions & 51 deletions banz_scraper.py
@@ -23,25 +23,31 @@
 import re
 import json
 
-import lxml.html
+from bs4 import BeautifulSoup
 import requests
+import requests.cookies
+
+from typing import List, Tuple
+
+from requests.models import Response
 
 
 class BAnzScraper:
-    BASE_URL = 'https://www.bundesanzeiger.de/ebanzwww/wexsservlet?'
-    BASE = 'page.navid=to_official_part&global_data.designmode=eb'
-    YEAR = ('page.navid=official_starttoofficial_start_changeyear'
-            '&genericsearch_param.year=%s&genericsearch_param.edition='
-            '&genericsearch_param.sort_type=')
-    LIST = ('genericsearch_param.edition=%s&genericsearch_param.sort_type='
-            '&%%28page.navid%%3Dofficial_starttoofficial_start_update%%29='
-            'Veröffentlichungen+anzeigen')
+    BASE_URL = 'https://www.bundesanzeiger.de/pub/de/'
+    SET_YEAR_URL_PART: str
 
     MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
               'August', 'September', 'Oktober', 'November', 'Dezember']
 
+    SESSION_COOKIES = requests.cookies.RequestsCookieJar()
+
     def get(self, url):
-        return requests.get(url)
+        return requests.get(url, cookies=self.SESSION_COOKIES)
+
+    def post(self, *args, **kwargs) -> Response:
+        return requests.post(*args, **kwargs, cookies=self.SESSION_COOKIES, headers={
+            "Referer": "https://www.bundesanzeiger.de/"
+        })
 
     def scrape(self, low=0, high=10000):
         collection = {}
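
The new get/post helpers above reuse a shared cookie jar and attach a Referer header to POST requests. A minimal standalone sketch of that request pattern with the plain requests API (the form endpoint and field below are only illustrative, not taken from the commit):

    import requests

    # The first page load establishes a session; its cookies are reused for later calls.
    first = requests.get("https://www.bundesanzeiger.de/pub/de/amtlicher-teil")
    session_cookies = first.cookies

    # Subsequent POSTs reuse the cookie jar and carry an explicit Referer header,
    # mirroring what the commit's post() helper does.
    response = requests.post(
        "https://www.bundesanzeiger.de/pub/de/example-form",  # hypothetical endpoint
        data={"year": 2021},
        cookies=session_cookies,
        headers={"Referer": "https://www.bundesanzeiger.de/"},
    )
    print(response.status_code)
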
@@ -55,66 +61,88 @@ def scrape(self, low=0, high=10000):
             collection.update(self.get_items(year, date))
         return collection
 
-    def get_years(self):
-        url = self.BASE_URL + self.BASE
+    def get_years(self) -> List[int]:
+        url = self.BASE_URL + "amtlicher-teil"
         response = self.get(url)
+        self.SESSION_COOKIES = response.cookies
+        response.cookies
         years = []
-        root = lxml.html.fromstring(response.text)
-        selector = '#td_sub_menu_v li'
-        for li in root.cssselect(selector):
+
+        root = BeautifulSoup(response.text, features="lxml")
+        self.SET_YEAR_URL_PART = root.find("div", class_="pager_release_year_container").find("form")["action"]
+
+        year_menu = root.find(id="id5")
+
+        for option in year_menu.find_all("option"):
             try:
-                year = int(li.text_content())
+                year = int(option.string)
             except ValueError:
                 continue
             years.append(year)
+
         return years
 
-    def get_dates(self, year):
-        url = self.BASE_URL + self.YEAR % year
-        response = self.get(url)
+    def get_dates(self, year) -> List[Tuple[str, str]]:
+        set_year_url = self.BASE_URL + self.SET_YEAR_URL_PART.replace("./", "")
+        response = self.post(set_year_url, data={"year": year})
+
         dates = []
-        root = lxml.html.fromstring(response.text)
-        selector = 'select[name="genericsearch_param.edition"] option'
-        for option in root.cssselect(selector):
-            dates.append((option.attrib['value'], option.text_content().strip()))
+        root = BeautifulSoup(response.text, features="lxml")
+
+        date_menu = root.find(id="id6")
+        for option in date_menu.find_all("option"):
+            dates.append((option["value"], option.string.strip()))
+
         return dates
 
-    def get_items(self, year, date):
-        url = self.BASE_URL + self.LIST % date[0]
-        response = self.get(url)
+    def get_items(self, year, date: Tuple[str, str]):
+        set_date_url = self.BASE_URL + f"amtlicher-teil?&year={year}&edition={date[0]}"
+        response = self.get(set_date_url)
+
         items = {}
-        root = lxml.html.fromstring(response.text)
-        selector = 'table[summary="Trefferliste"] tr'
-        for tr in root.cssselect(selector):
-            tds = tr.cssselect('td')
-            if len(tds) != 3:
+        root = BeautifulSoup(response.text, features="lxml")
+
+        results = root.find(class_="result_container")
+        rows = results.find_all(class_="row")
+
+        for row in rows:
+            if "sticky-top" in row["class"]:
                 continue
-            public_body = tds[0].text_content().strip()
-            link = tds[1].cssselect('a')[0]
-            additional = []
-            for c in tds[1].getchildren()[1:]:
-                if c.tail is not None and c.tail.strip():
-                    additional.append(c.tail.strip())
-            orig_date = None
-            for a in additional:
-                match = re.search(r'[Vv]om (\d+)\. (\w+) (\d{4})', a, re.U)
-                if match is not None:
-                    day = int(match.group(1))
-                    month = self.MONTHS.index(match.group(2)) + 1
-                    year = int(match.group(3))
-                    orig_date = f'{day:02}.{month:02}.{year}'
-                    break
-            name = link.text_content()[1:]
-            name = re.sub(r'\s+', ' ', name)
-            ident = tds[2].text_content().strip()
+
+            print("==========")
+            spans = row.find_all("span")
+            title_result = row.find(class_="title_result")
+
+            orig_date = ""
+            match = re.search('[Vv]om: (\d+)\. ([\wä]+) (\d{4})', str(title_result), re.U)
+            if match:
+                day = int(match.group(1))
+                month = self.MONTHS.index(match.group(2)) + 1
+                year = int(match.group(3))
+                orig_date = '%02d.%02d.%d' % (day, month, year)
+
+            name = spans[0].string
+            public_body: str
+            if spans[1].string:
+                public_body = spans[1].string
+            else:
+                public_body = spans[1].contents[1] # Throw away br tag at the beginning
+
+            ident: str
+            if spans[2].string:
+                ident = spans[2].string
+            else:
+                ident = spans[2].contents[1] # Throw away br tag at the beginning
+
             items[ident] = {
                 'ident': ident,
                 'public_body': public_body,
                 'name': name,
                 'date': date[1],
                 'original_date': orig_date,
-                'additional': additional
+                'additional': [] # TODO
             }
+        print(items)
         return items
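
The reworked get_items pulls the original publication date out of the stringified title_result element with the '[Vv]om: …' regex. A quick worked example of that pattern in isolation (the markup string below is made up for illustration, written as a raw string):

    import re

    MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
              'August', 'September', 'Oktober', 'November', 'Dezember']

    title = '<div class="title_result">Bekanntmachung Vom: 5. März 2021</div>'  # hypothetical markup
    match = re.search(r'[Vv]om: (\d+)\. ([\wä]+) (\d{4})', title, re.U)
    if match:
        day = int(match.group(1))
        month = MONTHS.index(match.group(2)) + 1
        year = int(match.group(3))
        print('%02d.%02d.%d' % (day, month, year))  # prints 05.03.2021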


@@ -130,7 +158,7 @@ def main(arguments):
             data = json.load(f)
     data.update(banz.scrape(minyear, maxyear))
     with open(arguments['<outputfile>'], 'w') as f:
-        json.dump(data, f)
+        json.dump(data, f, indent=4)
 
 if __name__ == '__main__':
     from docopt import docopt
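
For reference, a minimal sketch of driving the reworked scraper directly from Python instead of through the docopt CLI (assuming the file is importable as banz_scraper; the year range and output path are arbitrary examples):

    import json

    from banz_scraper import BAnzScraper

    scraper = BAnzScraper()
    # Collect all editions whose year falls between the two bounds.
    data = scraper.scrape(2020, 2021)

    with open('banz.json', 'w') as f:
        json.dump(data, f, indent=4)
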
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ lxml==4.6.3
 cssselect==1.1.0
 requests==2.25.1
 docopt==0.6.2
+beautifulsoup4==4.9.3
