Restore banz scraper and lawgit functionality #18

Closed
wants to merge 3 commits
133 changes: 81 additions & 52 deletions banz_scraper.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""BAnz-Scraper.

Usage:
@@ -21,25 +23,31 @@
import re
import json

import lxml.html
from bs4 import BeautifulSoup
import requests
import requests.cookies

from typing import List, Tuple

from requests.models import Response


class BAnzScraper:
BASE_URL = 'https://www.bundesanzeiger.de/ebanzwww/wexsservlet?'
BASE = 'page.navid=to_official_part&global_data.designmode=eb'
YEAR = ('page.navid=official_starttoofficial_start_changeyear'
'&genericsearch_param.year=%s&genericsearch_param.edition='
'&genericsearch_param.sort_type=')
LIST = ('genericsearch_param.edition=%s&genericsearch_param.sort_type='
'&%%28page.navid%%3Dofficial_starttoofficial_start_update%%29='
'Veröffentlichungen+anzeigen')
BASE_URL = 'https://www.bundesanzeiger.de/pub/de/'
SET_YEAR_URL_PART: str

MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
'August', 'September', 'Oktober', 'November', 'Dezember']

SESSION_COOKIES = requests.cookies.RequestsCookieJar()

def get(self, url):
return requests.get(url)
return requests.get(url, cookies=self.SESSION_COOKIES)

def post(self, *args, **kwargs) -> Response:
return requests.post(*args, **kwargs, cookies=self.SESSION_COOKIES, headers={
"Referer": "https://www.bundesanzeiger.de/"
})

def scrape(self, low=0, high=10000):
collection = {}
@@ -53,67 +61,88 @@ def scrape(self, low=0, high=10000):
collection.update(self.get_items(year, date))
return collection

def get_years(self):
url = self.BASE_URL + self.BASE
def get_years(self) -> List[int]:
url = self.BASE_URL + "amtlicher-teil"
response = self.get(url)
self.SESSION_COOKIES = response.cookies
years = []
root = lxml.html.fromstring(response.text)
selector = '#td_sub_menu_v li'
for li in root.cssselect(selector):

root = BeautifulSoup(response.text, features="lxml")
self.SET_YEAR_URL_PART = root.find("div", class_="pager_release_year_container").find("form")["action"]

year_menu = root.find(id="id5")

for option in year_menu.find_all("option"):
try:
year = int(li.text_content())
year = int(option.string)
except ValueError:
continue
years.append(year)

return years

def get_dates(self, year):
url = self.BASE_URL + self.YEAR % year
response = self.get(url)
def get_dates(self, year) -> List[Tuple[str, str]]:
set_year_url = self.BASE_URL + self.SET_YEAR_URL_PART.replace("./", "")
response = self.post(set_year_url, data={"year": year})

dates = []
root = lxml.html.fromstring(response.text)
selector = 'select[name="genericsearch_param.edition"] option'
for option in root.cssselect(selector):
dates.append(
(option.attrib['value'], option.text_content().strip()))
root = BeautifulSoup(response.text, features="lxml")

date_menu = root.find(id="id6")
for option in date_menu.find_all("option"):
dates.append((option["value"], option.string.strip()))

return dates

def get_items(self, year, date):
url = self.BASE_URL + self.LIST % date[0]
response = self.get(url)
def get_items(self, year, date: Tuple[str, str]):
set_date_url = self.BASE_URL + f"amtlicher-teil?&year={year}&edition={date[0]}"
response = self.get(set_date_url)

items = {}
root = lxml.html.fromstring(response.text)
selector = 'table[summary="Trefferliste"] tr'
for tr in root.cssselect(selector):
tds = tr.cssselect('td')
if len(tds) != 3:
root = BeautifulSoup(response.text, features="lxml")

results = root.find(class_="result_container")
rows = results.find_all(class_="row")

for row in rows:
if "sticky-top" in row["class"]:
continue
public_body = tds[0].text_content().strip()
link = tds[1].cssselect('a')[0]
additional = []
for c in tds[1].getchildren()[1:]:
if c.tail is not None and c.tail.strip():
additional.append(c.tail.strip())
orig_date = None
for a in additional:
match = re.search(r'[Vv]om (\d+)\. (\w+) (\d{4})', a, re.U)
if match is not None:
day = int(match.group(1))
month = self.MONTHS.index(match.group(2)) + 1
year = int(match.group(3))
orig_date = f'{day:02}.{month:02}.{year}'
break
name = link.text_content()[1:]
name = re.sub(r'\s+', ' ', name)
ident = tds[2].text_content().strip()

print("==========")
spans = row.find_all("span")
title_result = row.find(class_="title_result")

orig_date = ""
match = re.search(r'[Vv]om: (\d+)\. ([\wä]+) (\d{4})', str(title_result), re.U)
if match:
day = int(match.group(1))
month = self.MONTHS.index(match.group(2)) + 1
year = int(match.group(3))
orig_date = '%02d.%02d.%d' % (day, month, year)

name = spans[0].string
public_body: str
if spans[1].string:
public_body = spans[1].string
else:
public_body = spans[1].contents[1] # Throw away br tag at the beginning

ident: str
if spans[2].string:
ident = spans[2].string
else:
ident = spans[2].contents[1] # Throw away br tag at the beginning

items[ident] = {
'ident': ident,
'public_body': public_body,
'name': name,
'date': date[1],
'original_date': orig_date,
'additional': additional
'additional': [] # TODO
}
return items


@@ -129,7 +158,7 @@ def main(arguments):
data = json.load(f)
data.update(banz.scrape(minyear, maxyear))
with open(arguments['<outputfile>'], 'w') as f:
json.dump(data, f)
json.dump(data, f, indent=4)


if __name__ == '__main__':
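For reference, a minimal sketch of how the reworked BAnzScraper might be driven end to end, mirroring the flow in main() above; the year range and output filename are illustrative, not taken from this PR:

    import json

    scraper = BAnzScraper()
    # scrape() walks the requested year range: get_years() stores the session
    # cookies and the year form URL, get_dates() lists the editions per year,
    # and get_items() collects the entries of each edition.
    data = scraper.scrape(2019, 2021)          # illustrative year range
    with open('banz.json', 'w') as f:          # illustrative output path
        json.dump(data, f, indent=4)
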
2 changes: 2 additions & 0 deletions bgbl_scraper.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""BGBl-Scraper.

Usage:
2 changes: 2 additions & 0 deletions lawde.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawDe.

Usage:
2 changes: 2 additions & 0 deletions lawdown.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawDown - Law To Markdown.

Usage:
73 changes: 44 additions & 29 deletions lawgit.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawGit - Semi-automatic law change commits.

Usage:
@@ -21,9 +23,15 @@
from datetime import datetime, timedelta
from collections import defaultdict

from git import Repo
from git import Repo, Commit, DiffIndex, Diff
from git.exc import GitCommandError

from typing import List, Dict, Tuple


def log(*message: str):
print(datetime.now(), ":", *message)


class TransientState(Exception):
pass
@@ -66,7 +74,7 @@ def load(self, source):
toc['part_i'] = 'I' * toc['part']
self.data[(toc['year'], toc['page'], toc['part'])] = toc

def find_candidates(self, lines):
def find_candidates(self, lines: List[str]):
candidates = []
for line in lines:
for c_re in self.change_re:
@@ -121,8 +129,8 @@ def __str__(self):
def load(self, source):
self.data = json.load(open(source))

def find_candidates(self, lines):
candidates = []
def find_candidates(self, lines: List[str]) -> List[str]:
candidates: List[str] = []
for line in lines:
line = re.sub(r'[^\w \.]', '', line)
line = re.sub(r' \d{4} ', ' ', line)
@@ -245,7 +253,7 @@ def get_message(self, key):

class LawGit:
laws = defaultdict(list)
law_changes = {}
law_changes: Dict[str, Tuple[bool, str, Path]] = {}
bgbl_changes = defaultdict(list)

def __init__(self, path, dry_run=False, consider_old=False, grep=None):
@@ -270,7 +278,7 @@ def prepare_commits(self):
source, key = result
date = source.get_date(key)
if not self.consider_old and date + timedelta(days=30 * 12) < datetime.now():
print(f"Skipped {law} {result} (too old)")
log(f"Skipped {law} {result} (too old)")
continue
branch_name = source.get_branch_name(key)
ident = source.get_ident(key)
@@ -279,17 +287,24 @@ def collect_laws(self):
return branches

def collect_laws(self):
hcommit = self.repo.head.commit
wdiff = hcommit.diff(None, create_patch=True)
hcommit: Commit = self.repo.head.commit
wdiff: DiffIndex = hcommit.diff(None, create_patch=True)

for diff in wdiff:
law_name = diff.b_blob.path.split('/')[1]
if self.grep and self.grep not in law_name:
continue
filename = '/'.join(diff.b_blob.path.split('/')[:2] + ['index.md'])
filename = self.path / filename
if filename.exists():
self.laws[law_name].append(diff.b_blob.path)
self.law_changes[law_name] = (False, diff.diff, filename)
diff: Diff
if diff.b_blob:
law_name = diff.b_blob.path.split('/')[1]
if self.grep and self.grep not in law_name:
continue
filename = '/'.join(diff.b_blob.path.split('/')
[:2] + ['index.md'])
filename = self.path / filename
if filename.exists():
self.laws[law_name].append(diff.b_blob.path)
self.law_changes[law_name] = (
False, diff.diff.decode(), filename)
else:
log("Found deleted law?")

for filename in self.repo.untracked_files:
law_name = filename.split('/')[1]
@@ -302,19 +317,18 @@ def collect_laws(self):
self.law_changes[law_name] = (True, f.read(), filename)

def determine_source(self, law_name):
new_file, lines, filename = self.law_changes[law_name]
lines = [line.decode('utf-8') for line in lines.splitlines()]
new_file, text, filename = self.law_changes[law_name]
lines: List[str] = [line for line in text.splitlines()]
candidates = self.find_in_sources(lines)
if not candidates:
with open(filename) as f:
lines = [line.decode('utf-8')
for line in f.read().splitlines()]
lines = [line for line in f.read().splitlines()]
candidates.extend(self.find_in_sources(lines))
if not candidates:
return None
return sorted(candidates, key=lambda x: x[0].get_order_key(x[1]))[-1]

def find_in_sources(self, lines):
def find_in_sources(self, lines: List[str]):
candidates = []
for source in self.sources:
try:
@@ -333,11 +347,11 @@ def commit_branch(self, branch, commits):
if not self.dry_run:
self.repo.git.stash()
try:
print(f"git checkout -b {branch}")
log(f"git checkout -b {branch}")
if not self.dry_run:
self.repo.git.checkout(b=branch)
except GitCommandError:
print(f"git checkout {branch}")
log(f"git checkout {branch}")
if not self.dry_run:
self.repo.git.checkout(branch)
if not self.dry_run:
@@ -347,22 +361,23 @@
for law_name, source, key in commits[ident]:
for filename in self.laws[law_name]:
if (self.path / filename).exists():
print(f"git add {filename}")
log(f"git add {filename}")
if not self.dry_run:
self.repo.index.add([str(filename)])
else:
print(f"git rm {str(filename)}")
log(f"git rm {str(filename)}")
if not self.dry_run:
self.repo.index.remove([str(filename)])
msg = source.get_message(key)
print(f'git commit -m"{msg}"')

log(f'git commit -m"{msg}"')
if not self.dry_run:
self.repo.index.commit(msg)
print("")
print("git checkout master")
log("")
log("git checkout master")
if not self.dry_run:
self.repo.heads.master.checkout()
print(f"git merge {branch} --no-ff")
log(f"git merge {branch} --no-ff")
if not self.dry_run:
self.repo.git.merge(branch, no_ff=True)

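For reference, a minimal sketch of how the LawGit class above might be driven; the repository path and the assumption that prepare_commits() returns a mapping of branch names to grouped commits are illustrative, not confirmed by this PR:

    from pathlib import Path

    lg = LawGit(Path('laws'), dry_run=True)   # dry_run=True only logs the git commands
    branches = lg.prepare_commits()           # group detected law changes by target branch
    for branch, commits in branches.items():  # assumed shape: branch name -> commit groups
        lg.commit_branch(branch, commits)
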
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ lxml==4.6.3
cssselect==1.1.0
requests==2.25.1
docopt==0.6.2
beautifulsoup4==4.9.3