Restore banz scraper and lawgit functionality #18

Closed
wants to merge 3 commits
133 changes: 81 additions & 52 deletions banz_scraper.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""BAnz-Scraper.

Usage:
@@ -21,25 +23,31 @@
import re
import json

import lxml.html
from bs4 import BeautifulSoup
import requests
import requests.cookies

from typing import List, Tuple

from requests.models import Response


class BAnzScraper:
BASE_URL = 'https://www.bundesanzeiger.de/ebanzwww/wexsservlet?'
BASE = 'page.navid=to_official_part&global_data.designmode=eb'
YEAR = ('page.navid=official_starttoofficial_start_changeyear'
'&genericsearch_param.year=%s&genericsearch_param.edition='
'&genericsearch_param.sort_type=')
LIST = ('genericsearch_param.edition=%s&genericsearch_param.sort_type='
'&%%28page.navid%%3Dofficial_starttoofficial_start_update%%29='
'Veröffentlichungen+anzeigen')
BASE_URL = 'https://www.bundesanzeiger.de/pub/de/'
SET_YEAR_URL_PART: str

MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
'August', 'September', 'Oktober', 'November', 'Dezember']

SESSION_COOKIES = requests.cookies.RequestsCookieJar()

def get(self, url):
return requests.get(url)
return requests.get(url, cookies=self.SESSION_COOKIES)

def post(self, *args, **kwargs) -> Response:
return requests.post(*args, **kwargs, cookies=self.SESSION_COOKIES, headers={
"Referer": "https://www.bundesanzeiger.de/"
})

def scrape(self, low=0, high=10000):
collection = {}
@@ -53,67 +61,88 @@ def scrape(self, low=0, high=10000):
collection.update(self.get_items(year, date))
return collection

def get_years(self):
url = self.BASE_URL + self.BASE
def get_years(self) -> List[int]:
url = self.BASE_URL + "amtlicher-teil"
response = self.get(url)
self.SESSION_COOKIES = response.cookies
years = []
root = lxml.html.fromstring(response.text)
selector = '#td_sub_menu_v li'
for li in root.cssselect(selector):

root = BeautifulSoup(response.text, features="lxml")
self.SET_YEAR_URL_PART = root.find("div", class_="pager_release_year_container").find("form")["action"]

year_menu = root.find(id="id5")

for option in year_menu.find_all("option"):
try:
year = int(li.text_content())
year = int(option.string)
except ValueError:
continue
years.append(year)

return years

def get_dates(self, year):
url = self.BASE_URL + self.YEAR % year
response = self.get(url)
def get_dates(self, year) -> List[Tuple[str, str]]:
set_year_url = self.BASE_URL + self.SET_YEAR_URL_PART.replace("./", "")
response = self.post(set_year_url, data={"year": year})

dates = []
root = lxml.html.fromstring(response.text)
selector = 'select[name="genericsearch_param.edition"] option'
for option in root.cssselect(selector):
dates.append(
(option.attrib['value'], option.text_content().strip()))
root = BeautifulSoup(response.text, features="lxml")

date_menu = root.find(id="id6")
for option in date_menu.find_all("option"):
dates.append((option["value"], option.string.strip()))

return dates

def get_items(self, year, date):
url = self.BASE_URL + self.LIST % date[0]
response = self.get(url)
def get_items(self, year, date: Tuple[str, str]):
set_date_url = self.BASE_URL + f"amtlicher-teil?&year={year}&edition={date[0]}"
response = self.get(set_date_url)

items = {}
root = lxml.html.fromstring(response.text)
selector = 'table[summary="Trefferliste"] tr'
for tr in root.cssselect(selector):
tds = tr.cssselect('td')
if len(tds) != 3:
root = BeautifulSoup(response.text, features="lxml")

results = root.find(class_="result_container")
rows = results.find_all(class_="row")

for row in rows:
if "sticky-top" in row["class"]:
continue
public_body = tds[0].text_content().strip()
link = tds[1].cssselect('a')[0]
additional = []
for c in tds[1].getchildren()[1:]:
if c.tail is not None and c.tail.strip():
additional.append(c.tail.strip())
orig_date = None
for a in additional:
match = re.search(r'[Vv]om (\d+)\. (\w+) (\d{4})', a, re.U)
if match is not None:
day = int(match.group(1))
month = self.MONTHS.index(match.group(2)) + 1
year = int(match.group(3))
orig_date = f'{day:02}.{month:02}.{year}'
break
name = link.text_content()[1:]
name = re.sub(r'\s+', ' ', name)
ident = tds[2].text_content().strip()

print("==========")
spans = row.find_all("span")
title_result = row.find(class_="title_result")

orig_date = ""
match = re.search(r'[Vv]om: (\d+)\. ([\wä]+) (\d{4})', str(title_result), re.U)
if match:
day = int(match.group(1))
month = self.MONTHS.index(match.group(2)) + 1
year = int(match.group(3))
orig_date = '%02d.%02d.%d' % (day, month, year)

name = spans[0].string
public_body: str
if spans[1].string:
public_body = spans[1].string
else:
public_body = spans[1].contents[1] # Throw away br tag at the beginning

ident: str
if spans[2].string:
ident = spans[2].string
else:
ident = spans[2].contents[1] # Throw away br tag at the beginning

items[ident] = {
'ident': ident,
'public_body': public_body,
'name': name,
'date': date[1],
'original_date': orig_date,
'additional': additional
'additional': [] # TODO
}
return items


@@ -129,7 +158,7 @@ def main(arguments):
data = json.load(f)
data.update(banz.scrape(minyear, maxyear))
with open(arguments['<outputfile>'], 'w') as f:
json.dump(data, f)
json.dump(data, f, indent=4)


if __name__ == '__main__':
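For reference, a minimal sketch of how the reworked BAnzScraper might be driven end to end, mirroring the flow in main() above; the year range and output filename are illustrative, not taken from this PR:

    import json

    scraper = BAnzScraper()
    # scrape() walks the requested year range: get_years() stores the session
    # cookies and the year form URL, get_dates() lists the editions per year,
    # and get_items() collects the entries of each edition.
    data = scraper.scrape(2019, 2021)          # illustrative year range
    with open('banz.json', 'w') as f:          # illustrative output path
        json.dump(data, f, indent=4)
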
2 changes: 2 additions & 0 deletions bgbl_scraper.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""BGBl-Scraper.

Usage:
2 changes: 2 additions & 0 deletions lawde.py
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawDe.

Usage:
2 changes: 2 additions & 0 deletions lawdown.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawDown - Law To Markdown.

Usage:
73 changes: 44 additions & 29 deletions lawgit.py
100644 → 100755
@@ -1,3 +1,5 @@
#!/usr/bin/env python3

"""LawGit - Semi-automatic law change commits.

Usage:
@@ -21,9 +23,15 @@
from datetime import datetime, timedelta
from collections import defaultdict

from git import Repo
from git import Repo, Commit, DiffIndex, Diff
from git.exc import GitCommandError

from typing import List, Dict, Tuple


def log(*message: str):
print(datetime.now(), ":", *message)


class TransientState(Exception):
pass
@@ -66,7 +74,7 @@ def load(self, source):
toc['part_i'] = 'I' * toc['part']
self.data[(toc['year'], toc['page'], toc['part'])] = toc

def find_candidates(self, lines):
def find_candidates(self, lines: List[str]):
candidates = []
for line in lines:
for c_re in self.change_re:
@@ -121,8 +129,8 @@ def __str__(self):
def load(self, source):
self.data = json.load(open(source))

def find_candidates(self, lines):
candidates = []
def find_candidates(self, lines: List[str]) -> List[str]:
candidates: List[str] = []
for line in lines:
line = re.sub(r'[^\w \.]', '', line)
line = re.sub(r' \d{4} ', ' ', line)
@@ -245,7 +253,7 @@ def get_message(self, key):

class LawGit:
laws = defaultdict(list)
law_changes = {}
law_changes: Dict[str, Tuple[bool, str, Path]] = {}
bgbl_changes = defaultdict(list)

def __init__(self, path, dry_run=False, consider_old=False, grep=None):
@@ -270,7 +278,7 @@ def prepare_commits(self):
source, key = result
date = source.get_date(key)
if not self.consider_old and date + timedelta(days=30 * 12) < datetime.now():
print(f"Skipped {law} {result} (too old)")
log(f"Skipped {law} {result} (too old)")
continue
branch_name = source.get_branch_name(key)
ident = source.get_ident(key)
@@ -279,17 +287,24 @@ def collect_laws(self):
return branches

def collect_laws(self):
hcommit = self.repo.head.commit
wdiff = hcommit.diff(None, create_patch=True)
hcommit: Commit = self.repo.head.commit
wdiff: DiffIndex = hcommit.diff(None, create_patch=True)

for diff in wdiff:
law_name = diff.b_blob.path.split('/')[1]
if self.grep and self.grep not in law_name:
continue
filename = '/'.join(diff.b_blob.path.split('/')[:2] + ['index.md'])
filename = self.path / filename
if filename.exists():
self.laws[law_name].append(diff.b_blob.path)
self.law_changes[law_name] = (False, diff.diff, filename)
diff: Diff
if diff.b_blob:
law_name = diff.b_blob.path.split('/')[1]
if self.grep and self.grep not in law_name:
continue
filename = '/'.join(diff.b_blob.path.split('/')
[:2] + ['index.md'])
filename = self.path / filename
if filename.exists():
self.laws[law_name].append(diff.b_blob.path)
self.law_changes[law_name] = (
False, diff.diff.decode(), filename)
else:
log("Found deleted law?")

for filename in self.repo.untracked_files:
law_name = filename.split('/')[1]
@@ -302,19 +317,18 @@ def collect_laws(self):
self.law_changes[law_name] = (True, f.read(), filename)

def determine_source(self, law_name):
new_file, lines, filename = self.law_changes[law_name]
lines = [line.decode('utf-8') for line in lines.splitlines()]
new_file, text, filename = self.law_changes[law_name]
lines: List[str] = [line for line in text.splitlines()]
candidates = self.find_in_sources(lines)
if not candidates:
with open(filename) as f:
lines = [line.decode('utf-8')
for line in f.read().splitlines()]
lines = [line for line in f.read().splitlines()]
candidates.extend(self.find_in_sources(lines))
if not candidates:
return None
return sorted(candidates, key=lambda x: x[0].get_order_key(x[1]))[-1]

def find_in_sources(self, lines):
def find_in_sources(self, lines: List[str]):
candidates = []
for source in self.sources:
try:
@@ -333,11 +347,11 @@ def commit_branch(self, branch, commits):
if not self.dry_run:
self.repo.git.stash()
try:
print(f"git checkout -b {branch}")
log(f"git checkout -b {branch}")
if not self.dry_run:
self.repo.git.checkout(b=branch)
except GitCommandError:
print(f"git checkout {branch}")
log(f"git checkout {branch}")
if not self.dry_run:
self.repo.git.checkout(branch)
if not self.dry_run:
@@ -347,22 +361,23 @@
for law_name, source, key in commits[ident]:
for filename in self.laws[law_name]:
if (self.path / filename).exists():
print(f"git add {filename}")
log(f"git add {filename}")
if not self.dry_run:
self.repo.index.add([str(filename)])
else:
print(f"git rm {str(filename)}")
log(f"git rm {str(filename)}")
if not self.dry_run:
self.repo.index.remove([str(filename)])
msg = source.get_message(key)
print(f'git commit -m"{msg}"')

log(f'git commit -m"{msg}"')
if not self.dry_run:
self.repo.index.commit(msg)
print("")
print("git checkout master")
log("")
log("git checkout master")
if not self.dry_run:
self.repo.heads.master.checkout()
print(f"git merge {branch} --no-ff")
log(f"git merge {branch} --no-ff")
if not self.dry_run:
self.repo.git.merge(branch, no_ff=True)

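For reference, a minimal sketch of how the LawGit class above might be driven; the repository path and the assumption that prepare_commits() returns a mapping of branch names to grouped commits are illustrative, not confirmed by this PR:

    from pathlib import Path

    lg = LawGit(Path('laws'), dry_run=True)   # dry_run=True only logs the git commands
    branches = lg.prepare_commits()           # group detected law changes by target branch
    for branch, commits in branches.items():  # assumed shape: branch name -> commit groups
        lg.commit_branch(branch, commits)
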
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ lxml==4.6.3
cssselect==1.1.0
requests==2.25.1
docopt==0.6.2
beautifulsoup4==4.9.3