Skip to content

Commit

Permalink
Merge pull request #31 from bundestag/bgbl
Browse files Browse the repository at this point in the history
Fix BGBl scraper.
  • Loading branch information
darkdragon-001 committed Mar 30, 2021
2 parents fe971ea + f3fca36 commit fd0c108
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 18 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Last tested: 2017-01-14 SUCCESS
Scrapes the table of contents of all issues of the Bundesgesetzblatt and dumps
the result to JSON.

Last tested: 2017-01-14 FAILED ("KeyError: xaversid")
Last tested: 2021-03-30 SUCCESS

## banz_scraper.py

Expand All @@ -53,4 +53,4 @@ Checks the repositories working directory for changes, tries to find relations
to table of content entries in BGBl and BAnz data, commits the changes to a branch
and merges the branch into master.

Last tested: 2017-01-14 SUCCESS
Last tested: 2017-01-14 SUCCESS
37 changes: 21 additions & 16 deletions bgbl_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
"""
import sys
from pathlib import Path
import urllib.parse
import re
import json
from collections import defaultdict
import time
import roman_numbers

import lxml.html
Expand All @@ -31,25 +33,28 @@

class BGBLScraper:
BASE_URL = 'http://www.bgbl.de/xaver/bgbl/'
TOC = ('ajax.xav?q=toclevel'
'&n=')
TEXT = ('text.xav?tf=xaver.component.Text_0'
'&hlf=xaver.component.Hitlist_0'
'&tocid=')

def __init__(self):
    """Create an HTTP session and prime it with the site's cookies."""
    # The xaver backend refuses the AJAX endpoints without the session
    # cookies handed out by the start page, so fetch it once up front.
    http = requests.session()
    http.get(self.BASE_URL + 'start.xav')
    self.session = http

def downloadUrl(self, file, query=None):
    """GET a JSON resource relative to BASE_URL and decode it.

    file  -- path component appended to BASE_URL (e.g. 'ajax.xav').
    query -- optional mapping of query-string parameters; the caller's
             mapping is never modified.

    Returns the decoded JSON response body.
    """
    # Work on a copy: the previous `query={}` mutable default would
    # accumulate keys across calls, and writing into the passed-in dict
    # leaked the cache-buster back to the caller.
    params = dict(query) if query else {}
    # Cache-buster in milliseconds, mirroring the site's own JavaScript.
    params['request.preventCache'] = int(time.time() * 1000)
    response = self.session.get(
        f'{self.BASE_URL}{file}?{urllib.parse.urlencode(params)}')
    return response.json()

def downloadToc(self, id = 0):
return self.downloadUrl(self.TOC + str(id))['items'][0]

def downloadText(self, id) -> lxml.html.HtmlElement:
response = self.downloadUrl(self.TEXT + str(id))
def downloadToc(self, toc_id=0):
    """Fetch one table-of-contents level; returns its first item."""
    params = {'q': 'toclevel', 'n': str(toc_id)}
    items = self.downloadUrl('ajax.xav', params)['items']
    return items[0]

def downloadText(self, toc_id, doc_id) -> lxml.html.HtmlElement:
    """Fetch the rendered text for a TOC entry and parse it as HTML."""
    return lxml.html.fromstring(
        self.downloadUrl('text.xav', {
            'tf': 'xaver.component.Text_0',
            'hlf': 'xaver.component.Hitlist_0',
            'tocid': str(toc_id),
            # XPath picking the document node to start rendering from.
            'start': f"//*[@node_id='{doc_id}']",
        })['innerhtml'])

def scrape(self, year_low=0, year_high=sys.maxsize):
Expand Down Expand Up @@ -96,19 +101,19 @@ def get_part_toc(self, part_id):
def get_year_toc(self, year_id):
    """Collect the TOCs of all issues (numbers) published in one year.

    year_id -- TOC node id of the year as returned by downloadToc().

    Returns a dict mapping issue number (int) to the entries produced
    by get_number_toc().
    """
    response = self.downloadToc(year_id)
    assert response['id'] == year_id
    # Removed stray debug `print(response)` that dumped the raw JSON.
    result = {}
    for item in response['c']:
        # Child labels look like "Nr. 12 vom 30.03.2021".
        match = re.match(r'Nr\. (\d+) vom (\d{2}\.\d{2}\.\d{4})', item['l'])
        if not match:
            continue
        number = int(match.group(1))
        date = match.group(2)
        print(f"Getting Number TOC {number} from {date}")
        # Both the toc id and the document id ('did') are needed to
        # render the issue's text page.
        result[number] = self.get_number_toc(item['id'], item['did'])
    return result

def get_number_toc(self, number_id):
root = self.downloadText(number_id)
def get_number_toc(self, number_id, number_did):
#response = self.downloadToc(number_id)
root = self.downloadText(number_id, number_did)
toc = []
for tr in root.cssselect('tr'):
td: lxml.html.HtmlElement = tr.cssselect('td')[1]
Expand Down

0 comments on commit fd0c108

Please sign in to comment.