Skip to content

Commit

Permalink
Merge pull request #31 from bundestag/bgbl
Browse files Browse the repository at this point in the history
Fix BGBl scraper.
  • Loading branch information
darkdragon-001 committed Mar 30, 2021
2 parents fe971ea + f3fca36 commit fd0c108
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 18 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Last tested: 2017-01-14 SUCCESS
Scrapes the table of contents of all issues of the Bundesgesetzblatt and dumps
the result to JSON.

Last tested: 2017-01-14 FAILED ("KeyError: xaversid")
Last tested: 2021-03-30 SUCCESS

## banz_scraper.py

Expand All @@ -53,4 +53,4 @@ Checks the repositories working directory for changes, tries to find relations
to table of content entries in BGBl and BAnz data, commits the changes to a branch
and merges the branch into master.

Last tested: 2017-01-14 SUCCESS
Last tested: 2017-01-14 SUCCESS
37 changes: 21 additions & 16 deletions bgbl_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
"""
import sys
from pathlib import Path
import urllib.parse
import re
import json
from collections import defaultdict
import time
import roman_numbers

import lxml.html
Expand All @@ -31,25 +33,28 @@

class BGBLScraper:
BASE_URL = 'http://www.bgbl.de/xaver/bgbl/'
TOC = ('ajax.xav?q=toclevel'
'&n=')
TEXT = ('text.xav?tf=xaver.component.Text_0'
'&hlf=xaver.component.Hitlist_0'
'&tocid=')

def __init__(self):
    """Create an HTTP session and prime it with the site's cookies."""
    # The xaver backend refuses the AJAX endpoints without the session
    # cookies handed out by the start page, so fetch it once up front.
    http = requests.session()
    http.get(self.BASE_URL + 'start.xav')
    self.session = http

def downloadUrl(self, file, query=None):
    """GET a JSON resource relative to BASE_URL and decode it.

    file  -- path component appended to BASE_URL (e.g. 'ajax.xav').
    query -- optional mapping of query-string parameters; the caller's
             mapping is never modified.

    Returns the decoded JSON response body.
    """
    # Work on a copy: the previous `query={}` mutable default would
    # accumulate keys across calls, and writing into the passed-in dict
    # leaked the cache-buster back to the caller.
    params = dict(query) if query else {}
    # Cache-buster in milliseconds, mirroring the site's own JavaScript.
    params['request.preventCache'] = int(time.time() * 1000)
    response = self.session.get(
        f'{self.BASE_URL}{file}?{urllib.parse.urlencode(params)}')
    return response.json()

def downloadToc(self, id = 0):
return self.downloadUrl(self.TOC + str(id))['items'][0]

def downloadText(self, id) -> lxml.html.HtmlElement:
response = self.downloadUrl(self.TEXT + str(id))
def downloadToc(self, toc_id=0):
    """Fetch one table-of-contents level; returns its first item."""
    params = {'q': 'toclevel', 'n': str(toc_id)}
    items = self.downloadUrl('ajax.xav', params)['items']
    return items[0]

def downloadText(self, toc_id, doc_id) -> lxml.html.HtmlElement:
    """Fetch the rendered text for a TOC entry and parse it as HTML."""
    return lxml.html.fromstring(
        self.downloadUrl('text.xav', {
            'tf': 'xaver.component.Text_0',
            'hlf': 'xaver.component.Hitlist_0',
            'tocid': str(toc_id),
            # XPath picking the document node to start rendering from.
            'start': f"//*[@node_id='{doc_id}']",
        })['innerhtml'])

def scrape(self, year_low=0, year_high=sys.maxsize):
Expand Down Expand Up @@ -96,19 +101,19 @@ def get_part_toc(self, part_id):
def get_year_toc(self, year_id):
    """Collect the TOCs of all issues (numbers) published in one year.

    year_id -- TOC node id of the year as returned by downloadToc().

    Returns a dict mapping issue number (int) to the entries produced
    by get_number_toc().
    """
    response = self.downloadToc(year_id)
    assert response['id'] == year_id
    # Removed stray debug `print(response)` that dumped the raw JSON.
    result = {}
    for item in response['c']:
        # Child labels look like "Nr. 12 vom 30.03.2021".
        match = re.match(r'Nr\. (\d+) vom (\d{2}\.\d{2}\.\d{4})', item['l'])
        if not match:
            continue
        number = int(match.group(1))
        date = match.group(2)
        print(f"Getting Number TOC {number} from {date}")
        # Both the toc id and the document id ('did') are needed to
        # render the issue's text page.
        result[number] = self.get_number_toc(item['id'], item['did'])
    return result

def get_number_toc(self, number_id):
root = self.downloadText(number_id)
def get_number_toc(self, number_id, number_did):
#response = self.downloadToc(number_id)
root = self.downloadText(number_id, number_did)
toc = []
for tr in root.cssselect('tr'):
td: lxml.html.HtmlElement = tr.cssselect('td')[1]
Expand Down

0 comments on commit fd0c108

Please sign in to comment.