-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
234 lines (197 loc) · 9.39 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# -*- coding: utf-8 -*-
import scrapy
import json
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
# Arms-transfer table columns: Category (I-VII) | Final importer state |
# Number of items | State of origin (if not exporter) |
# Intermediate location(s) (if any) | Description of items | Comments on the transfer
class ArmsTransferItem(scrapy.Item):
    """One row of an export/import arms-transfer table on a UNROCA report page."""
    table = scrapy.Field()  # source table id, e.g. 'major_export' / 'small_arms_import'
    year = scrapy.Field()  # reporting year of the report the row came from
    exporter = scrapy.Field()  # exporting state
    importer = scrapy.Field()  # importing state
    category = scrapy.Field()  # weapons category (I-VII), from the row's <th>
    num_items = scrapy.Field()
    state_of_origin = scrapy.Field()  # Only has a value if state_of_origin != exporter
    intermediate_locations = scrapy.Field()  # Only has a value if filled
    description = scrapy.Field()
    comments = scrapy.Field()
# Holdings table columns: Category (I-VII) | Number of items |
# Description of items | Comments on the transfer
class HoldingsItem(scrapy.Item):
    """One row of a military-holdings or national-production table."""
    table = scrapy.Field()  # source table id: 'military_holdings' or 'national_production'
    year = scrapy.Field()  # reporting year
    country = scrapy.Field()  # the reporting state itself
    category = scrapy.Field()  # weapons category (I-VII), from the row's <th>
    num_items = scrapy.Field()
    description = scrapy.Field()
    comments = scrapy.Field()
class YearsItem(scrapy.Item):
    """The list of report years available for one country (written to years.json)."""
    years = scrapy.Field()  # list of report records as returned by the UNROCA API
    country = scrapy.Field()  # country slug
class CountryItem(scrapy.Item):
    """A single country slug (written to countries.json)."""
    country = scrapy.Field()  # countryname_slug from the UNROCA country-list API
class CountriesSpider(scrapy.Spider):
    """Fetch the slug of every reporting country from the UNROCA country-list API.

    Output is written to countries.json, which the other spiders read.
    """
    name = "countries"
    allowed_domains = ['unroca.org']
    custom_settings = {
        "FEEDS": {
            "countries.json": {"format": "json", "overwrite": True},
        },
    }

    def start_requests(self):
        # Single entry point: the country-list API endpoint.
        yield scrapy.Request("https://www.unroca.org/api/country-list", self.parse)

    def parse(self, response):
        """Emit one CountryItem per country slug in the JSON payload."""
        for entry in response.json():
            yield CountryItem(country=entry["countryname_slug"])
# Scrape the available report years for each country listed in countries.json.
class YearsSpider(scrapy.Spider):
    """For each country slug, query the UNROCA API for its available reports.

    Output is written to years.json, which UnrocaSpider reads.
    """
    name = "years"
    allowed_domains = ['unroca.org']
    custom_settings = {
        "FEEDS": {
            "years.json": {"format": "json", "overwrite": True},
        },
    }

    @staticmethod
    def get_country_names():
        """Yield country slugs recorded in countries.json (output of CountriesSpider)."""
        with open('countries.json') as f:
            for entry in json.load(f):
                yield entry["country"]

    def start_requests(self):
        # Deduplicate slugs before issuing one API request per country.
        for slug in set(YearsSpider.get_country_names()):
            yield scrapy.Request(f"https://www.unroca.org/api/{slug}", self.parse)

    def parse(self, response):
        """Emit one YearsItem holding the country's list of available reports."""
        payload = response.json()
        item = YearsItem()
        item["country"] = payload["country"]["countryname_slug"]
        item["years"] = payload.get("available_reports", [])
        yield item
# Export & import tables (major weapons, small arms, light weapons).
def parse_arms_transfer_row(rows, table_id, row_year, row_state, is_export=True):
    """Yield one ArmsTransferItem per table row.

    Each row has a <th> category cell followed by six <td> cells. When
    is_export is True the reporting state is the exporter and the first <td>
    names the importer; when False the roles are swapped.
    """
    for tr in rows:
        soup = BeautifulSoup(tr.extract(), 'html.parser')
        cells = soup.find_all('td')
        counterparty = cells[0].text.strip()  # the other party to the transfer
        yield ArmsTransferItem(
            table=table_id,
            year=row_year,
            exporter=row_state if is_export else counterparty,
            importer=counterparty if is_export else row_state,
            category=soup.find('th').text.strip(),
            num_items=cells[1].text.strip(),
            state_of_origin=cells[2].text.strip(),
            intermediate_locations=cells[3].text.strip(),
            description=cells[4].text.strip(),
            comments=cells[5].text.strip(),
        )
# Military-holdings & national-production tables.
def parse_holdings_row(rows, table_id, row_year, row_state):
    """Yield one HoldingsItem per table row.

    Each row has a <th> category cell followed by three <td> cells
    (number of items, description, comments).
    """
    for tr in rows:
        soup = BeautifulSoup(tr.extract(), 'html.parser')
        cells = soup.find_all('td')
        yield HoldingsItem(
            table=table_id,
            year=row_year,
            country=row_state,
            category=soup.find('th').text.strip(),
            num_items=cells[0].text.strip(),
            description=cells[1].text.strip(),
            comments=cells[2].text.strip(),
        )
class UnrocaSpider(scrapy.Spider):
    """Scrape every country's UNROCA report pages into unroca.json.

    Requires countries.json (from CountriesSpider) and years.json
    (from YearsSpider) to already exist on disk.
    """
    name = 'unroca'
    allowed_domains = ['unroca.org']
    custom_settings = {
        "FEEDS": {
            "unroca.json": {
                "format": "json",
                "overwrite": True
            },
        },
    }

    @staticmethod
    def get_country_names():
        """Yield country slugs from countries.json."""
        with open('countries.json') as f:
            country_list = json.load(f)
        for item in country_list:
            yield item["country"]

    @staticmethod
    def get_country_years():
        """Yield {country, years} records from years.json."""
        with open('years.json') as f:
            country_list = json.load(f)
        for item in country_list:
            yield item

    def start_requests(self):
        """Issue one request per (country, report year) pair."""
        country_names = set(UnrocaSpider.get_country_names())
        country_years = {rec["country"]: rec["years"] for rec in UnrocaSpider.get_country_years()}
        for country in country_names:
            # FIX: use .get() — a country present in countries.json may have
            # no entry in years.json (never filed a report); previously this
            # raised KeyError and aborted start_requests entirely.
            for year in country_years.get(country, []):
                report_year = year["year"]
                yield scrapy.Request(
                    f"https://www.unroca.org/{country}/report/{report_year}/",
                    self.parse,
                )

    def parse(self, response):
        """Extract all transfer/holdings tables from one report page."""
        if response.status != 200:
            return
        # Due to random UN redirects, ensure that we are looking at a state's
        # original report page before scraping it.
        doc_h4 = response.selector.xpath('//h4[contains(@class, "unroca")]')
        doc_h4_text = doc_h4.xpath('./text()')
        if not (len(doc_h4_text) >= 1 and doc_h4_text[0].extract() == 'UNROCA original report'):
            return
        # The sibling element's text is "<State name ...> <year>": the last
        # token is the year, the rest is the state name.
        report_details = doc_h4.xpath('following-sibling::*/text()')[0].extract().split()
        reporting_year = report_details[-1]
        reporting_state = ' '.join(report_details[:-1])
        div_panels = response.selector.xpath('//div[contains(@class, "panel-body")]')

        def trows(idx):
            # Table rows inside panel number idx.
            return div_panels[idx].xpath('./table/tbody/tr')

        # Panels 1-5 are always present (panel 5, policies, is ignored).
        # Panels 6-9 (small arms / light weapons) are optional before 2006.
        yield from parse_arms_transfer_row(trows(1), 'major_export', is_export=True, row_year=reporting_year, row_state=reporting_state)
        yield from parse_arms_transfer_row(trows(2), 'major_import', is_export=False, row_year=reporting_year, row_state=reporting_state)
        yield from parse_holdings_row(trows(3), 'military_holdings', row_year=reporting_year, row_state=reporting_state)
        yield from parse_holdings_row(trows(4), 'national_production', row_year=reporting_year, row_state=reporting_state)
        if len(div_panels) > 9:
            # Panel layout: 6 = small-arms exports, 7 = light-weapons exports,
            # 8 = small-arms imports, 9 = light-weapons imports.
            yield from parse_arms_transfer_row(trows(6), 'small_arms_export', is_export=True, row_year=reporting_year, row_state=reporting_state)
            yield from parse_arms_transfer_row(trows(8), 'small_arms_import', is_export=False, row_year=reporting_year, row_state=reporting_state)
            yield from parse_arms_transfer_row(trows(7), 'light_weapons_export', is_export=True, row_year=reporting_year, row_state=reporting_state)
            yield from parse_arms_transfer_row(trows(9), 'light_weapons_import', is_export=False, row_year=reporting_year, row_state=reporting_state)
# Twisted Deferred "magic" to run the spiders sequentially on one reactor.
@defer.inlineCallbacks
def crawl():
    """Run the spiders in order, then stop the reactor.

    NOTE(review): depends on the module-level `runner` created in the
    __main__ block below.
    """
    # Uncomment if you want to refetch the country list and available years
    # before scraping the reports:
    # yield runner.crawl(CountriesSpider)
    # yield runner.crawl(YearsSpider)
    yield runner.crawl(UnrocaSpider)
    reactor.stop()
# Pipeline order: first scrape the countries, then the available years,
# then run the report scraper (the first two steps are opt-in inside crawl()).
if __name__ == "__main__":
    configure_logging({"LOG_FORMAT": "%(levelname)s: %(message)s"})
    runner = CrawlerRunner()  # shared by crawl() above
    crawl()
    reactor.run()  # the script will block here until the last crawl call is finished