#Andrew Sager
#7/27/2017
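#
# Scrapes Greenbook's advanced-search results for focus group facilities,
# visits each listed company's website to extract a ZIP code, looks up
# city/state/population/density for that ZIP (via uszipcode), and writes
# one row per facility to an Excel workbook.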
import re
import csv
import requests
from openpyxl import Workbook
from uszipcode import ZipcodeSearchEngine


def get_total_pages(url):
    """Return the number of result pages reported by the search."""
    page = requests.get(url.format(1))
    page_source = page.text  # get the source code
    # the pager ends with a "Last(N)" link; pull N out of the markup
    num = int(page_source.split(">Last(")[1].split(")")[0])
    return num


def get_zip(url):
    """Return the first 5-digit ZIP code found on a company's page, or "" if none is found."""
    page = requests.get("http://" + url)  # prepend http:// so we can load the page
    page_source = page.text
    pieces = page_source.split("\n")
    # a line contains a ZIP if it contains both "postalCode" and "span":
    pieces = [k for k in pieces if ("postalCode" in k and "span" in k)]
    for item in pieces:
        item = re.sub(r"\D", "", item)  # strip all non-digit characters
        if len(item) < 5:  # not a proper ZIP code
            continue
        return item[:5]  # truncates any ZIP+4 to a plain ZIP
    return ""  # didn't find any ZIP codes on the given page (unlikely)
pages = []  # company website URLs collected from every results page
search_url = 'https://www.greenbook.org/advancedsearchresult?page={0}&rd=V&focusgroupid=1&countryId=3'
num_pages = get_total_pages(search_url)
for i in range(num_pages):
    page = requests.get(search_url.format(i + 1))  # iterate through all pages of focus group results
    page_source = page.text  # get the source code
    pieces = page_source.split("\n")  # break into lines
    pieces = [k for k in pieces if '/company/' in k]  # keep lines that contain a company url
    # continue breaking strings into pieces based on the source code:
    s = "onclick=\"trackOutboundLink('//"
    pieces = [k.split(s) for k in pieces if s in k]
    pieces = [item for sublist in pieces for item in sublist]  # flatten the nested lists
    pieces = list(set(k.split("\\r")[0] for k in pieces if ".org" in k))  # trim trailing markup, drop duplicates
    pieces = [k.split("\'")[0] for k in pieces]  # keep only the text before the closing quote
    pages.extend(pieces)  # add all of the urls we found on this page
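
# Look up demographic data for each ZIP and write one row per facility.
# Note: a facility whose page yielded no ZIP code passes an empty string to by_zipcode here.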
search = ZipcodeSearchEngine()
wb = Workbook()
dest_filename = "All Focus Group Centers.xlsx"
new_worksheet = wb.active
new_worksheet.title = "Sheet 1"
new_worksheet.cell(row=1, column=1).value = "Zipcode"
new_worksheet.cell(row=1, column=2).value = "City"
new_worksheet.cell(row=1, column=3).value = "State"
new_worksheet.cell(row=1, column=4).value = "Population"
new_worksheet.cell(row=1, column=5).value = "Density"
new_worksheet.cell(row=1, column=6).value = "URL"
i = 2  # we've already made headers so start at row two
for item in pages:
    zipcode = get_zip(item)
    zipcode_data = search.by_zipcode(zipcode)
    new_worksheet.cell(row=i, column=1).value = zipcode
    new_worksheet.cell(row=i, column=2).value = zipcode_data["City"]
    new_worksheet.cell(row=i, column=3).value = zipcode_data["State"]
    new_worksheet.cell(row=i, column=4).value = zipcode_data["Population"]
    new_worksheet.cell(row=i, column=5).value = zipcode_data["Density"]
    new_worksheet.cell(row=i, column=6).value = item
    i += 1  # increment row to which we add data
wb.save(filename=dest_filename)
print("Compilation of Greenbook data is complete")