-
Notifications
You must be signed in to change notification settings - Fork 1
/
google_flights_scraper.py
118 lines (75 loc) · 3.07 KB
/
google_flights_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from bs4 import BeautifulSoup
import requests
import json
def get_price(soup_element):
price = soup_element.find('div','BVAVmf').find('div','YMlIz').get_text()
return price
def get_time(soup_element):
spans = soup_element.find('div','Ir0Voe').find('div','zxVSec', recursive=False).find_all('span', 'eoY5cb')
time = ""
for span in spans:
time = time + span.get_text() + "; "
return time
def get_airline(soup_element):
airline = soup_element.find('div','Ir0Voe').find('div','sSHqwe')
spans = airline.find_all('span', attrs={"class": None}, recursive=False)
result = ""
for span in spans:
result = result + span.get_text() + "; "
return result
def save_results(results, filepath):
with open(filepath, 'w', encoding='utf-8') as file:
json.dump(results, file, ensure_ascii=False, indent=4)
return
def get_flights_html(url):
payload = {
'source': 'google',
'render': 'html',
'url': url,
}
# Get response.
response = requests.request(
'POST',
'https://realtime.oxylabs.io/v1/queries',
auth=('username', 'password'),
json=payload,
)
response_json = response.json()
html = response_json['results'][0]['content']
return html
def extract_flight_information_from_soup(soup_of_the_whole_page):
flight_listings = soup_of_the_whole_page.find_all('li','pIav2d')
flights = []
for listing in flight_listings:
if listing is not None:
price = get_price(listing)
time = get_time(listing)
airline = get_airline(listing)
flight = {
"airline": airline,
"time": time,
"price": price
}
flights.append(flight)
return flights
def extract_flights_data_from_urls(urls):
constructed_flight_results = []
for url in urls:
html = get_flights_html(url)
soup = BeautifulSoup(html,'html.parser')
flights = extract_flight_information_from_soup(soup)
constructed_flight_results.append({
'url': url,
'flight_data': flights
})
return constructed_flight_results
def main():
results_file = 'data.json'
urls = [
'https://www.google.com/travel/flights?tfs=CBsQAhooEgoyMDI0LTA3LTI4agwIAxIIL20vMDE1NnFyDAgCEggvbS8wNGpwbBooEgoyMDI0LTA4LTAxagwIAhIIL20vMDRqcGxyDAgDEggvbS8wMTU2cUABSAFSA0VVUnABemxDalJJTkRCNVRGbDBOMU5UVEdOQlJ6aG5lRUZDUnkwdExTMHRMUzB0TFMxM1pXc3lOMEZCUVVGQlIxZ3dhRWxSUVRoaWFtTkJFZ1pWTWpnMk1qSWFDZ2lRYnhBQ0dnTkZWVkk0SEhEN2VBPT2YAQGyARIYASABKgwIAxIIL20vMDRqcGw&hl=en-US&curr=EUR&sa=X&ved=0CAoQtY0DahgKEwiAz9bF5PaEAxUAAAAAHQAAAAAQngM',
'https://www.google.com/travel/flights/search?tfs=CBwQAhooEgoyMDI0LTA3LTI4agwIAxIIL20vMDE1NnFyDAgDEggvbS8wN19rcRooEgoyMDI0LTA4LTAxagwIAxIIL20vMDdfa3FyDAgDEggvbS8wMTU2cUABSAFwAYIBCwj___________8BmAEB&hl=en-US&curr=EUR'
]
constructed_flight_results = extract_flights_data_from_urls(urls)
save_results(constructed_flight_results, results_file)
if __name__ == "__main__":
main()