-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
150 lines (112 loc) · 4.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from requests_html import HTML,HTMLSession
import concurrent.futures
import csv
import re
import time
# Tutorial help on multithreading
# https://www.youtube.com/watch?v=IEEhzQoKtQU&ab_channel=CoreySchafer
def get_link_info(threadinfo):
    """Scrape a single thread page and append its details to a per-forum CSV.

    threadinfo: dict with keys "url", "views", "replies" (as built by
    scape_threads_for_links). Returns a status string on success, or None
    when the post is missing one of the expected fields
    (Location/Age/Price/Condition).
    """
    session = HTMLSession()
    url = threadinfo["url"]
    t = session.get(url)
    # Thread title looks like "[ForSale] Some GPU"; strip the [..] prefix.
    titleData = t.html.find(".p-title-value")[0].text
    titleMatch = re.match(r'\[\w+\] ?(.+)', titleData)
    # Fall back to the raw title when there is no [..] prefix, instead of
    # crashing on .group() of None.
    title = titleMatch.group(1).strip() if titleMatch else titleData.strip()
    # Container holding the post body (the Location/Age/Price/... lines).
    infoContainer = t.html.find(".bbWrapper")[0]
    # The first <time> element carries the original post's timestamps.
    datetimes = t.html.find("time")
    datetimePosted = datetimes[0].attrs["datetime"]
    unix_datetimePosted = datetimes[0].attrs["data-time"]
    if len(datetimes) > 1:
        # At least one reply exists: its timestamp lives on the second
        # moderation container. Looked up inside this branch so a
        # reply-less thread can't raise IndexError on the [1] index.
        firstReply = t.html.find(".js-inlineModContainer ")[1]
        firstResponse = firstReply.find(".u-dt")[0].attrs["datetime"]
        unix_firstResponse = firstReply.find(".u-dt")[0].attrs["data-time"]
    else:
        # No replies yet: treat the response time as the post time (delta 0).
        firstResponse = datetimePosted
        unix_firstResponse = unix_datetimePosted
        print("No second thread")
    # Seconds between the post and its first reply.
    deltaT_response = int(unix_firstResponse) - int(unix_datetimePosted)
    forumName = t.html.find(".p-breadcrumbs")[0].text.replace("\n", "-")
    # A closed thread shows an extra status banner element.
    lockedStatusData = t.html.find(".blockStatus-message")
    if len(lockedStatusData) < 2:
        lockedStatus = "Open"
    else:
        lockedStatus = "Closed"
    infoContainerText = infoContainer.text
    locationPtn = re.compile(r'Location: ?([^\n]+)')
    agePtn = re.compile(r'Age: ?([^\n]+)')
    pricePtn = re.compile(r'Price: ?([^\n]+)')
    conditionPtn = re.compile(r'Condition: ?([^\n]+)')
    infosOfInterest = [locationPtn, agePtn, pricePtn, conditionPtn]
    results = []
    for field in infosOfInterest:
        match = field.search(infoContainerText)
        if match is None:
            # Malformed post missing an expected field: dump the body for
            # debugging and skip this thread entirely.
            print(infoContainerText)
            return
        results.append(match.group(1))
    info = [title] + results + [threadinfo["views"], threadinfo["replies"],
                                lockedStatus, datetimePosted, firstResponse,
                                deltaT_response, url]
    print(info)
    write_to_CSV(info, forumName)
    return f"Done scraping {url}"
def write_to_CSV(info, fileName):
    """Append one row to ``{fileName}.csv`` using ';' as the delimiter.

    The file is opened with an explicit UTF-8 encoding so emoji and other
    non-ASCII characters in thread titles are written correctly instead of
    raising UnicodeEncodeError under a platform-default codec (which used
    to silently drop such rows).
    """
    try:
        with open(f'{fileName}.csv', mode='a', newline='', encoding='utf-8') as dataFile:
            datafileWriter = csv.writer(dataFile, delimiter=';')
            datafileWriter.writerow(info)
    except UnicodeEncodeError:
        # Last-resort guard; should no longer trigger now that the file is UTF-8.
        print("UnicodeEncodeError due to emojies. Entry ignored")
def scape_threads_for_links(url):
    """Scrape one forum listing page and return a list of thread dicts.

    Each dict has keys "url", "views", "replies". Threads prefixed
    [Reseller], [Wanted] or [Feeler] are skipped — only actual for-sale
    listings are of interest.

    NOTE: builds absolute thread URLs from the module-level ``urlGlobal``.
    """
    session = HTMLSession()
    r = session.get(url)
    # All thread entries on this listing page.
    entriesContainer = r.html.find(".structItemContainer")
    entries = entriesContainer[0].find(".structItem ")
    data = []
    for entry in entries:
        entryContainer = entry.find(".structItem-title")[0]
        # Meta cell holds "Replies\n<n>" / "Views\n<n>" label/value pairs.
        metaData = entry.find(".structItem-cell--meta")[0].find(".pairs")
        a = {
            "replies": metaData[0].text.replace("Replies\n", ""),
            "views": metaData[1].text.replace("Views\n", ""),
        }
        # First anchor is the sale-type tag, second is the thread link.
        typeOfSale = entryContainer.find("a")[0].text
        if typeOfSale in ("[Reseller]", "[Wanted]", "[Feeler]"):
            continue
        a["url"] = urlGlobal + str(entryContainer.find("a")[1].attrs["href"])
        data.append(a)
    return data
urlGlobal = "https://carbonite.co.za"
urlThreads = "?forums/nvidia_gpu/"
params = "&prefix_id=1&order=thread_fields_price&direction=desc"


def main():
    """Walk every listing page of the forum and scrape each thread concurrently."""
    session = HTMLSession()
    r = session.get(urlGlobal + urlThreads + params)
    # The last page-nav entry holds the total page count.
    numberOfPages = int(r.html.find(".pageNav-page ")[-1].text)
    print(f"Scraping {numberOfPages} pages")
    for page in range(1, numberOfPages + 1):
        url = f"{urlGlobal + urlThreads}page-{page}{params}"
        print("Scraping: ", url)
        # Collect the links to every thread on this listing page.
        threads = scape_threads_for_links(url)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = []
            for thread in threads:
                print("Scraping thread: ", thread["url"])
                # Fan out one worker per thread page.
                results.append(executor.submit(get_link_info, thread))
                # Throttle submissions 100 ms apart — don't punish the server
                # with all the requests at once (risks an IP ban).
                time.sleep(0.1)
            for f in concurrent.futures.as_completed(results):
                print(f.result())


# Guarded so importing this module doesn't kick off a full scrape.
if __name__ == "__main__":
    main()