scrape all 251 pages & ignore encoding errors
MakersMark3333 committed Apr 15, 2022
1 parent b9b55af commit 130ae75
Showing 3 changed files with 3,273 additions and 67 deletions.
CrawlObject.py: 9 changes (5 additions, 4 deletions)
@@ -41,27 +41,28 @@ def __scraper_content(self):
     def __df(self, total_page):
         rows = []
         while self.__curr_page <= total_page:
-            xhtml = self.__scraper_content().decode('utf-8')
+            xhtml = self.__scraper_content().decode('utf-8', errors='ignore')
             p = HTMLTableParser()
             p.feed(xhtml)
             title = ['编号', '时间', '程度', '分类', '摘要', '地址', '详情']  # ID, time, severity, category, summary, address, details
             for i in range(1, len(p.tables[0])):
                 rows.append(p.tables[0][i])
             print(str(self.__curr_page) + " pages scraped!")
             self.__curr_page += 1
         df = pd.DataFrame(columns=title, data=rows)
         return df

     # Entry point; callers need to provide total_page
     def get(self, total_page):
         df = self.__df(total_page)
-        df.to_csv('shanghai_test.csv', encoding='gbk')
+        df.to_csv('shanghai.csv', encoding='gbk', errors='ignore')

     def pages_scraped(self):
         return self.__curr_page - 1

 if __name__ == "__main__":
     test = Shanghai_Help_Scraper()
-    test.get(5)
-    print("Congratulations, you scraped " + str(test.pages_scraped()) + "pages!!")
+    test.get(251)
+    print("Congratulations, you scraped " + str(test.pages_scraped()) + " pages!!")


