Crawl_Cafef.py

import requests
from bs4 import BeautifulSoup
import pandas as pd


class StockScraper:
    def __init__(self):
        self.base_url = 'https://cafef.vn'

    def get_top_stock(self):
        # Fetch the list of top stocks from CafeF's JSON endpoint.
        try:
            res = requests.get(f'{self.base_url}/get-top-stock.chn')
            data = res.json()['stocks']
        except Exception:
            return "Unable to fetch data"
        return data
    def get_info_stock(self, stock_name):
        # Fetch detailed information for a single stock symbol.
        try:
            res = requests.get(f'{self.base_url}/info.ashx?type=cp&symbol={stock_name}')
            data = res.json()
        except Exception:
            return "Unable to fetch data, or the requested stock symbol is invalid"
        return data
    def get_list_post(self, page_num):
        # Fetch one page of the news timeline and return the list of article items.
        try:
            res = requests.get(f'{self.base_url}/timelinelist/18831/{page_num}.chn')
            soup = BeautifulSoup(res.content, 'html.parser')
            data = soup.find_all(class_='tlitem box-category-item')
        except Exception as e:
            return f"Error encountered: {e}"
        return data
    def get_info_post(self, data):
        # Extract timestamp, description, link, and title from each article item.
        times = []
        desc = []
        links = []
        titles = []
        for item in data:
            times.append(item.find(class_='time time-ago').text)
            desc.append(item.find(class_='sapo box-category-sapo').text)
            links.append(self.base_url + item.find('a', href=True)['href'])
            titles.append(item.find('a', href=True).text)
        return times, desc, links, titles
    def get_content_post(self, link_post):
        # Download a single article and join its paragraphs into one string.
        content = ''
        try:
            res = requests.get(link_post)
            soup = BeautifulSoup(res.content, 'html.parser')
            body = soup.find(class_='detail-content afcbc-body')
            for p in body.find_all('p'):
                content += p.text + '\n'
        except Exception as e:
            return f"Error encountered: {e}"
        return content
    def scrape_and_save_to_csv(self, num_pages, output_file):
        # Crawl num_pages timeline pages, fetch every article, and write the results to CSV.
        df = pd.DataFrame()
        for i in range(1, num_pages + 1):
            content = []
            times, desc, links, titles = self.get_info_post(self.get_list_post(i))
            for link in links:
                content.append(self.get_content_post(link))
            temp_df = pd.DataFrame({'Title': titles, 'Desc': desc, 'Content': content, 'Link': links, 'Time': times})
            df = pd.concat([df, temp_df])
        df.to_csv(output_file, index=False)
        print("The data has been saved to the CSV file.")


# Use the scrape_and_save_to_csv method to collect the data and save it to a CSV file
scraper = StockScraper()
scraper.scrape_and_save_to_csv(15, 'stock_data.csv')
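
# A minimal usage sketch for the remaining methods (assumptions: 'VNM' is only an
# illustrative ticker symbol, and the structure of the returned JSON is whatever the
# CafeF endpoints provide; both calls return an error string on failure).
top_stocks = scraper.get_top_stock()
vnm_info = scraper.get_info_stock('VNM')
print(top_stocks)
print(vnm_info)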