-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawling_data.py
77 lines (74 loc) · 2.99 KB
/
crawling_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
# Crawl the Newegg graphics-card listing pages, extract one record per
# product, and export all records to ``info_.csv``.
#
# Every field is looked up *inside* the product's own ``item-container``
# element. Indexing page-wide ``find_all`` results by position (the previous
# approach) silently shifts data between products as soon as one product
# lacks an element (no brand logo, no price, no shipping line, ...).
# NOTE(review): this assumes all per-product elements are descendants of the
# ``div.item-container`` node - confirm against the live markup.

# Listing URL; ``{}`` receives the 1-based page number.
_PAGE_URL = 'https://www.newegg.com/GPUs-Video-Graphics-Cards/SubCategory/ID-48/Page-{}?Tid=7709'

# Seconds to wait for a response before giving up on a page, so a stalled
# connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 30

# (output column, pattern) pairs applied to the text of the feature list.
# Each pattern has exactly one capture group; the first match is kept.
# NOTE(review): the ``.`` in the DisplayPort/HDMI version patterns is
# unescaped and therefore matches any character - kept as in the original.
_FEATURE_PATTERNS = (
    ('max_rslt', re.compile(r'(\d+ x \d+)')),
    ('dp', re.compile(r'DisplayPort: (\d+ x \S+ \d+.\d+\S)')),
    ('hdmi', re.compile(r'HDMI: (\d+ x \S+ \d+.\d+)')),
    ('dirx', re.compile(r'DirectX: (\S+ \d+)')),
    ('model', re.compile(r'Model #: (\S+)')),
)

# Labels of the following feature lines that end up glued to the model
# number, because ``\S+`` keeps matching until the next whitespace.
_MODEL_TRAILING_LABELS = ('Item', 'Return')


def _text(tag):
    """Return the text of *tag*, or '' when the element was not found."""
    return tag.text if tag is not None else ''


def _attr(tag, name):
    """Return attribute *name* of *tag*, or '' when tag/attribute is missing."""
    return tag.get(name, '') if tag is not None else ''


def _parse_price(price_tag):
    """Return the current price as a float, or '' when it cannot be parsed.

    The second and third text fragments of the price element are joined
    (presumably the whole and fractional parts - confirm against the markup)
    and thousands separators are removed before conversion.
    """
    if price_tag is None:
        return ''
    fragments = list(price_tag.stripped_strings)
    try:
        return float((fragments[1] + fragments[2]).replace(',', ''))
    except (IndexError, ValueError):
        return ''


def _parse_features(features_text):
    """Extract the spec columns from the text of a product's feature list.

    Returns a dict with one entry per pattern in ``_FEATURE_PATTERNS``;
    a column is '' when its pattern does not match.
    """
    specs = {}
    for column, pattern in _FEATURE_PATTERNS:
        match = pattern.search(features_text)
        specs[column] = match.group(1) if match else ''

    # Strip a label that was captured at the end of the model number. Only
    # the trailing occurrence is removed, so a model number that happens to
    # contain the same letters elsewhere is left intact.
    for label in _MODEL_TRAILING_LABELS:
        if specs['model'].endswith(label):
            specs['model'] = specs['model'][:-len(label)]
            break
    return specs


def _parse_item(container):
    """Build the output record (a dict) for one product container.

    Missing elements yield '' for the affected column instead of raising,
    so one incomplete product cannot abort the crawl.
    """
    brand_link = container.find('a', class_='item-brand')
    brand_img = brand_link.find('img') if brand_link is not None else None

    # Check for the rating link directly rather than inferring its presence
    # from the number of children of the branding block.
    branding = container.find('div', class_='item-branding')
    rating_link = (
        branding.find('a', class_='item-rating') if branding is not None else None
    )
    if rating_link is not None:
        rating_words = _attr(rating_link, 'title').split()
        # The last word of the link's title attribute is used as the rating.
        rating = rating_words[-1] if rating_words else ''
        # Branding text with the parentheses removed (presumably the number
        # of ratings - confirm against the markup).
        n_rating = branding.text.replace('(', '').replace(')', '')
    else:
        rating = ''
        n_rating = ''

    img_link = container.find('a', class_='item-img')
    img_tag = img_link.img if img_link is not None else None

    specs = _parse_features(_text(container.find('ul', class_='item-features')))

    return dict(
        item_id=_attr(container, 'id'),
        title=_text(container.find('a', class_='item-title')),
        brand=_attr(brand_img, 'title'),
        rating=rating,
        n_rating=n_rating,
        price=_parse_price(container.find('li', class_='price-current')),
        shipping=_text(container.find('li', class_='price-ship')),
        img_url=_attr(img_tag, 'src'),
        max_rslt=specs['max_rslt'],
        dp=specs['dp'],
        hdmi=specs['hdmi'],
        dirx=specs['dirx'],
        model=specs['model'],
    )


info = []
for n_page in range(1, 101):
    page = _PAGE_URL.format(n_page)
    try:
        result = requests.get(page, timeout=_REQUEST_TIMEOUT)
        result.raise_for_status()
    except requests.RequestException as exc:
        # Skip the failed page and keep what has been collected so far
        # instead of losing the whole crawl to one bad request.
        print('Skipping page {}: {}'.format(n_page, exc))
        continue

    soup = BeautifulSoup(result.text, 'html.parser')
    # The document is searched once per page; each product is then parsed
    # from its own subtree.
    for i, container in enumerate(soup.find_all('div', class_='item-container')):
        info_row = _parse_item(container)
        # Progress line: "page N - product i" (Vietnamese).
        print('Trang {} - san pham {}'.format(n_page, i))
        info.append(info_row)

print(len(info))
df = pd.DataFrame(info)
df.to_csv('info_.csv', index=False)