forked from Leelst/Recipe_Recommendation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingredient_crawling.py
53 lines (50 loc) · 1.42 KB
/
ingredient_crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from bs4 import BeautifulSoup
import pandas as pd
import requests
from tqdm import tqdm
import time
# Recipe metadata export (CP949 encoding — Korean Windows CSV).
path = 'TB_RECIPE_SEARCH-220701.csv'
df = pd.read_csv(path, encoding='cp949')
# Drop rows with null values, then keep only the recipe serial numbers.
df = df.dropna()
rcp_ids = df['RCP_SNO'].values
def crawling(id_num):
    """Fetch and parse the ingredient list for one 10000recipe.com recipe.

    Parameters
    ----------
    id_num : int or str
        Recipe serial number (RCP_SNO) interpolated into the recipe URL.

    Returns
    -------
    list[list[str]] or None
        One inner list of cleaned ingredient strings per <li> element of the
        'ready_ingre3' section, or None when the request fails, the response
        is non-200, or the page has no ingredient section.
    """
    url = f'https://www.10000recipe.com/recipe/{id_num}'
    try:
        # Timeout so one stalled connection cannot hang the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        # Keep the original diagnostic print, but return None explicitly
        # (the original fell off the end and returned None implicitly).
        print(response.status_code)
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    res = soup.find('div', class_='ready_ingre3')
    if res is None:
        # Page exists but has no ingredient section (e.g. removed recipe).
        # The original relied on a bare except around soup.find, which
        # never raises — the real failure mode is find() returning None.
        return None
    try:
        ingredient_list = []
        for ul in res.find_all('ul'):
            for li in ul.find_all('li'):
                # Each <li> mixes the ingredient text with blank lines and a
                # trailing '구매' ("buy") link label; strip and drop both.
                words = [w.strip() for w in li.get_text().split('\n')]
                ingredient_list.append([w for w in words if w and w != '구매'])
        return ingredient_list
    except AttributeError:
        # Unexpected markup shape; treat as a failed parse, not a crash.
        return None
# Crawl every recipe; a failed fetch is recorded as an empty list so the
# result stays aligned one-to-one with rcp_ids for the DataFrame below.
ingredient_list = []
for rcp_id in tqdm(rcp_ids):
    lst = crawling(rcp_id)
    # Use `is None` (identity), not `== None` — PEP 8; crawling() signals
    # failure with None, and an empty list is a valid (empty) result.
    ingredient_list.append([] if lst is None else lst)
# Pair each recipe id with its crawled ingredient list in a DataFrame.
data = pd.DataFrame({'id': rcp_ids, 'ingredient': ingredient_list})
#data.to_parquet('ingredient.parquet', index=False)
print(data)