crawler.py
import ast
import time

import requests
from bs4 import BeautifulSoup
base_url = 'http://www.bbc.co.uk'
search_url = 'http://www.bbc.co.uk/food/recipes/search'

query_params = {
    "page": "1",
    "keywords": "t",
    "x": "0",
    "y": "0",
    "sortBy": "lastModified",
}
# Session cookie captured from a browser visit to the search page.
cookie = 'BBC-UID=1557a39a2f77f15cadde8197d113d75c3c7da8fed31896233b6b54639ea804440Mozilla/5.0%20(Windows%20NT%2010.0%3b%20WOW64)%20AppleWebKit/537.36%20(KHTML%2c%20like%20Gecko)%20Chrome/50.0.2661.94%20Safari/5; ckns_policy=111; ckns_policy_exp=1495018131230; s1=220.181.573AF71D0006CB00B052C72985; _cb_ls=1; _chartbeat2=CLVHREBFMBlYBCcyNy.1463482132006.1463482146992.1; ecos.dt=1463482147068; _chartbeat5=879,294,%2Ffood%2Frecipes%2Fsearch,http%3A%2F%2Fwww.bbc.co.uk%2Ffood%2Frecipes%2Fsearch%3Fpage%3D2%26keywords%3D%26x%3D18%26y%3D9%26diets%255B0%255D%3Dvegetarian%26sortBy%3DlastModified,D44zN8B61JnUBQdBOoCzint3nBBUi,*%5B%40id%3D\'column%2D1\'%5D%2Fdiv%5B2%5D%2Fol%5B1%5D%2Fli%5B7%5D%2Fa%5B1%5D,c,kh7jnufzSOFCY8GMrJn4Dy6vUo'
# Headers copied from a real browser request so the server treats us
# like an ordinary visitor.
headers = {
    'Origin': 'http://www.bbc.co.uk',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': 'http://www.bbc.co.uk/food/recipes/search',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Cookie': cookie,  # send the captured session cookie with each request
}
# Crawls search result pages i..j-1 and appends a (name, link, page)
# tuple for every recipe found to links.txt.
def crawl(i, j):
    search_range = range(i, j)
    for page in search_range:
        query_params['page'] = str(page)
        # Retry the request until the article list is present in the response.
        while True:
            try:
                r = requests.post(search_url, data=query_params, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')
                recipes = soup.find(id="article-list").contents[0].contents
                break
            except AttributeError:
                print("Page " + str(page) + " didn't load correctly with status code: " + str(
                    r.status_code) + ", trying again")
        for recipe in recipes:
            a = recipe.find("a")
            name = a.text
            link = a.get('href')
            with open("links.txt", 'a') as f:
                f.write(str((name, link, page)) + "\n")
        print("Recipes on page " + str(page) + " saved")
        time.sleep(2)  # pause between pages to go easy on the server
# Loads the list of links from the text file and turns them back into
# Python tuples.
def load_links(file_location):
    with open(file_location, 'r') as f:
        raw_links = f.readlines()
    links = []
    for rl in raw_links:
        trimmed = rl[:-1]  # strip the trailing newline
        links.append(ast.literal_eval(trimmed))  # safer than eval() for literal tuples
    return links
# Retrieves the recipe page at the given URL and soupifies it.
def get_recipe(url):
    r = requests.get(base_url + url)
    while r.status_code != 200:
        print("Page " + url + " didn't load correctly with status code: " + str(r.status_code) + ", trying again")
        r = requests.get(base_url + url)
    return BeautifulSoup(r.text, 'html.parser')
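
# A minimal usage sketch (not part of the original script): crawl the first
# two result pages, reload the saved links, and fetch the first recipe page.
# Note that the "article-list" markup targeted above belonged to the BBC Food
# search pages as they existed when this was written, so the selectors may
# not match the live site today.
if __name__ == "__main__":
    crawl(1, 3)  # pages 1 and 2
    links = load_links("links.txt")
    if links:
        name, link, page = links[0]
        soup = get_recipe(link)
        print(name, "->", soup.title.string)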