"""
This module scrapes the first page of the linked mtgtop8 or mtggoldfish page
and pulls all of the decklists into temporary files.
Subsequent pages are not yet scraped because their pagination is rendered
with JavaScript.
"""
import pickle
import re
from os.path import exists
from tempfile import NamedTemporaryFile
from time import sleep

from bs4 import BeautifulSoup as BS
from requests import get, ConnectionError

from logger import log
# Extract the deck id from mtgtop8 URLs ("...&d=<id>...") and the numeric
# deck id from mtggoldfish deck URLs ("/deck/<id>#paper").
mtgtop8compile = re.compile(r"d=(\d+)")
mtggoldfishcompile = re.compile(r"/(\d+)")
# Some sites reject requests that lack a browser-like User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
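
# For illustration only (hypothetical example URLs showing what each
# pattern extracts):
#   mtgtop8compile.search("event?e=111&d=22222&f=MO").group()  -> "d=22222"
#   mtggoldfishcompile.search("/deck/333333#paper").group()    -> "/333333"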
# Map of download link -> path of the temporary file it was saved to.
if exists("./data/cached_decklists.pkl"):
    with open("./data/cached_decklists.pkl", "rb") as fp:
        cached_files = pickle.load(fp)
else:
    cached_files = {}
def load_page(url):
    log("\t\tLoading {}".format(url))
    resp = get(url, headers=headers)
    if resp.status_code != 200:
        log("\t\tError connecting to website, response code {}".format(resp.status_code), 'error')
        return
    bs = BS(resp.text, "html.parser")
    if "mtgtop8" in url:
        mtgtop8 = True
        tables = bs.find_all("table")
        # The fifth table always holds the decklists, unless the site
        # changes its layout.
        decklist_tables = tables[4]
        decks = decklist_tables.find_all("tr", class_="hover_tr")
        urls = ["http://mtgtop8.com/{}".format(x.find("a")['href']) for x in decks]
    else:
        mtgtop8 = False
        decks = bs.find_all("a", href=lambda href: href and "/deck" in href and "#paper" in href)
        urls = ["https://mtggoldfish.com{}".format(x['href']) for x in decks]
    if len(urls) == 0:
        raise Exception("Issue grabbing lists.")
    log("\t\t{} decks to download and process...".format(len(urls)))
    decklists = []
    for cur in urls:
        try:
            decklists.append(parse_deck_page(cur, mtgtop8))
        except Exception as e:
            log("\t\tError connecting to website: {}".format(e), 'error')
    log("\t\tFinished processing decks.")
    # Persist the cache so repeated runs can skip already-downloaded decks.
    with open("./data/cached_decklists.pkl", "wb") as fp:
        pickle.dump(cached_files, fp)
    return decklists
def parse_deck_page(url, mtgtop8=True):
    if mtgtop8:
        link = "http://mtgtop8.com/mtgo?{}".format(mtgtop8compile.search(url).group())
    else:
        link = "https://www.mtggoldfish.com/deck/download{}".format(mtggoldfishcompile.search(url).group())
    # Reuse a previously downloaded decklist if its temp file still exists.
    if link in cached_files:
        if exists(cached_files[link]):
            return cached_files[link]
        else:
            del cached_files[link]
    try:
        resp = get(link)
    except Exception:
        raise ConnectionError("Could not connect to website")
    if resp.status_code != 200:
        raise ConnectionError("Error code {}".format(resp.status_code))
    if resp.text == "Throttled":
        # The site rate-limits downloads; back off and retry.
        print("Breaking for 30 seconds while downloading list.")
        sleep(30)
        return parse_deck_page(url, mtgtop8)
    # Open the temp file in text mode (the default is binary) so we can
    # write resp.text directly.
    with NamedTemporaryFile(mode="w", delete=False, dir="./data/temp") as fp:
        fp.write(resp.text)
        name = fp.name
    cached_files[link] = name
    return name
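
# Minimal usage sketch: the URL below is a hypothetical placeholder, and
# ./data/temp is assumed to already exist.
if __name__ == "__main__":
    example_url = "http://mtgtop8.com/format?f=MO"  # hypothetical page URL
    for decklist_path in load_page(example_url) or []:
        log("\tDecklist saved to {}".format(decklist_path))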