scrape_symbols.py
"""Scrape data for all symbols in data/symbols/*.json
Called from the command line as:
python scrape_symbols.py <exchange> [start-symbol]
"""
import json
import sys
import argparse
import time

import requests

import getquote
from dbhandler import DbHandler
if __name__ == "__main__":
    # Start a timer for performance monitoring
    start_time = time.perf_counter()

    parser = argparse.ArgumentParser(description="Scrape symbols from the TSX or TSXV.")
    parser.add_argument(
        "exchange", choices=["TSX", "TSXV"], help="exchange from which to scrape: TSX or TSXV"
    )
    parser.add_argument(
        "-t",
        "--time",
        type=int,
        help="time in seconds to wait between network calls, to avoid getting blocked",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-s", "--start", help="symbol from which to start scraping")
    group.add_argument(
        "-r", "--range", nargs=2, help="first and last symbol of the range to scrape"
    )
    args = parser.parse_args()
    all_symbols = []
    exchange = args.exchange

    # Read the symbol list for the chosen exchange
    if exchange == "TSX":
        with open("data/symbols/TSX.json", "r") as infile:
            all_symbols.extend(json.load(infile))
    else:
        with open("data/symbols/TSXV.json", "r") as infile:
            all_symbols.extend(json.load(infile))

    # Delay between network calls; default to no delay unless -t/--time was given
    if args.time:
        waittime = args.time
    else:
        waittime = 0
    # If we specified a start or a range, trim the symbol list
    if args.start:
        all_symbols = [x for x in all_symbols if x >= args.start]
    elif args.range:
        all_symbols = [x for x in all_symbols if args.range[0] <= x <= args.range[1]]

    # Uncomment to test parallel process launch
    # from random import randint
    # from time import sleep
    # i = randint(0, 5)
    # while i > 0:
    #     print(i)
    #     i = i - 1
    print(
        f"Preparing to scrape symbols from {all_symbols[0]} to {all_symbols[-1]} "
        f"({len(all_symbols)} symbols) on {exchange}"
    )

    # Set up a shared HTTP session and a database connection for all quotes
    s = requests.Session()
    db_handler = DbHandler()
    conn = db_handler.create_connection()

    # Load the list of suspended symbols, passed along to each quote lookup
    with open("data/symbols/suspended.json", "r") as infile:
        suspended = json.load(infile)
    for symbol in all_symbols:
        try:
            getquote.get_quote(s, conn, symbol, suspended)
        except Exception as e:
            # Log the failure and carry on with the next symbol
            print(e)
            continue
        # Sleep between each call
        time.sleep(waittime)
    end_time = time.perf_counter()
    total_time = int(round(end_time - start_time, 0))
    print(
        f"Finished. Scraped from {all_symbols[0]} to {all_symbols[-1]} "
        f"({len(all_symbols)} symbols) on {exchange} in {total_time} s"
    )
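# A minimal sketch (not part of the script) of how several ranges could be
# scraped in parallel, one process per range, in the spirit of the
# "test parallel process launch" block above. The ranges and the 2-second
# delay are illustrative assumptions, not values from the real symbol lists.
# Uncomment and adapt to use:
# import subprocess
# ranges = [("A", "G"), ("H", "N"), ("O", "T"), ("U", "Z")]
# procs = [
#     subprocess.Popen(
#         ["python", "scrape_symbols.py", "TSX", "-t", "2", "-r", first, last]
#     )
#     for first, last in ranges
# ]
# for p in procs:
#     p.wait()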