webscrape_csv.py
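
"""Scrape SuperLega (Italian volleyball) match results from flashscore.ca
and save one CSV file per season with the date, teams, and final score of
each regular-season game."""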
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import tqdm
import time
import pathlib
from datetime import datetime


def filter_table(text):
    """Class-name filter for BeautifulSoup: keep cells whose class mentions
    the round, participants, time, or score."""
    ids = ['round', 'participant', 'time', 'score']
    if text is None:
        return False
    for elem in ids:
        if elem in text:
            return True
    return False


def line2dict(line, season):
    """Convert one match row into a dict with the date, teams, and scores."""
    data = dict()
    for elem in line.find_all(class_=filter_table):
        # The second entry in the class list identifies the cell type
        class_type = elem.attrs['class'][1]
        if 'time' in class_type:
            # The listed date has no year: games from January to June belong
            # to the second calendar year of the season
            date = elem.get_text(' ')
            date_value = datetime.strptime(date, '%b %d %I:%M %p')
            year = season + (date_value.month < 7)
            data['date'] = date_value.replace(year=year)
        elif 'participant--home' in class_type:
            data['home_team'] = elem.text
        elif 'participant--away' in class_type:
            data['away_team'] = elem.text
        elif 'score--home' in class_type:
            data['home_score'] = elem.text
        elif 'score--away' in class_type:
            data['away_score'] = elem.text
    return data
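
# A parsed row looks roughly like this (values are illustrative):
# {'date': datetime(2018, 3, 4, 18, 0), 'home_team': 'Modena',
#  'home_score': '3', 'away_team': 'Perugia', 'away_score': '1'}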


def open_page(season):
    """Open the season's results page in Chrome and return the page source
    once every round has been loaded."""
    url = f'https://www.flashscore.ca/volleyball/italy/superlega-{season}-{season+1}/results/'
    with Chrome() as driver:
        # Go to the website
        driver.get(url)
        # Accept the cookie banner before interacting with the page
        time.sleep(3)
        driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()
        time.sleep(3)
        try:
            # Keep clicking "show more matches" until the button disappears
            while True:
                driver.find_element(By.CLASS_NAME, 'event__more').click()
                time.sleep(3)
        except Exception:
            # The button raising means all games have been loaded
            pass
        return driver.page_source


def extract(path=None, season=2017):
    """Scrape one season, drop the playoff games, and save the regular-season
    results to a CSV file."""
    # Create the directory for the CSV files
    if path is None:
        path = '.'
    path = pathlib.Path(path)
    path.mkdir(parents=True, exist_ok=True)
    # Load the rendered page into BeautifulSoup
    page_source = open_page(season)
    soup = BeautifulSoup(page_source, features='html.parser')
    # Get the results table
    table = soup.find('div', class_='sportName')
    # Match rows have ids of the form g_XXXX, where XXXX is a hash string
    lines = table.find_all('div', id=lambda x: x and x.startswith('g_'))
    # Convert to a DataFrame, one row per match
    df = pd.DataFrame(columns=['date', 'home_team', 'home_score', 'away_team', 'away_score'])
    for i, line in enumerate(lines):
        data = line2dict(line, season)
        df.loc[i] = data
    # Find the header of the last regular round: results are listed newest
    # first, so the first 'Round' header with a number of 10 or more marks
    # the final regular round
    for i, line in enumerate(table.contents):
        if 'Round' in line.text:
            round_num = int(line.text.split(' ')[-1])
            if round_num < 10:
                continue
            print(line.text)  # log the cutoff round
            ind = i + 1
            break
    # The first match after that header is the latest regular-season game
    date = table.contents[ind].contents[1].get_text(' ')
    date_value = datetime.strptime(date, '%b %d %I:%M %p')
    year = season + (date_value.month < 7)
    last_date = date_value.replace(year=year)
    # Drop the playoff games and put the rest in chronological order
    df = df.drop(df[df.date > last_date].index)[::-1].reset_index(drop=True)
    # Save to CSV
    filename = f'SuperLega-{season}-{season+1}.csv'
    df.to_csv(path / filename, index=False)
    return df
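
# extract() can also be called for a single season, e.g. (illustrative):
#     extract(path='dataset_csv', season=2017)   # -> SuperLega-2017-2018.csv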


# Scrape seasons 2009-10 through 2018-19
path = 'dataset_csv'
for i in tqdm.trange(2009, 2019):
    df = extract(path=path, season=i)