-
Notifications
You must be signed in to change notification settings - Fork 0
/
ow_scraped_dfs.py
146 lines (85 loc) · 3.23 KB
/
ow_scraped_dfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
from bs4 import BeautifulSoup
# In[2]:
# Read the saved page into memory, then parse it once; the later cells all
# query the resulting `soup` tree.
with open('blizzard_data.html', encoding="utf-8") as html_file:
    page_markup = html_file.read()
soup = BeautifulSoup(page_markup, 'lxml')
# <h2>Takes the "table_cols" element found in the main loop and returns the list of column headers for the dataframe</h2>
# In[3]:
def get_col_heads(table_cols):
    """Return the column header texts found inside a table header element.

    Args:
        table_cols: Element exposing a ``.strings`` iterable of its text
            fragments (the main loop passes a table's ``<thead>`` tag).

    Returns:
        A list with one plain ``str`` per non-blank text fragment, in
        iteration order, with surrounding whitespace removed.
    """
    # Pretty-printed markup puts whitespace-only text nodes between header
    # cells; keeping them would create bogus column names and make the header
    # count disagree with the row width. ``str.strip`` also returns plain
    # ``str`` objects, so the result does not hold on to the parse tree the
    # way BeautifulSoup's NavigableString objects do.
    stripped = (text.strip() for text in table_cols.strings)
    return [text for text in stripped if text]
# <h2>Takes the "rows" found in the main loop and returns a list of rows, each a list of cell values, for the dataframe</h2>
# In[4]:
def get_row_data(rows):
    """Return the cell values of each table row as a list of lists.

    Args:
        rows: Iterable of row elements; each must support ``find_all('td')``
            and yield cells exposing ``.string`` (the main loop passes the
            ``<tr>`` tags of one table).

    Returns:
        One inner list per row, in order. Each entry is the cell's ``.string``
        converted to a plain ``str``, or ``None`` when ``.string`` is ``None``.
    """
    row_data = []
    for row in rows:
        line = []
        # find_all is the supported spelling; findChildren is only a legacy
        # alias kept for BeautifulSoup 3 compatibility.
        for cell in row.find_all('td'):
            text = cell.string
            # Plain str keeps the rows independent of the parse tree; None is
            # preserved so missing values stay distinguishable downstream.
            line.append(text if text is None else str(text))
        row_data.append(line)
    return row_data
# <h2>All variables used in web scraping from HTML file</h2>
# In[5]:
# Every <h2> heading in the parsed page; the main loop matches these by text.
headers = soup.find_all('h2')

# Column names and row values for each dataframe. They start empty and are
# overwritten by the main loop when the matching heading is found.
pms_cols, pms_row_data = [], []    # Player Map Stat
pahs_cols, pahs_row_data = [], []  # Player All Hero Stat
hs_cols, hs_row_data = [], []      # Hero Stat
# <h2>"Main" Method</h2>
# In[6]:
def _scrape_table(head):
    """Return ``(column_names, row_values)`` for the table following *head*.

    Args:
        head: Heading element whose next sibling ``<table>`` holds the data.

    Returns:
        A 2-tuple of the list from ``get_col_heads`` and the list of rows
        from ``get_row_data``.

    Raises:
        ValueError: If no ``<table>`` follows the heading, or the table has
            no ``<thead>``.
    """
    title = head.get_text().strip()
    table = head.find_next_sibling('table')
    if table is None:
        raise ValueError("No <table> follows the {!r} heading".format(title))
    cols = table.find('thead')
    if cols is None:
        raise ValueError("The {!r} table has no <thead>".format(title))

    rows = cols.find_next_siblings('tr')
    if not rows:
        # Rows wrapped in <tbody> are not siblings of <thead>; look one level
        # down so such tables are not silently read as empty.
        for body in cols.find_next_siblings('tbody'):
            rows.extend(body.find_all('tr', recursive=False))
    return get_col_heads(cols), get_row_data(rows)


# Loop through all headings and scrape the table that follows each one we
# care about. Other headings are ignored.
for head in headers:
    # Strip so stray whitespace around the heading text does not hide a match.
    title = head.get_text().strip()
    if title == "Player Map Stat":
        pms_cols, pms_row_data = _scrape_table(head)
    elif title == "Player All Hero Stat":
        pahs_cols, pahs_row_data = _scrape_table(head)
    elif title == "Hero Stat":
        hs_cols, hs_row_data = _scrape_table(head)
# <h2>Load <b>Player Map Stat</b> data into a dataframe</h2>
# In[7]:
# Player Map Stat dataframe; the bare expressions below are notebook previews.
pms_df = pd.DataFrame(data=pms_row_data, columns=pms_cols)
pms_df.head(10)
# <h2>Load <b>Player All Hero Stat</b> data into a dataframe</h2>
# In[8]:
pahs_df = pd.DataFrame(data=pahs_row_data, columns=pahs_cols)
pahs_df.head()
# <h2>Load <b>Hero Stat</b> data into a dataframe</h2>
# In[9]:
hs_df = pd.DataFrame(data=hs_row_data, columns=hs_cols)
hs_df.head()
# In[10]:
# Inspect the column dtypes before the numeric conversion.
hs_df.dtypes
# <h2>Convert the "Amount" column in each table to a numeric dtype</h2>
# In[11]:
# Convert the scraped 'Amount' column of each dataframe to a numeric dtype,
# in the same order as before: map stats, all-hero stats, hero stats.
for stat_df in (pms_df, pahs_df, hs_df):
    stat_df['Amount'] = pd.to_numeric(stat_df['Amount'])
# <h2>Write all dataframes to a CSV file for data manipulation in other programs</h2>
# In[12]:
# Write each dataframe to its own CSV file. `index` is documented as a bool,
# so pass False explicitly instead of relying on None being falsy; the row
# index is left out of the output either way.
pms_df.to_csv('Player_Map_Stat.csv', index=False)
pahs_df.to_csv('Player_All_Hero_Stat.csv', index=False)
hs_df.to_csv('Hero_Stat.csv', index=False)