-
Notifications
You must be signed in to change notification settings - Fork 0
/
position_cluster.py
205 lines (168 loc) · 6.76 KB
/
position_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pdb
'''Functions'''
def guards(df):
    '''
    INPUT: DataFrame with a 'Pos' column
    OUTPUT: Only the rows whose 'Pos' value is 'G'
    '''
    is_guard = df['Pos'] == 'G'
    return df.loc[is_guard]
def forwards(df):
    '''
    INPUT: DataFrame with a 'Pos' column
    OUTPUT: Only the rows whose 'Pos' value is 'F'
    '''
    is_forward = df['Pos'] == 'F'
    return df.loc[is_forward]
def centers(df):
    '''
    INPUT: DataFrame with a 'Pos' column
    OUTPUT: Only the rows whose 'Pos' value is 'C'
    '''
    is_center = df['Pos'] == 'C'
    return df.loc[is_center]
def ID(row):
    '''
    INPUT: row with 'Team' and 'Season' entries
    OUTPUT: the same row with an added 'ID' entry of the form "<Team>,<Season>"
    '''
    key_parts = (row['Team'], str(row['Season']))
    row['ID'] = ",".join(key_parts)
    return row
def pos_dfs(df):
    '''
    Create one DataFrame per position, restricted to that position's
    clustering features.
    INPUT: Player stats DataFrame
    OUTPUT: Tuple of (centers, forwards, guards) DataFrames ready for clustering
    '''
    # Feature columns per position, in the order: centers, forwards, guards
    feature_columns = (
        ['Player', 'MP', '2P', '3P', 'TRB', 'AST', 'STL',
         'BLK', 'TOV', 'PTS', 'Team', 'Season', 'Pos', 'Height'],
        ['Player', 'MP', '2P', '2PA', '3P', '3PA', 'TRB', 'AST', 'STL',
         'BLK', 'TOV', 'PTS', 'Team', 'Season', 'Pos'],
        ['Player', 'MP', '3P', 'AST', 'STL', 'TOV',
         'PTS', 'TRB', 'Team', 'Season', 'Pos'],
    )

    # Keep only the selected columns and discard rows with any NaN in them
    center_stats, forward_stats, guard_stats = (
        df[columns].dropna() for columns in feature_columns
    )

    # Finally restrict each reduced frame to the rows of its own position
    return centers(center_stats), forwards(forward_stats), guards(guard_stats)
def vectorize_and_standardize(df):
    '''
    INPUT: DataFrame holding 'Player', 'MP', 'Team', 'Season' and 'Pos'
           columns; every other column is treated as a feature
    OUTPUT: Standardized NumPy Matrix, and arrays for players and positions
    '''
    # Vectorize. DataFrame.as_matrix() was removed in pandas 1.0, so
    # to_numpy() is used to obtain the underlying NumPy arrays instead.
    player = df['Player'].to_numpy()
    position = df['Pos'].to_numpy()
    features = df.drop(['Player', 'MP', 'Team', 'Season', 'Pos'], axis=1).to_numpy()

    # Standardize: the scaler is fitted on this frame's features only
    scale = StandardScaler()
    X = scale.fit_transform(features)
    return X, player, position
def create_clusters(X, nclusters):
    '''
    INPUT: X Matrix and number of Clusters
    OUTPUT: Cluster labels for each observation
    '''
    # `precompute_distances` and `n_jobs` were removed from KMeans in
    # scikit-learn 1.0 and `algorithm='auto'` in 1.3; passing them makes the
    # call fail on current releases. The dropped values were the defaults
    # (`n_jobs` only controlled parallelism), so the clustering setup is the same.
    kmeans = KMeans(n_clusters=nclusters, init='k-means++', n_init=20,
                    max_iter=500, tol=0.0001, verbose=0, random_state=None,
                    copy_x=True)
    kmeans.fit(X)
    return kmeans.labels_
def add_clusters_to_dfs(df, clusters):
    '''
    INPUT: DataFrame and one cluster label per row of that DataFrame
    OUTPUT: New DataFrame equal to the input plus a 'Cluster' column
    '''
    # assign() works on a copy, so the caller's frame is left untouched.
    # The frames passed in by this script are filtered slices, and writing a
    # column into those in place triggers pandas' SettingWithCopyWarning.
    return df.assign(Cluster=clusters)
def position_cluster(row):
    '''
    INPUT: row in DataFrame with 'Pos' and 'Cluster' entries
    OUTPUT: the same row with an added 'pos_cluster' entry, the position
            followed by the cluster number (e.g. 'G2')
    '''
    position = row['Pos']
    label = position + str(row['Cluster'])
    row['pos_cluster'] = label
    return row
def concatinate_dataframes(df1, df2, df3):
    '''
    Concatenate position specific DataFrames into one with cluster position column
    INPUT: Three DataFrames (one for each position), each holding the columns
           'Player', 'MP', 'Team', 'Season', 'Cluster' and 'Pos'
    OUTPUT: One Concatenated DataFrame with an extra 'pos_cluster' column
            (position followed by cluster number, e.g. 'G2')
    '''
    shared_cols = ['Player', 'MP', 'Team', 'Season', 'Cluster', 'Pos']
    players = pd.concat([frame[shared_cols] for frame in (df1, df2, df3)])

    # Build the label column-wise instead of with a row-by-row apply: this
    # avoids a Python-level loop over every player and still yields the
    # 'pos_cluster' column when the inputs are empty.
    players['pos_cluster'] = players['Pos'] + players['Cluster'].astype(str)
    return players
def team_and_season_mp_by_cluster(df):
    '''
    INPUT: DataFrame of each player and assigned Cluster (columns 'Team',
           'Season', 'MP' and 'pos_cluster')
    OUTPUT: DataFrame of each team for each season with the share (0 to 1) of
            minutes played by each cluster
    '''
    # One row per (Team, Season), one column per pos_cluster, summed minutes
    pivot_df = pd.pivot_table(df, values='MP', index=['Team', 'Season'],
                              columns=['pos_cluster'], aggfunc='sum', fill_value=0)
    pivot_df = pivot_df.reset_index()
    cols = pivot_df.columns.tolist()

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    team_and_season = pivot_df.iloc[:, :2].to_numpy()
    # Force floats so the normalisation below is never an integer division
    clusts = pivot_df.iloc[:, 2:].to_numpy(dtype=float)

    # Divide every row by that team-season's total minutes
    mp = clusts.sum(axis=1)
    clustsnorm = clusts / mp.reshape(-1, 1)

    clusters_df = pd.DataFrame(np.hstack((team_and_season, clustsnorm)), columns=cols)
    return clusters_df
def team_and_season_mp_by_class(df):
    '''
    INPUT: DataFrame of each player and Class (columns 'Team', 'Season',
           'Class' and 'MP')
    OUTPUT: DataFrame of each team for each season with an 'exp_factor'
            column: the Class labels averaged with each class's share of
            minutes played as weight
    '''
    # One row per (Team, Season), one column per Class, summed minutes
    pivot_df = pd.pivot_table(df, values='MP', index=['Team', 'Season'],
                              columns=['Class'], aggfunc='sum', fill_value=0)
    pivot_df = pivot_df.reset_index()
    cols = pivot_df.columns.tolist()

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    team_and_season = pivot_df.iloc[:, :2].to_numpy()
    # Force floats so the normalisation below is never an integer division
    classes = pivot_df.iloc[:, 2:].to_numpy(dtype=float)

    # Share of each team-season's minutes played by each class
    mp = classes.sum(axis=1)
    classesnorm = classes / mp.reshape(-1, 1)

    # The pivot's column labels are the Class values themselves; they are used
    # as weights here, so they are assumed to be numeric -- TODO confirm.
    cl = np.array(cols[2:])
    expfactor = (cl * classesnorm).sum(axis=1)

    out_cols = cols[:2] + ['exp_factor']
    classes_df = pd.DataFrame(np.hstack((team_and_season, expfactor.reshape(-1, 1))),
                              columns=out_cols)
    return classes_df
def team_experience_level(df):
    '''
    INPUT: Player Stats DataFrame with 'Team', 'Season' and 'Class' columns
    OUTPUT: DataFrame with columns 'Team', 'Season' and 'Experience', where
            'Experience' is the mean Class per team and season rounded to
            two decimals
    '''
    mean_class = (df[['Team', 'Season', 'Class']]
                  .groupby(['Team', 'Season'])
                  .mean()
                  .round(2))
    # After the groupby, 'Team' and 'Season' live in the index and 'Class' is
    # the only column, so exactly one name is assigned here. Assigning three
    # names at this point raises a length-mismatch ValueError.
    mean_class.columns = ['Experience']
    return mean_class.reset_index()
def merge_dfs(df1, df2):
    '''
    INPUT: Two DataFrames with Team and Season columns
    OUTPUT: One DataFrame holding every row of the first, left-joined with the
            columns of the second on a combined Team/Season key
    '''
    # Tag both frames with the temporary "<Team>,<Season>" join key
    left = df1.apply(ID, axis=1)
    # The right frame gives up its own Team/Season so they are not duplicated
    right = df2.apply(ID, axis=1).drop(['Team', 'Season'], axis=1)

    joined = left.merge(right, how='left', on='ID')
    # The key was only needed for the join
    return joined.drop(['ID'], axis=1)
if __name__ == '__main__':
    # Load the scraped per-player statistics
    df = pd.read_pickle('scraped_data/player_stats.pkl')

    # Create DataFrames by position ready for clustering
    centers_df, forwards_df, guards_df = pos_dfs(df)

    # Vectorize and standardize each position; only the feature matrix is
    # needed below
    X_c = vectorize_and_standardize(centers_df)[0]
    X_f = vectorize_and_standardize(forwards_df)[0]
    X_g = vectorize_and_standardize(guards_df)[0]

    # Cluster each position (3 center, 3 forward and 4 guard clusters) and
    # attach the labels as a 'Cluster' column
    centers_df = add_clusters_to_dfs(centers_df, create_clusters(X_c, nclusters=3))
    forwards_df = add_clusters_to_dfs(forwards_df, create_clusters(X_f, nclusters=3))
    guards_df = add_clusters_to_dfs(guards_df, create_clusters(X_g, nclusters=4))

    # Create one DataFrame to rule them all
    players_df = concatinate_dataframes(centers_df, forwards_df, guards_df)

    # Sum position-cluster minutes played by Team and Season, then persist
    team_clusters_df = team_and_season_mp_by_cluster(players_df)
    team_clusters_df.to_pickle('player_data/team_clusters.pkl')

    # Create and persist the Team Experience Level DataFrame
    team_experience_df = team_and_season_mp_by_class(df)
    team_experience_df.to_pickle('player_data/team_experience.pkl')