-
Notifications
You must be signed in to change notification settings - Fork 0
/
logRegNBA.py
101 lines (80 loc) · 3 KB
/
logRegNBA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 26 18:48:43 2020
@author: Drewb
"""
import pandas as pd
import numpy as np
# Load the player statistics table from the CSV in the working directory.
nbaData = pd.read_csv('NBA201516.csv')

# Columns the model never looks at. Every name listed must exist in the CSV;
# a missing one raises KeyError. TRB, MP, Pos, PS/G, AST and STL are kept
# because the following steps read them.
unusedColumns = [
    "Rk", "Player", "Age", "Tm", "G", "GS",
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%", "eFG%",
    "FT", "FTA", "FT%",
    "ORB", "DRB", "BLK", "TOV", "PF",
]
nbaData = nbaData.drop(columns=unusedColumns)
# --- Feature engineering ---------------------------------------------------
# Work on an independent copy of the columns we need, so the column
# assignments below modify a real frame rather than a view of the loaded data
# (this avoids pandas' SettingWithCopyWarning).
nbaData = nbaData[["TRB", "MP", "Pos", "PS/G", "AST", "STL"]].copy()

# Response column: 1 when the player's TRB is above 8, else 0.
# A missing TRB compares False and is therefore labelled 0.
nbaData["RebounderNumeric"] = (nbaData["TRB"] > 8).astype(int)

# Reduce multi-position labels to the first listed position ("PG-SG" -> "PG").
# Splitting on '-' (rather than taking the first two characters) also handles
# one-letter positions correctly: "C-PF" -> "C", not "C-".
nbaData["Pos"] = nbaData["Pos"].str.split("-").str[0]

# Drop rows with a missing predictor BEFORE encoding Pos: once encoded, a
# missing position becomes the code -1 and dropna would no longer catch it.
predictorColumns = ["MP", "Pos", "PS/G", "AST", "STL"]
nbaData = nbaData.dropna(axis=0, how='any', subset=predictorColumns).copy()

# Encode Pos as integer category codes. Categories are sorted alphabetically,
# so when exactly the five labels C, PF, PG, SF, SG are present the mapping is
# C = 0, PF = 1, PG = 2, SF = 3, SG = 4.
nbaData["Pos"] = nbaData["Pos"].astype('category').cat.codes

# Final layout: response first, then the 5 predictor columns.
nbaData = nbaData[["RebounderNumeric","MP","Pos","PS/G","AST","STL"]]
# Reproducible train/test split: shuffle the row positions with a fixed seed,
# train on the first 320 shuffled positions and test on whatever is left.
np.random.seed(0)
numberRows = len(nbaData)
randomlyShuffledRows = np.random.permutation(numberRows)
trainingRows, testRows = randomlyShuffledRows[:320], randomlyShuffledRows[320:]

# Column 0 of nbaData is used as the response, columns 1-5 as the predictors.
predictors = nbaData.iloc[:, 1:6]
response = nbaData.iloc[:, 0]
xTrain, yTrain = predictors.iloc[trainingRows], response.iloc[trainingRows]
xTest, yTest = predictors.iloc[testRows], response.iloc[testRows]
from sklearn import linear_model

# Fit a logistic regression on the training rows. The solver is named
# explicitly, which silences scikit-learn's FutureWarning.
reg = linear_model.LogisticRegression(solver='lbfgs')
reg.fit(xTrain, yTrain)

# Class predictions (binary) for the held-out rows.
model_prediction = reg.predict(xTest)

# Each non-zero difference between prediction and label is one wrong
# prediction, so the sum of absolute differences is the error count.
diff = model_prediction - yTest
print(diff.abs().sum())

# Fitted parameters: the beta coefficients of the predictors and the
# intercept (beta0).
log_odds = reg.coef_
beta_0 = reg.intercept_
print(log_odds)
print(beta_0)

# Accuracy of the fitted model on the test rows.
score = reg.score(xTest, yTest)
print(score)
# The response column is no longer needed; leave nbaData holding only the
# predictors. NOTE: reordering nbaData's columns here has no effect on `reg`,
# which was fitted on xTrain's column order.
del nbaData["RebounderNumeric"]
nbaData = nbaData[["Pos","MP","PS/G","AST","STL"]]

# Hypothetical players, written in the order (Pos, MP, PS/G, AST, STL).
# Pos uses the integer position codes: C = 0, PF = 1, PG = 2, SF = 3, SG = 4.
personColumns = ["Pos", "MP", "PS/G", "AST", "STL"]
person1 = [4, 15, 10, 4, 2]    # SG, 15 MP, 10 PS/G, 4 AST, 2 STL
person2 = [0, 30, 5, 5, 4]     # C, 30 MP, 5 PS/G, 5 AST, 4 STL
# NOTE(review): this row was previously annotated as 35 MP while the value in
# code is 24 -- confirm which one is intended.
person3 = [3, 24, 20, 3, 1.8]  # SF, 24 MP, 20 PS/G, 3 AST, 1.8 STL

# Label the columns, then reorder them to the order the model was fitted on.
# Passing the rows as a bare array in (Pos, MP, ...) order would make the
# model read Pos as MP and MP as Pos, because it matches features by position.
newTestSet = pd.DataFrame([person1, person2, person3], columns=personColumns)
newTestSet = newTestSet[list(xTrain.columns)]

# One row per player; column 0 is the first class in reg.classes_ and
# column 1 the second (the "good rebounder" class for 0/1 labels).
goodRebounder = reg.predict_proba(newTestSet)
print(goodRebounder)

# predict_proba returns probabilities, not log odds, so the label says so.
print("Probabilities of player being a good rebounder:")
for probability in goodRebounder[:, 1]:
    print(probability)