-
Notifications
You must be signed in to change notification settings - Fork 0
/
svm.py
313 lines (240 loc) · 10.2 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as pdf
import seaborn as sn
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm, multiclass
import warnings
import time
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
# plot c vs gamma for each degree given the polynomial kernel grid search results
def plotGS3D(results, param1, param2, param3, foldNumber, classification):
    """Plot the mean CV score against C, one figure per polynomial degree.

    Parameters
    ----------
    results : mapping
        Grid-search results; only ``results['mean_test_score']`` is read.
    param1 : sequence
        Searched C values (x axis of every figure).
    param2 : sequence
        Searched gamma values (one line per value).
    param3 : sequence
        Searched polynomial degrees (one figure per value).
    foldNumber : int
        Outer fold number, used in the figure title only.
    classification : str
        ``'ovo'`` is titled "One Vs. One"; anything else "One Vs. All".

    Returns
    -------
    list
        One matplotlib figure per entry of ``param3``, in the same order.

    Raises
    ------
    ValueError
        If the number of scores is not len(param1) * len(param2) * len(param3).
    """
    # Change classification to actual words
    cl = "One Vs. One" if classification == 'ovo' else "One Vs. All"
    param1Name = "C"
    param2Name = "Gamma"

    # GridSearchCV enumerates its candidates with the parameter names sorted
    # and the last name varying fastest.  For the names C < degree < gamma
    # (with or without a common 'estimator__' prefix) the flat score vector is
    # therefore laid out as (C, degree, gamma).
    # NOTE(review): assumes the search grid held exactly these three parameters.
    meanScore = np.asarray(results['mean_test_score'])
    meanScore = meanScore.reshape(len(param1), len(param3), len(param2))

    # plot c vs gamma for each degree; titles use the degrees actually searched
    figs = []
    for degreeIndex, degreeValue in enumerate(param3):
        fig, axes = plt.subplots(1, 1)
        for gammaIndex, gammaValue in enumerate(param2):
            axes.plot(param1, meanScore[:, degreeIndex, gammaIndex], '-o',
                      label=param2Name + ': ' + str(gammaValue), alpha=0.7)
        axes.set_title("Fold #" + str(foldNumber) + " GridSearch Score (Degree " + str(degreeValue) + " Poly " + cl + ")",
                       fontsize=16, fontweight='bold')
        axes.set_xlabel(param1Name, fontsize=12)
        axes.set_ylabel('CV Average Score', fontsize=12)
        axes.legend(loc='best', fontsize=11)
        axes.grid(True)
        figs.append(fig)
    return figs
# plot the confusion matrices
def plotConfusionMatrix(confusion, kernel, classification):
    """Draw a confusion matrix as an annotated heatmap and save it as a PNG.

    The image is written to the current working directory under the name
    ``Confusion Matrix - <Kernel> <scheme>.png``; nothing is returned.

    Parameters
    ----------
    confusion : array-like or DataFrame
        Matrix handed straight to ``seaborn.heatmap``.
    kernel : str
        Kernel name; title-cased for the file name.
    classification : str
        ``'ovo'`` is written as "One Vs. One"; anything else "One Vs. All".
    """
    # human readable name of the multiclass scheme
    schemeName = "One Vs. One" if classification == 'ovo' else "One Vs. All"

    # draw the heatmap on the current figure's axes
    heatmapAxes = plt.subplot()
    sn.heatmap(confusion, ax=heatmapAxes, fmt='g', cmap='Oranges', annot=True)

    # write the image and release the figure
    outputName = 'Confusion Matrix - ' + kernel.title() + " " + schemeName + ".png"
    plt.savefig(outputName)
    plt.close()
# plot the time versus the accuracy given the ovo and ovr results
def plotTimeVAccuracy(kernel, ovoTime, ovrTime, ovoAcc, ovrAcc):
    """Build a bar chart comparing one-vs-one and one-vs-all runs of a kernel.

    Each run becomes one bar placed at its training time on the x axis with
    its accuracy as the height, annotated with the accuracy rounded to two
    decimals.

    Parameters
    ----------
    kernel : str
        Kernel name; title-cased for the figure title.
    ovoTime, ovrTime : float
        Training times (x positions of the two bars).
    ovoAcc, ovrAcc : float
        Accuracies (bar heights).

    Returns
    -------
    The matplotlib figure holding the chart.
    """
    fig, axes = plt.subplots(1, 1)

    # (x position, bar height, legend label, annotation colour) per scheme
    runs = (
        (ovoTime, ovoAcc, "One Vs. One", 'blue'),
        (ovrTime, ovrAcc, "One Vs. All", 'orange'),
    )
    for elapsed, accuracy, legendLabel, textColor in runs:
        axes.bar(elapsed, accuracy, width=0.5, label=legendLabel)
        axes.text(elapsed, accuracy + 0.5, str(round(accuracy, 2)), fontweight='bold', color=textColor)

    axes.set_title(kernel.title() + " Kernel: Time Vs. Accuracy", fontsize=20, fontweight='bold')
    axes.set_xlabel("Time (in seconds)", fontsize=16)
    axes.set_ylabel('Mean Accuracy (%)', fontsize=16)
    axes.legend(loc="best", fontsize=15)
    axes.grid('on')
    return fig
# plot the results of the grid search
def plotGridSearch(results, param1, param2, param1Name, param2Name, foldNumber, kernel, classification):
    """Plot the mean CV score of a two-parameter grid search.

    ``param1`` is drawn on the x axis and one line is drawn per value of
    ``param2``.

    Parameters
    ----------
    results : mapping
        Grid-search results.  ``results['mean_test_score']`` is required;
        ``results['params']`` is used, when present, to work out how the flat
        score vector is laid out.
    param1, param2 : sequence
        The searched values of the two parameters.
    param1Name, param2Name : str
        Display names (e.g. "C", "Gamma").  They are also matched,
        case-insensitively, against the last ``__``-separated component of the
        searched parameter keys.
    foldNumber : int
        Outer fold number, used in the title only.
    kernel : str
        Kernel name; title-cased for the title.
    classification : str
        ``'ovo'`` is titled "One Vs One"; anything else "One Vs All".

    Returns
    -------
    The matplotlib figure.

    Raises
    ------
    ValueError
        If the number of scores is not len(param1) * len(param2).
    """
    # change classification for labeling purposes
    cl = 'One Vs One' if classification == 'ovo' else 'One Vs All'
    # ensure the kernel title is capitalized
    kernel = kernel.title()

    meanScore = np.asarray(results['mean_test_score'])

    # keys of the searched parameters, taken from the first candidate (if any)
    try:
        searchedKeys = list(results['params'][0])
    except (KeyError, IndexError, TypeError):
        searchedKeys = []

    def searchKey(displayName):
        # map a display name such as "Gamma" to 'gamma' / 'estimator__gamma'
        wanted = displayName.lower()
        for key in searchedKeys:
            if key.split('__')[-1].lower() == wanted:
                return key
        return None

    key1 = searchKey(param1Name)
    key2 = searchKey(param2Name)

    # GridSearchCV enumerates candidates with the parameter keys sorted and the
    # last key varying fastest, so the key that sorts first is the slow axis.
    # Rows of the final matrix must index param2 and columns param1.
    if key1 is not None and key2 is not None and key1 < key2:
        meanScore = meanScore.reshape(len(param1), len(param2)).T
    else:
        # param2 sorts first, or the keys are unknown: keep the historical layout
        meanScore = meanScore.reshape(len(param2), len(param1))

    fig, axes = plt.subplots(1, 1)
    for index, value in enumerate(param2):
        axes.plot(param1, meanScore[index, :], '-o', label=param2Name + ': ' + str(value), alpha=0.7)
    axes.set_title("Fold #" + str(foldNumber) + " GridSearch Score (" + kernel + " " + cl + ")",
                   fontsize=16, fontweight='bold')
    axes.set_xlabel(param1Name, fontsize=12)
    axes.set_ylabel('CV Average Score', fontsize=12)
    axes.legend(loc='best', fontsize=11)
    axes.grid(True)
    return fig
# perform svm with the provided kernel and type of classification
def svm(data, kernel, classification, weighted=False, plot=False, onePlot=False):
    """Grid-search and cross-validate a multiclass SVM; report time and accuracy.

    The data is split with a shuffled 5-fold ``KFold``.  For every outer fold
    a ``GridSearchCV`` (inner ``cv=5``) is fitted on the training part and
    scored on the held-out part.  A summary is printed at the end.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by the class label in the last column.
    kernel : str
        SVC kernel name ('linear', 'rbf', 'sigmoid', 'poly').
    classification : str
        ``'ovo'`` (one-vs-one) or ``'ovr'`` (one-vs-rest).
    weighted : bool
        When true, the underlying SVC uses ``class_weight='balanced'`` and
        "_Weighted" is appended to the kernel name in outputs.
    plot : bool
        When true, the grid-search figures are written to
        ``<Kernel>_Kernel_<CLASSIFICATION>_Classification.pdf`` and the
        averaged confusion matrix is saved through ``plotConfusionMatrix``.
    onePlot : bool
        When true (and ``plot`` is false), grid-search figures are built and
        the second one is shown.

    Returns
    -------
    tuple or None
        ``(totalTime, meanAccuracy)``: summed grid-search fitting time in
        seconds and mean outer-fold accuracy in percent.  ``None`` (after
        printing a message) when ``classification`` is not recognised.

    Notes
    -----
    This function shares its name with the ``svm`` module imported from
    scikit-learn at the top of the file and therefore shadows it.
    """
    # set the proper classification.  A bare SVC always trains one-vs-one
    # internally, so a genuine one-vs-rest run needs the explicit wrapper.
    if classification == 'ovo':
        wrapper = multiclass.OneVsOneClassifier
    elif classification == 'ovr':
        wrapper = multiclass.OneVsRestClassifier
    else:
        print('Invalid Classifier Type')
        return

    # if weighted option is chosen, balance the dataset so that the weights are
    # inversely proportional to frequency (None is SVC's default: no weighting)
    classWeight = 'balanced' if weighted else None
    svc = wrapper(SVC(kernel=kernel, class_weight=classWeight))

    # initialize parameters for GridSearchCV; both wrappers expose the SVC
    # hyper-parameters under the 'estimator__' prefix
    c = 'estimator__C'
    gamma = 'estimator__gamma'
    degree = 'estimator__degree'
    params = {
        c: [0.01, 1, 10, 100, 500, 1000],
        gamma: [0.01, 1, 10],
    }
    # if we have a polynomial kernel we want to reduce the penalty parameter to reduce training time
    if kernel == 'poly':
        params[c] = [0.001, 0.01, 0.1]
        params[degree] = [2, 3, 4]
    elif kernel == 'sigmoid':
        params[gamma] = params[gamma] + [0.001, 0.0001]

    # split the data into features and labels (label is the last column)
    values = data.values
    x = values[:, :-1]
    y = values[:, -1]

    # every class present in the data; passing these to confusion_matrix keeps
    # each fold's matrix the same shape even if a fold lacks a class
    classValues = np.unique(y)
    labels = [
        str(int(v)) if isinstance(v, (float, np.floating)) and float(v).is_integer() else str(v)
        for v in classValues
    ]

    # initialize lists for confusion matrices, accuracies, and best parameters
    confusions = []
    accuracies = []
    bestParams = []
    plots = []
    timePerFold = []

    # perform k-fold cross validation, in this case we're using 5 folds
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    for foldNumber, (train, test) in enumerate(kf.split(x), start=1):
        # normalize with statistics from the training part only, so nothing
        # about the held-out fold leaks into the model
        scaler = StandardScaler().fit(x[train])
        x_train, x_test = scaler.transform(x[train]), scaler.transform(x[test])
        y_train, y_test = y[train], y[test]

        # perform Grid Search on the training set and time the fit
        # (the removed 'iid' argument is not passed)
        clf = GridSearchCV(svc, cv=5, param_grid=params)
        t0 = time.perf_counter()
        clf.fit(x_train, y_train)
        timePerFold.append(time.perf_counter() - t0)

        # test the model
        y_pred = clf.predict(x_test)
        # save the best parameters for this fold
        bestParams.append(clf.best_params_)

        # plot the grid search results, save for later
        if plot or onePlot:
            if kernel == 'poly':
                # three searched parameters: one figure per degree
                plots.extend(plotGS3D(clf.cv_results_, params[c], params[gamma], params[degree], foldNumber, classification))
            else:
                plots.append(plotGridSearch(clf.cv_results_, params[c], params[gamma], "C", "Gamma", foldNumber, kernel, classification))

        # append confusion matrix and accuracy to respective list
        accuracies.append(accuracy_score(y_test, y_pred))
        confusions.append(confusion_matrix(y_test, y_pred, labels=classValues))

    totalTime = sum(timePerFold)
    # get mean accuracy
    meanAccuracy = np.mean(accuracies) * 100

    # add weighted to kernel name if applicable
    if weighted:
        kernel += "_Weighted"

    if plot:
        # save all the plots as a pdf; the context manager closes the file
        # even if saving a figure fails
        with pdf.PdfPages(kernel.title() + "_Kernel_" + classification.upper() + "_Classification.pdf") as file:
            for fig in plots:
                file.savefig(fig, bbox_inches='tight')
                plt.close(fig)
        # get averaged confusion matrix and plot it
        cm_average = pd.DataFrame(np.mean(confusions, axis=0), columns=labels, index=labels)
        plotConfusionMatrix(cm_average, kernel, classification)
    elif onePlot:
        plots[1].show()

    # print some useful information
    print("-"*300)
    print("Classification: ", classification)
    print("Kernel: ", kernel)
    print("Mean Accuracy: ", meanAccuracy)
    print("Time it took to train: ", totalTime)
    print("Time per Fold", timePerFold)
    print("Best Parameters per Fold: ", bestParams)
    print()
    return totalTime, meanAccuracy
# open the data file and run SVM
def main():
    """Load the glass data set and run every kernel through ``svm``.

    Each kernel is run with one-vs-one and one-vs-all classification, then
    again with one-vs-one on re-weighted classes.  When the local ``plot``
    switch is on, a time-versus-accuracy chart per kernel is collected in
    ``Time_VS_Accuracy.pdf``.
    """
    # the data file is read from the working directory
    # (an earlier revision pointed at "../Data/glass.data")
    dataPath = "glass.data"
    columnNames = ["Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "type"]
    data = pd.read_csv(dataPath, names=columnNames)
    data.drop(["Id"], axis=1, inplace=True)

    # if we'd like to plot all the results, set plot to true
    plot = False
    onePlot = True

    # list for each type of kernel
    kernels = ['linear', 'rbf', 'sigmoid', 'poly']

    # full plotting replaces the single preview plot
    if plot:
        onePlot = False
        timeVacc = pdf.PdfPages("Time_VS_Accuracy.pdf")

    # run each type of kernel with both 1v1 and 1vAll
    for kernelName in kernels:
        ovoTime, ovoAccuracy = svm(data, kernel=kernelName, classification='ovo', plot=plot, onePlot=onePlot)
        ovrTime, ovrAccuracy = svm(data, kernel=kernelName, classification='ovr', plot=plot)
        # only the very first run may show the preview plot
        onePlot = False
        print("*"*300)
        if plot:
            # plot the time vs the accuracy and save to pdf
            chart = plotTimeVAccuracy(kernelName, ovoTime, ovrTime, ovoAccuracy, ovrAccuracy)
            timeVacc.savefig(chart, bbox_inches='tight')
            plt.close(chart)

    if plot:
        timeVacc.close()

    # run each type of kernel with 1v1 where the classes are reweighted
    for kernelName in kernels:
        weightedTime, weightedAccuracy = svm(data, kernel=kernelName, classification='ovo', weighted=True, plot=plot)


if __name__ == "__main__":
    main()