-
Notifications
You must be signed in to change notification settings - Fork 2
/
gmeans.py
102 lines (94 loc) · 3.96 KB
/
gmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from sklearn.cluster import KMeans;
from sklearn.preprocessing import scale;
from statsmodels.stats.diagnostic import normal_ad as ad_test;
import numpy as np;
import pandas as pd
import matplotlib.pyplot as plt;
from matplotlib.patches import Ellipse
#change the path in the read.csv function
inputData = pd.read_csv('hw45-r3b-test-data.csv')
def normalityTest(testData):
fit = KMeans(n_clusters=2)
result = fit.fit(testData)
centers = result.cluster_centers_
projectionvector = centers[0] - centers[1]
projectionvector = np.matrix(projectionvector)
projectionvector = projectionvector.transpose()
projectedData = testData * projectionvector
modprojectionVector = 0
j=1
for j in range(0,len(projectionvector)):
modprojectionVector = modprojectionVector + (projectionvector[j]**2)
standardizedData = projectedData/modprojectionVector
statistic, pvalue = ad_test(standardizedData)
return pvalue
def gmeans(X,alpha=0.0001,k=1):
needtoinc = True
trialData = X
fit = KMeans(n_clusters=k)
initresult = fit.fit(trialData)
centers = initresult.cluster_centers_
while(needtoinc):
needtoinc = False
i=0
normTestData = trialData[initresult.labels_ == i]
normTestData = np.matrix(normTestData)
pvalue = normalityTest(normTestData)
if pvalue <= alpha:
needtoinc = True
tempresults = KMeans(2)
tempresults = tempresults.fit(normTestData)
newcenters = tempresults.cluster_centers_
else:
newcenters = centers[i, :]
k = centers.shape[0]
for i in range(1, k):
normTestData = trialData[initresult.labels_ == i]
normTestData = np.matrix(normTestData)
pvalue = normalityTest(normTestData)
if pvalue <= alpha:
needtoinc = True
tempresults = KMeans(2)
tempresults = tempresults.fit(normTestData)
newcenters = np.vstack((newcenters, tempresults.cluster_centers_))
else:
newcenters = np.vstack((newcenters, centers[i,:]))
centers = newcenters
initresult = KMeans(centers.shape[0],init=centers).fit(trialData)
centers = initresult.cluster_centers_
print 'optimal no of clusters:',centers.shape[0]
x = trialData.as_matrix()
plt.figure()
plt.scatter(x[:,0],x[:,1],c=initresult.labels_)
plt.xlable('x1')
plt.ylable('x2')
for i in range(0,centers.shape[0]):
covariance = np.cov(x[initresult.labels_==i,][:,[0,1]].transpose())
val,vec = np.linalg.eig(covariance)
e1 = Ellipse(xy=np.mean(x[initresult.labels_==i,][:,[0,1]].transpose(),axis = 1), width=6*np.sqrt(val[0]),height=6*np.sqrt(val[1]),angle=np.degrees(np.arctan2(*vec[1])),facecolor='none',edgecolor='blue')
plt.gca().add_artist(e1)
plt.show()
plt.figure()
plt.scatter(x[:,0],x[:,2],c=initresult.labels_)
plt.xlable('x1')
plt.ylable('x3')
for i in range(0,centers.shape[0]):
covariance = np.cov(x[initresult.labels_==i,][:,[0,2]].transpose())
val,vec = np.linalg.eig(covariance)
e1 = Ellipse(xy=np.mean(x[initresult.labels_==i,][:,[0,2]].transpose(),axis = 1), width=6*np.sqrt(val[0]),height=6*np.sqrt(val[1]),angle=np.degrees(np.arctan2(*vec[1])),facecolor='none',edgecolor='blue')
plt.gca().add_artist(e1)
plt.show()
plt.figure()
plt.scatter(x[:,1],x[:,2],c=initresult.labels_)
plt.xlable('x2')
plt.ylable('x3')
for i in range(0,centers.shape[0]):
covariance = np.cov(x[initresult.labels_==i,][:,[1,2]].transpose())
val,vec = np.linalg.eig(covariance)
e1 = Ellipse(xy=np.mean(x[initresult.labels_==i,][:,[1,2]].transpose(),axis = 1), width=6*np.sqrt(val[0]),height=6*np.sqrt(val[1]),angle=np.degrees(np.arctan2(*vec[1])),facecolor='none',edgecolor='blue')
plt.gca().add_artist(e1)
plt.show()
print "input data ready"
print inputData.shape
#change the alpha
gmeans(inputData,0.0001,1)