# clustering.py


import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

def Cluster(features, sample_size):
    """Cluster ``features`` with K-Means and plot the result in 2-D PCA space.

    The features are projected onto their first two principal components,
    K-Means is fitted on that projection, and the resulting decision regions,
    data points and centroids are drawn with matplotlib. The call blocks in
    ``plt.show()`` until the plot window is closed (on interactive backends).

    Args:
        features: Array-like that ``np.asarray`` can convert to a 2-D array
            accepted by ``PCA.fit_transform`` (one row per sample). It must
            have at least two columns and at least ``N_CLUSTS`` rows,
            otherwise scikit-learn raises ``ValueError``.
        sample_size: Currently unused; kept so existing callers keep working.

    Returns:
        None. The function only prints progress/timing and shows a figure.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement intended for measuring elapsed intervals.
    tic = time.perf_counter()
    print('Starting clustering (using KMeans)...')

    # Number of clusters for K-Means
    N_CLUSTS = 5

    SEED = 42
    # Kept from the original implementation: seeds NumPy's global RNG.
    np.random.seed(SEED)

    # The decision-boundary plot below lives in a 2-D plane, so both the
    # projection and the model that labels the mesh must be 2-dimensional.
    reduced_data = PCA(n_components=2, random_state=SEED).fit_transform(
        np.asarray(features))

    # A single model is fitted on the same 2-D data that is plotted, so
    # predict() on mesh points and cluster_centers_ share one coordinate
    # system.
    kmeans = KMeans(n_clusters=N_CLUSTS, n_init=10, random_state=SEED)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    # NOTE(review): the mesh has ((x_max - x_min) / h) * ((y_max - y_min) / h)
    # points; for features with a large spread this can get big - consider
    # scaling the inputs or enlarging h if memory becomes an issue.
    h = .02

    # Bounding box of the projected data, padded by 1 on every side.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Label every mesh point with its nearest centroid, then restore the
    # grid shape so the labels can be rendered as an image.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    # NOTE(review): the title still says "digits dataset"; the data actually
    # plotted is whatever the caller passed in as `features`.
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # Stop the timer before the blocking show() call so the reported time
    # does not include how long the plot window stays open.
    toc = time.perf_counter()
    plt.show()

    print("Total time taken for clustering = ", toc-tic)