# Main.py
# Forked from gkagkos/Probabilistic-PCA.
# (Repository web-page residue removed: notification banner, the file summary
# "256 lines (198 loc), 9.43 KB", and the 1-256 line-number gutter.)
from Dataset_Generator import datasets
from Utils import *
import random
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import cross_val_score
from PPCA import PPCA
# ---------------------------------------------------------------------------
# Script configuration: dataset locations, dataset sizes and the switches that
# select which experiment(s) the __main__ section runs.
# ---------------------------------------------------------------------------
ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) # Directory containing this script
cifar10_dir = os.path.join(ROOT_DIR, 'data/CIFAR10/') # CIFAR-10 directory, passed to datasets().load_CIFAR10
mnist_dir = os.path.join(ROOT_DIR, 'data/MNIST/') # MNIST directory, passed to datasets().load_MNIST
toba_dir = os.path.join(ROOT_DIR, 'data/Toba') # Not referenced anywhere else in the visible part of this file
# for CIFAR10
num_pics_CIFAR10 = 10000 # number of pictures to use for training (passed as num_pics_to_load)
# for MNIST
num_pics_MNIST = 10000 # number of pictures to use (passed as num_pics_to_load to both MNIST runs)
# for the toy datasets
num_points = 10000 # number of data points, passed to build_A_toy_dataset
# N is passed to build_A_toy_dataset as its N argument (data dimensionality,
# per the original comment).
# NOTE(review): the original comment said N "MUST BE ALWAYS SMALLER THAN
# LATENT"; for PPCA the latent dimension is normally the smaller of the two,
# so that wording looks inverted -- confirm against PPCA's num_components.
N = 50
max_iterations = 2 # maximum number of iterations, passed to PPCA(max_iterations=...)
# Experiment switches checked in the __main__ section.
cifar = False
mnist = True
multivariate = False
def compute_scores(X, n_features):
    """Choose a component count for PCA and FactorAnalysis by cross-validation.

    Per the original author's note this was taken from scikit-learn to
    cross-check scores in a different way and is currently not used.

    Candidate component counts are 0, 5, 10, ... up to (but excluding)
    ``n_features``. Each candidate is scored with 5-fold ``cross_val_score``
    using the estimator's default score, and the mean over folds is kept.

    Args:
        X: data handed to ``cross_val_score``.
        n_features: exclusive upper bound for the candidate component counts.

    Returns:
        ``[n_components_fa, n_components_pca]`` -- the best-scoring candidate
        for FactorAnalysis first, then the one for PCA.
    """
    candidates = np.arange(0, n_features, 5)

    pca_model = PCA(svd_solver='full')
    fa_model = FactorAnalysis()

    pca_cv_means = []
    fa_cv_means = []
    for k in candidates:
        # The same estimator objects are reused; only n_components changes.
        pca_model.n_components = k
        fa_model.n_components = k

        pca_cv_means.append(np.mean(cross_val_score(pca_model, X, cv=5)))
        fa_cv_means.append(np.mean(cross_val_score(fa_model, X, cv=5)))
        print("Number of component", k)

    best_pca = candidates[np.argmax(pca_cv_means)]
    best_fa = candidates[np.argmax(fa_cv_means)]
    return [best_fa, best_pca]
def calculate_for_Cifar(num_pics_to_load):
    """Fit PPCA on CIFAR-10 and report reconstruction error on train and test.

    The images are flattened to one row per picture, the first
    ``num_pics_to_load`` training rows are used to fit a ``PPCA`` model, and
    the relative reconstruction error is printed for the training subset and
    for the test set. A histogram of the test errors is then plotted.

    Args:
        num_pics_to_load: maximum number of training pictures to use. The same
            number also caps how many test pictures are scored.
    """
    plt.ion()

    # Labels are not needed for reconstruction, so they are discarded.
    X_train, _, X_test, _ = datasets().load_CIFAR10(cifar10_dir)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    X_train = X_train[:num_pics_to_load, :]  # take only the first num_pics pictures
    print(X_train.shape)

    # Use the row counts that actually exist: the slice above yields fewer
    # rows than requested when the dataset is smaller than num_pics_to_load,
    # and the test set has its own size.
    num_train = X_train.shape[0]
    num_test = min(num_pics_to_load, X_test.shape[0])

    ppca = PPCA(max_iterations=max_iterations)

    print("=======>Training Phase<=======")
    fitted_data = ppca.fit(X_train)
    reduced_data = ppca.transform_data(fitted_data)
    created_data = ppca.inverse_transform(reduced_data)
    error_train = get_relative_error(X_train, created_data, num_train)
    print("The training avg error of the dataset is: {0}".format(np.mean(error_train)))

    print("=======>Testing Phase<=======")
    reduced_data = ppca.transform_data(X_test)
    created_data = ppca.inverse_transform(reduced_data)
    error_test = get_relative_error(X_test, created_data, num_test)
    print("The testing avg error of the dataset is: {0}".format(np.mean(error_test)))

    plt.figure()
    plt.xlabel('Error(%)')
    plt.ylabel('Count')
    plt.title('Error of Reconstructing CIFAR Test Set with PPCA(' + str(ppca.num_components) + " components)")
    plt.hist(list(error_test), bins=100, color="#3F5D7D")
    plt.show()
def _show_mnist_digit(image, label, title=None):
    """Show one digit image in its own figure, labelled with its true class."""
    plt.figure()
    # Clip before casting: reconstructed pixel values can fall outside
    # [0, 255], and an out-of-range float -> uint8 cast does not saturate.
    plt.imshow(np.clip(image, 0, 255).astype('uint8'))
    plt.xlabel("Actual Number {}".format(label))
    if title is not None:
        plt.title(title)
    plt.show()


def calculate_for_Mnist(num_pics_to_load):
    """Fit PPCA (200 components) on MNIST and visualise the reconstructions.

    Training phase: fits on the first ``num_pics_to_load`` flattened training
    images, prints the mean relative reconstruction error, plots the error
    histogram and shows 3 randomly chosen reconstructed digits followed by
    the matching originals.

    Testing phase: reconstructs the test set with the fitted model, prints
    and plots its error, and shows 5 randomly chosen reconstructed test
    digits.

    Args:
        num_pics_to_load: maximum number of training pictures to use. The same
            number also caps how many test pictures are scored.
    """
    X_train, y_train, X_test, y_test = datasets().load_MNIST(mnist_dir)
    new_X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    new_X_train = new_X_train[:num_pics_to_load, :]  # take only the first num_pics pictures
    # Actual number of training rows (may be below num_pics_to_load).
    num_train = new_X_train.shape[0]

    ppca = PPCA(max_iterations=max_iterations, num_components=200)

    print("=======>Training Phase<=======")
    fitted_data = ppca.fit(new_X_train)
    reduced_data = ppca.transform_data(fitted_data)
    created_data = ppca.inverse_transform(reduced_data)
    error_train = get_relative_error(new_X_train, created_data, num_train)
    print("The avg error of the dataset is: {0}".format(np.mean(error_train)))
    print(created_data.shape)
    # Match the training matrix instead of a hard-coded (10000, 784) so other
    # values of num_pics_to_load work too.
    created_data = np.reshape(created_data, new_X_train.shape)

    plt.figure()
    plt.xlabel('Error(%)')
    plt.ylabel('Count of examples')
    plt.title('Error of Reconstructing MNIST with PPCA(' + str(ppca.num_components) + " components)")
    plt.hist(list(error_train), bins=100, color="#3F5D7D")
    plt.xlim([0, 100])
    plt.show()

    # Visualise a sample of reconstructed training images next to the originals.
    created_data = np.reshape(created_data, (created_data.shape[0], 28, 28))
    # Randomly select up to 3 training images. The sampled index itself is
    # used for the image and for its label, so the pictures really are random
    # and the reconstructed/original pairs match.
    train_sample = random.sample(range(num_train), min(3, num_train))
    for idx in train_sample:
        _show_mnist_digit(created_data[idx], y_train[idx], "Reconstructed image")
    for idx in train_sample:
        _show_mnist_digit(X_train[idx], y_train[idx], "Original Picture")

    print("=======>Testing Phase<=======")
    reduced_data = ppca.transform_data(X_test)
    created_data = ppca.inverse_transform(reduced_data)
    # Never ask for more test rows than the test set holds.
    num_test = min(num_pics_to_load, X_test.shape[0])
    error_test = get_relative_error(X_test, created_data, num_test)
    print("The testing avg error of the dataset is: {0}".format(np.mean(error_test)))

    plt.figure()
    plt.xlabel('Error(%)')
    plt.ylabel('Count')
    plt.title('Error of Reconstructing MNIST Test Set with PPCA(' + str(ppca.num_components) + " components)")
    plt.hist(list(error_test), bins=100, color="#3F5D7D")
    plt.show()

    # Visualise a sample of reconstructed test images.
    created_data = np.reshape(created_data, (created_data.shape[0], 28, 28))
    num_reconstructed = created_data.shape[0]
    # Randomly select up to 5 test images; labels come from y_test because
    # these are test-set reconstructions.
    for idx in random.sample(range(num_reconstructed), min(5, num_reconstructed)):
        _show_mnist_digit(created_data[idx], y_test[idx])
def calculate_for_Multivariate():
    """Fit PPCA on the toy dataset and plot per-point reconstruction error.

    Builds the toy dataset from the module-level ``N`` and ``num_points``,
    fits a ``PPCA`` model on the training part and draws one bar chart of the
    relative reconstruction error for the training part and one for the test
    part. The code assumes an 80% / 20% train/test split of ``num_points``.
    """
    plt.ion()
    X_train, X_test = datasets().build_A_toy_dataset(N=N, num_points=num_points)

    ppca = PPCA(max_iterations=max_iterations)

    print("=======>Training Phase<=======")
    fitted_data = ppca.fit(X_train)
    reduced_data = ppca.transform_data(fitted_data)
    created_data = ppca.inverse_transform(reduced_data)

    percentage = int(num_points * 0.8)  # number of points in 80% of the whole dataset
    r = range(0, percentage)
    # NOTE(review): the "+ 1" is kept from the original; it presumably
    # compensates for how get_relative_error counts points -- confirm there.
    error = get_relative_error(X_train, created_data, percentage + 1)
    print("The training avg error of the dataset is: {0}".format(np.mean(error)))

    # Interactive mode keeps the current figure alive after plt.show(), so each
    # chart gets its own figure; otherwise the test bars would be drawn over
    # the training bars.
    plt.figure()
    plt.bar(r, error, width=1, color="blue")
    plt.xlabel('Data Points')
    plt.ylabel('Error')
    plt.title('Error of Reconstructing Training Set 1 with PPCA(' + str(ppca.num_components) + " components)")
    plt.show()

    print("=======>Testing Phase<=======")
    reduced_data = ppca.transform_data(X_test)
    created_data = ppca.inverse_transform(reduced_data)

    percentage = int(num_points * 0.2)  # number of points in the remaining 20% of the dataset
    error = get_relative_error(X_test, created_data, percentage + 1)  # same "+ 1" as above
    r = range(0, percentage)
    print("The testing avg error of the dataset is: {0}".format(np.mean(error)))

    plt.figure()
    plt.bar(r, error, width=1, color="blue")
    plt.xlabel('Data Points')
    plt.ylabel('Error(%)')
    plt.title('Relative Error of Reconstructing Test Set 1 with PPCA(' + str(ppca.num_components) + " components)")
    plt.show()
def calculate_for_Mnist_PCA(num_pics_to_load):
    """Reconstruct MNIST with scikit-learn PCA (200 components) as a baseline.

    Fits ``PCA`` on the first ``num_pics_to_load`` flattened training images,
    reconstructs them, prints the mean relative reconstruction error, plots
    the error histogram and shows 2 randomly chosen reconstructed digits
    followed by the matching originals.

    Args:
        num_pics_to_load: maximum number of training pictures to use.
    """
    n_components = 200

    # Only the training split is used here.
    X_train, y_train, _, _ = datasets().load_MNIST(mnist_dir)
    new_X_train = np.reshape(X_train, (X_train.shape[0], -1))
    new_X_train = new_X_train[:num_pics_to_load, :]  # take only the first num_pics pictures
    # Actual number of training rows (may be below num_pics_to_load).
    num_train = new_X_train.shape[0]

    pca = PCA(n_components=n_components)

    print("=======>Training Phase<=======")
    pca.fit(new_X_train)
    # Go through the estimator's own transform / inverse_transform so the
    # fitted mean is subtracted before projecting and added back afterwards;
    # multiplying by components_ directly skips that centring and therefore
    # does not give the PCA reconstruction.
    data_reduced = pca.transform(new_X_train)
    created_data = pca.inverse_transform(data_reduced)
    error_train = get_relative_error(new_X_train, created_data, num_train)
    print("The avg error of the dataset is: {0}".format(np.mean(error_train)))

    plt.figure()
    plt.xlabel('Error(%)')
    plt.ylabel('Count of examples')
    plt.title('Error of Reconstructing MNIST with PCA (' + str(n_components) + " components)")
    plt.hist(list(error_train), bins=100, color="#3F5D7D")
    plt.xlim([0, 100])
    plt.show()

    # Visualise a sample of reconstructed images next to the originals.
    created_data = np.reshape(created_data, (created_data.shape[0], 28, 28))
    # Randomly select up to 2 images among the rows that actually exist. The
    # sampled index is used for both the image and its label so that the
    # reconstructed/original pairs match.
    sample_idx = random.sample(range(num_train), min(2, num_train))
    for idx in sample_idx:
        plt.figure()
        # Clip before casting: reconstructed values can fall outside
        # [0, 255], and an out-of-range float -> uint8 cast does not saturate.
        plt.imshow(np.clip(created_data[idx], 0, 255).astype('uint8'))
        plt.xlabel("Actual Number {}".format(y_train[idx]))
        plt.title("Reconstructed image")
        plt.show()
    for idx in sample_idx:
        plt.figure()
        plt.imshow(X_train[idx].astype('uint8'))
        plt.xlabel("Actual Number {}".format(y_train[idx]))
        plt.title("Original Picture")
        plt.show()
if __name__ == '__main__':
    # Run every experiment whose module-level switch is enabled. The switches
    # are plain booleans, so they are tested by truthiness rather than with
    # an identity comparison against True.
    if cifar:
        # Do PPCA on CIFAR10 data set
        calculate_for_Cifar(num_pics_to_load=num_pics_CIFAR10)
    if mnist:
        # Do PPCA on Mnist data set
        calculate_for_Mnist(num_pics_to_load=num_pics_MNIST)
        # Do PCA on Mnist data set
        calculate_for_Mnist_PCA(num_pics_to_load=num_pics_MNIST)
    if multivariate:
        # Do PPCA on multivariate gaussian set
        calculate_for_Multivariate()