# comm_fn.py
from __future__ import print_function
import os.path
import argparse
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import learning_curve, train_test_split
from sklearn.preprocessing import StandardScaler
def load_glove_model(gloveFile, word_count=100000):
    """
    Loads a selection of the most common GloVe vectors into a dictionary.
    :param gloveFile: string
        Path to a text file of GloVe vectors. Each line contains a word
        string followed by an 'n_features'-dimensional vector describing
        that word.
    :param word_count: int, default: 100000
        Number of words to load from gloveFile.
    :return: dictionary
        {'word': vector}
        word = string of the word we wish to load
        vector = 'n_features'-d vector describing the word
    """
path = './data/gloveFile_done_%d.npy' % (word_count)
# Saves time by loading existing file, if available.
    if os.path.exists(path):
        # allow_pickle is needed to restore a pickled dictionary.
        glove = np.load(path, allow_pickle=True).item()
    else:
        glove = {}
        count = 0
        with open(gloveFile, 'r') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.asarray([float(val) for val in split_line[1:]])
                glove[word] = embedding
                count += 1
                if count >= word_count:
                    break
        # Saves the vectors, so we can load them faster next time.
        np.save(path, glove)
    return glove
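# Illustrative sketch of how load_glove_model is typically called. The path
# assumes the standard GloVe 840B/300d download sits in ./data/; the path and
# word_count here are placeholders, not part of the original pipeline.
def _example_load_glove_model():
    glove = load_glove_model('./data/glove.840B.300d.txt', word_count=50000)
    print(len(glove))          # number of vectors loaded, at most 50000
    print(glove['the'].shape)  # (300,) for the 300-d release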
def sentences2vec(sentences, glove):
    """ Uses the GloVe word vectors to convert whole sentences to single
    vectors by averaging the input vectors.
    :param sentences: list
        List of sentences, where each element is a list of words as strings.
        Example: [['first', 'sentence'], ['second', 'sentence'],...]
    :param glove: dictionary
        {'word': vector}, where vector is 'n_features'-dimensional.
    :return: list
        List of sentences, each represented as a single vector.
    """
sentence_vectors = []
for sentence in sentences:
        matched_words = len(sentence)
        # All vectors have the same dimensionality; use the vector for the
        # common token ',' to set the size of our new sentence vector.
        sum_of_words = np.zeros(len(glove[',']))
        # For words without a GloVe vector, add 0 (which broadcasts to a zero
        # vector). This is faster than checking membership and adjusting
        # matched_words to take the correct average.
        for word in sentence:
            sum_of_words += glove.get(word, 0)
        # In theory, every sentence should have at least one element. However,
        # for some datasets, such as CR, this check is necessary to make sure
        # we don't divide by zero. (Empty sentences are skipped, so the output
        # can be shorter than the input.)
if matched_words != 0:
sentence_vector = sum_of_words/matched_words
sentence_vectors.append(sentence_vector)
return sentence_vectors
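# Minimal self-contained sketch of sentences2vec, using a toy "GloVe"
# dictionary (including the ',' entry the function uses for sizing) rather
# than real embeddings.
def _example_sentences2vec():
    toy_glove = {',': np.zeros(3),
                 'good': np.array([1.0, 0.0, 1.0]),
                 'movie': np.array([0.0, 2.0, 1.0])}
    vecs = sentences2vec([['good', 'movie']], toy_glove)
    print(vecs[0])  # average of the two word vectors: [0.5 1.  1. ]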
def load_data(type='SO', loc='./data/', remove_stop=False):
    """Loads data from the directory, loc.
    :param type: string
        MR = movie review sentiment
        SO = subjective/objective classification
        CR = product reviews
        MPQA = opinion polarity
    :param loc: string
        Locates the directory in which the data is saved.
    :param remove_stop: bool, default: False
        If True, stopwords are removed from the loaded sentences.
    :return: pos, list
        Contains a list of all the positive sentence examples as embedded lists
        of word strings. Example:
        [['first', 'pos', 'sentence'], ['second', 'pos', 'sentence'],...]
    :return: neg, list
        Contains a list of all the negative sentence examples as embedded lists
        of word strings. Example:
        [['first', 'neg', 'sentence'], ['second', 'neg', 'sentence'],...]
    """
    if type == 'MR':
        pos, neg = load_mr(loc)
    elif type == 'SO':
        pos, neg = load_so(loc)
    elif type == 'CR':
        pos, neg = load_cr(loc)
    elif type == 'MPQA':
        pos, neg = load_mpqa(loc)
    else:
        raise ValueError('Unknown dataset type: %s' % type)
    # Remove stop words
    if remove_stop:
        pos = remove_stop_words(pos)
        neg = remove_stop_words(neg)
    return pos, neg
def load_so(loc):
"""Loads the objective vs subjective data, setting positive examples as
objective examples
:param loc: string
Locates the directory in which the data is saved.
:return: pos, list
Contains a list of all the positive sentence examples as embedded lists
of word strings. Example:
[['first', 'pos', 'sentence'], ['second', 'pos', 'sentence'],...]
:return: neg, list
Contains a list of all the negative sentence examples as embedded lists
of word strings. Example:
[['first', 'neg', 'sentence'], ['second', 'neg', 'sentence'],...]
"""
pos, neg = [], []
with open(loc + 'plot.tok.gt9.5000.txt', 'rb') as f:
for line in f:
pos.append(word_tokenize(line.decode('latin-1')))
with open(loc + 'quote.tok.gt9.5000.txt', 'rb') as f:
for line in f:
neg.append(word_tokenize(line.decode('latin-1')))
return pos, neg
def load_mr(loc):
"""Loads the positive vs negative movie reviews, setting positive
examples as the positive movie reviews.
:param loc: string
Locates the directory in which the data is saved.
:return: pos, list
Contains a list of all the positive sentence examples as embedded lists
of word strings. Example:
[['first', 'pos', 'sentence'], ['second', 'pos', 'sentence'],...]
:return: neg, list
Contains a list of all the negative sentence examples as embedded lists
of word strings. Example:
[['first', 'neg', 'sentence'], ['second', 'neg', 'sentence'],...]
"""
pos, neg = [], []
    # Open in binary mode and decode explicitly: these files contain bytes
    # that are not valid UTF-8, so we decode as Latin-1 instead.
    with open(loc + 'rt-polarity.neg.txt', 'rb') as f:
        for line in f:
            # Adds each sentence as a list of word strings.
            neg.append(word_tokenize(line.decode('latin-1')))
with open(loc + 'rt-polarity.pos.txt', 'rb') as f:
for line in f:
pos.append(word_tokenize(line.decode('latin-1')))
return pos, neg
def load_cr(loc):
"""Loads the positive vs negative customer reviews, setting positive
examples as the positive customer reviews.
:param loc: string
Locates the directory in which the data is saved.
:return: pos, list
Contains a list of all the positive sentence examples as embedded lists
of word strings. Example:
[['first', 'pos', 'sentence'], ['second', 'pos', 'sentence'],...]
:return: neg, list
Contains a list of all the negative sentence examples as embedded lists
of word strings. Example:
[['first', 'neg', 'sentence'], ['second', 'neg', 'sentence'],...]
"""
pos, neg = [], []
    with open(loc + 'custrev.neg.txt', 'rb') as f:
        for line in f:
            # Decode the raw bytes before tokenizing, as in the other loaders.
            neg.append(word_tokenize(line.decode('latin-1')))
    with open(loc + 'custrev.pos.txt', 'rb') as f:
        for line in f:
            pos.append(word_tokenize(line.decode('latin-1')))
return pos, neg
def load_mpqa(loc):
"""Loads the opinion polarity data in to positive and negative examples.
:param loc: string
Locates the directory in which the data is saved.
:return: pos, list
Contains a list of all the positive sentence examples as embedded lists
of word strings. Example:
[['first', 'pos', 'sentence'], ['second', 'pos', 'sentence'],...]
:return: neg, list
Contains a list of all the negative sentence examples as embedded lists
of word strings. Example:
[['first', 'neg', 'sentence'], ['second', 'neg', 'sentence'],...]
"""
pos, neg = [], []
    with open(loc + 'mpqa.pos.txt', 'rb') as f:
        for line in f:
            # Decode the raw bytes before tokenizing, as in the other loaders.
            pos.append(word_tokenize(line.decode('latin-1')))
    with open(loc + 'mpqa.neg.txt', 'rb') as f:
        for line in f:
            neg.append(word_tokenize(line.decode('latin-1')))
return pos, neg
def minibatch(rand, X_data, y_data, batch_size):
""" Takes a random batch of rows (examples) from X_data and y_data.
Parameters
----------
rand : RandomState numpy object
X_data : array-like, shape (n_samples, n_features)
Training samples.
y_data : array-like, shape (n_samples, 1)
Training labels.
    batch_size : int
        The size of the batch to be returned, i.e. the number of examples
        (rows) to be returned.
Returns
-------
X_batch : array-like, shape (batch_size, n_features)
Randomly selected set of training samples from X_data.
y_batch : array-like, shape (batch_size, 1)
Corresponding labels for the randomly selected examples from X_data.
"""
    # Note: rand.choice samples with replacement by default, so a batch can
    # contain repeated rows.
    indices = rand.choice(X_data.shape[0], batch_size)
X_batch, y_batch = X_data[indices], y_data[indices]
return X_batch, y_batch
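# Minimal self-contained sketch of minibatch on toy data; the seed and shapes
# are arbitrary.
def _example_minibatch():
    rand = np.random.RandomState(0)
    X = np.arange(20).reshape(10, 2)
    y = np.arange(10)
    X_batch, y_batch = minibatch(rand, X, y, batch_size=4)
    print(X_batch.shape, y_batch.shape)  # (4, 2) (4,)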
def plot_curve(clf, X_train, y_train, title, train_sizes, label='label',
               color='b', cv=3, figure=1):
    """ Plot curves using a cross-validation set for hyperparameter
    evaluation.
    This function uses k-fold cross-validation (default is 3-fold) to plot
    learning curves.
    Parameters
    ----------
    clf : Classifier
    X_train : array-like, shape (n_samples, n_features)
        Training samples.
    y_train : array-like, shape (n_samples, 1)
        Training labels.
    title : string
        Title of the chart.
    train_sizes : array-like
        Training-set sizes at which to evaluate the learning curve.
    label : string, default: 'label'
        Label corresponding to the curve which is to be plotted.
    color : string, default: 'b'
    cv : integer, default: 3
        Number of 'folds' in our cross-validation training split.
    figure : integer, default: 1
        Sets the current figure.
    Returns
    -------
    plt : matplotlib object
    """
plt.figure(figure)
plt.title(title)
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.grid()
train_sizes, train_scores, cv_scores = learning_curve(
clf, X_train, y_train, train_sizes=train_sizes, cv=cv)
cv_scores_mean = np.mean(cv_scores, axis=1)
plt.plot(train_sizes, cv_scores_mean, 'o-', color=color, label=label)
plt.legend(loc="best")
return plt
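# Illustrative sketch of plot_curve with a logistic-regression classifier.
# The classifier choice and the synthetic data are placeholders; train_sizes
# must stay below the size of a cv training fold.
def _example_plot_curve():
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] > 0).astype(int)
    plot_curve(LogisticRegression(), X, y, title='Learning curve',
               train_sizes=[20, 50, 100], label='logreg')
    plt.show()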
def remove_stop_words(full_sentences):
""" Removes stopwords from full_sentences.
Parameters
----------
full_sentences: list
Each item in the list represents a sentence. Each sentence contains
tokenized words.
Returns
-------
filtered_sentences : list
New list of sentences with stopwords removed.
"""
    # Use a set for O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    stop_words.update((',', '.'))  # Removing punctuation increases accuracy.
filtered_sentences = []
for sentence in full_sentences:
filtered_sentence = [w for w in sentence
if w.lower() not in stop_words]
filtered_sentences.append(filtered_sentence)
return filtered_sentences
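# Minimal sketch of remove_stop_words (requires the NLTK 'stopwords' corpus
# to be downloaded):
def _example_remove_stop_words():
    sentences = [['this', 'is', 'a', 'great', 'movie', '.']]
    print(remove_stop_words(sentences))  # [['great', 'movie']]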
def save_misclf_data(pos_ind, neg_ind, pos_sentences, neg_sentences, type):
    """
    Saves misclassified samples in two separate text documents.
    :param pos_ind: list
        Indices of all positive samples that were classified as negative.
    :param neg_ind: list
        Indices of all negative samples that were classified as positive.
    :param pos_sentences: list
        List of all positive samples.
    :param neg_sentences: list
        List of all negative samples.
    :param type: string
        Type of supervised learning problem e.g. 'SO'
    """
    pos_new = []
    for sentence in pos_sentences:
        # Join the tokens back into a single string to make the saved
        # sentences easier to read. The tokens are already decoded text,
        # so no re-encoding is needed.
        pos_new.append(" ".join(sentence))
    # These sentences should have been classified as negative.
    misclf_pos = [pos_new[i] for i in pos_ind]
    neg_new = []
    for sentence in neg_sentences:
        neg_new.append(" ".join(sentence))
    # These sentences should have been classified as positive. The indices in
    # neg_ind refer to the combined (pos + neg) array, so subtract the offset.
    misclf_neg = [neg_new[i - len(pos_sentences)] for i in neg_ind]
np.savetxt('./results/test_pos_%s.txt' % (type), misclf_pos,
delimiter='.', fmt='%s')
np.savetxt('./results/test_neg_%s.txt' % (type), misclf_neg,
delimiter='.', fmt='%s')
def PC_comparision(X_train, X_test):
    """
    Preprocesses training and test data with PCA, using 2, 5, 10, 20, 50, 100,
    150, 200, 250 and 300 PCs.
    :param X_train: array-like, shape (n_samples, n_features)
    :param X_test: array-like, shape (n_samples, n_features)
    :return: tuple of two dictionaries
        Two dictionaries (training and test) in the same format: the number
        of PCs used to construct the array is the key and the transformed
        array itself is the value.
    """
PCs = [2, 5, 10, 20, 50, 100, 150, 200, 250, 300]
X_train_PCs = {}
X_test_PCs = {}
for pc in PCs:
pca = PCA(n_components=pc)
pca.fit(X_train)
X_train1 = pca.transform(X_train)
X_test1 = pca.transform(X_test)
X_train_PCs[pc] = X_train1
X_test_PCs[pc] = X_test1
print('Collected all PCs')
return X_train_PCs, X_test_PCs
def process_pca(n_components, X_train, X_test):
    """Optional PCA preprocessing: fit PCA on the training data only, then
    project both training and test data onto the first n_components
    principal components."""
pca = PCA(n_components)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
return X_train, X_test
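# Minimal self-contained sketch of process_pca on random data; the shapes
# and component count are arbitrary.
def _example_process_pca():
    rng = np.random.RandomState(0)
    X_tr, X_te = rng.randn(100, 300), rng.randn(20, 300)
    X_tr2, X_te2 = process_pca(10, X_tr, X_te)
    print(X_tr2.shape, X_te2.shape)  # (100, 10) (20, 10)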
def standard(args=None):
    """Standard preprocessing pipeline: loads the GloVe vectors and the chosen
    dataset, averages word vectors into sentence vectors, splits into training
    and test sets, standardizes the features and (optionally) applies PCA.
    Returns X_train, X_test, y_train, y_test and X_ind, where X_ind maps rows
    of X_test back to indices in the original, unshuffled data."""
    # Dictionary {word: vector}, where each word is a key corresponding to a
    # 'n_features'-d row vector, shape (n_features,).
    model = load_glove_model('./data/glove.840B.300d.txt')
    # Load data into lists of positive and negative examples. If no args are
    # given, assume 'SO'. (A bare except would hide real errors, and args.stop
    # would fail again inside the handler when args is None.)
    if args is not None:
        pos, neg = load_data(args.input, remove_stop=args.stop)
    else:
        pos, neg = load_data('SO')
        print('Using default dataset: "SO"')
# Shape (n_pos_samples, n_features)
pos_vectors = np.asarray(sentences2vec(pos, model))
# Shape (n_neg_samples, n_features)
neg_vectors = np.asarray(sentences2vec(neg, model))
# Prepare a matrix containing both positive and negative samples
X = np.r_[pos_vectors, neg_vectors]
y = np.zeros(X.shape[0])
y[:pos_vectors.shape[0]] = 1.0
    # Prepend an index column so each sample can be traced after shuffling.
    X_with_ind = np.c_[np.arange(X.shape[0]), X]
# Randomly split the X and y arrays into 30/70 test/train split.
X_train, X_test, y_train, y_test = train_test_split(X_with_ind,
y,
test_size=0.3,
random_state=0)
    # Index to convert from X_test (shuffled data) back to X. For example, if
    # sample 3 in y_test is misclassified, we use the third value from X_ind
    # to get back the original index of the example. If this value is 300,
    # the 300th example has been misclassified.
X_ind = X_test[:, 0]
X_train = X_train[:, 1:]
X_test = X_test[:, 1:]
# Standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
    # Optional PCA preprocessing.
    if args is not None and args.pca:
        X_train, X_test = process_pca(args.pca, X_train, X_test)
return X_train, X_test, y_train, y_test, X_ind
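# Illustrative sketch of how the returned X_ind recovers which original
# sentences were misclassified. The LogisticRegression choice follows the
# argparse description below; pos/neg are reloaded here because standard()
# does not return them, and 'SO' matches standard()'s default.
def _example_misclassified():
    from sklearn.linear_model import LogisticRegression
    X_train, X_test, y_train, y_test, X_ind = standard()
    clf = LogisticRegression().fit(X_train, y_train)
    wrong = X_ind[clf.predict(X_test) != y_test].astype(int)
    pos, neg = load_data('SO')
    # Indices below len(pos) are positive samples; the rest are negative.
    pos_ind = [i for i in wrong if i < len(pos)]
    neg_ind = [i for i in wrong if i >= len(pos)]
    save_misclf_data(pos_ind, neg_ind, pos, neg, 'SO')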
def parse_options():
# Handles the command-line argument, which specifies the dataset to be used.
parser = argparse.ArgumentParser(description='Choose data to load into the'
' Logistic Regression'
' classifier.')
parser.add_argument('-i','--input',
help='Specify input data: "MR", "SO", "CR", "MPQA"',
choices=["MR", "SO", "CR", "MPQA"],
default="SO")
parser.add_argument('-p','--pca',
help='Specify the number of Principal components.',
type=int)
    parser.add_argument('-s', '--stop', action='store_true',
                        help='Using the -s or --stop flag removes stopwords '
                             'from the data.')
parser.add_argument('-d', '--data',
help='Specify the dataset size you would like to use for '
'producing learning curves.',
type=int,
nargs='+')
return parser.parse_args()
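# Minimal sketch of the intended entry point, wiring parse_options into
# standard(). The classifier itself lives in the script that imports this
# module, so only the preprocessing step is shown here.
if __name__ == '__main__':
    args = parse_options()
    X_train, X_test, y_train, y_test, X_ind = standard(args)
    print('Train: %s, Test: %s' % (str(X_train.shape), str(X_test.shape)))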