'''
Galileo Steinberg
CS115B Spring 2024 Homework 4
Distributional Semantics Takes the SAT
'''
import random

import numpy as np
import scipy.linalg as scipy_linalg

random.seed(42)
def compute_co_occurrence(corpus):
    """
    Computes the co-occurrence matrix C, such that
    C[w, c] = the number of (w, c) and (c, w) bigrams in the corpus.
    Multiplies the entire matrix by 10 (to pretend that we see these
    sentences 10 times) and then smooths the counts by adding 1 to all cells.
    """
    # Sort the vocabulary so word ids (and thus the matrix) are deterministic.
    words = sorted(set(word for sentence in corpus for word in sentence.split()))
    word_to_id = {word: i for i, word in enumerate(words)}
    id_to_word = {i: word for word, i in word_to_id.items()}
    co_occurrence_matrix = np.zeros((len(words), len(words)), dtype=int)
    for sentence in corpus:
        tokens = sentence.split()
        # Count each forward bigram (w, c) once; adding the transpose below
        # folds in the (c, w) direction. (Counting both neighbors here AND
        # adding the transpose would double-count every pair.)
        for i in range(len(tokens) - 1):
            co_occurrence_matrix[word_to_id[tokens[i]], word_to_id[tokens[i + 1]]] += 1
    co_occurrence_matrix = co_occurrence_matrix + co_occurrence_matrix.T
    co_occurrence_matrix = 10 * co_occurrence_matrix + 1
    return co_occurrence_matrix, word_to_id, id_to_word
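# A minimal usage sketch of compute_co_occurrence (the toy corpus below is
# hypothetical, not the assignment's dist_sim_data.txt):
#
#   C, w2i, i2w = compute_co_occurrence(["men feed dogs", "dogs bite men"])
#   C[w2i['men'], w2i['feed']]  # "men feed" occurs once -> 1 * 10 + 1 = 11
#   C[w2i['men'], w2i['dogs']]  # never adjacent         -> 0 * 10 + 1 = 1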
def compute_PPMI(c):
    '''
    Computes the positive pointwise mutual information (PPMI) for each
    word w and context word c:
        PPMI(w, c) = max(log2(P(w, c) / (P(w) * P(c))), 0)
    '''
    total_sum = c.sum()
    words_sum = c.sum(axis=1, keepdims=True)    # row marginals
    context_sum = c.sum(axis=0, keepdims=True)  # column marginals
    P_w_c = c / total_sum
    P_w = words_sum / total_sum
    P_c = context_sum / total_sum
    PPMI = np.maximum(np.log2(P_w_c / (P_w * P_c)), 0)
    PPMI[np.isnan(PPMI)] = 0  # guard against 0/0 cells
    return PPMI
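# A worked instance of the formula above (hypothetical counts): if
# C = [[4, 2], [2, 4]], then total = 12, P(w0, c1) = 2/12, and
# P(w0) = P(c1) = 6/12, so
#   PMI(w0, c1) = log2((2/12) / (0.5 * 0.5)) = log2(2/3) ~ -0.585,
# which PPMI clips to 0, while the diagonal gets
#   PMI(w0, c0) = log2((4/12) / 0.25) = log2(4/3) ~ 0.415.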
def part1():
    """
    Runs part one of Homework 4.
    Builds the co-occurrence matrix and the PPMI matrix, and prints the
    'dogs' row of each.
    Evaluates word similarity with Euclidean distance for each word pair.
    Reduces dimensions with SVD and checks the distances on the word
    pairs again.
    """
    pairs = [
        ("women", "men"),
        ("women", "dogs"),
        ("men", "dogs"),
        ("feed", "like"),
        ("feed", "bite"),
        ("like", "bite"),
    ]
    with open('dist_sim_data.txt', 'r') as file:
        corpus = [line.strip() for line in file if line.strip()]
    C, word_to_id, id_to_word = compute_co_occurrence(corpus)
    PPMI = compute_PPMI(C)
    dogs_index = word_to_id['dogs']
    vector_before = C[dogs_index]
    vector_after = PPMI[dogs_index]
    print("Word vector for 'dogs' before PPMI reweighting:", vector_before)
    print("Word vector for 'dogs' after PPMI reweighting:", vector_after)
    print("\nEuclidean distance between word pairs:")
    for word1, word2 in pairs:
        idx1, idx2 = word_to_id[word1], word_to_id[word2]
        vector1 = PPMI[idx1, :]
        vector2 = PPMI[idx2, :]
        distance = scipy_linalg.norm(vector1 - vector2)
        print(f"Distance between {word1} and {word2}: {distance}")
    # Full SVD: PPMI = U @ E @ Vt, with the singular values returned as a vector.
    U, E, Vt = scipy_linalg.svd(PPMI, full_matrices=False)
    E = np.diag(E)  # promote the singular values to a diagonal matrix
    # Sanity check: the factorization reconstructs the original matrix.
    print("\n", np.allclose(PPMI, U.dot(E).dot(Vt)))
    V = Vt.T  # V = conjugate transpose of Vt
    # Project onto the top 3 right singular vectors (rank-3 reduction).
    reduced_PPMI = PPMI.dot(V[:, 0:3])
    print("\nEuclidean distances between word pairs on reduced PPMI matrix:")
    for word1, word2 in pairs:
        idx1, idx2 = word_to_id[word1], word_to_id[word2]
        vector1 = reduced_PPMI[idx1, :]
        vector2 = reduced_PPMI[idx2, :]
        distance = scipy_linalg.norm(vector1 - vector2)
        print(f"Distance between {word1} and {word2} (reduced space): {distance}")
# PART 2.1
def load_word_vectors(file_path):
    """Loads whitespace-separated word vectors: each line is a word followed by its components."""
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            word_vectors[word] = vector
    return word_vectors
def load_synonyms_list(file_path):
    """Loads a tab-separated verb/synonym list into a dict of verb -> [synonyms]."""
    synonyms = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f)  # Assuming the first line is a header and should be skipped
        for line in f:
            verb, synonym = line.strip().split('\t')
            # Remove the 'to_' prefix from both verb and synonym if present
            verb = verb.replace('to_', '')
            synonym = synonym.replace('to_', '')
            if verb not in synonyms:
                synonyms[verb] = []
            synonyms[verb].append(synonym)
    return synonyms
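# Hypothetical illustration of the input format the parser above assumes
# (a header line, then tab-separated verb/synonym pairs prefixed with 'to_'):
#
#   to_accept<TAB>to_take
#
# would be stored as synonyms['accept'] == ['take'].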
def create_synonym_test_cases():
    """Builds 1,000 multiple-choice questions (verb, correct synonym, 4 distractors)
    and writes them to synonym_test_set.txt."""
    synonyms_list = load_synonyms_list('EN_syn_verb.txt')
    test_cases = []
    verbs = list(synonyms_list.keys())  # All available verbs from the synonyms list
    all_possible_non_synonyms = set(verbs)
    mcq_count = 0
    while mcq_count < 1000:
        for verb in verbs:
            if mcq_count >= 1000:
                break
            synonyms = synonyms_list[verb]  # Retrieve synonyms for the current verb
            for synonym in synonyms:
                # Distractors are sampled from verbs that are neither the
                # target verb nor one of its listed synonyms.
                current_non_synonyms_pool = list(all_possible_non_synonyms - {verb} - set(synonyms))
                if len(current_non_synonyms_pool) < 4:
                    continue
                non_synonyms = random.sample(current_non_synonyms_pool, 4)
                test_cases.append((verb, synonym, non_synonyms))
                mcq_count += 1
                if mcq_count == 1000:
                    break
    with open('synonym_test_set.txt', 'w', encoding='utf-8') as f:
        for verb, correct_synonym, distractors in test_cases:
            distractors_str = ' '.join(distractors)
            f.write(f"{verb} {correct_synonym} {distractors_str}\n")
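# Each line of synonym_test_set.txt is space-separated:
#   <verb> <correct_synonym> <distractor_1> ... <distractor_4>
# e.g. (hypothetical): "accept take wander sleep shout borrow"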
def read_synonym_test_cases_from_file(file_path):
    """Reads the generated multiple-choice questions back from disk."""
    test_cases = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            verb = parts[0]
            correct_synonym = parts[1]
            distractors = parts[2:]
            test_cases.append((verb, correct_synonym, distractors))
    return test_cases
def compute_euclidean_distance(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)

def compute_cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
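# Sanity check for the two metrics (hypothetical vectors):
#   v1, v2 = np.array([1.0, 0.0]), np.array([1.0, 1.0])
#   compute_euclidean_distance(v1, v2)  # -> 1.0
#   compute_cosine_similarity(v1, v2)   # -> 1 / sqrt(2) ~ 0.707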
def evaluate_test_cases(test_cases, word_vectors):
    """Scores each question by picking the nearest choice to the verb under each
    metric, then prints the accuracy of both metrics."""
    correct_euclidean, correct_cosine = 0, 0
    num_evaluated = 0
    for verb, correct_synonym, distractors in test_cases:
        # Perform the evaluation only if the verb and correct synonym are in the word vectors
        if verb in word_vectors and correct_synonym in word_vectors:
            num_evaluated += 1
            verb_vector = word_vectors[verb]
            choices = distractors + [correct_synonym]
            choices_vectors = [(choice, word_vectors[choice]) for choice in choices if choice in word_vectors]
            # Compute Euclidean distances and cosine similarities
            distances = [(word, compute_euclidean_distance(verb_vector, vec)) for word, vec in choices_vectors]
            similarities = [(word, compute_cosine_similarity(verb_vector, vec)) for word, vec in choices_vectors]
            # Identify the choice closest to the verb's vector under each metric
            closest_by_distance = min(distances, key=lambda x: x[1])[0]
            closest_by_similarity = max(similarities, key=lambda x: x[1])[0]
            # Increment correct counters
            if closest_by_distance == correct_synonym:
                correct_euclidean += 1
            if closest_by_similarity == correct_synonym:
                correct_cosine += 1
    # Print results
    accuracy_euclidean = correct_euclidean / num_evaluated if num_evaluated else 0
    accuracy_cosine = correct_cosine / num_evaluated if num_evaluated else 0
    print(f"Accuracy using Euclidean distance: {accuracy_euclidean:.2%}")
    print(f"Accuracy using Cosine similarity: {accuracy_cosine:.2%}")
def run_synonym_test():
    create_synonym_test_cases()
    # Load both Google's word2vec and COMPOSES word vectors
    google_word_vectors = load_word_vectors('GoogleNews-vectors-negative300-filtered.txt')
    composes_word_vectors = load_word_vectors('EN-wform.w.2.ppmi.svd.500-filtered.txt')
    test_cases = read_synonym_test_cases_from_file('synonym_test_set.txt')
    print("\nSynonym Test Evaluation with word2vec:")
    evaluate_test_cases(test_cases, google_word_vectors)
    print("\nSynonym Test Evaluation with COMPOSES:")
    evaluate_test_cases(test_cases, composes_word_vectors)
# PART 2.2
def load_sat_questions(file_path):
    """Load SAT analogy questions, handling comment lines, block separation,
    source lines, and per-line POS tags."""
    filtered_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.startswith('#'):
                filtered_lines.append(line.rstrip())
    content = '\n'.join(filtered_lines)  # Reconstruct the content without comment lines
    # Split the content into blocks separated by blank lines; each block is one question
    content_blocks = content.strip().split('\n\n')
    sat_questions = []
    for block in content_blocks:
        lines = block.split('\n')[1:]  # Drop the source line at the top of the block
        stem = lines[0][:-4].split()  # Split the stem line into words, discarding the POS tag
        choices = [line[:-4].split() for line in lines[1:6]]  # Same for the five choice lines
        correct_choice = lines[-1].strip()  # The correct choice letter is the last line
        sat_questions.append({
            'stem': stem,  # A list of two words [word1, word2]
            'choices': choices,  # A list of word pairs [[word1, word2], ...]
            'correct_letter': correct_choice
        })
    return sat_questions
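# Sketch of the block layout the parser above assumes (the content shown is
# hypothetical; see SAT-package-V3.txt for the real data). Each blank-line
# separated block, after comment lines are removed, looks like:
#
#   <source line>          <- dropped by the [1:] slice
#   ostrich bird n:n       <- stem pair plus a 4-character POS suffix
#   lion cat n:n           <- first of five choice pairs (a-e)
#   ...
#   a                      <- the correct choice letter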
def solve_analogy(stem, choices, word_vectors):
    """Picks the choice pair whose difference vector is most cosine-similar
    to the stem pair's difference vector."""
    # Attempt to retrieve vectors for the stem words. Skip this question if either vector is missing.
    if stem[0] not in word_vectors or stem[1] not in word_vectors:
        return None
    stem_vec_diff = word_vectors[stem[0]] - word_vectors[stem[1]]
    best_choice = None
    max_similarity = float('-inf')
    # Loop through the choices to find the one most similar to the stem's relation vector
    for i, (choice_a, choice_b) in enumerate(choices):
        if choice_a not in word_vectors or choice_b not in word_vectors:
            continue  # Skip choices with missing vectors
        choice_vec_diff = word_vectors[choice_a] - word_vectors[choice_b]
        similarity = compute_cosine_similarity(stem_vec_diff, choice_vec_diff)
        if similarity > max_similarity:
            max_similarity = similarity
            best_choice = chr(ord('a') + i)
    return best_choice
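# solve_analogy implements the vector-offset method: it scores
# "stem_a : stem_b :: choice_a : choice_b" by comparing difference vectors.
# A toy illustration (hypothetical vectors):
#
#   vecs = {'king': np.array([2.0, 1.0]), 'queen': np.array([2.0, 0.0]),
#           'man':  np.array([1.0, 1.0]), 'woman': np.array([1.0, 0.0])}
#   solve_analogy(['king', 'queen'], [['man', 'woman'], ['woman', 'man']], vecs)
#   # -> 'a', since king - queen == man - woman == [0., 1.]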
def evaluate_sat(sat_questions, word_vectors, vectors_name):
    correct, total = 0, 0
    for question in sat_questions:
        answer = solve_analogy(question['stem'], question['choices'], word_vectors)
        if answer and answer == question['correct_letter']:
            correct += 1
        if answer:  # Only count questions where an answer could be generated
            total += 1
    # Calculate and print accuracy directly within the function
    accuracy = correct / total if total > 0 else 0
    print(f"{vectors_name} Accuracy: {accuracy:.2%}")
def run_sat_test():
    """
    Sets up the SAT test, loads the word embeddings, and runs the evaluation.
    Prints the overall accuracy of the SAT task.
    """
    google_word_vectors = load_word_vectors('GoogleNews-vectors-negative300-filtered.txt')
    composes_word_vectors = load_word_vectors('EN-wform.w.2.ppmi.svd.500-filtered.txt')
    sat_questions = load_sat_questions('SAT-package-V3.txt')
    # Evaluate SAT questions for word2vec
    print("\nSAT Analogy Test Evaluation with word2vec:")
    evaluate_sat(sat_questions, google_word_vectors, "word2vec")
    # Evaluate SAT questions for COMPOSES
    print("\nSAT Analogy Test Evaluation with COMPOSES:")
    evaluate_sat(sat_questions, composes_word_vectors, "COMPOSES")
def part2():
    """
    Runs the two tasks for part two of Homework 4.
    """
    run_synonym_test()
    run_sat_test()

if __name__ == "__main__":
    # DO NOT MODIFY HERE
    part1()
    part2()