word2vec.py
import sys

import torch
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm

from input_data import InputData
from model import SkipGramModel


class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=5):
        """Initialize class parameters.

        Args:
            input_file_name: Name of the input text file. Each line is one sentence with space-separated words.
            output_file_name: Name of the final embedding file.
            emb_dimension: Embedding dimension, typically 50 to 500.
            batch_size: Number of word pairs per forward pass.
            window_size: Maximum skip length between words.
            iteration: Number of training iterations over all word pairs.
            initial_lr: Initial learning rate.
            min_count: Minimal word frequency; words with lower frequency are filtered out.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)
    def train(self):
        """Run training over all batches and save the final embeddings.

        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        # self.skip_gram_model.save_embedding(
        #     self.data.id2word, 'begin_embedding.txt', self.use_cuda)
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            # Draw 5 negative context words per positive pair.
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            # Linearly decay the learning rate every 100,000 pairs.
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
if __name__ == '__main__':
    w2v = Word2Vec(input_file_name=sys.argv[1], output_file_name=sys.argv[2])
    w2v.train()
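# Example usage (a minimal sketch; "corpus.txt" and "embeddings.txt" are
# hypothetical filenames, and the keyword arguments simply restate the
# constructor parameters documented above):
#
#     python word2vec.py corpus.txt embeddings.txt
#
# or, from another script:
#
#     from word2vec import Word2Vec
#     w2v = Word2Vec(input_file_name='corpus.txt',
#                    output_file_name='embeddings.txt',
#                    emb_dimension=100,
#                    window_size=5,
#                    min_count=5)
#     w2v.train()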