From 9b8ade928f17675e13d47b68397049354a8d0750 Mon Sep 17 00:00:00 2001
From: Jimmie Goode
Date: Wed, 14 Oct 2015 16:31:27 -0500
Subject: [PATCH 1/5] Added method to restrict vocab of Word2Vec most similar
 search

---
 gensim/models/word2vec.py | 84 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ea4c6af3bc..4994cc07fc 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1179,6 +1179,90 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None):
         # ignore (don't return) words from the input
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]
+
+    def most_similar_in_list(self, positive=[], negative=[], topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words. Positive words contribute positively towards the
+        similarity, negative words negatively.
+
+        This method computes cosine similarity between a simple mean of the projection
+        weight vectors of the given words and the vectors for each word in the model.
+        The method corresponds to the `word-analogy` and `distance` scripts in the original
+        word2vec implementation.
+
+        If topn is False, most_similar returns the vector of similarity scores.
+
+        `restrict_vocab` is optional. An integer value limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+        If `restrict_vocab` is a list, then only the vectors for the words in that list are
+        searched for most-similar values.
+
+        Example::
+
+          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
+          [('queen', 0.50882536), ...]
+
+        """
+        self.init_sims()
+
+        if isinstance(positive, string_types) and not negative:
+            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
+            positive = [positive]
+
+        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
+        positive = [
+            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
+            for word in positive
+        ]
+        negative = [
+            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
+            for word in negative
+        ]
+
+        # compute the weighted average of all words
+        all_words, mean = set(), []
+        for word, weight in positive + negative:
+            if isinstance(word, ndarray):
+                mean.append(weight * word)
+            elif word in self.vocab:
+                mean.append(weight * self.syn0norm[self.vocab[word].index])
+                all_words.add(self.vocab[word].index)
+            else:
+                raise KeyError("word '%s' not in vocabulary" % word)
+        if not mean:
+            raise ValueError("cannot compute similarity with no input")
+        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+        if restrict_vocab is None:
+            limited = self.syn0norm
+        elif isinstance(restrict_vocab, int):
+            # Same behavior as `most_similar` method
+            limited = self.syn0norm[:restrict_vocab]
+        elif isinstance(restrict_vocab, list):
+            restrict_vocab = list(set(restrict_vocab))
+            for word in restrict_vocab:
+                if word not in self.vocab:
+                    raise KeyError("word '%s' not in vocabulary" % word)
+            restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab]
+            limited = self.syn0norm[restrict_vocab_idx]
+
+        dists = dot(limited, mean)
+        if not topn:
+            return dists
+        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
+
+        # ignore (don't return) words from the input
+        if not isinstance(restrict_vocab, list):
+            result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
+        else:
+            result = []
+            for sim in best:
+                idx = restrict_vocab_idx[sim]
+                if idx not in all_words:
+                    result.append((self.index2word[idx], float(dists[sim])))
+        return result[:topn]
 
     def most_similar_cosmul(self, positive=[], negative=[], topn=10):
         """
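A minimal usage sketch of the list form of `restrict_vocab` added above, with a hypothetical corpus and candidate list; every word in the list must be in the model's vocabulary, otherwise a KeyError is raised::

    >>> model = Word2Vec(sentences, size=100, min_count=1)
    >>> model.most_similar_in_list(positive=['woman', 'king'], negative=['man'],
    ...                            restrict_vocab=['queen', 'prince', 'throne'])
    [('queen', ...), ...]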
+ + """ + self.init_sims() + + if isinstance(positive, string_types) and not negative: + # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + positive = [positive] + + # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words + positive = [ + (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word + for word in positive + ] + negative = [ + (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word + for word in negative + ] + + # compute the weighted average of all words + all_words, mean = set(), [] + for word, weight in positive + negative: + if isinstance(word, ndarray): + mean.append(weight * word) + elif word in self.vocab: + mean.append(weight * self.syn0norm[self.vocab[word].index]) + all_words.add(self.vocab[word].index) + else: + raise KeyError("word '%s' not in vocabulary" % word) + if not mean: + raise ValueError("cannot compute similarity with no input") + mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) + + if restrict_vocab is None: + limited = self.syn0norm + elif isinstance(restrict_vocab, int): + # Same behavior as `most_similar` method + limited = self.syn0norm[:restrict_vocab] + elif isinstance(restrict_vocab, list): + restrict_vocab = list(set(restrict_vocab)) + for word in restrict_vocab: + if word not in self.vocab: + raise KeyError("word '%s' not in vocabulary" % word) + restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab] + limited = self.syn0norm[restrict_vocab_idx] + + dists = dot(limited, mean) + if not topn: + return dists + best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) + + # ignore (don't return) words from the input + if not isinstance(restrict_vocab, list): + result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + else: + result = [] + for sim in best: + idx = restrict_vocab_idx[sim] + if idx not in all_words: + result.append((self.index2word[idx], float(dists[sim]))) + return result[:topn] def most_similar_cosmul(self, positive=[], negative=[], topn=10): """ From 51d0bc2e0e5eb90a6132d3817cda546c04ebfa08 Mon Sep 17 00:00:00 2001 From: Jimmie Goode Date: Tue, 27 Oct 2015 15:13:56 -0500 Subject: [PATCH 2/5] Removed old most_similar method, renamed new method --- gensim/models/word2vec.py | 62 --------------------------------------- 1 file changed, 62 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4994cc07fc..3296b44564 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1130,68 +1130,6 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None): If topn is False, most_similar returns the vector of similarity scores. - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example:: - - >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] 
- - """ - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words - positive = [ - (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in positive - ] - negative = [ - (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in negative - ] - - # compute the weighted average of all words - all_words, mean = set(), [] - for word, weight in positive + negative: - if isinstance(word, ndarray): - mean.append(weight * word) - elif word in self.vocab: - mean.append(weight * self.syn0norm[self.vocab[word].index]) - all_words.add(self.vocab[word].index) - else: - raise KeyError("word '%s' not in vocabulary" % word) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] - dists = dot(limited, mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def most_similar_in_list(self, positive=[], negative=[], topn=10, restrict_vocab=None): - """ - Find the top-N most similar words. Positive words contribute positively towards the - similarity, negative words negatively. - - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words and the vectors for each word in the model. - The method corresponds to the `word-analogy` and `distance` scripts in the original - word2vec implementation. - - If topn is False, most_similar returns the vector of similarity scores. - `restrict_vocab` is optional. An integer values limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would only check the first 10000 word vectors in the vocabulary order. (This may be From d389db1ebf9ddd5eadb8e973b218375516da9149 Mon Sep 17 00:00:00 2001 From: Jimmie Goode Date: Wed, 2 Dec 2015 16:20:23 -0600 Subject: [PATCH 3/5] Added support for pretrained word2vec model. --- gensim/models/word2vec.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 3296b44564..8fd973c904 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -342,7 +342,8 @@ def __init__( self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, sg=1, hs=1, negative=0, cbow_mean=0, hashfxn=hash, iter=1, null_word=0, - trim_rule=None, sorted_vocab=1): + trim_rule=None, sorted_vocab=1, + pretrained_model=None): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. 
From 3972f9c7079ab7fe68a2b873a1a588d8f7a4f39e Mon Sep 17 00:00:00 2001
From: jimgoo
Date: Thu, 21 Jan 2016 14:05:06 -0600
Subject: [PATCH 4/5] Removed unwanted cast to ASCII.

---
 gensim/models/word2vec.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8fd973c904..dbe7124589 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -966,7 +966,13 @@ def reset_weights(self):
                     self.syn0[i] = self.seeded_vector(str(word) + str(self.seed))
             else:
                 # construct deterministic seed from word AND seed argument
-                self.syn0[i] = self.seeded_vector(str(word) + str(self.seed))
+                if not isinstance(word, string_types):
+                    # allow for integer "words"
+                    word_str = str(word)
+                else:
+                    # keep Unicode words as-is; casting them to ASCII with str() can fail
+                    word_str = word
+                self.syn0[i] = self.seeded_vector(word_str + str(self.seed))
 
         if self.pretrained_model is not None:
             logger.info("set weights using %i pretrained vectors of a possible %i", num_pretrained, len(self.vocab))
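For context on the removed cast: under Python 2, forcing `str()` on a Unicode vocabulary word can raise an encoding error, which the new branch above avoids (illustrative only)::

    >>> str(u'café')
    Traceback (most recent call last):
        ...
    UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)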
From 3462e602e503f98261bbf95e29cff49bc7540f22 Mon Sep 17 00:00:00 2001
From: Jimmie Goode
Date: Tue, 9 Feb 2016 10:25:20 -0600
Subject: [PATCH 5/5] Triggering Travis build.

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8fd973c904..4b4fd64aa8 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1205,7 +1205,7 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None):
                 raise KeyError("word '%s' not in vocabulary" % word)
             restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab]
             limited = self.syn0norm[restrict_vocab_idx]
-
+
         dists = dot(limited, mean)
         if not topn:
             return dists