From 9b8ade928f17675e13d47b68397049354a8d0750 Mon Sep 17 00:00:00 2001
From: Jimmie Goode
Date: Wed, 14 Oct 2015 16:31:27 -0500
Subject: [PATCH 1/5] Added method to restrict vocab of Word2Vec most similar
 search

---
 gensim/models/word2vec.py | 84 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ea4c6af3bc..4994cc07fc 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1179,6 +1179,90 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None):
         # ignore (don't return) words from the input
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]
+
+    def most_similar_in_list(self, positive=[], negative=[], topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words. Positive words contribute positively towards the
+        similarity, negative words negatively.
+
+        This method computes cosine similarity between a simple mean of the projection
+        weight vectors of the given words and the vectors for each word in the model.
+        The method corresponds to the `word-analogy` and `distance` scripts in the original
+        word2vec implementation.
+
+        If topn is False, most_similar returns the vector of similarity scores.
+
+        `restrict_vocab` is optional. An integer value limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+        If `restrict_vocab` is a list, then only the vectors for the words in that list are
+        searched for most-similar values.
+
+        Example::
+
+          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
+          [('queen', 0.50882536), ...]
+
+        """
+        self.init_sims()
+
+        if isinstance(positive, string_types) and not negative:
+            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
+            positive = [positive]
+
+        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
+        positive = [
+            (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
+            for word in positive
+        ]
+        negative = [
+            (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
+            for word in negative
+        ]
+
+        # compute the weighted average of all words
+        all_words, mean = set(), []
+        for word, weight in positive + negative:
+            if isinstance(word, ndarray):
+                mean.append(weight * word)
+            elif word in self.vocab:
+                mean.append(weight * self.syn0norm[self.vocab[word].index])
+                all_words.add(self.vocab[word].index)
+            else:
+                raise KeyError("word '%s' not in vocabulary" % word)
+        if not mean:
+            raise ValueError("cannot compute similarity with no input")
+        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+        if restrict_vocab is None:
+            limited = self.syn0norm
+        elif isinstance(restrict_vocab, int):
+            # Same behavior as `most_similar` method
+            limited = self.syn0norm[:restrict_vocab]
+        elif isinstance(restrict_vocab, list):
+            restrict_vocab = list(set(restrict_vocab))
+            for word in restrict_vocab:
+                if word not in self.vocab:
+                    raise KeyError("word '%s' not in vocabulary" % word)
+            restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab]
+            limited = self.syn0norm[restrict_vocab_idx]
+
+        dists = dot(limited, mean)
+        if not topn:
+            return dists
+        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
+
+        # ignore (don't return) words from the input
+        if not isinstance(restrict_vocab, list):
+            result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
+        else:
+            result = []
+            for sim in best:
+                idx = restrict_vocab_idx[sim]
+                if idx not in all_words:
+                    result.append((self.index2word[idx], float(dists[sim])))
+        return result[:topn]
 
     def most_similar_cosmul(self, positive=[], negative=[], topn=10):
         """
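A minimal usage sketch of the list form of `restrict_vocab` added above, with a hypothetical corpus and candidate list; every word in the list must be in the model's vocabulary, otherwise a KeyError is raised::

    >>> model = Word2Vec(sentences, size=100, min_count=1)
    >>> model.most_similar_in_list(positive=['woman', 'king'], negative=['man'],
    ...                            restrict_vocab=['queen', 'prince', 'throne'])
    [('queen', ...), ...]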
+ + """ + self.init_sims() + + if isinstance(positive, string_types) and not negative: + # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + positive = [positive] + + # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words + positive = [ + (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word + for word in positive + ] + negative = [ + (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word + for word in negative + ] + + # compute the weighted average of all words + all_words, mean = set(), [] + for word, weight in positive + negative: + if isinstance(word, ndarray): + mean.append(weight * word) + elif word in self.vocab: + mean.append(weight * self.syn0norm[self.vocab[word].index]) + all_words.add(self.vocab[word].index) + else: + raise KeyError("word '%s' not in vocabulary" % word) + if not mean: + raise ValueError("cannot compute similarity with no input") + mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) + + if restrict_vocab is None: + limited = self.syn0norm + elif isinstance(restrict_vocab, int): + # Same behavior as `most_similar` method + limited = self.syn0norm[:restrict_vocab] + elif isinstance(restrict_vocab, list): + restrict_vocab = list(set(restrict_vocab)) + for word in restrict_vocab: + if word not in self.vocab: + raise KeyError("word '%s' not in vocabulary" % word) + restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab] + limited = self.syn0norm[restrict_vocab_idx] + + dists = dot(limited, mean) + if not topn: + return dists + best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) + + # ignore (don't return) words from the input + if not isinstance(restrict_vocab, list): + result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] + else: + result = [] + for sim in best: + idx = restrict_vocab_idx[sim] + if idx not in all_words: + result.append((self.index2word[idx], float(dists[sim]))) + return result[:topn] def most_similar_cosmul(self, positive=[], negative=[], topn=10): """ From 51d0bc2e0e5eb90a6132d3817cda546c04ebfa08 Mon Sep 17 00:00:00 2001 From: Jimmie Goode Date: Tue, 27 Oct 2015 15:13:56 -0500 Subject: [PATCH 2/5] Removed old most_similar method, renamed new method --- gensim/models/word2vec.py | 62 --------------------------------------- 1 file changed, 62 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4994cc07fc..3296b44564 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1130,68 +1130,6 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None): If topn is False, most_similar returns the vector of similarity scores. - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example:: - - >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] 
- - """ - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words - positive = [ - (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in positive - ] - negative = [ - (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in negative - ] - - # compute the weighted average of all words - all_words, mean = set(), [] - for word, weight in positive + negative: - if isinstance(word, ndarray): - mean.append(weight * word) - elif word in self.vocab: - mean.append(weight * self.syn0norm[self.vocab[word].index]) - all_words.add(self.vocab[word].index) - else: - raise KeyError("word '%s' not in vocabulary" % word) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] - dists = dot(limited, mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def most_similar_in_list(self, positive=[], negative=[], topn=10, restrict_vocab=None): - """ - Find the top-N most similar words. Positive words contribute positively towards the - similarity, negative words negatively. - - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words and the vectors for each word in the model. - The method corresponds to the `word-analogy` and `distance` scripts in the original - word2vec implementation. - - If topn is False, most_similar returns the vector of similarity scores. - `restrict_vocab` is optional. An integer values limits the range of vectors which are searched for most-similar values. For example, restrict_vocab=10000 would only check the first 10000 word vectors in the vocabulary order. (This may be From d389db1ebf9ddd5eadb8e973b218375516da9149 Mon Sep 17 00:00:00 2001 From: Jimmie Goode Date: Wed, 2 Dec 2015 16:20:23 -0600 Subject: [PATCH 3/5] Added support for pretrained word2vec model. --- gensim/models/word2vec.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 3296b44564..8fd973c904 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -342,7 +342,8 @@ def __init__( self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, sg=1, hs=1, negative=0, cbow_mean=0, hashfxn=hash, iter=1, null_word=0, - trim_rule=None, sorted_vocab=1): + trim_rule=None, sorted_vocab=1, + pretrained_model=None): """ Initialize the model from an iterable of `sentences`. Each sentence is a list of words (unicode strings) that will be used for training. 
From 3972f9c7079ab7fe68a2b873a1a588d8f7a4f39e Mon Sep 17 00:00:00 2001
From: jimgoo
Date: Thu, 21 Jan 2016 14:05:06 -0600
Subject: [PATCH 4/5] Removed unwanted cast to ASCII.

---
 gensim/models/word2vec.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8fd973c904..dbe7124589 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -966,7 +966,13 @@ def reset_weights(self):
                     self.syn0[i] = self.seeded_vector(str(word) + str(self.seed))
             else:
                 # construct deterministic seed from word AND seed argument
-                self.syn0[i] = self.seeded_vector(str(word) + str(self.seed))
+                if not isinstance(word, string_types):
+                    # allow for integer "words"
+                    word_str = str(word)
+                else:
+                    # keep Unicode words as-is; casting them to ASCII with str() can fail
+                    word_str = word
+                self.syn0[i] = self.seeded_vector(word_str + str(self.seed))
 
         if self.pretrained_model is not None:
             logger.info("set weights using %i pretrained vectors of a possible %i", num_pretrained, len(self.vocab))
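For context on the removed cast: under Python 2, forcing `str()` on a Unicode vocabulary word can raise an encoding error, which the new branch above avoids (illustrative only)::

    >>> str(u'café')
    Traceback (most recent call last):
        ...
    UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 3: ordinal not in range(128)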
From 3462e602e503f98261bbf95e29cff49bc7540f22 Mon Sep 17 00:00:00 2001
From: Jimmie Goode
Date: Tue, 9 Feb 2016 10:25:20 -0600
Subject: [PATCH 5/5] Triggering Travis build.

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8fd973c904..4b4fd64aa8 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1205,7 +1205,7 @@ def most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None):
                 raise KeyError("word '%s' not in vocabulary" % word)
             restrict_vocab_idx = [self.vocab[word].index for word in restrict_vocab]
             limited = self.syn0norm[restrict_vocab_idx]
-
+
         dists = dot(limited, mean)
         if not topn:
             return dists