piskvorky · saroufimc1 · Dec 15, 2017 · Jan 17, 2018 · Feb 14, 2018 · menshikh-iv
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Author: Jayant Jain <jayantjain1992@gmail.com>
-# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
+# Copyright (C) 2017 Radim Rehurek <radim@rare-technologies.com>
+# Copyright (C) 2017 Carl Saroufim <carl_saroufim@hotmail.com>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
@@ -390,26 +390,30 @@ def struct_unpack(self, file_handle, fmt):
 
     def init_ngrams(self):
         """
-        Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams.
-        Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
-        vectors are discarded here to save space.
-
+        Computes ngrams of all words present in vocabulary and stores vectors for those ngrams.
         """
         self.wv.ngrams = {}
         all_ngrams = []
+        self.wv.syn0_vocab = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)
         self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)
 
         for w, vocab in self.wv.vocab.items():
             all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n)
-            self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index])
+            self.wv.syn0_vocab[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index])
 
-        all_ngrams = set(all_ngrams)
-        self.num_ngram_vectors = len(all_ngrams)
+        self.wv.hash2index = {}
         ngram_indices = []
+        new_hash_count = 0
         for i, ngram in enumerate(all_ngrams):
-            ngram_hash = ft_hash(ngram)
-            ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket)
-            self.wv.ngrams[ngram] = i
+            ngram_hash = ft_hash(ngram) % self.bucket
+            if ngram_hash in self.wv.hash2index:
+                self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
+            else:
+                ngram_indices.append(len(self.wv.vocab) + ngram_hash)
+                self.wv.hash2index[ngram_hash] = new_hash_count
+                self.wv.ngrams[ngram] = new_hash_count
+                new_hash_count = new_hash_count + 1
+
         self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
 
         ngram_weights = self.wv.syn0_ngrams
@@ -420,17 +424,17 @@ def init_ngrams(self):
         )
 
         for w, vocab in self.wv.vocab.items():
-            word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n)
-            for word_ngram in word_ngrams:
-                self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])
-
-            self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
+            word_vec = np.copy(self.wv.syn0_vocab[vocab.index])
+            ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n)
+            for ngram in ngrams:
+                word_vec += self.wv.syn0_ngrams[self.wv.ngrams[ngram]]
+            word_vec /= (len(ngrams) + 1)
+            self.wv.syn0[vocab.index] += word_vec
         logger.info(
             "loaded %s weight matrix for fastText model from %s",
             self.wv.syn0.shape, self.file_name
         )
 
-
 def compute_ngrams(word, min_n, max_n):
     BOW, EOW = ('<', '>')  # Used by FastText to attach to all words as prefix and suffix
     extended_word = BOW + word + EOW