piskvorky · menshikh-iv · Jun 28, 2017 · May 22, 2017 · May 22, 2017 · May 22, 2017
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
@@ -35,7 +35,7 @@
 import numpy as np
 from numpy import float32 as REAL, sqrt, newaxis
 from gensim import utils
-from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.keyedvectors import KeyedVectors, Vocab
 from gensim.models.word2vec import Word2Vec
 
 from six import string_types
@@ -233,11 +233,12 @@ def load_fasttext_format(cls, model_file, encoding='utf8'):
 
         `model_file` is the path to the FastText output files.
         FastText outputs two training files - `/path/to/train.vec` and `/path/to/train.bin`
-        Expected value for this example: `/path/to/train`
+        Expected value for this example: `/path/to/train`. However, only the `.bin`
+        file is needed to load the entire model.
 
         """
         model = cls()
-        model.wv = cls.load_word2vec_format('%s.vec' % model_file, encoding=encoding)
+        model.file_name = model_file
         model.load_binary_data('%s.bin' % model_file, encoding=encoding)
         return model
 
@@ -284,12 +285,12 @@ def load_model_params(self, file_handle):
     def load_dict(self, file_handle, encoding='utf8'):
         vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
         # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
-        assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
-        assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes'
+        logger.info("loading vocabulary words for fastText model from %s.bin", self.file_name)
+
         self.struct_unpack(file_handle, '@1q')  # number of tokens
         if self.new_format:
             pruneidx_size, = self.struct_unpack(file_handle, '@q')
-        for i in range(nwords):
+        for i in range(vocab_size):
             word_bytes = b''
             char_byte = file_handle.read(1)
             # Read vocab word
@@ -298,8 +299,25 @@ def load_dict(self, file_handle, encoding='utf8'):
                 char_byte = file_handle.read(1)
             word = word_bytes.decode(encoding)
             count, _ = self.struct_unpack(file_handle, '@qb')
-            assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
-            self.wv.vocab[word].count = count
+
+            if i == nwords and i < vocab_size:
+                """
+                To handle the error in pretrained vector wiki.fr (French).
+                For more info : https://github.com/facebookresearch/fastText/issues/218
+
+                """
+                assert word == "__label__"
+                continue   # don't add word to vocab
+
+            self.wv.vocab[word] = Vocab(index=i, count=count)
+            self.wv.index2word.append(word)
+
+        assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
+        if len(self.wv.vocab) != vocab_size:
+            logger.warning("mismatch between vocab sizes")
+            logger.warning("If you are loading any model other than pretrained vector wiki.fr, ")
+            logger.warning("Please report to Gensim.")
+
 
         if self.new_format:
             for j in range(pruneidx_size):
@@ -337,8 +355,12 @@ def init_ngrams(self):
         """
         self.wv.ngrams = {}
         all_ngrams = []
-        for w, v in self.wv.vocab.items():
+        self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL)
+
+        for w, vocab in self.wv.vocab.items():
             all_ngrams += self.compute_ngrams(w, self.wv.min_n, self.wv.max_n)
+            self.wv.syn0[vocab.index] += np.array(self.wv.syn0_all[vocab.index])
+
         all_ngrams = set(all_ngrams)
         self.num_ngram_vectors = len(all_ngrams)
         ngram_indices = []
@@ -348,6 +370,18 @@ def init_ngrams(self):
             self.wv.ngrams[ngram] = i
         self.wv.syn0_all = self.wv.syn0_all.take(ngram_indices, axis=0)
 
+        ngram_weights = self.wv.syn0_all
+
+        logger.info("loading weights for %s vocabulary words for fastText models from %s.bin", len(self.wv.vocab), self.file_name)
+
+        for w, vocab in self.wv.vocab.items():
+            word_ngrams = self.compute_ngrams(w, self.wv.min_n, self.wv.max_n)
+            for word_ngram in word_ngrams:
+                self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]])
+
+            self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1)
+        logger.info("loaded %s weight matrix for fastText model from %s.bin", self.wv.syn0.shape, self.file_name)
+
     @staticmethod
     def compute_ngrams(word, min_n, max_n):
         ngram_indices = []

diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py
@@ -64,7 +64,6 @@ def testTraining(self):
         self.model_sanity(trained_model)
 
         # Tests temporary training files deleted
-        self.assertFalse(os.path.exists('%s.vec' % testfile()))
         self.assertFalse(os.path.exists('%s.bin' % testfile()))
 
     def testMinCount(self):
@@ -115,7 +114,7 @@ def testNormalizedVectorsNotSaved(self):
         self.assertTrue(loaded_kv.syn0_all_norm is None)
 
     def testLoadFastTextFormat(self):
-        """Test model successfully loaded from fastText .vec and .bin files"""
+        """Test model successfully loaded from fastText .bin files"""
         try:
             model = fasttext.FastText.load_fasttext_format(self.test_model_file)
         except Exception as exc:
@@ -166,7 +165,7 @@ def testLoadFastTextFormat(self):
         self.model_sanity(model)
 
     def testLoadFastTextNewFormat(self):
-        """ Test model successfully loaded from fastText (new format) .vec and .bin files """
+        """ Test model successfully loaded from fastText (new format) .bin files """
         try:
             new_model = fasttext.FastText.load_fasttext_format(self.test_new_model_file)
         except Exception as exc: