From 76d194b3701ad176119d09913b321806c6aa3fc9 Mon Sep 17 00:00:00 2001 From: Fernando Camargo Date: Thu, 21 Jun 2018 22:02:10 -0300 Subject: [PATCH] Add `ns_exponent` parameter to control the negative sampling distribution for `*2vec` models. Fix #2090 (#2093) * Adding ns_exponent parameter to control the negative sampling distribution. * Fixed a code style problem. * Updated the documentation of the ns_exponent parameter. --- gensim/models/base_any2vec.py | 7 ++++++- gensim/models/doc2vec.py | 18 +++++++++++++++--- gensim/models/fasttext.py | 16 +++++++++++----- gensim/models/word2vec.py | 25 ++++++++++++++++--------- 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 3b6dabf95e..5cdb54930d 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -526,7 +526,7 @@ def _set_train_params(self, **kwargs): raise NotImplementedError() def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, - trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, + trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): """ @@ -603,6 +603,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac self.min_alpha = float(min_alpha) self.hs = int(hs) self.negative = int(negative) + self.ns_exponent = ns_exponent self.cbow_mean = int(cbow_mean) self.compute_loss = bool(compute_loss) self.running_training_loss = 0 @@ -1098,6 +1099,10 @@ def load(cls, *args, **kwargs): """ model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) + if not hasattr(model, 'ns_exponent'): + model.ns_exponent = 0.75 + if not hasattr(model.vocabulary, 'ns_exponent'): + model.vocabulary.ns_exponent = 0.75 if model.negative and hasattr(model.wv, 'index2word'): model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary if not hasattr(model, 'corpus_count'): diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c2cfd1e16d..d73e6e777a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -483,6 +483,12 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. dm_mean : {1,0}, optional If 0 , use the sum of the context word vectors. If 1, use the mean. Only applies when `dm` is used in non-concatenative mode. 
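A minimal sketch, separate from the patch itself, of how different `ns_exponent` values reshape a raw unigram frequency distribution into the negative-sampling distribution described in the docstring above; the word counts are invented for illustration, and the exponentiation mirrors what `make_cum_table` does with `self.ns_exponent` further down in this patch.

    # Illustrative only; counts are made up for the example.
    counts = {"the": 1000, "movie": 120, "obscure": 3}

    def ns_distribution(counts, ns_exponent):
        """Probability of drawing each word as a negative sample."""
        weights = {word: count ** ns_exponent for word, count in counts.items()}
        total = sum(weights.values())
        return {word: weight / total for word, weight in weights.items()}

    print(ns_distribution(counts, 1.0))   # proportional to raw frequency
    print(ns_distribution(counts, 0.75))  # the classic Word2Vec default
    print(ns_distribution(counts, 0.0))   # uniform over the vocabulary
    print(ns_distribution(counts, -0.5))  # favours low-frequency words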
@@ -546,7 +552,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.dm_tag_count = int(dm_tag_count) kwargs['null_word'] = dm_concat - vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word'] + vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent'] vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs) self.vocabulary = Doc2VecVocab(**vocabulary_kwargs) @@ -1086,7 +1092,7 @@ class Doc2VecVocab(Word2VecVocab): This includes a mapping from words found in the corpus to their total frequency count. """ - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): + def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): """ Parameters @@ -1105,11 +1111,17 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T null_word : {0, 1} If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words). This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. """ super(Doc2VecVocab, self).__init__( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word) + sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None): """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index e9c3dda239..20430eb1e8 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -243,8 +243,8 @@ class FastText(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): """ Parameters @@ -290,6 +290,12 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. 
The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. cbow_mean : {1,0}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : function, optional @@ -352,7 +358,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, self.wv = FastTextKeyedVectors(size, min_n, max_n) self.vocabulary = FastTextVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word) + sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent) self.trainables = FastTextTrainables( vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn) self.wv.bucket = self.bucket @@ -903,10 +909,10 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse class FastTextVocab(Word2VecVocab): """Vocabulary used by :class:`~gensim.models.fasttext.FastText`.""" - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): + def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): super(FastTextVocab, self).__init__( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word) + sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None, min_count=None, sample=None, dry_run=False): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 018a19c467..d163784c1c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -628,7 +628,7 @@ class Word2Vec(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), max_final_vocab=None): """ @@ -661,6 +661,12 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. cbow_mean : {0, 1}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. 
alpha : float, optional @@ -731,8 +737,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.wv = Word2VecKeyedVectors(size) self.vocabulary = Word2VecVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab) + max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), + null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent) self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( @@ -1444,7 +1450,7 @@ class Word2VecVocab(utils.SaveLoad): """Vocabulary used by :class:`~gensim.models.word2vec.Word2Vec`.""" def __init__( self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, - max_final_vocab=None): + max_final_vocab=None, ns_exponent=0.75): self.max_vocab_size = max_vocab_size self.min_count = min_count self.sample = sample @@ -1453,6 +1459,7 @@ def __init__( self.cum_table = None # for negative sampling self.raw_vocab = None self.max_final_vocab = max_final_vocab + self.ns_exponent = ns_exponent def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" @@ -1698,9 +1705,9 @@ def create_binary_tree(self, wv): logger.info("built huffman tree with maximum node depth %i", max_depth) - def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): - """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words - in the negative-sampling training routines. + def make_cum_table(self, wv, domain=2**31 - 1): + """Create a cumulative-distribution table using stored vocabulary word counts for + drawing random words in the negative-sampling training routines. To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). @@ -1714,10 +1721,10 @@ def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += wv.vocab[wv.index2word[word_index]].count**power + train_words_pow += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += wv.vocab[wv.index2word[word_index]].count**power + cumulative += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain
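For context, a hedged usage sketch of the new parameter against the `Word2Vec` signature introduced above; the toy corpus is a placeholder and the hyperparameter values are illustrative only.

    from gensim.models import Word2Vec

    # Placeholder corpus: any iterable of tokenized sentences works here.
    sentences = [
        ["human", "interface", "computer"],
        ["graph", "minors", "survey"],
    ]

    # ns_exponent=1.0 draws negatives in proportion to raw word frequency;
    # leaving it at the default of 0.75 preserves the previous behaviour.
    model = Word2Vec(sentences, size=50, window=2, min_count=1, negative=5, ns_exponent=1.0)
    print(model.wv.most_similar("graph", topn=2))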