From 76d194b3701ad176119d09913b321806c6aa3fc9 Mon Sep 17 00:00:00 2001 From: Fernando Camargo Date: Thu, 21 Jun 2018 22:02:10 -0300 Subject: [PATCH] Add `ns_exponent` parameter to control the negative sampling distribution for `*2vec` models. Fix #2090 (#2093) * Adding ns_exponent parameter to control the negative sampling distribution. * Fixed a code style problem. * Updated the documentation of the ns_exponent parameter. --- gensim/models/base_any2vec.py | 7 ++++++- gensim/models/doc2vec.py | 18 +++++++++++++++--- gensim/models/fasttext.py | 16 +++++++++++----- gensim/models/word2vec.py | 25 ++++++++++++++++--------- 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 3b6dabf95e..5cdb54930d 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -526,7 +526,7 @@ def _set_train_params(self, **kwargs): raise NotImplementedError() def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000, - trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1, + trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs): """ @@ -603,6 +603,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac self.min_alpha = float(min_alpha) self.hs = int(hs) self.negative = int(negative) + self.ns_exponent = ns_exponent self.cbow_mean = int(cbow_mean) self.compute_loss = bool(compute_loss) self.running_training_loss = 0 @@ -1098,6 +1099,10 @@ def load(cls, *args, **kwargs): """ model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) + if not hasattr(model, 'ns_exponent'): + model.ns_exponent = 0.75 + if not hasattr(model.vocabulary, 'ns_exponent'): + model.vocabulary.ns_exponent = 0.75 if model.negative and hasattr(model.wv, 'index2word'): model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary if not hasattr(model, 'corpus_count'): diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c2cfd1e16d..d73e6e777a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -483,6 +483,12 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. dm_mean : {1,0}, optional If 0 , use the sum of the context word vectors. If 1, use the mean. Only applies when `dm` is used in non-concatenative mode. 
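A minimal sketch, separate from the patch itself, of how different `ns_exponent` values reshape a raw unigram frequency distribution into the negative-sampling distribution described in the docstring above; the word counts are invented for illustration, and the exponentiation mirrors what `make_cum_table` does with `self.ns_exponent` further down in this patch.

    # Illustrative only; counts are made up for the example.
    counts = {"the": 1000, "movie": 120, "obscure": 3}

    def ns_distribution(counts, ns_exponent):
        """Probability of drawing each word as a negative sample."""
        weights = {word: count ** ns_exponent for word, count in counts.items()}
        total = sum(weights.values())
        return {word: weight / total for word, weight in weights.items()}

    print(ns_distribution(counts, 1.0))   # proportional to raw frequency
    print(ns_distribution(counts, 0.75))  # the classic Word2Vec default
    print(ns_distribution(counts, 0.0))   # uniform over the vocabulary
    print(ns_distribution(counts, -0.5))  # favours low-frequency words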
@@ -546,7 +552,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0 self.dm_tag_count = int(dm_tag_count) kwargs['null_word'] = dm_concat - vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word'] + vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent'] vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs) self.vocabulary = Doc2VecVocab(**vocabulary_kwargs) @@ -1086,7 +1092,7 @@ class Doc2VecVocab(Word2VecVocab): This includes a mapping from words found in the corpus to their total frequency count. """ - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): + def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): """ Parameters @@ -1105,11 +1111,17 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T null_word : {0, 1} If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words). This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. """ super(Doc2VecVocab, self).__init__( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word) + sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None): """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index e9c3dda239..20430eb1e8 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -243,8 +243,8 @@ class FastText(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()): """ Parameters @@ -290,6 +290,12 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. 
The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. cbow_mean : {1,0}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : function, optional @@ -352,7 +358,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, self.wv = FastTextKeyedVectors(size, min_n, max_n) self.vocabulary = FastTextVocab( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word) + sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent) self.trainables = FastTextTrainables( vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn) self.wv.bucket = self.bucket @@ -903,10 +909,10 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse class FastTextVocab(Word2VecVocab): """Vocabulary used by :class:`~gensim.models.fasttext.FastText`.""" - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0): + def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): super(FastTextVocab, self).__init__( max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word) + sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None, min_count=None, sample=None, dry_run=False): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 018a19c467..d163784c1c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -628,7 +628,7 @@ class Word2Vec(BaseWordEmbeddingsModel): """ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), max_final_vocab=None): """ @@ -661,6 +661,12 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. + ns_exponent : float, optional + The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion + to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more + than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. + More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that + other values may perform better for recommendation applications. cbow_mean : {0, 1}, optional If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. 
alpha : float, optional @@ -731,8 +737,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.wv = Word2VecKeyedVectors(size) self.vocabulary = Word2VecVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab) + max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), + null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent) self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( @@ -1444,7 +1450,7 @@ class Word2VecVocab(utils.SaveLoad): """Vocabulary used by :class:`~gensim.models.word2vec.Word2Vec`.""" def __init__( self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, - max_final_vocab=None): + max_final_vocab=None, ns_exponent=0.75): self.max_vocab_size = max_vocab_size self.min_count = min_count self.sample = sample @@ -1453,6 +1459,7 @@ def __init__( self.cum_table = None # for negative sampling self.raw_vocab = None self.max_final_vocab = max_final_vocab + self.ns_exponent = ns_exponent def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" @@ -1698,9 +1705,9 @@ def create_binary_tree(self, wv): logger.info("built huffman tree with maximum node depth %i", max_depth) - def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): - """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words - in the negative-sampling training routines. + def make_cum_table(self, wv, domain=2**31 - 1): + """Create a cumulative-distribution table using stored vocabulary word counts for + drawing random words in the negative-sampling training routines. To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). @@ -1714,10 +1721,10 @@ def make_cum_table(self, wv, power=0.75, domain=2**31 - 1): # compute sum of all power (Z in paper) train_words_pow = 0.0 for word_index in xrange(vocab_size): - train_words_pow += wv.vocab[wv.index2word[word_index]].count**power + train_words_pow += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent cumulative = 0.0 for word_index in xrange(vocab_size): - cumulative += wv.vocab[wv.index2word[word_index]].count**power + cumulative += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain
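For context, a hedged usage sketch of the new parameter against the `Word2Vec` signature introduced above; the toy corpus is a placeholder and the hyperparameter values are illustrative only.

    from gensim.models import Word2Vec

    # Placeholder corpus: any iterable of tokenized sentences works here.
    sentences = [
        ["human", "interface", "computer"],
        ["graph", "minors", "survey"],
    ]

    # ns_exponent=1.0 draws negatives in proportion to raw word frequency;
    # leaving it at the default of 0.75 preserves the previous behaviour.
    model = Word2Vec(sentences, size=50, window=2, min_count=1, negative=5, ns_exponent=1.0)
    print(model.wv.most_similar("graph", topn=2))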