Add ns_exponent parameter to control the negative sampling distribution for `*2vec` models. Fix #2090 (#2093)

* Adding ns_exponent parameter to control the negative sampling distribution.

* Fixed a code style problem.

* Updated the documentation of the ns_exponent parameter.
fernandocamargoai authored and menshikh-iv committed Jun 22, 2018
1 parent 309da79 commit 76d194b
Showing 4 changed files with 48 additions and 18 deletions.
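
Before the per-file diffs, a minimal usage sketch of the new keyword (a hedged illustration: the toy corpus and the particular exponent values are made up; the keyword names match the `Word2Vec` constructor shown in the word2vec.py diff below):

```python
from gensim.models import Word2Vec

sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
]

# ns_exponent=0.75 keeps the historical behaviour (the word2vec paper's default).
classic = Word2Vec(sentences, size=50, min_count=1, negative=5, ns_exponent=0.75)

# ns_exponent=0.0 samples noise words uniformly; values other than 0.75 are
# reported (https://arxiv.org/abs/1804.04212) to help recommendation-style data.
flat = Word2Vec(sentences, size=50, min_count=1, negative=5, ns_exponent=0.0)
```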
7 changes: 6 additions & 1 deletion gensim/models/base_any2vec.py
@@ -526,7 +526,7 @@ def _set_train_params(self, **kwargs):
raise NotImplementedError()

def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
- trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
+ trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1,
min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
"""
@@ -603,6 +603,7 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac
self.min_alpha = float(min_alpha)
self.hs = int(hs)
self.negative = int(negative)
+ self.ns_exponent = ns_exponent
self.cbow_mean = int(cbow_mean)
self.compute_loss = bool(compute_loss)
self.running_training_loss = 0
@@ -1098,6 +1099,10 @@ def load(cls, *args, **kwargs):
"""
model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs)
+ if not hasattr(model, 'ns_exponent'):
+ model.ns_exponent = 0.75
+ if not hasattr(model.vocabulary, 'ns_exponent'):
+ model.vocabulary.ns_exponent = 0.75
if model.negative and hasattr(model.wv, 'index2word'):
model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary
if not hasattr(model, 'corpus_count'):
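
The hasattr checks above are backward-compatibility shims: models pickled before this commit carry no ns_exponent attribute, so load() backfills the historical default of 0.75 on both the model and its vocabulary before the cumulative table is rebuilt. A stand-alone sketch of the same backfill pattern (hypothetical class, not gensim code):

```python
class OldModel:
    # imagine this object was unpickled from a pre-ns_exponent gensim release
    negative = 5

model = OldModel()
if not hasattr(model, "ns_exponent"):
    model.ns_exponent = 0.75  # the value older releases implicitly hard-coded
```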
18 changes: 15 additions & 3 deletions gensim/models/doc2vec.py
@@ -483,6 +483,12 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+ ns_exponent : float, optional
+ The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
+ to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
+ than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
+ More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
+ other values may perform better for recommendation applications.
dm_mean : {1,0}, optional
If 0 , use the sum of the context word vectors. If 1, use the mean.
Only applies when `dm` is used in non-concatenative mode.
@@ -546,7 +552,7 @@ def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0
self.dm_tag_count = int(dm_tag_count)

kwargs['null_word'] = dm_concat
- vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word']
+ vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent']
vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs)
self.vocabulary = Doc2VecVocab(**vocabulary_kwargs)

@@ -1086,7 +1092,7 @@ class Doc2VecVocab(Word2VecVocab):
This includes a mapping from words found in the corpus to their total frequency count.
"""
- def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
"""
Parameters
@@ -1105,11 +1111,17 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
null_word : {0, 1}
If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words).
This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter.
+ ns_exponent : float, optional
+ The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
+ to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
+ than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
+ More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
+ other values may perform better for recommendation applications.
"""
super(Doc2VecVocab, self).__init__(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
- sorted_vocab=sorted_vocab, null_word=null_word)
+ sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)

def scan_vocab(self, documents, docvecs, progress_per=10000, trim_rule=None):
"""Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count.
16 changes: 11 additions & 5 deletions gensim/models/fasttext.py
@@ -243,8 +243,8 @@ class FastText(BaseWordEmbeddingsModel):
"""
def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
- negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
- bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
+ negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
+ sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
"""
Parameters
@@ -290,6 +290,12 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+ ns_exponent : float, optional
+ The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
+ to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
+ than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
+ More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
+ other values may perform better for recommendation applications.
cbow_mean : {1,0}, optional
If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
hashfxn : function, optional
@@ -352,7 +358,7 @@ def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5,
self.wv = FastTextKeyedVectors(size, min_n, max_n)
self.vocabulary = FastTextVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
- sorted_vocab=bool(sorted_vocab), null_word=null_word)
+ sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
self.trainables = FastTextTrainables(
vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
self.wv.bucket = self.bucket
@@ -903,10 +909,10 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse

class FastTextVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.fasttext.FastText`."""
- def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
super(FastTextVocab, self).__init__(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
- sorted_vocab=sorted_vocab, null_word=null_word)
+ sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)

def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None,
min_count=None, sample=None, dry_run=False):
25 changes: 16 additions & 9 deletions gensim/models/word2vec.py
@@ -628,7 +628,7 @@ class Word2Vec(BaseWordEmbeddingsModel):
"""
def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
- sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
+ sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
max_final_vocab=None):
"""
@@ -661,6 +661,12 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
+ ns_exponent : float, optional
+ The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
+ to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
+ than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
+ More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
+ other values may perform better for recommendation applications.
cbow_mean : {0, 1}, optional
If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
alpha : float, optional
@@ -731,8 +737,8 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,

self.wv = Word2VecKeyedVectors(size)
self.vocabulary = Word2VecVocab(
- max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
- sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab)
+ max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab),
+ null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent)
self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

super(Word2Vec, self).__init__(
@@ -1444,7 +1450,7 @@ class Word2VecVocab(utils.SaveLoad):
"""Vocabulary used by :class:`~gensim.models.word2vec.Word2Vec`."""
def __init__(
self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
- max_final_vocab=None):
+ max_final_vocab=None, ns_exponent=0.75):
self.max_vocab_size = max_vocab_size
self.min_count = min_count
self.sample = sample
@@ -1453,6 +1459,7 @@ def __init__(
self.cum_table = None # for negative sampling
self.raw_vocab = None
self.max_final_vocab = max_final_vocab
+ self.ns_exponent = ns_exponent

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
"""Do an initial scan of all words appearing in sentences."""
@@ -1698,9 +1705,9 @@ def create_binary_tree(self, wv):

logger.info("built huffman tree with maximum node depth %i", max_depth)

- def make_cum_table(self, wv, power=0.75, domain=2**31 - 1):
- """Create a cumulative-distribution table using stored vocabulary word counts for drawing random words
- in the negative-sampling training routines.
+ def make_cum_table(self, wv, domain=2**31 - 1):
+ """Create a cumulative-distribution table using stored vocabulary word counts for
+ drawing random words in the negative-sampling training routines.
To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]),
then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`).
@@ -1714,10 +1721,10 @@ def make_cum_table(self, wv, power=0.75, domain=2**31 - 1):
# compute sum of all power (Z in paper)
train_words_pow = 0.0
for word_index in xrange(vocab_size):
- train_words_pow += wv.vocab[wv.index2word[word_index]].count**power
+ train_words_pow += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent
cumulative = 0.0
for word_index in xrange(vocab_size):
- cumulative += wv.vocab[wv.index2word[word_index]].count**power
+ cumulative += wv.vocab[wv.index2word[word_index]].count**self.ns_exponent
self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
if len(self.cum_table) > 0:
assert self.cum_table[-1] == domain
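
The cum_table built above is what the negative-sampling training routines draw from, exactly as the docstring describes: pick a random integer below cum_table[-1], then take its sorted insertion point as the sampled word index. A stand-alone sketch of that lookup (toy counts, NumPy in place of the Cython internals):

```python
import numpy as np

counts = np.array([1000.0, 100.0, 10.0])  # counts in index2word order
ns_exponent, domain = 0.75, 2**31 - 1

weights = counts ** ns_exponent
cum_table = np.round(np.cumsum(weights) / weights.sum() * domain).astype(np.int64)

rng = np.random.default_rng(0)
draws = cum_table.searchsorted(rng.integers(0, domain, size=100_000))
print(np.bincount(draws, minlength=len(counts)) / 100_000)
# frequencies of the drawn indices approach counts**0.75 / sum(counts**0.75)
```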