Add dtype to LdaModel. Partially fixes piskvorky#1656
* Add dtype to LdaModel, assert about it everywhere

* Implement loading of old models without dtype

* Use assert_allclose instead of == in LdaModel tests. Use np.issubdtype when checking if something is float.

* Fix AuthorTopicModel

* Fix matutils.dirichlet_expectation

 * replace assert with docstring comment
 * add test to check that it really saves dtype for different inputs

* Change default to np.float32 and cleanup

* Fix wrong logging message

* Remove out-of-place comment

* Cleanup PEP8

* Add dtype to sklearn LdaTransformer

* Set precision explicitly in lda model converters

* Add dtype to LdaMulticore

* Set dtype to float64 explicitly to retain backward compatibility in models using LdaModel

* Cleanup asserts and fix another place calculating in float64

* Fix import

* Address review remarks by piskvorky

* Add backward compatibility tests

* Add missing .npy files

* Fix dirichlet_expectation not working with np.float16

* Fix path to saved model
xelez authored and KMarie1 committed Nov 26, 2017
1 parent f85575e commit 02d848f
Showing 22 changed files with 157 additions and 58 deletions.
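
The headline change is a new `dtype` parameter on LdaModel (default np.float32; previously the model silently used float64). A minimal usage sketch, assuming the gensim API as changed in this commit; the corpus and dictionary contents are illustrative only:

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [["human", "computer", "interface"], ["graph", "trees", "survey"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda32 = LdaModel(corpus, id2word=dictionary, num_topics=2)                    # new default: np.float32
    lda64 = LdaModel(corpus, id2word=dictionary, num_topics=2, dtype=np.float64)  # old float64 behaviour, now explicit
    print(lda32.expElogbeta.dtype, lda64.expElogbeta.dtype)  # float32 float64
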
3 changes: 1 addition & 2 deletions gensim/matutils.py
@@ -602,13 +602,12 @@ def jaccard_distance(set1, set2):
def dirichlet_expectation(alpha):
"""
For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.
"""
if len(alpha.shape) == 1:
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
- return result.astype(alpha.dtype) # keep the same precision as input
+ return result.astype(alpha.dtype, copy=False) # keep the same precision as input


def qr_destroy(la):
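
A short sketch of the behaviour this hunk pins down: `dirichlet_expectation` computes E[log(theta)] = psi(alpha) - psi(sum(alpha)) for theta ~ Dir(alpha), and with `copy=False` it returns the result in the caller's dtype without an extra allocation. The example values are assumptions, not from the patch:

    import numpy as np
    from gensim.matutils import dirichlet_expectation

    alpha = np.asarray([0.1, 0.2, 0.7], dtype=np.float32)
    elog_theta = dirichlet_expectation(alpha)  # psi(alpha) - psi(alpha.sum()), elementwise
    print(elog_theta.dtype)                    # float32 -- same precision as the input
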
4 changes: 4 additions & 0 deletions gensim/models/atmodel.py
@@ -70,6 +70,7 @@ def __init__(self, eta, lambda_shape, gamma_shape):
self.sstats = np.zeros(lambda_shape)
self.gamma = np.zeros(gamma_shape)
self.numdocs = 0
self.dtype = np.float64 # To be compatible with LdaState


def construct_doc2author(corpus, author2doc):
@@ -203,6 +204,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
>>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data
"""
# NOTE: this doesn't call constructor of a base class, but duplicates most of this code
# so we have to set dtype to float64 default here
self.dtype = np.float64

# NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
# infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
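
AuthorTopicModel does not gain a `dtype` argument in this commit; it simply pins `self.dtype = np.float64` so its state stays compatible with LdaState. A hedged sketch of the resulting behaviour (toy corpus and author mapping are illustrative):

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import AuthorTopicModel

    texts = [["human", "computer", "interface"], ["graph", "trees", "survey"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    author2doc = {"alice": [0], "bob": [1]}

    model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)
    print(model.dtype)  # np.float64 -- hard-coded for compatibility with LdaState
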
2 changes: 1 addition & 1 deletion gensim/models/hdpmodel.py
@@ -538,7 +538,7 @@ def suggested_lda_model(self):
The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
"""
alpha, beta = self.hdp_to_lda()
- ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
+ ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state, dtype=np.float64)
ldam.expElogbeta[:] = beta
return ldam

89 changes: 66 additions & 23 deletions gensim/models/ldamodel.py
@@ -63,7 +63,7 @@ def update_dir_prior(prior, N, logphat, rho):
**Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf
"""
- dprior = np.copy(prior)
+ dprior = np.copy(prior) # TODO: unused var???
gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)

c = N * polygamma(1, np.sum(prior))
@@ -90,10 +90,11 @@ class LdaState(utils.SaveLoad):
"""

- def __init__(self, eta, shape):
- self.eta = eta
- self.sstats = np.zeros(shape)
+ def __init__(self, eta, shape, dtype=np.float32):
+ self.eta = eta.astype(dtype, copy=False)
+ self.sstats = np.zeros(shape, dtype=dtype)
self.numdocs = 0
self.dtype = dtype

def reset(self):
"""
@@ -168,6 +169,17 @@ def get_lambda(self):

def get_Elogbeta(self):
return dirichlet_expectation(self.get_lambda())

@classmethod
def load(cls, fname, *args, **kwargs):
result = super(LdaState, cls).load(fname, *args, **kwargs)

# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was implicitly used before (cause it's default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

return result
# endclass LdaState


@@ -194,7 +206,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
- per_word_topics=False, callbacks=None):
+ per_word_topics=False, callbacks=None, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -236,9 +248,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
`minimum_probability` controls filtering the topics returned for a document (bow).
- `random_state` can be a np.random.RandomState object or the seed for one
+ `random_state` can be a np.random.RandomState object or the seed for one.
- `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.
+ `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
+ `dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.
Example:
@@ -250,6 +264,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data
"""
self.dtype = dtype

# store user-supplied parameters
self.id2word = id2word
@@ -333,10 +348,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

# Initialize the variational distribution q(beta|lambda)
- self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
- self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
+ self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
+ self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

# Check that we haven't accidentally fall back to np.float64
assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype

# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
@@ -357,25 +376,25 @@ def init_dir_prior(self, prior, name):

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
- init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
+ init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
- init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+ init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
- init_prior = np.asarray(prior)
+ init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
- init_prior = prior
+ init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
- init_prior = np.asarray([prior] * prior_shape)
+ init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)

@@ -388,6 +407,7 @@ def __str__(self):

def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype

def clear(self):
"""Clear model state (free up some memory). Used in the distributed algo."""
@@ -421,11 +441,15 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
- gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
+ gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)

assert Elogtheta.dtype == self.dtype
assert expElogtheta.dtype == self.dtype

if collect_sstats:
- sstats = np.zeros_like(self.expElogbeta)
+ sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
@@ -440,7 +464,7 @@
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
- cts = np.array([cnt for _, cnt in doc])
+ cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
gammad = gamma[d, :]
Elogthetad = Elogtheta[d, :]
expElogthetad = expElogtheta[d, :]
@@ -467,6 +491,7 @@
converged += 1
break
gamma[d, :] = gammad
assert gammad.dtype == self.dtype
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
@@ -481,6 +506,9 @@
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype

assert gamma.dtype == self.dtype
return gamma, sstats

def do_estep(self, chunk, state=None):
@@ -494,6 +522,7 @@ def do_estep(self, chunk, state=None):
gamma, sstats = self.inference(chunk, collect_sstats=True)
state.sstats += sstats
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
assert gamma.dtype == self.dtype
return gamma

def update_alpha(self, gammat, rho):
@@ -503,10 +532,12 @@ def update_alpha(self, gammat, rho):
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype

self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

assert self.alpha.dtype == self.dtype
return self.alpha

def update_eta(self, lambdat, rho):
@@ -516,9 +547,11 @@ def update_eta(self, lambdat, rho):
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype

self.eta = update_dir_prior(self.eta, N, logphat, rho)

assert self.eta.dtype == self.dtype
return self.eta

def log_perplexity(self, chunk, total_docs=None):
@@ -650,7 +683,7 @@ def rho():
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
- other = LdaState(self.eta, self.state.sstats.shape)
+ other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False

reallen = 0
@@ -694,7 +727,7 @@ def rho():
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
- other = LdaState(self.eta, self.state.sstats.shape)
+ other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
# endfor single corpus iteration

@@ -775,6 +808,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)

assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype

# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)

@@ -823,6 +859,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything

sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -859,7 +896,7 @@ def show_topic(self, topicid, topn=10):
def get_topics(self):
"""
Returns:
- np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
+ np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
the term topic matrix learned during inference.
"""
topics = self.state.get_lambda()
@@ -1031,6 +1068,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
>>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2`
>>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2`
Note: this ignores difference in model dtypes
"""

distances = {
@@ -1189,9 +1227,14 @@ def load(cls, fname, *args, **kwargs):
result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")

# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was implicitly used before (cause it's default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

state_fname = utils.smart_extension(fname, '.state')
try:
- result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
+ result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)

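
The two `load` overrides above give backward compatibility: models saved before this change carry no `dtype` attribute, so both LdaModel.load and the new LdaState.load fill in np.float64, the old implicit numpy default. A sketch of the expected behaviour ("old_lda.model" is a hypothetical file saved by an older gensim):

    from gensim.models import LdaModel

    old_model = LdaModel.load("old_lda.model")  # hypothetical pre-dtype model file
    print(old_model.dtype)        # np.float64, filled in by LdaModel.load()
    print(old_model.state.dtype)  # np.float64 as well, via the new LdaState.load()
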
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
@@ -49,6 +49,8 @@

import logging

import numpy as np

from gensim import utils
from gensim.models.ldamodel import LdaModel, LdaState

@@ -82,7 +84,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
- minimum_phi_value=0.01, per_word_topics=False):
+ minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -148,7 +150,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
)

def update(self, corpus, chunks_as_numpy=False):
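
LdaMulticore only needs to accept `dtype` and forward it to the LdaModel base constructor. A minimal sketch under the same assumptions as the earlier examples (toy corpus, single worker):

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import LdaMulticore

    texts = [["human", "computer", "interface"], ["graph", "trees", "survey"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    model = LdaMulticore(corpus, id2word=dictionary, num_topics=2, workers=1, dtype=np.float64)
    print(model.dtype)  # np.float64, forwarded to LdaModel.__init__
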
7 changes: 4 additions & 3 deletions gensim/models/ldaseqmodel.py
@@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
- passes=passes, alpha=self.alphas, random_state=random_state
+ passes=passes, alpha=self.alphas, random_state=random_state,
+ dtype=np.float64
)
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'ldamodel':
@@ -244,7 +245,7 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
vocab_len = self.vocab_len
bound = 0.0

- lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
+ lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

@@ -419,7 +420,7 @@ def __getitem__(self, doc):
"""
Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
"""
- lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
+ lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)

3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldamallet.py
@@ -373,7 +373,8 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
model_gensim = LdaModel(
id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
alpha=mallet_model.alpha, iterations=iterations,
- gamma_threshold=gamma_threshold
+ gamma_threshold=gamma_threshold,
+ dtype=numpy.float64 # don't loose precision when converting from MALLET
)
model_gensim.expElogbeta[:] = mallet_model.wordtopics
return model_gensim
3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldavowpalwabbit.py
@@ -586,7 +586,8 @@ def vwmodel2ldamodel(vw_model, iterations=50):
model_gensim = LdaModel(
num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
- offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
+ offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
+ dtype=numpy.float32
)
model_gensim.expElogbeta[:] = vw_model._get_topics()
return model_gensim