Adding dtype to LDAModel to speed it up #1656

Merged · 21 commits · Nov 14, 2017 · Changes from 15 commits
3 changes: 1 addition & 2 deletions gensim/matutils.py
@@ -600,13 +600,12 @@ def jaccard_distance(set1, set2):
def dirichlet_expectation(alpha):
"""
For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.

"""
if len(alpha.shape) == 1:
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
return result.astype(alpha.dtype) # keep the same precision as input
return result
Contributor:
Please put the astype back, because:
np.float32 -> np.float32
np.float64 -> np.float64
but
np.float16 -> np.float32

Contributor (author):
Oh, my bad, you're right!
Then the tests I added in a separate file aren't needed.
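A quick way to see the behaviour discussed here (a standalone sketch, not code from the PR): scipy.special.psi keeps float32 and float64 inputs in their own precision but returns float32 for float16 input, which is exactly why the trailing astype matters.

import numpy as np
from scipy.special import psi

for dt in (np.float16, np.float32, np.float64):
    alpha = np.ones(4, dtype=dt)
    result = psi(alpha) - psi(np.sum(alpha))
    # psi upcasts float16 to float32, so only the astype restores the input dtype
    print(dt.__name__, result.dtype, result.astype(alpha.dtype).dtype)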



def qr_destroy(la):
4 changes: 4 additions & 0 deletions gensim/models/atmodel.py
@@ -70,6 +70,7 @@ def __init__(self, eta, lambda_shape, gamma_shape):
self.sstats = np.zeros(lambda_shape)
self.gamma = np.zeros(gamma_shape)
self.numdocs = 0
self.dtype = np.float64 # To be compatible with LdaState


def construct_doc2author(corpus, author2doc):
@@ -203,6 +204,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
>>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data

"""
# NOTE: this doesn't call the constructor of the base class, but duplicates most of its code,
# so we have to set the float64 dtype default here ourselves
self.dtype = np.float64

# NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
# infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
3 changes: 2 additions & 1 deletion gensim/models/hdpmodel.py
@@ -538,7 +538,8 @@ def suggested_lda_model(self):
The num_topics is m_T (default is 150) so as to preserve the matrix shapes when we assign alpha and beta.
"""
alpha, beta = self.hdp_to_lda()
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word,
random_state=self.random_state, dtype=np.float64)
Owner:
Code style: no vertical indent.

Contributor (author):
fixed

ldam.expElogbeta[:] = beta
return ldam

88 changes: 66 additions & 22 deletions gensim/models/ldamodel.py
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

"""

def __init__(self, eta, shape):
self.eta = eta
self.sstats = np.zeros(shape)
def __init__(self, eta, shape, dtype=np.float32):
self.eta = eta.astype(dtype, copy=False)
self.sstats = np.zeros(shape, dtype)
Owner:
Using positional arguments can lead to subtle bugs with numpy. Better use explicit names for keyword parameters: dtype=dtype.

Contributor (author):
fixed
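A small illustration of the pitfall the reviewer is pointing at (a sketch with made-up toy values, not code from the PR): the second positional argument of np.zeros happens to be dtype, but other constructors order their parameters differently, so spelling dtype= out is the safer habit.

import numpy as np

shape = (2, 3)
a = np.zeros(shape, np.float32)         # works: np.zeros's second parameter is dtype
b = np.zeros(shape, dtype=np.float32)   # explicit keyword, immune to argument-order surprises
# e.g. np.full(shape, np.float32) would silently treat float32 as the fill value,
# because np.full's second positional parameter is fill_value, not dtype
assert a.dtype == b.dtype == np.float32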

self.numdocs = 0
self.dtype = dtype

def reset(self):
"""
@@ -165,6 +166,18 @@ def get_lambda(self):

def get_Elogbeta(self):
return dirichlet_expectation(self.get_lambda())

@classmethod
def load(cls, fname, *args, **kwargs):
result = super(LdaState, cls).load(fname, *args, **kwargs)

# Check if `dtype` is set after main pickle load
# if not, then it's an old model and we should set it to default `np.float64`
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was used before as default in numpy
Contributor:
Old LDA used float64, really?

Contributor (author):
Pretty much everything was using float64, because that's the default dtype when numpy creates arrays.

logging.warning("dtype was not set in LdaState, so using np.float64")

return result
# endclass LdaState
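As the author notes above, numpy creates arrays in float64 unless told otherwise, so models saved before this change effectively ran in float64. A quick check (an editor's sketch, not part of the diff):

import numpy as np

print(np.zeros(3).dtype)                                          # float64
print(np.random.RandomState(0).gamma(100., 1. / 100., 3).dtype)  # float64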


@@ -191,7 +204,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
per_word_topics=False, callbacks=None):
per_word_topics=False, callbacks=None, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +246,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

`minimum_probability` controls filtering the topics returned for a document (bow).

`random_state` can be a np.random.RandomState object or the seed for one
`random_state` can be a np.random.RandomState object or the seed for one.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of the topic model during training.
`dtype` is the data type to use for calculations inside the model. All inputs are also converted to this dtype.

Example:

@@ -247,6 +262,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data

"""
self.dtype = dtype

# store user-supplied parameters
self.id2word = id2word
@@ -330,10 +346,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

# Check that we haven't accidentally fallen back to np.float64
assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype

# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
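For reference, a usage sketch of the new parameter on a toy corpus (the corpus and names are made up; the dtype guarantees assume the asserts added in this diff). float32 is the new default, float64 restores the old precision.

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'minors']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda32 = LdaModel(corpus, id2word=dictionary, num_topics=2)                    # dtype=np.float32 by default
lda64 = LdaModel(corpus, id2word=dictionary, num_topics=2, dtype=np.float64)  # old behaviour
assert lda32.expElogbeta.dtype == np.float32
assert lda64.expElogbeta.dtype == np.float64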
@@ -354,25 +374,25 @@ def init_dir_prior(self, prior, name):

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
init_prior = prior
init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * prior_shape)
init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
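A standalone sketch of the prior initialisations shown above, with a made-up num_topics, to show what the symmetric and asymmetric priors look like in the requested dtype:

import numpy as np

num_topics, dtype = 4, np.float32

symmetric = np.asarray([1.0 / num_topics] * num_topics, dtype=dtype)  # [0.25, 0.25, 0.25, 0.25]
asymmetric = np.asarray([1.0 / (i + np.sqrt(num_topics)) for i in range(num_topics)], dtype=dtype)
asymmetric /= asymmetric.sum()  # normalised, decaying over topics
assert symmetric.dtype == asymmetric.dtype == np.float32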

@@ -385,6 +405,7 @@ def __str__(self):

def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype

def clear(self):
"""Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +439,15 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)

assert Elogtheta.dtype == self.dtype
assert expElogtheta.dtype == self.dtype

if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
@@ -437,7 +462,7 @@
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
gammad = gamma[d, :]
Elogthetad = Elogtheta[d, :]
expElogthetad = expElogtheta[d, :]
@@ -464,6 +489,7 @@
converged += 1
break
gamma[d, :] = gammad
assert gammad.dtype == self.dtype
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
@@ -478,6 +504,9 @@
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype

assert gamma.dtype == self.dtype
return gamma, sstats

def do_estep(self, chunk, state=None):
Expand All @@ -491,6 +520,7 @@ def do_estep(self, chunk, state=None):
gamma, sstats = self.inference(chunk, collect_sstats=True)
state.sstats += sstats
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
assert gamma.dtype == self.dtype
return gamma

def update_alpha(self, gammat, rho):
@@ -500,10 +530,12 @@ def update_alpha(self, gammat, rho):
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype

self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

assert self.alpha.dtype == self.dtype
return self.alpha

def update_eta(self, lambdat, rho):
@@ -513,9 +545,11 @@
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype

self.eta = update_dir_prior(self.eta, N, logphat, rho)

assert self.eta.dtype == self.dtype
return self.eta

def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +681,7 @@ def rho():
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False

reallen = 0
@@ -691,7 +725,7 @@ def rho():
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
# endfor single corpus iteration

@@ -772,6 +806,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)

assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype

# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)

@@ -820,6 +857,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything
Contributor:
Maybe .astype (for consistency only) ?

Contributor (author):
Consistency vs one additional array copy. I'm not sure :)
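The trade-off in numbers (an editor's sketch, not part of the diff): RandomState.rand has no dtype argument and always returns float64, so keeping the jitter consistent with a float32 model would cost one extra astype copy.

import numpy as np

rs = np.random.RandomState(0)
jitter = 0.0001 * rs.rand(5)
print(jitter.dtype)                      # float64 -- rand() cannot produce float32 directly
print(jitter.astype(np.float32).dtype)   # the extra copy the discussion is about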


sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -856,7 +894,7 @@ def show_topic(self, topicid, topn=10):
def get_topics(self):
"""
Returns:
np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
the term topic matrix learned during inference.
"""
topics = self.state.get_lambda()
@@ -1028,6 +1066,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
>>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2`
>>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2`

Note: this ignores differences in model dtypes.
"""

distances = {
@@ -1186,9 +1225,14 @@ def load(cls, fname, *args, **kwargs):
result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")

# the same goes for dtype (except it was added later)
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was used before as default in numpy
logging.warning("dtype was not set, so using np.float64")
Owner:
A more concrete message please. When reading this warning, users will be left scratching their heads: set where? Why? What does this mean to me?

How about "dtype not set in saved %s file %s, assuming np.float64" % (result.__class__.__name__, fname)?
And only log at INFO or even DEBUG level, since it's an expected state when loading an old model, nothing out of ordinary.

Question: isn't it better to infer the dtype from the loaded object? Can it ever happen that it's something else, not np.float64?

Contributor (author):
Fixed the message; decided the info level suits better.

About inferring: it's not clear how to do it. Infer from LdaState.eta and LdaState.sstats? But then we'd have to check that their sum is np.float64, so it's safe to assume we don't lose precision by setting dtype to np.float64, and np.float32 is not enough.

Anyway, imagine a situation where some of the nd.arrays somehow have different dtypes, e.g. some np.float32 and some np.float64. The right dtype is still np.float64.


state_fname = utils.smart_extension(fname, '.state')
try:
result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)

6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
@@ -49,6 +49,8 @@

import logging

import numpy as np

from gensim import utils
from gensim.models.ldamodel import LdaModel, LdaState

@@ -82,7 +84,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
minimum_phi_value=0.01, per_word_topics=False):
minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -148,7 +150,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
)

def update(self, corpus, chunks_as_numpy=False):
Expand Down
9 changes: 6 additions & 3 deletions gensim/models/ldaseqmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
passes=passes, alpha=self.alphas, random_state=random_state
passes=passes, alpha=self.alphas, random_state=random_state,
dtype=np.float64
Contributor:
Maybe it would be a good idea to change the default behaviour (to float32)?
CC @piskvorky @xelez

Contributor (author):
Not now, LdaSeqModel will require modifications similar to those I made in LdaModel to handle dtype properly.

)
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'ldamodel':
@@ -244,7 +245,8 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
vocab_len = self.vocab_len
bound = 0.0

lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word,
dtype=np.float64)
Owner:
Code style: no vertical indent.

Contributor (author):
fixed

lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

@@ -419,7 +421,8 @@ def __getitem__(self, doc):
"""
Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
"""
lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word,
dtype=np.float64)
Owner:
Code style: no vertical indent.

Contributor (author):
fixed

lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)

3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldamallet.py
@@ -373,7 +373,8 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
model_gensim = LdaModel(
id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
alpha=mallet_model.alpha, iterations=iterations,
gamma_threshold=gamma_threshold
gamma_threshold=gamma_threshold,
dtype=numpy.float64  # don't lose precision when converting from MALLET
)
model_gensim.expElogbeta[:] = mallet_model.wordtopics
return model_gensim
3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldavowpalwabbit.py
@@ -586,7 +586,8 @@ def vwmodel2ldamodel(vw_model, iterations=50):
model_gensim = LdaModel(
num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
dtype=numpy.float32
)
model_gensim.expElogbeta[:] = vw_model._get_topics()
return model_gensim