Adding dtype to LDAModel to speed it up #1656
@@ -538,7 +538,8 @@ def suggested_lda_model(self):
         The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
         """
         alpha, beta = self.hdp_to_lda()
-        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
+        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word,
+                                 random_state=self.random_state, dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         ldam.expElogbeta[:] = beta
         return ldam
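For readers unfamiliar with the style rule being cited here, below is a toy illustration (not taken from the PR) of the two continuation styles; as I understand the project's guideline, the hanging-indent form is the preferred one.

    # Toy example only; the function and its arguments are made up.
    def make_point(x, y, z, label):
        return (x, y, z, label)

    # "Vertical indent": continuation lines aligned with the opening parenthesis.
    p1 = make_point(1.0, 2.0, 3.0,
                    label="aligned with the open paren")

    # Hanging indent: the continuation simply gets one extra indentation level.
    p2 = make_point(
        1.0, 2.0, 3.0,
        label="one extra indent level",
    )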
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

    """

-    def __init__(self, eta, shape):
-        self.eta = eta
-        self.sstats = np.zeros(shape)
+    def __init__(self, eta, shape, dtype=np.float32):
+        self.eta = eta.astype(dtype, copy=False)
+        self.sstats = np.zeros(shape, dtype)

Review comment: Using positional arguments can lead to subtle bugs with numpy. Better use explicit names for keyword parameters: …
Reply: fixed

         self.numdocs = 0
+        self.dtype = dtype

     def reset(self):
         """
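To make the reviewer's point concrete, here is a minimal standalone sketch (mine, not from the PR) of the positional versus keyword form of the call in question:

    import numpy as np

    # np.zeros happens to take dtype as its second positional parameter, so the
    # positional form works, but spelling the name out removes any ambiguity.
    shape, dtype = (2, 3), np.float32

    sstats_positional = np.zeros(shape, dtype)      # relies on argument order
    sstats_keyword = np.zeros(shape, dtype=dtype)   # explicit, as requested in review
    assert sstats_keyword.dtype == np.float32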
@@ -165,6 +166,18 @@ def get_lambda(self):

     def get_Elogbeta(self):
         return dirichlet_expectation(self.get_lambda())

+    @classmethod
+    def load(cls, fname, *args, **kwargs):
+        result = super(LdaState, cls).load(fname, *args, **kwargs)
+
+        # Check if `dtype` is set after main pickle load
+        # if not, then it's an old model and we should set it to default `np.float64`
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was used before as default in numpy
+            logging.warning("dtype was not set in LdaState, so using np.float64")
+
+        return result

Review comment: Old LDA used float64, really?
Reply: Pretty much everything is using …

 # endclass LdaState
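The author's reply is easy to verify: numpy's array constructors and random generators produce float64 unless told otherwise, which is why an old pickle with no explicit dtype attribute can safely be assumed to hold float64 arrays. A quick standalone check:

    import numpy as np

    print(np.zeros((2, 2)).dtype)                                      # float64
    print(np.asarray([1.0 / 3, 2.0 / 3]).dtype)                        # float64
    print(np.random.RandomState(0).gamma(100., 0.01, (2, 2)).dtype)    # float64
    print(np.zeros((2, 2), dtype=np.float32).dtype)                    # float32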
@@ -191,7 +204,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                  iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
                  random_state=None, ns_conf=None, minimum_phi_value=0.01,
-                 per_word_topics=False, callbacks=None):
+                 per_word_topics=False, callbacks=None, dtype=np.float32):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +246,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

         `minimum_probability` controls filtering the topics returned for a document (bow).

-        `random_state` can be a np.random.RandomState object or the seed for one
+        `random_state` can be a np.random.RandomState object or the seed for one.

-        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.
+        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
+
+        `dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.

         Example:
@@ -247,6 +262,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

         """
+        self.dtype = dtype

         # store user-supplied parameters
         self.id2word = id2word
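Putting the new parameter together, a minimal usage sketch (assuming a gensim build that includes this branch) could look like the following; the corpus here is a throwaway toy.

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import ldamodel

    texts = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train in single precision; before this change the model always used float64.
    lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, dtype=np.float32)
    print(lda.expElogbeta.dtype)  # float32 with this branch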
@@ -330,10 +346,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

         # Initialize the variational distribution q(beta|lambda)
-        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
-        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
+        self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
+        self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
         self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

+        # Check that we haven't accidentally fall back to np.float64
+        assert self.eta.dtype == self.dtype
+        assert self.expElogbeta.dtype == self.dtype
+
         # if a training corpus was provided, start estimating the model right away
         if corpus is not None:
             use_numpy = self.dispatcher is not None
@@ -354,25 +374,25 @@ def init_dir_prior(self, prior, name):

         if isinstance(prior, six.string_types):
             if prior == 'symmetric':
-                logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
             elif prior == 'asymmetric':
-                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
                 init_prior /= init_prior.sum()
                 logger.info("using asymmetric %s %s", name, list(init_prior))
             elif prior == 'auto':
                 is_auto = True
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
                 if name == 'alpha':
                     logger.info("using autotuned %s, starting with %s", name, list(init_prior))
             else:
                 raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
         elif isinstance(prior, list):
-            init_prior = np.asarray(prior)
+            init_prior = np.asarray(prior, dtype=self.dtype)
         elif isinstance(prior, np.ndarray):
-            init_prior = prior
+            init_prior = prior.astype(self.dtype, copy=False)
         elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
-            init_prior = np.asarray([prior] * prior_shape)
+            init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
         else:
             raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
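As a side note on the 'asymmetric' branch above, here is a toy re-creation (not gensim's own function) showing what the prior looks like and how the explicit dtype keeps it in single precision:

    import numpy as np

    def asymmetric_prior(num_topics, dtype=np.float32):
        # topic i gets weight 1 / (i + sqrt(num_topics)), then the vector is normalised
        init_prior = np.asarray(
            [1.0 / (i + np.sqrt(num_topics)) for i in range(num_topics)], dtype=dtype
        )
        init_prior /= init_prior.sum()
        return init_prior

    prior = asymmetric_prior(4)
    print(prior, prior.dtype, prior.sum())  # decreasing weights, float32, ~1.0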
@@ -385,6 +405,7 @@ def __str__(self):

     def sync_state(self):
         self.expElogbeta = np.exp(self.state.get_Elogbeta())
+        assert self.expElogbeta.dtype == self.dtype

     def clear(self):
         """Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +439,15 @@ def inference(self, chunk, collect_sstats=False):
         logger.debug("performing inference on a chunk of %i documents", len(chunk))

         # Initialize the variational distribution q(theta|gamma) for the chunk
-        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
+        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
         Elogtheta = dirichlet_expectation(gamma)
         expElogtheta = np.exp(Elogtheta)

+        assert Elogtheta.dtype == self.dtype
+        assert expElogtheta.dtype == self.dtype
+
         if collect_sstats:
-            sstats = np.zeros_like(self.expElogbeta)
+            sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
         else:
             sstats = None
         converged = 0
@@ -437,7 +462,7 @@ def inference(self, chunk, collect_sstats=False):
                 ids = [int(idx) for idx, _ in doc]
             else:
                 ids = [idx for idx, _ in doc]
-            cts = np.array([cnt for _, cnt in doc])
+            cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
             gammad = gamma[d, :]
             Elogthetad = Elogtheta[d, :]
             expElogthetad = expElogtheta[d, :]
@@ -464,6 +489,7 @@ def inference(self, chunk, collect_sstats=False):
                     converged += 1
                     break
             gamma[d, :] = gammad
+            assert gammad.dtype == self.dtype
             if collect_sstats:
                 # Contribution of document d to the expected sufficient
                 # statistics for the M step.
@@ -478,6 +504,9 @@ def inference(self, chunk, collect_sstats=False):
             # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
             # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
             sstats *= self.expElogbeta
+            assert sstats.dtype == self.dtype
+
+        assert gamma.dtype == self.dtype
         return gamma, sstats

     def do_estep(self, chunk, state=None):
@@ -491,6 +520,7 @@ def do_estep(self, chunk, state=None):
         gamma, sstats = self.inference(chunk, collect_sstats=True)
         state.sstats += sstats
         state.numdocs += gamma.shape[0]  # avoids calling len(chunk) on a generator
+        assert gamma.dtype == self.dtype
         return gamma

     def update_alpha(self, gammat, rho):
@@ -500,10 +530,12 @@ def update_alpha(self, gammat, rho):
         """
         N = float(len(gammat))
         logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
+        assert logphat.dtype == self.dtype

         self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
         logger.info("optimized alpha %s", list(self.alpha))

+        assert self.alpha.dtype == self.dtype
         return self.alpha

     def update_eta(self, lambdat, rho):
@@ -513,9 +545,11 @@ def update_eta(self, lambdat, rho):
         """
         N = float(lambdat.shape[0])
         logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
+        assert logphat.dtype == self.dtype

         self.eta = update_dir_prior(self.eta, N, logphat, rho)

+        assert self.eta.dtype == self.dtype
         return self.eta

     def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +681,7 @@ def rho():
         if self.dispatcher:
             logger.info('initializing %s workers', self.numworkers)
             self.dispatcher.reset(self.state)
         else:
-            other = LdaState(self.eta, self.state.sstats.shape)
+            other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
         dirty = False

         reallen = 0
@@ -691,7 +725,7 @@ def rho():
                 if self.dispatcher:
                     logger.info('initializing workers')
                     self.dispatcher.reset(self.state)
                 else:
-                    other = LdaState(self.eta, self.state.sstats.shape)
+                    other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
                 dirty = False
         # endfor single corpus iteration
@@ -772,6 +806,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
             gammad = gamma[d]
             Elogthetad = dirichlet_expectation(gammad)

+            assert gammad.dtype == self.dtype
+            assert Elogthetad.dtype == self.dtype
+
             # E[log p(doc | theta, beta)]
             score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
@@ -820,6 +857,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

         # add a little random jitter, to randomize results around the same alpha
         sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+        # random_state.rand returns float64, but converting back to dtype won't speed up anything

Review comment: Maybe …
Reply: Consistency vs one additional array copy. I'm not sure :)

         sorted_topics = list(matutils.argsort(sort_alpha))
         chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
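For anyone weighing the trade-off in that thread, the two variants look like this in isolation (my sketch, not code from the PR); since sort_alpha is only fed to argsort, the dtype of the jittered copy does not change the result either way.

    import numpy as np

    random_state = np.random.RandomState(0)
    alpha = np.full(10, 0.1, dtype=np.float32)

    mixed = alpha + 0.0001 * random_state.rand(len(alpha))                    # promoted to float64
    cast = alpha + 0.0001 * random_state.rand(len(alpha)).astype(np.float32)  # stays float32, one extra copy
    print(mixed.dtype, cast.dtype)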
@@ -856,7 +894,7 @@ def show_topic(self, topicid, topn=10):
     def get_topics(self):
         """
         Returns:
-            np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
+            np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
                 the term topic matrix learned during inference.
         """
         topics = self.state.get_lambda()
@@ -1028,6 +1066,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
         >>> print(mdiff)  # get matrix with difference for each topic pair from `m1` and `m2`
         >>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`

+        Note: this ignores difference in model dtypes
         """

         distances = {
@@ -1186,9 +1225,14 @@ def load(cls, fname, *args, **kwargs):
             result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
             logging.warning("random_state not set so using default value")

+        # the same goes for dtype (except it was added later)
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was used before as default in numpy
+            logging.warning("dtype was not set, so using np.float64")

Review comment: A more concrete message please. When reading this warning, users will be left scratching their heads: set where? Why? What does this mean to me? How about … Question: isn't it better to infer the dtype from the loaded object? Can it ever happen that it's something else, not …
Reply: Fixed the message, and decided the info level suits better. About inferring: not clear how to do it. Infer from … Anyway, let's imagine a situation where some of …

         state_fname = utils.smart_extension(fname, '.state')
         try:
-            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
+            result.state = LdaState.load(state_fname, *args, **kwargs)
         except Exception as e:
             logging.warning("failed to load state from %s: %s", state_fname, e)
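One way the inference the reviewer asks about might look, sketched outside gensim with made-up names: peek at an array the unpickled object already carries instead of hard-coding float64. The author's objection still applies, though: nothing guarantees that every array in an old model shares a single dtype.

    import numpy as np

    class OldStyleState(object):  # stand-in for an object loaded from an old pickle
        def __init__(self):
            self.sstats = np.zeros((2, 3))  # old models stored plain float64 arrays

    result = OldStyleState()
    if not hasattr(result, 'dtype'):
        # infer from an existing array when possible, fall back to float64 otherwise
        result.dtype = result.sstats.dtype if hasattr(result, 'sstats') else np.float64
    print(result.dtype)  # float64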
@@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
         if initialize == 'gensim':
             lda_model = ldamodel.LdaModel(
                 corpus, id2word=self.id2word, num_topics=self.num_topics,
-                passes=passes, alpha=self.alphas, random_state=random_state
+                passes=passes, alpha=self.alphas, random_state=random_state,
+                dtype=np.float64
             )

Review comment: Maybe it will be a good idea to change default behaviour (to float32)?
Reply: Not now, …

             self.sstats = np.transpose(lda_model.state.sstats)
         if initialize == 'ldamodel':
@@ -244,7 +245,8 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
         vocab_len = self.vocab_len
         bound = 0.0

-        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word,
+                                dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
         ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)
@@ -419,7 +421,8 @@ def __getitem__(self, doc):
         """
         Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
         """
-        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word,
+                                      dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
         ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)
Review comment: Please return astype, because
    np.float32 -> np.float32
    np.float64 -> np.float64
but
    np.float16 -> np.float32

Reply: Oh, my bad, you're right! Then the tests that I added in a separate file aren't needed.
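My reading of this last exchange, checked with a standalone snippet: the expectation is computed with scipy's psi (digamma), whose ufunc only has float32 and float64 loops, so a float16 input silently comes back as float32 unless the result is cast back with astype.

    import numpy as np
    from scipy.special import psi

    for dt in (np.float32, np.float64, np.float16):
        alpha = np.ones(4, dtype=dt)
        elog = psi(alpha) - psi(np.sum(alpha))  # Dirichlet expectation of a 1-d alpha
        print(dt.__name__, '->', elog.dtype, '->', elog.astype(alpha.dtype).dtype)
    # float32 -> float32 -> float32
    # float64 -> float64 -> float64
    # float16 -> float32 -> float16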