Adding dtype to LDAModel to speed it up #1656
@@ -538,7 +538,8 @@ def suggested_lda_model(self):
         The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta.
         """
         alpha, beta = self.hdp_to_lda()
-        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
+        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word,
+                                 random_state=self.random_state, dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         ldam.expElogbeta[:] = beta
         return ldam
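For readers unfamiliar with the style rule being cited here, below is a toy illustration (not taken from the PR) of the two continuation styles; as I understand the project's guideline, the hanging-indent form is the preferred one.

    # Toy example only; the function and its arguments are made up.
    def make_point(x, y, z, label):
        return (x, y, z, label)

    # "Vertical indent": continuation lines aligned with the opening parenthesis.
    p1 = make_point(1.0, 2.0, 3.0,
                    label="aligned with the open paren")

    # Hanging indent: the continuation simply gets one extra indentation level.
    p2 = make_point(
        1.0, 2.0, 3.0,
        label="one extra indent level",
    )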
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

    """

-    def __init__(self, eta, shape):
-        self.eta = eta
-        self.sstats = np.zeros(shape)
+    def __init__(self, eta, shape, dtype=np.float32):
+        self.eta = eta.astype(dtype, copy=False)
+        self.sstats = np.zeros(shape, dtype)

Review comment: Using positional arguments can lead to subtle bugs with numpy. Better use explicit names for keyword parameters: …
Reply: fixed

         self.numdocs = 0
+        self.dtype = dtype

     def reset(self):
         """
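To make the reviewer's point concrete, here is a minimal standalone sketch (mine, not from the PR) of the positional versus keyword form of the call in question:

    import numpy as np

    # np.zeros happens to take dtype as its second positional parameter, so the
    # positional form works, but spelling the name out removes any ambiguity.
    shape, dtype = (2, 3), np.float32

    sstats_positional = np.zeros(shape, dtype)      # relies on argument order
    sstats_keyword = np.zeros(shape, dtype=dtype)   # explicit, as requested in review
    assert sstats_keyword.dtype == np.float32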
@@ -165,6 +166,18 @@ def get_lambda(self):

     def get_Elogbeta(self):
         return dirichlet_expectation(self.get_lambda())

+    @classmethod
+    def load(cls, fname, *args, **kwargs):
+        result = super(LdaState, cls).load(fname, *args, **kwargs)
+
+        # Check if `dtype` is set after main pickle load
+        # if not, then it's an old model and we should set it to default `np.float64`
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was used before as default in numpy
+            logging.warning("dtype was not set in LdaState, so using np.float64")
+
+        return result

Review comment: Old LDA used float64, really?
Reply: Pretty much everything is using …

 # endclass LdaState
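The author's reply is easy to verify: numpy's array constructors and random generators produce float64 unless told otherwise, which is why an old pickle with no explicit dtype attribute can safely be assumed to hold float64 arrays. A quick standalone check:

    import numpy as np

    print(np.zeros((2, 2)).dtype)                                      # float64
    print(np.asarray([1.0 / 3, 2.0 / 3]).dtype)                        # float64
    print(np.random.RandomState(0).gamma(100., 0.01, (2, 2)).dtype)    # float64
    print(np.zeros((2, 2), dtype=np.float32).dtype)                    # float32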
@@ -191,7 +204,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
                  alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
                  iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
                  random_state=None, ns_conf=None, minimum_phi_value=0.01,
-                 per_word_topics=False, callbacks=None):
+                 per_word_topics=False, callbacks=None, dtype=np.float32):
         """
         If given, start training from the iterable `corpus` straight away. If not given,
         the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +246,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

         `minimum_probability` controls filtering the topics returned for a document (bow).

-        `random_state` can be a np.random.RandomState object or the seed for one
+        `random_state` can be a np.random.RandomState object or the seed for one.

-        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.
+        `callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
+
+        `dtype` is data-type to use during calculations inside model. All inputs are also converted to this dtype.

         Example:
@@ -247,6 +262,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
         >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

         """
+        self.dtype = dtype

         # store user-supplied parameters
         self.id2word = id2word
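Putting the new parameter together, a minimal usage sketch (assuming a gensim build that includes this branch) could look like the following; the corpus here is a throwaway toy.

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import ldamodel

    texts = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train in single precision; before this change the model always used float64.
    lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=2, dtype=np.float32)
    print(lda.expElogbeta.dtype)  # float32 with this branch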
@@ -330,10 +346,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
             raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

         # Initialize the variational distribution q(beta|lambda)
-        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
-        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
+        self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
+        self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
         self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

+        # Check that we haven't accidentally fall back to np.float64
+        assert self.eta.dtype == self.dtype
+        assert self.expElogbeta.dtype == self.dtype
+
         # if a training corpus was provided, start estimating the model right away
         if corpus is not None:
             use_numpy = self.dispatcher is not None
@@ -354,25 +374,25 @@ def init_dir_prior(self, prior, name):

         if isinstance(prior, six.string_types):
             if prior == 'symmetric':
-                logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
             elif prior == 'asymmetric':
-                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
                 init_prior /= init_prior.sum()
                 logger.info("using asymmetric %s %s", name, list(init_prior))
             elif prior == 'auto':
                 is_auto = True
-                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
+                init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
                 if name == 'alpha':
                     logger.info("using autotuned %s, starting with %s", name, list(init_prior))
             else:
                 raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
         elif isinstance(prior, list):
-            init_prior = np.asarray(prior)
+            init_prior = np.asarray(prior, dtype=self.dtype)
         elif isinstance(prior, np.ndarray):
-            init_prior = prior
+            init_prior = prior.astype(self.dtype, copy=False)
         elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
-            init_prior = np.asarray([prior] * prior_shape)
+            init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
         else:
             raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
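As a side note on the 'asymmetric' branch above, here is a toy re-creation (not gensim's own function) showing what the prior looks like and how the explicit dtype keeps it in single precision:

    import numpy as np

    def asymmetric_prior(num_topics, dtype=np.float32):
        # topic i gets weight 1 / (i + sqrt(num_topics)), then the vector is normalised
        init_prior = np.asarray(
            [1.0 / (i + np.sqrt(num_topics)) for i in range(num_topics)], dtype=dtype
        )
        init_prior /= init_prior.sum()
        return init_prior

    prior = asymmetric_prior(4)
    print(prior, prior.dtype, prior.sum())  # decreasing weights, float32, ~1.0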
@@ -385,6 +405,7 @@ def __str__(self):

     def sync_state(self):
         self.expElogbeta = np.exp(self.state.get_Elogbeta())
+        assert self.expElogbeta.dtype == self.dtype

     def clear(self):
         """Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +439,15 @@ def inference(self, chunk, collect_sstats=False):
         logger.debug("performing inference on a chunk of %i documents", len(chunk))

         # Initialize the variational distribution q(theta|gamma) for the chunk
-        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
+        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
         Elogtheta = dirichlet_expectation(gamma)
         expElogtheta = np.exp(Elogtheta)

+        assert Elogtheta.dtype == self.dtype
+        assert expElogtheta.dtype == self.dtype
+
         if collect_sstats:
-            sstats = np.zeros_like(self.expElogbeta)
+            sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
         else:
             sstats = None
         converged = 0
@@ -437,7 +462,7 @@ def inference(self, chunk, collect_sstats=False):
                 ids = [int(idx) for idx, _ in doc]
             else:
                 ids = [idx for idx, _ in doc]
-            cts = np.array([cnt for _, cnt in doc])
+            cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
             gammad = gamma[d, :]
             Elogthetad = Elogtheta[d, :]
             expElogthetad = expElogtheta[d, :]
@@ -464,6 +489,7 @@ def inference(self, chunk, collect_sstats=False):
                     converged += 1
                     break
             gamma[d, :] = gammad
+            assert gammad.dtype == self.dtype
             if collect_sstats:
                 # Contribution of document d to the expected sufficient
                 # statistics for the M step.
@@ -478,6 +504,9 @@ def inference(self, chunk, collect_sstats=False):
             # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
             # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
             sstats *= self.expElogbeta
+            assert sstats.dtype == self.dtype
+
+        assert gamma.dtype == self.dtype
         return gamma, sstats

     def do_estep(self, chunk, state=None):
@@ -491,6 +520,7 @@ def do_estep(self, chunk, state=None):
         gamma, sstats = self.inference(chunk, collect_sstats=True)
         state.sstats += sstats
         state.numdocs += gamma.shape[0]  # avoids calling len(chunk) on a generator
+        assert gamma.dtype == self.dtype
         return gamma

     def update_alpha(self, gammat, rho):
@@ -500,10 +530,12 @@ def update_alpha(self, gammat, rho):
         """
         N = float(len(gammat))
         logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
+        assert logphat.dtype == self.dtype

         self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
         logger.info("optimized alpha %s", list(self.alpha))

+        assert self.alpha.dtype == self.dtype
         return self.alpha

     def update_eta(self, lambdat, rho):
@@ -513,9 +545,11 @@ def update_eta(self, lambdat, rho):
         """
         N = float(lambdat.shape[0])
         logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
+        assert logphat.dtype == self.dtype

         self.eta = update_dir_prior(self.eta, N, logphat, rho)

+        assert self.eta.dtype == self.dtype
         return self.eta

     def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +681,7 @@ def rho():
         if self.dispatcher:
             logger.info('initializing %s workers', self.numworkers)
             self.dispatcher.reset(self.state)
         else:
-            other = LdaState(self.eta, self.state.sstats.shape)
+            other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
         dirty = False

         reallen = 0
@@ -691,7 +725,7 @@ def rho():
                 if self.dispatcher:
                     logger.info('initializing workers')
                     self.dispatcher.reset(self.state)
                 else:
-                    other = LdaState(self.eta, self.state.sstats.shape)
+                    other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
                 dirty = False
         # endfor single corpus iteration
@@ -772,6 +806,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
             gammad = gamma[d]
             Elogthetad = dirichlet_expectation(gammad)

+            assert gammad.dtype == self.dtype
+            assert Elogthetad.dtype == self.dtype
+
             # E[log p(doc | theta, beta)]
             score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
@@ -820,6 +857,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

         # add a little random jitter, to randomize results around the same alpha
         sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
+        # random_state.rand returns float64, but converting back to dtype won't speed up anything

Review comment: Maybe …
Reply: Consistency vs one additional array copy. I'm not sure :)

         sorted_topics = list(matutils.argsort(sort_alpha))
         chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
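For anyone weighing the trade-off in that thread, the two variants look like this in isolation (my sketch, not code from the PR); since sort_alpha is only fed to argsort, the dtype of the jittered copy does not change the result either way.

    import numpy as np

    random_state = np.random.RandomState(0)
    alpha = np.full(10, 0.1, dtype=np.float32)

    mixed = alpha + 0.0001 * random_state.rand(len(alpha))                    # promoted to float64
    cast = alpha + 0.0001 * random_state.rand(len(alpha)).astype(np.float32)  # stays float32, one extra copy
    print(mixed.dtype, cast.dtype)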
@@ -856,7 +894,7 @@ def show_topic(self, topicid, topn=10):
     def get_topics(self):
         """
         Returns:
-            np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
+            np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
                 the term topic matrix learned during inference.
         """
         topics = self.state.get_lambda()
@@ -1028,6 +1066,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
         >>> print(mdiff)  # get matrix with difference for each topic pair from `m1` and `m2`
         >>> print(annotation)  # get array with positive/negative words for each topic pair from `m1` and `m2`

+        Note: this ignores difference in model dtypes
         """

         distances = {
@@ -1186,9 +1225,14 @@ def load(cls, fname, *args, **kwargs):
             result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
             logging.warning("random_state not set so using default value")

+        # the same goes for dtype (except it was added later)
+        if not hasattr(result, 'dtype'):
+            result.dtype = np.float64  # float64 was used before as default in numpy
+            logging.warning("dtype was not set, so using np.float64")

Review comment: A more concrete message please. When reading this warning, users will be left scratching their heads: set where? Why? What does this mean to me? How about … Question: isn't it better to infer the dtype from the loaded object? Can it ever happen that it's something else, not …
Reply: Fixed the message, and decided the info level suits better. About inferring: not clear how to do it. Infer from … Anyway, let's imagine a situation where some of …

         state_fname = utils.smart_extension(fname, '.state')
         try:
-            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
+            result.state = LdaState.load(state_fname, *args, **kwargs)
         except Exception as e:
             logging.warning("failed to load state from %s: %s", state_fname, e)
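One way the inference the reviewer asks about might look, sketched outside gensim with made-up names: peek at an array the unpickled object already carries instead of hard-coding float64. The author's objection still applies, though: nothing guarantees that every array in an old model shares a single dtype.

    import numpy as np

    class OldStyleState(object):  # stand-in for an object loaded from an old pickle
        def __init__(self):
            self.sstats = np.zeros((2, 3))  # old models stored plain float64 arrays

    result = OldStyleState()
    if not hasattr(result, 'dtype'):
        # infer from an existing array when possible, fall back to float64 otherwise
        result.dtype = result.sstats.dtype if hasattr(result, 'sstats') else np.float64
    print(result.dtype)  # float64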
@@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
         if initialize == 'gensim':
             lda_model = ldamodel.LdaModel(
                 corpus, id2word=self.id2word, num_topics=self.num_topics,
-                passes=passes, alpha=self.alphas, random_state=random_state
+                passes=passes, alpha=self.alphas, random_state=random_state,
+                dtype=np.float64
             )

Review comment: Maybe it will be a good idea to change default behaviour (to float32)?
Reply: Not now, …

             self.sstats = np.transpose(lda_model.state.sstats)
         if initialize == 'ldamodel':
@@ -244,7 +245,8 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
         vocab_len = self.vocab_len
         bound = 0.0

-        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word,
+                                dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
         ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)
@@ -419,7 +421,8 @@ def __getitem__(self, doc):
         """
         Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
         """
-        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
+        lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word,
+                                      dtype=np.float64)

Review comment: Code style: no vertical indent.
Reply: fixed

         lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
         ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)
Review comment: Please return astype, because
    np.float32 -> np.float32
    np.float64 -> np.float64
but
    np.float16 -> np.float32

Reply: Oh, my bad, you're right! Then the tests that I added in a separate file aren't needed.
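My reading of this last exchange, checked with a standalone snippet: the expectation is computed with scipy's psi (digamma), whose ufunc only has float32 and float64 loops, so a float16 input silently comes back as float32 unless the result is cast back with astype.

    import numpy as np
    from scipy.special import psi

    for dt in (np.float32, np.float64, np.float16):
        alpha = np.ones(4, dtype=dt)
        elog = psi(alpha) - psi(np.sum(alpha))  # Dirichlet expectation of a 1-d alpha
        print(dt.__name__, '->', elog.dtype, '->', elog.astype(alpha.dtype).dtype)
    # float32 -> float32 -> float32
    # float64 -> float64 -> float64
    # float16 -> float32 -> float16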