[WIP] Topic model visualization #1616

Closed · wants to merge 8 commits
135 changes: 135 additions & 0 deletions gensim/visualization/gensim_wrap.py
@@ -0,0 +1,135 @@
"""
pyLDAvis Gensim
===============
Helper functions to visualize LDA models trained by Gensim
"""

from __future__ import absolute_import

import funcy as fp
import numpy as np
from scipy.sparse import issparse

from . import prepare as vis_prepare


def _extract_data(topic_model, corpus, dictionary, texts=None, doc_topic_dists=None):
    import gensim

    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    # corpus_csc is terms x docs; transpose to docs x terms and row-normalise
    # to get per-document word distributions
    doc_word_dists = np.asarray(corpus_csc.T.todense(), dtype=np.float64)
    doc_word_dists = doc_word_dists / doc_word_dists.sum(axis=1)[:, None]

    vocab = list(dictionary.token2id.keys())
    # TODO: smooth with the model's beta hyperparameter? The online LDA
    # implementation exposes no beta here, so for now just make sure we
    # never get zeros...
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), \
        'Term frequencies and dictionary have different shape {} != {}'.format(
            term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # HDP models expose lda_beta and their inference() returns gamma directly
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = np.asarray(doc_topic_dists, dtype=np.float64)
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)[:, None]

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # per-word topic weights taken from the model's expElogbeta matrix
    term_topic_dists = []
    for word_id in list(dictionary.token2id.values()):
        values = []
        for topic_id in range(num_topics):
            values.append((topic_id, topic_model.expElogbeta[topic_id][word_id]))
        term_topic_dists.append(values)
    term_topic_dists = gensim.matutils.corpus2dense(term_topic_dists, num_topics).T
    term_topic_dists = term_topic_dists / term_topic_dists.sum(axis=1)[:, None]
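    # term_topic_dists (rows = words, from expElogbeta) feeds the
    # 'word_topic_dists' key; the lambda-based matrix built below
    # (rows = topics) feeds 'topic_word_dists'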

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    # convert the tokenised documents to plain strings for display
    if texts is not None:
        texts = [' '.join(doc) for doc in texts]

    return {'doc_topic_dists': doc_topic_dists, 'doc_word_dists': doc_word_dists,
            'topic_word_dists': topic_term_dists, 'word_topic_dists': term_topic_dists,
            'doc_tag': range(doc_topic_dists.shape[0]), 'doc_texts': texts,
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
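
# Shapes of the arrays returned above (a summary; n_docs documents,
# n_topics topics and n_terms vocabulary terms are assumed):
#   doc_topic_dists:  (n_docs, n_topics),  rows sum to 1
#   doc_word_dists:   (n_docs, n_terms),   rows sum to 1
#   topic_word_dists: (n_topics, n_terms), rows sum to 1
#   word_topic_dists: (n_terms, n_topics), rows sum to 1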

def prepare(topic_model, corpus, dictionary, texts=None, doc_topic_dist=None, **kwargs):
    """Transforms the Gensim topic model and related corpus and dictionary into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_model : gensim.models.ldamodel.LdaModel
        An already trained Gensim LdaModel. The other gensim model types are
        not supported (PRs welcome).

    corpus : array-like list of bag-of-words docs in tuple form, or scipy CSC matrix
        The corpus in bag-of-words form, the same docs used to train the model.
        The corpus is transformed into a CSC matrix internally; if you intend to
        call prepare multiple times, it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the CSC matrix instead.

        For example: [(50, 3), (63, 5), ....]

    dictionary : gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).

    texts : list of list of str, optional
        The tokenised documents; each one is joined into a single string for
        display in the visualization.

    doc_topic_dist : array-like, optional (default=None)
        The document-topic distribution that is eventually visualised. If you
        will be calling `prepare` multiple times, it's a good idea to explicitly
        pass in `doc_topic_dist`, as inferring it for large corpora can be quite
        expensive.

    **kwargs :
        Additional keyword arguments are passed through to
        :func:`gensim.visualization.prepare.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Example
    -------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb

    See Also
    --------
    See :func:`gensim.visualization.prepare.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(topic_model, corpus, dictionary, texts, doc_topic_dist), kwargs)
    return vis_prepare(**opts)
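
A minimal end-to-end sketch of the intended call pattern, assuming the module is importable as gensim.visualization.gensim_wrap (inferred from the diff paths); the corpus and model below are toy stand-ins:

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel
    from gensim.visualization import gensim_wrap  # assumed import path

    docs = [['human', 'computer', 'interaction'],
            ['graph', 'trees', 'minors'],
            ['graph', 'minors', 'survey']]
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

    prepared = gensim_wrap.prepare(lda, corpus, dictionary, texts=docs)
    json_payload = prepared.to_json()  # JSON for the client-side visualization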
198 changes: 198 additions & 0 deletions gensim/visualization/prepare.py
@@ -0,0 +1,198 @@
# code modified from https://github.com/bmabey/pyLDAvis

from __future__ import absolute_import
from past.builtins import basestring
from collections import namedtuple
import json
import logging
import numpy as np
import pandas as pd
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
from .utils import NumPyEncoder
try:
from sklearn.manifold import MDS, TSNE
sklearn_present = True
except ImportError:
sklearn_present = False



def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
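
A quick self-contained sanity sketch (toy values): with scipy's natural-log entropy, the Jensen-Shannon divergence computed above is symmetric and bounded by ln 2.

    import numpy as np
    from scipy.stats import entropy

    p = np.array([0.1, 0.4, 0.5])
    q = np.array([0.6, 0.2, 0.2])
    m = 0.5 * (p + q)
    js = 0.5 * (entropy(p, m) + entropy(q, m))
    assert np.isclose(js, 0.5 * (entropy(q, m) + entropy(p, m)))  # symmetric
    assert 0.0 <= js <= np.log(2)                                 # bounded in nats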


def _pcoa(pair_dists, n_components=2):
    """Principal Coordinate Analysis,
    aka Classical Multidimensional Scaling
    """
    # code referenced from skbio.stats.ordination.pcoa
    # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # eigendecompose the double-centred (Gower) matrix; B is symmetric,
    # so eigh returns real eigenvalues and eigenvectors
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = - H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eigh(B)

    # take the first n_components eigenvalues and eigenvectors,
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors
    # with zeroes; at least one eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs
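
A small illustration of the recovery property, calling the module-private _pcoa directly (toy distances for three collinear points at 0, 1 and 2):

    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    D = np.array([[0., 1., 2.],
                  [1., 0., 1.],
                  [2., 1., 0.]])
    coords = _pcoa(D)                      # shape (3, 2)
    recovered = squareform(pdist(coords))
    assert np.allclose(recovered, D)       # Euclidean distances are reproduced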


def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis
    (aka Classical Multidimensional Scaling)

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    return _pcoa(dist_matrix)

def js_MMDS(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & Metric Multidimensional Scaling

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs : Keyword arguments passed to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)

def js_TSNE(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs : Keyword arguments passed to `sklearn.manifold.TSNE()`

    Returns
    -------
    tsne : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = TSNE(n_components=2, random_state=0, metric='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)
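
All three reducers share one contract: a matrix of row distributions in, 2-D coordinates out. A sketch on random data (illustrative only):

    import numpy as np

    rng = np.random.RandomState(0)
    dists = rng.dirichlet(alpha=np.ones(10), size=25)  # 25 distributions over 10 bins
    xy = js_PCoA(dists)
    assert xy.shape == (25, 2)
    # js_MMDS(dists) and js_TSNE(dists) follow the same contract but need sklearn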


def _doc_coordinates(mds, doc_topic_dists, doc_tag, doc_texts):
    K = doc_topic_dists.shape[0]
    mds_res = mds(doc_topic_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'docs': doc_tag, 'doc_texts': doc_texts})
    return mds_df

def _topic_coordinates(mds, topic_word_dists, topic_proportion):
    K = topic_word_dists.shape[0]
    mds_res = mds(topic_word_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'topics': range(1, K + 1), 'Freq': topic_proportion * 100})
    return mds_df

def _word_coordinates(mds, word_topic_dists, vocab, word_proportion):
    K = word_topic_dists.shape[0]
    mds_res = mds(word_topic_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'vocab': vocab, 'Freq': word_proportion * 100})
    return mds_df


def _info(dists, fst, scnd):
    # normalise by the grand total, then melt the matrix into long format
    dists = dists / dists.sum()
    pd_data = pd.DataFrame(dists)
    pd_data = pd_data.stack().reset_index().rename(
        columns={'level_0': fst, 'level_1': scnd, 0: 'Freq'})
    return pd_data
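
To make the long format concrete, a tiny illustration of _info on a 2x3 matrix (toy values):

    import numpy as np

    demo = np.array([[2., 1., 1.],
                     [0., 3., 1.]])
    long_df = _info(demo, 'Doc', 'Topic')
    # long_df has columns ['Doc', 'Topic', 'Freq'], one row per cell;
    # its first row is Doc=0, Topic=0, Freq=2/8 = 0.25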


def prepare(doc_topic_dists, doc_word_dists, topic_word_dists, word_topic_dists,
            vocab, doc_tag, doc_texts, doc_lengths, term_frequency=None, mds=js_PCoA):
    """Transforms the topic model distributions and related corpus data into
    the data structures needed for the visualization.
    """
    # term_frequency is currently unused; it is accepted so that the dict
    # returned by gensim_wrap._extract_data can be passed through unchanged

    # parse mds
    if isinstance(mds, basestring):
        mds = mds.lower()
        if mds == 'pcoa':
            mds = js_PCoA
        elif mds in ('mmds', 'tsne'):
            if sklearn_present:
                mds_opts = {'mmds': js_MMDS, 'tsne': js_TSNE}
                mds = mds_opts[mds]
            else:
                logging.warning('sklearn not present, falling back to PCoA')
                mds = js_PCoA
        else:
            logging.warning('Unknown mds `%s`, falling back to PCoA', mds)
            mds = js_PCoA

    vocab = pd.Series(vocab, name='vocab')
    doc_tag = pd.Series(doc_tag, name='docs')
    doc_texts = pd.Series(doc_texts, name='doc_texts')

    topic_freq = np.dot(doc_topic_dists.T, doc_lengths)
    topic_proportion = topic_freq / topic_freq.sum()

    word_freq = np.dot(topic_word_dists.T, topic_freq)
    word_proportion = word_freq / word_freq.sum()

    word_doc_dists = doc_word_dists.T
    topic_doc_dists = doc_topic_dists.T

    doc_topic_info = _info(doc_topic_dists, 'Doc', 'Topic')
    doc_word_info = _info(doc_word_dists, 'Doc', 'Word')
    topic_doc_info = _info(topic_doc_dists, 'Topic', 'Doc')
    topic_word_info = _info(topic_word_dists, 'Topic', 'Word')
    word_doc_info = _info(word_doc_dists, 'Word', 'Doc')
    word_topic_info = _info(word_topic_dists, 'Word', 'Topic')

    doc_coordinates = _doc_coordinates(mds, doc_topic_dists, doc_tag, doc_texts)
    topic_coordinates = _topic_coordinates(mds, topic_word_dists, topic_proportion)
    word_coordinates = _word_coordinates(mds, word_topic_dists, vocab, word_proportion)

    return PreparedData(doc_coordinates, topic_coordinates, word_coordinates,
                        doc_topic_info, doc_word_info, topic_doc_info,
                        topic_word_info, word_doc_info, word_topic_info)


class PreparedData(namedtuple('PreparedData',
                              ['doc_coordinates', 'topic_coordinates', 'word_coordinates',
                               'doc_topic_info', 'doc_word_info', 'topic_doc_info',
                               'topic_word_info', 'word_doc_info', 'word_topic_info'])):
    def to_dict(self):
        return {'doc_mds': self.doc_coordinates.to_dict(orient='list'),
                'topic_mds': self.topic_coordinates.to_dict(orient='list'),
                'word_mds': self.word_coordinates.to_dict(orient='list'),
                'doc_topic.info': self.doc_topic_info.to_dict(orient='list'),
                'doc_word.info': self.doc_word_info.to_dict(orient='list'),
                'topic_doc.info': self.topic_doc_info.to_dict(orient='list'),
                'topic_word.info': self.topic_word_info.to_dict(orient='list'),
                'word_doc.info': self.word_doc_info.to_dict(orient='list'),
                'word_topic.info': self.word_topic_info.to_dict(orient='list')}

    def to_json(self):
        return json.dumps(self.to_dict(), cls=NumPyEncoder)
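
A self-contained sketch of the full prepare pipeline on synthetic distributions (all values illustrative; the gensim.visualization.prepare import path is assumed from the diff):

    import numpy as np
    from gensim.visualization import prepare as vis  # assumed import path

    rng = np.random.RandomState(1)
    n_docs, n_topics, n_terms = 20, 4, 30
    doc_topic = rng.dirichlet(np.ones(n_topics), size=n_docs)
    topic_word = rng.dirichlet(np.ones(n_terms), size=n_topics)
    doc_word = doc_topic.dot(topic_word)  # rows already sum to 1
    word_topic = topic_word.T / topic_word.T.sum(axis=1)[:, None]
    vocab = ['w%d' % i for i in range(n_terms)]
    doc_texts = ['doc %d' % i for i in range(n_docs)]
    doc_lengths = rng.randint(50, 200, size=n_docs)

    prepared = vis.prepare(doc_topic, doc_word, topic_word, word_topic,
                           vocab, range(n_docs), doc_texts, doc_lengths, mds='pcoa')
    print(prepared.to_json()[:80])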