[WIP] Topic model visualization #1616

Closed · wants to merge 8 commits
135 changes: 135 additions & 0 deletions gensim/visualization/gensim_wrap.py
@@ -0,0 +1,135 @@
"""
pyLDAvis Gensim
===============
Helper functions to visualize LDA models trained by Gensim
"""

from __future__ import absolute_import

import funcy as fp
import numpy as np
from scipy.sparse import issparse

from . import prepare as vis_prepare


def _extract_data(topic_model, corpus, dictionary, texts=None, doc_topic_dists=None):
    import gensim

    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    # corpus_csc is terms x docs; transpose to docs x terms and row-normalise
    # to get per-document word distributions
    doc_word_dists = np.asarray(corpus_csc.T.todense(), dtype=np.float64)
    doc_word_dists = doc_word_dists / doc_word_dists.sum(axis=1)[:, None]

    vocab = list(dictionary.token2id.keys())
    # TODO: smooth with the model's beta hyperparameter? The online LDA
    # implementation exposes no beta here, so for now just make sure we
    # never get zeros...
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), \
        'Term frequencies and dictionary have different shape {} != {}'.format(
            term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # HDP models expose lda_beta and their inference() returns gamma directly
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = np.asarray(doc_topic_dists, dtype=np.float64)
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)[:, None]

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # per-word topic weights taken from the model's expElogbeta matrix
    term_topic_dists = []
    for word_id in list(dictionary.token2id.values()):
        values = []
        for topic_id in range(num_topics):
            values.append((topic_id, topic_model.expElogbeta[topic_id][word_id]))
        term_topic_dists.append(values)
    term_topic_dists = gensim.matutils.corpus2dense(term_topic_dists, num_topics).T
    term_topic_dists = term_topic_dists / term_topic_dists.sum(axis=1)[:, None]
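    # term_topic_dists (rows = words, from expElogbeta) feeds the
    # 'word_topic_dists' key; the lambda-based matrix built below
    # (rows = topics) feeds 'topic_word_dists'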

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]

    # convert the tokenised documents to plain strings for display
    if texts is not None:
        texts = [' '.join(doc) for doc in texts]

    return {'doc_topic_dists': doc_topic_dists, 'doc_word_dists': doc_word_dists,
            'topic_word_dists': topic_term_dists, 'word_topic_dists': term_topic_dists,
            'doc_tag': range(doc_topic_dists.shape[0]), 'doc_texts': texts,
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}
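
# Shapes of the arrays returned above (a summary; n_docs documents,
# n_topics topics and n_terms vocabulary terms are assumed):
#   doc_topic_dists:  (n_docs, n_topics),  rows sum to 1
#   doc_word_dists:   (n_docs, n_terms),   rows sum to 1
#   topic_word_dists: (n_topics, n_terms), rows sum to 1
#   word_topic_dists: (n_terms, n_topics), rows sum to 1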

def prepare(topic_model, corpus, dictionary, texts=None, doc_topic_dist=None, **kwargs):
    """Transforms the Gensim topic model and related corpus and dictionary into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_model : gensim.models.ldamodel.LdaModel
        An already trained Gensim LdaModel. The other gensim model types are
        not supported (PRs welcome).

    corpus : array-like list of bag-of-words docs in tuple form, or scipy CSC matrix
        The corpus in bag-of-words form, the same docs used to train the model.
        The corpus is transformed into a CSC matrix internally; if you intend to
        call prepare multiple times, it is a good idea to first call
        `gensim.matutils.corpus2csc(corpus)` and pass in the CSC matrix instead.

        For example: [(50, 3), (63, 5), ....]

    dictionary : gensim.corpora.Dictionary
        The dictionary object used to create the corpus. Needed to extract the
        actual terms (not ids).

    texts : list of list of str, optional
        The tokenised documents; each one is joined into a single string for
        display in the visualization.

    doc_topic_dist : array-like, optional (default=None)
        The document-topic distribution that is eventually visualised. If you
        will be calling `prepare` multiple times, it's a good idea to explicitly
        pass in `doc_topic_dist`, as inferring it for large corpora can be quite
        expensive.

    **kwargs :
        Additional keyword arguments are passed through to
        :func:`gensim.visualization.prepare.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        The data structures used in the visualization.

    Example
    -------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/Gensim%20Newsgroup.ipynb

    See Also
    --------
    See :func:`gensim.visualization.prepare.prepare` for **kwargs.
    """
    opts = fp.merge(_extract_data(topic_model, corpus, dictionary, texts, doc_topic_dist), kwargs)
    return vis_prepare(**opts)
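
A minimal end-to-end sketch of the intended call pattern, assuming the module is importable as gensim.visualization.gensim_wrap (inferred from the diff paths); the corpus and model below are toy stand-ins:

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel
    from gensim.visualization import gensim_wrap  # assumed import path

    docs = [['human', 'computer', 'interaction'],
            ['graph', 'trees', 'minors'],
            ['graph', 'minors', 'survey']]
    dictionary = Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10)

    prepared = gensim_wrap.prepare(lda, corpus, dictionary, texts=docs)
    json_payload = prepared.to_json()  # JSON for the client-side visualization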
198 changes: 198 additions & 0 deletions gensim/visualization/prepare.py
@@ -0,0 +1,198 @@
# code modified from https://github.com/bmabey/pyLDAvis

from __future__ import absolute_import
from past.builtins import basestring
from collections import namedtuple
import json
import logging
import numpy as np
import pandas as pd
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
from .utils import NumPyEncoder
try:
from sklearn.manifold import MDS, TSNE
sklearn_present = True
except ImportError:
sklearn_present = False



def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
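
A quick self-contained sanity sketch (toy values): with scipy's natural-log entropy, the Jensen-Shannon divergence computed above is symmetric and bounded by ln 2.

    import numpy as np
    from scipy.stats import entropy

    p = np.array([0.1, 0.4, 0.5])
    q = np.array([0.6, 0.2, 0.2])
    m = 0.5 * (p + q)
    js = 0.5 * (entropy(p, m) + entropy(q, m))
    assert np.isclose(js, 0.5 * (entropy(q, m) + entropy(p, m)))  # symmetric
    assert 0.0 <= js <= np.log(2)                                 # bounded in nats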


def _pcoa(pair_dists, n_components=2):
    """Principal Coordinate Analysis,
    aka Classical Multidimensional Scaling
    """
    # code referenced from skbio.stats.ordination.pcoa
    # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # eigendecompose the double-centred (Gower) matrix; B is symmetric,
    # so eigh returns real eigenvalues and eigenvectors
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = - H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eigh(B)

    # take the first n_components eigenvalues and eigenvectors,
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors
    # with zeroes; at least one eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs
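
A small illustration of the recovery property, calling the module-private _pcoa directly (toy distances for three collinear points at 0, 1 and 2):

    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    D = np.array([[0., 1., 2.],
                  [1., 0., 1.],
                  [2., 1., 0.]])
    coords = _pcoa(D)                      # shape (3, 2)
    recovered = squareform(pdist(coords))
    assert np.allclose(recovered, D)       # Euclidean distances are reproduced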


def js_PCoA(distributions):
    """Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis
    (aka Classical Multidimensional Scaling)

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    return _pcoa(dist_matrix)

def js_MMDS(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & Metric Multidimensional Scaling

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs : Keyword arguments passed to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)

def js_TSNE(distributions, **kwargs):
    """Dimension reduction via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distribution probabilities.
    **kwargs : Keyword arguments passed to `sklearn.manifold.TSNE()`

    Returns
    -------
    tsne : array, shape (`n_dists`, 2)
    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = TSNE(n_components=2, random_state=0, metric='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)
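
All three reducers share one contract: a matrix of row distributions in, 2-D coordinates out. A sketch on random data (illustrative only):

    import numpy as np

    rng = np.random.RandomState(0)
    dists = rng.dirichlet(alpha=np.ones(10), size=25)  # 25 distributions over 10 bins
    xy = js_PCoA(dists)
    assert xy.shape == (25, 2)
    # js_MMDS(dists) and js_TSNE(dists) follow the same contract but need sklearn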


def _doc_coordinates(mds, doc_topic_dists, doc_tag, doc_texts):
    K = doc_topic_dists.shape[0]
    mds_res = mds(doc_topic_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'docs': doc_tag, 'doc_texts': doc_texts})
    return mds_df

def _topic_coordinates(mds, topic_word_dists, topic_proportion):
    K = topic_word_dists.shape[0]
    mds_res = mds(topic_word_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'topics': range(1, K + 1), 'Freq': topic_proportion * 100})
    return mds_df

def _word_coordinates(mds, word_topic_dists, vocab, word_proportion):
    K = word_topic_dists.shape[0]
    mds_res = mds(word_topic_dists)
    assert mds_res.shape == (K, 2)
    mds_df = pd.DataFrame({'x': mds_res[:, 0], 'y': mds_res[:, 1],
                           'vocab': vocab, 'Freq': word_proportion * 100})
    return mds_df


def _info(dists, fst, scnd):
    # normalise by the grand total, then melt the matrix into long format
    dists = dists / dists.sum()
    pd_data = pd.DataFrame(dists)
    pd_data = pd_data.stack().reset_index().rename(
        columns={'level_0': fst, 'level_1': scnd, 0: 'Freq'})
    return pd_data
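
To make the long format concrete, a tiny illustration of _info on a 2x3 matrix (toy values):

    import numpy as np

    demo = np.array([[2., 1., 1.],
                     [0., 3., 1.]])
    long_df = _info(demo, 'Doc', 'Topic')
    # long_df has columns ['Doc', 'Topic', 'Freq'], one row per cell;
    # its first row is Doc=0, Topic=0, Freq=2/8 = 0.25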


def prepare(doc_topic_dists, doc_word_dists, topic_word_dists, word_topic_dists,
            vocab, doc_tag, doc_texts, doc_lengths, term_frequency=None, mds=js_PCoA):
    """Transforms the topic model distributions and related corpus data into
    the data structures needed for the visualization.
    """
    # term_frequency is currently unused; it is accepted so that the dict
    # returned by gensim_wrap._extract_data can be passed through unchanged

    # parse mds
    if isinstance(mds, basestring):
        mds = mds.lower()
        if mds == 'pcoa':
            mds = js_PCoA
        elif mds in ('mmds', 'tsne'):
            if sklearn_present:
                mds_opts = {'mmds': js_MMDS, 'tsne': js_TSNE}
                mds = mds_opts[mds]
            else:
                logging.warning('sklearn not present, falling back to PCoA')
                mds = js_PCoA
        else:
            logging.warning('Unknown mds `%s`, falling back to PCoA', mds)
            mds = js_PCoA

    vocab = pd.Series(vocab, name='vocab')
    doc_tag = pd.Series(doc_tag, name='docs')
    doc_texts = pd.Series(doc_texts, name='doc_texts')

    topic_freq = np.dot(doc_topic_dists.T, doc_lengths)
    topic_proportion = topic_freq / topic_freq.sum()

    word_freq = np.dot(topic_word_dists.T, topic_freq)
    word_proportion = word_freq / word_freq.sum()

    word_doc_dists = doc_word_dists.T
    topic_doc_dists = doc_topic_dists.T

    doc_topic_info = _info(doc_topic_dists, 'Doc', 'Topic')
    doc_word_info = _info(doc_word_dists, 'Doc', 'Word')
    topic_doc_info = _info(topic_doc_dists, 'Topic', 'Doc')
    topic_word_info = _info(topic_word_dists, 'Topic', 'Word')
    word_doc_info = _info(word_doc_dists, 'Word', 'Doc')
    word_topic_info = _info(word_topic_dists, 'Word', 'Topic')

    doc_coordinates = _doc_coordinates(mds, doc_topic_dists, doc_tag, doc_texts)
    topic_coordinates = _topic_coordinates(mds, topic_word_dists, topic_proportion)
    word_coordinates = _word_coordinates(mds, word_topic_dists, vocab, word_proportion)

    return PreparedData(doc_coordinates, topic_coordinates, word_coordinates,
                        doc_topic_info, doc_word_info, topic_doc_info,
                        topic_word_info, word_doc_info, word_topic_info)


class PreparedData(namedtuple('PreparedData',
                              ['doc_coordinates', 'topic_coordinates', 'word_coordinates',
                               'doc_topic_info', 'doc_word_info', 'topic_doc_info',
                               'topic_word_info', 'word_doc_info', 'word_topic_info'])):
    def to_dict(self):
        return {'doc_mds': self.doc_coordinates.to_dict(orient='list'),
                'topic_mds': self.topic_coordinates.to_dict(orient='list'),
                'word_mds': self.word_coordinates.to_dict(orient='list'),
                'doc_topic.info': self.doc_topic_info.to_dict(orient='list'),
                'doc_word.info': self.doc_word_info.to_dict(orient='list'),
                'topic_doc.info': self.topic_doc_info.to_dict(orient='list'),
                'topic_word.info': self.topic_word_info.to_dict(orient='list'),
                'word_doc.info': self.word_doc_info.to_dict(orient='list'),
                'word_topic.info': self.word_topic_info.to_dict(orient='list')}

    def to_json(self):
        return json.dumps(self.to_dict(), cls=NumPyEncoder)
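
A self-contained sketch of the full prepare pipeline on synthetic distributions (all values illustrative; the gensim.visualization.prepare import path is assumed from the diff):

    import numpy as np
    from gensim.visualization import prepare as vis  # assumed import path

    rng = np.random.RandomState(1)
    n_docs, n_topics, n_terms = 20, 4, 30
    doc_topic = rng.dirichlet(np.ones(n_topics), size=n_docs)
    topic_word = rng.dirichlet(np.ones(n_terms), size=n_topics)
    doc_word = doc_topic.dot(topic_word)  # rows already sum to 1
    word_topic = topic_word.T / topic_word.T.sum(axis=1)[:, None]
    vocab = ['w%d' % i for i in range(n_terms)]
    doc_texts = ['doc %d' % i for i in range(n_docs)]
    doc_lengths = rng.randint(50, 200, size=n_docs)

    prepared = vis.prepare(doc_topic, doc_word, topic_word, word_topic,
                           vocab, range(n_docs), doc_texts, doc_lengths, mds='pcoa')
    print(prepared.to_json()[:80])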