Skip to content

Commit

Permalink
Fix #963: Passing all the params when calling lda.get_document_topic…
Browse files Browse the repository at this point in the history
…s for a corpus (#978)

* Fix PR #963

* Added entry in CHANGELOG.md

* Revert "Added entry in CHANGELOG.md"

This reverts commit 49dce4d.

* Added new entry in CHANGELOG.md

* Update test_ldamodel.py

* Update CHANGELOG.md

* Updated tutorial for get_document_topics

* Added parameters in ldamulticore constructor

* Added parameters for ldamulticore

* Update test_ldamodel.py

* Test added to check the filtering effect of minimum_probability and minimum_phi_value
  • Loading branch information
Partho Mandal authored and tmylk committed Nov 11, 2016
1 parent 951eebf commit 17a01cf
Show file tree
Hide file tree
Showing 6 changed files with 388 additions and 21 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
Changes
=======

0.13.4, 2016-10-25
* Passed all the params through the apply call in lda.get_document_topics(), test case to use the per_word_topics through the corpus in test_ldamodel (@parthoiiitm, [#978](https://github.com/RaRe-Technologies/gensim/pull/978))

0.13.3, 2016-10-20

* Add vocabulary expansion feature to word2vec. (@isohyt, [#900](https://github.com/RaRe-Technologies/gensim/pull/900))
Expand Down
329 changes: 316 additions & 13 deletions docs/notebooks/topic_methods.ipynb

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):


class TransformedCorpus(CorpusABC):
def __init__(self, obj, corpus, chunksize=None, **kwargs):
    """
    Wrap `corpus` so that iterating over the result yields `obj[doc]` for
    each document, optionally evaluated lazily in chunks of `chunksize`
    documents.

    Any extra keyword arguments (e.g. `per_word_topics`,
    `minimum_probability`, `minimum_phi_value`) are set as attributes on
    `obj`, so the transformation model picks them up when the corpus is
    actually iterated.
    """
    self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
    # Forward transformation-specific options onto the wrapped model object
    # (e.g. per_word_topics on an LdaModel) so they take effect at
    # iteration time rather than being lost in the lazy wrapper.
    for key, value in kwargs.items():
        setattr(self.obj, key, value)
    # NOTE(review): metadata is force-disabled for transformed corpora —
    # presumably because the transformation output carries no doc metadata.
    self.metadata = False

def __len__(self):
Expand Down Expand Up @@ -156,12 +158,12 @@ def __getitem__(self, vec):
raise NotImplementedError('cannot instantiate abstract base class')


def _apply(self, corpus, chunksize=None, **kwargs):
    """
    Apply the transformation to a whole corpus (as opposed to a single
    document) and return the result as another, lazily-evaluated corpus.

    Extra keyword arguments are forwarded to `TransformedCorpus`, which
    sets them as attributes on this model (e.g. `per_word_topics` and
    `minimum_probability` for `LdaModel.get_document_topics`).
    """
    return TransformedCorpus(self, corpus, chunksize, **kwargs)
#endclass TransformationABC


Expand Down
14 changes: 11 additions & 3 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
distributed=False, chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0,
eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01, random_state=None, ns_conf={}):
minimum_probability=0.01, random_state=None, ns_conf={},
minimum_phi_value=0.01, per_word_topics=False):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -297,6 +298,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.passes = passes
self.update_every = update_every
self.eval_every = eval_every
self.minimum_phi_value = minimum_phi_value
self.per_word_topics = per_word_topics

self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

Expand Down Expand Up @@ -916,7 +919,12 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N
# if the input vector is a corpus, return a transformed corpus
is_corpus, corpus = utils.is_corpus(bow)
if is_corpus:
return self._apply(corpus)
kwargs = dict(
per_word_topics = per_word_topics,
minimum_probability = minimum_probability,
minimum_phi_value = minimum_phi_value
)
return self._apply(corpus, **kwargs)

gamma, phis = self.inference([bow], collect_sstats=True)
topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution
Expand Down Expand Up @@ -977,7 +985,7 @@ def __getitem__(self, bow, eps=None):
Ignore topics with very low probability (below `eps`).
"""
return self.get_document_topics(bow, eps)
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)

def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
"""
Expand Down
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ class LdaMulticore(LdaModel):
def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001, random_state=None):
gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
minimum_phi_value=0.01, per_word_topics=False):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -144,7 +145,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state)
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability= minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)


def update(self, corpus, chunks_as_numpy=False):
Expand Down
49 changes: 49 additions & 0 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,55 @@ def testGetDocumentTopics(self):
self.assertTrue(isinstance(k, int))
self.assertTrue(isinstance(v, float))

#Test case to use the get_document_topic function for the corpus
all_topics = model.get_document_topics(self.corpus, per_word_topics=True)

self.assertEqual(model.state.numdocs, len(corpus))

for topic in all_topics:
self.assertTrue(isinstance(topic, tuple))
for k, v in topic[0]: # list of doc_topics
self.assertTrue(isinstance(k, int))
self.assertTrue(isinstance(v, float))

for w, topic_list in topic[1]: # list of word_topics
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(topic_list, list))

for w, phi_values in topic[2]: # list of word_phis
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(phi_values, list))

#Test case to check the filtering effect of minimum_probability and minimum_phi_value
doc_topic_count_na = 0
word_phi_count_na = 0

all_topics = model.get_document_topics(self.corpus, minimum_probability=0.8, minimum_phi_value=1.0, per_word_topics=True)

self.assertEqual(model.state.numdocs, len(corpus))

for topic in all_topics:
self.assertTrue(isinstance(topic, tuple))
for k, v in topic[0]: # list of doc_topics
self.assertTrue(isinstance(k, int))
self.assertTrue(isinstance(v, float))
if len(topic[0]) != 0:
doc_topic_count_na += 1

for w, topic_list in topic[1]: # list of word_topics
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(topic_list, list))

for w, phi_values in topic[2]: # list of word_phis
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(phi_values, list))
if len(phi_values) != 0:
word_phi_count_na += 1

self.assertTrue(model.state.numdocs > doc_topic_count_na)
self.assertTrue( sum([len(i) for i in corpus]) > word_phi_count_na)


doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus[1], per_word_topics=True)

for k, v in doc_topics:
Expand Down

0 comments on commit 17a01cf

Please sign in to comment.