Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PR #963: Pass all the params through the apply call in lda.get_document_topics #978

Merged
merged 11 commits into from
Nov 11, 2016
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
Changes
=======

0.13.4, 2016-10-25
* Passed all the params through the apply call in lda.get_document_topics(); added a test case in test_ldamodel exercising per_word_topics on a whole corpus (@parthoiiitm, [#978](https://github.com/RaRe-Technologies/gensim/pull/978))

0.13.3, 2016-10-20

* Add vocabulary expansion feature to word2vec. (@isohyt, [#900](https://github.com/RaRe-Technologies/gensim/pull/900))
Expand Down
329 changes: 316 additions & 13 deletions docs/notebooks/topic_methods.ipynb

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):


class TransformedCorpus(CorpusABC):
    def __init__(self, obj, corpus, chunksize=None, **kwargs):
        """
        Wrap `corpus` so that documents are transformed lazily by the model `obj`.

        Any extra keyword arguments (e.g. `per_word_topics`,
        `minimum_probability`) are forwarded onto `obj` as attributes, so they
        take effect when documents are transformed on the fly.
        """
        self.obj = obj
        self.corpus = corpus
        self.chunksize = chunksize
        # Propagate transformation parameters to the wrapped model; note this
        # mutates `obj` itself, so the settings persist on the model object.
        for param_name, param_value in kwargs.items():
            setattr(self.obj, param_name, param_value)
        self.metadata = False

def __len__(self):
Expand Down Expand Up @@ -156,12 +158,12 @@ def __getitem__(self, vec):
raise NotImplementedError('cannot instantiate abstract base class')


def _apply(self, corpus, chunksize=None, **kwargs):
    """
    Apply the transformation to a whole corpus (as opposed to a single
    document) and return the result as another, lazily-evaluated corpus.

    Extra keyword arguments are handed through to `TransformedCorpus`, which
    sets them as attributes on this transformation object.
    """
    transformed = TransformedCorpus(self, corpus, chunksize, **kwargs)
    return transformed
#endclass TransformationABC


Expand Down
14 changes: 11 additions & 3 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
distributed=False, chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0,
eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01, random_state=None, ns_conf={}):
minimum_probability=0.01, random_state=None, ns_conf={},
minimum_phi_value=0.01, per_word_topics=False):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -297,6 +298,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.passes = passes
self.update_every = update_every
self.eval_every = eval_every
self.minimum_phi_value = minimum_phi_value
self.per_word_topics = per_word_topics

self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

Expand Down Expand Up @@ -916,7 +919,12 @@ def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=N
# if the input vector is a corpus, return a transformed corpus
is_corpus, corpus = utils.is_corpus(bow)
if is_corpus:
return self._apply(corpus)
kwargs = dict(
per_word_topics = per_word_topics,
minimum_probability = minimum_probability,
minimum_phi_value = minimum_phi_value
)
return self._apply(corpus, **kwargs)

gamma, phis = self.inference([bow], collect_sstats=True)
topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution
def __getitem__(self, bow, eps=None):
    """
    Return topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples.

    Ignore topics with very low probability (below `eps`).

    Forwards the model-level defaults `self.minimum_phi_value` and
    `self.per_word_topics` so that corpus-wide transformation honours the
    settings chosen at model construction time.
    """
    return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)

def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
"""
Expand Down
6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ class LdaMulticore(LdaModel):
def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001, random_state=None):
gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
minimum_phi_value=0.01, per_word_topics=False):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
Expand Down Expand Up @@ -144,7 +145,8 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
super(LdaMulticore, self).__init__(corpus=corpus, num_topics=num_topics,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state)
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability= minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)


def update(self, corpus, chunks_as_numpy=False):
Expand Down
20 changes: 20 additions & 0 deletions gensim/test/test_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,26 @@ def testGetDocumentTopics(self):
self.assertTrue(isinstance(k, int))
self.assertTrue(isinstance(v, float))

# Test case to use the get_document_topics function for the corpus
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a new test for minimum_prob and minimum_phi_value. Select values that change the number of topics and docs returned

all_topics = model.get_document_topics(self.corpus, minimum_probability=0.5, minimum_phi_value=0.3, per_word_topics=True)

self.assertEqual(model.state.numdocs, len(corpus))
Copy link
Contributor

@tmylk tmylk Nov 8, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so the filtering params have no effect? It is a better test if they change the outcome

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am seeing the effect in case I choose minimum_probability=0.8 and minimum_phi_value=1.0, then most of the doc topic list and word phi list will be blank. Should I use that as a test case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep


for topic in all_topics:
self.assertTrue(isinstance(topic, tuple))
for k, v in topic[0]: # list of doc_topics
self.assertTrue(isinstance(k, int))
self.assertTrue(isinstance(v, float))

for w, topic_list in topic[1]: # list of word_topics
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(topic_list, list))

for w, phi_values in topic[2]: # list of word_phis
self.assertTrue(isinstance(w, int))
self.assertTrue(isinstance(phi_values, list))


doc_topics, word_topics, word_phis = model.get_document_topics(self.corpus[1], per_word_topics=True)

for k, v in doc_topics:
Expand Down