Fix Mallet wrapper & test for HDPTransform #1555

Merged: 8 commits, Sep 1, 2017
2 changes: 1 addition & 1 deletion gensim/models/wrappers/ldamallet.py
@@ -183,7 +183,7 @@ def __getitem__(self, bow, iterations=100):

     def load_word_topics(self):
         logger.info("loading assigned topics from %s", self.fstate())
-        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
+        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)

@piskvorky (Owner) commented on Aug 31, 2017:
Is this really needed? Double precision is rarely necessary.

Contributor Author commented:
Double precision used for all TMs now.

@piskvorky (Owner) commented on Aug 31, 2017:
Question is "why"? Looks wasteful.

Contributor Author commented:
I don't know, I don't implement TMs in gensim.

@piskvorky (Owner) commented on Sep 1, 2017:
That doesn't sound right. I'm pretty sure many models used to use single precision (RP and LSI, for sure).
Can you check when the change to double precision happened, and why?

         if hasattr(self.id2word, 'token2id'):
             word2id = self.id2word.token2id
         else:
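
For context on the precision question raised in the thread above, here is a minimal standalone sketch (not part of this PR; the topic and vocabulary counts are made-up) estimating the memory footprint of the word_topics matrix under each dtype:

import numpy

num_topics, num_terms = 100, 50000  # hypothetical sizes, not taken from this PR
for dtype in (numpy.float32, numpy.float64):
    word_topics = numpy.zeros((num_topics, num_terms), dtype=dtype)
    print(dtype.__name__, round(word_topics.nbytes / 1024 ** 2, 1), "MiB")
# prints roughly: float32 19.1 MiB, float64 38.1 MiB -- switching to double
# precision doubles the memory used by this matrix, which is the "wasteful" concern.
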
20 changes: 10 additions & 10 deletions gensim/test/test_sklearn_api.py
@@ -846,7 +846,7 @@ def testModelNotFitted(self):
 class TestHdpTransformer(unittest.TestCase):
     def setUp(self):
         numpy.random.seed(0)
-        self.model = HdpTransformer(id2word=dictionary)
+        self.model = HdpTransformer(id2word=dictionary, random_state=42)
         self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
         self.model.fit(self.corpus)

@@ -855,23 +855,23 @@ def testTransform(self):
         doc = self.corpus[0]
         transformed_doc = self.model.transform(doc)
         expected_doc = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148]]
-        self.assertTrue(numpy.allclose(transformed_doc, expected_doc))
+        self.assertTrue(numpy.allclose(transformed_doc, expected_doc, atol=1e-2))

         # tranform multiple documents
         docs = [self.corpus[0], self.corpus[1]]
         transformed_docs = self.model.transform(docs)
         expected_docs = [[0.81043386270128193, 0.049357139518070477, 0.035840906753517532, 0.026542006926698079, 0.019925705902962578, 0.014776690981729117, 0.011068909979528148],
-                         [0.0368655605, 0.709055041, 0.194436428, 0.0151706795, 0.0113863652, 1.00000000e-12, 1.00000000e-12]]
-        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0]))
-        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1]))
+                         [0.03795908, 0.39542609, 0.50650585, 0.0151082, 0.01132749, 0., 0.]]
+        self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0], atol=1e-2))
+        self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1], atol=1e-2))

     def testPartialFit(self):
-        for i in range(10):
+        for i in range(5):
             self.model.partial_fit(X=self.corpus)  # fit against the model again
-            doc = list(self.corpus)[0]  # transform only the first document
-            transformed = self.model.transform(doc)
-            expected = numpy.array([0.76777752, 0.01757334, 0.01600339, 0.01374061, 0.01275931, 0.01126313, 0.01058131, 0.01167185])
-            passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1e-1)
+
+        transformed = self.model.transform(list(self.corpus)[0])
+        expected = numpy.array([0.77901173, 0.0232508, 0.02054655, 0.01769651, 0.01600487, 0.01478038, 0.01237056, 0.01194372, 0.01070444])
+        passed = numpy.allclose(transformed[0], expected, atol=1e-2)
         self.assertTrue(passed)

     def testSetGetParams(self):
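
On the test side, the changes above replace exact floating-point comparisons with numpy.allclose at atol=1e-2 and pin random_state=42, since HDP training is randomized and the exact topic weights can vary between runs and platforms. A minimal standalone sketch (illustrative numbers only, not the values from this test) of how the absolute tolerance changes the comparison:

import numpy

expected = numpy.array([0.810, 0.049, 0.036, 0.027])  # illustrative reference topic weights
observed = numpy.array([0.807, 0.051, 0.037, 0.026])  # what another run might produce

# with numpy's default tolerances (rtol=1e-05, atol=1e-08) the arrays compare unequal
print(numpy.allclose(observed, expected))              # False
# atol=1e-2 accepts per-entry differences up to about 0.01, so the comparison passes
print(numpy.allclose(observed, expected, atol=1e-2))   # True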