diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a1afb1ce1..5fbee1495e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,8 +2,10 @@ Changes ======= 0.13.2, 2016-08-19 - -* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) +* export_phrases in Phrases model changed. Fixed issue #794 and added test cases in test/test_phrases.py (@AadityaJ, +[#879](https://github.com/RaRe-Technologies/gensim/pull/879)) + - bigram construction can now support multiple bigrams within one sentence +* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, [#771](https://github.com/RaRe-Technologies/gensim/pull/771)) - assigning wordtopics value of word_topics to keep backward compatibility, for now * topics, topn parameters changed to num_topics and num_words in show_topics() and print_topics()(@droudy, [#755](https://github.com/RaRe-Technologies/gensim/pull/755)) - In hdpmodel and dtmmodel @@ -45,7 +47,7 @@ Changes * Control whether to use lowercase for computing word2vec accuracy. 
(@alantian, #607) * Easy import of GloVe vectors using Gensim (Manas Ranjan Kar, #625) - Allow easy port of GloVe vectors into Gensim - - Standalone script with command line arguments, compatible with Python>=2.6 + - Standalone script with command line arguments, compatible with Python>=2.6 - Usage: python -m gensim.scripts.glove2word2vec -i glove_vectors.txt -o output_word2vec_compatible.txt * Add `similar_by_word()` and `similar_by_vector()` to word2vec (@isohyt, #381) * Convenience method for similarity of two out of training sentences to doc2vec (@ellolo, #707) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index aa8c339aa5..3c0af1a4db 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -214,6 +214,8 @@ def export_phrases(self, sentences): if score > threshold: yield (b' '.join((word_a, word_b)), score) last_bigram = True + continue + last_bigram = False def __getitem__(self, sentence): """ diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 010d0a00de..52a6f203f2 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -32,7 +32,8 @@ ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'] + ['graph', 'minors', 'survey'], + ['graph', 'minors', 'survey','human','interface'] #test bigrams within same sentence ] @@ -58,23 +59,26 @@ def testBigramConstruction(self): bigram2_seen = True if bigram1_seen and bigram2_seen: break - + self.assertTrue(bigram1_seen and bigram2_seen) # check the same thing, this time using single doc transformation + # last sentence should contain both graph_minors and human_interface self.assertTrue(u'response_time' in bigram[sentences[1]]) self.assertTrue(u'response_time' in bigram[sentences[4]]) + self.assertTrue(u'graph_minors' in bigram[sentences[-3]]) self.assertTrue(u'graph_minors' in bigram[sentences[-2]]) self.assertTrue(u'graph_minors' in bigram[sentences[-1]]) + self.assertTrue(u'human_interface' in 
bigram[sentences[-1]]) def testExportPhrases(self): """Test Phrases bigram export_phrases functionality.""" bigram = Phrases(sentences, min_count=1, threshold=1) - + # with this setting we should get response_time and graph_minors bigram1_seen = False bigram2_seen = False - + for phrase, score in bigram.export_phrases(sentences): if not bigram1_seen and b'response time' == phrase: bigram1_seen = True @@ -82,10 +86,10 @@ def testExportPhrases(self): bigram2_seen = True if bigram1_seen and bigram2_seen: break - + self.assertTrue(bigram1_seen) self.assertTrue(bigram2_seen) - + def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0