diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 46e915af17..be735b865a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -250,7 +250,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): yield (out_delimiter.join((word_a, word_b)), score) last_bigram = True continue - last_bigram = False + last_bigram = False def __getitem__(self, sentence): """ diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index ce0b73b65a..ba2cfc7192 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -127,20 +127,31 @@ def testExportPhrases(self): """Test Phrases bigram export_phrases functionality.""" bigram = Phrases(sentences, min_count=1, threshold=1) - # with this setting we should get response_time and graph_minors - bigram1_seen = False - bigram2_seen = False + seen_bigrams = set() for phrase, score in bigram.export_phrases(sentences): - if not bigram1_seen and b'response time' == phrase: - bigram1_seen = True - elif not bigram2_seen and b'graph minors' == phrase: - bigram2_seen = True - if bigram1_seen and bigram2_seen: - break + seen_bigrams.add(phrase) + + assert seen_bigrams == set([ + b'response time', + b'graph minors', + b'human interface' + ]) + + def test_multiple_bigrams_single_entry(self): + """ a single entry should produce multiple bigrams. """ + bigram = Phrases(sentences, min_count=1, threshold=1) + + seen_bigrams = set() + + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_bigrams.add(phrase) - self.assertTrue(bigram1_seen) - self.assertTrue(bigram2_seen) + assert seen_bigrams == set([ + b'graph minors', + b'human interface' + ]) def testBadParameters(self): """Test the phrases module with bad parameters."""