Skip to content

Commit

Permalink
Fix a single phrase not returning back multiple bigrams, #794 (#1362)
Browse files Browse the repository at this point in the history
* fixing a single phrase not returning back multiple bigrams, #794

* addressing feedback: removing print statement.
  • Loading branch information
toumorokoshi authored and tmylk committed May 23, 2017
1 parent ac980b4 commit 5242a32
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 12 deletions.
2 changes: 1 addition & 1 deletion gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
yield (out_delimiter.join((word_a, word_b)), score)
last_bigram = True
continue
last_bigram = False
last_bigram = False

def __getitem__(self, sentence):
"""
Expand Down
33 changes: 22 additions & 11 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
def testExportPhrases(self):
    """Test Phrases bigram export_phrases functionality.

    Trains a bigram model on the module-level ``sentences`` corpus and
    checks that exporting phrases from that same corpus yields exactly
    the expected set of bigrams.
    """
    bigram = Phrases(sentences, min_count=1, threshold=1)

    # Collect every exported phrase without stopping early, so that all
    # bigrams are observed; the scores are not checked by this test.
    seen_bigrams = set(
        phrase for phrase, _score in bigram.export_phrases(sentences)
    )

    # with this setting we should get response_time, graph_minors and
    # human_interface
    self.assertEqual(seen_bigrams, set([
        b'response time',
        b'graph minors',
        b'human interface',
    ]))

def test_multiple_bigrams_single_entry(self):
    """A single entry should produce multiple bigrams.

    Trains a bigram model on the module-level ``sentences`` corpus, then
    exports phrases from one sentence that contains two separate bigrams
    and checks that both are reported.
    """
    bigram = Phrases(sentences, min_count=1, threshold=1)

    # One sentence holding two bigrams separated by an unrelated token.
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]

    # Only the phrases matter here; the scores are ignored.
    seen_bigrams = set(
        phrase for phrase, _score in bigram.export_phrases(test_sentences)
    )

    self.assertEqual(seen_bigrams, set([
        b'graph minors',
        b'human interface',
    ]))

def testBadParameters(self):
"""Test the phrases module with bad parameters."""
Expand Down

0 comments on commit 5242a32

Please sign in to comment.