Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a single phrase not returning multiple bigrams (#794, #1362)

Merged
merged 2 commits into from
May 23, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
yield (out_delimiter.join((word_a, word_b)), score)
last_bigram = True
continue
last_bigram = False
last_bigram = False

def __getitem__(self, sentence):
"""
Expand Down
33 changes: 22 additions & 11 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,20 +127,31 @@ def testExportPhrases(self):
"""Test Phrases bigram export_phrases functionality."""
bigram = Phrases(sentences, min_count=1, threshold=1)

# with this setting we should get response_time and graph_minors
bigram1_seen = False
bigram2_seen = False
seen_bigrams = set()

for phrase, score in bigram.export_phrases(sentences):
if not bigram1_seen and b'response time' == phrase:
bigram1_seen = True
elif not bigram2_seen and b'graph minors' == phrase:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
seen_bigrams.add(phrase)

assert seen_bigrams == set([
b'response time',
b'graph minors',
b'human interface'
])

def test_multiple_bigrams_single_entry(self):
""" a single entry should produce multiple bigrams. """
bigram = Phrases(sentences, min_count=1, threshold=1)

seen_bigrams = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_bigrams.add(phrase)

self.assertTrue(bigram1_seen)
self.assertTrue(bigram2_seen)
assert seen_bigrams == set([
b'graph minors',
b'human interface'
])

def testBadParameters(self):
"""Test the phrases module with bad parameters."""
Expand Down