Skip to content

Commit

Permalink
Fix empty output bug in Phrases. Fix #1401 (#1853)
Browse files Browse the repository at this point in the history
* bm25 scoring function updated

* Fixes #1401: Phrases behaviour is now consistent across different versions; test added for empty iterator

* Fixes #1401 , IS_SINGLE Function updated

* Fixes #1401 , IS_SINGLE function updated

* Fixes #1401 , IS_SINGLE function updated

* Fixes #1401 , tests for phrasified sentences added
  • Loading branch information
sj29-innovate authored and menshikh-iv committed Feb 15, 2018
1 parent 06e126a commit 2722744
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
6 changes: 5 additions & 1 deletion gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def _is_single(obj):
is a corpus if it is an iterable of documents.
"""
obj_iter = iter(obj)
temp_iter = obj_iter
try:
peek = next(obj_iter)
obj_iter = it.chain([peek], obj_iter)
Expand All @@ -117,9 +118,12 @@ def _is_single(obj):
if isinstance(peek, string_types):
# It's a document, return the iterator
return True, obj_iter
if temp_iter == obj:
# Checking for iterator to the object
return False, obj_iter
else:
# If the first item isn't a string, assume obj is a corpus
return False, obj_iter
return False, obj


class SentenceAnalyzer(object):
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,16 @@ def setUp(self):
self.bigram_unicode = Phrases(
self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)

def testEmptyPhrasifiedSentencesIterator(self):
    """Check that a phrasified corpus yields the same, non-empty output on every pass.

    Builds a bigram phraser from ``self.sentences``, trains a trigram phraser on
    the bigram-transformed sentences, and then materializes the trigram-transformed
    corpus twice. Both passes must be equal and the second must not be empty
    (the accompanying commit message ties this test to issue #1401).
    """
    bigram_model = Phrases(self.sentences)
    bigram_transformer = Phraser(bigram_model)

    # The trigram model is trained on the output of the bigram transformer.
    trigram_model = Phrases(bigram_transformer[self.sentences])
    trigram_transformer = Phraser(trigram_model)

    phrasified = trigram_transformer[bigram_transformer[self.sentences]]

    # Consume the same transformed corpus twice; a second pass must not come back empty.
    first_pass = list(phrasified)
    second_pass = list(phrasified)

    self.assertEqual(first_pass, second_pass)
    self.assertNotEqual(second_pass, [])

def testEmptyInputsOnBigramConstruction(self):
"""Test that empty inputs don't throw errors and return the expected result."""
# Empty list -> empty list
Expand Down

0 comments on commit 2722744

Please sign in to comment.