From 272274405ea3a614a84f15c10e71e3b7ddfd5cf3 Mon Sep 17 00:00:00 2001 From: Samyak Jain Date: Thu, 15 Feb 2018 17:08:03 +0530 Subject: [PATCH] Fix empty output bug in `Phrases`. Fix #1401 (#1853) * bm25 scoring function updated * Fixes #1401 , Phrases behaviour now consistent on different versions , test added for empty iterator * Fixes #1401 , IS_SINGLE Function updated * Fixes #1401 , IS_SINGLE function updated * Fixes #1401 , IS_SINGLE function updated * Fixes #1401 , tests for phrasified sentences added --- gensim/models/phrases.py | 6 +++++- gensim/test/test_phrases.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 8d2e41b045..30a8913745 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -108,6 +108,7 @@ def _is_single(obj): is a corpus if it is an iterable of documents. """ obj_iter = iter(obj) + temp_iter = obj_iter try: peek = next(obj_iter) obj_iter = it.chain([peek], obj_iter) @@ -117,9 +118,12 @@ def _is_single(obj): if isinstance(peek, string_types): # It's a document, return the iterator return True, obj_iter + if temp_iter == obj: + # Checking for iterator to the object + return False, obj_iter else: # If the first item isn't a string, assume obj is a corpus - return False, obj_iter + return False, obj class SentenceAnalyzer(object): diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index f0e9cea864..58d0cfff93 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -160,6 +160,16 @@ def setUp(self): self.bigram_unicode = Phrases( self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms) + def testEmptyPhrasifiedSentencesIterator(self): + bigram_phrases = Phrases(self.sentences) + bigram_phraser = Phraser(bigram_phrases) + trigram_phrases = Phrases(bigram_phraser[self.sentences]) + trigram_phraser = Phraser(trigram_phrases) + trigrams = 
trigram_phraser[bigram_phraser[self.sentences]] + fst, snd = list(trigrams), list(trigrams) + self.assertEqual(fst, snd) + self.assertNotEqual(snd, []) + def testEmptyInputsOnBigramConstruction(self): """Test that empty inputs don't throw errors and return the expected result.""" # Empty list -> empty list