Skip to content

Commit

Permalink
Fix Issue piskvorky#805: Added check in summarize_corpus for too few …
Browse files Browse the repository at this point in the history
…words (piskvorky#885)

* Added check in summarize_corpus to fix bug in summarizer

* Fix piskvorky#805: Added check in summarizing text

* Added test for checking low number of distinct words in text

* Text split method changed to allow running in Python 3.3 and above.

* Change to fix test in python versions 3.3 and higher

* Added blank line to test_wikicorpus.py file

Added blank line to fix issue with travis CI
  • Loading branch information
anmolgulati authored and harshuljain13 committed Sep 30, 2016
1 parent cc931b7 commit c0f5896
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 1 deletion.
5 changes: 5 additions & 0 deletions gensim/summarization/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2):
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)

# Cannot calculate eigenvectors if the graph has fewer than 3 reachable nodes. Warn the user to add more text and return early (the function returns None).
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return

pagerank_scores = _pagerank(graph)

hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_data/testlowdistinctwords.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
here here.
there there.
here here.
there there.
here here.
there there.
here here.
there there.
here here.
there there.
14 changes: 14 additions & 0 deletions gensim/test/test_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,20 @@ def test_keywords_runs(self):
kwds_lst = keywords(text, split=True)
self.assertTrue(len(kwds_lst))

def test_low_distinct_words_corpus_summarization_is_none(self):
    """summarize_corpus returns None for the low-distinct-words fixture text."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_data')
    fixture_path = os.path.join(data_dir, "testlowdistinctwords.txt")

    with utils.smart_open(fixture_path, mode="r") as fin:
        text = fin.read()

    # Build a bag-of-words corpus with one document per line of the fixture.
    tokens = [line.split() for line in text.split("\n")]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    self.assertTrue(summarize_corpus(corpus) is None)

# Script entry point: when this module is executed directly, enable
# DEBUG-level logging with timestamps and run the unittest test runner.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()
2 changes: 1 addition & 1 deletion gensim/test/test_wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ class TestWikiCorpus(unittest.TestCase):
def setUp(self):
    # Construct a WikiCorpus from the test data file identified by FILENAME.
    # NOTE(review): `wc` is a local name that is not stored on `self` and is
    # not used again in this method, so the object is discarded when setUp
    # returns -- confirm whether the construction is needed here at all.
    wc = WikiCorpus(datapath(FILENAME))


def test_get_texts_returns_generator_of_lists(self):

if sys.version_info < (2, 7, 0):
return

wc = WikiCorpus(datapath(FILENAME))
l = wc.get_texts()
self.assertEqual(type(l), types.GeneratorType)
Expand Down

0 comments on commit c0f5896

Please sign in to comment.