From 2f0446f4db744aff6926a37a8ffab2569e6bd2e0 Mon Sep 17 00:00:00 2001 From: Aman Date: Thu, 29 Sep 2016 13:59:08 +0530 Subject: [PATCH 1/3] Warn and return raw text if only one sentence given to summarizer. Fix for #851 (#887) --- CHANGELOG.md | 1 + gensim/summarization/summarizer.py | 5 +++++ gensim/test/test_summarization.py | 11 +++++++++++ gensim/test/test_wikicorpus.py | 3 +++ 4 files changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bc4acd608..6e61f4756c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Changes - bigram construction can now support multiple bigrams within one sentence * Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895)) * Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884)) +* Fixed issue #851, In summarizer.py, check for single sentence as an input added to avoid ZeroDivionError, added test cases in test/test_summarization.py(@metalaman, #887) 0.13.2, 2016-08-19 diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 234dcec377..71e6640790 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -193,6 +193,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return + # If only one sentence is present, the function return the input text (Avoids ZeroDivisionError). + if len(sentences) == 1: + logger.warning("Summarization not performed since the document has only one sentence.") + return text + # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: logger.warning("Input text is expected to have at least " + str(INPUT_MIN_LENGTH) + " sentences.") diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index fde845dc93..220224601f 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -88,6 +88,17 @@ def test_text_summarization_raises_exception_on_short_input_text(self): self.assertTrue(summarize(text) is not None) + def test_text_summarization_returns_input_on_single_input_sentence(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f: + text = f.read() + + # Keeps the first sentence only. + text = text.split('\n')[0] + + self.assertEqual(summarize(text),text) + def test_corpus_summarization_raises_exception_on_short_input_text(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 77c4212831..6ccdf6b0bf 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -12,11 +12,13 @@ import os import sys import types + import unittest from gensim.corpora.wikicorpus import WikiCorpus + module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' @@ -29,6 +31,7 @@ def setUp(self): def test_get_texts_returns_generator_of_lists(self): + if sys.version_info < (2, 7, 0): return wc = WikiCorpus(datapath(FILENAME)) From f7dd826005eb394b7c0c05b71441da194e3ee215 Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Thu, 29 Sep 2016 19:21:20 +0530 Subject: [PATCH 2/3] Fix Issue #805: Added check in summarize_corpus for too few words (#885) * Added check in summarize_corpus to fix bug in summarizer * Fix #805: Added check in summarizing text * Added test for checking low number of distinct words in text * Text split method changed to allow running in Python 3.3 and above. * Change to fix test in python versions 3.3 and higher * Added blank line test_wikicorpus.py file Added blank line to fix issue with travis CI --- gensim/summarization/summarizer.py | 5 +++++ gensim/test/test_data/testlowdistinctwords.txt | 10 ++++++++++ gensim/test/test_summarization.py | 14 ++++++++++++++ gensim/test/test_wikicorpus.py | 2 +- 4 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 gensim/test/test_data/testlowdistinctwords.txt diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 71e6640790..13e384ca09 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) + # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. + if len(graph.nodes()) < 3: + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + return + pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) diff --git a/gensim/test/test_data/testlowdistinctwords.txt b/gensim/test/test_data/testlowdistinctwords.txt new file mode 100644 index 0000000000..70e20fa3d3 --- /dev/null +++ b/gensim/test/test_data/testlowdistinctwords.txt @@ -0,0 +1,10 @@ +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 220224601f..97571de78d 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -171,6 +171,20 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) self.assertTrue(len(kwds_lst)) + def test_low_distinct_words_corpus_summarization_is_none(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f: + text = f.read() + + # Generate the corpus. + sentences = text.split("\n") + tokens = [sentence.split() for sentence in sentences] + dictionary = Dictionary(tokens) + corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] + + self.assertTrue(summarize_corpus(corpus) is None) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 6ccdf6b0bf..b12108073f 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -29,11 +29,11 @@ class TestWikiCorpus(unittest.TestCase): def setUp(self): wc = WikiCorpus(datapath(FILENAME)) - def test_get_texts_returns_generator_of_lists(self): if sys.version_info < (2, 7, 0): return + wc = WikiCorpus(datapath(FILENAME)) l = wc.get_texts() self.assertEqual(type(l), types.GeneratorType) From 1c34dfc20143eec7c397ff50247d6c7f9364bcbd Mon Sep 17 00:00:00 2001 From: Aman Date: Thu, 29 Sep 2016 19:22:30 +0530 Subject: [PATCH 3/3] Fix #851, Error is raised instead of returning text [WiP] (#902) * Update summarizer.py Return statement removed and error raised. * Update test_summarization.py Removed test for single sentence input. * Update CHANGELOG.md * Update summarizer.py * Update test_wikicorpus.py * Update test_summarization.py --- CHANGELOG.md | 2 +- gensim/summarization/summarizer.py | 5 ++--- gensim/test/test_summarization.py | 4 ++-- gensim/test/test_wikicorpus.py | 6 +++--- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e61f4756c..2f7ef32f59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Changes - bigram construction can now support multiple bigrams within one sentence * Fixed issue #838, RuntimeWarning: overflow encountered in exp (@markroxor, [#895](https://github.com/RaRe-Technologies/gensim/pull/895)) * Changed some log messages to warnings as suggested in issue #828. (@rhnvrm, [#884](https://github.com/RaRe-Technologies/gensim/pull/884)) -* Fixed issue #851, In summarizer.py, check for single sentence as an input added to avoid ZeroDivionError, added test cases in test/test_summarization.py(@metalaman, #887) +* Fixed issue #851, In summarizer.py, RunTimeError is raised if single sentence input is provided to avoid ZeroDivionError. (@metalaman, #887) 0.13.2, 2016-08-19 diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 13e384ca09..0779011999 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -198,10 +198,9 @@ def summarize(text, ratio=0.2, word_count=None, split=False): logger.warning("Input text is empty.") return - # If only one sentence is present, the function return the input text (Avoids ZeroDivisionError). + # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError). if len(sentences) == 1: - logger.warning("Summarization not performed since the document has only one sentence.") - return text + raise ValueError("input must have more than one sentence") # Warns if the text is too short. if len(sentences) < INPUT_MIN_LENGTH: diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 97571de78d..bd215efcab 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -87,7 +87,7 @@ def test_text_summarization_raises_exception_on_short_input_text(self): text = "\n".join(text.split('\n')[:8]) self.assertTrue(summarize(text) is not None) - + def test_text_summarization_returns_input_on_single_input_sentence(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -97,7 +97,7 @@ def test_text_summarization_returns_input_on_single_input_sentence(self): # Keeps the first sentence only. text = text.split('\n')[0] - self.assertEqual(summarize(text),text) + self.assertRaises(ValueError,summarize,text) def test_corpus_summarization_raises_exception_on_short_input_text(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index b12108073f..7ac953d847 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -12,17 +12,17 @@ import os import sys import types - +import logging import unittest from gensim.corpora.wikicorpus import WikiCorpus - module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' +logger = logging.getLogger(__name__) class TestWikiCorpus(unittest.TestCase): @@ -30,7 +30,7 @@ def setUp(self): wc = WikiCorpus(datapath(FILENAME)) def test_get_texts_returns_generator_of_lists(self): - + logger.debug("Current Python Version is "+str(sys.version_info)) if sys.version_info < (2, 7, 0): return