Summarisation optimisations #441

Merged: 13 commits, Oct 7, 2015
32 changes: 23 additions & 9 deletions gensim/summarization/keywords.py
@@ -11,13 +11,16 @@
from itertools import combinations as _combinations
from six.moves.queue import Queue as _Queue
from six.moves import xrange
from six import iteritems


WINDOW_SIZE = 2

"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
"""
Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
Example: filter for nouns and adjectives:
INCLUDING_FILTER = ['NN', 'JJ']"""
INCLUDING_FILTER = ['NN', 'JJ']
"""
INCLUDING_FILTER = ['NN', 'JJ']
EXCLUDING_FILTER = []

@@ -26,13 +29,17 @@ def _get_pos_filters():
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)


def _get_words_for_graph(tokens):
include_filters, exclude_filters = _get_pos_filters()
def _get_words_for_graph(tokens, pos_filter):
if pos_filter is None:
include_filters, exclude_filters = _get_pos_filters()
else:
include_filters = set(pos_filter)
exclude_filters = frozenset([])
if include_filters and exclude_filters:
raise ValueError("Can't use both include and exclude filters, should use only one")

result = []
for word, unit in tokens.iteritems():
for word, unit in iteritems(tokens):
if exclude_filters and unit.tag in exclude_filters:
continue
if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -111,7 +118,7 @@ def _extract_tokens(lemmas, scores, ratio, words):

def _lemmas_to_words(tokens):
lemma_to_word = {}
for word, unit in tokens.iteritems():
for word, unit in iteritems(tokens):
lemma = unit.token
if lemma in lemma_to_word:
lemma_to_word[lemma].append(word)
@@ -189,13 +196,13 @@ def _format_results(_keywords, combined_keywords, split, scores):
return "\n".join(combined_keywords)


def keywords(text, ratio=0.2, words=None, split=False, scores=False):
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False):
# Gets a dict of word -> lemma
tokens = _clean_text_by_word(text)
split_text = list(_tokenize_by_word(text))

# Creates the graph and adds the edges
graph = _build_graph(_get_words_for_graph(tokens))
graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
_set_graph_edges(graph, tokens, split_text)
del split_text # It's no longer used

@@ -206,7 +213,14 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False):

extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

lemmas_to_word = _lemmas_to_words(tokens)
# The results can be polluted by many variations of the same word
if lemmatize:
lemmas_to_word = {}
for word, unit in iteritems(tokens):
lemmas_to_word[unit.token] = [word]
else:
lemmas_to_word = _lemmas_to_words(tokens)

keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

# text.split() to keep numbers and punctuation marks, so separated concepts are not combined
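
As a quick illustration (not part of this PR's diff), here is a minimal sketch of how the extended keywords() signature above might be called; the sample text is taken from the Hurricane Gilbert reference article used in the tests, and the parameter choices are invented for the example:

from gensim.summarization import keywords

sample_text = (
    "Hurricane Gilbert swept toward the Dominican Republic Sunday, and the "
    "Civil Defense alerted its heavily populated south coast to prepare for "
    "high winds, heavy rains and high seas. The storm was approaching from "
    "the southeast with sustained winds of 75 mph gusting to 92 mph."
)

# Restrict the TextRank graph to nouns and verbs, and collapse inflected
# variants of the same lemma into a single keyword entry.
print(keywords(sample_text, pos_filter=['NN', 'VB'], lemmatize=True, split=True))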
15 changes: 8 additions & 7 deletions gensim/summarization/pagerank_weighted.py
@@ -2,10 +2,9 @@
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from numpy import empty as empty_matrix
from scipy.sparse import csr_matrix
from scipy.linalg import eig
from scipy.sparse.linalg import eigs
from six.moves import xrange

try:
@@ -21,8 +20,10 @@ def pagerank_weighted(graph, damping=0.85):
probability_matrix = build_probability_matrix(graph)

pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
vals, vecs = eig(pagerank_matrix, left=True, right=False) # TODO optimize this.
return process_results(graph, vecs)

vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors?

return process_results(graph, vecs.real)


def build_adjacency_matrix(graph):
@@ -37,7 +38,7 @@ def build_adjacency_matrix(graph):
neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
for j in xrange(length):
edge_weight = float(graph.edge_weight((current_node, nodes[j])))
if i != j and edge_weight != 0:
if i != j and edge_weight != 0.0:
row.append(i)
col.append(j)
data.append(edge_weight / neighbors_sum)
@@ -49,7 +50,7 @@ def build_probability_matrix(graph):
dimension = len(graph.nodes())
matrix = empty_matrix((dimension, dimension))

probability = 1 / float(dimension)
probability = 1.0 / float(dimension)
matrix.fill(probability)

return matrix
@@ -58,6 +59,6 @@ def build_probability_matrix(graph):
def process_results(graph, vecs):
scores = {}
for i, node in enumerate(graph.nodes()):
scores[node] = abs(vecs[i][0]) # TODO: this is wasteful (only compute the principal component).
scores[node] = abs(vecs[i, :])

return scores
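
For context, a self-contained sketch (not gensim code) of why eig(..., left=True) can be replaced by eigs(matrix.T, k=1) as above: both recover the principal left eigenvector of the PageRank matrix, but eigs computes only the single dominant eigenpair. The toy row-stochastic matrix below is invented for illustration:

import numpy as np
from scipy.linalg import eig
from scipy.sparse.linalg import eigs

# Small row-stochastic matrix standing in for the damped PageRank matrix.
M = np.array([[0.00, 0.50, 0.25, 0.25],
              [0.30, 0.00, 0.35, 0.35],
              [0.25, 0.25, 0.00, 0.50],
              [0.40, 0.30, 0.30, 0.00]])

# Old approach: compute every left eigenvector, then keep the principal one.
vals, left_vecs = eig(M, left=True, right=False)
principal_old = left_vecs[:, np.argmax(vals.real)].real

# New approach: ask ARPACK for only the dominant eigenpair of the transpose.
_, vecs = eigs(M.T, k=1)
principal_new = vecs[:, 0].real

# Up to sign and scale, both give the same stationary distribution.
print(np.abs(principal_old) / np.abs(principal_old).sum())
print(np.abs(principal_new) / np.abs(principal_new).sum())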
5 changes: 3 additions & 2 deletions gensim/summarization/summarizer.py
@@ -10,13 +10,14 @@
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
from gensim.corpora import Dictionary
from scipy.sparse import csr_matrix
from math import log10 as _log10
from six.moves import xrange


INPUT_MIN_LENGTH = 10

WEIGHT_THRESHOLD = 1.e-3

logger = logging.getLogger(__name__)


@@ -26,7 +27,7 @@ def _set_graph_edge_weights(graph):

for i in xrange(len(documents)):
for j in xrange(len(documents)):
if i == j:
if i == j or weights[i][j] < WEIGHT_THRESHOLD:
continue

sentence_1 = documents[i]
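
The WEIGHT_THRESHOLD change above skips sentence pairs whose BM25 similarity is negligible, keeping the sentence graph sparse. A minimal standalone sketch of that pruning idea (the weights matrix is made up; this is not the gensim implementation):

WEIGHT_THRESHOLD = 1.e-3

def pruned_edges(weights):
    """Yield (i, j, weight) triples for edges worth adding to the graph."""
    for i, row in enumerate(weights):
        for j, w in enumerate(row):
            if i == j or w < WEIGHT_THRESHOLD:
                continue  # skip self-loops and near-zero similarities
            yield i, j, w

# Only the 0<->1 edges survive; the 1e-5 similarities are dropped.
print(list(pruned_edges([[0.0, 0.8, 1e-5],
                         [0.8, 0.0, 0.0],
                         [1e-5, 0.0, 0.0]])))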
14 changes: 14 additions & 0 deletions gensim/test/test_data/mihalcea_tarau.kw.txt
@@ -0,0 +1,14 @@
hurricane
coast
saturday
storm
flood
flooding
gilbert
winds heavy
strong
defense
puerto
north
weather
southeast
17 changes: 17 additions & 0 deletions gensim/test/test_data/mihalcea_tarau.kwpos.txt
@@ -0,0 +1,17 @@
hurricane
gilbert
storm
coast
saturday
winds heavy
weather
flood
flooding
strong
defense
people
cabral said
associated
north
residents
southeast
93 changes: 93 additions & 0 deletions gensim/test/test_keywords.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated test to reproduce the results of Mihalcea and Tarau (2004).

Mihalcea and Tarau (2004) introduce the TextRank summarization algorithm.
As a validation of the gensim implementation, this test reproduces its
results.

"""

import os.path
import logging
import unittest

from gensim import utils
from gensim.corpora import Dictionary
from gensim.summarization import keywords


class TestKeywordsTest(unittest.TestCase):

def test_text_keywords(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate keywords
generated_keywords = keywords(text, split=True)

# To be compared to the reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
kw = f.read().strip().split("\n")

self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))

def test_text_keywords_words(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate exactly 15 keywords
generated_keywords = keywords(text, words=15, split=True)

self.assertEqual(len(generated_keywords), 15)

def test_text_keywords_pos(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate keywords using only certain parts of speech
generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], split=True)

# To be compared to the reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
kw = f.read().strip().split("\n")

self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))

def test_text_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()

# Keeps the first 8 sentences to make the text shorter.
text = "\n".join(text.split('\n')[:8])

self.assertTrue(keywords(text) is not None)

def test_keywords_ratio(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# Check that the ratio parameter is well behaved. Because length is taken on the tokenized clean text,
# we just check that ratio=0.4 yields roughly twice as many keywords as ratio=0.2.
selected_docs_20 = keywords(text, ratio=0.2, split=True)
selected_docs_40 = keywords(text, ratio=0.4, split=True)

self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.9, places=1)

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()
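
A hypothetical way to run just this new test module from a Python shell, assuming a gensim checkout with this branch on the path:

import unittest

suite = unittest.defaultTestLoader.loadTestsFromName("gensim.test.test_keywords")
unittest.TextTestRunner(verbosity=2).run(suite)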
5 changes: 5 additions & 0 deletions gensim/test/test_summarization.py
@@ -13,6 +13,7 @@
"""

import os.path
import logging
import unittest

from gensim import utils
@@ -128,3 +129,7 @@ def test_corpus_summarization_ratio(self):
expected_summary_length = int(len(corpus) * ratio)

self.assertEqual(len(selected_docs), expected_summary_length)

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()