Fix pagerank algorithm. Fix piskvorky#805 (piskvorky#1653)

* added a regression test for summarization.keywords()
* handled case with graph smaller than 3 nodes
* removed TODO about complex eigenvectors
* added more comments
horpto · Oct 28, 2017 · 860483a · 860483a
1 parent daad9f7
commit 860483a
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 4 deletions.
diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py
@@ -2,7 +2,9 @@
 # -*- coding: utf-8 -*-
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+import numpy
 from numpy import empty as empty_matrix
+from scipy.linalg import eig
 from scipy.sparse import csr_matrix
 from scipy.sparse.linalg import eigs
 from six.moves import xrange
@@ -21,9 +23,10 @@ def pagerank_weighted(graph, damping=0.85):
 
     pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
 
-    vals, vecs = eigs(pagerank_matrix.T, k=1)  # TODO raise an error if matrix has complex eigenvectors?
+    vec = principal_eigenvector(pagerank_matrix.T)
 
-    return process_results(graph, vecs.real)
+    # Because pagerank_matrix is positive, vec is always real (i.e. not complex)
+    return process_results(graph, vec.real)
 
 
 def build_adjacency_matrix(graph):
@@ -56,9 +59,23 @@ def build_probability_matrix(graph):
     return matrix
 
 
-def process_results(graph, vecs):
+def principal_eigenvector(a):
+    # Note that we prefer to use `eigs` even for dense matrix
+    # because we need only one eigenvector. See #441, #438 for discussion.
+
+    # But it doesn't work for dim A < 3, so we just handle this special case
+    if len(a) < 3:
+        vals, vecs = eig(a)
+        ind = numpy.abs(vals).argmax()
+        return vecs[:, ind]
+    else:
+        vals, vecs = eigs(a, k=1)
+        return vecs[:, 0]
+
+
+def process_results(graph, vec):
     scores = {}
     for i, node in enumerate(graph.nodes()):
-        scores[node] = abs(vecs[i, :])
+        scores[node] = abs(vec[i])
 
     return scores
diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
@@ -89,6 +89,12 @@ def test_keywords_ratio(self):
 
         self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
 
+    def test_text_keywords_with_small_graph(self):
+        # regression test, we get graph 2x2 on this text
+        text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
+        kwds = keywords(text, words=1, split=True)
+        self.assertTrue(len(kwds))
+
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)