From 860483ab75e8246b22a23762c11463282a285529 Mon Sep 17 00:00:00 2001 From: Alexander Ankudinov Date: Thu, 26 Oct 2017 16:08:40 +0500 Subject: [PATCH] Fix pagerank algorithm. Fix #805 (#1653) * added a regression test for summarization.keywords() * handled case with graph smaller than 3 nodes * removed TODO about complex eigenvectors * added more comments --- gensim/summarization/pagerank_weighted.py | 25 +++++++++++++++++++---- gensim/test/test_keywords.py | 6 ++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py index 1978c6e1c7..f5a24635a1 100644 --- a/gensim/summarization/pagerank_weighted.py +++ b/gensim/summarization/pagerank_weighted.py @@ -2,7 +2,9 @@ # -*- coding: utf-8 -*- # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +import numpy from numpy import empty as empty_matrix +from scipy.linalg import eig from scipy.sparse import csr_matrix from scipy.sparse.linalg import eigs from six.moves import xrange @@ -21,9 +23,10 @@ def pagerank_weighted(graph, damping=0.85): pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix - vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors? + vec = principal_eigenvector(pagerank_matrix.T) - return process_results(graph, vecs.real) + # Because pagerank_matrix is positive, vec is always real (i.e. not complex) + return process_results(graph, vec.real) def build_adjacency_matrix(graph): @@ -56,9 +59,23 @@ def build_probability_matrix(graph): return matrix -def process_results(graph, vecs): +def principal_eigenvector(a): + # Note that we prefer to use `eigs` even for dense matrix + # because we need only one eigenvector. See #441, #438 for discussion. + + # But it doesn't work for dim A < 3, so we just handle this special case + if len(a) < 3: + vals, vecs = eig(a) + ind = numpy.abs(vals).argmax() + return vecs[:, ind] + else: + vals, vecs = eigs(a, k=1) + return vecs[:, 0] + + +def process_results(graph, vec): scores = {} for i, node in enumerate(graph.nodes()): - scores[node] = abs(vecs[i, :]) + scores[node] = abs(vec[i]) return scores diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py index 76bd448d5c..c8fae400da 100644 --- a/gensim/test/test_keywords.py +++ b/gensim/test/test_keywords.py @@ -89,6 +89,12 @@ def test_keywords_ratio(self): self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1) + def test_text_keywords_with_small_graph(self): + # regression test, we get graph 2x2 on this text + text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious' + kwds = keywords(text, words=1, split=True) + self.assertTrue(len(kwds)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)