Summarisation optimisations #441

Merged: 13 commits, Oct 7, 2015
32 changes: 23 additions & 9 deletions gensim/summarization/keywords.py
@@ -11,13 +11,16 @@
from itertools import combinations as _combinations
from six.moves.queue import Queue as _Queue
from six.moves import xrange
from six import iteritems


WINDOW_SIZE = 2

"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
"""
Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
Example: filter for nouns and adjectives:
INCLUDING_FILTER = ['NN', 'JJ']"""
INCLUDING_FILTER = ['NN', 'JJ']
"""
INCLUDING_FILTER = ['NN', 'JJ']
EXCLUDING_FILTER = []

@@ -26,13 +29,17 @@ def _get_pos_filters():
return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)


def _get_words_for_graph(tokens):
include_filters, exclude_filters = _get_pos_filters()
def _get_words_for_graph(tokens, pos_filter):
if pos_filter is None:
include_filters, exclude_filters = _get_pos_filters()
else:
include_filters = set(pos_filter)
exclude_filters = frozenset([])
if include_filters and exclude_filters:
raise ValueError("Can't use both include and exclude filters, should use only one")

result = []
for word, unit in tokens.iteritems():
for word, unit in iteritems(tokens):
if exclude_filters and unit.tag in exclude_filters:
continue
if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -111,7 +118,7 @@ def _extract_tokens(lemmas, scores, ratio, words):

def _lemmas_to_words(tokens):
lemma_to_word = {}
for word, unit in tokens.iteritems():
for word, unit in iteritems(tokens):
lemma = unit.token
if lemma in lemma_to_word:
lemma_to_word[lemma].append(word)
@@ -189,13 +196,13 @@ def _format_results(_keywords, combined_keywords, split, scores):
return "\n".join(combined_keywords)


def keywords(text, ratio=0.2, words=None, split=False, scores=False):
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False):
# Gets a dict of word -> lemma
tokens = _clean_text_by_word(text)
split_text = list(_tokenize_by_word(text))

# Creates the graph and adds the edges
graph = _build_graph(_get_words_for_graph(tokens))
graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
_set_graph_edges(graph, tokens, split_text)
del split_text # It's no longer used

@@ -206,7 +213,14 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False):

extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

lemmas_to_word = _lemmas_to_words(tokens)
# The results can be polluted by many variations of the same word
if lemmatize:
lemmas_to_word = {}
for word, unit in iteritems(tokens):
lemmas_to_word[unit.token] = [word]
else:
lemmas_to_word = _lemmas_to_words(tokens)

keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

# text.split() to keep numbers and punctuation marks, so separated concepts are not combined
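
As a quick illustration (not part of this PR's diff), here is a minimal sketch of how the extended keywords() signature above might be called; the sample text is taken from the Hurricane Gilbert reference article used in the tests, and the parameter choices are invented for the example:

from gensim.summarization import keywords

sample_text = (
    "Hurricane Gilbert swept toward the Dominican Republic Sunday, and the "
    "Civil Defense alerted its heavily populated south coast to prepare for "
    "high winds, heavy rains and high seas. The storm was approaching from "
    "the southeast with sustained winds of 75 mph gusting to 92 mph."
)

# Restrict the TextRank graph to nouns and verbs, and collapse inflected
# variants of the same lemma into a single keyword entry.
print(keywords(sample_text, pos_filter=['NN', 'VB'], lemmatize=True, split=True))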
15 changes: 8 additions & 7 deletions gensim/summarization/pagerank_weighted.py
@@ -2,10 +2,9 @@
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from numpy import empty as empty_matrix
from scipy.sparse import csr_matrix
from scipy.linalg import eig
from scipy.sparse.linalg import eigs
from six.moves import xrange

try:
@@ -21,8 +20,10 @@ def pagerank_weighted(graph, damping=0.85):
probability_matrix = build_probability_matrix(graph)

pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
vals, vecs = eig(pagerank_matrix, left=True, right=False) # TODO optimize this.
return process_results(graph, vecs)

vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors?

return process_results(graph, vecs.real)


def build_adjacency_matrix(graph):
@@ -37,7 +38,7 @@ def build_adjacency_matrix(graph):
neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
for j in xrange(length):
edge_weight = float(graph.edge_weight((current_node, nodes[j])))
if i != j and edge_weight != 0:
if i != j and edge_weight != 0.0:
row.append(i)
col.append(j)
data.append(edge_weight / neighbors_sum)
@@ -49,7 +50,7 @@ def build_probability_matrix(graph):
dimension = len(graph.nodes())
matrix = empty_matrix((dimension, dimension))

probability = 1 / float(dimension)
probability = 1.0 / float(dimension)
matrix.fill(probability)

return matrix
@@ -58,6 +59,6 @@ def build_probability_matrix(graph):
def process_results(graph, vecs):
scores = {}
for i, node in enumerate(graph.nodes()):
scores[node] = abs(vecs[i][0]) # TODO: this is wasteful (only compute the principal component).
scores[node] = abs(vecs[i, :])

return scores
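
For context, a self-contained sketch (not gensim code) of why eig(..., left=True) can be replaced by eigs(matrix.T, k=1) as above: both recover the principal left eigenvector of the PageRank matrix, but eigs computes only the single dominant eigenpair. The toy row-stochastic matrix below is invented for illustration:

import numpy as np
from scipy.linalg import eig
from scipy.sparse.linalg import eigs

# Small row-stochastic matrix standing in for the damped PageRank matrix.
M = np.array([[0.00, 0.50, 0.25, 0.25],
              [0.30, 0.00, 0.35, 0.35],
              [0.25, 0.25, 0.00, 0.50],
              [0.40, 0.30, 0.30, 0.00]])

# Old approach: compute every left eigenvector, then keep the principal one.
vals, left_vecs = eig(M, left=True, right=False)
principal_old = left_vecs[:, np.argmax(vals.real)].real

# New approach: ask ARPACK for only the dominant eigenpair of the transpose.
_, vecs = eigs(M.T, k=1)
principal_new = vecs[:, 0].real

# Up to sign and scale, both give the same stationary distribution.
print(np.abs(principal_old) / np.abs(principal_old).sum())
print(np.abs(principal_new) / np.abs(principal_new).sum())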
5 changes: 3 additions & 2 deletions gensim/summarization/summarizer.py
@@ -10,13 +10,14 @@
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
from gensim.corpora import Dictionary
from scipy.sparse import csr_matrix
from math import log10 as _log10
from six.moves import xrange


INPUT_MIN_LENGTH = 10

WEIGHT_THRESHOLD = 1.e-3

logger = logging.getLogger(__name__)


@@ -26,7 +27,7 @@ def _set_graph_edge_weights(graph):

for i in xrange(len(documents)):
for j in xrange(len(documents)):
if i == j:
if i == j or weights[i][j] < WEIGHT_THRESHOLD:
continue

sentence_1 = documents[i]
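
The WEIGHT_THRESHOLD change above skips sentence pairs whose BM25 similarity is negligible, keeping the sentence graph sparse. A minimal standalone sketch of that pruning idea (the weights matrix is made up; this is not the gensim implementation):

WEIGHT_THRESHOLD = 1.e-3

def pruned_edges(weights):
    """Yield (i, j, weight) triples for edges worth adding to the graph."""
    for i, row in enumerate(weights):
        for j, w in enumerate(row):
            if i == j or w < WEIGHT_THRESHOLD:
                continue  # skip self-loops and near-zero similarities
            yield i, j, w

# Only the 0<->1 edges survive; the 1e-5 similarities are dropped.
print(list(pruned_edges([[0.0, 0.8, 1e-5],
                         [0.8, 0.0, 0.0],
                         [1e-5, 0.0, 0.0]])))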
14 changes: 14 additions & 0 deletions gensim/test/test_data/mihalcea_tarau.kw.txt
@@ -0,0 +1,14 @@
hurricane
coast
saturday
storm
flood
flooding
gilbert
winds heavy
strong
defense
puerto
north
weather
southeast
17 changes: 17 additions & 0 deletions gensim/test/test_data/mihalcea_tarau.kwpos.txt
@@ -0,0 +1,17 @@
hurricane
gilbert
storm
coast
saturday
winds heavy
weather
flood
flooding
strong
defense
people
cabral said
associated
north
residents
southeast
93 changes: 93 additions & 0 deletions gensim/test/test_keywords.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated test to reproduce the results of Mihalcea and Tarau (2004).

Mihalcea and Tarau (2004) introduce the TextRank summarization algorithm.
As a validation of the gensim implementation, this test reproduces its
results.

"""

import os.path
import logging
import unittest

from gensim import utils
from gensim.corpora import Dictionary
from gensim.summarization import keywords


class TestKeywordsTest(unittest.TestCase):

def test_text_keywords(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate keywords
generated_keywords = keywords(text, split=True)

# To be compared to the reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
kw = f.read().strip().split("\n")

self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))

def test_text_keywords_words(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate exactly 15 keywords
generated_keywords = keywords(text, words=15, split=True)

self.assertEqual(len(generated_keywords), 15)

def test_text_keywords_pos(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# calculate keywords using only certain parts of speech
generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], split=True)

# To be compared to the reference.
with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
kw = f.read().strip().split("\n")

self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))

def test_text_summarization_raises_exception_on_short_input_text(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
text = f.read()

# Keeps the first 8 sentences to make the text shorter.
text = "\n".join(text.split('\n')[:8])

self.assertTrue(keywords(text) is not None)

def test_keywords_ratio(self):
pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
text = f.read()

# Check that the ratio parameter is well behaved. Because length is taken on the tokenized clean text,
# we just check that ratio=0.4 yields roughly twice as many keywords as ratio=0.2.
selected_docs_20 = keywords(text, ratio=0.2, split=True)
selected_docs_40 = keywords(text, ratio=0.4, split=True)

self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.9, places=1)

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()
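
A hypothetical way to run just this new test module from a Python shell, assuming a gensim checkout with this branch on the path:

import unittest

suite = unittest.defaultTestLoader.loadTestsFromName("gensim.test.test_keywords")
unittest.TextTestRunner(verbosity=2).run(suite)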
5 changes: 5 additions & 0 deletions gensim/test/test_summarization.py
@@ -13,6 +13,7 @@
"""

import os.path
import logging
import unittest

from gensim import utils
@@ -128,3 +129,7 @@ def test_corpus_summarization_ratio(self):
expected_summary_length = int(len(corpus) * ratio)

self.assertEqual(len(selected_docs), expected_summary_length)

if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()