
Commit ebe6c9e: ported to Python3

tsptoni committed Nov 23, 2017 (1 parent: 00d483b)
Showing 8 changed files with 49 additions and 46 deletions.

summa/commons.py: 2 changes (1 addition & 1 deletion)
@@ -1,5 +1,5 @@
 
-from graph import Graph
+from .graph import Graph
 
 
 def build_graph(sequence):
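
Aside (not part of the diff): the only change in commons.py is the switch to an explicit relative import. Python 3 removed implicit relative imports (PEP 328), so "from graph import Graph" is resolved as a top-level module rather than the sibling file and raises ImportError inside a package. A minimal sketch of the idea, using a hypothetical package layout rather than summa itself:

# Hypothetical layout, for illustration only:
#   mypkg/
#       __init__.py
#       graph.py        # defines class Graph
#       commons.py      # the module below

# mypkg/commons.py
# Python 2 style, fails under Python 3 with ImportError:
#   from graph import Graph
# Explicit relative import, works on Python 3 (and 2.6+):
from .graph import Graph

def build_graph(sequence):
    # Body sketched for illustration; only the def line appears in the diff.
    graph = Graph()
    for item in sequence:
        if not graph.has_node(item):
            graph.add_node(item)
    return graph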

summa/export.py: 12 changes (6 additions & 6 deletions)
@@ -2,11 +2,11 @@
 import networkx as _nx
 from networkx.drawing.nx_agraph import graphviz_layout
 from os import system as _shell
-from summarizer import get_graph as _get_sentence_graph
-from keywords import get_graph as _get_word_graph
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank_weighted_scipy
-from preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
-from preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
+from .summarizer import get_graph as _get_sentence_graph
+from .keywords import get_graph as _get_word_graph
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank_weighted_scipy
+from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
+from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
 
 NODE_COLOR = {'r': 239, 'g': 10, 'b': 10}
 
@@ -55,7 +55,7 @@ def _center_positions(positions):
     delta_y = (min_y + max_y) / 2
 
     centered_positions = {}
-    for key, position in positions.iteritems():
+    for key, position in positions.items():
         new_position = (round(position[0] - delta_x, 2), round(position[1] - delta_y, 2))
         centered_positions[key] = new_position
     return centered_positions
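
Aside (not part of the diff): besides the relative imports, export.py replaces dict.iteritems(), which no longer exists in Python 3; dict.items() now returns a lazy view, so memory behaviour matches the old iterator. A self-contained sketch of the loop pattern used in _center_positions, with made-up coordinates:

# Illustrative only; these coordinates are not from summa.
positions = {"node_a": (10.0, 4.0), "node_b": (-2.0, 6.0)}
delta_x, delta_y = 4.0, 5.0

centered_positions = {}
# Python 2: positions.iteritems()  ->  Python 3: positions.items()
for key, position in positions.items():
    new_position = (round(position[0] - delta_x, 2), round(position[1] - delta_y, 2))
    centered_positions[key] = new_position

print(centered_positions)   # {'node_a': (6.0, -1.0), 'node_b': (-6.0, 1.0)}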

summa/keywords.py: 28 changes (15 additions & 13 deletions)
@@ -1,11 +1,13 @@
 from itertools import combinations as _combinations
-from Queue import Queue as _Queue
+# from Queue import Queue as _Queue
+from queue import Queue as _Queue
+# from multiprocessing import Queue as _Queue
 
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank
-from preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
-from preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
-from commons import build_graph as _build_graph
-from commons import remove_unreachable_nodes as _remove_unreachable_nodes
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
+from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
+from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
+from .commons import build_graph as _build_graph
+from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 
 WINDOW_SIZE = 2
 
@@ -26,7 +28,7 @@ def _get_words_for_graph(tokens):
         raise ValueError("Can't use both include and exclude filters, should use only one")
 
     result = []
-    for word, unit in tokens.iteritems():
+    for word, unit in tokens.items():
         if exclude_filters and unit.tag in exclude_filters:
             continue
         if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -75,15 +77,15 @@ def _update_queue(queue, word):
 
 def _process_text(graph, tokens, split_text):
     queue = _init_queue(split_text)
-    for i in xrange(WINDOW_SIZE, len(split_text)):
+    for i in range(WINDOW_SIZE, len(split_text)):
         word = split_text[i]
         _process_word(graph, tokens, queue, word)
         _update_queue(queue, word)
 
 
 def _queue_iterator(queue):
     iterations = queue.qsize()
-    for i in xrange(iterations):
+    for i in range(iterations):
         var = queue.get()
         yield var
         queue.put(var)
@@ -105,7 +107,7 @@ def _extract_tokens(lemmas, scores, ratio, words):
 
 def _lemmas_to_words(tokens):
     lemma_to_word = {}
-    for word, unit in tokens.iteritems():
+    for word, unit in tokens.items():
         lemma = unit.token
         if lemma in lemma_to_word:
             lemma_to_word[lemma].append(word)
@@ -144,14 +146,14 @@ def _get_combined_keywords(_keywords, split_text):
     result = []
     _keywords = _keywords.copy()
     len_text = len(split_text)
-    for i in xrange(len_text):
+    for i in range(len_text):
         word = _strip_word(split_text[i])
         if word in _keywords:
             combined_word = [word]
             if i + 1 == len_text: result.append(word)  # appends last word if keyword and doesn't iterate
-            for j in xrange(i + 1, len_text):
+            for j in range(i + 1, len_text):
                 other_word = _strip_word(split_text[j])
-                if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
+                if other_word in _keywords and other_word == split_text[j]:
                     combined_word.append(other_word)
                 else:
                     for keyword in combined_word: _keywords.pop(keyword)
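
Aside (not part of the diff): keywords.py collects the mechanical renames in one place: the Queue module is now queue, xrange() is gone (range() is lazy in Python 3), and text read as str is already unicode, so the split_text[j].decode("utf-8") round-trip is dropped. A standalone sketch of the three idioms with invented sample data:

from queue import Queue          # Python 2 spelling: from Queue import Queue

WINDOW_SIZE = 2
split_text = ["résumé", "keyword", "extraction"]   # invented sample, already str in Py3

# range() replaces xrange(); both are lazy, so nothing changes semantically here.
for i in range(WINDOW_SIZE, len(split_text)):
    word = split_text[i]
    # No .decode("utf-8") needed: str objects are unicode in Python 3.
    assert word == split_text[i]

q = Queue()
q.put(split_text[-1])
print(q.get())                   # -> extraction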

summa/pagerank_weighted.py: 6 changes (3 additions & 3 deletions)
@@ -20,7 +20,7 @@ def pagerank_weighted(graph, initial_value=None, damping=0.85):
     scores = dict.fromkeys(graph.nodes(), initial_value)
 
     iteration_quantity = 0
-    for iteration_number in xrange(100):
+    for iteration_number in range(100):
         iteration_quantity += 1
         convergence_achieved = 0
         for i in graph.nodes():
@@ -56,10 +56,10 @@ def build_adjacency_matrix(graph):
     nodes = graph.nodes()
     length = len(nodes)
 
-    for i in xrange(length):
+    for i in range(length):
         current_node = nodes[i]
         neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
-        for j in xrange(length):
+        for j in range(length):
             edge_weight = float(graph.edge_weight((current_node, nodes[j])))
             if i != j and edge_weight != 0:
                 row.append(i)

summa/preprocessing/snowball.py: 2 changes (1 addition & 1 deletion)
@@ -20,7 +20,7 @@
 """
 from __future__ import unicode_literals, print_function
 
-from porter import PorterStemmer
+from .porter import PorterStemmer
 
 
 class SnowballStemmer():

summa/preprocessing/textcleaner.py: 23 changes (12 additions & 11 deletions)
@@ -13,8 +13,8 @@
     logger.info("'pattern' package not found; tag filters are not available for English")
     HAS_PATTERN = False
 
-from snowball import SnowballStemmer
-from stopwords import get_stopwords_by_language
+from .snowball import SnowballStemmer
+from .stopwords import get_stopwords_by_language
 import re  # http://regex101.com/#python to test regex
 from summa.syntactic_unit import SyntacticUnit
 
@@ -91,14 +91,14 @@ def to_unicode(text, encoding='utf8', errors='strict'):
 # Taken from gensim
 RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
 def strip_punctuation(s):
-    s = to_unicode(s)
+    #s = to_unicode(s)
     return RE_PUNCT.sub(" ", s)
 
 
 # Taken from gensim
 RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
 def strip_numeric(s):
-    s = to_unicode(s)
+    #s = to_unicode(s)
     return RE_NUMERIC.sub("", s)
 
 
@@ -123,12 +123,13 @@ def filter_words(sentences):
     # filters = []
 
     apply_filters_to_token = lambda token: apply_filters(token, filters)
-    return map(apply_filters_to_token, sentences)
+    return list(map(apply_filters_to_token, sentences))
 
 
 # Taken from six
 def u(s):
-    return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    # return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    return s.replace(r'\\', r'\\\\')
 
 
 # Taken from gensim
@@ -137,9 +138,9 @@ def deaccent(text):
     Remove accentuation from the given string. Input text is either a unicode string or utf8
     encoded bytestring.
     """
-    if not isinstance(text, unicode):
-        # assume utf8 for byte strings, use default (strict) error handling
-        text = text.decode('utf8')
+    # if not isinstance(text, unicode):
+    #     # assume utf8 for byte strings, use default (strict) error handling
+    #     text = text.decode('utf8')
     norm = unicodedata.normalize("NFD", text)
     result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
     return unicodedata.normalize("NFC", result)
@@ -153,7 +154,7 @@ def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
     and removing accent marks.
     """
     lowercase = lowercase or to_lower or lower
-    text = to_unicode(text, errors=errors)
+    #text = to_unicode(text, errors=errors)
     if lowercase:
         text = text.lower()
     if deacc:
@@ -164,7 +165,7 @@
 
 def merge_syntactic_units(original_units, filtered_units, tags=None):
     units = []
-    for i in xrange(len(original_units)):
+    for i in range(len(original_units)):
         if filtered_units[i] == '':
             continue
 
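
Aside (not part of the diff): the textcleaner.py changes revolve around the str/bytes split: to_unicode()/decode() calls are commented out because Python 3 str is already unicode, u() becomes a pass-through, and map() now returns an iterator, hence list(map(...)). One consequence is that these helpers now assume str input. A hedged sketch of a deaccent-style helper that also tolerates bytes; the name and behaviour are illustrative, not summa's API:

import unicodedata

def deaccent_sketch(text):
    # Illustrative variant: accept bytes defensively, since the ported code
    # no longer decodes on behalf of the caller.
    if isinstance(text, bytes):
        text = text.decode("utf8")
    norm = unicodedata.normalize("NFD", text)
    stripped = "".join(ch for ch in norm if unicodedata.category(ch) != "Mn")
    return unicodedata.normalize("NFC", stripped)

print(deaccent_sketch("Šéf chätte"))                   # -> Sef chatte
print(deaccent_sketch("Šéf chätte".encode("utf8")))    # same result from bytes input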

summa/summarizer.py: 12 changes (6 additions & 6 deletions)
@@ -1,9 +1,9 @@
 
 from math import log10 as _log10
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank
-from preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
-from commons import build_graph as _build_graph
-from commons import remove_unreachable_nodes as _remove_unreachable_nodes
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
+from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
+from .commons import build_graph as _build_graph
+from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 
 
 def _set_graph_edge_weights(graph):
@@ -25,8 +25,8 @@ def _set_graph_edge_weights(graph):
 def _create_valid_graph(graph):
     nodes = graph.nodes()
 
-    for i in xrange(len(nodes)):
-        for j in xrange(len(nodes)):
+    for i in range(len(nodes)):
+        for j in range(len(nodes)):
             if i == j:
                 continue
 

summa/textrank.py: 10 changes (5 additions & 5 deletions)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python -W ignore::DeprecationWarning
 
 import sys, getopt
-from summarizer import summarize
-from keywords import keywords
+from .summarizer import summarize
+from .keywords import keywords
 
 # Types of summarization
 SENTENCE = 0
@@ -13,7 +13,7 @@ def get_arguments():
     try:
         opts, args = getopt.getopt(sys.argv[1:], "t:s:r:w:h", ["text=", "summary=", "ratio=", "words=", "help"])
     except getopt.GetoptError as err:
-        print str(err)
+        print(str(err))
         usage()
         sys.exit(2)
     path = None
@@ -52,7 +52,7 @@ def get_arguments():
 \tprints this help
 """
 def usage():
-    print help_text
+    print(help_text)
 
 
 def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
@@ -68,7 +68,7 @@ def main():
     with open(path) as file:
         text = file.read()
 
-    print textrank(text, summarize_by, ratio, words)
+    print(textrank(text, summarize_by, ratio, words))
 
 
 if __name__ == "__main__":
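
Aside (not part of the diff): in textrank.py the print statements become print() calls. A side effect of the new relative imports is that the file can no longer be executed as a plain script (python summa/textrank.py fails with an import error because the relative imports have no parent package); it has to be run as a module or through the package API. A sketch of both, assuming the summa package is importable; the input file name is illustrative:

# As a module, from the project root (shell invocation shown as a comment):
#   python -m summa.textrank -t sample.txt -s 0 -r 0.2

# Or through the package API; print is a function in Python 3:
from summa.summarizer import summarize

with open("sample.txt") as fh:    # illustrative input file
    text = fh.read()
print(summarize(text, ratio=0.2))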
