
Commit ebe6c9e: ported to Python3

tsptoni committed Nov 23, 2017 (1 parent: 00d483b)
Showing 8 changed files with 49 additions and 46 deletions.

summa/commons.py: 2 changes (1 addition & 1 deletion)
@@ -1,5 +1,5 @@
 
-from graph import Graph
+from .graph import Graph
 
 
 def build_graph(sequence):
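
Aside (not part of the diff): the only change in commons.py is the switch to an explicit relative import. Python 3 removed implicit relative imports (PEP 328), so "from graph import Graph" is resolved as a top-level module rather than the sibling file and raises ImportError inside a package. A minimal sketch of the idea, using a hypothetical package layout rather than summa itself:

# Hypothetical layout, for illustration only:
#   mypkg/
#       __init__.py
#       graph.py        # defines class Graph
#       commons.py      # the module below

# mypkg/commons.py
# Python 2 style, fails under Python 3 with ImportError:
#   from graph import Graph
# Explicit relative import, works on Python 3 (and 2.6+):
from .graph import Graph

def build_graph(sequence):
    # Body sketched for illustration; only the def line appears in the diff.
    graph = Graph()
    for item in sequence:
        if not graph.has_node(item):
            graph.add_node(item)
    return graph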

summa/export.py: 12 changes (6 additions & 6 deletions)
@@ -2,11 +2,11 @@
 import networkx as _nx
 from networkx.drawing.nx_agraph import graphviz_layout
 from os import system as _shell
-from summarizer import get_graph as _get_sentence_graph
-from keywords import get_graph as _get_word_graph
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank_weighted_scipy
-from preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
-from preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
+from .summarizer import get_graph as _get_sentence_graph
+from .keywords import get_graph as _get_word_graph
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank_weighted_scipy
+from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
+from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
 
 NODE_COLOR = {'r': 239, 'g': 10, 'b': 10}
 
@@ -55,7 +55,7 @@ def _center_positions(positions):
     delta_y = (min_y + max_y) / 2
 
     centered_positions = {}
-    for key, position in positions.iteritems():
+    for key, position in positions.items():
         new_position = (round(position[0] - delta_x, 2), round(position[1] - delta_y, 2))
         centered_positions[key] = new_position
     return centered_positions
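
Aside (not part of the diff): besides the relative imports, export.py replaces dict.iteritems(), which no longer exists in Python 3; dict.items() now returns a lazy view, so memory behaviour matches the old iterator. A self-contained sketch of the loop pattern used in _center_positions, with made-up coordinates:

# Illustrative only; these coordinates are not from summa.
positions = {"node_a": (10.0, 4.0), "node_b": (-2.0, 6.0)}
delta_x, delta_y = 4.0, 5.0

centered_positions = {}
# Python 2: positions.iteritems()  ->  Python 3: positions.items()
for key, position in positions.items():
    new_position = (round(position[0] - delta_x, 2), round(position[1] - delta_y, 2))
    centered_positions[key] = new_position

print(centered_positions)   # {'node_a': (6.0, -1.0), 'node_b': (-6.0, 1.0)}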

summa/keywords.py: 28 changes (15 additions & 13 deletions)
@@ -1,11 +1,13 @@
 from itertools import combinations as _combinations
-from Queue import Queue as _Queue
+# from Queue import Queue as _Queue
+from queue import Queue as _Queue
+# from multiprocessing import Queue as _Queue
 
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank
-from preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
-from preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
-from commons import build_graph as _build_graph
-from commons import remove_unreachable_nodes as _remove_unreachable_nodes
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
+from .preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
+from .preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
+from .commons import build_graph as _build_graph
+from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 
 WINDOW_SIZE = 2
 
@@ -26,7 +28,7 @@ def _get_words_for_graph(tokens):
         raise ValueError("Can't use both include and exclude filters, should use only one")
 
     result = []
-    for word, unit in tokens.iteritems():
+    for word, unit in tokens.items():
         if exclude_filters and unit.tag in exclude_filters:
             continue
         if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -75,15 +77,15 @@ def _update_queue(queue, word):
 
 def _process_text(graph, tokens, split_text):
     queue = _init_queue(split_text)
-    for i in xrange(WINDOW_SIZE, len(split_text)):
+    for i in range(WINDOW_SIZE, len(split_text)):
         word = split_text[i]
         _process_word(graph, tokens, queue, word)
         _update_queue(queue, word)
 
 
 def _queue_iterator(queue):
     iterations = queue.qsize()
-    for i in xrange(iterations):
+    for i in range(iterations):
         var = queue.get()
         yield var
         queue.put(var)
@@ -105,7 +107,7 @@ def _extract_tokens(lemmas, scores, ratio, words):
 
 def _lemmas_to_words(tokens):
     lemma_to_word = {}
-    for word, unit in tokens.iteritems():
+    for word, unit in tokens.items():
         lemma = unit.token
         if lemma in lemma_to_word:
             lemma_to_word[lemma].append(word)
@@ -144,14 +146,14 @@ def _get_combined_keywords(_keywords, split_text):
     result = []
     _keywords = _keywords.copy()
     len_text = len(split_text)
-    for i in xrange(len_text):
+    for i in range(len_text):
         word = _strip_word(split_text[i])
         if word in _keywords:
             combined_word = [word]
             if i + 1 == len_text: result.append(word)  # appends last word if keyword and doesn't iterate
-            for j in xrange(i + 1, len_text):
+            for j in range(i + 1, len_text):
                 other_word = _strip_word(split_text[j])
-                if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
+                if other_word in _keywords and other_word == split_text[j]:
                     combined_word.append(other_word)
                 else:
                     for keyword in combined_word: _keywords.pop(keyword)
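
Aside (not part of the diff): keywords.py collects the mechanical renames in one place: the Queue module is now queue, xrange() is gone (range() is lazy in Python 3), and text read as str is already unicode, so the split_text[j].decode("utf-8") round-trip is dropped. A standalone sketch of the three idioms with invented sample data:

from queue import Queue          # Python 2 spelling: from Queue import Queue

WINDOW_SIZE = 2
split_text = ["résumé", "keyword", "extraction"]   # invented sample, already str in Py3

# range() replaces xrange(); both are lazy, so nothing changes semantically here.
for i in range(WINDOW_SIZE, len(split_text)):
    word = split_text[i]
    # No .decode("utf-8") needed: str objects are unicode in Python 3.
    assert word == split_text[i]

q = Queue()
q.put(split_text[-1])
print(q.get())                   # -> extraction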

summa/pagerank_weighted.py: 6 changes (3 additions & 3 deletions)
@@ -20,7 +20,7 @@ def pagerank_weighted(graph, initial_value=None, damping=0.85):
     scores = dict.fromkeys(graph.nodes(), initial_value)
 
     iteration_quantity = 0
-    for iteration_number in xrange(100):
+    for iteration_number in range(100):
         iteration_quantity += 1
         convergence_achieved = 0
         for i in graph.nodes():
@@ -56,10 +56,10 @@ def build_adjacency_matrix(graph):
     nodes = graph.nodes()
     length = len(nodes)
 
-    for i in xrange(length):
+    for i in range(length):
         current_node = nodes[i]
         neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
-        for j in xrange(length):
+        for j in range(length):
             edge_weight = float(graph.edge_weight((current_node, nodes[j])))
             if i != j and edge_weight != 0:
                 row.append(i)

summa/preprocessing/snowball.py: 2 changes (1 addition & 1 deletion)
@@ -20,7 +20,7 @@
 """
 from __future__ import unicode_literals, print_function
 
-from porter import PorterStemmer
+from .porter import PorterStemmer
 
 
 class SnowballStemmer():

summa/preprocessing/textcleaner.py: 23 changes (12 additions & 11 deletions)
@@ -13,8 +13,8 @@
     logger.info("'pattern' package not found; tag filters are not available for English")
     HAS_PATTERN = False
 
-from snowball import SnowballStemmer
-from stopwords import get_stopwords_by_language
+from .snowball import SnowballStemmer
+from .stopwords import get_stopwords_by_language
 import re  # http://regex101.com/#python to test regex
 from summa.syntactic_unit import SyntacticUnit
 
@@ -91,14 +91,14 @@ def to_unicode(text, encoding='utf8', errors='strict'):
 # Taken from gensim
 RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
 def strip_punctuation(s):
-    s = to_unicode(s)
+    #s = to_unicode(s)
     return RE_PUNCT.sub(" ", s)
 
 
 # Taken from gensim
 RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
 def strip_numeric(s):
-    s = to_unicode(s)
+    #s = to_unicode(s)
     return RE_NUMERIC.sub("", s)
 
 
@@ -123,12 +123,13 @@ def filter_words(sentences):
     # filters = []
 
     apply_filters_to_token = lambda token: apply_filters(token, filters)
-    return map(apply_filters_to_token, sentences)
+    return list(map(apply_filters_to_token, sentences))
 
 
 # Taken from six
 def u(s):
-    return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    # return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    return s.replace(r'\\', r'\\\\')
 
 
 # Taken from gensim
@@ -137,9 +138,9 @@ def deaccent(text):
     Remove accentuation from the given string. Input text is either a unicode string or utf8
     encoded bytestring.
     """
-    if not isinstance(text, unicode):
-        # assume utf8 for byte strings, use default (strict) error handling
-        text = text.decode('utf8')
+    # if not isinstance(text, unicode):
+    #     # assume utf8 for byte strings, use default (strict) error handling
+    #     text = text.decode('utf8')
     norm = unicodedata.normalize("NFD", text)
     result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
     return unicodedata.normalize("NFC", result)
@@ -153,7 +154,7 @@ def tokenize(text, lowercase=False, deacc=False, errors="strict", to_lower=False, lower=False):
     and removing accent marks.
     """
     lowercase = lowercase or to_lower or lower
-    text = to_unicode(text, errors=errors)
+    #text = to_unicode(text, errors=errors)
     if lowercase:
         text = text.lower()
     if deacc:
@@ -164,7 +165,7 @@
 
 def merge_syntactic_units(original_units, filtered_units, tags=None):
     units = []
-    for i in xrange(len(original_units)):
+    for i in range(len(original_units)):
         if filtered_units[i] == '':
             continue
 
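
Aside (not part of the diff): the textcleaner.py changes revolve around the str/bytes split: to_unicode()/decode() calls are commented out because Python 3 str is already unicode, u() becomes a pass-through, and map() now returns an iterator, hence list(map(...)). One consequence is that these helpers now assume str input. A hedged sketch of a deaccent-style helper that also tolerates bytes; the name and behaviour are illustrative, not summa's API:

import unicodedata

def deaccent_sketch(text):
    # Illustrative variant: accept bytes defensively, since the ported code
    # no longer decodes on behalf of the caller.
    if isinstance(text, bytes):
        text = text.decode("utf8")
    norm = unicodedata.normalize("NFD", text)
    stripped = "".join(ch for ch in norm if unicodedata.category(ch) != "Mn")
    return unicodedata.normalize("NFC", stripped)

print(deaccent_sketch("Šéf chätte"))                   # -> Sef chatte
print(deaccent_sketch("Šéf chätte".encode("utf8")))    # same result from bytes input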

summa/summarizer.py: 12 changes (6 additions & 6 deletions)
@@ -1,9 +1,9 @@
 
 from math import log10 as _log10
-from pagerank_weighted import pagerank_weighted_scipy as _pagerank
-from preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
-from commons import build_graph as _build_graph
-from commons import remove_unreachable_nodes as _remove_unreachable_nodes
+from .pagerank_weighted import pagerank_weighted_scipy as _pagerank
+from .preprocessing.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
+from .commons import build_graph as _build_graph
+from .commons import remove_unreachable_nodes as _remove_unreachable_nodes
 
 
 def _set_graph_edge_weights(graph):
@@ -25,8 +25,8 @@ def _set_graph_edge_weights(graph):
 def _create_valid_graph(graph):
     nodes = graph.nodes()
 
-    for i in xrange(len(nodes)):
-        for j in xrange(len(nodes)):
+    for i in range(len(nodes)):
+        for j in range(len(nodes)):
             if i == j:
                 continue
 

summa/textrank.py: 10 changes (5 additions & 5 deletions)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python -W ignore::DeprecationWarning
 
 import sys, getopt
-from summarizer import summarize
-from keywords import keywords
+from .summarizer import summarize
+from .keywords import keywords
 
 # Types of summarization
 SENTENCE = 0
@@ -13,7 +13,7 @@ def get_arguments():
     try:
         opts, args = getopt.getopt(sys.argv[1:], "t:s:r:w:h", ["text=", "summary=", "ratio=", "words=", "help"])
     except getopt.GetoptError as err:
-        print str(err)
+        print(str(err))
         usage()
         sys.exit(2)
     path = None
@@ -52,7 +52,7 @@ def get_arguments():
 \tprints this help
 """
 def usage():
-    print help_text
+    print(help_text)
 
 
 def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
@@ -68,7 +68,7 @@ def main():
     with open(path) as file:
         text = file.read()
 
-    print textrank(text, summarize_by, ratio, words)
+    print(textrank(text, summarize_by, ratio, words))
 
 
 if __name__ == "__main__":
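
Aside (not part of the diff): in textrank.py the print statements become print() calls. A side effect of the new relative imports is that the file can no longer be executed as a plain script (python summa/textrank.py fails with an import error because the relative imports have no parent package); it has to be run as a module or through the package API. A sketch of both, assuming the summa package is importable; the input file name is illustrative:

# As a module, from the project root (shell invocation shown as a comment):
#   python -m summa.textrank -t sample.txt -s 0 -r 0.2

# Or through the package API; print is a function in Python 3:
from summa.summarizer import summarize

with open("sample.txt") as fh:    # illustrative input file
    text = fh.read()
print(summarize(text, ratio=0.2))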
