diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 9a44367..04f96f4 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -26,7 +26,7 @@
Beautiful Soup defines classes for two main parsing strategies:
- * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+ * BeautifulStoneSoup, for parsing XML, HTML, or your domain-specific
language that kind of looks like XML.
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
@@ -77,21 +77,20 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
"""
-from __future__ import generators
+
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.2.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "New-style BSD"
-from sgmllib import SGMLParser, SGMLParseError
+from html.parser import HTMLParser
import codecs
-import markupbase
+import _markupbase
import types
import re
-import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from html.entities import name2codepoint
except ImportError:
name2codepoint = {}
try:
@@ -100,8 +99,7 @@
from sets import Set as set
#These hacks make Beautiful Soup able to parse XML with namespaces
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
-markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
@@ -118,7 +116,7 @@ class PageElement(object):
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k,v in list(h.items()):
i[v] = k
return i
@@ -178,7 +176,7 @@ def extract(self):
#this element (and any children) hadn't been parsed. Connect
#the two.
lastChild = self._lastRecursiveChild()
- nextElement = lastChild.next
+ nextElement = lastChild.next
if self.previous:
self.previous.next = nextElement
@@ -203,7 +201,7 @@ def _lastRecursiveChild(self):
return lastChild
def insert(self, position, newChild):
- if isinstance(newChild, basestring) \
+ if isinstance(newChild, str) \
and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
@@ -257,7 +255,7 @@ def insert(self, position, newChild):
newChild.nextSibling.previousSibling = newChild
newChildsLastElement.next = nextChild
- if newChildsLastElement.next:
+ if newChildsLastElement.next:
newChildsLastElement.next.previous = newChildsLastElement
self.contents.insert(position, newChild)
@@ -358,7 +356,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
return [element for element in generator()
if isinstance(element, Tag)]
# findAll*('tag-name')
- elif isinstance(name, basestring):
+ elif isinstance(name, str):
return [element for element in generator()
if isinstance(element, Tag) and
element.name == name]
@@ -371,7 +369,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
g = generator()
while True:
try:
- i = g.next()
+ i = next(g)
except StopIteration:
break
if i:
@@ -387,7 +385,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
def nextGenerator(self):
i = self
while i is not None:
- i = i.next
+ i = i.next
yield i
def nextSiblingGenerator(self):
@@ -422,19 +420,19 @@ def substituteEncoding(self, str, encoding=None):
def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode.
."""
- if isinstance(s, unicode):
+ if isinstance(s, str):
if encoding:
s = s.encode(encoding)
elif isinstance(s, str):
if encoding:
s = s.encode(encoding)
else:
- s = unicode(s)
+ s = str(s)
else:
if encoding:
s = self.toEncoding(str(s), encoding)
else:
- s = unicode(s)
+ s = str(s)
return s
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
@@ -447,7 +445,7 @@ def _sub_entity(self, x):
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
-class NavigableString(unicode, PageElement):
+class NavigableString(str, PageElement):
def __new__(cls, value):
"""Create a new NavigableString.
@@ -457,9 +455,9 @@ def __new__(cls, value):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
- if isinstance(value, unicode):
- return unicode.__new__(cls, value)
- return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+ if isinstance(value, str):
+ return str.__new__(cls, value)
+ return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
return (NavigableString.__str__(self),)
@@ -471,7 +469,7 @@ def __getattr__(self, attr):
if attr == 'string':
return self
else:
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
def __unicode__(self):
return str(self).decode(DEFAULT_OUTPUT_ENCODING)
@@ -515,23 +513,23 @@ def _convertEntities(self, match):
escaped."""
x = match.group(1)
if self.convertHTMLEntities and x in name2codepoint:
- return unichr(name2codepoint[x])
+ return chr(name2codepoint[x])
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
if self.convertXMLEntities:
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
else:
- return u'&%s;' % x
+ return '&%s;' % x
elif len(x) > 0 and x[0] == '#':
# Handle numeric entities
if len(x) > 1 and x[1] == 'x':
- return unichr(int(x[2:], 16))
+ return chr(int(x[2:], 16))
else:
- return unichr(int(x[1:]))
+ return chr(int(x[1:]))
elif self.escapeUnrecognizedEntities:
- return u'&amp;%s;' % x
+ return '&amp;%s;' % x
else:
- return u'&%s;' % x
+ return '&%s;' % x
def __init__(self, parser, name, attrs=None, parent=None,
previous=None):
@@ -545,7 +543,7 @@ def __init__(self, parser, name, attrs=None, parent=None,
if attrs is None:
attrs = []
elif isinstance(attrs, dict):
- attrs = attrs.items()
+ attrs = list(attrs.items())
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
@@ -556,11 +554,11 @@ def __init__(self, parser, name, attrs=None, parent=None,
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
# Convert any HTML, XML, or numeric entities in the attribute values.
- convert = lambda(k, val): (k,
+ convert = lambda k_val: (k_val[0],
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
self._convertEntities,
- val))
- self.attrs = map(convert, self.attrs)
+ k_val[1]))
+ self.attrs = list(map(convert, self.attrs))
def getString(self):
if (len(self.contents) == 1
@@ -574,16 +572,16 @@ def setString(self, string):
string = property(getString, setString)
- def getText(self, separator=u""):
+ def getText(self, separator=""):
if not len(self.contents):
- return u""
- stopNode = self._lastRecursiveChild().next
+ return ""
+ stopNode = self._lastRecursiveChild().next
strings = []
current = self.contents[0]
while current is not stopNode:
if isinstance(current, NavigableString):
strings.append(current.strip())
- current = current.next
+ current = current.next
return separator.join(strings)
text = property(getText)
@@ -606,7 +604,7 @@ def index(self, element):
raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
- return self._getAttrMap().has_key(key)
+ return key in self._getAttrMap()
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -624,7 +622,7 @@ def __len__(self):
def __contains__(self, x):
return x in self.contents
- def __nonzero__(self):
+ def __bool__(self):
"A tag is non-None even if it has no contents."
return True
@@ -650,14 +648,14 @@ def __delitem__(self, key):
#We don't break because bad HTML can define the same
#attribute multiple times.
self._getAttrMap()
- if self.attrMap.has_key(key):
+ if key in self.attrMap:
del self.attrMap[key]
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
findAll() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
- return apply(self.findAll, args, kwargs)
+ return self.findAll(*args, **kwargs)
def __getattr__(self, tag):
#print "Getattr %s.%s" % (self.__class__, tag)
@@ -665,7 +663,7 @@ def __getattr__(self, tag):
return self.find(tag[:-3])
elif tag.find('__') != 0:
return self.find(tag)
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -709,7 +707,7 @@ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
if self.attrs:
for key, val in self.attrs:
fmt = '%s="%s"'
- if isinstance(val, basestring):
+ if isinstance(val, str):
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
val = self.substituteEncoding(val, encoding)
@@ -786,7 +784,7 @@ def decompose(self):
return
current = self.contents[0]
while current is not None:
- next = current.next
+ next = current.next
if isinstance(current, Tag):
del current.contents[:]
current.parent = None
@@ -879,11 +877,11 @@ def childGenerator(self):
def recursiveChildGenerator(self):
if not len(self.contents):
raise StopIteration
- stopNode = self._lastRecursiveChild().next
+ stopNode = self._lastRecursiveChild().next
current = self.contents[0]
while current is not stopNode:
yield current
- current = current.next
+ current = current.next
# Next, a couple classes to represent queries and their results.
@@ -893,7 +891,7 @@ class SoupStrainer:
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
- if isinstance(attrs, basestring):
+ if isinstance(attrs, str):
kwargs['class'] = _match_css_class(attrs)
attrs = None
if kwargs:
@@ -929,7 +927,7 @@ def searchTag(self, markupName=None, markupAttrs={}):
else:
match = True
markupAttrMap = None
- for attr, matchAgainst in self.attrs.items():
+ for attr, matchAgainst in list(self.attrs.items()):
if not markupAttrMap:
if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
@@ -967,12 +965,12 @@ def search(self, markup):
found = self.searchTag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ isinstance(markup, str):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s" \
+ % markup.__class__)
return found
def _matches(self, markup, matchAgainst):
@@ -987,8 +985,8 @@ def _matches(self, markup, matchAgainst):
#other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
- if markup and not isinstance(markup, basestring):
- markup = unicode(markup)
+ if markup and not isinstance(markup, str):
+ markup = str(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
@@ -996,10 +994,10 @@ def _matches(self, markup, matchAgainst):
elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
- elif matchAgainst and isinstance(markup, basestring):
- if isinstance(markup, unicode):
- matchAgainst = unicode(matchAgainst)
+ result = matchAgainst in markup
+ elif matchAgainst and isinstance(markup, str):
+ if isinstance(markup, str):
+ matchAgainst = str(matchAgainst)
else:
matchAgainst = str(matchAgainst)
@@ -1024,7 +1022,7 @@ def buildTagMap(default, *args):
for portion in args:
if hasattr(portion, 'items'):
#It's a map. Merge it.
- for k,v in portion.items():
+ for k,v in list(portion.items()):
built[k] = v
elif hasattr(portion, '__iter__'): # is a list
#It's a list. Map each item to the default.
@@ -1037,7 +1035,7 @@ def buildTagMap(default, *args):
# Now, the parser classes.
-class BeautifulStoneSoup(Tag, SGMLParser):
+class BeautifulStoneSoup(Tag, HTMLParser):
"""This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
@@ -1067,7 +1065,7 @@ class BeautifulStoneSoup(Tag, SGMLParser):
lambda x: '')
]
- ROOT_TAG_NAME = u'[document]'
+ ROOT_TAG_NAME = '[document]'
HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
@@ -1088,9 +1086,9 @@ def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.
- sgmllib will process most bad HTML, and the BeautifulSoup
+ html.parser will process most bad HTML, and the BeautifulSoup
class has some tricks for dealing with some HTML that kills
- sgmllib, but Beautiful Soup can nonetheless choke or lose data
+ HTMLParser, but Beautiful Soup can nonetheless choke or lose data
if your data uses self-closing tags or declarations
incorrectly.
@@ -1100,7 +1098,7 @@ class has some tricks for dealing with some HTML that kills
you'll get better performance.
The default parser massage techniques fix the two most common
- instances of invalid HTML that choke sgmllib:
+ instances of invalid HTML that choke HTMLParser:
(No space between name of closing tag and tag close)
(Extraneous whitespace in declaration)
@@ -1138,7 +1136,7 @@ class has some tricks for dealing with some HTML that kills
self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
- SGMLParser.__init__(self)
+ HTMLParser.__init__(self)
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
@@ -1151,7 +1149,7 @@ class has some tricks for dealing with some HTML that kills
self.markup = None # The markup can now be GCed
def convert_charref(self, name):
- """This method fixes a bug in Python's SGMLParser."""
+ """This method fixes a bug in Python's HTMLParser."""
try:
n = int(name)
except ValueError:
@@ -1163,14 +1161,14 @@ def convert_charref(self, name):
def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
- markup = dammit.unicode
+ markup = dammit.str
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
@@ -1187,20 +1185,20 @@ def _feed(self, inDocumentEncoding=None, isHTML=False):
del(self.markupMassage)
self.reset()
- SGMLParser.feed(self, markup)
+ HTMLParser.feed(self, markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def __getattr__(self, methodName):
- """This method routes method call requests to either the SGMLParser
+ """This method routes method call requests to either the HTMLParser
superclass or the Tag superclass, depending on the method name."""
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
or methodName.startswith('do_'):
- return SGMLParser.__getattr__(self, methodName)
+ return HTMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
return Tag.__getattr__(self, methodName)
else:
@@ -1209,13 +1207,13 @@ def __getattr__(self, methodName):
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
- return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ return name in self.SELF_CLOSING_TAGS \
+ or name in self.instanceSelfClosingTags
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
- SGMLParser.reset(self)
+ HTMLParser.reset(self)
self.currentData = []
self.currentTag = None
self.tagStack = []
@@ -1239,7 +1237,7 @@ def pushTag(self, tag):
def endData(self, containerClass=NavigableString):
if self.currentData:
- currentData = u''.join(self.currentData)
+ currentData = ''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.PRESERVE_WHITESPACE_TAGS)):
@@ -1302,7 +1300,7 @@ def _smartPop(self, name):
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
- isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ isResetNesting = name in self.RESET_NESTING_TAGS
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
@@ -1315,7 +1313,7 @@ def _smartPop(self, name):
if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers is None and isResetNesting
- and self.RESET_NESTING_TAGS.has_key(p.name)):
+ and p.name in self.RESET_NESTING_TAGS):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
@@ -1386,7 +1384,7 @@ def handle_pi(self, text):
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later."""
if text[:3] == "xml":
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
@@ -1396,7 +1394,7 @@ def handle_comment(self, text):
def handle_charref(self, ref):
"Handle character references as data."
if self.convertEntities:
- data = unichr(int(ref))
+ data = chr(int(ref))
else:
data = '%s;' % ref
self.handle_data(data)
@@ -1408,7 +1406,7 @@ def handle_entityref(self, ref):
data = None
if self.convertHTMLEntities:
try:
- data = unichr(name2codepoint[ref])
+ data = chr(name2codepoint[ref])
except KeyError:
pass
@@ -1449,7 +1447,7 @@ def handle_decl(self, data):
self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ """Treat a bogus HTML declaration as raw data. Treat a CDATA
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '= 4) and (xml_data[:2] == '\xfe\xff') \
and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ xml_data = str(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
(xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ xml_data = str(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ xml_data = str(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == '\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ xml_data = str(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
@@ -1972,7 +1970,7 @@ def _ebcdic_to_ascii(self, s):
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
MS_CHARS = { '\x80' : ('euro', '20AC'),
@@ -2015,4 +2013,4 @@ def _ebcdic_to_ascii(self, s):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
+ print(soup.prettify())
diff --git a/activity/activity.info b/activity/activity.info
index 7c37656..3b81ca8 100644
--- a/activity/activity.info
+++ b/activity/activity.info
@@ -4,7 +4,7 @@ activity_version = 25
license = GPLv2+;GPLv3+;BSD
icon = slicelogo
bundle_id = org.sugarlabs.InfoSlicer
-exec = sugar-activity activity.InfoslicerActivity
+exec = sugar-activity3 activity.InfoslicerActivity
show_launcher = yes
summary = Is it possible to have my own encyclopedia? Yes! Find your favorite information on the web and package it with InfoSlicer creating incredible collections.
tags = Tools;Internet
diff --git a/book.py b/book.py
index 4b348b9..4784884 100644
--- a/book.py
+++ b/book.py
@@ -157,7 +157,7 @@ def __init__(self, preinstalled, root):
self.revision = 1
if not os.path.exists(self.root):
- os.makedirs(self.root, 0775)
+ os.makedirs(self.root, 0o775)
for i in preinstalled:
filepath = os.path.join(get_bundle_path(), 'examples', i[1])
@@ -196,7 +196,7 @@ def _save(self, uid, contents):
directory = os.path.join(self.root, str(uid))
if not os.path.exists(directory):
- os.makedirs(directory, 0777)
+ os.makedirs(directory, 0o777)
contents = contents.replace(
'', '\n'
@@ -232,7 +232,7 @@ def __init__(self, filepath=None):
zip = zipfile.ZipFile(filepath, 'r')
for i in zip.namelist():
path = os.path.join(root, i)
- os.makedirs(os.path.dirname(path), 0775)
+ os.makedirs(os.path.dirname(path), 0o775)
file(path, 'wb').write(zip.read(i))
zip.close()
diff --git a/infoslicer/processing/Article.py b/infoslicer/processing/Article.py
index b3311b0..8d66f37 100644
--- a/infoslicer/processing/Article.py
+++ b/infoslicer/processing/Article.py
@@ -4,8 +4,8 @@
from gi.repository import GdkPixbuf
from random import Random
-from Article_Data import *
-from Section import *
+from .Article_Data import *
+from .Section import *
import logging
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/processing/Article_Builder.py b/infoslicer/processing/Article_Builder.py
index 5d2a429..43f1bae 100644
--- a/infoslicer/processing/Article_Builder.py
+++ b/infoslicer/processing/Article_Builder.py
@@ -1,8 +1,8 @@
# Copyright (C) IBM Corporation 2008
-from BeautifulSoup import Tag
-from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
-from Article_Data import *
+from .BeautifulSoup import Tag
+from .NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
+from .Article_Data import *
import re
import os
import logging
@@ -51,7 +51,7 @@ def get_article_from_dita(image_path, dita):
input.shortdesc.extract()
has_shortdesc = True
taglist = input.findAll(re.compile("refbody|section|p|ph|image"))
- for i in xrange(len(taglist)):
+ for i in range(len(taglist)):
tag = taglist[len(taglist) - i - 1]
if tag.name == "ph":
id = tag['id']
@@ -63,7 +63,7 @@ def get_article_from_dita(image_path, dita):
sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text)
sentence_data_list.insert(0, sentence_data)
elif tag.name == "p":
- if not tag.has_key("id"):
+ if not tag.has_key("id"):
id = -1
else:
id = tag['id']
@@ -75,7 +75,7 @@ def get_article_from_dita(image_path, dita):
sentence_data_list = []
current_p_id = id
elif tag.name == "refbody" :
- if tag.findParent("reference").has_key("id"):
+ if tag.findParent("reference").has_key("id"):
id = "r" + tag.findParent("reference")['id']
else:
id = "r90000"
diff --git a/infoslicer/processing/BeautifulSoup.py b/infoslicer/processing/BeautifulSoup.py
index 666a210..c26490d 100644
--- a/infoslicer/processing/BeautifulSoup.py
+++ b/infoslicer/processing/BeautifulSoup.py
@@ -77,7 +77,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
"""
-from __future__ import generators
+
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.1.0.1"
@@ -85,12 +85,12 @@
__license__ = "New-style BSD"
import codecs
-import markupbase
+import _markupbase
import types
import re
-from HTMLParser import HTMLParser, HTMLParseError
+from html.parser import HTMLParser
try:
- from htmlentitydefs import name2codepoint
+ from html.entities import name2codepoint
except ImportError:
name2codepoint = {}
try:
@@ -99,18 +99,18 @@
from sets import Set as set
#These hacks make Beautiful Soup able to parse XML with namespaces
-markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
-def sob(unicode, encoding):
+def sob(s, encoding):
    """Returns either the given Unicode string or its encoding."""
    if encoding is None:
- return unicode
+ return s
    else:
- return unicode.encode(encoding)
+ return s.encode(encoding)
class PageElement:
"""Contains the navigational information for some part of the page
@@ -154,7 +154,7 @@ def extract(self):
#this element (and any children) hadn't been parsed. Connect
#the two.
lastChild = self._lastRecursiveChild()
- nextElement = lastChild.next
+ nextElement = lastChild.next
if self.previous:
self.previous.next = nextElement
@@ -179,8 +179,8 @@ def _lastRecursiveChild(self):
return lastChild
def insert(self, position, newChild):
- if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
+ if (isinstance(newChild, str)
+ or isinstance(newChild, str)) \
and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
@@ -234,7 +234,7 @@ def insert(self, position, newChild):
newChild.nextSibling.previousSibling = newChild
newChildsLastElement.next = nextChild
- if newChildsLastElement.next:
+ if newChildsLastElement.next:
newChildsLastElement.next.previous = newChildsLastElement
self.contents.insert(position, newChild)
@@ -335,7 +335,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
g = generator()
while True:
try:
- i = g.next()
+ i = next(g)
except StopIteration:
break
if i:
@@ -351,7 +351,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs):
def nextGenerator(self):
i = self
while i:
- i = i.next
+ i = i.next
yield i
def nextSiblingGenerator(self):
@@ -386,22 +386,22 @@ def substituteEncoding(self, str, encoding=None):
def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode.
."""
- if isinstance(s, unicode):
+ if isinstance(s, str):
if encoding:
s = s.encode(encoding)
elif isinstance(s, str):
if encoding:
s = s.encode(encoding)
else:
- s = unicode(s)
+ s = str(s)
else:
if encoding:
s = self.toEncoding(str(s), encoding)
else:
- s = unicode(s)
+ s = str(s)
return s
-class NavigableString(unicode, PageElement):
+class NavigableString(str, PageElement):
def __new__(cls, value):
"""Create a new NavigableString.
@@ -411,12 +411,12 @@ def __new__(cls, value):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
- if isinstance(value, unicode):
- return unicode.__new__(cls, value)
- return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+ if isinstance(value, str):
+ return str.__new__(cls, value)
+ return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
- return (unicode(self),)
+ return (str(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -425,7 +425,7 @@ def __getattr__(self, attr):
if attr == 'string':
return self
else:
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
return self.decode().encode(encoding)
@@ -436,23 +436,23 @@ def decodeGivenEventualEncoding(self, eventualEncoding):
class CData(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<![CDATA[' + self + u']]>'
+ return '<![CDATA[' + self + ']]>'
class ProcessingInstruction(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
output = self
- if u'%SOUP-ENCODING%' in output:
+ if '%SOUP-ENCODING%' in output:
output = self.substituteEncoding(output, eventualEncoding)
- return u'<?' + output + u'?>'
+ return '<?' + output + '?>'
class Comment(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<!--' + self + u'-->'
+ return '<!--' + self + '-->'
class Declaration(NavigableString):
def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<!' + self + u'>'
+ return '<!' + self + '>'
class Tag(PageElement):
@@ -461,7 +461,7 @@ class Tag(PageElement):
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k,v in list(h.items()):
i[v] = k
return i
@@ -480,23 +480,23 @@ def _convertEntities(self, match):
escaped."""
x = match.group(1)
if self.convertHTMLEntities and x in name2codepoint:
- return unichr(name2codepoint[x])
+ return chr(name2codepoint[x])
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
if self.convertXMLEntities:
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
else:
- return u'&%s;' % x
+ return '&%s;' % x
elif len(x) > 0 and x[0] == '#':
# Handle numeric entities
if len(x) > 1 and x[1] == 'x':
- return unichr(int(x[2:], 16))
+ return chr(int(x[2:], 16))
else:
- return unichr(int(x[1:]))
+ return chr(int(x[1:]))
elif self.escapeUnrecognizedEntities:
- return u'&amp;%s;' % x
+ return '&amp;%s;' % x
else:
- return u'&%s;' % x
+ return '&%s;' % x
def __init__(self, parser, name, attrs=None, parent=None,
previous=None):
@@ -525,7 +525,7 @@ def convert(kval):
return kval
return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
self._convertEntities, val))
- self.attrs = map(convert, self.attrs)
+ self.attrs = list(map(convert, self.attrs))
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
@@ -534,7 +534,7 @@ def get(self, key, default=None):
return self._getAttrMap().get(key, default)
def has_key(self, key):
- return self._getAttrMap().has_key(key)
+ return key in self._getAttrMap()
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -552,7 +552,7 @@ def __len__(self):
def __contains__(self, x):
return x in self.contents
- def __nonzero__(self):
+ def __bool__(self):
"A tag is non-None even if it has no contents."
return True
@@ -578,14 +578,14 @@ def __delitem__(self, key):
#We don't break because bad HTML can define the same
#attribute multiple times.
self._getAttrMap()
- if self.attrMap.has_key(key):
+ if key in self.attrMap:
del self.attrMap[key]
def __call__(self, *args, **kwargs):
"""Calling a tag like a function is the same as calling its
findAll() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
- return apply(self.findAll, args, kwargs)
+ return self.findAll(*args, **kwargs)
def __getattr__(self, tag):
#print "Getattr %s.%s" % (self.__class__, tag)
@@ -593,7 +593,7 @@ def __getattr__(self, tag):
return self.find(tag[:-3])
elif tag.find('__') != 0:
return self.find(tag)
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -823,13 +823,13 @@ def _getAttrMap(self):
def recursiveChildGenerator(self):
if not len(self.contents):
raise StopIteration
- stopNode = self._lastRecursiveChild().next
+ stopNode = self._lastRecursiveChild().__next__
current = self.contents[0]
while current is not stopNode:
if not current:
break
yield current
- current = current.next
+ current = current.next
def childGenerator(self):
if not len(self.contents):
@@ -883,7 +883,7 @@ def searchTag(self, markupName=None, markupAttrs={}):
else:
match = True
markupAttrMap = None
- for attr, matchAgainst in self.attrs.items():
+ for attr, matchAgainst in list(self.attrs.items()):
if not markupAttrMap:
if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
@@ -924,14 +924,14 @@ def search(self, markup):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s" \
+ % markup.__class__)
return found
def _matches(self, markup, matchAgainst):
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
+ if matchAgainst == True and type(matchAgainst) == bool:
result = markup != None
elif callable(matchAgainst):
result = matchAgainst(markup)
@@ -941,7 +941,7 @@ def _matches(self, markup, matchAgainst):
if isinstance(markup, Tag):
markup = markup.name
if markup is not None and not isString(markup):
- markup = unicode(markup)
+ markup = str(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
@@ -950,10 +950,10 @@ def _matches(self, markup, matchAgainst):
and (markup is not None or not isString(matchAgainst))):
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
+ result = matchAgainst in markup
elif matchAgainst and isString(markup):
- if isinstance(markup, unicode):
- matchAgainst = unicode(matchAgainst)
+ if isinstance(markup, str):
+ matchAgainst = str(matchAgainst)
else:
matchAgainst = str(matchAgainst)
@@ -974,13 +974,13 @@ def isList(l):
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is listlike."""
return ((hasattr(l, '__iter__') and not isString(l))
- or (type(l) in (types.ListType, types.TupleType)))
+ or (type(l) in (list, tuple)))
def isString(s):
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is stringlike."""
try:
- return isinstance(s, unicode) or isinstance(s, basestring)
+        return isinstance(s, str)
except NameError:
return isinstance(s, str)
@@ -992,7 +992,7 @@ def buildTagMap(default, *args):
for portion in args:
if hasattr(portion, 'items'):
#It's a map. Merge it.
- for k,v in portion.items():
+ for k,v in list(portion.items()):
built[k] = v
elif isList(portion) and not isString(portion):
#It's a list. Map each item to the default.
@@ -1037,7 +1037,7 @@ def handle_pi(self, text):
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later."""
if text[:3] == "xml":
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
@@ -1047,7 +1047,7 @@ def handle_comment(self, text):
def handle_charref(self, ref):
"Handle character references as data."
if self.soup.convertEntities:
- data = unichr(int(ref))
+ data = chr(int(ref))
else:
data = '%s;' % ref
self.handle_data(data)
@@ -1059,7 +1059,7 @@ def handle_entityref(self, ref):
data = None
if self.soup.convertHTMLEntities:
try:
- data = unichr(name2codepoint[ref])
+ data = chr(name2codepoint[ref])
except KeyError:
pass
@@ -1111,12 +1111,12 @@ def parse_declaration(self, i):
j = k+3
self._toStringSubclass(data, CData)
else:
- try:
- j = HTMLParser.parse_declaration(self, i)
- except HTMLParseError:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
- j = i + len(toHandle)
+            try:
+                j = HTMLParser.parse_declaration(self, i)
+            except Exception:  # HTMLParseError was removed from html.parser in 3.5
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
return j
@@ -1150,7 +1150,7 @@ class BeautifulStoneSoup(Tag):
lambda x: '')
]
- ROOT_TAG_NAME = u'[document]'
+ ROOT_TAG_NAME = '[document]'
HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
@@ -1239,14 +1239,14 @@ class has some tricks for dealing with some HTML that kills
def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
- markup = dammit.unicode
+ markup = dammit.str
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
@@ -1272,8 +1272,8 @@ def _feed(self, inDocumentEncoding=None, isHTML=False):
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
- return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ return name in self.SELF_CLOSING_TAGS \
+ or name in self.instanceSelfClosingTags
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
@@ -1308,7 +1308,7 @@ def pushTag(self, tag):
def endData(self, containerClass=NavigableString):
if self.currentData:
- currentData = u''.join(self.currentData)
+ currentData = ''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.PRESERVE_WHITESPACE_TAGS)):
@@ -1371,7 +1371,7 @@ def _smartPop(self, name):
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
- isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ isResetNesting = name in self.RESET_NESTING_TAGS
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
@@ -1384,7 +1384,7 @@ def _smartPop(self, name):
if (nestingResetTriggers != None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
- and self.RESET_NESTING_TAGS.has_key(p.name)):
+ and p.name in self.RESET_NESTING_TAGS):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
@@ -1402,7 +1402,7 @@ def unknown_starttag(self, name, attrs, selfClosing=0):
if self.quoteStack:
#This is not a real tag.
#print "<%s> is not real!" % name
- attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+ attrs = ''.join([' %s="%s"' % (x_y[0], x_y[1]) for x_y in attrs])
self.handle_data('<%s%s>' % (name, attrs))
return
self.endData()
@@ -1496,7 +1496,7 @@ class BeautifulSoup(BeautifulStoneSoup):
BeautifulStoneSoup before writing your own subclass."""
def __init__(self, *args, **kwargs):
- if not kwargs.has_key('smartQuotesTo'):
+ if 'smartQuotesTo' not in kwargs:
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
@@ -1680,7 +1680,7 @@ def popTag(self):
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
+ tag.name not in parent.attrMap):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
@@ -1754,9 +1754,9 @@ def __init__(self, markup, overrideEncodings=[],
self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
- if markup == '' or isinstance(markup, unicode):
+ if markup == '' or isinstance(markup, str):
self.originalEncoding = None
- self.unicode = unicode(markup)
+ self.str = str(markup)
return
u = None
@@ -1769,7 +1769,7 @@ def __init__(self, markup, overrideEncodings=[],
if u: break
# If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
+ if not u and chardet and not isinstance(self.markup, str):
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
@@ -1778,7 +1778,7 @@ def __init__(self, markup, overrideEncodings=[],
u = self._convertFrom(proposed_encoding)
if u: break
- self.unicode = u
+ self.str = u
if not u: self.originalEncoding = None
def _subMSChar(self, match):
@@ -1786,7 +1786,7 @@ def _subMSChar(self, match):
entity."""
orig = match.group(1)
sub = self.MS_CHARS.get(orig)
- if type(sub) == types.TupleType:
+ if type(sub) == tuple:
if self.smartQuotesTo == 'xml':
sub = ''.encode() + sub[1].encode() + ';'.encode()
else:
@@ -1816,7 +1816,7 @@ def _convertFrom(self, proposed):
u = self._toUnicode(markup, proposed)
self.markup = u
self.originalEncoding = proposed
- except Exception, e:
+ except Exception as e:
# print "That didn't work!"
# print e
return None
@@ -1845,7 +1845,7 @@ def _toUnicode(self, data, encoding):
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
- newdata = unicode(data, encoding)
+ newdata = str(data, encoding)
return newdata
def _detectEncoding(self, xml_data, isHTML=False):
@@ -1858,41 +1858,41 @@ def _detectEncoding(self, xml_data, isHTML=False):
elif xml_data[:4] == '\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+ xml_data = str(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+ xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+ xml_data = str(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
(xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+ xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+ xml_data = str(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+ xml_data = str(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == '\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+ xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == '\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+ xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == '\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+ xml_data = str(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
@@ -1957,7 +1957,7 @@ def _ebcdic_to_ascii(self, s):
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
MS_CHARS = { '\x80' : ('euro', '20AC'),
@@ -2000,4 +2000,4 @@ def _ebcdic_to_ascii(self, s):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
+    print(soup.prettify())
diff --git a/infoslicer/processing/HTML_Parser.py b/infoslicer/processing/HTML_Parser.py
index adb6eb0..a1144c9 100644
--- a/infoslicer/processing/HTML_Parser.py
+++ b/infoslicer/processing/HTML_Parser.py
@@ -1,7 +1,7 @@
# Copyright (C) IBM Corporation 2008
-from BeautifulSoup import BeautifulSoup, Tag
-from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
+from .BeautifulSoup import BeautifulSoup, Tag
+from .NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup
import re
from datetime import date
@@ -78,9 +78,9 @@ def image_handler(self):
too_small = False
image_path = img['src']
alt_text = ""
- if img.has_key("width") and img.has_key("height") and int(img['width']) <= 70 and int(img['height']) <= 70:
+            if img.get("width") is not None and img.get("height") is not None and int(img['width']) <= 70 and int(img['height']) <= 70:
too_small = True
- if img.has_key("alt") and img['alt'] != "":
+            if img.get("alt") is not None and img['alt'] != "":
alt_text = img['alt']
else:
alt_text = image_path.split("/")[-1]
@@ -213,7 +213,7 @@ def tag_generator(self, tag, contents=None, attrs=[]):
@param attrs: Optional, attributes to add to tag
@return: new Tag object
"""
- if self.ids.has_key(tag) and attrs == []:
+ if tag in self.ids and attrs == []:
self.ids[tag] += 1
attrs = [("id", str(self.ids[tag]))]
if attrs != []:
@@ -232,7 +232,7 @@ def unTag(self, tag):
"""
for child in tag.findChildren(True, recursive=False):
self.unTag(child)
- if (self.remove_classes_regexp != "") and (tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) != None)):
+        if (self.remove_classes_regexp != "") and (tag.get("class") is not None and (re.match(self.remove_classes_regexp, tag["class"]) != None)):
tag.extract()
elif tag.name in self.keep_tags:
new_tag = Tag(self.input, tag.name)
diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py
index cdd5108..05220d4 100644
--- a/infoslicer/processing/HTML_strip.py
+++ b/infoslicer/processing/HTML_strip.py
@@ -14,7 +14,7 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-from HTMLParser import HTMLParser
+from html.parser import HTMLParser
from re import sub
from infoslicer.processing.Article_Data import Sentence_Data, \
Paragraph_Data, \
diff --git a/infoslicer/processing/MediaWiki_Helper.py b/infoslicer/processing/MediaWiki_Helper.py
index 988e418..b2caae3 100644
--- a/infoslicer/processing/MediaWiki_Helper.py
+++ b/infoslicer/processing/MediaWiki_Helper.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) IBM Corporation 2008
-import urllib
+import urllib.request, urllib.parse, urllib.error
from xml.dom import minidom
import logging
@@ -14,7 +14,7 @@
"""
Extend urllib class to spoof user-agent
"""
-class NewURLopener(urllib.FancyURLopener):
+class NewURLopener(urllib.request.FancyURLopener):
version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"
class PageNotFoundError(Exception):
@@ -61,7 +61,7 @@ def resolveTitle(self, title, wiki=defaultWiki):
#check page exists, return None if it doesn't
page = xmldoc.getElementsByTagName("page")
if (page != []):
- if ("missing" in page[0].attributes.keys()):
+            if ("missing" in page[0].attributes.keys()):
raise PageNotFoundError("The article with title '%s' could not be found on wiki '%s'" % (title, wiki))
#check if there are any redirection tags defined
redirectList = xmldoc.getElementsByTagName("r")
@@ -107,12 +107,12 @@ def getDoc(self, path):
@param path: location of remote file
@return: page contents
@rtype: string"""
- urllib._urlopener = NewURLopener()
+ urllib.request._urlopener = NewURLopener()
logger.debug("opening " + path)
logger.debug("proxies: " + str(self.proxies))
pathencoded = self.urlEncodeNonAscii(path)
logger.debug("pathencoded " + pathencoded)
- doc = urllib.urlopen(pathencoded, proxies=self.proxies)
+        doc = urllib.request.build_opener(urllib.request.ProxyHandler(self.proxies)).open(pathencoded)
output = doc.read()
doc.close()
logger.debug("url opened successfully")
@@ -151,7 +151,7 @@ def getImageURLs(self, title, wiki=defaultWiki, revision=None):
xmldoc = minidom.parseString(self.getDoc(path))
imglist = xmldoc.getElementsByTagName("im")
outputlist = []
- for i in xrange(len(imglist)):
+ for i in range(len(imglist)):
#create the API request string
path = "http://%s/w/api.php?action=query&titles=%s&prop=imageinfo&iiprop=url&format=xml" % (wiki, imglist[i].attributes["title"].value.replace(" ","_"))
xmldoc2 = minidom.parseString(self.getDoc(path))
diff --git a/infoslicer/processing/MediaWiki_Parser.py b/infoslicer/processing/MediaWiki_Parser.py
index 1596c57..c6a29c1 100644
--- a/infoslicer/processing/MediaWiki_Parser.py
+++ b/infoslicer/processing/MediaWiki_Parser.py
@@ -1,6 +1,6 @@
# Copyright (C) IBM Corporation 2008
-from HTML_Parser import HTML_Parser, NoDocException
+from .HTML_Parser import HTML_Parser, NoDocException
import re
import logging
@@ -36,7 +36,7 @@ def specialise(self):
#infobox should be first table
first_table = self.input.find("table")
#the word "infobox" should be in the class name somewhere
- if (first_table != None and first_table.has_key("class") and (re.match(re.compile("infobox"), first_table["class"]) != None)):
+        if (first_table != None and first_table.get("class") is not None and (re.match(re.compile("infobox"), first_table["class"]) != None)):
#make a new output tag to work with
infobox_tag = self.tag_generator("section", attrs=[("id", "infobox")])
#sometimes infobox data is in an inner table
diff --git a/infoslicer/processing/NewtifulSoup.py b/infoslicer/processing/NewtifulSoup.py
index 4e26a12..74e89be 100644
--- a/infoslicer/processing/NewtifulSoup.py
+++ b/infoslicer/processing/NewtifulSoup.py
@@ -1,6 +1,6 @@
# Copyright (C) IBM Corporation 2008
-from BeautifulSoup import BeautifulStoneSoup
+from .BeautifulSoup import BeautifulStoneSoup
#Extend beautiful soup HTML parsing library
#to recognise new self-closing tag
diff --git a/infoslicer/processing/Paragraph.py b/infoslicer/processing/Paragraph.py
index 563fd16..b59983e 100644
--- a/infoslicer/processing/Paragraph.py
+++ b/infoslicer/processing/Paragraph.py
@@ -1,6 +1,6 @@
# Copyright (C) IBM Corporation 2008
-from Sentence import *
+from .Sentence import *
import logging
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/processing/Section.py b/infoslicer/processing/Section.py
index bc5f847..27a4f3e 100644
--- a/infoslicer/processing/Section.py
+++ b/infoslicer/processing/Section.py
@@ -1,6 +1,6 @@
# Copyright (C) IBM Corporation 2008
-from Paragraph import *
+from .Paragraph import *
import logging
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/processing/Sentence.py b/infoslicer/processing/Sentence.py
index 9659dbb..805f66d 100644
--- a/infoslicer/processing/Sentence.py
+++ b/infoslicer/processing/Sentence.py
@@ -7,7 +7,7 @@
from gi.repository import GdkPixbuf
import logging
-from Article_Data import *
+from .Article_Data import *
"""
Created by Jonathan Mace
diff --git a/infoslicer/widgets/Edit_Pane.py b/infoslicer/widgets/Edit_Pane.py
index d7ab056..ff26ae0 100644
--- a/infoslicer/widgets/Edit_Pane.py
+++ b/infoslicer/widgets/Edit_Pane.py
@@ -9,8 +9,8 @@
from sugar3.graphics.toolcombobox import ToolComboBox
-from Reading_View import Reading_View
-from Editing_View import Editing_View
+from .Reading_View import Reading_View
+from .Editing_View import Editing_View
from infoslicer.processing.Article import Article
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/widgets/Editable_Textbox.py b/infoslicer/widgets/Editable_Textbox.py
index b1da2d2..6d6d1a0 100644
--- a/infoslicer/widgets/Editable_Textbox.py
+++ b/infoslicer/widgets/Editable_Textbox.py
@@ -5,13 +5,13 @@
from gi.repository import Gdk
from gi.repository import GObject
from gi.repository import Pango
-import cPickle
+import pickle
import copy
-from Textbox import Textbox
+from .Textbox import Textbox
import logging
-SNAP_SENTENCE, SNAP_PARAGRAPH, SNAP_SECTION, SNAP_NONE = range(4)
+SNAP_SENTENCE, SNAP_PARAGRAPH, SNAP_SECTION, SNAP_NONE = range(4)
class Editable_Textbox( Textbox ):
"""
@@ -266,7 +266,7 @@ def drag_data_received_event(self, widget, context, x, y, selection_data, info,
a = self.article
insert_loc = self.get_mouse_iter(x, y)
data_received_type = str(selection_data.get_data_type())
- data = cPickle.loads(str(selection_data.get_data()))
+        data = pickle.loads(selection_data.get_data())
if data_received_type == "sentence":
bestpoint = insert_loc
@@ -293,7 +293,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, time, data)
if self.snapto == SNAP_SECTION:
atom = Gdk.atom_intern("section", only_if_exists=False)
- string = cPickle.dumps(a.getSelection())
+ string = pickle.dumps(a.getSelection())
selection_data.set(atom, 8, string)
self.stop_emission("drag-data-get")
diff --git a/infoslicer/widgets/Editing_View.py b/infoslicer/widgets/Editing_View.py
index 3f9ecdc..32f7b12 100644
--- a/infoslicer/widgets/Editing_View.py
+++ b/infoslicer/widgets/Editing_View.py
@@ -4,7 +4,7 @@
from gi.repository import Gtk
from gi.repository import Gdk
from gi.repository import GObject
-from Editable_Textbox import Editable_Textbox
+from .Editable_Textbox import Editable_Textbox
class Editing_View( Gtk.VBox ):
"""
diff --git a/infoslicer/widgets/Format_Pane.py b/infoslicer/widgets/Format_Pane.py
index 1be64f2..d45d0c1 100644
--- a/infoslicer/widgets/Format_Pane.py
+++ b/infoslicer/widgets/Format_Pane.py
@@ -4,7 +4,7 @@
from gi.repository import Gtk
from gettext import gettext as _
-from Editing_View import Editing_View
+from .Editing_View import Editing_View
class Format_Pane(Editing_View):
"""
diff --git a/infoslicer/widgets/Gallery_View.py b/infoslicer/widgets/Gallery_View.py
index 7cd3ce8..0d17480 100644
--- a/infoslicer/widgets/Gallery_View.py
+++ b/infoslicer/widgets/Gallery_View.py
@@ -4,10 +4,10 @@
from gi.repository import GObject
from gi.repository import GdkPixbuf
import os
-import cPickle
+import pickle
import logging
-from Editable_Textbox import Editable_Textbox
+from .Editable_Textbox import Editable_Textbox
from infoslicer.processing.Article_Data import *
from infoslicer.processing.Article import Article
import book
@@ -162,7 +162,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, timestamp,
paragraph1data = Paragraph_Data(0, self.source_article_id, 0, 0, [imagedata])
paragraph2data = Paragraph_Data(0, self.source_article_id, 0, 0, [captiondata])
sectionsdata = [Section_Data(0, self.source_article_id, 0, [paragraph1data, paragraph2data])]
- string = cPickle.dumps(sectionsdata)
+ string = pickle.dumps(sectionsdata)
selection_data.set(atom, 8, string)
def _validate_image_list(root, image_list):
@@ -171,7 +171,7 @@ def _validate_image_list(root, image_list):
@param image_list: list of images to validate
@return: list of images with corrected paths, and broken images removed
"""
- for i in xrange(len(image_list)):
+ for i in range(len(image_list)):
if not os.access(image_list[i][0], os.F_OK):
if os.access(os.path.join(root, image_list[i][0]), os.F_OK):
image_list[i] = (os.path.join(root, image_list[i][0]),
diff --git a/infoslicer/widgets/Image_Pane.py b/infoslicer/widgets/Image_Pane.py
index 473253c..356de50 100644
--- a/infoslicer/widgets/Image_Pane.py
+++ b/infoslicer/widgets/Image_Pane.py
@@ -7,8 +7,8 @@
import logging
from gettext import gettext as _
-from Editing_View import Editing_View
-from Gallery_View import Gallery_View
+from .Editing_View import Editing_View
+from .Gallery_View import Gallery_View
from infoslicer.processing.Article import Article
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/widgets/Journal_Gallery_View.py b/infoslicer/widgets/Journal_Gallery_View.py
index 5358d05..6ed7e22 100644
--- a/infoslicer/widgets/Journal_Gallery_View.py
+++ b/infoslicer/widgets/Journal_Gallery_View.py
@@ -15,11 +15,11 @@
from gi.repository import GObject
from gi.repository import GdkPixbuf
import os
-import cPickle
+import pickle
import pickle
import logging
-from Editable_Textbox import Editable_Textbox
+from .Editable_Textbox import Editable_Textbox
from infoslicer.processing.Article_Data import *
from infoslicer.processing.Article import Article
import book
@@ -158,7 +158,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, timestamp,
paragraph1data = Paragraph_Data(0, self.source_article_id, 0, 0, [imagedata])
paragraph2data = Paragraph_Data(0, self.source_article_id, 0, 0, [captiondata])
sectionsdata = [Section_Data(0, self.source_article_id, 0, [paragraph1data, paragraph2data])]
- string = cPickle.dumps(sectionsdata)
+ string = pickle.dumps(sectionsdata)
selection_data.set(atom, 8, string)
def add_image(self, image_path, title):
diff --git a/infoslicer/widgets/Journal_Image_Pane.py b/infoslicer/widgets/Journal_Image_Pane.py
index 38c1c23..6ae8c5b 100644
--- a/infoslicer/widgets/Journal_Image_Pane.py
+++ b/infoslicer/widgets/Journal_Image_Pane.py
@@ -7,7 +7,7 @@
import logging
from gettext import gettext as _
-from Editing_View import Editing_View
+from .Editing_View import Editing_View
from infoslicer.widgets.Journal_Gallery_View import Journal_Gallery_View
from infoslicer.processing.Article import Article
diff --git a/infoslicer/widgets/Reading_View.py b/infoslicer/widgets/Reading_View.py
index 3c40757..bc67cf0 100644
--- a/infoslicer/widgets/Reading_View.py
+++ b/infoslicer/widgets/Reading_View.py
@@ -3,7 +3,7 @@
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk
from gi.repository import GObject
-from Readonly_Textbox import Readonly_Textbox
+from .Readonly_Textbox import Readonly_Textbox
import logging
logger = logging.getLogger('infoslicer')
diff --git a/infoslicer/widgets/Readonly_Textbox.py b/infoslicer/widgets/Readonly_Textbox.py
index 3d8e40f..745458a 100644
--- a/infoslicer/widgets/Readonly_Textbox.py
+++ b/infoslicer/widgets/Readonly_Textbox.py
@@ -2,11 +2,11 @@
from gi.repository import Gtk
from gi.repository import Gdk
from gi.repository import Pango
-import cPickle
+import pickle
import logging
-from Textbox import Textbox
+from .Textbox import Textbox
-SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4)
+SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4)
class Readonly_Textbox( Textbox ):
"""
@@ -172,7 +172,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, time, data)
if self.selectionmode == SELECT_SECTION:
atom = Gdk.atom_intern("section", only_if_exists=False)
- string = cPickle.dumps(a.getSelection())
+ string = pickle.dumps(a.getSelection())
selection_data.set(atom, 8, string)
self.stop_emission("drag-data-get")
self.set_editable(False)
diff --git a/infoslicer/widgets/Textbox.py b/infoslicer/widgets/Textbox.py
index 079c775..0d71a5f 100644
--- a/infoslicer/widgets/Textbox.py
+++ b/infoslicer/widgets/Textbox.py
@@ -4,9 +4,9 @@
from gi.repository import Gtk
from gi.repository import GObject
from gi.repository import Pango
-import cPickle
+import pickle
-SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4)
+SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4)
class Textbox( Gtk.TextView ):
"""
diff --git a/net.py b/net.py
index b74fcdf..fbbfe63 100644
--- a/net.py
+++ b/net.py
@@ -16,7 +16,7 @@
import os
import shutil
-import urllib
+import urllib.request, urllib.parse, urllib.error
import logging
from gettext import gettext as _
@@ -48,11 +48,11 @@ def download_wiki_article(title, wiki, progress):
progress.set_label(_('"%s" successfully downloaded') % title)
- except PageNotFoundError, e:
+ except PageNotFoundError as e:
elogger.debug('download_and_add: %s' % e)
progress.set_label(_('"%s" could not be found') % title)
- except Exception, e:
+ except Exception as e:
elogger.debug('download_and_add: %s' % e)
progress.set_label(_('Error downloading "%s"; check your connection') % title)
@@ -70,7 +70,7 @@ def image_handler(root, uid, document):
logger.debug('image_handler: %s' % dir_path)
if not os.path.exists(dir_path):
- os.makedirs(dir_path, 0777)
+ os.makedirs(dir_path, 0o777)
for image in document.findAll("image"):
fail = False
@@ -83,7 +83,7 @@ def image_handler(root, uid, document):
else:
image_title = path.rsplit("/", 1)[-1]
# attempt to fix incomplete paths
- if (not path.startswith("http://")) and document.source != None and document.source.has_key("href"):
+        if (not path.startswith("http://")) and document.source != None and document.source.get("href") is not None:
if path.startswith("//upload"):
path = 'http:' + path
elif path.startswith("/"):
@@ -111,19 +111,19 @@ def _open_url(url):
"""
retrieves content from specified url
"""
- urllib._urlopener = _new_url_opener()
+ urllib.request._urlopener = _new_url_opener()
try:
logger.debug("opening " + url)
logger.debug("proxies: " + str(proxies))
- doc = urllib.urlopen(url, proxies=proxies)
+        doc = urllib.request.build_opener(urllib.request.ProxyHandler(proxies)).open(url)
output = doc.read()
doc.close()
logger.debug("url opened succesfully")
return output
- except IOError, e:
+ except IOError as e:
elogger.debug('_open_url: %s' % e)
-class _new_url_opener(urllib.FancyURLopener):
+class _new_url_opener(urllib.request.FancyURLopener):
version = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1b2)" \
"Gecko/20081218 Gentoo Iceweasel/3.1b2"