diff --git a/BeautifulSoup.py b/BeautifulSoup.py index 9a44367..04f96f4 100644 --- a/BeautifulSoup.py +++ b/BeautifulSoup.py @@ -26,7 +26,7 @@ Beautiful Soup defines classes for two main parsing strategies: - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + * BeautifulStoneSoup, for parsing XML, HTML, or your domain-specific language that kind of looks like XML. * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid @@ -77,21 +77,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. """ -from __future__ import generators + __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.2.1" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "New-style BSD" -from sgmllib import SGMLParser, SGMLParseError +from html.parser import HTMLParser import codecs -import markupbase +import _markupbase import types import re -import sgmllib try: - from htmlentitydefs import name2codepoint + from html.entities import name2codepoint except ImportError: name2codepoint = {} try: @@ -100,8 +99,7 @@ from sets import Set as set #These hacks make Beautiful Soup able to parse XML with namespaces -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match +_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" @@ -118,7 +116,7 @@ class PageElement(object): def _invert(h): "Cheap function to invert a hash." i = {} - for k,v in h.items(): + for k,v in list(h.items()): i[v] = k return i @@ -178,7 +176,7 @@ def extract(self): #this element (and any children) hadn't been parsed. Connect #the two. 
lastChild = self._lastRecursiveChild() - nextElement = lastChild.next + nextElement = lastChild.next if self.previous: self.previous.next = nextElement @@ -203,7 +201,7 @@ def _lastRecursiveChild(self): return lastChild def insert(self, position, newChild): - if isinstance(newChild, basestring) \ + if isinstance(newChild, str) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) @@ -257,7 +255,7 @@ def insert(self, position, newChild): newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild - if newChildsLastElement.next: + if newChildsLastElement.next: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) @@ -358,7 +356,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs): return [element for element in generator() if isinstance(element, Tag)] # findAll*('tag-name') - elif isinstance(name, basestring): + elif isinstance(name, str): return [element for element in generator() if isinstance(element, Tag) and element.name == name] @@ -371,7 +369,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs): g = generator() while True: try: - i = g.next() + i = next(g) except StopIteration: break if i: @@ -387,7 +385,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs): def nextGenerator(self): i = self while i is not None: - i = i.next + i = i.next yield i def nextSiblingGenerator(self): @@ -422,19 +420,19 @@ def substituteEncoding(self, str, encoding=None): def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. 
.""" - if isinstance(s, unicode): + if isinstance(s, str): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: - s = unicode(s) + s = str(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: - s = unicode(s) + s = str(s) return s BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" @@ -447,7 +445,7 @@ def _sub_entity(self, x): return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): def __new__(cls, value): """Create a new NavigableString. @@ -457,9 +455,9 @@ def __new__(cls, value): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + if isinstance(value, str): + return str.__new__(cls, value) + return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (NavigableString.__str__(self),) @@ -471,7 +469,7 @@ def __getattr__(self, attr): if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) def __unicode__(self): return str(self).decode(DEFAULT_OUTPUT_ENCODING) @@ -515,23 +513,23 @@ def _convertEntities(self, match): escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) + return chr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: - return u'&%s;' % x + return '&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) + return chr(int(x[2:], 16)) else: - return unichr(int(x[1:])) + return 
chr(int(x[1:])) elif self.escapeUnrecognizedEntities: - return u'&%s;' % x + return '&%s;' % x else: - return u'&%s;' % x + return '&%s;' % x def __init__(self, parser, name, attrs=None, parent=None, previous=None): @@ -545,7 +543,7 @@ def __init__(self, parser, name, attrs=None, parent=None, if attrs is None: attrs = [] elif isinstance(attrs, dict): - attrs = attrs.items() + attrs = list(attrs.items()) self.attrs = attrs self.contents = [] self.setup(parent, previous) @@ -556,11 +554,11 @@ def __init__(self, parser, name, attrs=None, parent=None, self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # Convert any HTML, XML, or numeric entities in the attribute values. - convert = lambda(k, val): (k, + convert = lambda k_val: (k_val[0], re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, - val)) - self.attrs = map(convert, self.attrs) + k_val[1])) + self.attrs = list(map(convert, self.attrs)) def getString(self): if (len(self.contents) == 1 @@ -574,16 +572,16 @@ def setString(self, string): string = property(getString, setString) - def getText(self, separator=u""): + def getText(self, separator=""): if not len(self.contents): - return u"" - stopNode = self._lastRecursiveChild().next + return "" + stopNode = self._lastRecursiveChild().next strings = [] current = self.contents[0] while current is not stopNode: if isinstance(current, NavigableString): strings.append(current.strip()) - current = current.next + current = current.next return separator.join(strings) text = property(getText) @@ -606,7 +604,7 @@ def index(self, element): raise ValueError("Tag.index: element not in tag") def has_key(self, key): - return self._getAttrMap().has_key(key) + return key in self._getAttrMap() def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, @@ -624,7 +622,7 @@ def __len__(self): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it 
has no contents." return True @@ -650,14 +648,14 @@ def __delitem__(self, key): #We don't break because bad HTML can define the same #attribute multiple times. self._getAttrMap() - if self.attrMap.has_key(key): + if key in self.attrMap: del self.attrMap[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" - return apply(self.findAll, args, kwargs) + return self.findAll(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) @@ -665,7 +663,7 @@ def __getattr__(self, tag): return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -709,7 +707,7 @@ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' - if isinstance(val, basestring): + if isinstance(val, str): if self.containsSubstitutions and '%SOUP-ENCODING%' in val: val = self.substituteEncoding(val, encoding) @@ -786,7 +784,7 @@ def decompose(self): return current = self.contents[0] while current is not None: - next = current.next + next = current.next if isinstance(current, Tag): del current.contents[:] current.parent = None @@ -879,11 +877,11 @@ def childGenerator(self): def recursiveChildGenerator(self): if not len(self.contents): raise StopIteration - stopNode = self._lastRecursiveChild().next + stopNode = self._lastRecursiveChild().next current = self.contents[0] while current is not stopNode: yield current - current = current.next + current = current.next # Next, a couple classes to represent queries and their results. 
@@ -893,7 +891,7 @@ class SoupStrainer: def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name - if isinstance(attrs, basestring): + if isinstance(attrs, str): kwargs['class'] = _match_css_class(attrs) attrs = None if kwargs: @@ -929,7 +927,7 @@ def searchTag(self, markupName=None, markupAttrs={}): else: match = True markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): + for attr, matchAgainst in list(self.attrs.items()): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs @@ -967,12 +965,12 @@ def search(self, markup): found = self.searchTag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception("I don't know how to match against a %s" \ + % markup.__class__) return found def _matches(self, markup, matchAgainst): @@ -987,8 +985,8 @@ def _matches(self, markup, matchAgainst): #other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name - if markup and not isinstance(markup, basestring): - markup = unicode(markup) + if markup and not isinstance(markup, str): + markup = str(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. 
@@ -996,10 +994,10 @@ def _matches(self, markup, matchAgainst): elif hasattr(matchAgainst, '__iter__'): # list-like result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) + result = matchAgainst in markup + elif matchAgainst and isinstance(markup, str): + if isinstance(markup, str): + matchAgainst = str(matchAgainst) else: matchAgainst = str(matchAgainst) @@ -1024,7 +1022,7 @@ def buildTagMap(default, *args): for portion in args: if hasattr(portion, 'items'): #It's a map. Merge it. - for k,v in portion.items(): + for k,v in list(portion.items()): built[k] = v elif hasattr(portion, '__iter__'): # is a list #It's a list. Map each item to the default. @@ -1037,7 +1035,7 @@ def buildTagMap(default, *args): # Now, the parser classes. -class BeautifulStoneSoup(Tag, SGMLParser): +class BeautifulStoneSoup(Tag, HTMLParser): """This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the @@ -1067,7 +1065,7 @@ class BeautifulStoneSoup(Tag, SGMLParser): lambda x: '') ] - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' HTML_ENTITIES = "html" XML_ENTITIES = "xml" @@ -1088,9 +1086,9 @@ def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, provided markup (which can be a string or a file-like object) is fed into the underlying parser. - sgmllib will process most bad HTML, and the BeautifulSoup + html.parser will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data + HTMLParser, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly. 
@@ -1100,7 +1098,7 @@ class has some tricks for dealing with some HTML that kills you'll get better performance. The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: + instances of invalid HTML that choke HTMLParser:
(No space between name of closing tag and tag close) (Extraneous whitespace in declaration) @@ -1138,7 +1136,7 @@ class has some tricks for dealing with some HTML that kills self.escapeUnrecognizedEntities = False self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) + HTMLParser.__init__(self) if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() @@ -1151,7 +1149,7 @@ class has some tricks for dealing with some HTML that kills self.markup = None # The markup can now be GCed def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" + """This method fixes a bug in Python's HTMLParser.""" try: n = int(name) except ValueError: @@ -1163,14 +1161,14 @@ def convert_charref(self, name): def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup - if isinstance(markup, unicode): + if isinstance(markup, str): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) - markup = dammit.unicode + markup = dammit.str self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: @@ -1187,20 +1185,20 @@ def _feed(self, inDocumentEncoding=None, isHTML=False): del(self.markupMassage) self.reset() - SGMLParser.feed(self, markup) + HTMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. 
self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser + """This method routes method call requests to either the HTMLParser superclass or the Tag superclass, depending on the method name.""" #print "__getattr__ called on %s.%s" % (self.__class__, methodName) if methodName.startswith('start_') or methodName.startswith('end_') \ or methodName.startswith('do_'): - return SGMLParser.__getattr__(self, methodName) + return HTMLParser.__getattr__(self, methodName) elif not methodName.startswith('__'): return Tag.__getattr__(self, methodName) else: @@ -1209,13 +1207,13 @@ def __getattr__(self, methodName): def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) + return name in self.SELF_CLOSING_TAGS \ + or name in self.instanceSelfClosingTags def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 - SGMLParser.reset(self) + HTMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] @@ -1239,7 +1237,7 @@ def pushTag(self, tag): def endData(self, containerClass=NavigableString): if self.currentData: - currentData = u''.join(self.currentData) + currentData = ''.join(self.currentData) if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and not set([tag.name for tag in self.tagStack]).intersection( self.PRESERVE_WHITESPACE_TAGS)): @@ -1302,7 +1300,7 @@ def _smartPop(self, name): nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True for i in range(len(self.tagStack)-1, 0, -1): @@ -1315,7 +1313,7 @@ def _smartPop(self, name): if (nestingResetTriggers 
is not None and p.name in nestingResetTriggers) \ or (nestingResetTriggers is None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): + and p.name in self.RESET_NESTING_TAGS): #If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag @@ -1386,7 +1384,7 @@ def handle_pi(self, text): object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.""" if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + text = "xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): @@ -1396,7 +1394,7 @@ def handle_comment(self, text): def handle_charref(self, ref): "Handle character references as data." if self.convertEntities: - data = unichr(int(ref)) + data = chr(int(ref)) else: data = '&#%s;' % ref self.handle_data(data) @@ -1408,7 +1406,7 @@ def handle_entityref(self, ref): data = None if self.convertHTMLEntities: try: - data = unichr(name2codepoint[ref]) + data = chr(name2codepoint[ref]) except KeyError: pass @@ -1449,7 +1447,7 @@ def handle_decl(self, data): self._toStringSubclass(data, Declaration) def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA + """Treat a bogus HTML declaration as raw data. 
Treat a CDATA declaration as a CData object.""" j = None if self.rawdata[i:i+9] == '= 4) and (xml_data[:2] == '\xfe\xff') \ and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x3f\x00': # UTF-16LE sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + xml_data = str(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + xml_data = str(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x00\x00': # UTF-32LE sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + xml_data = str(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + xml_data = str(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' pass @@ -1972,7 +1970,7 @@ def _ebcdic_to_ascii(self, s): 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = 
string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), @@ -2015,4 +2013,4 @@ def _ebcdic_to_ascii(self, s): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) diff --git a/activity/activity.info b/activity/activity.info index 7c37656..3b81ca8 100644 --- a/activity/activity.info +++ b/activity/activity.info @@ -4,7 +4,7 @@ activity_version = 25 license = GPLv2+;GPLv3+;BSD icon = slicelogo bundle_id = org.sugarlabs.InfoSlicer -exec = sugar-activity activity.InfoslicerActivity +exec = sugar-activity3 activity.InfoslicerActivity show_launcher = yes summary = Is it possible to have my own encyclopedia? Yes! Find your favorite information on the web and package it with InfoSlicer creating incredible collections. tags = Tools;Internet diff --git a/book.py b/book.py index 4b348b9..4784884 100644 --- a/book.py +++ b/book.py @@ -157,7 +157,7 @@ def __init__(self, preinstalled, root): self.revision = 1 if not os.path.exists(self.root): - os.makedirs(self.root, 0775) + os.makedirs(self.root, 0o775) for i in preinstalled: filepath = os.path.join(get_bundle_path(), 'examples', i[1]) @@ -196,7 +196,7 @@ def _save(self, uid, contents): directory = os.path.join(self.root, str(uid)) if not os.path.exists(directory): - os.makedirs(directory, 0777) + os.makedirs(directory, 0o777) contents = contents.replace( '', '\n' @@ -232,7 +232,7 @@ def __init__(self, filepath=None): zip = zipfile.ZipFile(filepath, 'r') for i in zip.namelist(): path = os.path.join(root, i) - os.makedirs(os.path.dirname(path), 0775) + os.makedirs(os.path.dirname(path), 0o775) file(path, 'wb').write(zip.read(i)) zip.close() diff --git a/infoslicer/processing/Article.py b/infoslicer/processing/Article.py index b3311b0..8d66f37 100644 --- a/infoslicer/processing/Article.py +++ 
b/infoslicer/processing/Article.py @@ -4,8 +4,8 @@ from gi.repository import GdkPixbuf from random import Random -from Article_Data import * -from Section import * +from .Article_Data import * +from .Section import * import logging logger = logging.getLogger('infoslicer') diff --git a/infoslicer/processing/Article_Builder.py b/infoslicer/processing/Article_Builder.py index 5d2a429..43f1bae 100644 --- a/infoslicer/processing/Article_Builder.py +++ b/infoslicer/processing/Article_Builder.py @@ -1,8 +1,8 @@ # Copyright (C) IBM Corporation 2008 -from BeautifulSoup import Tag -from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup -from Article_Data import * +from .BeautifulSoup import Tag +from .NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup +from .Article_Data import * import re import os import logging @@ -51,7 +51,7 @@ def get_article_from_dita(image_path, dita): input.shortdesc.extract() has_shortdesc = True taglist = input.findAll(re.compile("refbody|section|p|ph|image")) - for i in xrange(len(taglist)): + for i in range(len(taglist)): tag = taglist[len(taglist) - i - 1] if tag.name == "ph": id = tag['id'] @@ -63,7 +63,7 @@ def get_article_from_dita(image_path, dita): sentence_data = Sentence_Data(id, source_article_id, source_section_id, source_paragraph_id, source_sentence_id, text) sentence_data_list.insert(0, sentence_data) elif tag.name == "p": - if not tag.has_key("id"): + if "id" not in tag: id = -1 else: id = tag['id'] @@ -75,7 +75,7 @@ def get_article_from_dita(image_path, dita): sentence_data_list = [] current_p_id = id elif tag.name == "refbody" : - if tag.findParent("reference").has_key("id"): + if "id" in tag.findParent("reference"): id = "r" + tag.findParent("reference")['id'] else: id = "r90000" diff --git a/infoslicer/processing/BeautifulSoup.py b/infoslicer/processing/BeautifulSoup.py index 666a210..c26490d 100644 --- a/infoslicer/processing/BeautifulSoup.py +++ b/infoslicer/processing/BeautifulSoup.py @@ -77,7 +77,7 @@ 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. """ -from __future__ import generators + __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.1.0.1" @@ -85,12 +85,12 @@ __license__ = "New-style BSD" import codecs -import markupbase +import _markupbase import types import re -from HTMLParser import HTMLParser, HTMLParseError +from html.parser import HTMLParser try: - from htmlentitydefs import name2codepoint + from html.entities import name2codepoint except ImportError: name2codepoint = {} try: @@ -99,18 +99,18 @@ from sets import Set as set #These hacks make Beautiful Soup able to parse XML with namespaces -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match +_markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" # First, the classes that represent markup elements. -def sob(unicode, encoding): +def sob(str, encoding): """Returns either the given Unicode string or its encoding.""" if encoding is None: - return unicode + return str else: - return unicode.encode(encoding) + return str.encode(encoding) class PageElement: """Contains the navigational information for some part of the page @@ -154,7 +154,7 @@ def extract(self): #this element (and any children) hadn't been parsed. Connect #the two. 
lastChild = self._lastRecursiveChild() - nextElement = lastChild.next + nextElement = lastChild.__next__ if self.previous: self.previous.next = nextElement @@ -179,8 +179,8 @@ def _lastRecursiveChild(self): return lastChild def insert(self, position, newChild): - if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ + if (isinstance(newChild, str) + or isinstance(newChild, str)) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) @@ -234,7 +234,7 @@ def insert(self, position, newChild): newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild - if newChildsLastElement.next: + if newChildsLastElement.__next__: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) @@ -335,7 +335,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs): g = generator() while True: try: - i = g.next() + i = next(g) except StopIteration: break if i: @@ -351,7 +351,7 @@ def _findAll(self, name, attrs, text, limit, generator, **kwargs): def nextGenerator(self): i = self while i: - i = i.next + i = i.__next__ yield i def nextSiblingGenerator(self): @@ -386,22 +386,22 @@ def substituteEncoding(self, str, encoding=None): def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. .""" - if isinstance(s, unicode): + if isinstance(s, str): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: - s = unicode(s) + s = str(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: - s = unicode(s) + s = str(s) return s -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): def __new__(cls, value): """Create a new NavigableString. @@ -411,12 +411,12 @@ def __new__(cls, value): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. 
""" - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + if isinstance(value, str): + return str.__new__(cls, value) + return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -425,7 +425,7 @@ def __getattr__(self, attr): if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) def encode(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.decode().encode(encoding) @@ -436,23 +436,23 @@ def decodeGivenEventualEncoding(self, eventualEncoding): class CData(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class ProcessingInstruction(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): output = self - if u'%SOUP-ENCODING%' in output: + if '%SOUP-ENCODING%' in output: output = self.substituteEncoding(output, eventualEncoding) - return u'' + return '' class Comment(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class Declaration(NavigableString): def decodeGivenEventualEncoding(self, eventualEncoding): - return u'' + return '' class Tag(PageElement): @@ -461,7 +461,7 @@ class Tag(PageElement): def _invert(h): "Cheap function to invert a hash." 
i = {} - for k,v in h.items(): + for k,v in list(h.items()): i[v] = k return i @@ -480,23 +480,23 @@ def _convertEntities(self, match): escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: - return unichr(name2codepoint[x]) + return chr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: - return u'&%s;' % x + return '&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': - return unichr(int(x[2:], 16)) + return chr(int(x[2:], 16)) else: - return unichr(int(x[1:])) + return chr(int(x[1:])) elif self.escapeUnrecognizedEntities: - return u'&%s;' % x + return '&%s;' % x else: - return u'&%s;' % x + return '&%s;' % x def __init__(self, parser, name, attrs=None, parent=None, previous=None): @@ -525,7 +525,7 @@ def convert(kval): return kval return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", self._convertEntities, val)) - self.attrs = map(convert, self.attrs) + self.attrs = list(map(convert, self.attrs)) def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or @@ -534,7 +534,7 @@ def get(self, key, default=None): return self._getAttrMap().get(key, default) def has_key(self, key): - return self._getAttrMap().has_key(key) + return key in self._getAttrMap() def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, @@ -552,7 +552,7 @@ def __len__(self): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -578,14 +578,14 @@ def __delitem__(self, key): #We don't break because bad HTML can define the same #attribute multiple times. 
self._getAttrMap() - if self.attrMap.has_key(key): + if key in self.attrMap: del self.attrMap[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" - return apply(self.findAll, args, kwargs) + return self.findAll(*args, **kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) @@ -593,7 +593,7 @@ def __getattr__(self, tag): return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -823,13 +823,13 @@ def _getAttrMap(self): def recursiveChildGenerator(self): if not len(self.contents): raise StopIteration - stopNode = self._lastRecursiveChild().next + stopNode = self._lastRecursiveChild().__next__ current = self.contents[0] while current is not stopNode: if not current: break yield current - current = current.next + current = current.__next__ def childGenerator(self): if not len(self.contents): @@ -883,7 +883,7 @@ def searchTag(self, markupName=None, markupAttrs={}): else: match = True markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): + for attr, matchAgainst in list(self.attrs.items()): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs @@ -924,14 +924,14 @@ def search(self, markup): if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception("I don't know how to match against a %s" \ + % markup.__class__) return found def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False - if matchAgainst == True and type(matchAgainst) == 
types.BooleanType: + if matchAgainst == True and type(matchAgainst) == bool: result = markup != None elif callable(matchAgainst): result = matchAgainst(markup) @@ -941,7 +941,7 @@ def _matches(self, markup, matchAgainst): if isinstance(markup, Tag): markup = markup.name if markup is not None and not isString(markup): - markup = unicode(markup) + markup = str(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. @@ -950,10 +950,10 @@ def _matches(self, markup, matchAgainst): and (markup is not None or not isString(matchAgainst))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) + result = matchAgainst in markup elif matchAgainst and isString(markup): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) + if isinstance(markup, str): + matchAgainst = str(matchAgainst) else: matchAgainst = str(matchAgainst) @@ -974,13 +974,13 @@ def isList(l): """Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.""" return ((hasattr(l, '__iter__') and not isString(l)) - or (type(l) in (types.ListType, types.TupleType))) + or (type(l) in (list, tuple))) def isString(s): """Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.""" try: - return isinstance(s, unicode) or isinstance(s, basestring) + return isinstance(s, str) or isinstance(s, str) except NameError: return isinstance(s, str) @@ -992,7 +992,7 @@ def buildTagMap(default, *args): for portion in args: if hasattr(portion, 'items'): #It's a map. Merge it. - for k,v in portion.items(): + for k,v in list(portion.items()): built[k] = v elif isList(portion) and not isString(portion): #It's a list. Map each item to the default. 
@@ -1037,7 +1037,7 @@ def handle_pi(self, text): object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.""" if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + text = "xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): @@ -1047,7 +1047,7 @@ def handle_comment(self, text): "Handle character references as data." if self.soup.convertEntities: - data = unichr(int(ref)) + data = chr(int(ref)) else: data = '&#%s;' % ref self.handle_data(data) @@ -1059,7 +1059,7 @@ def handle_entityref(self, ref): data = None if self.soup.convertHTMLEntities: try: - data = unichr(name2codepoint[ref]) + data = chr(name2codepoint[ref]) except KeyError: pass @@ -1111,12 +1111,12 @@ def parse_declaration(self, i): j = k+3 self._toStringSubclass(data, CData) else: - try: - j = HTMLParser.parse_declaration(self, i) - except HTMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) + try: + j = HTMLParser.parse_declaration(self, i) + except Exception: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) return j @@ -1150,7 +1150,7 @@ class BeautifulStoneSoup(Tag): lambda x: '') ] - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' HTML_ENTITIES = "html" XML_ENTITIES = "xml" @@ -1239,14 +1239,14 @@ class has some tricks for dealing with some HTML that kills def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. 
markup = self.markup - if isinstance(markup, unicode): + if isinstance(markup, str): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) - markup = dammit.unicode + markup = dammit.str self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: @@ -1272,8 +1272,8 @@ def _feed(self, inDocumentEncoding=None, isHTML=False): def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) + return name in self.SELF_CLOSING_TAGS \ + or name in self.instanceSelfClosingTags def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) @@ -1308,7 +1308,7 @@ def pushTag(self, tag): def endData(self, containerClass=NavigableString): if self.currentData: - currentData = u''.join(self.currentData) + currentData = ''.join(self.currentData) if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and not set([tag.name for tag in self.tagStack]).intersection( self.PRESERVE_WHITESPACE_TAGS)): @@ -1371,7 +1371,7 @@ def _smartPop(self, name): nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True for i in range(len(self.tagStack)-1, 0, -1): @@ -1384,7 +1384,7 @@ def _smartPop(self, name): if (nestingResetTriggers != None and p.name in nestingResetTriggers) \ or (nestingResetTriggers == None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): + and p.name in self.RESET_NESTING_TAGS): #If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag @@ -1402,7 +1402,7 @@ def unknown_starttag(self, 
name, attrs, selfClosing=0): if self.quoteStack: #This is not a real tag. #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + attrs = ''.join([' %s="%s"' % (x_y[0], x_y[1]) for x_y in attrs]) self.handle_data('<%s%s>' % (name, attrs)) return self.endData() @@ -1496,7 +1496,7 @@ class BeautifulSoup(BeautifulStoneSoup): BeautifulStoneSoup before writing your own subclass.""" def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): + if 'smartQuotesTo' not in kwargs: kwargs['smartQuotesTo'] = self.HTML_ENTITIES kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) @@ -1680,7 +1680,7 @@ def popTag(self): parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and - not parent.attrMap.has_key(tag.name)): + tag.name not in parent.attrMap): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) @@ -1754,9 +1754,9 @@ def __init__(self, markup, overrideEncodings=[], self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] - if markup == '' or isinstance(markup, unicode): + if markup == '' or isinstance(markup, str): self.originalEncoding = None - self.unicode = unicode(markup) + self.str = str(markup) return u = None @@ -1769,7 +1769,7 @@ def __init__(self, markup, overrideEncodings=[], if u: break # If no luck and we have auto-detection library, try that: - if not u and chardet and not isinstance(self.markup, unicode): + if not u and chardet and not isinstance(self.markup, str): u = self._convertFrom(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: @@ -1778,7 +1778,7 @@ def __init__(self, markup, overrideEncodings=[], u = self._convertFrom(proposed_encoding) if u: break - self.unicode = u + self.str = u if not u: self.originalEncoding = None def _subMSChar(self, match): @@ -1786,7 +1786,7 @@ def _subMSChar(self, match): 
entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: + if type(sub) == tuple: if self.smartQuotesTo == 'xml': sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: @@ -1816,7 +1816,7 @@ def _convertFrom(self, proposed): u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed - except Exception, e: + except Exception as e: # print "That didn't work!" # print e return None @@ -1845,7 +1845,7 @@ def _toUnicode(self, data, encoding): elif data[:4] == '\xff\xfe\x00\x00': encoding = 'utf-32le' data = data[4:] - newdata = unicode(data, encoding) + newdata = str(data, encoding) return newdata def _detectEncoding(self, xml_data, isHTML=False): @@ -1858,41 +1858,41 @@ def _detectEncoding(self, xml_data, isHTML=False): elif xml_data[:4] == '\x00\x3c\x00\x3f': # UTF-16BE sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') + xml_data = str(xml_data, 'utf-16be').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x3f\x00': # UTF-16LE sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') + xml_data = str(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') + xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8') elif xml_data[:4] == '\x00\x00\x00\x3c': # UTF-32BE sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') + xml_data = str(xml_data, 'utf-32be').encode('utf-8') elif xml_data[:4] == '\x3c\x00\x00\x00': # UTF-32LE 
sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') + xml_data = str(xml_data, 'utf-32le').encode('utf-8') elif xml_data[:4] == '\x00\x00\xfe\xff': # UTF-32BE with BOM sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8') elif xml_data[:4] == '\xff\xfe\x00\x00': # UTF-32LE with BOM sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') + xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8') elif xml_data[:3] == '\xef\xbb\xbf': # UTF-8 with BOM sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') + xml_data = str(xml_data[3:], 'utf-8').encode('utf-8') else: sniffed_xml_encoding = 'ascii' pass @@ -1957,7 +1957,7 @@ def _ebcdic_to_ascii(self, s): 250,251,252,253,254,255) import string c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP) MS_CHARS = { '\x80' : ('euro', '20AC'), @@ -2000,4 +2000,4 @@ def _ebcdic_to_ascii(self, s): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print((soup.prettify())) diff --git a/infoslicer/processing/HTML_Parser.py b/infoslicer/processing/HTML_Parser.py index adb6eb0..a1144c9 100644 --- a/infoslicer/processing/HTML_Parser.py +++ b/infoslicer/processing/HTML_Parser.py @@ -1,7 +1,7 @@ # Copyright (C) IBM Corporation 2008 -from BeautifulSoup import BeautifulSoup, Tag -from NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup +from .BeautifulSoup import BeautifulSoup, Tag +from .NewtifulSoup import NewtifulStoneSoup as BeautifulStoneSoup import re from datetime import date @@ -78,9 +78,9 @@ def image_handler(self): too_small = False image_path = img['src'] alt_text = "" - if img.has_key("width") and 
img.has_key("height") and int(img['width']) <= 70 and int(img['height']) <= 70: + if "width" in img and "height" in img and int(img['width']) <= 70 and int(img['height']) <= 70: too_small = True - if img.has_key("alt") and img['alt'] != "": + if "alt" in img and img['alt'] != "": alt_text = img['alt'] else: alt_text = image_path.split("/")[-1] @@ -213,7 +213,7 @@ def tag_generator(self, tag, contents=None, attrs=[]): @param attrs: Optional, attributes to add to tag @return: new Tag object """ - if self.ids.has_key(tag) and attrs == []: + if tag in self.ids and attrs == []: self.ids[tag] += 1 attrs = [("id", str(self.ids[tag]))] if attrs != []: @@ -232,7 +232,7 @@ def unTag(self, tag): """ for child in tag.findChildren(True, recursive=False): self.unTag(child) - if (self.remove_classes_regexp != "") and (tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) != None)): + if (self.remove_classes_regexp != "") and ("class" in tag and (re.match(self.remove_classes_regexp, tag["class"]) != None)): tag.extract() elif tag.name in self.keep_tags: new_tag = Tag(self.input, tag.name) diff --git a/infoslicer/processing/HTML_strip.py b/infoslicer/processing/HTML_strip.py index cdd5108..05220d4 100644 --- a/infoslicer/processing/HTML_strip.py +++ b/infoslicer/processing/HTML_strip.py @@ -14,7 +14,7 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -from HTMLParser import HTMLParser +from html.parser import HTMLParser from re import sub from infoslicer.processing.Article_Data import Sentence_Data, \ Paragraph_Data, \ diff --git a/infoslicer/processing/MediaWiki_Helper.py b/infoslicer/processing/MediaWiki_Helper.py index 988e418..b2caae3 100644 --- a/infoslicer/processing/MediaWiki_Helper.py +++ b/infoslicer/processing/MediaWiki_Helper.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright (C) IBM Corporation 2008 -import urllib +import urllib.request, urllib.parse, 
urllib.error from xml.dom import minidom import logging @@ -14,7 +14,7 @@ """ Extend urllib class to spoof user-agent """ -class NewURLopener(urllib.FancyURLopener): +class NewURLopener(urllib.request.FancyURLopener): version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11" class PageNotFoundError(Exception): @@ -61,7 +61,7 @@ def resolveTitle(self, title, wiki=defaultWiki): #check page exists, return None if it doesn't page = xmldoc.getElementsByTagName("page") if (page != []): - if ("missing" in page[0].attributes.keys()): + if ("missing" in list(page[0].attributes.keys())): raise PageNotFoundError("The article with title '%s' could not be found on wiki '%s'" % (title, wiki)) #check if there are any redirection tags defined redirectList = xmldoc.getElementsByTagName("r") @@ -107,12 +107,12 @@ def getDoc(self, path): @param path: location of remote file @return: page contents @rtype: string""" - urllib._urlopener = NewURLopener() + urllib.request._urlopener = NewURLopener() logger.debug("opening " + path) logger.debug("proxies: " + str(self.proxies)) pathencoded = self.urlEncodeNonAscii(path) logger.debug("pathencoded " + pathencoded) - doc = urllib.urlopen(pathencoded, proxies=self.proxies) + doc = NewURLopener(proxies=self.proxies).open(pathencoded) output = doc.read() doc.close() logger.debug("url opened successfully") @@ -151,7 +151,7 @@ def getImageURLs(self, title, wiki=defaultWiki, revision=None): xmldoc = minidom.parseString(self.getDoc(path)) imglist = xmldoc.getElementsByTagName("im") outputlist = [] - for i in xrange(len(imglist)): + for i in range(len(imglist)): #create the API request string path = "http://%s/w/api.php?action=query&titles=%s&prop=imageinfo&iiprop=url&format=xml" % (wiki, imglist[i].attributes["title"].value.replace(" ","_")) xmldoc2 = minidom.parseString(self.getDoc(path)) diff --git a/infoslicer/processing/MediaWiki_Parser.py b/infoslicer/processing/MediaWiki_Parser.py index 
1596c57..c6a29c1 100644 --- a/infoslicer/processing/MediaWiki_Parser.py +++ b/infoslicer/processing/MediaWiki_Parser.py @@ -1,6 +1,6 @@ # Copyright (C) IBM Corporation 2008 -from HTML_Parser import HTML_Parser, NoDocException +from .HTML_Parser import HTML_Parser, NoDocException import re import logging @@ -36,7 +36,7 @@ def specialise(self): #infobox should be first table first_table = self.input.find("table") #the word "infobox" should be in the class name somewhere - if (first_table != None and first_table.has_key("class") and (re.match(re.compile("infobox"), first_table["class"]) != None)): + if (first_table != None and "class" in first_table and (re.match(re.compile("infobox"), first_table["class"]) != None)): #make a new output tag to work with infobox_tag = self.tag_generator("section", attrs=[("id", "infobox")]) #sometimes infobox data is in an inner table diff --git a/infoslicer/processing/NewtifulSoup.py b/infoslicer/processing/NewtifulSoup.py index 4e26a12..74e89be 100644 --- a/infoslicer/processing/NewtifulSoup.py +++ b/infoslicer/processing/NewtifulSoup.py @@ -1,6 +1,6 @@ # Copyright (C) IBM Corporation 2008 -from BeautifulSoup import BeautifulStoneSoup +from .BeautifulSoup import BeautifulStoneSoup #Extend beautiful soup HTML parsing library #to recognise new self-closing tag diff --git a/infoslicer/processing/Paragraph.py b/infoslicer/processing/Paragraph.py index 563fd16..b59983e 100644 --- a/infoslicer/processing/Paragraph.py +++ b/infoslicer/processing/Paragraph.py @@ -1,6 +1,6 @@ # Copyright (C) IBM Corporation 2008 -from Sentence import * +from .Sentence import * import logging logger = logging.getLogger('infoslicer') diff --git a/infoslicer/processing/Section.py b/infoslicer/processing/Section.py index bc5f847..27a4f3e 100644 --- a/infoslicer/processing/Section.py +++ b/infoslicer/processing/Section.py @@ -1,6 +1,6 @@ # Copyright (C) IBM Corporation 2008 -from Paragraph import * +from .Paragraph import * import logging logger = 
logging.getLogger('infoslicer') diff --git a/infoslicer/processing/Sentence.py b/infoslicer/processing/Sentence.py index 9659dbb..805f66d 100644 --- a/infoslicer/processing/Sentence.py +++ b/infoslicer/processing/Sentence.py @@ -7,7 +7,7 @@ from gi.repository import GdkPixbuf import logging -from Article_Data import * +from .Article_Data import * """ Created by Jonathan Mace diff --git a/infoslicer/widgets/Edit_Pane.py b/infoslicer/widgets/Edit_Pane.py index d7ab056..ff26ae0 100644 --- a/infoslicer/widgets/Edit_Pane.py +++ b/infoslicer/widgets/Edit_Pane.py @@ -9,8 +9,8 @@ from sugar3.graphics.toolcombobox import ToolComboBox -from Reading_View import Reading_View -from Editing_View import Editing_View +from .Reading_View import Reading_View +from .Editing_View import Editing_View from infoslicer.processing.Article import Article logger = logging.getLogger('infoslicer') diff --git a/infoslicer/widgets/Editable_Textbox.py b/infoslicer/widgets/Editable_Textbox.py index b1da2d2..6d6d1a0 100644 --- a/infoslicer/widgets/Editable_Textbox.py +++ b/infoslicer/widgets/Editable_Textbox.py @@ -5,13 +5,13 @@ from gi.repository import Gdk from gi.repository import GObject from gi.repository import Pango -import cPickle +import pickle import copy -from Textbox import Textbox +from .Textbox import Textbox import logging -SNAP_SENTENCE, SNAP_PARAGRAPH, SNAP_SECTION, SNAP_NONE = range(4) +SNAP_SENTENCE, SNAP_PARAGRAPH, SNAP_SECTION, SNAP_NONE = list(range(4)) class Editable_Textbox( Textbox ): """ @@ -266,7 +266,7 @@ def drag_data_received_event(self, widget, context, x, y, selection_data, info, a = self.article insert_loc = self.get_mouse_iter(x, y) data_received_type = str(selection_data.get_data_type()) - data = cPickle.loads(str(selection_data.get_data())) + data = pickle.loads(selection_data.get_data()) if data_received_type == "sentence": bestpoint = insert_loc @@ -293,7 +293,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, time, data) if 
self.snapto == SNAP_SECTION: atom = Gdk.atom_intern("section", only_if_exists=False) - string = cPickle.dumps(a.getSelection()) + string = pickle.dumps(a.getSelection()) selection_data.set(atom, 8, string) self.stop_emission("drag-data-get") diff --git a/infoslicer/widgets/Editing_View.py b/infoslicer/widgets/Editing_View.py index 3f9ecdc..32f7b12 100644 --- a/infoslicer/widgets/Editing_View.py +++ b/infoslicer/widgets/Editing_View.py @@ -4,7 +4,7 @@ from gi.repository import Gtk from gi.repository import Gdk from gi.repository import GObject -from Editable_Textbox import Editable_Textbox +from .Editable_Textbox import Editable_Textbox class Editing_View( Gtk.VBox ): """ diff --git a/infoslicer/widgets/Format_Pane.py b/infoslicer/widgets/Format_Pane.py index 1be64f2..d45d0c1 100644 --- a/infoslicer/widgets/Format_Pane.py +++ b/infoslicer/widgets/Format_Pane.py @@ -4,7 +4,7 @@ from gi.repository import Gtk from gettext import gettext as _ -from Editing_View import Editing_View +from .Editing_View import Editing_View class Format_Pane(Editing_View): """ diff --git a/infoslicer/widgets/Gallery_View.py b/infoslicer/widgets/Gallery_View.py index 7cd3ce8..0d17480 100644 --- a/infoslicer/widgets/Gallery_View.py +++ b/infoslicer/widgets/Gallery_View.py @@ -4,10 +4,10 @@ from gi.repository import GObject from gi.repository import GdkPixbuf import os -import cPickle +import pickle import logging -from Editable_Textbox import Editable_Textbox +from .Editable_Textbox import Editable_Textbox from infoslicer.processing.Article_Data import * from infoslicer.processing.Article import Article import book @@ -162,7 +162,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, timestamp, paragraph1data = Paragraph_Data(0, self.source_article_id, 0, 0, [imagedata]) paragraph2data = Paragraph_Data(0, self.source_article_id, 0, 0, [captiondata]) sectionsdata = [Section_Data(0, self.source_article_id, 0, [paragraph1data, paragraph2data])] - string = 
cPickle.dumps(sectionsdata) + string = pickle.dumps(sectionsdata) selection_data.set(atom, 8, string) def _validate_image_list(root, image_list): @@ -171,7 +171,7 @@ def _validate_image_list(root, image_list): @param image_list: list of images to validate @return: list of images with corrected paths, and broken images removed """ - for i in xrange(len(image_list)): + for i in range(len(image_list)): if not os.access(image_list[i][0], os.F_OK): if os.access(os.path.join(root, image_list[i][0]), os.F_OK): image_list[i] = (os.path.join(root, image_list[i][0]), diff --git a/infoslicer/widgets/Image_Pane.py b/infoslicer/widgets/Image_Pane.py index 473253c..356de50 100644 --- a/infoslicer/widgets/Image_Pane.py +++ b/infoslicer/widgets/Image_Pane.py @@ -7,8 +7,8 @@ import logging from gettext import gettext as _ -from Editing_View import Editing_View -from Gallery_View import Gallery_View +from .Editing_View import Editing_View +from .Gallery_View import Gallery_View from infoslicer.processing.Article import Article logger = logging.getLogger('infoslicer') diff --git a/infoslicer/widgets/Journal_Gallery_View.py b/infoslicer/widgets/Journal_Gallery_View.py index 5358d05..6ed7e22 100644 --- a/infoslicer/widgets/Journal_Gallery_View.py +++ b/infoslicer/widgets/Journal_Gallery_View.py @@ -15,11 +15,11 @@ from gi.repository import GObject from gi.repository import GdkPixbuf import os -import cPickle +import pickle import pickle import logging -from Editable_Textbox import Editable_Textbox +from .Editable_Textbox import Editable_Textbox from infoslicer.processing.Article_Data import * from infoslicer.processing.Article import Article import book @@ -158,7 +158,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, timestamp, paragraph1data = Paragraph_Data(0, self.source_article_id, 0, 0, [imagedata]) paragraph2data = Paragraph_Data(0, self.source_article_id, 0, 0, [captiondata]) sectionsdata = [Section_Data(0, self.source_article_id, 0, [paragraph1data, 
paragraph2data])] - string = cPickle.dumps(sectionsdata) + string = pickle.dumps(sectionsdata) selection_data.set(atom, 8, string) def add_image(self, image_path, title): diff --git a/infoslicer/widgets/Journal_Image_Pane.py b/infoslicer/widgets/Journal_Image_Pane.py index 38c1c23..6ae8c5b 100644 --- a/infoslicer/widgets/Journal_Image_Pane.py +++ b/infoslicer/widgets/Journal_Image_Pane.py @@ -7,7 +7,7 @@ import logging from gettext import gettext as _ -from Editing_View import Editing_View +from .Editing_View import Editing_View from infoslicer.widgets.Journal_Gallery_View import Journal_Gallery_View from infoslicer.processing.Article import Article diff --git a/infoslicer/widgets/Reading_View.py b/infoslicer/widgets/Reading_View.py index 3c40757..bc67cf0 100644 --- a/infoslicer/widgets/Reading_View.py +++ b/infoslicer/widgets/Reading_View.py @@ -3,7 +3,7 @@ gi.require_version('Gtk', '3.0') from gi.repository import Gtk from gi.repository import GObject -from Readonly_Textbox import Readonly_Textbox +from .Readonly_Textbox import Readonly_Textbox import logging logger = logging.getLogger('infoslicer') diff --git a/infoslicer/widgets/Readonly_Textbox.py b/infoslicer/widgets/Readonly_Textbox.py index 3d8e40f..745458a 100644 --- a/infoslicer/widgets/Readonly_Textbox.py +++ b/infoslicer/widgets/Readonly_Textbox.py @@ -2,11 +2,11 @@ from gi.repository import Gtk from gi.repository import Gdk from gi.repository import Pango -import cPickle +import pickle import logging -from Textbox import Textbox +from .Textbox import Textbox -SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4) +SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = list(range(4)) class Readonly_Textbox( Textbox ): """ @@ -172,7 +172,7 @@ def drag_data_get_event(self, widget, context, selection_data, info, time, data) if self.selectionmode == SELECT_SECTION: atom = Gdk.atom_intern("section", only_if_exists=False) - string = cPickle.dumps(a.getSelection()) + string = 
pickle.dumps(a.getSelection()) selection_data.set(atom, 8, string) self.stop_emission("drag-data-get") self.set_editable(False) diff --git a/infoslicer/widgets/Textbox.py b/infoslicer/widgets/Textbox.py index 079c775..0d71a5f 100644 --- a/infoslicer/widgets/Textbox.py +++ b/infoslicer/widgets/Textbox.py @@ -4,9 +4,9 @@ from gi.repository import Gtk from gi.repository import GObject from gi.repository import Pango -import cPickle +import pickle -SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = range(4) +SELECT_SENTENCE, SELECT_PARAGRAPH, SELECT_SECTION, FULL_EDIT = list(range(4)) class Textbox( Gtk.TextView ): """ diff --git a/net.py b/net.py index b74fcdf..fbbfe63 100644 --- a/net.py +++ b/net.py @@ -16,7 +16,7 @@ import os import shutil -import urllib +import urllib.request, urllib.parse, urllib.error import logging from gettext import gettext as _ @@ -48,11 +48,11 @@ def download_wiki_article(title, wiki, progress): progress.set_label(_('"%s" successfully downloaded') % title) - except PageNotFoundError, e: + except PageNotFoundError as e: elogger.debug('download_and_add: %s' % e) progress.set_label(_('"%s" could not be found') % title) - except Exception, e: + except Exception as e: elogger.debug('download_and_add: %s' % e) progress.set_label(_('Error downloading "%s"; check your connection') % title) @@ -70,7 +70,7 @@ def image_handler(root, uid, document): logger.debug('image_handler: %s' % dir_path) if not os.path.exists(dir_path): - os.makedirs(dir_path, 0777) + os.makedirs(dir_path, 0o777) for image in document.findAll("image"): fail = False @@ -83,7 +83,7 @@ def image_handler(root, uid, document): else: image_title = path.rsplit("/", 1)[-1] # attempt to fix incomplete paths - if (not path.startswith("http://")) and document.source != None and document.source.has_key("href"): + if (not path.startswith("http://")) and document.source != None and "href" in document.source: if path.startswith("//upload"): path = 'http:' + path elif 
path.startswith("/"): @@ -111,19 +111,19 @@ def _open_url(url): """ retrieves content from specified url """ - urllib._urlopener = _new_url_opener() + urllib.request._urlopener = _new_url_opener() try: logger.debug("opening " + url) logger.debug("proxies: " + str(proxies)) - doc = urllib.urlopen(url, proxies=proxies) + doc = _new_url_opener(proxies=proxies).open(url) output = doc.read() doc.close() logger.debug("url opened succesfully") return output - except IOError, e: + except IOError as e: elogger.debug('_open_url: %s' % e) -class _new_url_opener(urllib.FancyURLopener): +class _new_url_opener(urllib.request.FancyURLopener): version = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.1b2)" \ "Gecko/20081218 Gentoo Iceweasel/3.1b2"