Skip to content

Commit

Permalink
Add content method to Context for proper content decoding in Python 3.
Browse files Browse the repository at this point in the history
…Fixes #10.

Context deals with bytes, but the regular expressions in Crawler really deal
with strings, which makes Python 3 sad.  Adds Context.content to return
decoded pages based upon their reported content encoding.
  • Loading branch information
wickman committed Nov 22, 2014
1 parent d4f501a commit 7abe3ad
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 33 deletions.
4 changes: 4 additions & 0 deletions pex/bin/pex.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,7 @@ def main():
log('Running PEX file at %s with args %s' % (pex_builder.path(), args), v=options.verbosity)
pex = PEX(pex_builder.path(), interpreter=pex_builder.interpreter)
return pex.run(args=list(args))


if __name__ == '__main__':
main()
12 changes: 7 additions & 5 deletions pex/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import re
import threading
import traceback

from .compatibility import PY3
from .http import Context
Expand Down Expand Up @@ -61,8 +62,7 @@ class Crawler(object):
def crawl_local(cls, link):
try:
dirents = os.listdir(link.path)
# except OSError as e:
except Exception as e:
except OSError as e:
TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
return set(), set()
files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
Expand All @@ -71,9 +71,8 @@ def crawl_local(cls, link):
@classmethod
def crawl_remote(cls, context, link):
try:
content = context.read(link)
# except context.Error as e:
except Exception as e:
content = context.content(link)
except context.Error as e:
TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
return set(), set()
links = set(link.join(href) for href in PageParser.links(content))
Expand Down Expand Up @@ -111,6 +110,9 @@ def execute():
roots, rels = self.crawl_link(self.context, link)
except Exception as e:
TRACER.log('Unknown exception encountered: %s' % e)
for line in traceback.format_exc().splitlines():
TRACER.log(line)
queue.task_done()
continue
links.update(roots)
if follow_links:
Expand Down
26 changes: 26 additions & 0 deletions pex/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import uuid
from abc import abstractmethod
from email import message_from_string

from .common import safe_mkdtemp, safe_open
from .compatibility import AbstractClass, PY3
Expand Down Expand Up @@ -40,6 +41,8 @@ class Context(AbstractClass):
specialized by individual implementations.
"""

DEFAULT_ENCODING = 'iso-8859-1'

class Error(Exception):
"""Error base class for Contexts to wrap application-specific exceptions."""
pass
Expand Down Expand Up @@ -75,6 +78,12 @@ def read(self, link):
with contextlib.closing(self.open(link)) as fp:
return fp.read()

def content(self, link):
"""Return the decoded content associated with the link.

This base implementation has no body and therefore returns ``None``; the
concrete contexts in this module override it to read the response bytes and
decode them, falling back to ``DEFAULT_ENCODING`` when no charset is reported.

:param link: The :class:`Link` to read.
"""

def fetch(self, link, into=None):
"""Fetch the binary content associated with the link and write to a file.
Expand Down Expand Up @@ -104,6 +113,14 @@ class UrllibContext(Context):
def open(self, link):
    """Open ``link.url`` with ``urllib_request.urlopen`` and return the result.

    :param link: The :class:`Link` whose ``url`` should be opened.
    """
    target_url = link.url
    response = urllib_request.urlopen(target_url)
    return response

def content(self, link):
    """Return the decoded text content of a remote link.

    The charset is taken from the response's Content-Type header.  When the
    header names no charset, or names one Python has no codec for,
    ``DEFAULT_ENCODING`` is used instead.  Bytes that cannot be decoded are
    replaced rather than raising.

    :param link: The :class:`Link` to read; must not be local.
    :raises Error: If ``link`` is local.
    """
    if link.local:
        raise self.Error('Context.content only works with remote URLs.')

    with contextlib.closing(self.open(link)) as fp:
        headers = message_from_string(str(fp.headers))
        encoding = headers.get_content_charset(self.DEFAULT_ENCODING)
        data = fp.read()

    # 'replace' is passed positionally: decode() only accepts keyword
    # arguments from Python 2.7 onwards.
    try:
        return data.decode(encoding, 'replace')
    except LookupError:
        # The server advertised a charset with no matching codec.
        return data.decode(self.DEFAULT_ENCODING, 'replace')


Context.register(UrllibContext)

Expand All @@ -124,6 +141,7 @@ def detect_algorithm(cls, link):

def __init__(self, request, link, chunk_size=16384):
self._iterator = request.iter_content(chunk_size)
self.encoding = request.encoding
self._bytes = b''
self._link = link
self._hasher, self._hash_value = self.detect_algorithm(link)
Expand Down Expand Up @@ -199,6 +217,14 @@ def open(self, link):
link,
'Exceeded max retries of %d' % self._max_retries))

def content(self, link):
    """Return the decoded text content of a remote link.

    The bytes are decoded with the encoding reported by the opened stream.
    When the stream reports none, or reports one Python has no codec for,
    ``DEFAULT_ENCODING`` is used instead.  Bytes that cannot be decoded are
    replaced rather than raising.

    :param link: The :class:`Link` to read; must not be local.
    :raises Error: If ``link`` is local.
    """
    if link.local:
        raise self.Error('Context.content only works with remote URLs.')

    with contextlib.closing(self.open(link)) as request:
        encoding = request.encoding or self.DEFAULT_ENCODING
        data = request.read()

    # 'replace' is passed positionally: decode() only accepts keyword
    # arguments from Python 2.7 onwards.
    try:
        return data.decode(encoding, 'replace')
    except LookupError:
        # The response advertised a charset with no matching codec.
        return data.decode(self.DEFAULT_ENCODING, 'replace')


if requests:
Context.register(RequestsContext)

Expand Down
7 changes: 7 additions & 0 deletions scripts/coverage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
#
# Run the unit tests and a few pex command-line invocations under coverage.
# Every run passes -p (parallel mode) so each one writes its own data file.

# Stop at the first failing command; without this the script's exit status is
# that of the last command only, so a failing test run would go unnoticed.
set -e

coverage run -p -m py.test tests
# Only the exit status of --help matters here, so its output is discarded.
coverage run -p -m pex.bin.pex --help >&/dev/null
coverage run -p -m pex.bin.pex scripts/do_nothing.py
coverage run -p -m pex.bin.pex -r requests scripts/do_nothing.py
coverage run -p -m pex.bin.pex -r setuptools -s . scripts/do_nothing.py
Empty file added scripts/do_nothing.py
Empty file.
63 changes: 35 additions & 28 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ envlist =
py{py,27,34}-requests,style,isort-check

[testenv]
commands = py.test {posargs:}
commands =
py.test {posargs:}
deps =
pytest
twitter.common.contextutil>=0.3.1,<0.4.0
Expand All @@ -22,38 +23,25 @@ deps =
cachecontrol: lockfile
coverage: coverage

# Would love if you didn't have to enumerate environments here :-\
[testenv:py26]
[testenv:py26-requests]
[testenv:py26-requests-cachecontrol]
[testenv:py27]
[testenv:py27-requests]
[testenv:py27-requests-cachecontrol]
[testenv:py33]
[testenv:py33-requests]
[testenv:py33-requests-cachecontrol]
[testenv:py34]
[testenv:py34-requests]
[testenv:py34-requests-cachecontrol]
[testenv:pypy]
[testenv:pypy-requests]
[testenv:pypy-requests-cachecontrol]

# environments where we measure coverage
[coverage-commands]
[integration]
commands =
# TODO(wickman) Add 'pex' tool integration tests better than just --help.
coverage run -p -m py.test tests
coverage run -p -m pex.bin.pex -- --help
# This is necessary due to https://bitbucket.org/hpk42/tox/issue/175/cant-do-substitution-base-commands
bash scripts/coverage.sh

[testenv:py27-coverage]
commands = {[integration]commands}

[testenv:py27-requests-cachecontrol-coverage]
commands = {[coverage-commands]commands}
commands = {[integration]commands}

[testenv:py34-requests-cachecontrol-coverage]
commands = {[coverage-commands]commands}
commands = {[integration]commands}

[testenv:py34-coverage]
commands = {[integration]commands}

[testenv:pypy-requests-cachecontrol-coverage]
commands = {[coverage-commands]commands}
commands = {[integration]commands}

[testenv:coverage]
basepython = python2.7
Expand All @@ -62,8 +50,10 @@ deps =
tox
commands =
# meta
tox -e py27-coverage
tox -e py27-requests-cachecontrol-coverage
tox -e py34-requests-cachecontrol-coverage
tox -e py34-coverage
tox -e pypy-requests-cachecontrol-coverage
python scripts/combine_coverage.py
coverage report
Expand All @@ -80,12 +70,12 @@ commands =
[testenv:isort-run]
basepython = python2.7
deps = isort
commands = isort -ns __init__.py -rc -c {toxinidir}/pex {toxinidir}/tests
commands = isort -ns __init__.py -rc {toxinidir}/pex {toxinidir}/tests

[testenv:isort-check]
basepython = python2.7
deps = isort
commands = isort -ns __init__.py -rc {toxinidir}/pex {toxinidir}/tests
commands = isort -ns __init__.py -rc -c {toxinidir}/pex {toxinidir}/tests

[testenv:postreview]
basepython = python2.7
Expand Down Expand Up @@ -113,3 +103,20 @@ commands = pex -s . -r setuptools -r wheel -o dist/pex -e pex.bin.pex:main -v

[testenv:py34-package]
commands = pex -s . -r setuptools -r wheel -o dist/pex -e pex.bin.pex:main -v

# Would love if you didn't have to enumerate environments here :-\
[testenv:py26]
[testenv:py26-requests]
[testenv:py26-requests-cachecontrol]
[testenv:py27]
[testenv:py27-requests]
[testenv:py27-requests-cachecontrol]
[testenv:py33]
[testenv:py33-requests]
[testenv:py33-requests-cachecontrol]
[testenv:py34]
[testenv:py34-requests]
[testenv:py34-requests-cachecontrol]
[testenv:pypy]
[testenv:pypy-requests]
[testenv:pypy-requests-cachecontrol]

0 comments on commit 7abe3ad

Please sign in to comment.