Skip to content

Commit

Permalink
Merge pull request #158 from scrapinghub/handling-errbacks
Browse files Browse the repository at this point in the history
Handling errbacks
  • Loading branch information
pawelmhm authored Feb 14, 2024
2 parents 652b835 + 3b5a394 commit f496cd3
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 16 deletions.
22 changes: 20 additions & 2 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,20 @@ callback
- optional

Must exist as method of scheduled spider, does not need to contain string "self".
If not passed or not found on spider default callback `parse`_ will be used.
If not passed, the default Scrapy callback `parse`_ will be used. If there is no spider method
with the name specified by the callback argument, or the callback is not callable, the API will return a 400 HTTP error.

Example request with callback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&callback=parse_page``

errback
- type: string
- optional

Scrapy errback for request made from spider. It must exist as method of
scheduled spider, otherwise exception will be raised. String does not need to contain 'self'.
scheduled spider, otherwise the API will return a 400 HTTP error. The string does not need to contain 'self'.
Defaults to ``None``; the default can be adjusted with the `DEFAULT_ERRBACK_NAME`_ setting.

Example request with errback: ``/crawl.json?url=https://quotes.toscrape.com/&spider_name=toscrape-css&errback=my_errback``

max_requests
- type: integer
Expand Down Expand Up @@ -517,6 +523,18 @@ Encoding that's used to encode log messages.

Default: ``utf-8``.

DEFAULT_ERRBACK_NAME
~~~~~~~~~~~~~~~~~~~~

Default: ``None``

String with the name of the default errback_.

Use this setting to set a default errback for Scrapy spider requests made from ScrapyRT.
The errback must exist as a method of the spider and must be callable; otherwise a 400 HTTP error will be returned.

.. _errback: https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing


Spider settings
---------------
Expand Down
4 changes: 3 additions & 1 deletion scrapyrt/conf/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,6 @@
# disable in production
DEBUG = True

TWISTED_REACTOR = None
TWISTED_REACTOR = None

DEFAULT_ERRBACK_NAME = None
32 changes: 25 additions & 7 deletions scrapyrt/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from copy import deepcopy
import datetime
import os
import traceback

from scrapy import signals
from scrapy.crawler import CrawlerRunner, Crawler
Expand Down Expand Up @@ -109,6 +110,7 @@ def __init__(self, spider_name, request_kwargs,
self.items = []
self.items_dropped = []
self.errors = []
self.user_error = None
self.max_requests = int(max_requests) if max_requests else None
self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
self.request_count = 0
Expand All @@ -120,7 +122,7 @@ def __init__(self, spider_name, request_kwargs,
# because we need to know if spider has method available
self.callback_name = request_kwargs.pop('callback', None) or 'parse'
# do the same for errback
self.errback_name = request_kwargs.pop('errback', None) or 'parse'
self.errback_name = request_kwargs.pop('errback', None) or app_settings.DEFAULT_ERRBACK_NAME

if request_kwargs.get("url"):
self.request = self.create_spider_request(deepcopy(request_kwargs))
Expand Down Expand Up @@ -171,17 +173,30 @@ def spider_idle(self, spider):
"""
if spider is self.crawler.spider and self.request and not self._request_scheduled:
callback = getattr(self.crawler.spider, self.callback_name)
assert callable(callback), 'Invalid callback'
self.request = self.request.replace(callback=callback)
try:
callback = getattr(self.crawler.spider, self.callback_name)
assert callable(callback), 'Invalid callback'
self.request = self.request.replace(callback=callback)
except (AssertionError, AttributeError):
msg = f"Invalid spider callback {self.callback_name}, callback not callable or not a method of a spider {self.spider_name}"
self.user_error = Error(400, message=msg)
try:
if self.errback_name:
errback = getattr(self.crawler.spider, self.errback_name)
assert callable(errback), 'Invalid errback'
self.request = self.request.replace(errback=errback)
except (AssertionError, AttributeError):
msg = f"Invalid spider errback {self.errback_name}, errback not callable or not a method of a spider {self.spider_name}"
self.user_error = Error(400, message=msg)
if self.user_error:
log.msg(self.user_error.message, level=log.ERROR)
return

errback = getattr(self.crawler.spider, self.errback_name)
assert callable(errback), 'Invalid errback'
self.request = self.request.replace(errback=errback)
modify_request = getattr(
self.crawler.spider, "modify_realtime_request", None)
if callable(modify_request):
self.request = modify_request(self.request)

spider.crawler.engine.crawl(self.request)
self._request_scheduled = True
raise DontCloseSpider
Expand Down Expand Up @@ -238,6 +253,9 @@ def return_items(self, result):
"stats": stats,
"spider_name": self.spider_name,
}

results["user_error"] = self.user_error

if self.debug:
results["errors"] = self.errors
return results
Expand Down
3 changes: 3 additions & 0 deletions scrapyrt/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,9 @@ def run_crawl(self, spider_name, scrapy_request_args,

def prepare_response(self, result, *args, **kwargs):
items = result.get("items")
user_error = result.get("user_error", None)
if user_error:
raise user_error
response = {
"status": "ok",
"items": items,
Expand Down
17 changes: 11 additions & 6 deletions tests/test_crawl_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ def test_spider_opened(self):

def test_raise_error_if_not_callable(self):
self.spider.parse_something = None
self.assertRaises(
AssertionError, self.crawl_manager.spider_idle, self.spider)
self._call_spider_idle()
self.assertIsNotNone(self.crawl_manager.user_error)
msg = "Invalid spider callback parse_something"
assert re.search(msg, self.crawl_manager.user_error.message)
self.assertFalse(self.crawler.engine.crawl.called)

def test_modify_realtime_request(self):
Expand Down Expand Up @@ -142,15 +144,17 @@ def test_pass_wrong_spider_errback(self):
mng = self.create_crawl_manager(
{'url': 'http://localhost', 'errback': 'handle_error'}
)

try:
with pytest.raises(AttributeError) as err:
mng.spider_idle(self.spider)
mng.spider_idle(self.spider)
except DontCloseSpider:
pass

assert mng.request.errback is None
msg = "has no attribute 'handle_error'"
assert re.search(msg, str(err))

self.assertIsNotNone(mng.user_error)
msg = "Invalid spider errback"
assert re.search(msg, mng.user_error.message)

def test_pass_good_spider_errback(self):
mng = self.create_crawl_manager(
Expand Down Expand Up @@ -330,6 +334,7 @@ def setUp(self):
'items_dropped': self.crawl_manager.items_dropped,
'stats': self.stats.copy(),
'spider_name': self.spider.name,
'user_error': None,
}

def test_return_items(self):
Expand Down
11 changes: 11 additions & 0 deletions tests/test_resource_crawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,17 @@ def test_prepare_response(self, resource):
for key, value in expected:
assert prepared_res[key] == value

def test_prepare_response_user_error_raised(self, resource):
    """prepare_response must re-raise a stored ``user_error`` instead of
    building a normal response body for the crawl result."""
    result = {
        'items': [1, 2],
        'stats': [99],
        'spider_name': 'test'
    }
    result['user_error'] = Exception("my exception")
    with pytest.raises(Exception) as e_info:
        resource.prepare_response(result)
    # Bug fix: pytest's ExceptionInfo has no ``.message`` attribute — the
    # original assertion raised AttributeError instead of verifying the
    # exception. The raised exception object is exposed as ``e_info.value``.
    assert str(e_info.value) == "my exception"


class TestCrawlResourceGetRequiredArgument(unittest.TestCase):

Expand Down

0 comments on commit f496cd3

Please sign in to comment.