From a18ad4f8adb731b32441a71f6c92d535c84e105c Mon Sep 17 00:00:00 2001
From: nieweiming <1048594443@qq.com>
Date: Mon, 26 Apr 2021 15:53:01 +0800
Subject: [PATCH] Added maximum idle waiting time MAX_IDLE_TIME_BEFORE_CLOSE.

---
Review notes (ignored by `git am`):
- The idle clock is no longer initialised at import time; it starts at the
  first idle tick that finds the start-URL queue empty and is cleared as
  soon as start URLs are queued again.
- Elapsed time is measured in seconds with time.time(), matching the unit
  of MAX_IDLE_TIME_BEFORE_CLOSE, instead of nanoseconds via time.time_ns().
- Only a positive MAX_IDLE_TIME_BEFORE_CLOSE enables the limit.
- Context whitespace was reconstructed; if the blank line after
  `server = None` carries trailing spaces in your tree, apply with
  `git apply --ignore-whitespace`.

 src/scrapy_redis/spiders.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/scrapy_redis/spiders.py b/src/scrapy_redis/spiders.py
index e739570f..fe18eabd 100644
--- a/src/scrapy_redis/spiders.py
+++ b/src/scrapy_redis/spiders.py
@@ -2,6 +2,7 @@
 from scrapy.exceptions import DontCloseSpider
 from scrapy.spiders import Spider, CrawlSpider
 from collections import Iterable
+import time
 
 from . import connection, defaults
 
@@ -16,6 +17,9 @@ class RedisMixin(object):
 
     # Redis client placeholder.
     server = None
+
+    # Time at which the current idle period began; None while not idle.
+    spider_idle_start_time = None
 
     def start_requests(self):
         """Returns a batch of start requests from redis."""
@@ -140,10 +144,35 @@ def schedule_next_requests(self):
             self.crawler.engine.crawl(req, spider=self)
 
     def spider_idle(self):
-        """Schedules a request if available, otherwise waits."""
-        # XXX: Handle a sentinel to close the spider.
+        """Schedule queued start URLs, then wait or let the spider close.
+
+        The spider is allowed to close once it has been idle with an empty
+        start-URL queue for more than MAX_IDLE_TIME_BEFORE_CLOSE seconds.
+        A value of 0 (the default) disables the limit and the spider waits
+        indefinitely. MAX_IDLE_TIME_BEFORE_CLOSE does not affect
+        SCHEDULER_IDLE_BEFORE_CLOSE.
+        """
+        if self.server is not None and self.count_start_urls() > 0:
+            # Start URLs are queued, so the idle period (if any) is over.
+            self.spider_idle_start_time = None
+        elif self.spider_idle_start_time is None:
+            # First idle tick with nothing queued: start the idle clock now.
+            self.spider_idle_start_time = time.time()
+
         self.schedule_next_requests()
+
+        max_idle_time = self.settings.getint("MAX_IDLE_TIME_BEFORE_CLOSE")
+        if max_idle_time > 0 and self.spider_idle_start_time is not None:
+            if time.time() - self.spider_idle_start_time > max_idle_time:
+                # Returning without raising lets Scrapy close the spider.
+                return
         raise DontCloseSpider
+
+    def count_start_urls(self):
+        """Return the number of start URLs queued under ``redis_key``."""
+        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
+        count_size = self.server.scard if use_set else self.server.llen
+        return count_size(self.redis_key)
 
 
 class RedisSpider(RedisMixin, Spider):