From 50b8c6f0fda09883cda9705c27e1f7df78360031 Mon Sep 17 00:00:00 2001 From: HairlessVillager <64526732+HairlessVillager@users.noreply.github.com> Date: Sun, 7 Jul 2024 03:41:02 +0800 Subject: [PATCH] fix: Scheduler not compatible with BaseDupeFilter (#294) * fix: Scheduler not compatible with BaseDupeFilter Co-authored-by: R Max Espinoza --- src/scrapy_redis/scheduler.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/scrapy_redis/scheduler.py b/src/scrapy_redis/scheduler.py index 0814d59a..ba50a101 100644 --- a/src/scrapy_redis/scheduler.py +++ b/src/scrapy_redis/scheduler.py @@ -37,6 +37,7 @@ def __init__( flush_on_start=False, queue_key=defaults.SCHEDULER_QUEUE_KEY, queue_cls=defaults.SCHEDULER_QUEUE_CLASS, + dupefilter=None, dupefilter_key=defaults.SCHEDULER_DUPEFILTER_KEY, dupefilter_cls=defaults.SCHEDULER_DUPEFILTER_CLASS, idle_before_close=0, @@ -56,6 +57,8 @@ def __init__( Requests queue key. queue_cls : str Importable path to the queue class. + dupefilter: Dupefilter + Custom dupefilter instance. dupefilter_key : str Duplicates filter key. dupefilter_cls : str @@ -72,6 +75,7 @@ def __init__( self.flush_on_start = flush_on_start self.queue_key = queue_key self.queue_cls = queue_cls + self.df = dupefilter self.dupefilter_cls = dupefilter_cls self.dupefilter_key = dupefilter_key self.idle_before_close = idle_before_close @@ -105,6 +109,10 @@ def from_settings(cls, settings): if val: kwargs[name] = val + dupefilter_cls = load_object(kwargs["dupefilter_cls"]) + if not hasattr(dupefilter_cls, "from_spider"): + kwargs["dupefilter"] = dupefilter_cls.from_settings(settings) + # Support serializer as a path to a module. if isinstance(kwargs.get("serializer"), str): kwargs["serializer"] = importlib.import_module(kwargs["serializer"]) @@ -137,7 +145,8 @@ def open(self, spider): f"Failed to instantiate queue class '{self.queue_cls}': {e}" ) - self.df = load_object(self.dupefilter_cls).from_spider(spider) + if not self.df: + self.df = load_object(self.dupefilter_cls).from_spider(spider) if self.flush_on_start: self.flush()