Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dev] Add custom fingerprint #280

Merged
merged 6 commits
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/scrapy_redis/dupefilter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import logging
import hashlib
import json
import time

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
from scrapy.utils.python import to_unicode
from w3lib.url import canonicalize_url

from . import defaults
from .connection import get_redis_from_settings
Expand Down Expand Up @@ -112,8 +116,14 @@ def request_fingerprint(self, request):
str

"""
return request_fingerprint(request)

fingerprint_data = {
"method": to_unicode(request.method),
"url": canonicalize_url(request.url),
"body": (request.body or b"").hex(),
}
fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
return hashlib.sha1(fingerprint_json.encode()).hexdigest()

@classmethod
def from_spider(cls, spider):
settings = spider.settings
Expand Down
24 changes: 20 additions & 4 deletions tests/test_dupefilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def get_redis_mock():

def sadd(key, fp, added=0, db={}):
fingerprints = db.setdefault(key, set())
if key not in fingerprints:
fingerprints.add(key)
if fp not in fingerprints:
fingerprints.add(fp)
added += 1
return added

Expand All @@ -30,8 +30,24 @@ def setup(self):

def test_request_seen(self):
req = Request('http://example.com')
assert not self.df.request_seen(req)
assert self.df.request_seen(req)

def same_request():
assert not self.df.request_seen(req)
assert self.df.request_seen(req)

def diff_method():
diff_method = Request('http://example.com', method='POST')
assert self.df.request_seen(req)
assert not self.df.request_seen(diff_method)

def diff_url():
diff_url = Request('http://example2.com')
assert self.df.request_seen(req)
assert not self.df.request_seen(diff_url)

same_request()
diff_method()
diff_url()

def test_overridable_request_fingerprinter(self):
req = Request('http://example.com')
Expand Down