From a206399dd09945a82e2485a32128ebe95a61b8c5 Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Sat, 26 Mar 2022 18:19:54 +0800 Subject: [PATCH 01/11] add import hint for pytest --- CONTRIBUTING.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 3135238e..85742679 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -84,13 +84,18 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. To get flake8 and tox, just pip install them into your virtualenv. -6. Commit your changes and push your branch to GitHub:: +6. Note that if the error of `No module named scrapy_redis` shows, please change the import lines:: + + from scrapy_redis import xxx # from this + from src.scrapy_redis import xxx # to this + +7. Commit your changes and push your branch to GitHub:: $ git add . $ git commit -m "Your detailed description of your changes." $ git push origin name-of-your-bugfix-or-feature -7. Submit a pull request through the GitHub website. +8. Submit a pull request through the GitHub website. Pull Request Guidelines ----------------------- From d6695c973c3b65c83123bbc6d52eb28e0f0237da Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Sat, 26 Mar 2022 18:20:21 +0800 Subject: [PATCH 02/11] update pytest usage --- CONTRIBUTING.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 85742679..5d0c59c6 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -79,7 +79,7 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: $ flake8 scrapy_redis tests - $ pytest --ignore=setup.py + $ python -m pytest --ignore=setup.py $ tox To get flake8 and tox, just pip install them into your virtualenv. 
From b6241bd35f6cf906acdc1c7785cdf2540b8ce4fe Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Sat, 26 Mar 2022 18:23:34 +0800 Subject: [PATCH 03/11] update deprecated scrapy.utils.request usage --- src/scrapy_redis/queue.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scrapy_redis/queue.py b/src/scrapy_redis/queue.py index 476cefd6..0d01f528 100644 --- a/src/scrapy_redis/queue.py +++ b/src/scrapy_redis/queue.py @@ -1,4 +1,4 @@ -from scrapy.utils.reqser import request_to_dict, request_from_dict +from scrapy.utils.request import request_from_dict from . import picklecompat @@ -39,13 +39,13 @@ def __init__(self, server, spider, key, serializer=None): def _encode_request(self, request): """Encode a request object""" - obj = request_to_dict(request, self.spider) + obj = request.to_dict(spider=self.spider) return self.serializer.dumps(obj) def _decode_request(self, encoded_request): """Decode an request previously encoded""" obj = self.serializer.loads(encoded_request) - return request_from_dict(obj, self.spider) + return request_from_dict(obj, spider=self.spider) def __len__(self): """Return the length of the queue""" From f8a1c0ec497dbd2d98afd92eba8bafd5442b0bf5 Mon Sep 17 00:00:00 2001 From: Jeremy Chou Date: Mon, 28 Mar 2022 16:58:16 +0800 Subject: [PATCH 04/11] Update .gitignore & LICENSE (#225) * update .gitignore & LICENSE * remove contribution section and add alternative choice --- .gitignore | 3 ++- LICENSE | 2 +- README.rst | 13 ++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 939332ee..4c871135 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,5 @@ target/ .ropeproject # Extra -.DS_Store \ No newline at end of file +.DS_Store +.vscode \ No newline at end of file diff --git a/LICENSE b/LICENSE index cff628cc..1ff8f3a9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011-2016, Rolando Espinoza +Copyright (c) 2022, Rolando Espinoza Permission is hereby 
granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/README.rst b/README.rst index a9c13f39..54435e0c 100644 --- a/README.rst +++ b/README.rst @@ -254,16 +254,11 @@ Then: spider starts crawling it. * Also please pay attention to json formatting. - -Contributions -------------- - -Donate BTC: ``13haqimDV7HbGWtz7uC6wP1zvsRWRAhPmF`` -Donate BCC: ``CSogMjdfPZnKf1p5ocu3gLR54Pa8M42zZM`` - -Donate ETH: ``0x681d9c8a2a3ff0b612ab76564e7dca3f2ccc1c0d`` +Alternative Choice +--------------------------- -Donate LTC: ``LaPHpNS1Lns3rhZSvvkauWGDfCmDLKT8vP`` +Frontera_ is a web crawling framework consisting of `crawl frontier`_, and distribution/scaling primitives, allowing to build a large scale online web crawler. .. _Frontera: https://github.com/scrapinghub/frontera +.. _crawl frontier: http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html From a1703bb5fc03c08a9972d430f04ad00f88c6e1f4 Mon Sep 17 00:00:00 2001 From: Jeremy Chou Date: Mon, 28 Mar 2022 16:58:44 +0800 Subject: [PATCH 05/11] [docs] Remove docs $ prefix (#229) * remove docs $ prefix * align code indent --- CONTRIBUTING.rst | 24 +++++++++++++++--------- README.rst | 36 ++++++++++++++++++------------------ docs/installation.rst | 8 ++++---- 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 5d0c59c6..25ccc54b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -62,25 +62,31 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. 1. Fork the `scrapy-redis` repo on GitHub. 2. Clone your fork locally:: - $ git clone git@github.com:your_name_here/scrapy-redis.git + git clone git@github.com:your_name_here/scrapy-redis.git 3. Install your local copy into a virtualenv. 
Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: - $ mkvirtualenv scrapy-redis - $ cd scrapy-redis/ - $ python setup.py develop + mkvirtualenv scrapy-redis + cd scrapy-redis/ + python setup.py develop 4. Create a branch for local development:: - $ git checkout -b name-of-your-bugfix-or-feature + git checkout -b name-of-your-bugfix-or-feature Now you can make your changes locally. 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: +<<<<<<< HEAD $ flake8 scrapy_redis tests $ python -m pytest --ignore=setup.py $ tox +======= + flake8 scrapy_redis tests + pytest --ignore=setup.py + tox +>>>>>>> [docs] Remove docs $ prefix (#229) To get flake8 and tox, just pip install them into your virtualenv. @@ -91,9 +97,9 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. 7. Commit your changes and push your branch to GitHub:: - $ git add . - $ git commit -m "Your detailed description of your changes." - $ git push origin name-of-your-bugfix-or-feature + git add . + git commit -m "Your detailed description of your changes." + git push origin name-of-your-bugfix-or-feature 8. Submit a pull request through the GitHub website. @@ -115,4 +121,4 @@ Tips To run a subset of tests:: - $ pytest tests/test_scrapy_redis + pytest tests/test_scrapy_redis diff --git a/README.rst b/README.rst index 54435e0c..fd2cc8ed 100644 --- a/README.rst +++ b/README.rst @@ -77,9 +77,9 @@ Installation From `github`:: - $ git clone https://github.com/darkrho/scrapy-redis.git - $ cd scrapy-redis - $ python setup.py install + git clone https://github.com/darkrho/scrapy-redis.git + cd scrapy-redis + python setup.py install .. note:: For using this json supported data feature, please make sure you have not installed the scrapy-redis through pip. If you already did it, you first uninstall that one. .. 
code:: @@ -190,28 +190,28 @@ across multiple spider instances, highly suitable for broad crawls. 2. Run the crawler for first time then stop it:: - $ cd example-project - $ scrapy crawl dmoz - ... [dmoz] ... - ^C + cd example-project + scrapy crawl dmoz + ... [dmoz] ... + ^C 3. Run the crawler again to resume stopped crawling:: - $ scrapy crawl dmoz - ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled) + scrapy crawl dmoz + ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled) 4. Start one or more additional scrapy crawlers:: - $ scrapy crawl dmoz - ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled) + scrapy crawl dmoz + ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled) 5. Start one or more post-processing workers:: - $ python process_items.py dmoz:items -v - ... - Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/) - Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/) - ... + python process_items.py dmoz:items -v + ... + Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/) + Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/) + ... Feeding a Spider from Redis @@ -240,11 +240,11 @@ Then: 1. run the spider:: - scrapy runspider myspider.py + scrapy runspider myspider.py 2. push json data to redis:: - redis-cli lpush myspider '{"url": "https://exaple.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }' + redis-cli lpush myspider '{"url": "https://exaple.com", "meta": {"job-id":"123xsd", "start-date":"dd/mm/yy"}, "url_cookie_key":"fertxsas" }' .. note:: diff --git a/docs/installation.rst b/docs/installation.rst index acb737f0..179e246a 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -12,7 +12,7 @@ To install Scrapy-Redis, run this command in your terminal: .. 
code-block:: console - $ pip install scrapy-redis + pip install scrapy-redis If you don't have `pip`_ installed, this `Python installation guide`_ can guide you through the process. @@ -30,19 +30,19 @@ You can either clone the public repository: .. code-block:: console - $ git clone git://github.com/rolando/scrapy-redis + git clone git://github.com/rolando/scrapy-redis Or download the `tarball`_: .. code-block:: console - $ curl -OL https://github.com/rolando/scrapy-redis/tarball/master + curl -OL https://github.com/rolando/scrapy-redis/tarball/master Once you have a copy of the source, you can install it with: .. code-block:: console - $ pip install -e . + pip install -e . .. _Github repo: https://github.com/rolando/scrapy-redis From 1fdea073fba8f3b7e141692eff1def4276b968cf Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Mon, 28 Mar 2022 17:20:10 +0800 Subject: [PATCH 06/11] add import hint for pytest --- CONTRIBUTING.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 25ccc54b..1d431497 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -90,18 +90,22 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. To get flake8 and tox, just pip install them into your virtualenv. -6. Note that if the error of `No module named scrapy_redis` shows, please change the import lines:: +6. Note that if the error of `No module named scrapy_redis` shows, please check `scrapy-redis` of your branch has install properly or not:: + + pip install . # if not, install scrapy-redis by this line, do not use pip install scrapy-redis, it will install online version. + +7. Or simply change import lines in those files you want to modify:: from scrapy_redis import xxx # from this from src.scrapy_redis import xxx # to this -7. Commit your changes and push your branch to GitHub:: +8. Commit your changes and push your branch to GitHub:: git add . 
git commit -m "Your detailed description of your changes." git push origin name-of-your-bugfix-or-feature -8. Submit a pull request through the GitHub website. +9. Submit a pull request through the GitHub website. Pull Request Guidelines ----------------------- From 3c1d40d5ea73952fc30a7a23a51bbb5cba3c4b00 Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Mon, 28 Mar 2022 17:23:31 +0800 Subject: [PATCH 07/11] add import hint for pytest --- CONTRIBUTING.rst | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1d431497..54db78f8 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -78,34 +78,24 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: -<<<<<<< HEAD $ flake8 scrapy_redis tests $ python -m pytest --ignore=setup.py $ tox -======= - flake8 scrapy_redis tests - pytest --ignore=setup.py - tox ->>>>>>> [docs] Remove docs $ prefix (#229) To get flake8 and tox, just pip install them into your virtualenv. -6. Note that if the error of `No module named scrapy_redis` shows, please check `scrapy-redis` of your branch has install properly or not:: - - pip install . # if not, install scrapy-redis by this line, do not use pip install scrapy-redis, it will install online version. - -7. Or simply change import lines in those files you want to modify:: +6. Note that if the error of `No module named scrapy_redis` shows, please change the import lines:: from scrapy_redis import xxx # from this from src.scrapy_redis import xxx # to this -8. Commit your changes and push your branch to GitHub:: +7. Commit your changes and push your branch to GitHub:: git add . git commit -m "Your detailed description of your changes." git push origin name-of-your-bugfix-or-feature -9. Submit a pull request through the GitHub website. +8. 
Submit a pull request through the GitHub website. Pull Request Guidelines ----------------------- From 7c0451949179bbc0cff3b364693c562fd6ac031a Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Mon, 28 Mar 2022 19:59:17 +0800 Subject: [PATCH 08/11] add text color helper --- src/scrapy_redis/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/scrapy_redis/utils.py b/src/scrapy_redis/utils.py index b1a46813..fcaa649b 100644 --- a/src/scrapy_redis/utils.py +++ b/src/scrapy_redis/utils.py @@ -1,6 +1,17 @@ import six +class TextColor: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + def bytes_to_str(s, encoding='utf-8'): """Returns a str if a bytes object is given.""" if six.PY3 and isinstance(s, bytes): From d14ab8bbd2a9e89295faf3d95a420e9292f4a0d5 Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Mon, 28 Mar 2022 20:00:57 +0800 Subject: [PATCH 09/11] add json type check add json formatted_data type check and warning message --- src/scrapy_redis/spiders.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/scrapy_redis/spiders.py b/src/scrapy_redis/spiders.py index f4191bfa..c82ad2e3 100644 --- a/src/scrapy_redis/spiders.py +++ b/src/scrapy_redis/spiders.py @@ -3,6 +3,7 @@ from scrapy import signals, FormRequest from scrapy.exceptions import DontCloseSpider from scrapy.spiders import Spider, CrawlSpider +from scrapy_redis import TextColor import time from . import connection, defaults @@ -169,7 +170,14 @@ def make_request_from_data(self, data): formatted_data = bytes_to_str(data, self.redis_encoding) # change to json array - parameter = json.loads(formatted_data) + parameter = {} + if type(formatted_data) == dict: + parameter = json.loads(formatted_data) + else: + print(TextColor.WARNING + "WARNING: String request is deprecated, please use JSON data format. 
\ + Detail information, please check https://github.com/rmax/scrapy-redis#features" + TextColor.ENDC) + return FormRequest(formatted_data, dont_filter=True) + url = parameter['url'] del parameter['url'] metadata = {} From 083cd218367cd5042780feb6830635ae8f65ee59 Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Mon, 28 Mar 2022 20:01:38 +0800 Subject: [PATCH 10/11] fix subset not equal assert --- tests/test_spiders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_spiders.py b/tests/test_spiders.py index dbdfcbe3..74b635e8 100644 --- a/tests/test_spiders.py +++ b/tests/test_spiders.py @@ -148,7 +148,7 @@ def test_consume_urls_from_redis(start_urls_as_zset, start_urls_as_set, spider_c start_requests = list(spider.start_requests()) if start_urls_as_zset or start_urls_as_set: assert len(start_requests) == batch_size - assert set(start_requests).issubset(reqs) + assert set(map(lambda x: x.url, start_requests)).issubset(map(lambda x: x.url, reqs)) else: assert start_requests == reqs[:batch_size] From 15ecd672b95edfc3578ee6277d86a77f06e8968a Mon Sep 17 00:00:00 2001 From: LuckyPigeon Date: Tue, 29 Mar 2022 03:16:19 +0800 Subject: [PATCH 11/11] update test & install guide --- CONTRIBUTING.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cac84819..b9f9f4ba 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -79,23 +79,28 @@ Ready to contribute? Here's how to set up `scrapy-redis` for local development. 5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: flake8 scrapy_redis tests + pip install . python -m pytest --ignore=setup.py tox To get flake8 and tox, just pip install them into your virtualenv. -6. Note that if the error of `No module named scrapy_redis` shows, please change the import lines:: +6. 
Note that if the error of `No module named scrapy_redis` shows, please install `scrapy-redis` from your branch with:: + pip install . + +7. Or change the import lines:: + from scrapy_redis import xxx # from this from src.scrapy_redis import xxx # to this -7. Commit your changes and push your branch to GitHub:: +8. Commit your changes and push your branch to GitHub:: git add . git commit -m "Your detailed description of your changes." git push origin name-of-your-bugfix-or-feature -8. Submit a pull request through the GitHub website. +9. Submit a pull request through the GitHub website. Pull Request Guidelines -----------------------