diff --git a/.gitignore b/.gitignore index 287b0b569..4b9aaea27 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ var/ *.egg-info/ .installed.cfg *.egg +MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -52,3 +53,6 @@ docs/_build/ # PyBuilder target/ + +# Virtualenv +.env/ diff --git a/.travis.yml b/.travis.yml index a2f138639..837e16675 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ services: - docker - mysql - postgresql + - redis-server env: global: @@ -26,6 +27,21 @@ matrix: - docker - mysql - postgresql + - redis-server + - python: 3.6 + env: TOXENV=py36 + services: + - docker + - mysql + - postgresql + - redis-server + - python: 3.7 + env: TOXENV=py37 + services: + - docker + - mysql + - postgresql + - redis-server install: - pip install -U tox wheel codecov @@ -57,6 +73,9 @@ script: tox after_success: - codecov +after_script: + - cat broker.log + deploy: provider: pypi distributions: sdist bdist_wheel diff --git a/README.md b/README.md index 9c4f8ef68..438c89730 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ # Frontera +[![pypi](https://img.shields.io/pypi/v/frontera)](https://pypi.org/project/frontera/) +[![python versions](https://img.shields.io/pypi/pyversions/frontera.svg)](https://pypi.org/project/frontera/) +[![Build Status](https://travis-ci.org/scrapinghub/frontera.svg?branch=master)](https://travis-ci.org/scrapinghub/frontera) +[![codecov](https://codecov.io/gh/scrapinghub/frontera/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapinghub/frontera) + ## Overview -Frontera is a web crawling framework consisting of [crawl frontier](http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html), -and distribution/scaling primitives, allowing to build a large scale online web crawler. +Frontera is a web crawling framework consisting of [crawl frontier](http://nlp.stanford.edu/IR-book/html/htmledition/the-url-frontier-1.html), and distribution/scaling primitives, allowing to build a large scale online web crawler. Frontera takes care of the logic and policies to follow during the crawl. It stores and prioritises links extracted by the crawler to decide which pages to visit next, and capable of doing it in distributed manner. @@ -11,12 +15,14 @@ the crawler to decide which pages to visit next, and capable of doing it in dist ## Main features - Online operation: small requests batches, with parsing done right after fetch. -- Pluggable backend architecture: low-level storage logic is separated from crawling policy. -- Three run modes: single process, distributed spiders, distributed backend and spiders. +- Pluggable backend architecture: low-level backend access logic is separated from crawling strategy. +- Two run modes: single process and distributed. +- Built-in SqlAlchemy, Redis and HBase backends. +- Built-in Apache Kafka and ZeroMQ message buses. +- Built-in crawling strategies: breadth-first, depth-first, Discovery (with support of robots.txt and sitemaps). +- Battle tested: our biggest deployment is 60 spiders/strategy workers delivering 50-60M of documents daily for 45 days, without downtime, - Transparent data flow, allowing to integrate custom components easily using Kafka. - Message bus abstraction, providing a way to implement your own transport (ZeroMQ and Kafka are available out of the box). -- RDBMS and HBase backends. -- Revisiting logic with RDBMS. - Optional use of Scrapy for fetching and parsing. - 3-clause BSD license, allowing to use in any commercial product. 
- Python 3 support. @@ -37,5 +43,3 @@ $ pip install frontera Join our Google group at https://groups.google.com/a/scrapinghub.com/forum/#!forum/frontera or check GitHub issues and pull requests. - - diff --git a/docs/source/conf.py b/docs/source/conf.py index e35c09e2b..9f4305251 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = u'Frontera' -copyright = u'2014-2016, Frontera authors' +copyright = u'2014-2018, Frontera authors' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.6' +version = '0.8' # The full version, including alpha/beta/rc tags. -release = '0.6.0' +release = '0.8.0.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/index.rst b/docs/source/index.rst index d36ab0905..4c9f71c01 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,16 +4,14 @@ Frontera |version| documentation ================================ -`Frontera`_ is a web crawling tool box, allowing to build crawlers of any scale and purpose. +`Frontera`_ is a web crawling tool box, allowing to build crawlers of any scale and purpose. It includes: -`Frontera`_ provides :ref:`crawl frontier ` framework by managing *when* and *what* to crawl next, -and checking for *crawling goal* accomplishment. +* :ref:`crawl frontier ` framework managing *when* and *what* to crawl and checking for crawling goal* accomplishment, -Frontera also provides replication, sharding and isolation of all crawler components to scale and distribute it. +* workers, Scrapy wrappers, and data bus components to scale and distribute the crawler. Frontera contain components to allow creation of fully-operational web crawler with `Scrapy`_. Even though it was -originally designed for Scrapy, it can also be used with any other crawling framework/system as the framework offers -a generic tool box. +originally designed for Scrapy, it can also be used with any other crawling framework/system. Introduction @@ -53,18 +51,22 @@ Using Frontera :hidden: topics/installation + topics/strategies topics/frontier-objects topics/frontier-middlewares topics/frontier-canonicalsolvers topics/frontier-backends topics/message_bus - topics/own_crawling_strategy + topics/custom_crawling_strategy topics/scrapy-integration topics/frontera-settings :doc:`topics/installation` HOWTO and Dependencies options. +:doc:`topics/strategies` + A list of built-in crawling strategies. + :doc:`topics/frontier-objects` Understand the classes used to represent requests and responses. @@ -75,13 +77,13 @@ Using Frontera Identify and make use of canonical url of document. :doc:`topics/frontier-backends` - Define your own crawling policy and custom storage. + Built-in backends, and tips on implementing your own. :doc:`topics/message_bus` Built-in message bus reference. -:doc:`topics/own_crawling_strategy` - Implementing own crawling strategy for distributed backend. +:doc:`topics/custom_crawling_strategy` + Implementing your own crawling strategy. :doc:`topics/scrapy-integration` Learn how to use Frontera with Scrapy. @@ -130,13 +132,10 @@ Developer documentation topics/tests topics/loggers topics/frontier-tester - topics/faq topics/contributing topics/glossary - - :doc:`topics/architecture` See how Frontera works and its different components. 
@@ -158,13 +157,9 @@ Developer documentation :doc:`topics/frontier-tester` Test your frontier in an easy way. -:doc:`topics/faq` - Frequently asked questions. - :doc:`topics/contributing` HOWTO contribute. - :doc:`topics/glossary` Glossary of terms. diff --git a/docs/source/topics/_images/high-level-arc.png b/docs/source/topics/_images/high-level-arc.png index 542e3e145..cedb7eff8 100644 Binary files a/docs/source/topics/_images/high-level-arc.png and b/docs/source/topics/_images/high-level-arc.png differ diff --git a/docs/source/topics/architecture.rst b/docs/source/topics/architecture.rst index 2e69c9138..2777d44ce 100644 --- a/docs/source/topics/architecture.rst +++ b/docs/source/topics/architecture.rst @@ -102,7 +102,7 @@ Where *sharded* means component consumes messages of assigned partition only, e. stream, and *replicated* is when components consume stream regardless of partitioning. Such design allows to operate online. Crawling strategy can be changed without having to stop the crawl. Also -:doc:`crawling strategy ` can be implemented as a separate module; containing logic +:doc:`crawling strategy ` can be implemented as a separate module; containing logic for checking the crawling stopping condition, URL ordering, and scoring model. Frontera is polite to web hosts by design and each host is downloaded by no more than one spider process. diff --git a/docs/source/topics/cluster-setup.rst b/docs/source/topics/cluster-setup.rst index 26c51c117..281c95690 100644 --- a/docs/source/topics/cluster-setup.rst +++ b/docs/source/topics/cluster-setup.rst @@ -20,7 +20,7 @@ Things to setup before you start Things to implement before you start ==================================== -* :doc:`Crawling strategy ` +* :doc:`Crawling strategy ` or :doc:`pick one from Frontera package ` * Spider code Configuring Kafka @@ -43,7 +43,7 @@ Configuring HBase Configuring Frontera ==================== -Every Frontera component requires it's own configuration module, but some options are shared, so we recommend to create +Every Frontera component requires its own configuration module, but some options are shared, so we recommend to create a common modules and import settings from it in component's modules. 1. Create a common module and add there: :: @@ -88,7 +88,7 @@ a common modules and import settings from it in component's modules. from __future__ import absolute_import from .worker import * - CRAWLING_STRATEGY = '' # path to the crawling strategy class + STRATEGY = '' # path to the crawling strategy class LOGGING_CONFIG='logging-sw.conf' # if needed The logging can be configured according to https://docs.python.org/2/library/logging.config.html see the @@ -101,6 +101,7 @@ The logging can be configured according to https://docs.python.org/2/library/log BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' KAFKA_GET_TIMEOUT = 0.5 + LOCAL_MODE = False # by default Frontera is prepared for single process mode 6. Configure Scrapy settings module. It's located in Scrapy project folder and referenced in scrapy.cfg. Let's add @@ -125,9 +126,11 @@ Starting the cluster First, let's start storage worker: :: # start DB worker only for batch generation - $ python -m frontera.worker.db --config [db worker config module] --no-incoming - ... 
- # Then start next one dedicated to spider log processing + # use single instance for every 10 partitions + $ python -m frontera.worker.db --config [db worker config module] --no-incoming --partitions 0 1 + + + # Optionally, start next one dedicated to spider log processing. $ python -m frontera.worker.db --no-batches --config [db worker config module] @@ -141,17 +144,18 @@ Next, let's start strategy workers, one process per spider log partition: :: You should notice that all processes are writing messages to the log. It's ok if nothing is written in streams, because of absence of seed URLs in the system. -Let's put our seeds in text file, one URL per line and start spiders. A single spider per spider feed partition: :: +Let's put our seeds in text file, one URL per line and run:: + + $ python -m frontera.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file] + +Finally, a single spider per spider feed partition: :: - $ scrapy crawl [spider] -L INFO -s SEEDS_SOURCE = 'seeds.txt' -s SPIDER_PARTITION_ID=0 - ... $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=1 $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=2 ... $ scrapy crawl [spider] -L INFO -s SPIDER_PARTITION_ID=N -You should end up with N spider processes running. Usually it's enough for a single instance to read seeds from -``SEEDS_SOURCE`` variable to pass seeds to Frontera cluster. Seeds are only read if spider queue is empty. -::setting:`SPIDER_PARTITION_ID` can be read from config file also. +You should end up with N spider processes running. Also :setting:`SPIDER_PARTITION_ID` can be read from config file. -After some time seeds will pass the streams and will be scheduled for downloading by workers. Crawler is bootstrapped. +You're done, crawler should start crawling. Any component can be restarted any time, without major data loss. However, +for pausing its enough to stop batch gen only. diff --git a/docs/source/topics/custom_crawling_strategy.rst b/docs/source/topics/custom_crawling_strategy.rst new file mode 100644 index 000000000..949925065 --- /dev/null +++ b/docs/source/topics/custom_crawling_strategy.rst @@ -0,0 +1,195 @@ +================================ +Writing custom crawling strategy +================================ + +Crawling strategy is an essential part of Frontera-based crawler and it's guiding the crawler by instructing it which pages to crawl, when and with what priority. + + +Crawler workflow +================ + +Frontera-based crawler consist of multiple processes, which are running indefinitely. The state in these processes are +persisted to a permanent storage. When processes are stopped the state is flushed and will be loaded next time when +access to certain data item is needed. Therefore it's easy to pause the crawl by stopping the processes, do the +maintenance or modify the code and start again without restarting the crawl from the beginning. + + IMPORTANT DETAIL + Spider log (see http://frontera.readthedocs.io/en/latest/topics/glossary.html) is using hostname-based partitioning. + The content generated from particular host will always land to the same partition (and therefore strategy worker + instance). That guarantees the crawling strategy you design will be always dealing with same subset of hostnames + on every SW instance. It also means the same domain cannot be operated from multiple strategy worker instances. + To get the hostname the 2-nd level domain name is used with public suffix resolved. 
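For illustration, the hostname-to-partition mapping described above can be sketched as follows. This is not Frontera's internal code; ``tldextract`` is used here only as an example of a public-suffix-aware resolver, and the hashing scheme is an assumption::

    # Sketch: all URLs of one registered domain (public suffix resolved)
    # end up in the same spider log partition.
    from hashlib import sha1

    import tldextract  # assumed third-party helper for public suffix resolution

    def partition_for(url, num_partitions):
        parts = tldextract.extract(url)
        # e.g. images.yandex.ru -> yandex.ru, www.london.co.uk -> london.co.uk
        key = parts.registered_domain or parts.domain
        return int(sha1(key.encode("utf-8")).hexdigest(), 16) % num_partitions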
+
+
+To restart the crawl from scratch, the
+
+* queue contents
+* link states
+* domain metadata
+
+need to be cleaned up. This is usually done by truncating the tables.
+
+
+Crawling strategy class
+=======================
+
+It has to be inherited from BaseCrawlingStrategy and implement its API.
+
+.. autoclass:: frontera.strategy.BaseCrawlingStrategy
+
+    **Methods**
+
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.from_worker
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.read_seeds
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.page_crawled
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.filter_extracted_links
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.links_extracted
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.request_error
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.finished
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.close
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.schedule
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.create_request
+    .. automethod:: frontera.strategy.BaseCrawlingStrategy.refresh_states
+
+
+The class can be put in any module and passed to the :term:`strategy worker` or local Scrapy process using a command
+line option or the :setting:`STRATEGY` setting on startup.
+
+The strategy class can use its own storage or any other kind of resources. All items from the :term:`spider log` will
+be passed through these methods. Scores returned don't have to be the same as in the method arguments.
+Periodically the ``finished()`` method is called to check if the crawling goal is achieved.
+
+Workflow
+--------
+
+There are essentially two workflows: seeds addition (or injection) and the main workflow. When a crawl starts from
+scratch it has to run the seed injection first and then proceed with the main workflow. A paused/resumed crawler runs
+only the main workflow.
+
+Seeds addition
+^^^^^^^^^^^^^^
+
+The purpose of this step is to inject the seeds into the crawler pipeline. The framework allows you to process the
+seeds stream (which is read from a file placed locally or in S3), create the requests needed, get their link states,
+and schedule them. Once requests are scheduled they will get to the queue and propagate to spiders.
+
+To enter this workflow, the user runs the strategy worker in add seeds mode, providing arguments to the crawling
+strategy from the command line. In particular, --seeds-url is used with an S3 or local file URL containing the seeds
+to inject.
+
+1. from_worker() → init()
+2. read_seeds(stream from file, None if file isn't present)
+3. exit
+
+It's very convenient to run seeds addition using the helper app in Frontera::
+
+    $ python -m frontera.utils.add_seeds --config ... --seeds-file ...
+
+
+Main
+^^^^
+
+This is the main cycle used when the crawl is in progress. In a nutshell, on every spider event the specific handler
+is called, depending on the type of event. When the strategy worker gets the SIGTERM signal it tries to stop politely
+by calling close(). In its normal state it listens to the spider log and executes the event handlers.
+
+1. `from_worker()` → init()
+2. `page_crawled(response)` OR `request_error(request, error)` OR `filter_extracted_links(request, links)` and subsequent
+   `links_extracted(request, links)`
+3. `close()`
+4. exit
+
+Scheduling and creating requests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ultimate goal of a crawling strategy is the scheduling of requests. To schedule a request there is the method
+schedule(request, score).
The request is an instance of the :class:`Request ` class and is
+often available from arguments of event handlers: _page_crawled_, _request_error_ and _links_extracted_, or can be
+created on-demand using the :attr:`create_request ` method.
+
+    IMPORTANT NOTICE
+
+    The request created with create_request() has no state (meta[b'state']) after creation. To get the states, the
+    strategy worker needs to access the backend, and this is not happening when you call create_request(). Instead,
+    it is expected that you will create a batch of requests and call refresh_states(iterable) on the whole batch.
+    After refresh_states is done, you will have the states available for your newly created requests.
+
+    The Request objects created by the strategy worker for event handlers always have their states assigned.
+
+
+State operations
+^^^^^^^^^^^^^^^^
+
+Every link has a state. The purpose of these states is to allow the developer to persist the state of the link in the
+system (allowing restart of SW components without data loss) and use it for decision making. The states are cached in
+the strategy worker, flushed to the backend and loaded when needed. States are defined in
+:class:`frontera.core.components.States` and can have the following values:
+
+* NOT_CRAWLED,
+* QUEUED,
+* CRAWLED,
+* ERROR
+
+NOT_CRAWLED is assigned when a link is new and wasn't seen previously; the rest of the state values must be assigned
+in the crawling strategy code.
+
+States allow checking whether a link was visited or discovered, and performing analysis of the states database to
+collect state statistics using MapReduce-style jobs.
+
+
+Components
+==========
+
+There are certain building blocks and proven solutions that exist for the common problems.
+
+DomainMetadata
+--------------
+
+It's often needed to persist per-host metadata in the permanent storage. To solve this there is a
+:class:`frontera.core.components.DomainMetadata` instance in the backend. It has an interface of Python mapping types
+(https://docs.python.org/3/library/stdtypes.html?highlight=mapping#mapping-types-dict ). It's expected that one will
+be using domain names as keys and dicts as values. It's convenient to store there per-domain statistics, ban states,
+the count of links found, etc.
+
+
+PublicSuffix
+------------
+When crawling multiple domains (especially unknown ones) it's important to resolve the 2-nd level domain name properly
+using publicsuffix.
+
+It is a library from the publicsuffix module, based on the list provided by https://publicsuffix.org/. Its purpose is
+to maintain the public suffix list of ccTLDs and name resolution routines for them in a single library. It's
+convenient to use this library everywhere domain name resolution is needed. Here are a few examples:
+
+* www.london.co.uk → london.co.uk
+* images.yandex.ru → yandex.ru
+* t.co → t.co
+
+    As you can see, the number of dots in the domain name cannot be used for domain name resolution.
+
+Useful details
+==============
+
+Debugging crawling strategy
+---------------------------
+The best approach I found is to log all the events and outcomes using native Python logging, i.e. to set up a logger
+for the crawling strategy class and use it. When debug output is needed, you will be able to set the logger to output
+to a file, with a specific format and log level. After you have logging output set up, you should start the crawl of
+the problematic website locally, then collect and analyse the log output.
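To make this concrete, here is a minimal sketch of a custom strategy wired with such a logger, using the ``BaseCrawlingStrategy`` API and link states described above. It is an illustration only; the seeds handling, scores and logger name are assumptions rather than Frontera's built-in behaviour::

    import logging

    from frontera.core.components import States
    from frontera.strategy import BaseCrawlingStrategy

    logger = logging.getLogger("my-strategy")


    class LoggingStrategy(BaseCrawlingStrategy):
        """Schedules every newly discovered link once, logging all decisions."""

        def read_seeds(self, stream):
            # assumes a text stream with one seed URL per line (--seeds-file/--seeds-url)
            requests = [self.create_request(url.strip()) for url in stream if url.strip()]
            self.refresh_states(requests)          # create_request() gives no state
            for r in requests:
                if r.meta[b'state'] == States.NOT_CRAWLED:
                    r.meta[b'state'] = States.QUEUED
                    self.schedule(r, score=1.0)
                    logger.debug("seed scheduled: %s", r.url)

        def page_crawled(self, response):
            response.meta[b'state'] = States.CRAWLED

        def filter_extracted_links(self, request, links):
            return links                           # keep everything in this sketch

        def links_extracted(self, request, links):
            for link in links:
                if link.meta[b'state'] == States.NOT_CRAWLED:
                    link.meta[b'state'] = States.QUEUED
                    self.schedule(link, score=0.5)
                    logger.debug("scheduled: %s", link.url)

        def request_error(self, request, error):
            request.meta[b'state'] = States.ERROR
            logger.warning("error %s for %s", error, request.url)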
+ +Other approaches include analysis of links database, inspecting of domain metadata and states tables, collecting the +log output of link states changes (experimental SW feature). + +Meta fields +----------- + +== ============== =================================================================================================================================================== ========= +# name description presence +== ============== =================================================================================================================================================== ========= +1 b"slot" Queue partitioning key in bytes, highest priority. Use it if your app requires partitioning other than default 2-nd level domain-based partitioning Optional +2 b"domain" Dict generated by Frontera DomainMiddleware, and containing parsed domain name Always +3 b"state" Integer representing the link state, set by strategy worker. Link states are defined in frontera.core.components.States Always +4 b"encoding" In response, for HTML, encoding detected by Scrapy Optional +5 b"scrapy_meta" When scheduling can be used to set meta field for Scrapy Optional +== ============== =================================================================================================================================================== ========= + +Keys and string types in nested structures are always bytes. diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 805fc52f7..d9db1c7cc 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -128,50 +128,38 @@ Default: ``frontera.contrib.canonicalsolvers.Basic`` The :class:`CanonicalSolver ` to be used by the frontier for resolving canonical URLs. For more info see :ref:`Canonical URL Solver `. -.. setting:: SPIDER_LOG_CONSUMER_BATCH_SIZE - -SPIDER_LOG_CONSUMER_BATCH_SIZE ------------------------------- - -Default: ``512`` - -This is a batch size used by strategy and db workers for consuming of spider log stream. Increasing it -will cause worker to spend more time on every task, but processing more items per task, therefore leaving less time for -other tasks during some fixed time interval. Reducing it will result to running several tasks within the same time -interval, but with less overall efficiency. Use it when your consumers too slow, or too fast. +.. setting:: DELAY_ON_EMPTY -.. setting:: SCORING_LOG_CONSUMER_BATCH_SIZE +DELAY_ON_EMPTY +-------------- -SCORING_LOG_CONSUMER_BATCH_SIZE -------------------------------- +Default: ``5.0`` -Default: ``512`` +Delay between calls to backend for new batches in Scrapy scheduler, when queue size is getting below +``CONCURRENT_REQUESTS``. When backend has no requests to fetch, this delay helps to exhaust the rest of the buffer +without hitting backend on every request. Increase it if calls to your backend is taking too long, and decrease +if you need a fast spider bootstrap from seeds. -This is a batch size used by db worker for consuming of scoring log stream. Use it when you need to adjust scoring log -consumption speed. +.. setting:: DISCOVERY_MAX_PAGES -.. setting:: CRAWLING_STRATEGY +DISCOVERY_MAX_PAGES +------------------- -CRAWLING_STRATEGY ------------------ +Default: ``100`` -Default: ``None`` +The maximum number of pages to schedule by Discovery crawling strategy. -The path to crawling strategy class, instantiated and used in :term:`strategy worker` to prioritize and stop crawling in -distributed run mode. -.. 
setting:: DELAY_ON_EMPTY +.. setting:: DOMAIN_STATS_LOG_INTERVAL -DELAY_ON_EMPTY --------------- +DOMAIN_STATS_LOG_INTERVAL +------------------------- -Default: ``5.0`` +Default: ``300`` -Delay between calls to backend for new batches in Scrapy scheduler, when queue size is getting below -``CONCURRENT_REQUESTS``. When backend has no requests to fetch, this delay helps to exhaust the rest of the buffer -without hitting backend on every request. Increase it if calls to your backend is taking too long, and decrease -if you need a fast spider bootstrap from seeds. +Time interval in seconds to rotate the domain statistics in :term:`db worker` batch generator. Enabled only when +logging set to DEBUG. .. setting:: KAFKA_GET_TIMEOUT @@ -183,6 +171,17 @@ Default: ``5.0`` Time process should block until requested amount of data will be received from message bus. This is a general message bus setting with obsolete Kafka-related name. + +.. setting:: LOCAL_MODE + +LOCAL_MODE +---------- + +Default: ``True`` + +Sets single process run mode. Crawling strategy together with backend are used from the same spider process. + + .. setting:: LOGGING_CONFIG LOGGING_CONFIG @@ -262,6 +261,45 @@ Default: ``30.0`` Used in DB worker, and it's a time interval between production of new batches for all partitions. If partition is busy, it will be skipped. +.. setting:: OVERUSED_KEEP_PER_KEY + +OVERUSED_KEEP_PER_KEY +--------------------- + +Default: ``1000`` + +After the purging this number of requests will be left in the queue. + + +.. setting:: OVERUSED_KEEP_KEYS + +OVERUSED_KEEP_KEYS +------------------ + +Default: ``100`` + +The number of keys for purging to leave. + +.. setting:: OVERUSED_MAX_KEYS + +OVERUSED_MAX_KEYS +----------------- + +Default: ``None`` + +A threshold triggering the keys purging in OverusedBuffer. The purging will end up leaving :ref:`OVERUSED_KEEP_KEYS`. +``None`` disables purging. + +.. setting:: OVERUSED_MAX_PER_KEY + +OVERUSED_MAX_PER_KEY +-------------------- + +Default: ``None`` + +Purging will start when reaching this number of requests per key and leave :ref:`OVERUSED_KEEP_PER_KEY` requests. +``None`` disables purging. + .. setting:: OVERUSED_SLOT_FACTOR OVERUSED_SLOT_FACTOR @@ -292,6 +330,29 @@ Default: ``'frontera.core.models.Response'`` The :class:`Response ` model to be used by the frontier. +.. setting:: SPIDER_LOG_CONSUMER_BATCH_SIZE + +SPIDER_LOG_CONSUMER_BATCH_SIZE +------------------------------ + +Default: ``512`` + +This is a batch size used by strategy and db workers for consuming of spider log stream. Increasing it +will cause worker to spend more time on every task, but processing more items per task, therefore leaving less time for +other tasks during some fixed time interval. Reducing it will result to running several tasks within the same time +interval, but with less overall efficiency. Use it when your consumers too slow, or too fast. + +.. setting:: SCORING_LOG_CONSUMER_BATCH_SIZE + +SCORING_LOG_CONSUMER_BATCH_SIZE +------------------------------- + +Default: ``512`` + +This is a batch size used by db worker for consuming of scoring log stream. Use it when you need to adjust scoring log +consumption speed. + + .. setting:: SCORING_PARTITION_ID SCORING_PARTITION_ID @@ -350,6 +411,35 @@ Default: ``False`` Determines if content should be sent over the message bus and stored in the backend: a serious performance killer. +.. 
setting:: STRATEGY + +STRATEGY +-------- + +Default: ``frontera.worker.strategies.basic.BasicCrawlingStrategy`` + +The path to crawling strategy class. + +.. setting:: STRATEGY_ARGS + +STRATEGY_ARGS +------------- + +Default: ``{}`` + +Dict with default arguments for crawling strategy. Can be overridien with command line option in +:term:`strategy worker`. + +.. setting:: SW_FLUSH_INTERVAL + +SW_FLUSH_INTERVAL +----------------- + +Default: ``300`` + +Interval between flushing of states in :term:`strategy worker`. Also used to set initial random delay to flush states +periodically, using formula ``RANDINT(SW_FLUSH_INTERVAL)``. + .. setting:: TEST_MODE TEST_MODE @@ -359,6 +449,15 @@ Default: ``False`` Whether to enable frontier test mode. See :ref:`Frontier test mode ` +.. setting:: USER_AGENT + +USER_AGENT +---------- + +Default: ``FronteraDiscoveryBot`` + +User agent string in use by Discovery crawling strategy. + @@ -471,7 +570,8 @@ Default:: 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' } -This is mapping with SQLAlchemy models used by backends. It is mainly used for customization. +This is mapping with SQLAlchemy models used by backends. It is mainly used for customization. This setting uses a +dictionary where ``key`` represents the name of the model to define and ``value`` the model to use. Revisiting backend @@ -511,6 +611,37 @@ Default: ``False`` Enables dropping and creation of new HBase tables on worker start. +.. setting:: HBASE_DOMAIN_METADATA_TABLE + +HBASE_DOMAIN_METADATA_TABLE +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``domain_metadata`` + +Name of the domain metadata table in HBase. + + +.. setting:: HBASE_DOMAIN_METADATA_CACHE_SIZE + +HBASE_DOMAIN_METADATA_CACHE_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: 1000 + +The count of domain-value pairs cached in memory in :term:`strategy worker`. Pairs are evicted from cache using LRU +policy. + + +.. setting:: HBASE_DOMAIN_METADATA_BATCH_SIZE + +HBASE_DOMAIN_METADATA_BATCH_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: 100 + +Maximum count of domain-value pairs kept in write buffer before actual write happens. + + .. setting:: HBASE_METADATA_TABLE HBASE_METADATA_TABLE @@ -538,6 +669,15 @@ Default: ``queue`` Name of HBase priority queue table. +.. settings:: HBASE_STATE_WRITE_LOG_SIZE + +HBASE_STATE_WRITE_LOG_SIZE +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``15000`` + +Number of state changes in the :term:`state cache` of :term:`strategy worker`, before it get's flushed to HBase and cleared. + .. setting:: HBASE_STATE_CACHE_SIZE_LIMIT HBASE_STATE_CACHE_SIZE_LIMIT @@ -545,7 +685,17 @@ HBASE_STATE_CACHE_SIZE_LIMIT Default: ``3000000`` -Number of items in the :term:`state cache` of :term:`strategy worker`, before it get's flushed to HBase and cleared. +Number of cached state changes in the :term:`state cache` of :term:`strategy worker`. Internally there is ``cachetools.LRUCache`` +storing all the recent state changes, discarding least recently used when the cache gets over its capacity. + +.. setting:: HBASE_STATES_TABLE + +HBASE_STATES_TABLE +^^^^^^^^^^^^^^^^^^ + +Default: ``states`` + +Name of the table used by :term:`strategy worker` to store link states. .. setting:: HBASE_THRIFT_HOST @@ -632,11 +782,28 @@ Hostname and port of kafka broker, separated with :. 
Can be a string with hostna KAFKA_CODEC ----------- -Default: ``None`` +Default: ``KAFKA_CODEC`` -Kafka-python 1.0.x version compression codec to use, is a string or None and could be one of ``snappy``, ``gzip`` or +Kafka-python 1.0.x version compression codec to use, is a string and could be one of ``none``, ``snappy``, ``gzip`` or ``lz4``. + +.. setting:: KAFKA_CERT_PATH + +KAFKA_CERT_PATH +--------------- + +OS path to the folder with three certificate files: ca-cert.pem, client-cert.pem, client-key.pem. + + +.. setting:: KAFKA_ENABLE_SSL + +KAFKA_ENABLE_SSL +---------------- + +Boolean. Set to True to enable SSL connection in Kafka client. + + .. setting:: SPIDER_LOG_DBW_GROUP SPIDER_LOG_DBW_GROUP diff --git a/docs/source/topics/frontier-backends.rst b/docs/source/topics/frontier-backends.rst index 706da8377..0f3d7d468 100644 --- a/docs/source/topics/frontier-backends.rst +++ b/docs/source/topics/frontier-backends.rst @@ -2,20 +2,21 @@ Backends ======== -Frontier :class:`Backend ` is where the crawling logic/policies lies, essentially a -brain of your crawler. :class:`Queue `, -:class:`Metadata ` and :class:`States ` are classes -where all low level code is meant to be placed, and -Backend opposite, operates on a higher levels. Frontera is bundled with database and in-memory implementations of -Queue, Metadata and States which can be combined in your custom backends or used standalone by directly -instantiating :class:`FrontierManager ` and Backend. - -Backend methods are called by the FrontierManager after +A :class:`DistributedBackend ` is used to separate higher level code +of :term:`crawling strategy` from low level storage API. :class:`Queue `, +:class:`Metadata `, :class:`States ` and + :class:`DomainMetadata ` are inner components of the DistributedBackend. +The latter is meant to instantiate and hold the references to the objects of above mentioned classes. Frontera is +bundled with database and in-memory implementations of Queue, Metadata, States and DomainMetadata which can be combined +in your custom backends or used standalone by directly instantiating specific variant of +:class:`FrontierManager `. + +DistributedBackend methods are called by the FrontierManager after :class:`Middleware `, using hooks for :class:`Request ` and :class:`Response ` processing according to :ref:`frontier data flow `. -Unlike Middleware, that can have many different instances activated, only one Backend can be used per +Unlike Middleware, that can have many different instances activated, only one DistributedBackend can be used per frontier. @@ -24,11 +25,11 @@ frontier. Activating a backend ==================== -To activate the frontier backend component, set it through the :setting:`BACKEND` setting. +To activate the specific backend, set it through the :setting:`BACKEND` setting. Here’s an example:: - BACKEND = 'frontera.contrib.backends.memory.FIFO' + BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend' Keep in mind that some backends may need to be additionally configured through a particular setting. See :ref:`backends documentation ` for more info. @@ -38,9 +39,9 @@ Keep in mind that some backends may need to be additionally configured through a Writing your own backend ======================== -Each backend component is a single Python class inherited from :class:`Backend ` or +Each backend component is a single Python class inherited from :class:`DistributedBackend ` and using one or all of -:class:`Queue`, :class:`Metadata` and :class:`States`. 
+:class:`Queue`, :class:`Metadata`, :class:`States` and :class:`DomainMetadata`. :class:`FrontierManager` will communicate with active backend through the methods described below. @@ -99,6 +100,8 @@ Backend should communicate with low-level storage by means of these classes: Metadata ^^^^^^^^ +Is used to store the contents of the crawl. + .. autoclass:: frontera.core.components.Metadata **Methods** @@ -115,6 +118,8 @@ Known implementations are: :class:`MemoryMetadata` and :class:`sqlalchemy.compon Queue ^^^^^ +Is a priority queue and used to persist requests scheduled for crawling. + .. autoclass:: frontera.core.components.Queue **Methods** @@ -130,6 +135,9 @@ Known implementations are: :class:`MemoryQueue` and :class:`sqlalchemy.component States ^^^^^^ +Is a storage used for checking and storing the link states. Where state is a short integer of one of states descibed in +:class:`frontera.core.components.States`. + .. autoclass:: frontera.core.components.States **Methods** @@ -145,69 +153,45 @@ States Known implementations are: :class:`MemoryStates` and :class:`sqlalchemy.components.States`. +DomainMetadata +^^^^^^^^^^^^^^ -.. _frontier-built-in-backend: - -Built-in backend reference -========================== - -This article describes all backend components that come bundled with Frontera. - -To know the default activated :class:`Backend ` check the -:setting:`BACKEND` setting. - - -.. _frontier-backends-basic-algorithms: - -Basic algorithms -^^^^^^^^^^^^^^^^ -Some of the built-in :class:`Backend ` objects implement basic algorithms as -as `FIFO`_/`LIFO`_ or `DFS`_/`BFS`_ for page visit ordering. +Is used to store per-domain flags, counters or even robots.txt contents to help :term:`crawling strategy` maintain +features like per-domain number of crawled pages limit or automatic banning. -Differences between them will be on storage engine used. For instance, -:class:`memory.FIFO ` and -:class:`sqlalchemy.FIFO ` will use the same logic but with different -storage engines. +.. autoclass:: frontera.core.components.DomainMetadata -All these backend variations are using the same :class:`CommonBackend ` class -implementing one-time visit crawling policy with priority queue. - -.. autoclass:: frontera.contrib.backends.CommonBackend - - -.. _frontier-backends-memory: + **Methods** -Memory backends -^^^^^^^^^^^^^^^ + .. automethod:: frontera.core.components.DomainMetadata.__setitem__ -This set of :class:`Backend ` objects will use an `heapq`_ module as queue and native -dictionaries as storage for :ref:`basic algorithms `. + .. automethod:: frontera.core.components.DomainMetadata.__getitem__ + .. automethod:: frontera.core.components.DomainMetadata.__delitem__ -.. class:: frontera.contrib.backends.memory.BASE + .. automethod:: frontera.core.components.DomainMetadata.__contains__ - Base class for in-memory :class:`Backend ` objects. -.. class:: frontera.contrib.backends.memory.FIFO +Known implementations are: native dict and :class:`sqlalchemy.components.DomainMetadata`. - In-memory :class:`Backend ` implementation of `FIFO`_ algorithm. -.. class:: frontera.contrib.backends.memory.LIFO +.. _frontier-built-in-backend: - In-memory :class:`Backend ` implementation of `LIFO`_ algorithm. +Built-in backend reference +========================== -.. class:: frontera.contrib.backends.memory.BFS +This article describes all backend components that come bundled with Frontera. - In-memory :class:`Backend ` implementation of `BFS`_ algorithm. -.. class:: frontera.contrib.backends.memory.DFS +.. 
_frontier-backends-memory: - In-memory :class:`Backend ` implementation of `DFS`_ algorithm. +Memory backend +^^^^^^^^^^^^^^ -.. class:: frontera.contrib.backends.memory.RANDOM +This implementation is using `heapq`_ module to store the requests queue and native dicts for other purposes and is +meant to be used for educational or testing purposes only. - In-memory :class:`Backend ` implementation of a random selection - algorithm. +.. autoclass:: frontera.contrib.backends.memory.MemoryDistributedBackend .. _frontier-backends-sqlalchemy: @@ -215,60 +199,17 @@ dictionaries as storage for :ref:`basic algorithms ` objects will use `SQLAlchemy`_ as storage for -:ref:`basic algorithms `. +This implementations is using RDBMS storage with `SQLAlchemy`_ library. By default it uses an in-memory SQLite database as a storage engine, but `any databases supported by SQLAlchemy`_ can be used. - If you need to use your own `declarative sqlalchemy models`_, you can do it by using the :setting:`SQLALCHEMYBACKEND_MODELS` setting. -This setting uses a dictionary where ``key`` represents the name of the model to define and ``value`` the model to use. - For a complete list of all settings used for SQLAlchemy backends check the :doc:`settings ` section. -.. class:: frontera.contrib.backends.sqlalchemy.BASE - - Base class for SQLAlchemy :class:`Backend ` objects. - -.. class:: frontera.contrib.backends.sqlalchemy.FIFO - - SQLAlchemy :class:`Backend ` implementation of `FIFO`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.LIFO - - SQLAlchemy :class:`Backend ` implementation of `LIFO`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.BFS - - SQLAlchemy :class:`Backend ` implementation of `BFS`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.DFS - - SQLAlchemy :class:`Backend ` implementation of `DFS`_ algorithm. - -.. class:: frontera.contrib.backends.sqlalchemy.RANDOM - - SQLAlchemy :class:`Backend ` implementation of a random selection - algorithm. - - -Revisiting backend -^^^^^^^^^^^^^^^^^^ - -Based on custom SQLAlchemy backend, and queue. Crawling starts with seeds. After seeds are crawled, every new -document will be scheduled for immediate crawling. On fetching every new document will be scheduled for recrawling -after fixed interval set by :setting:`SQLALCHEMYBACKEND_REVISIT_INTERVAL`. - -Current implementation of revisiting backend has no prioritization. During long term runs spider could go idle, because -there are no documents available for crawling, but there are documents waiting for their scheduled revisit time. - - -.. class:: frontera.contrib.backends.sqlalchemy.revisiting.Backend - - Base class for SQLAlchemy :class:`Backend ` implementation of revisiting back-end. +.. autoclass:: frontera.contrib.backends.sqlalchemy.Distributed HBase backend @@ -278,23 +219,27 @@ HBase backend Is more suitable for large scale web crawlers. Settings reference can be found here :ref:`hbase-settings`. Consider tunning a block cache to fit states within one block for average size website. To achieve this it's recommended to use -:attr:`hostname_local_fingerprint ` +:attr:`hostname_local_fingerprint ` to achieve documents +closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` setting. -to achieve documents closeness within the same host. This function can be selected with :setting:`URL_FINGERPRINT_FUNCTION` -setting. -.. 
TODO: document details of block cache tuning, - BC* settings and queue get operation concept, - hbase tables schema and data flow - Queue exploration - shuffling with MR jobs +Redis backend +^^^^^^^^^^^^^ + +.. autoclass:: frontera.contrib.backends.redis_backend.RedisBackend + +This is similar to the HBase backend. It is suitable for large scale crawlers that still has a limited scope. It is +recommended to ensure Redis is allowed to use enough memory to store all data the crawler needs. In case of Redis +running out of memory, the crawler will log this and continue. When the crawler is unable to write metadata or queue +items to the database; that metadata or queue items are lost. + +In case of connection errors; the crawler will attempt to reconnect three times. If the third attempt at connecting +to Redis fails, the worker will skip that Redis operation and continue operating. + -.. _FIFO: http://en.wikipedia.org/wiki/FIFO -.. _LIFO: http://en.wikipedia.org/wiki/LIFO_(computing) -.. _DFS: http://en.wikipedia.org/wiki/Depth-first_search -.. _BFS: http://en.wikipedia.org/wiki/Breadth-first_search .. _OrderedDict: https://docs.python.org/2/library/collections.html#collections.OrderedDict .. _heapq: https://docs.python.org/2/library/heapq.html .. _SQLAlchemy: http://www.sqlalchemy.org/ .. _any databases supported by SQLAlchemy: http://docs.sqlalchemy.org/en/latest/dialects/index.html .. _declarative sqlalchemy models: http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/index.html + diff --git a/docs/source/topics/frontier-tester.rst b/docs/source/topics/frontier-tester.rst index eef48bfe9..8b37ef490 100644 --- a/docs/source/topics/frontier-tester.rst +++ b/docs/source/topics/frontier-tester.rst @@ -55,7 +55,7 @@ settings, but also can be modified when creating the FrontierTester with the ``m An example of use ================= -A working example using test data from graphs and :ref:`basic backends `:: +A working example using test data from graphs and :ref:`backends `:: from frontera import FrontierManager, Settings, FrontierTester, graphs diff --git a/docs/source/topics/glossary.rst b/docs/source/topics/glossary.rst index fb9ab6c38..6321b15d4 100644 --- a/docs/source/topics/glossary.rst +++ b/docs/source/topics/glossary.rst @@ -16,8 +16,8 @@ Glossary A stream of messages from :term:`db worker` to spiders containing new batches of documents to crawl. strategy worker - Special type of worker, running the crawling strategy code: scoring the links, deciding if link needs to be - scheduled (consults :term:`state cache`) and when to stop crawling. That type of worker is sharded. + Special type of worker, running the :term:`crawling strategy` code: scoring the links, deciding if link needs + to be scheduled (consults :term:`state cache`) and when to stop crawling. That type of worker is sharded. db worker Is responsible for communicating with storage DB, and mainly saving metadata and content along with @@ -34,3 +34,7 @@ Glossary spider A process retrieving and extracting content from the Web, using :term:`spider feed` as incoming queue and storing results to :term:`spider log`. In this documentation fetcher is used as synonym. + + crawling strategy + A class containing crawling logic covering seeds addition, processing of downloaded content and scheduling of + new requests to crawl. 
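As a small illustration of the ``DomainMetadata`` mapping interface documented in the backends reference above, a crawling strategy could keep per-domain counters along these lines (a sketch; how the ``domain_metadata`` reference is obtained depends on your backend wiring)::

    # Hypothetical per-domain page counter on top of the dict-like
    # DomainMetadata API (__getitem__/__setitem__/__delitem__/__contains__).
    def count_crawled_page(domain_metadata, netloc):
        record = domain_metadata[netloc] if netloc in domain_metadata else {"pages": 0}
        record["pages"] += 1
        domain_metadata[netloc] = record   # write back so the backend can persist it
        return record["pages"]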
diff --git a/docs/source/topics/installation.rst b/docs/source/topics/installation.rst index 8f4ef86e9..399f6425b 100644 --- a/docs/source/topics/installation.rst +++ b/docs/source/topics/installation.rst @@ -27,6 +27,9 @@ Each option installs dependencies needed for particular functionality. * *zeromq* - ZeroMQ message bus, * *kafka* - Kafka message bus, * *distributed* - workers dependencies. +* *s3* - dependencies required for seeds addition from S3 share, +* *redis* - RedisBackend dependencies, +* *strategies* - built-in crawling strategy dependencies. .. _Python: http://www.python.org .. _pip: http://www.pip-installer.org/en/latest/installing.html diff --git a/docs/source/topics/loggers.rst b/docs/source/topics/loggers.rst index 1714ed778..aced204ab 100644 --- a/docs/source/topics/loggers.rst +++ b/docs/source/topics/loggers.rst @@ -20,6 +20,7 @@ Loggers used * sqlalchemy.states * sqlalchemy.queue * offset-fetcher +* overusedbuffer * messagebus-backend * cf-server * db-worker diff --git a/docs/source/topics/message_bus.rst b/docs/source/topics/message_bus.rst index b5cdfb7d3..95478361e 100644 --- a/docs/source/topics/message_bus.rst +++ b/docs/source/topics/message_bus.rst @@ -2,7 +2,7 @@ Message bus =========== -Message bus ss the transport layer abstraction mechanism. Frontera provides interface and several implementations. +Message bus is the transport layer abstraction mechanism. Frontera provides interface and several implementations. Only one message bus can be used in crawler at the time, and it's selected with :setting:`MESSAGE_BUS` setting. Spiders process can use @@ -31,7 +31,7 @@ components startup to avoid message loss: #. :term:`db worker` #. :term:`strategy worker` -#. :term:`spiders` +#. :term:`spider`s Unfortunately, it's not possible to avoid message loss when stopping running crawler with unfinished crawl. We recommend to use Kafka message bus if your crawler application is sensitive to small message loss. @@ -97,4 +97,4 @@ JSON Module: frontera.contrib.backends.remote.codecs.json -.. _msgpack: http://msgpack.org/index.html \ No newline at end of file +.. _msgpack: http://msgpack.org/index.html diff --git a/docs/source/topics/overview.rst b/docs/source/topics/overview.rst index 619f3964a..1db9ea99e 100644 --- a/docs/source/topics/overview.rst +++ b/docs/source/topics/overview.rst @@ -28,7 +28,7 @@ Here are few cases, external crawl frontier can be suitable for: * URL ordering/queueing isolation from the spider (e.g. distributed cluster of spiders, need of remote management of ordering/queueing), -* URL (meta)data storage is needed (e.g. to demonstrate it's contents somewhere), +* URL (meta)data storage is needed (e.g. to be able to pause and resume the crawl), * advanced URL ordering logic is needed, when it's hard to maintain code within spider/fetcher. @@ -48,31 +48,8 @@ If website is big, and it's expensive to crawl the whole website, Frontera can b the most important documents. -Distributed load, few websites -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If website needs to be crawled faster than single spider one could use distributed spiders mode. In this mode Frontera -is distributing spider processes and using one instance of backend worker. Requests are distributed using -:term:`message bus` of your choice and distribution logic can be adjusted using custom partitioning. By default requests -are distributed to spiders randomly, and desired request rate can be set in spiders. - -Consider also using proxy services, such as `Crawlera`_. 
- - -Revisiting -^^^^^^^^^^ - -There is a set of websites and one need to re-crawl them on timely (or other) manner. Frontera provides simple -revisiting backend, scheduling already visited documents for next visit using time interval set by option. This -backend is using general relational database for persistence and can be used in single process or distributed -spiders modes. - -Watchdog use case - when one needs to be notified about document changes, also could be addressed with such a backend -and minor customization. - - -Broad crawling -^^^^^^^^^^^^^^ +Broad crawling of many websites +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This use case requires full distribution: spiders and backend. In addition to spiders process one should be running :term:`strategy worker` (s) and :term:`db worker` (s), depending on chosen partitioning scheme. diff --git a/docs/source/topics/own_crawling_strategy.rst b/docs/source/topics/own_crawling_strategy.rst deleted file mode 100644 index 6067105eb..000000000 --- a/docs/source/topics/own_crawling_strategy.rst +++ /dev/null @@ -1,26 +0,0 @@ -================= -Crawling strategy -================= - -Use ``cluster`` example and ``frontera.worker.strategies.bfs`` module for reference. In general, you need to write a -crawling strategy class by subclassing: - -.. autoclass:: frontera.worker.strategies.BaseCrawlingStrategy - - **Methods** - - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.from_worker - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.add_seeds - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_crawled - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.page_error - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.finished - .. automethod:: frontera.worker.strategies.BaseCrawlingStrategy.close - - -The class can be put in any module and passed to :term:`strategy worker` using command line option or -:setting:`CRAWLING_STRATEGY` setting on startup. - -The strategy class instantiated in strategy worker, and can use it's own storage or any other kind of resources. All -items from :term:`spider log` will be passed through these methods. Scores returned doesn't have to be the same as in -method arguments. Periodically ``finished()`` method is called to check if crawling goal is achieved. - diff --git a/docs/source/topics/quick-start-distributed.rst b/docs/source/topics/quick-start-distributed.rst index db37edad1..9b5e0577a 100644 --- a/docs/source/topics/quick-start-distributed.rst +++ b/docs/source/topics/quick-start-distributed.rst @@ -4,14 +4,17 @@ Quick start distributed mode Here is a guide how to quickly setup Frontera for single-machine, multiple process, local hacking. We're going to deploy the simpliest possible setup with SQLite and ZeroMQ. Please proceed to :doc:`cluster-setup` article for a -production setup details for broad crawlers. +production setup details. + +Our crawler will have absolute minimum of components needed to work 1 :term:`spider`, 1 :term:`strategy worker` and +1 batch-gen, scoring worker. .. _basic_requirements: Prerequisites ============= -Here is what services needs to be installed and configured before running Frontera: +Here is what needs to be installed and configured before running Frontera: - Python 2.7+ or 3.4+ - Scrapy @@ -41,6 +44,9 @@ settings files, please consult :doc:`settings reference ` to Start cluster ============= + IMPORTANT! 
Because we're using ZeroMQ, and queue is stored in memory the order of the components starting is + important, please follow as described. + First, let's start ZeroMQ broker. :: $ python -m frontera.contrib.messagebus.zeromq.broker @@ -49,25 +55,25 @@ You should see a log output of broker with statistics on messages transmitted. All further commands have to be made from ``general-spider`` root directory. -Second, let's start DB worker. :: +Second, there are Spanish (.es zone) internet URLs from DMOZ directory in general spider repository, let's use them as +seeds to bootstrap crawling:: + + $ python -m frontera.utils.add_seeds --config config.dbw --seeds-file seeds_es_smp.txt - $ python -m frontera.worker.db --config frontier.workersettings +You should notice the log output and message saying that seeds addition is finished. +Third, starting the :term:`strategy worker`:: -You should notice that DB is writing messages to the output. It's ok if nothing is written in ZeroMQ sockets, because -of absence of seed URLs in the system. + $ python -m frontera.worker.strategy --config config.sw -There are Spanish (.es zone) internet URLs from DMOZ directory in general spider repository, let's use them as -seeds to bootstrap crawling. -Starting the spiders: :: +Fourth, starting the Scrapy spider:: - $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SEEDS_SOURCE=seeds_es_smp.txt -s SPIDER_PARTITION_ID=0 - $ scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider_settings -s SPIDER_PARTITION_ID=1 + $ python -m scrapy crawl general +Finally, the DB worker:: -You should end up with 2 spider processes running. Each should read it's own Frontera config, and first one is using -``SEEDS_SOURCE`` option to read seeds to bootstrap Frontera cluster. + $ python -m frontera.worker.db --no-incoming --config config.dbw --partitions 0 -After some time seeds will pass the streams and will be scheduled for downloading by workers. At this moment crawler -is bootstrapped. Now you can periodically check DB worker output or ``metadata`` table contents to see that there is -actual activity. +You should notice in logs that DB worker is trying to generate batches and after a short period the Scrapy is crawling +pages, also check the stats change in ZMQ broker and strategy worker. That's it, crawler is running with default +:term:`crawling strategy`. diff --git a/docs/source/topics/quick-start-single.rst b/docs/source/topics/quick-start-single.rst index f1fe6b39d..7a2f58b33 100644 --- a/docs/source/topics/quick-start-single.rst +++ b/docs/source/topics/quick-start-single.rst @@ -2,8 +2,14 @@ Quick start single process ========================== -1. Create your spider -===================== +The idea is that you develop and debug crawling strategy in single process mode locally and use distributed one when +deploying crawling strategy for crawling in production at scale. Single process is also good as a first step to get +something running quickly. + + Note, that this tutorial doesn't work for :class:`frontera.contrib.backends.memory.MemoryDistributedBackend`. + +1. Create your Scrapy spider +============================ Create your Scrapy project as you usually do. Enter a directory where you’d like to store your code and then run:: @@ -41,22 +47,43 @@ See :doc:`installation`. This article about :doc:`integration with Scrapy ` explains this step in detail. +4. 
Choose your crawling strategy +================================ + +Here are the options you would need to redefine when running in single process mode the crawler configured for +distributed mode:: + + # these two parameters are pointing Frontera that it will run locally -4. Choose your backend + SPIDER_FEED_PARTITIONS = 1 + SPIDER_LOG_PARTITIONS = 1 + + +5. Choose your backend ====================== -Configure frontier settings to use a built-in backend like in-memory BFS:: +Configure frontier settings to use a built-in backend like:: + + BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + - BACKEND = 'frontera.contrib.backends.memory.BFS' +6. Inject the seed URLs +======================= -5. Run the spider +This step is required only if your crawling strategy requires seeds injection from external source.:: + + $ python -m frontera.utils.add_seeds --config [your_frontera_config] --seeds-file [path to your seeds file] + +After script is finished succesfully your seeds should be stored in backend's queue and scheduled for crawling. + +7. Run the spider ================= Run your Scrapy spider as usual from the command line:: scrapy crawl myspider -And that's it! You got your spider running integrated with Frontera. +And that's it! You got your crawler running integrated with Frontera. What else? ========== @@ -81,7 +108,3 @@ Frontera provides many powerful features for making frontier management easy and * Logging facility that you can hook on to for catching errors and debug your frontiers. - - - - diff --git a/docs/source/topics/run-modes.rst b/docs/source/topics/run-modes.rst index a323d8377..3736de412 100644 --- a/docs/source/topics/run-modes.rst +++ b/docs/source/topics/run-modes.rst @@ -7,42 +7,26 @@ A diagram showing architecture of running modes: .. image:: _images/high-level-arc.png -==================== ========================================================================= ====================================================== ===================== -Mode Parent class Components needed Available backends -==================== ========================================================================= ====================================================== ===================== -Single process :class:`Backend ` single process running the crawler Memory, SQLAlchemy -Distributed spiders :class:`Backend ` spiders and single :term:`db worker` Memory, SQLAlchemy -Distributed backends :class:`DistributedBackend ` spiders, :term:`strategy worker` (s) and db worker(s). SQLAlchemy, HBase -==================== ========================================================================= ====================================================== ===================== +==================== ====================================================== +Mode Components needed +==================== ====================================================== +Single process single process running the crawler +Distributed spiders, :term:`strategy worker` (s) and db worker(s). +==================== ====================================================== Single process ============== -Frontera is instantiated in the same process as fetcher (for example in Scrapy). To achieve that use :setting:`BACKEND` -setting set to storage backend subclass of :class:`Backend `. This run mode is -suitable for small number of documents and time non-critical applications. +Frontera is instantiated in the same process as fetcher (for example in Scrapy). Read more on how to use that mode +:doc:`here `. 
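For reference, a minimal Frontera settings module for this mode, combining the options shown in the single-process quick start, might look like the following sketch (the engine URL and strategy path are illustrative)::

    # frontera/settings.py -- single process run mode, all components in one process
    BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed'
    SQLALCHEMYBACKEND_ENGINE = 'sqlite:///frontera.db'   # assumed; any SQLAlchemy URL

    # run everything locally in one process
    SPIDER_FEED_PARTITIONS = 1
    SPIDER_LOG_PARTITIONS = 1

    # built-in default strategy; point this at your own class when ready
    STRATEGY = 'frontera.worker.strategies.basic.BasicCrawlingStrategy'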
-Distributed spiders -=================== +This mode is suitable for developing the crawling strategy locally and for applications where it's critical to fetch +a small number of documents fast. -Spiders are distributed and backend isn't. Backend is running in :term:`db worker` and it's communicating with -spiders using :term:`message bus`. -1. Use :setting:`BACKEND` in spider processes set to - :class:`MessageBusBackend ` -2. In DB worker :setting:`BACKEND` should point to :class:`Backend ` subclasse. -3. Every spider process should have it's own :setting:`SPIDER_PARTITION_ID`, starting from 0 to - :setting:`SPIDER_FEED_PARTITIONS`. -4. Both spiders and workers should have it's :setting:`MESSAGE_BUS` setting set to the message bus class of your choice, - and other implementation depending settings. - -This mode is suitable for applications where it's critical to fetch documents fast, at the same time amount of them -is relatively small. - - -Distributed spiders and backend -=============================== +Distributed +=========== Spiders and backend are distributed. Backend is divided on two parts: :term:`strategy worker` and :term:`db worker`. Strategy worker instances are assigned to their own part of :term:`spider log`. @@ -58,6 +42,6 @@ Strategy worker instances are assigned to their own part of :term:`spider log`. 5. Both spiders and workers should have it's :setting:`MESSAGE_BUS` setting set to the message bus class of your choice and selected message bus have to be configured. -Only Kafka message bus can be used in this mode out of the box and SQLAlchemy and HBase distributed backends. +Only the Kafka message bus can be used in this mode out of the box. -This mode is suitable for broad crawling and large amount of pages. +This mode is designed for web-scale crawling of a large number of domains and pages. diff --git a/docs/source/topics/scrapy-integration.rst b/docs/source/topics/scrapy-integration.rst index cbe196b6c..7069667e7 100644 --- a/docs/source/topics/scrapy-integration.rst +++ b/docs/source/topics/scrapy-integration.rst @@ -2,8 +2,22 @@ Using the Frontier with Scrapy ============================== -Using Frontera is quite easy, it includes a set of `Scrapy middlewares`_ and Scrapy scheduler that encapsulates -Frontera usage and can be easily configured using `Scrapy settings`_. +To use Frontera with Scrapy, you will need to add the Frontera `Scrapy middlewares`_ and replace the default Scrapy scheduler with +the custom Frontera scheduler. Both can be done by modifying `Scrapy settings`_. + + +The purpose +=========== + +Scrapy is expected to be used as a fetching, HTML parsing and link extracting component. Your spider code has + to produce responses and requests from extracted links. That's all. Frontera's business is to store the links, queue them +and schedule them when needed. + +Please make sure all the middlewares affecting the crawl, like DepthMiddleware, OffsiteMiddleware or +RobotsTxtMiddleware, are disabled. + +All other use cases, where Scrapy is busy generating items, scraping data from HTML, or scheduling links directly to bypass +Frontera, are doomed to cause countless hours of maintenance. Please don't use Frontera integrated with Scrapy that way. Activating the frontier @@ -32,7 +46,6 @@ Create a Frontera ``settings.py`` file and add it to your Scrapy settings:: Another option is to put these settings right into Scrapy settings module.
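As a reference, a sketch of the Scrapy-side wiring is shown below. The scheduler and downloader middleware paths are assumptions based on Frontera's contrib package layout (only the spider middleware appears in this changeset), the ``FRONTERA_SETTINGS`` value is an example, and priorities may need adjusting for your project::

    # Scrapy settings.py -- hooking Frontera into Scrapy (sketch)
    SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

    SPIDER_MIDDLEWARES = {
        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
    }
    DOWNLOADER_MIDDLEWARES = {
        'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
    }

    # module with your Frontera settings (example path)
    FRONTERA_SETTINGS = 'my_scrapy_project.frontera.settings'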
- Organizing files ================ @@ -43,8 +56,6 @@ When using frontier with a Scrapy project, we propose the following directory st frontera/ __init__.py settings.py - middlewares.py - backends.py spiders/ ... __init__.py These are basically: - ``my_scrapy_project/frontera/settings.py``: the Frontera settings file. -- ``my_scrapy_project/frontera/middlewares.py``: the middlewares used by the Frontera. -- ``my_scrapy_project/frontera/backends.py``: the backend(s) used by the Frontera. - ``my_scrapy_project/spiders``: the Scrapy spiders folder - ``my_scrapy_project/settings.py``: the Scrapy settings file - ``scrapy.cfg``: the Scrapy config file @@ -103,33 +112,67 @@ Writing Scrapy spider Spider logic ------------ -Creation of basic Scrapy spider is described at `Quick start single process`_ page. -It's also a good practice to prevent spider from closing because of insufficiency of queued requests transport::: +Creation of a new Scrapy project is described on the `Quick start single process`_ page. Again, your spider code has + to produce responses and requests from extracted links. Also, make sure exceptions caused by request processing are +not intercepted by any of the middlewares. Otherwise, error delivery to the :term:`crawling strategy` will be broken. + +Here is some example code to start from:: - @classmethod - def from_crawler(cls, crawler, *args, **kwargs): - spider = cls(*args, **kwargs) - spider._set_crawler(crawler) - spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle) - return spider + from scrapy import Spider + from scrapy.linkextractors import LinkExtractor + from scrapy.http import Request + from scrapy.http.response.html import HtmlResponse + + class CommonPageSpider(Spider): + + name = "commonpage" + + def __init__(self, *args, **kwargs): + super(CommonPageSpider, self).__init__(*args, **kwargs) + self.le = LinkExtractor() + + def parse(self, response): + if not isinstance(response, HtmlResponse): + return + for link in self.le.extract_links(response): + r = Request(url=link.url) + r.meta.update(link_text=link.text) + yield r - def spider_idle(self): - self.log("Spider idle signal caught.") - raise DontCloseSpider Configuration guidelines ------------------------ -There several tunings you can make for efficient broad crawling. +Please specify a correct user agent string to disclose yourself to webmasters:: + + USER_AGENT = 'Some-Bot (+http://url/to-the-page-describing-the-purpose-of-crawling)' + -Adding one of seed loaders for bootstrapping of crawling process:: +When using Frontera, robots.txt obeying has to be implemented in the :term:`crawling strategy`:: + + ROBOTSTXT_OBEY = False + +Disable some of the spider and downloader middlewares which may affect the crawling:: SPIDER_MIDDLEWARES.update({ - 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, + 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None, + 'scrapy.spidermiddlewares.referer.RefererMiddleware': None, + 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': None, + 'scrapy.spidermiddlewares.depth.DepthMiddleware': None, + 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': None }) + DOWNLOADER_MIDDLEWARES.update({ + 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': None, + }) + + del DOWNLOADER_MIDDLEWARES_BASE['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] + + +There are several tunings you can make for efficient broad crawling.
+ Various settings suitable for broad crawling:: HTTPCACHE_ENABLED = False # Turns off disk cache, which has low hit ratio during broad crawls @@ -160,65 +203,4 @@ Check also `Scrapy broad crawling`_ recommendations. .. _`Quick start single process`: http://frontera.readthedocs.org/en/latest/topics/quick-start-single.html .. _`Scrapy broad crawling`: http://doc.scrapy.org/en/master/topics/broad-crawls.html - - -Scrapy Seed Loaders -=================== - -Frontera has some built-in Scrapy middlewares for seed loading. - -Seed loaders use the ``process_start_requests`` method to generate requests from a source that are added later to the -:class:`FrontierManager `. - - -Activating a Seed loader ------------------------- - -Just add the Seed Loader middleware to the ``SPIDER_MIDDLEWARES`` scrapy settings:: - - SPIDER_MIDDLEWARES.update({ - 'frontera.contrib.scrapy.middlewares.seeds.FileSeedLoader': 650 - }) - - -.. _seed_loader_file: - -FileSeedLoader --------------- - -Load seed URLs from a file. The file must be formatted contain one URL per line:: - - http://www.asite.com - http://www.anothersite.com - ... - -Yo can disable URLs using the ``#`` character:: - - ... - #http://www.acommentedsite.com - ... - -**Settings**: - -- ``SEEDS_SOURCE``: Path to the seeds file - - -.. _seed_loader_s3: - -S3SeedLoader ------------- - -Load seeds from a file stored in an Amazon S3 bucket - -File format should the same one used in :ref:`FileSeedLoader `. - -Settings: - -- ``SEEDS_SOURCE``: Path to S3 bucket file. eg: ``s3://some-project/seed-urls/`` - -- ``SEEDS_AWS_ACCESS_KEY``: S3 credentials Access Key - -- ``SEEDS_AWS_SECRET_ACCESS_KEY``: S3 credentials Secret Access Key - - .. _`Scrapy Middleware doc`: http://doc.scrapy.org/en/latest/topics/spider-middleware.html diff --git a/docs/source/topics/strategies.rst b/docs/source/topics/strategies.rst new file mode 100644 index 000000000..f5743aa1c --- /dev/null +++ b/docs/source/topics/strategies.rst @@ -0,0 +1,44 @@ +=================== +Crawling strategies +=================== + +Basic +===== + +Location: :class:`frontera.strategy.basic.BasicCrawlingStrategy` + +Designed to showcase the minimum amount of code needed to implement working :term:`crawling strategy`. It reads the seed +URLs, schedules all of them and crawls indefinitely all links that is discovered during the crawl. + +Used for testing purposes too. + + +Breadth-first +============= + +Location: :class:`frontera.strategy.depth.BreadthFirstCrawlingStrategy` + +Starts with seed URLs provided and prioritizes links depending on their distance from seed page. The bigger the distance, +the lower the priority. This will cause close pages to be crawled first. + + +Depth-first +=========== + +Location: :class:`frontera.strategy.depth.DepthFirstCrawlingStrategy` + +The same as breadth-first, but prioritization is opposite: the bigger the distance the higher the priority. Thus, +crawling deeper links first. + + +Discovery +========= + +Location: :class:`frontera.strategy.discovery.Discovery` + +This crawling strategy is used for crawling and discovery of websites in the Web. It respects robots.txt rules, +follows sitemap.xml and has a limit on a number of pages to crawl from every website. It will also skip the website in +case of fatal errors like connection reset or dns resolution errors. 
There are two settings used to configure it + +* :setting:`DISCOVERY_MAX_PAGES`, +* :setting:`USER_AGENT` diff --git a/docs/source/topics/tests.rst b/docs/source/topics/tests.rst index 92aa601cb..7e678a607 100644 --- a/docs/source/topics/tests.rst +++ b/docs/source/topics/tests.rst @@ -133,35 +133,6 @@ You can define the following test:: ... -Testing basic algorithms -======================== - -If your backend uses any of the :ref:`basic algorithms logics `, you can just -inherit the correponding test base class for each logic and sequences will be automatically tested for it:: - - from tests import backends - - - class TestMyBackendFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendFIFO' - - - class TestMyBackendLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendLIFO' - - - class TestMyBackendDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendDFS' - - - class TestMyBackendBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendBFS' - - - class TestMyBackendRANDOM(backends.RANDOMBackendTest): - backend_class = 'frontera.contrib.backends.abackend.MyBackendRANDOM' - - .. _pytest: http://pytest.org/latest/ diff --git a/examples/cluster/bc/broadcrawl/__init__.py b/examples/cluster/bc/broadcrawl/__init__.py deleted file mode 100644 index 6c22dcbcd..000000000 --- a/examples/cluster/bc/broadcrawl/__init__.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -from frontera.core.components import States -from frontera.worker.strategies import BaseCrawlingStrategy -from frontera.contrib.backends.hbase import HBaseBackend -from cachetools import LRUCache -from msgpack import packb, unpackb -import logging -from datetime import timedelta - -from six.moves.urllib import parse as urlparse -import six - - -class DomainCache(LRUCache): - def __init__(self, maxsize, connection, table_name): - super(DomainCache, self).__init__(maxsize) - self.table = connection.table(table_name) - - def popitem(self): - key, value = super(DomainCache, self).popitem() - self._store_item(self.table, key, value) - - def __missing__(self, key): - row = self.table.row(key) - if not row: - super(DomainCache, self).__missing__(key) - raise KeyError - value = {} - for k, v in row.iteritems(): - cf, _, col = k.partition(':') - value[col] = unpackb(v) - self.__setitem__(key, value) - return value - - def _store_item(self, batch, key, value): - data = {} - assert isinstance(value, dict) - for k, v in six.iteritems(value): - data["m:%s" % k] = packb(v) - batch.put(key, data) - - def flush(self): - with self.table.batch() as b: - for k, v in six.iteritems(self): - self._store_item(b, k, v) - - -class BCPerHostLimit(BaseCrawlingStrategy): - - def __init__(self, manager, mb_stream, states_context): - settings = manager.settings - backend = manager.backend - assert isinstance(backend, HBaseBackend), "This strategy supports HBaseBackend only." 
- self.conn = backend.connection - self.domain_cache = DomainCache(10000, self.conn, "domain_metadata") - self.max_pages_per_hostname = settings.get("MAX_PAGES_PER_HOSTNAME") - assert self.max_pages_per_hostname is not None - self.logger = logging.getLogger("bcperhostlimit-strategy") - super(BCPerHostLimit, self).__init__(manager, mb_stream, states_context) - - @classmethod - def from_worker(cls, manager, mb_scheduler, states_context): - return cls(manager, mb_scheduler, states_context) - - def add_seeds(self, seeds): - self._schedule_and_count(seeds) - - def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED - domain = self._get_domain_bucket(response.url) - domain['cp'] = domain.get('cp', 0)+1 - - def links_extracted(self, request, links): - self._schedule_and_count(links) - - def page_error(self, request, error): - request.meta['state'] = States.ERROR - self.schedule(request, score=0.0, dont_queue=True) - - def _schedule_and_count(self, links): - counts = dict() - for link in links: - if link.meta[b'state'] is not States.NOT_CRAWLED: - continue - link.meta[b'state'] = States.QUEUED - url_parts = urlparse.urlparse(link.url) - if not url_parts.hostname: - continue - hostname = url_parts.hostname - if hostname not in counts: - domain = self.domain_cache.setdefault(hostname, {}) - counts[hostname] = domain.get('sc', 0) - if counts[hostname] >= self.max_pages_per_hostname: - self.logger.debug("Reached per host limit for URL %s, " - "already scheduled %d of %d allowed.", link.url, counts[hostname], - self.max_pages_per_hostname) - continue - path_parts = url_parts.path.split('/') - score = 0.5 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) - self.schedule(link, score) - counts[hostname] += 1 - if counts[hostname] == self.max_pages_per_hostname: - self.logger.info("Reached per host limit for domain %s (%d)", hostname, self.max_pages_per_hostname) - - for hostname, count in six.iteritems(counts): - domain = self.domain_cache.setdefault(hostname, {}) - domain['sc'] = domain.get('sc', 0)+count - - def _get_domain_bucket(self, url): - parsed = urlparse.urlsplit(url) - hostname, _, _ = parsed.netloc.partition(':') - return self.domain_cache.setdefault(hostname, {}) - - def close(self): - self.domain_cache.flush() - super(BCPerHostLimit, self).close() diff --git a/examples/cluster/bc/spiders/bc.py b/examples/cluster/bc/spiders/bc.py index ac3ee3bf3..98914e372 100644 --- a/examples/cluster/bc/spiders/bc.py +++ b/examples/cluster/bc/spiders/bc.py @@ -4,6 +4,7 @@ from scrapy.http.response.html import HtmlResponse from scrapy.linkextractors import LinkExtractor from scrapy import signals +from scrapy.exceptions import DontCloseSpider class BCSpider(Spider): name = 'bc' diff --git a/examples/general-spider/README.md b/examples/general-spider/README.md deleted file mode 100644 index bb1cb17d9..000000000 --- a/examples/general-spider/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# general-spider -A general spider with links extraction for Frontera diff --git a/examples/general-spider/config/__init__.py b/examples/general-spider/config/__init__.py new file mode 100644 index 000000000..01aa296ae --- /dev/null +++ b/examples/general-spider/config/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +SPIDER_FEED_PARTITIONS = 1 +SPIDER_LOG_PARTITIONS = 1 +LOCAL_MODE=False \ No newline at end of file diff --git a/examples/general-spider/config/dbw.py b/examples/general-spider/config/dbw.py new file mode 100644 index 000000000..4f48a6fe4 --- 
/dev/null +++ b/examples/general-spider/config/dbw.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from config.worker import * + +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///queue.sqlite' + + + diff --git a/examples/general-spider/frontier/single.py b/examples/general-spider/config/single.py similarity index 90% rename from examples/general-spider/frontier/single.py rename to examples/general-spider/config/single.py index f86c135bd..c7208a706 100644 --- a/examples/general-spider/frontier/single.py +++ b/examples/general-spider/config/single.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- -import logging +from __future__ import absolute_import + BACKEND = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage.sqlite' diff --git a/examples/general-spider/frontier/spider_settings.py b/examples/general-spider/config/spider.py similarity index 89% rename from examples/general-spider/frontier/spider_settings.py rename to examples/general-spider/config/spider.py index 2e35ff6cb..8f6d0a816 100644 --- a/examples/general-spider/frontier/spider_settings.py +++ b/examples/general-spider/config/spider.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from frontera.settings.default_settings import MIDDLEWARES +from config import * MAX_NEXT_REQUESTS = 256 DELAY_ON_EMPTY = 5.0 @@ -13,6 +15,3 @@ # Crawl frontier backend #-------------------------------------------------------- BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' -SPIDER_FEED_PARTITIONS = 2 - - diff --git a/examples/general-spider/config/sw.py b/examples/general-spider/config/sw.py new file mode 100644 index 000000000..06afb1808 --- /dev/null +++ b/examples/general-spider/config/sw.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from config.worker import * + +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///strategy.sqlite' + + + diff --git a/examples/general-spider/frontier/workersettings.py b/examples/general-spider/config/worker.py similarity index 63% rename from examples/general-spider/frontier/workersettings.py rename to examples/general-spider/config/worker.py index fa0e59adf..fe850d316 100644 --- a/examples/general-spider/frontier/workersettings.py +++ b/examples/general-spider/config/worker.py @@ -1,22 +1,18 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import from frontera.settings.default_settings import MIDDLEWARES +from config import * MAX_NEXT_REQUESTS = 512 -SPIDER_FEED_PARTITIONS = 2 -SPIDER_LOG_PARTITIONS = 1 #-------------------------------------------------------- # Url storage #-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.sqlalchemy.SQLAlchemyBackend' -#BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' +BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' -SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage_dist.sqlite' SQLALCHEMYBACKEND_ENGINE_ECHO = False -SQLALCHEMYBACKEND_DROP_ALL_TABLES = True -SQLALCHEMYBACKEND_CLEAR_CONTENT = True from datetime import timedelta SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=3) diff --git a/examples/general-spider/frontier/__init__.py b/examples/general-spider/frontier/__init__.py deleted file mode 100644 index 7c68785e9..000000000 --- a/examples/general-spider/frontier/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/examples/general-spider/general/settings.py b/examples/general-spider/general/settings.py index e29b425e7..54d192353 
100644 --- a/examples/general-spider/general/settings.py +++ b/examples/general-spider/general/settings.py @@ -5,10 +5,9 @@ NEWSPIDER_MODULE = 'general.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'topic (+http://www.yourdomain.com)' +USER_AGENT = 'Frontera-based example bot (+https://github.com/scrapinghub/frontera)' SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000, 'scrapy.spidermiddleware.depth.DepthMiddleware': None, 'scrapy.spidermiddleware.offsite.OffsiteMiddleware': None, @@ -46,3 +45,5 @@ REACTOR_THREADPOOL_MAXSIZE = 32 DNS_TIMEOUT = 180 +FRONTERA_SETTINGS = 'config.spider' +HTTPERROR_ALLOW_ALL = True \ No newline at end of file diff --git a/examples/general-spider/logging.conf b/examples/general-spider/logging.conf index 9fd40f7eb..cab082e0c 100644 --- a/examples/general-spider/logging.conf +++ b/examples/general-spider/logging.conf @@ -5,20 +5,20 @@ keys=root,dbw,sw keys=hand01 [formatters] -keys=form01,form02 +keys=form01 [logger_root] -level=DEBUG +level=INFO handlers=hand01 [logger_dbw] -level=DEBUG +level=INFO handlers=hand01 qualname=db-worker propagate=0 [logger_sw] -level=DEBUG +level=INFO handlers=hand01 qualname=strategy-worker propagate=0 @@ -27,11 +27,8 @@ propagate=0 class=StreamHandler level=NOTSET args=(sys.stdout,) -formatter=form02 +formatter=form01 [formatter_form01] format=%(asctime)s %(levelname)-8s %(name)-15s %(message)s -[formatter_form02] -format=%(log_color)s %(asctime)s %(levelname)-8s %(name)-15s %(message)s -class=colorlog.ColoredFormatter diff --git a/frontera/__init__.py b/frontera/__init__.py index 8f97ddc24..fe024fd87 100644 --- a/frontera/__init__.py +++ b/frontera/__init__.py @@ -1,5 +1,4 @@ from __future__ import absolute_import -from .core.manager import FrontierManager from .core.models import Request, Response from .core.components import Backend, DistributedBackend, Middleware from .settings import Settings diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 2dc89a1ee..40a96afc6 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -1,86 +1 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import -from collections import OrderedDict - -from frontera import Backend -from frontera.core.components import States - - -class CommonBackend(Backend): - """ - A simpliest possible backend, performing one-time crawl: if page was crawled once, it will not be crawled again. 
- """ - component_name = 'Common Backend' - - @classmethod - def from_manager(cls, manager): - return cls(manager) - - def frontier_start(self): - self.metadata.frontier_start() - self.queue.frontier_start() - self.states.frontier_start() - self.queue_size = self.queue.count() - - def frontier_stop(self): - self.metadata.frontier_stop() - self.queue.frontier_stop() - self.states.frontier_stop() - - def add_seeds(self, seeds): - for seed in seeds: - seed.meta[b'depth'] = 0 - self.metadata.add_seeds(seeds) - self.states.fetch([seed.meta[b'fingerprint'] for seed in seeds]) - self.states.set_states(seeds) - self._schedule(seeds) - self.states.update_cache(seeds) - - def _schedule(self, requests): - batch = [] - queue_incr = 0 - for request in requests: - schedule = True if request.meta[b'state'] in [States.NOT_CRAWLED, States.ERROR, None] else False - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, schedule)) - if schedule: - queue_incr += 1 - request.meta[b'state'] = States.QUEUED - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += queue_incr - - def _get_score(self, obj): - return obj.meta.get(b'score', 1.0) - - def get_next_requests(self, max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions - batch = [] - for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) - self.queue_size -= len(batch) - return batch - - def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED - self.states.update_cache(response) - self.metadata.page_crawled(response) - - def links_extracted(self, request, links): - to_fetch = OrderedDict() - for link in links: - to_fetch[link.meta[b'fingerprint']] = link - link.meta[b'depth'] = request.meta.get(b'depth', 0)+1 - self.states.fetch(to_fetch.keys()) - self.states.set_states(links) - unique_links = to_fetch.values() - self.metadata.links_extracted(request, unique_links) - self._schedule(unique_links) - self.states.update_cache(unique_links) - - def request_error(self, request, error): - request.meta[b'state'] = States.ERROR - self.metadata.request_error(request, error) - self.states.update_cache(request) - - def finished(self): - return self.queue_size == 0 diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase/__init__.py similarity index 57% rename from frontera/contrib/backends/hbase.py rename to frontera/contrib/backends/hbase/__init__.py index 8f60cb6d3..11db9d889 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase/__init__.py @@ -1,18 +1,19 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.utils.url import parse_domain_from_url_fast +from __future__ import absolute_import, division from frontera import DistributedBackend from frontera.core.components import Metadata, Queue, States from frontera.core.models import Request from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.utils.misc import chunks, get_crc32 +from frontera.utils.misc import chunks, get_crc32, time_elapsed from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder +from frontera.contrib.backends.hbase.domaincache import DomainCache from happybase import Connection -from msgpack import Unpacker, Packer +from msgpack import Unpacker, Packer, packb import six from six.moves import range from w3lib.util import to_bytes +from cachetools import LRUCache 
from struct import pack, unpack from datetime import datetime @@ -21,10 +22,9 @@ from binascii import hexlify, unhexlify from io import BytesIO from random import choice -from collections import Iterable +from collections import defaultdict, Iterable import logging - _pack_functions = { 'url': to_bytes, 'depth': lambda x: pack('>I', 0), @@ -32,9 +32,11 @@ 'status_code': lambda x: pack('>H', x), 'state': lambda x: pack('>B', x), 'error': to_bytes, - 'domain_fingerprint': to_bytes, + 'domain_fprint': to_bytes, 'score': lambda x: pack('>f', x), - 'content': to_bytes + 'content': to_bytes, + 'headers': packb, + 'dest_fprint': to_bytes } @@ -62,11 +64,28 @@ def utcnow_timestamp(): return timegm(d.timetuple()) -class HBaseQueue(Queue): +class LRUCacheWithStats(LRUCache): + """Extended version of standard LRUCache with counting stats.""" + + EVICTED_STATNAME = 'states.cache.evicted' + + def __init__(self, stats=None, *args, **kwargs): + super(LRUCacheWithStats, self).__init__(*args, **kwargs) + self._stats = stats + if self._stats is not None: + self._stats.setdefault(self.EVICTED_STATNAME, 0) + + def popitem(self): + key, val = super(LRUCacheWithStats, self).popitem() + if self._stats: + self._stats[self.EVICTED_STATNAME] += 1 + return key, val + +class HBaseQueue(Queue): GET_RETRIES = 3 - def __init__(self, connection, partitions, table_name, drop=False): + def __init__(self, connection, partitions, table_name, drop=False, use_snappy=False): self.connection = connection self.partitions = [i for i in range(0, partitions)] self.partitioner = Crc32NamePartitioner(self.partitions) @@ -79,10 +98,14 @@ def __init__(self, connection, partitions, table_name, drop=False): tables.remove(self.table_name) if self.table_name not in tables: - self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}}) + schema = {'f': {'max_versions': 1}} + if use_snappy: + schema['f']['compression'] = 'SNAPPY' + self.connection.create_table(self.table_name, schema) class DumbResponse: pass + self.decoder = Decoder(Request, DumbResponse) self.encoder = Encoder(Request) @@ -97,12 +120,7 @@ def schedule(self, batch): now = int(time()) for fprint, score, request, schedule in batch: if schedule: - if b'domain' not in request.meta: # TODO: this have to be done always by DomainMiddleware, - # so I propose to require DomainMiddleware by HBaseBackend and remove that code - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) - request.meta[b'domain'] = {'name': hostname} + assert b'domain' in request.meta timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now to_schedule.setdefault(timestamp, []).append((request, score)) for timestamp, batch in six.iteritems(to_schedule): @@ -127,6 +145,7 @@ def _schedule(self, batch, timestamp): :param batch: iterable of Request objects :return: """ + def get_interval(score, resolution): if score < 0.0 or score > 1.0: raise OverflowError @@ -141,15 +160,19 @@ def get_interval(score, resolution): for request, score in batch: domain = request.meta[b'domain'] fingerprint = request.meta[b'fingerprint'] - if type(domain) == dict: + slot = request.meta.get(b'slot') + if slot is not None: + partition_id = self.partitioner.partition(slot, self.partitions) + key_crc32 = get_crc32(slot) + elif type(domain) == dict: partition_id = self.partitioner.partition(domain[b'name'], self.partitions) - host_crc32 = 
get_crc32(domain[b'name']) + key_crc32 = get_crc32(domain[b'name']) elif type(domain) == int: partition_id = self.partitioner.partition_by_hash(domain, self.partitions) - host_crc32 = domain + key_crc32 = domain else: - raise TypeError("domain of unknown type.") - item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score) + raise TypeError("partitioning key and info isn't provided") + item = (unhexlify(fingerprint), key_crc32, self.encoder.encode_request(request), score) score = 1 - score # because of lexicographical sort in HBase rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), random_str) data.setdefault(rk, []).append((score, item)) @@ -185,9 +208,9 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): :return: list of :class:`Request ` objects. """ min_requests = kwargs.pop('min_requests') - min_hosts = kwargs.pop('min_hosts') - max_requests_per_host = kwargs.pop('max_requests_per_host') - assert(max_n_requests > min_requests) + min_hosts = kwargs.pop('min_hosts', None) + max_requests_per_host = kwargs.pop('max_requests_per_host', None) + assert (max_n_requests > min_requests) table = self.connection.table(self.table_name) meta_map = {} @@ -195,9 +218,10 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): limit = min_requests tries = 0 count = 0 - prefix = '%d_' % partition_id - now_ts = int(time()) - filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) + prefix = to_bytes('%d_' % partition_id) + # now_ts = int(time()) + # TODO: figure out how to use filter here, Thrift filter above causes full scan + # filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 @@ -206,26 +230,33 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): meta_map.clear() queue.clear() count = 0 - for rk, data in table.scan(limit=int(limit), batch_size=256, filter=filter): - for cq, buf in six.iteritems(data): - if cq == b'f:t': - continue - stream = BytesIO(buf) - unpacker = Unpacker(stream) - for item in unpacker: - fprint, host_crc32, _, _ = item - if host_crc32 not in queue: - queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + # XXX pypy hot-fix: non-exhausted generator must be closed manually + # otherwise "finally" piece in table.scan() method won't be executed + # immediately to properly close scanner (http://pypy.org/compat.html) + scan_gen = table.scan(limit=int(limit), batch_size=256, row_prefix=prefix, sorted_columns=True) + try: + for rk, data in scan_gen: + for cq, buf in six.iteritems(data): + if cq == b'f:t': continue - queue[host_crc32].append(fprint) - count += 1 - - if fprint not in meta_map: - meta_map[fprint] = [] - meta_map[fprint].append((rk, item)) - if count > max_n_requests: - break + stream = BytesIO(buf) + unpacker = Unpacker(stream) + for item in unpacker: + fprint, key_crc32, _, _ = item + if key_crc32 not in queue: + queue[key_crc32] = [] + if max_requests_per_host is not None and len(queue[key_crc32]) > max_requests_per_host: + continue + queue[key_crc32].append(fprint) + count += 1 + + if fprint not in meta_map: + meta_map[fprint] = [] + meta_map[fprint].append((rk, item)) + if count > max_n_requests: + break + finally: + scan_gen.close() if min_hosts is not None and len(queue.keys()) < min_hosts: continue @@ -269,46 +300,56 @@ 
def count(self): class HBaseState(States): - - def __init__(self, connection, table_name, cache_size_limit): + def __init__(self, connection, table_name, cache_size_limit, + write_log_size, drop_all_tables): self.connection = connection - self._table_name = table_name + self._table_name = to_bytes(table_name) self.logger = logging.getLogger("hbase.states") - self._state_cache = {} - self._cache_size_limit = cache_size_limit + self._state_batch = self.connection.table( + self._table_name).batch(batch_size=write_log_size) + self._state_stats = defaultdict(int) + self._state_cache = LRUCacheWithStats(maxsize=cache_size_limit, + stats=self._state_stats) + self._state_last_updates = 0 + + tables = set(connection.tables()) + if drop_all_tables and self._table_name in tables: + connection.delete_table(self._table_name, disable=True) + tables.remove(self._table_name) + + if self._table_name not in tables: + schema = {'s': {'max_versions': 1, 'block_cache_enabled': 1, + 'bloom_filter_type': 'ROW', 'in_memory': True, } + } + connection.create_table(self._table_name, schema) def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] - - def put(obj): - self._state_cache[obj.meta[b'fingerprint']] = obj.meta[b'state'] - [put(obj) for obj in objs] + for obj in objs: + fingerprint, state = obj.meta[b'fingerprint'], obj.meta[b'state'] + # prepare & write state change to happybase batch + self._state_batch.put(unhexlify(fingerprint), prepare_hbase_object(state=state)) + # update LRU cache with the state update + self._state_cache[fingerprint] = state + self._state_last_updates += 1 + self._update_batch_stats() def set_states(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] + for obj in objs: + obj.meta[b'state'] = self._state_cache.get(obj.meta[b'fingerprint'], States.DEFAULT) - def get(obj): - fprint = obj.meta[b'fingerprint'] - obj.meta[b'state'] = self._state_cache[fprint] if fprint in self._state_cache else States.DEFAULT - [get(obj) for obj in objs] - - def flush(self, force_clear): - if len(self._state_cache) > self._cache_size_limit: - force_clear = True - table = self.connection.table(self._table_name) - for chunk in chunks(list(self._state_cache.items()), 32768): - with table.batch(transaction=True) as b: - for fprint, state in chunk: - hb_obj = prepare_hbase_object(state=state) - b.put(unhexlify(fprint), hb_obj) - if force_clear: - self.logger.debug("Cache has %d requests, clearing" % len(self._state_cache)) - self._state_cache.clear() + def flush(self): + self._state_batch.send() def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._state_cache] - self.logger.debug("cache size %s" % len(self._state_cache)) - self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) + self._update_cache_stats(hits=len(fingerprints) - len(to_fetch), + misses=len(to_fetch)) + if not to_fetch: + return + self.logger.debug('Fetching %d/%d elements from HBase (cache size %d)', + len(to_fetch), len(fingerprints), len(self._state_cache)) for chunk in chunks(to_fetch, 65536): keys = [unhexlify(fprint) for fprint in chunk] table = self.connection.table(self._table_name) @@ -318,6 +359,24 @@ def fetch(self, fingerprints): state = unpack('>B', cells[b's:state'])[0] self._state_cache[hexlify(key)] = state + def _update_batch_stats(self): + new_batches_count, self._state_last_updates = divmod( + self._state_last_updates, self._state_batch._batch_size) + self._state_stats['states.batches.sent'] += new_batches_count + + def 
_update_cache_stats(self, hits, misses): + total_hits = self._state_stats['states.cache.hits'] + hits + total_misses = self._state_stats['states.cache.misses'] + misses + total = total_hits + total_misses + self._state_stats['states.cache.hits'] = total_hits + self._state_stats['states.cache.misses'] = total_misses + self._state_stats['states.cache.ratio'] = total_hits / total if total else 0 + + def get_stats(self): + stats = self._state_stats.copy() + self._state_stats.clear() + return stats + class HBaseMetadata(Metadata): def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_size, store_content): @@ -329,8 +388,6 @@ def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_si if self._table_name not in tables: schema = {'m': {'max_versions': 1}, - 's': {'max_versions': 1, 'block_cache_enabled': 1, - 'bloom_filter_type': 'ROW', 'in_memory': True, }, 'c': {'max_versions': 1} } if use_snappy: @@ -355,12 +412,22 @@ def add_seeds(self, seeds): obj = prepare_hbase_object(url=seed.url, depth=0, created_at=utcnow_timestamp(), - domain_fingerprint=seed.meta[b'domain'][b'fingerprint']) + domain_fprint=seed.meta[b'domain'][b'fingerprint']) self.batch.put(unhexlify(seed.meta[b'fingerprint']), obj) def page_crawled(self, response): - obj = prepare_hbase_object(status_code=response.status_code, content=response.body) if self.store_content else \ - prepare_hbase_object(status_code=response.status_code) + headers = response.headers + redirect_urls = response.request.meta.get(b'redirect_urls') + redirect_fprints = response.request.meta.get(b'redirect_fingerprints') + if redirect_urls: + for url, fprint in zip(redirect_urls, redirect_fprints): + obj = prepare_hbase_object(url=url, + created_at=utcnow_timestamp(), + dest_fprint=redirect_fprints[-1]) + self.batch.put(fprint, obj) + obj = prepare_hbase_object(status_code=response.status_code, headers=headers, + content=response.body) if self.store_content else \ + prepare_hbase_object(status_code=response.status_code, headers=headers) self.batch.put(unhexlify(response.meta[b'fingerprint']), obj) def links_extracted(self, request, links): @@ -370,16 +437,22 @@ def links_extracted(self, request, links): for link_fingerprint, (link, link_url, link_domain) in six.iteritems(links_dict): obj = prepare_hbase_object(url=link_url, created_at=utcnow_timestamp(), - domain_fingerprint=link_domain[b'fingerprint']) + domain_fprint=link_domain[b'fingerprint']) self.batch.put(link_fingerprint, obj) def request_error(self, request, error): obj = prepare_hbase_object(url=request.url, created_at=utcnow_timestamp(), error=error, - domain_fingerprint=request.meta[b'domain'][b'fingerprint']) + domain_fprint=request.meta[b'domain'][b'fingerprint']) rk = unhexlify(request.meta[b'fingerprint']) self.batch.put(rk, obj) + if b'redirect_urls' in request.meta: + for url, fprint in zip(request.meta[b'redirect_urls'], request.meta[b'redirect_fingerprints']): + obj = prepare_hbase_object(url=url, + created_at=utcnow_timestamp(), + dest_fprint=request.meta[b'redirect_fingerprints'][-1]) + self.batch.put(fprint, obj) def update_score(self, batch): if not isinstance(batch, dict): @@ -410,36 +483,64 @@ def __init__(self, manager): 'host': host, 'port': int(port), 'table_prefix': namespace, - 'table_prefix_separator': ':' + 'table_prefix_separator': ':', + 'timeout': 60000 } if settings.get('HBASE_USE_FRAMED_COMPACT'): kwargs.update({ 'protocol': 'compact', 'transport': 'framed' }) + self.logger.info("Connecting to %s:%d thrift server.", host, port) 
self.connection = Connection(**kwargs) self._metadata = None self._queue = None self._states = None + self._domain_metadata = None + + def _init_states(self, settings): + self._states = HBaseState(connection=self.connection, + table_name=settings.get('HBASE_STATES_TABLE'), + cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), + write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'), + drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES')) + + def _init_queue(self, settings): + self._queue = HBaseQueue(self.connection, self.queue_partitions, + settings.get('HBASE_QUEUE_TABLE'), drop=settings.get('HBASE_DROP_ALL_TABLES'), + use_snappy=settings.get('HBASE_USE_SNAPPY')) + + def _init_metadata(self, settings): + self._metadata = HBaseMetadata(self.connection, settings.get('HBASE_METADATA_TABLE'), + settings.get('HBASE_DROP_ALL_TABLES'), + settings.get('HBASE_USE_SNAPPY'), + settings.get('HBASE_BATCH_SIZE'), + settings.get('STORE_CONTENT')) + + def _init_domain_metadata(self, settings): + self._domain_metadata = DomainCache(settings.get('HBASE_DOMAIN_METADATA_CACHE_SIZE'), self.connection, + settings.get('HBASE_DOMAIN_METADATA_TABLE'), + batch_size=settings.get('HBASE_DOMAIN_METADATA_BATCH_SIZE')) @classmethod def strategy_worker(cls, manager): o = cls(manager) - settings = manager.settings - o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'), - settings.get('HBASE_STATE_CACHE_SIZE_LIMIT')) + o._init_states(manager.settings) + o._init_domain_metadata(manager.settings) return o @classmethod def db_worker(cls, manager): o = cls(manager) - settings = manager.settings - drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES') - o._queue = HBaseQueue(o.connection, o.queue_partitions, - settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables) - o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables, - settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'), - settings.get('STORE_CONTENT')) + o._init_queue(manager.settings) + o._init_metadata(manager.settings) + return o + + @classmethod + def local(cls, manager): + o = cls(manager) + o._init_queue(manager.settings) + o._init_states(manager.settings) return o @property @@ -454,13 +555,17 @@ def queue(self): def states(self): return self._states + @property + def domain_metadata(self): + return self._domain_metadata + def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: - if component: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: + if component is not None: component.frontier_start() def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_stop() self.connection.close() @@ -481,16 +586,20 @@ def finished(self): raise NotImplementedError def get_next_requests(self, max_next_requests, **kwargs): - next_pages = [] self.logger.debug("Querying queue table.") - partitions = set(kwargs.pop('partitions', [])) - for partition_id in range(0, self.queue_partitions): - if partition_id not in partitions: - continue - results = self.queue.get_next_requests(max_next_requests, partition_id, - min_requests=self._min_requests, - min_hosts=self._min_hosts, - max_requests_per_host=self._max_requests_per_host) - next_pages.extend(results) - self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) - return next_pages + results = [] + for 
partition_id in set(kwargs.pop('partitions', [i for i in range(self.queue_partitions)])): + requests = self.queue.get_next_requests( + max_next_requests, partition_id, + min_requests=self._min_requests, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host) + results.extend(requests) + self.logger.debug("Got %d requests for partition id %d", len(requests), partition_id) + return results + + def get_stats(self): + stats = {} + if self._states: + stats.update(self._states.get_stats()) + return stats diff --git a/frontera/contrib/backends/hbase/domaincache.py b/frontera/contrib/backends/hbase/domaincache.py new file mode 100644 index 000000000..8729e4ff7 --- /dev/null +++ b/frontera/contrib/backends/hbase/domaincache.py @@ -0,0 +1,313 @@ +from __future__ import absolute_import + +import logging +from collections import defaultdict +from time import time + +import six +from msgpack import packb, unpackb +from w3lib.util import to_bytes, to_native_str + +from frontera.core.components import DomainMetadata +from frontera.contrib.backends.hbase.utils import HardenedBatch +from frontera.utils.msgpack import restruct_for_pack + +import collections +from cachetools import Cache + + +DEFAULT_HBASE_THRIFT_FRAME_SIZE = 2097152 + + +class LRUCache(Cache): + """Least Recently Used (LRU) cache implementation.""" + + def __init__(self, maxsize, getsizeof=None): + Cache.__init__(self, maxsize, getsizeof=getsizeof) + self.__order = collections.OrderedDict() + + def __getitem__(self, key, cache_getitem=Cache.__getitem__): + value = cache_getitem(self, key) + self._update_order(key) + return value + + def __setitem__(self, key, value, cache_setitem=Cache.__setitem__): + cache_setitem(self, key, value) + self._update_order(key) + + def __delitem__(self, key, cache_delitem=Cache.__delitem__): + cache_delitem(self, key) + del self.__order[key] + + def popitem(self): + """Remove and return the `(key, value)` pair least recently used.""" + try: + key = next(iter(self.__order)) + except StopIteration: + raise KeyError('%s is empty' % self.__class__.__name__) + else: + return (key, self.pop(key)) + + if hasattr(collections.OrderedDict, 'move_to_end'): + def _update_order(self, key): + try: + self.__order.move_to_end(key) + except KeyError: + self.__order[key] = None + else: + def _update_order(self, key): + try: + self.__order[key] = self.__order.pop(key) + except KeyError: + self.__order[key] = None + + +class DomainCache(LRUCache, DomainMetadata): + """ + This is an implementation of Domain metadata cache backed by HBase table. It's main purpose is to store the domain + metadata in Python-friendly structures while providing fast and reliable access. + The container has these features: + * LRU logic, + * two generations, second generation is used for evicted items when HBase batch isn't full, + * batched HBase writes, + * Python 3 and PyPy ready. + + This container has these limitations: + 1. value is always of dict type + 2. data in value cannot be bigger than MAX_VALUE_SIZE (which is usually ~2Mb), Otherwise fields will be dropped + with error message + 3. 255 > len(key) > 0 + 4. key and keys within value dict are always of native string type + 5. all keys are utf-8 strings. + 6. iterator of this container iterates only on first generation content. 
+ """ + + MAX_VALUE_SIZE = int(DEFAULT_HBASE_THRIFT_FRAME_SIZE * 0.95) + LOG_INTERVAL = 60.0 + + def __init__(self, maxsize, connection, table_name, set_fields=None, on_get_func=None, batch_size=100): + super(DomainCache, self).__init__(maxsize) + + self._second_gen = dict() + + table_name = to_bytes(table_name) + self._table = self._get_domain_table(connection, table_name) + self._batch = HardenedBatch(self._table, batch_size=batch_size) + self._set_fields = set(set_fields) if set_fields else set() + self._on_get_func = on_get_func + + self.logger = logging.getLogger("domain-cache") + self.stats = defaultdict(int) + self.next_log = time() + self.LOG_INTERVAL + self.batch_size = batch_size + + # Primary methods + + def __setitem__(self, key, value): + self._key_check(key) + assert isinstance(value, dict) + super(DomainCache, self).__setitem__(key, value) + + def __getitem__(self, key): + self._key_check(key) + try: + value = Cache.__getitem__(self, key) + except KeyError: + try: + value = self._second_gen[key] + except KeyError: + try: + value = self._get_item(key) + except KeyError as ke3: + raise ke3 + else: + self.__setitem__(key, value) + else: + self.__setitem__(key, value) + if key in self._second_gen: # the second gen clean up could be triggered during set in first gen + del self._second_gen[key] + else: + self._update_order(key) + return value + + def __delitem__(self, key): + self._key_check(key) + not_found = True + if super(DomainCache, self).__contains__(key): + super(DomainCache, self).__delitem__(key) + not_found = False + if key in self._second_gen: + del self._second_gen[key] + not_found = False + rk = to_bytes(key) + if self._table.row(rk): + self._table.delete(rk) + not_found = False + if not_found: + raise KeyError + + def __contains__(self, key): + self._key_check(key) + self.stats["contains"] += 1 + if super(DomainCache, self).__contains__(key): + self.stats["contains_in_memory"] += 1 + return True + if key in self._second_gen: + self.stats["contains_in_secgen"] += 1 + return True + if self._table.row(to_bytes(key)): + self.stats["contains_in_hbase"] += 1 + return True + self.stats["contains_false"] += 1 + return False + + def popitem(self): + """ + Called every time item is evicted by LRU cache + """ + key, value = super(DomainCache, self).popitem() + self._second_gen[key] = value + self.stats["pops"] += 1 + if len(self._second_gen) >= self.batch_size: + self._flush_second_gen() + self._second_gen.clear() + self.stats["flushes"] += 1 + + # These methods aren't meant to be implemented + + def __missing__(self, key): + raise KeyError + + __len__ = None + + def __bool__(self): + return True + + clear = None + + maxsize = None + + # Secondary methods, those that are depend on primary + + def get(self, key, default=None): + """ + HBase-optimized get + """ + self._key_check(key) + self._log_and_rotate_stats() + if super(DomainCache, self).__contains__(key) or key in self._second_gen: + self.stats["gets_memory_hit"] += 1 + return self[key] + try: + value = self._get_item(key) + except KeyError: + self.stats["gets_miss"] += 1 + return default + else: + self.stats["gets_hbase_hit"] += 1 + return value + + def setdefault(self, key, default=None): + """ + HBase-optimized setdefault + """ + self._key_check(key) + self.stats["gets"] += 1 + self._log_and_rotate_stats() + if super(DomainCache, self).__contains__(key) or key in self._second_gen: + value = self[key] + self.stats["gets_memory_hit"] += 1 + else: + try: + value = self._get_item(key) + except KeyError: + 
self.stats["gets_miss"] += 1 + value = default + else: + self.stats["gets_hbase_hit"] += 1 + self[key] = value + return value + + def flush(self): + for k, v in six.iteritems(self): + try: + self._store_item_batch(k, v) + except Exception: + self.logger.exception("Error storing kv pair %s, %s", k, v) + pass + self._flush_second_gen() + self._batch.send() + + # private + + def _flush_second_gen(self): + for key, value in six.iteritems(self._second_gen): + self._store_item_batch(key, value) + self._batch.send() + + def _log_and_rotate_stats(self): + if not self.logger.isEnabledFor(logging.DEBUG): + return + if time() > self.next_log: + for k, v in self.stats.items(): + self.logger.debug("%s = %d", k, v) + self.next_log = time() + self.LOG_INTERVAL + self.stats = defaultdict(int) + + def _get_domain_table(self, connection, table_name): + tables = set(connection.tables()) + if table_name not in tables: + schema = {'m': {'max_versions': 1}} + connection.create_table(table_name, schema) + return connection.table(table_name) + + def _get_item(self, key): + self.stats["hbase_gets"] += 1 + hbase_key = to_bytes(key) + row = self._table.row(hbase_key) + if not row: + self.stats["hbase_misses"] += 1 + super(DomainCache, self).__missing__(key) + raise KeyError + value = {} + for k, v in six.iteritems(row): + cf, _, col = k.partition(b':') + col = to_native_str(col) + value[col] = unpackb(v, encoding='utf-8') + # XXX extract some fields as a set for faster in-checks + if col in self._set_fields: + value[col] = set(value[col]) + if self._on_get_func: + self._on_get_func(value) + return value + + def _store_item_batch(self, key, value): + data = {} + self._key_check(key) + for k, v in six.iteritems(value): + if k.startswith('_'): + continue + # convert set to list manually for successful serialization + v = restruct_for_pack(v) + k = to_bytes(k) + data[b"m:%s" % k] = packb(v, use_bin_type=True) + tries = 3 + while data and tries > 0: + try: + self._batch.put(key, data) + except ValueError: + self.logger.exception("Exception happened during item storing, %d tries left", tries) + data_lengths = dict((k, len(v)) for k, v in six.iteritems(data)) + self.logger.info("RK %s per-column lengths %s", key, str(data_lengths)) + for k, length in data_lengths.items(): + if length > self.MAX_VALUE_SIZE: + self.logger.info("Dropping key %s", k) + del data[k] + tries -= 1 + continue + else: + break + + def _key_check(self, key): + if len(key) == 0 or len(key) > 255: + raise KeyError("Key cannot be empty or longer than 255 chars") \ No newline at end of file diff --git a/frontera/contrib/backends/hbase/utils.py b/frontera/contrib/backends/hbase/utils.py new file mode 100644 index 000000000..6dc862e92 --- /dev/null +++ b/frontera/contrib/backends/hbase/utils.py @@ -0,0 +1,22 @@ +from __future__ import absolute_import +from happybase import Batch + +from thriftpy2.transport import TTransportException +import logging + + +class HardenedBatch(Batch): + def __init__(self, table, timestamp=None, batch_size=None, + transaction=False, wal=True): + super(HardenedBatch, self).__init__(table, timestamp=timestamp, batch_size=batch_size, transaction=transaction, + wal=wal) + self.logger = logging.getLogger("happybase.batch") + + def send(self): + try: + super(HardenedBatch, self).send() + except TTransportException: + self.logger.exception("Exception happened during batch persistence") + self.logger.warning("Cleaning up the batch") + self._reset_mutations() + pass diff --git a/frontera/contrib/backends/memory/__init__.py 
b/frontera/contrib/backends/memory/__init__.py index e96cd29a8..dda33851c 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -1,16 +1,13 @@ from __future__ import absolute_import -import logging -import random -from collections import deque, Iterable -from frontera.contrib.backends import CommonBackend -from frontera.core.components import Metadata, Queue, States -from frontera.core import OverusedBuffer -from frontera.utils.heap import Heap +from collections import Iterable + +import logging +import six from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.core.components import Metadata, Queue, States, DistributedBackend +from frontera.utils.heap import Heap from frontera.utils.url import parse_domain_from_url_fast -import six -from six.moves import map from six.moves import range @@ -82,45 +79,6 @@ def _compare_pages(self, first, second): return cmp(first.meta[b'_scr'], second.meta[b'_scr']) -class MemoryDequeQueue(Queue): - def __init__(self, partitions, is_fifo=True): - """ - Deque-based queue (see collections module). Efficient queue for LIFO and FIFO strategies. - :param partitions: int count of partitions - :param type: bool, True for FIFO, False for LIFO - """ - self.partitions = [i for i in range(0, partitions)] - self.partitioner = Crc32NamePartitioner(self.partitions) - self.logger = logging.getLogger("memory.dequequeue") - self.queues = {} - self.is_fifo = is_fifo - for partition in self.partitions: - self.queues[partition] = deque() - - def count(self): - return sum([len(h) for h in six.itervalues(self.queues)]) - - def get_next_requests(self, max_n_requests, partition_id, **kwargs): - batch = [] - pop_op = self.queues[partition_id].popleft if self.is_fifo else self.queues[partition_id].pop - while max_n_requests > 0 and self.queues[partition_id]: - batch.append(pop_op()) - max_n_requests -= 1 - return batch - - def schedule(self, batch): - for fprint, score, request, schedule in batch: - if schedule: - request.meta[b'_scr'] = score - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) - partition_id = self.partitions[0] - else: - partition_id = self.partitioner.partition(hostname, self.partitions) - self.queues[partition_id].append(request) - - class MemoryStates(States): def __init__(self, cache_size_limit): @@ -146,117 +104,71 @@ def set_states(self, objs): def fetch(self, fingerprints): pass - def flush(self, force_clear=False): + def flush(self): if len(self._cache) > self._cache_size_limit: - force_clear = True - if force_clear: self.logger.debug("Cache has %d items, clearing", len(self._cache)) self._cache.clear() -class MemoryBaseBackend(CommonBackend): - """ - Base class for in-memory heapq Backend objects. 
- """ - component_name = 'Memory Base Backend' - +class MemoryDistributedBackend(DistributedBackend): def __init__(self, manager): - self.manager = manager settings = manager.settings - self._metadata = MemoryMetadata() - self._states = MemoryStates(settings.get("STATE_CACHE_SIZE")) - self._queue = self._create_queue(settings) - self._id = 0 + self._states = MemoryStates(1000) + self._queue = MemoryQueue(settings.get('SPIDER_FEED_PARTITIONS')) + self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS') + self._domain_metadata = dict() + + def add_seeds(self, seeds): + pass + + def page_crawled(self, response): + pass + + def request_error(self, page, error): + pass + + def finished(self): + pass + + def links_extracted(self, request, links): + pass @property def metadata(self): return self._metadata + @property + def queue(self): + return self._queue + @property def states(self): return self._states @property - def queue(self): - return self._queue + def domain_metadata(self): + return self._domain_metadata + + def get_next_requests(self, max_n_requests, **kwargs): + next_pages = [] + partitions = set(kwargs.pop('partitions', [])) + for partition_id in range(0, self.queue_partitions): + if partition_id not in partitions: + continue + results = self.queue.get_next_requests(max_n_requests, partition_id) + next_pages.extend(results) + self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) + return next_pages @classmethod - def from_manager(cls, manager): + def strategy_worker(cls, manager): return cls(manager) - def _create_queue(self, settings): - return MemoryQueue(1) - - def add_seeds(self, seeds): - for seed in seeds: - seed.meta[b'id'] = self._id - self._id += 1 - super(MemoryBaseBackend, self).add_seeds(seeds) - - def links_extracted(self, request, links): - for link in links: - link.meta[b'id'] = self._id - self._id += 1 - super(MemoryBaseBackend, self).links_extracted(request, links) - - def finished(self): - return self.queue.count() == 0 - - -class MemoryDFSQueue(MemoryQueue): - def _compare_pages(self, first, second): - return cmp((second.meta[b'depth'], first.meta[b'id']), - (first.meta[b'depth'], second.meta[b'id'])) - - -class MemoryBFSQueue(MemoryQueue): - def _compare_pages(self, first, second): - return cmp((first.meta[b'depth'], first.meta[b'id']), - (second.meta[b'depth'], second.meta[b'id'])) - - -class MemoryRandomQueue(MemoryQueue): - def _compare_pages(self, first, second): - return random.choice([-1, 0, 1]) - - -class MemoryFIFOBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryLIFOBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS'), is_fifo=False) - - -class MemoryDFSBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryDFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryBFSBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryBFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryRandomBackend(MemoryBaseBackend): - def _create_queue(self, settings): - return MemoryRandomQueue(settings.get('SPIDER_FEED_PARTITIONS')) - - -class MemoryDFSOverusedBackend(MemoryDFSBackend): - def __init__(self, manager): - super(MemoryDFSOverusedBackend, self).__init__(manager) - self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests) - - def get_next_requests(self, 
max_next_requests, **kwargs): - return self.overused_buffer.get_next_requests(max_next_requests, **kwargs) + @classmethod + def db_worker(cls, manager): + return cls(manager) + @classmethod + def local(cls, manager): + return cls(manager) -BASE = MemoryBaseBackend -FIFO = MemoryFIFOBackend -LIFO = MemoryLIFOBackend -DFS = MemoryDFSBackend -BFS = MemoryBFSBackend -RANDOM = MemoryRandomBackend diff --git a/frontera/contrib/backends/partitioners.py b/frontera/contrib/backends/partitioners.py index 5b425c20e..b038000f6 100644 --- a/frontera/contrib/backends/partitioners.py +++ b/frontera/contrib/backends/partitioners.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from struct import unpack -from binascii import unhexlify from frontera.core.components import Partitioner +from cityhash import CityHash64 from frontera.utils.misc import get_crc32 @@ -27,9 +26,8 @@ class FingerprintPartitioner(Partitioner): def partition(self, key, partitions=None): if not partitions: partitions = self.partitions - digest = unhexlify(key[0:2] + key[5:7] + key[10:12] + key[15:17]) - value = unpack(" now_ts: + continue + if host_crc32 not in queue: + queue[host_crc32] = [] + if max_requests_per_host is not None and len(queue[host_crc32]) >= max_requests_per_host: + continue + queue[host_crc32].append(item) + if len(queue[host_crc32]) > max_host_items: + max_host_items = len(queue[host_crc32]) + count += 1 + to_remove.append(data) + if count >= max_n_requests: + break + return start, count, max_host_items + + def get_next_requests(self, max_n_requests, partition_id, **kwargs): + """ + Fetch new batch from priority queue. + :param max_n_requests: maximum number of requests + :param partition_id: partition id to get batch from + :param min_hosts: minimum number of hosts + :param max_requests_per_host: maximum number of requests per host + :return: list of :class:`Request ` objects. 
+ """ + max_requests_per_host = kwargs.pop('max_requests_per_host') + min_hosts = kwargs.pop('min_hosts') + queue = {} + count = 0 + now_ts = int(time()) + max_host_items = 0 + to_remove = [] + start = 0 + last_start = -1 + while (count < max_n_requests or len(queue) < min_hosts) and last_start < start: + last_start = start + start, subset_count, max_host_items = self._get_items( + partition_id, start, now_ts, queue, max_requests_per_host, max_host_items, count, + max_n_requests, to_remove) + count += subset_count + + self._logger.debug("Finished: hosts {}, requests {}".format(len(queue.keys()), count)) + + results = [] + for host_crc32, items in queue.items(): + for item in items: + (_, _, _, encoded, score) = item + to_remove.append(packb(item)) + request = self._decoder.decode_request(encoded) + request.meta[FIELD_SCORE] = score + results.append(request) + if len(to_remove) > 0: + self._redis.zrem(partition_id, *to_remove) + return results + + def schedule(self, batch): + to_schedule = dict() + now = int(time()) + for fprint, score, request, schedule in batch: + if schedule: + # TODO: This is done by DomainMiddleware - RedisBackend should require DomainMiddleware + if FIELD_DOMAIN not in request.meta: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + if not hostname: + self._logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) + request.meta[FIELD_DOMAIN] = {'name': hostname} + timestamp = request.meta[FIELD_CRAWL_AT] if FIELD_CRAWL_AT in request.meta else now + to_schedule.setdefault(timestamp, []).append((request, score)) + for timestamp, batch in to_schedule.items(): + self._schedule(batch, timestamp) + + @classmethod + def get_interval_start(cls, score): + if score < cls.MIN_SCORE or score > cls.MAX_SCORE: + raise OverflowError + i = int(score / cls.SCORE_STEP) + if i % 10 == 0 and i > 0: + i -= 1 # last interval is inclusive from right + return i * cls.SCORE_STEP + + def _schedule(self, batch, timestamp): + data = dict() + for request, score in batch: + domain = request.meta[FIELD_DOMAIN] + fingerprint = request.meta[FIELD_FINGERPRINT] + if type(domain) == dict: + partition_id = self._partitioner.partition(domain[FIELD_NAME], self._partitions) + host_crc32 = get_crc32(domain[FIELD_NAME]) + elif type(domain) == int: + partition_id = self._partitioner.partition_by_hash(domain, self._partitions) + host_crc32 = domain + else: + raise TypeError("domain of unknown type.") + item = (timestamp, fingerprint, host_crc32, self._encoder.encode_request(request), score) + interval_start = self.get_interval_start(score) + data.setdefault(partition_id, {})[packb(item)] = int(interval_start * 100) + for (key, items) in data.items(): + self._redis_pipeline.zadd(key, mapping=items) + self._redis_pipeline.execute() + + def count(self): + return sum([self._redis.zcard(partition_id) for partition_id in self._partitions]) + + def frontier_start(self): + pass + + def frontier_stop(self): + pass + + +class RedisState(States): + def __init__(self, pool, cache_size_limit): + self._redis = RedisOperation(pool) + self._redis_pipeline = RedisPipeline(pool) + self._cache = {} + self._cache_size_limit = cache_size_limit + self._logger = logging.getLogger("redis_backend.states") + + def update_cache(self, objs): + objs = objs if isinstance(objs, Iterable) else [objs] + + def put(obj): + self._cache[obj.meta[FIELD_FINGERPRINT]] = obj.meta[FIELD_STATE] + + [put(obj) for obj in objs] + + def set_states(self, objs): + objs = objs if isinstance(objs, Iterable) else 
[objs] + + def get(obj): + fprint = obj.meta[FIELD_FINGERPRINT] + obj.meta[FIELD_STATE] = self._cache[fprint] if fprint in self._cache else States.DEFAULT + + [get(obj) for obj in objs] + + def flush(self, force_clear=False): + if len(self._cache) > self._cache_size_limit: + force_clear = True + [self._redis_pipeline.hmset(fprint, {FIELD_STATE: state}) for (fprint, state) in self._cache.items()] + self._redis_pipeline.execute() + if force_clear: + self._logger.debug("Cache has %d requests, clearing" % len(self._cache)) + self._cache.clear() + + def fetch(self, fingerprints): + to_fetch = [f for f in fingerprints if f not in self._cache] + self._logger.debug("cache size %s" % len(self._cache)) + self._logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) + [self._redis_pipeline.hgetall(key) for key in to_fetch] + responses = self._redis_pipeline.execute() + for index, key in enumerate(to_fetch): + response = responses[index] + if len(response) > 0 and FIELD_STATE in response: + self._cache[key] = response[FIELD_STATE] + else: + self._cache[key] = self.NOT_CRAWLED + + def frontier_start(self): + pass + + def frontier_stop(self): + self.flush(False) + + +class RedisMetadata(Metadata): + def __init__(self, pool, delete_all_keys): + self._redis = RedisOperation(pool) + self._redis_pipeline = RedisPipeline(pool) + self._logger = logging.getLogger("redis_backend.metadata") + if delete_all_keys: + self._redis.flushdb() + + @classmethod + def timestamp(cls): + return str(datetime.utcnow().replace(microsecond=0)) + + def _create_seed(self, seed): + return { + FIELD_URL: seed.url, + FIELD_DEPTH: 0, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: seed.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + + def add_seeds(self, seeds): + [self._redis_pipeline.hmset(seed.meta[FIELD_FINGERPRINT], self._create_seed(seed)) for seed in seeds] + self._redis_pipeline.execute() + + def _create_request_error(self, page, error): + return { + FIELD_URL: page.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_ERROR: error, + FIELD_DOMAIN_FINGERPRINT: page.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + + def request_error(self, page, error): + self._redis.hmset(page.meta[FIELD_FINGERPRINT], self._create_request_error(page, error)) + + @staticmethod + def _create_crawl_info(response): + return { + FIELD_STATUS_CODE: response.status_code + } + + def page_crawled(self, response): + self._redis.hmset(response.meta[FIELD_FINGERPRINT], self._create_crawl_info(response)) + + def _create_link_extracted(self, link): + return { + FIELD_URL: link.url, + FIELD_CREATED_AT: self.timestamp(), + FIELD_DOMAIN_FINGERPRINT: link.meta[FIELD_DOMAIN][FIELD_FINGERPRINT] + } + + def links_extracted(self, _, links): + links_deduped = {} + for link in links: + link_fingerprint = link.meta[FIELD_FINGERPRINT] + if link_fingerprint in links_deduped: + continue + links_deduped[link_fingerprint] = link + [self._redis_pipeline.hmset(fingerprint, self._create_link_extracted(link)) for (fingerprint, link) in + links_deduped.items()] + self._redis_pipeline.execute() + + def frontier_start(self): + pass + + def frontier_stop(self): + pass + + +class RedisBackend(DistributedBackend): + component_name = 'Redis Backend' + + def __init__(self, manager): + self.manager = manager + self._logger = logging.getLogger("redis_backend.backend") + settings = manager.settings + port = settings.get('REDIS_PORT') + host = settings.get('REDIS_HOST') + self._min_hosts = settings.get('BC_MIN_HOSTS') + self._max_requests_per_host = 
settings.get('BC_MAX_REQUESTS_PER_HOST') + + self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS') + self._logger.info("RedisBackend started with {} partitions".format(self.queue_partitions)) + self.pool = ConnectionPool(host=host, port=port, db=0) + self._metadata = None + self._queue = None + self._states = None + + @classmethod + def strategy_worker(cls, manager): + o = cls(manager) + o._init(manager, "strategy_worker") + return o + + @classmethod + def db_worker(cls, manager): + o = cls(manager) + o._init(manager, "db_worker") + return o + + @classmethod + def local(cls, manager): + o = cls(manager) + o._init(manager) + return o + + def _init(self, manager, typ="all"): + settings = manager.settings + if typ in ["strategy_worker", "all"]: + self._states = RedisState(self.pool, settings.get('REDIS_STATE_CACHE_SIZE_LIMIT')) + if typ in ["db_worker", "all"]: + clear = settings.get('REDIS_DROP_ALL_TABLES') + self._queue = RedisQueue(manager, self.pool, self.queue_partitions, delete_all_keys=clear) + self._metadata = RedisMetadata( + self.pool, + clear + ) + + @property + def metadata(self): + return self._metadata + + @property + def queue(self): + return self._queue + + @property + def states(self): + return self._states + + def frontier_start(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_start() + + def frontier_stop(self): + for component in [self.metadata, self.queue, self.states]: + if component: + component.frontier_stop() + self.pool.disconnect() + + def add_seeds(self, seeds): + self.metadata.add_seeds(seeds) + + def page_crawled(self, response): + self.metadata.page_crawled(response) + + def links_extracted(self, request, links): + self.metadata.links_extracted(request, links) + + def request_error(self, page, error): + self.metadata.request_error(page, error) + + def finished(self): + raise NotImplementedError + + def get_next_requests(self, max_next_requests, **kwargs): + next_pages = [] + self._logger.debug("Querying queue table.") + partitions = set(kwargs.pop('partitions', [])) + for partition_id in partitions: + results = self.queue.get_next_requests(max_next_requests, partition_id, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host) + next_pages.extend(results) + self._logger.debug("Got %d requests for partition id %d", len(results), partition_id) + return next_pages diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py index 8c7987bef..2c9822950 100644 --- a/frontera/contrib/backends/remote/codecs/json.py +++ b/frontera/contrib/backends/remote/codecs/json.py @@ -3,18 +3,64 @@ """ from __future__ import absolute_import import json +import six from base64 import b64decode, b64encode from frontera.core.codec import BaseDecoder, BaseEncoder -from w3lib.util import to_unicode, to_native_str -from frontera.utils.misc import dict_to_unicode, dict_to_bytes +from w3lib.util import to_unicode, to_bytes + + +def _convert_and_save_type(obj): + """ + :param obj: dict object + + The purpose of this method is to transform the given dict + into a form that would be able to serialize with JSONEncoder. + In order to implement this, this method converts all byte strings + inside a dict to unicode and saves their type for reverting to its + original state. The type and the value are stored as a tuple in the + following format: (original_type, converted value). 
All other objects + like dict, tuple, list are converted to the same format for the sake + of serialization and for the ease of reverting. + Refer `https://github.com/scrapinghub/frontera/pull/233#discussion_r97432868` + for the detailed explanation about the design. + """ + if isinstance(obj, bytes): + return 'bytes', to_unicode(obj) + elif isinstance(obj, dict): + return 'dict', [(_convert_and_save_type(k), _convert_and_save_type(v)) for k, v in six.iteritems(obj)] + elif isinstance(obj, (list, tuple)): + return type(obj).__name__, [_convert_and_save_type(item) for item in obj] + return 'other', obj + + +def _convert_from_saved_type(obj): + """ + :param obj: object returned by `_convert_and_save_type` + + Restores the original state of the object converted + earlier by `_convert_and_save_type`. This method considers every + first element of the nested tuple as the original type information and + the second value to be the converted value. It applies the original type + recursively on the object to retrieve the original form of the object. + """ + assert len(obj) == 2 + obj_type, obj_value = obj + if obj_type == 'bytes': + return to_bytes(obj_value) + elif obj_type == 'dict': + return dict([(_convert_from_saved_type(k), _convert_from_saved_type(v)) for k, v in obj_value]) + elif obj_type in ['list', 'tuple']: + _type = list if obj_type == 'list' else tuple + return _type([_convert_from_saved_type(item) for item in obj_value]) + return obj_value def _prepare_request_message(request): - return {'url': to_unicode(request.url), - 'method': to_unicode(request.method), - 'headers': dict_to_unicode(request.headers), - 'cookies': dict_to_unicode(request.cookies), - 'meta': dict_to_unicode(request.meta)} + return {'url': request.url, + 'method': request.method, + 'headers': request.headers, + 'cookies': request.cookies, + 'meta': request.meta} def _prepare_links_message(links): @@ -22,10 +68,10 @@ def _prepare_links_message(links): def _prepare_response_message(response, send_body): - return {'url': to_unicode(response.url), + return {'url': response.url, 'status_code': response.status_code, - 'meta': dict_to_unicode(response.meta), - 'body': to_unicode(b64encode(response.body)) if send_body else None} + 'meta': response.meta, + 'body': b64encode(response.body) if send_body else None} class CrawlFrontierJSONEncoder(json.JSONEncoder): @@ -45,11 +91,9 @@ def __init__(self, request_model, *a, **kw): self.send_body = kw.pop('send_body', False) super(Encoder, self).__init__(request_model, *a, **kw) - def encode_add_seeds(self, seeds): - return self.encode({ - 'type': 'add_seeds', - 'seeds': [_prepare_request_message(seed) for seed in seeds] - }) + def encode(self, obj): + encoded = _convert_and_save_type(obj) + return super(Encoder, self).encode(encoded) def encode_page_crawled(self, response): return self.encode({ @@ -93,6 +137,12 @@ def encode_offset(self, partition_id, offset): 'offset': int(offset) }) + def encode_stats(self, stats): + return self.encode({ + 'type': 'stats', + 'stats': stats + }) + class Decoder(json.JSONDecoder, BaseDecoder): def __init__(self, request_model, response_model, *a, **kw): @@ -101,52 +151,47 @@ def __init__(self, request_model, response_model, *a, **kw): super(Decoder, self).__init__(*a, **kw) def _response_from_object(self, obj): - url = to_native_str(obj[b'url']) + url = obj['url'] request = self._request_model(url=url, - meta=obj[b'meta']) + meta=obj['meta']) return self._response_model(url=url, - status_code=obj[b'status_code'], - body=b64decode(obj[b'body']), + 
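# Illustrative sketch (not from the diff): the round trip performed by the two helpers
# documented above. Byte strings are tagged and converted to unicode before JSON
# encoding, then restored afterwards, so request/response meta survives serialization.
# The sample dict and the ordering shown in the comment are illustrative only.
import json
from frontera.contrib.backends.remote.codecs.json import (
    _convert_and_save_type, _convert_from_saved_type)

original = {b'fingerprint': b'abc', 'depth': 1, 'scores': (0.5, 0.9)}
tagged = _convert_and_save_type(original)
# ('dict', [(('bytes', 'fingerprint'), ('bytes', 'abc')),
#           (('other', 'depth'), ('other', 1)),
#           (('other', 'scores'), ('tuple', [('other', 0.5), ('other', 0.9)]))])
restored = _convert_from_saved_type(json.loads(json.dumps(tagged)))
assert restored == original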
status_code=obj['status_code'], + body=b64decode(obj['body']) if obj['body'] is not None else None, request=request) def _request_from_object(self, obj): - return self._request_model(url=to_native_str(obj[b'url']), - method=obj[b'method'], - headers=obj[b'headers'], - cookies=obj[b'cookies'], - meta=obj[b'meta']) + return self._request_model(url=obj['url'], + method=obj['method'], + headers=obj['headers'], + cookies=obj['cookies'], + meta=obj['meta']) def decode(self, message): - message = dict_to_bytes(super(Decoder, self).decode(message)) - if message[b'type'] == b'links_extracted': - request = self._request_from_object(message[b'r']) - links = [self._request_from_object(link) for link in message[b'links']] + message = _convert_from_saved_type(super(Decoder, self).decode(message)) + if message['type'] == 'links_extracted': + request = self._request_from_object(message['r']) + links = [self._request_from_object(link) for link in message['links']] return ('links_extracted', request, links) - if message[b'type'] == b'page_crawled': - response = self._response_from_object(message[b'r']) + if message['type'] == 'page_crawled': + response = self._response_from_object(message['r']) return ('page_crawled', response) - if message[b'type'] == b'request_error': - request = self._request_from_object(message[b'r']) - return ('request_error', request, to_native_str(message[b'error'])) - if message[b'type'] == b'update_score': - return ('update_score', self._request_from_object(message[b'r']), message[b'score'], message[b'schedule']) - if message[b'type'] == b'add_seeds': - seeds = [] - for seed in message[b'seeds']: - request = self._request_from_object(seed) - seeds.append(request) - return ('add_seeds', seeds) - if message[b'type'] == b'new_job_id': - return ('new_job_id', int(message[b'job_id'])) - if message[b'type'] == b'offset': - return ('offset', int(message[b'partition_id']), int(message[b'offset'])) - return TypeError('Unknown message type') + if message['type'] == 'request_error': + request = self._request_from_object(message['r']) + return ('request_error', request, message['error']) + if message['type'] == 'update_score': + return ('update_score', self._request_from_object(message['r']), message['score'], message['schedule']) + if message['type'] == 'new_job_id': + return ('new_job_id', int(message['job_id'])) + if message['type'] == 'offset': + return ('offset', int(message['partition_id']), int(message['offset'])) + if message['type'] == 'stats': + return ('stats', message['stats']) + raise TypeError('Unknown message type') def decode_request(self, message): - obj = dict_to_bytes(super(Decoder, self).decode(message)) - return self._request_model(url=to_native_str(obj[b'url']), - method=obj[b'method'], - headers=obj[b'headers'], - cookies=obj[b'cookies'], - meta=obj[b'meta']) - + obj = _convert_from_saved_type(super(Decoder, self).decode(message)) + return self._request_model(url=obj['url'], + method=obj['method'], + headers=obj['headers'], + cookies=obj['cookies'], + meta=obj['meta']) \ No newline at end of file diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 701f61732..0bb1bdc2d 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -3,67 +3,48 @@ """ from __future__ import absolute_import -from msgpack import packb, unpackb - from frontera.core.codec import BaseDecoder, BaseEncoder -import six +from frontera.utils.msgpack import 
restruct_for_pack +from msgpack import packb, unpackb from w3lib.util import to_native_str def _prepare_request_message(request): - def serialize(obj): - """Recursively walk object's hierarchy.""" - if isinstance(obj, six.text_type): - return obj.encode('utf8') - if isinstance(obj, (bool, six.integer_types, float, six.binary_type)): - return obj - elif isinstance(obj, dict): - obj = obj.copy() - for key in obj: - obj[key] = serialize(obj[key]) - return obj - elif isinstance(obj, list): - return [serialize(item) for item in obj] - elif isinstance(obj, tuple): - return tuple(serialize([item for item in obj])) - elif hasattr(obj, '__dict__'): - return serialize(obj.__dict__) - else: - return None - return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)] + return [request.url, request.method, request.headers, request.cookies, restruct_for_pack(request.meta)] def _prepare_response_message(response, send_body): - return [response.url, response.status_code, response.meta, response.body if send_body else None] + return [response.url, response.status_code, response.meta, response.headers, response.body if send_body else None] class Encoder(BaseEncoder): def __init__(self, request_model, *a, **kw): self.send_body = True if 'send_body' in kw and kw['send_body'] else False - def encode_add_seeds(self, seeds): - return packb([b'as', [_prepare_request_message(seed) for seed in seeds]]) - def encode_page_crawled(self, response): - return packb([b'pc', _prepare_response_message(response, self.send_body)]) + return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True) def encode_links_extracted(self, request, links): - return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]]) + return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]], + use_bin_type=True) def encode_request_error(self, request, error): - return packb([b're', _prepare_request_message(request), str(error)]) + return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True) def encode_request(self, request): - return packb(_prepare_request_message(request)) + return packb(_prepare_request_message(request), use_bin_type=True) def encode_update_score(self, request, score, schedule): - return packb([b'us', _prepare_request_message(request), score, schedule]) + return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True) def encode_new_job_id(self, job_id): - return packb([b'njid', int(job_id)]) + return packb([b'njid', int(job_id)], use_bin_type=True) def encode_offset(self, partition_id, offset): - return packb([b'of', int(partition_id), int(offset)]) + return packb([b'of', int(partition_id), int(offset)], use_bin_type=True) + + def encode_stats(self, stats): + return packb([b'st', stats], use_bin_type=True) class Decoder(BaseDecoder): @@ -75,7 +56,8 @@ def _response_from_object(self, obj): url = to_native_str(obj[0]) return self._response_model(url=url, status_code=obj[1], - body=obj[3], + body=obj[4], + headers=obj[3], request=self._request_model(url=url, meta=obj[2])) @@ -87,7 +69,7 @@ def _request_from_object(self, obj): meta=obj[4]) def decode(self, buffer): - obj = unpackb(buffer) + obj = unpackb(buffer, encoding='utf-8') if obj[0] == b'pc': return ('page_crawled', self._response_from_object(obj[1])) @@ -99,13 +81,15 @@ def decode(self, buffer): return ('update_score', self._request_from_object(obj[1]), obj[2], 
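# Illustrative sketch (not from the diff): the effect of the use_bin_type/encoding pair
# adopted above. Packing with use_bin_type=True keeps bytes and unicode distinct, and
# unpacking with encoding='utf-8' (accepted by msgpack releases that still support that
# argument) restores unicode strings while leaving binary payloads as bytes.
from msgpack import packb, unpackb

payload = packb([b'fingerprint', u'http://example.com/'], use_bin_type=True)
unpackb(payload, encoding='utf-8')   # -> [b'fingerprint', 'http://example.com/']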
obj[3]) if obj[0] == b're': return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2])) - if obj[0] == b'as': - return ('add_seeds', [self._request_from_object(x) for x in obj[1]]) if obj[0] == b'njid': return ('new_job_id', int(obj[1])) if obj[0] == b'of': return ('offset', int(obj[1]), int(obj[2])) - return TypeError('Unknown message type') + if obj[0] == b'st': + return ('stats', obj[1]) + raise TypeError('Unknown message type') def decode_request(self, buffer): - return self._request_from_object(unpackb(buffer)) + return self._request_from_object(unpackb(buffer, encoding='utf-8')) + + diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py index f3827c22a..b9cdf63c9 100644 --- a/frontera/contrib/backends/remote/messagebus.py +++ b/frontera/contrib/backends/remote/messagebus.py @@ -27,7 +27,10 @@ def __init__(self, manager): self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT')) self._logger = logging.getLogger("messagebus-backend") self._buffer = OverusedBuffer(self._get_next_requests, - self._logger.debug) + max_per_key=settings.get('OVERUSED_MAX_PER_KEY'), + keep_per_key=settings.get("OVERUSED_KEEP_PER_KEY"), + max_keys=settings.get('OVERUSED_MAX_KEYS'), + keep_keys=settings.get('OVERUSED_KEEP_KEYS')) self._logger.info("Consuming from partition id %d", self.partition_id) @classmethod @@ -39,12 +42,10 @@ def frontier_start(self): def frontier_stop(self): self.spider_log_producer.flush() + self.consumer.close() def add_seeds(self, seeds): - per_host = aggregate_per_host(seeds) - for host_fprint, host_links in six.iteritems(per_host): - self.spider_log_producer.send(host_fprint, - self._encoder.encode_add_seeds(host_links)) + raise NotImplementedError("The seeds addition using spider log isn't allowed") def page_crawled(self, response): host_fprint = get_host_fprint(response) diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index b8e7b8aa1..9b46d2efe 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -1,108 +1,12 @@ from __future__ import absolute_import -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker -from sqlalchemy.engine.reflection import Inspector - -from frontera.core.components import DistributedBackend -from frontera.contrib.backends import CommonBackend -from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States +from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States, DomainMetadata from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase +from frontera.core.components import DistributedBackend from frontera.utils.misc import load_object - - -class SQLAlchemyBackend(CommonBackend): - def __init__(self, manager): - self.manager = manager - settings = manager.settings - engine = settings.get('SQLALCHEMYBACKEND_ENGINE') - engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO') - drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - models = settings.get('SQLALCHEMYBACKEND_MODELS') - - self.engine = create_engine(engine, echo=engine_echo) - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) - - if drop_all_tables: - DeclarativeBase.metadata.drop_all(self.engine) - DeclarativeBase.metadata.create_all(self.engine) - - self.session_cls = sessionmaker() - 
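# Illustrative sketch (not from the diff): the overused-buffer limits read above by the
# message-bus backend. The setting names come from the settings.get() calls in the diff;
# the values and the per-key/purging semantics in the comments are assumptions.
OVERUSED_MAX_PER_KEY = 100    # presumably the cap on requests buffered per key (host/IP)
OVERUSED_KEEP_PER_KEY = 10    # presumably how many requests per key survive a purge
OVERUSED_MAX_KEYS = 1000      # presumably the cap on distinct keys tracked by the buffer
OVERUSED_KEEP_KEYS = 100      # presumably how many keys survive a purge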
self.session_cls.configure(bind=self.engine) - - if clear_content: - session = self.session_cls() - for name, table in DeclarativeBase.metadata.tables.items(): - session.execute(table.delete()) - session.close() - self._metadata = Metadata(self.session_cls, self.models['MetadataModel'], - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - self._states = States(self.session_cls, self.models['StateModel'], - settings.get('STATE_CACHE_SIZE_LIMIT')) - self._queue = self._create_queue(settings) - - def frontier_stop(self): - super(SQLAlchemyBackend, self).frontier_stop() - self.engine.dispose() - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - @property - def queue(self): - return self._queue - - @property - def metadata(self): - return self._metadata - - @property - def states(self): - return self._states - - -class FIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy FIFO Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created') - - -class LIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy LIFO Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created_desc') - - -class DFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy DFS Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - def _get_score(self, obj): - return -obj.meta[b'depth'] - - -class BFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy BFS Backend' - - def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) - - def _get_score(self, obj): - return obj.meta[b'depth'] - - -BASE = CommonBackend -LIFO = LIFOBackend -FIFO = FIFOBackend -DFS = DFSBackend -BFS = BFSBackend +from sqlalchemy import create_engine +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.orm import sessionmaker class Distributed(DistributedBackend): @@ -119,57 +23,60 @@ def __init__(self, manager): self._metadata = None self._queue = None self._states = None + self._domain_metadata = None + + def check_and_create_tables(self, is_drop, is_clear, models): + inspector = Inspector.from_engine(self.engine) + for model in models: + if is_drop: + if model.__table__.name in inspector.get_table_names(): + model.__table__.drop(bind=self.engine) + if model.__table__.name not in inspector.get_table_names(): + model.__table__.create(bind=self.engine) + if is_clear: + session = self.session_cls() + session.execute(model.__table__.delete()) + session.close() + + def _init_strategy_worker(self, manager): + settings = manager.settings + drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') + clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') + model_states = self.models['StateModel'] + model_dm = self.models['DomainMetadataModel'] + self.check_and_create_tables(drop_all_tables, clear_content, (model_states, model_dm)) + self._states = States(self.session_cls, model_states, + settings.get('STATE_CACHE_SIZE_LIMIT')) + self._domain_metadata = DomainMetadata(self.session_cls) + + def _init_db_worker(self, manager): + settings = manager.settings + drop = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') + clear_content = 
settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') + metadata_m = self.models['MetadataModel'] + queue_m = self.models['QueueModel'] + self.check_and_create_tables(drop, clear_content, (metadata_m, queue_m)) + self._metadata = Metadata(self.session_cls, metadata_m, + settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) + self._queue = Queue(self.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) @classmethod def strategy_worker(cls, manager): b = cls(manager) - settings = manager.settings - drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - model = b.models['StateModel'] - inspector = Inspector.from_engine(b.engine) - - if drop_all_tables: - if model.__table__.name in inspector.get_table_names(): - model.__table__.drop(bind=b.engine) - model.__table__.create(bind=b.engine) - - if clear_content: - session = b.session_cls() - session.execute(model.__table__.delete()) - session.close() - b._states = States(b.session_cls, model, - settings.get('STATE_CACHE_SIZE_LIMIT')) + b._init_strategy_worker(manager) return b @classmethod def db_worker(cls, manager): b = cls(manager) - settings = manager.settings - drop = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - inspector = Inspector.from_engine(b.engine) - - metadata_m = b.models['MetadataModel'] - queue_m = b.models['QueueModel'] - if drop: - existing = inspector.get_table_names() - if metadata_m.__table__.name in existing: - metadata_m.__table__.drop(bind=b.engine) - if queue_m.__table__.name in existing: - queue_m.__table__.drop(bind=b.engine) - metadata_m.__table__.create(bind=b.engine) - queue_m.__table__.create(bind=b.engine) - - if clear_content: - session = b.session_cls() - session.execute(metadata_m.__table__.delete()) - session.execute(queue_m.__table__.delete()) - session.close() - - b._metadata = Metadata(b.session_cls, metadata_m, - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - b._queue = Queue(b.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + b._init_db_worker(manager) + return b + + @classmethod + def local(cls, manager): + b = cls(manager) + b._init_db_worker(manager) + b._init_strategy_worker(manager) return b @property @@ -184,13 +91,17 @@ def metadata(self): def states(self): return self._states + @property + def domain_metadata(self): + return self._domain_metadata + def frontier_start(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_start() def frontier_stop(self): - for component in [self.metadata, self.queue, self.states]: + for component in [self.metadata, self.queue, self.states, self.domain_metadata]: if component: component.frontier_stop() diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py index 8661ac576..160cf7a2d 100644 --- a/frontera/contrib/backends/sqlalchemy/components.py +++ b/frontera/contrib/backends/sqlalchemy/components.py @@ -1,18 +1,19 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -import logging -from datetime import datetime + from time import time, sleep +import logging +import six from cachetools import LRUCache -from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from datetime import datetime from frontera.contrib.backends.memory import MemoryStates -from 
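# Illustrative sketch (not from the diff): the three construction paths exposed by the
# Distributed backend above; "manager" is assumed to be a FrontierManager configured
# with the SQLALCHEMYBACKEND_* settings.
from frontera.contrib.backends.sqlalchemy import Distributed

sw = Distributed.strategy_worker(manager)   # creates/uses the states and domain_metadata tables
dbw = Distributed.db_worker(manager)        # creates/uses the metadata and queue tables
single = Distributed.local(manager)         # single-process mode: both sets of components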
frontera.contrib.backends.sqlalchemy.models import DeclarativeBase -from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase, DomainMetadataModel as DomainMetadataKV +from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue, DomainMetadata as BaseDomainMetadata from frontera.core.models import Request, Response from frontera.utils.misc import get_crc32, chunks from frontera.utils.url import parse_domain_from_url_fast -import six from six.moves import range from w3lib.util import to_native_str, to_bytes @@ -23,6 +24,8 @@ def func_wrapper(self, *args, **kwargs): while True: try: return func(self, *args, **kwargs) + except KeyError as exc: + raise except Exception as exc: self.logger.exception(exc) self.session.rollback() @@ -38,7 +41,7 @@ def func_wrapper(self, *args, **kwargs): class Metadata(BaseMetadata): def __init__(self, session_cls, model_cls, cache_size): - self.session = session_cls(expire_on_commit=False) # FIXME: Should be explicitly mentioned in docs + self.session = session_cls(expire_on_commit=False) self.model = model_cls self.table = DeclarativeBase.metadata.tables['metadata'] self.cache = LRUCache(cache_size) @@ -135,13 +138,13 @@ def fetch(self, fingerprints): self._cache[to_bytes(state.fingerprint)] = state.state @retry_and_rollback - def flush(self, force_clear=False): + def flush(self): for fingerprint, state_val in six.iteritems(self._cache): state = self.model(fingerprint=to_native_str(fingerprint), state=state_val) self.session.merge(state) self.session.commit() self.logger.debug("State cache has been flushed.") - super(States, self).flush(force_clear) + super(States, self).flush() class Queue(BaseQueue): @@ -275,3 +278,39 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): self.session.delete(item) self.session.commit() return results + + +class DomainMetadata(BaseDomainMetadata): + def __init__(self, session_cls): + self.session = session_cls(expire_on_commit=False) + self.table = DeclarativeBase.metadata.tables['domain_metadata'] + self.logger = logging.getLogger("sqlalchemy.domain_metadata") + + def frontier_stop(self): + self.session.close() + + @retry_and_rollback + def __setitem__(self, key, value): + pair = DomainMetadataKV(key=key, value=value) + self.session.merge(pair) + self.session.commit() + + @retry_and_rollback + def __getitem__(self, key): + result = self.session.query(DomainMetadataKV).filter(DomainMetadataKV.key == key).first() + if result is None: + raise KeyError + return result.value + + @retry_and_rollback + def __contains__(self, key): + result = self.session.query(DomainMetadataKV.key).filter(DomainMetadataKV.key == key).first() + if result is not None: + return True + return False + + @retry_and_rollback + def __delitem__(self, key): + self.session.query(DomainMetadataKV).filter(DomainMetadataKV.key == key).delete(synchronize_session=False) + self.session.commit() + diff --git a/frontera/contrib/backends/sqlalchemy/models.py b/frontera/contrib/backends/sqlalchemy/models.py index 8211d21c6..8be5fb0af 100644 --- a/frontera/contrib/backends/sqlalchemy/models.py +++ b/frontera/contrib/backends/sqlalchemy/models.py @@ -90,3 +90,24 @@ def query(cls, session): def __repr__(self): return '' % (self.url, self.id) + + +class DomainMetadataModel(DeclarativeBase): + __tablename__ = 'domain_metadata' + __table_args__ = ( + { + 'mysql_charset': 
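# Illustrative sketch (not from the diff): the dict-like protocol implemented by the
# DomainMetadata component above, exercised against an in-memory SQLite engine; the
# key and value below are illustrative.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from frontera.contrib.backends.sqlalchemy.components import DomainMetadata
from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase

engine = create_engine('sqlite:///:memory:')
DeclarativeBase.metadata.create_all(engine)          # includes the domain_metadata table
session_cls = sessionmaker(bind=engine)

dm = DomainMetadata(session_cls)
dm['example.com'] = {'banned': False, 'queued_pages': 10}   # upsert via session.merge()
assert 'example.com' in dm
assert dm['example.com'] == {'banned': False, 'queued_pages': 10}
del dm['example.com']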
'utf8', + 'mysql_engine': 'InnoDB', + 'mysql_row_format': 'DYNAMIC', + }, + ) + + key = Column(String(256), primary_key=True, nullable=False) + value = Column(PickleType()) + + @classmethod + def query(cls, session): + return session.query(cls) + + def __repr__(self): + return '' % (self.key) \ No newline at end of file diff --git a/frontera/contrib/backends/sqlalchemy/revisiting.py b/frontera/contrib/backends/sqlalchemy/revisiting.py deleted file mode 100644 index b2b574715..000000000 --- a/frontera/contrib/backends/sqlalchemy/revisiting.py +++ /dev/null @@ -1,132 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -import logging -from datetime import datetime, timedelta -from time import time, sleep -from calendar import timegm - -from sqlalchemy import Column, BigInteger - -from frontera import Request -from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.contrib.backends.sqlalchemy import SQLAlchemyBackend -from frontera.contrib.backends.sqlalchemy.models import QueueModelMixin, DeclarativeBase -from frontera.core.components import Queue as BaseQueue, States -from frontera.utils.misc import get_crc32 -from frontera.utils.url import parse_domain_from_url_fast -from six.moves import range - - -def utcnow_timestamp(): - d = datetime.utcnow() - return timegm(d.timetuple()) - - -class RevisitingQueueModel(QueueModelMixin, DeclarativeBase): - __tablename__ = 'revisiting_queue' - - crawl_at = Column(BigInteger, nullable=False) - - -def retry_and_rollback(func): - def func_wrapper(self, *args, **kwargs): - tries = 5 - while True: - try: - return func(self, *args, **kwargs) - except Exception as exc: - self.logger.exception(exc) - self.session.rollback() - sleep(5) - tries -= 1 - if tries > 0: - self.logger.info("Tries left %i" % tries) - continue - else: - raise exc - return func_wrapper - - -class RevisitingQueue(BaseQueue): - def __init__(self, session_cls, queue_cls, partitions): - self.session = session_cls() - self.queue_model = queue_cls - self.logger = logging.getLogger("sqlalchemy.revisiting.queue") - self.partitions = [i for i in range(0, partitions)] - self.partitioner = Crc32NamePartitioner(self.partitions) - - def frontier_stop(self): - self.session.close() - - def get_next_requests(self, max_n_requests, partition_id, **kwargs): - results = [] - try: - for item in self.session.query(self.queue_model).\ - filter(RevisitingQueueModel.crawl_at <= utcnow_timestamp(), - RevisitingQueueModel.partition_id == partition_id).\ - limit(max_n_requests): - method = 'GET' if not item.method else item.method - results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, - cookies=item.cookies)) - self.session.delete(item) - self.session.commit() - except Exception as exc: - self.logger.exception(exc) - self.session.rollback() - return results - - @retry_and_rollback - def schedule(self, batch): - to_save = [] - for fprint, score, request, schedule in batch: - if schedule: - _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) - if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) - partition_id = self.partitions[0] - host_crc32 = 0 - else: - partition_id = self.partitioner.partition(hostname, self.partitions) - host_crc32 = get_crc32(hostname) - schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp() - q = self.queue_model(fingerprint=fprint, score=score, url=request.url, meta=request.meta, - 
headers=request.headers, cookies=request.cookies, method=request.method, - partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6, - crawl_at=schedule_at) - to_save.append(q) - request.meta[b'state'] = States.QUEUED - self.session.bulk_save_objects(to_save) - self.session.commit() - - @retry_and_rollback - def count(self): - return self.session.query(self.queue_model).count() - - -class Backend(SQLAlchemyBackend): - - def _create_queue(self, settings): - self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") - assert isinstance(self.interval, timedelta) - self.interval = self.interval.total_seconds() - return RevisitingQueue(self.session_cls, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) - - def _schedule(self, requests): - batch = [] - for request in requests: - if request.meta[b'state'] in [States.NOT_CRAWLED]: - request.meta[b'crawl_at'] = utcnow_timestamp() - elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: - request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval - else: - continue # QUEUED - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) - self.queue.schedule(batch) - self.metadata.update_score(batch) - self.queue_size += len(batch) - - def page_crawled(self, response): - super(Backend, self).page_crawled(response) - self.states.set_states(response.request) - self._schedule([response.request]) - self.states.update_cache(response.request) diff --git a/frontera/contrib/canonicalsolvers/basic.py b/frontera/contrib/canonicalsolvers/basic.py index 944d8c6c1..1909fdf0a 100644 --- a/frontera/contrib/canonicalsolvers/basic.py +++ b/frontera/contrib/canonicalsolvers/basic.py @@ -31,6 +31,9 @@ def links_extracted(self, request, links): def request_error(self, page, error): self._set_canonical(page) + def create_request(self, request): + self._set_canonical(request) + def _set_canonical(self, obj): if b'redirect_urls' in obj.meta: redirect_urls = obj.meta[b'redirect_urls'] diff --git a/frontera/contrib/messagebus/kafka/__init__.py b/frontera/contrib/messagebus/kafka/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/contrib/messagebus/kafka/__init__.py +++ b/frontera/contrib/messagebus/kafka/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/offsets_fetcher.py similarity index 95% rename from frontera/contrib/messagebus/kafka/async.py rename to frontera/contrib/messagebus/kafka/offsets_fetcher.py index 8ef89f4f0..15630e290 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/offsets_fetcher.py @@ -22,7 +22,7 @@ class OffsetsFetcherAsync(object): 'heartbeat_interval_ms': 3000, 'retry_backoff_ms': 100, 'api_version': (0, 9), - 'metric_group_prefix': '', + 'metric_group_prefix': '' } def __init__(self, **configs): @@ -162,9 +162,9 @@ def offsets(self, partitions, timestamp): Returns: dict: TopicPartition and message offsets """ - while True: + retries = 3 + while retries > 0: offsets = {} - ok = True for future in self._send_offset_request(partitions, timestamp): self._client.poll(future=future) @@ -178,11 +178,14 @@ def offsets(self, partitions, timestamp): if future.exception.invalid_metadata: refresh_future = self._client.cluster.request_update() - self._client.poll(future=refresh_future, sleep=True) - ok = False - break - if ok: + self._client.poll(future=refresh_future) + log.warning("Got exception %s and kept the 
loop", future.exception) + if offsets: return offsets + retries -= 1 + log.warning("Retrying the offsets fetch loop (%d retries left)", retries) + log.error("Unsuccessful offsets retrieval") + return {} def _send_offset_request(self, partitions, timestamp): """Fetch a single offset before the given timestamp for the partition. @@ -201,16 +204,17 @@ def _send_offset_request(self, partitions, timestamp): if node_id is None: log.debug("Partition %s is unknown for fetching offset," " wait for metadata refresh", partition) - return Future().failure(Errors.StaleMetadata(partition)) + return [Future().failure(Errors.StaleMetadata(partition))] elif node_id == -1: log.debug("Leader for partition %s unavailable for fetching offset," " wait for metadata refresh", partition) - return Future().failure(Errors.LeaderNotAvailableError(partition)) + return [Future().failure(Errors.LeaderNotAvailableError(partition))] nodes_per_partitions.setdefault(node_id, []).append(partition) # Client returns a future that only fails on network issues # so create a separate future and attach a callback to update it # based on response error codes + futures = [] for node_id, partitions in six.iteritems(nodes_per_partitions): request = OffsetRequest[0]( @@ -219,8 +223,13 @@ def _send_offset_request(self, partitions, timestamp): future_request = Future() _f = self._client.send(node_id, request) _f.add_callback(self._handle_offset_response, partitions, future_request) - _f.add_errback(lambda e: future_request.failure(e)) + + def errback(e): + log.error("Offset request errback error %s", e) + future_request.failure(e) + _f.add_errback(errback) futures.append(future_request) + return futures def _handle_offset_response(self, partitions, future, response): diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index 490262891..10e5d5b53 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -8,24 +8,40 @@ from kafka import KafkaConsumer, KafkaProducer, TopicPartition from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner -from frontera.contrib.messagebus.kafka.async import OffsetsFetcherAsync +from frontera.contrib.messagebus.kafka.offsets_fetcher import OffsetsFetcherAsync from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseSpiderFeedStream, \ - BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer + BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer, BaseStatsLogStream from twisted.internet.task import LoopingCall from traceback import format_tb +from os.path import join as os_path_join +DEFAULT_BATCH_SIZE = 1024 * 1024 +DEFAULT_BUFFER_MEMORY = 130 * 1024 * 1024 +DEFAULT_MAX_REQUEST_SIZE = 4 * 1024 * 1024 + logger = getLogger("messagebus.kafka") +def _prepare_kafka_ssl_kwargs(cert_path): + """Prepare SSL kwargs for Kafka producer/consumer.""" + return { + 'security_protocol': 'SSL', + 'ssl_cafile': os_path_join(cert_path, 'ca-cert.pem'), + 'ssl_certfile': os_path_join(cert_path, 'client-cert.pem'), + 'ssl_keyfile': os_path_join(cert_path, 'client-key.pem') + } + + class Consumer(BaseStreamConsumer): """ Used in DB and SW worker. SW consumes per partition. 
""" - def __init__(self, location, topic, group, partition_id): + def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id): self._location = location self._group = group self._topic = topic + kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {} self._consumer = KafkaConsumer( bootstrap_servers=self._location, group_id=self._group, @@ -33,33 +49,19 @@ def __init__(self, location, topic, group, partition_id): consumer_timeout_ms=100, client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"), request_timeout_ms=120 * 1000, + heartbeat_interval_ms=10000, + **kwargs ) + # explicitly causing consumer to bootstrap the cluster metadata + self._consumer.topics() + if partition_id is not None: - self._partition_ids = [TopicPartition(self._topic, partition_id)] - self._consumer.assign(self._partition_ids) + self._partitions = [TopicPartition(self._topic, partition_id)] + self._consumer.assign(self._partitions) else: - self._partition_ids = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)] + self._partitions = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)] self._consumer.subscribe(topics=[self._topic]) - if self._consumer._use_consumer_group(): - self._consumer._coordinator.ensure_coordinator_known() - self._consumer._coordinator.ensure_active_group() - - self._consumer._update_fetch_positions(self._partition_ids) - self._start_looping_call() - - def _start_looping_call(self, interval=60): - def errback(failure): - logger.exception(failure.value) - if failure.frames: - logger.critical(str("").join(format_tb(failure.getTracebackObject()))) - self._poll_task.start(interval).addErrback(errback) - - self._poll_task = LoopingCall(self._poll_client) - self._poll_task.start(interval).addErrback(errback) - - def _poll_client(self): - self._consumer._client.poll() def get_messages(self, timeout=0.1, count=1): result = [] @@ -73,33 +75,31 @@ def get_messages(self, timeout=0.1, count=1): return result def get_offset(self, partition_id): - for tp in self._partition_ids: + for tp in self._partitions: if tp.partition == partition_id: return self._consumer.position(tp) raise KeyError("Can't find partition %d", partition_id) def close(self): - self._poll_task.stop() self._consumer.commit() - # getting kafka client event loop running some more and execute commit - tries = 3 - while tries: - self.get_messages() - sleep(2.0) - tries -= 1 self._consumer.close() class SimpleProducer(BaseStreamProducer): - def __init__(self, location, topic, compression): + def __init__(self, location, enable_ssl, cert_path, topic, compression, **kwargs): self._location = location self._topic = topic self._compression = compression - self._create() + self._create(enable_ssl, cert_path, **kwargs) - def _create(self): - self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5, - compression_type=self._compression) + def _create(self, enable_ssl, cert_path, **kwargs): + max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE) + kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}) + self._producer = KafkaProducer(bootstrap_servers=self._location, + retries=5, + compression_type=self._compression, + max_request_size=max_request_size, + **kwargs) def send(self, key, *messages): for msg in messages: @@ -113,13 +113,19 @@ def close(self): class KeyedProducer(BaseStreamProducer): - def __init__(self, location, topic_done, 
partitioner, compression): + def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, compression, **kwargs): self._location = location self._topic_done = topic_done self._partitioner = partitioner self._compression = compression - self._producer = KafkaProducer(bootstrap_servers=self._location, partitioner=partitioner, retries=5, - compression_type=self._compression) + max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE) + kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}) + self._producer = KafkaProducer(bootstrap_servers=self._location, + partitioner=partitioner, + retries=5, + compression_type=self._compression, + max_request_size=max_request_size, + **kwargs) def send(self, key, *messages): for msg in messages: @@ -140,10 +146,14 @@ def __init__(self, messagebus): self._topic = messagebus.topic_done self._codec = messagebus.codec self._partitions = messagebus.spider_log_partitions + self._enable_ssl = messagebus.enable_ssl + self._cert_path = messagebus.cert_path def producer(self): - return KeyedProducer(self._location, self._topic, FingerprintPartitioner(self._partitions), - self._codec) + return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, + FingerprintPartitioner(self._partitions), self._codec, + batch_size=DEFAULT_BATCH_SIZE, + buffer_memory=DEFAULT_BUFFER_MEMORY) def consumer(self, partition_id, type): """ @@ -153,7 +163,7 @@ def consumer(self, partition_id, type): :return: """ group = self._sw_group if type == b'sw' else self._db_group - c = Consumer(self._location, self._topic, group, partition_id) + c = Consumer(self._location, self._enable_ssl, self._cert_path, self._topic, group, partition_id) assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions return c @@ -165,14 +175,23 @@ def __init__(self, messagebus): self._topic = messagebus.topic_todo self._max_next_requests = messagebus.max_next_requests self._hostname_partitioning = messagebus.hostname_partitioning - self._offset_fetcher = OffsetsFetcherAsync(bootstrap_servers=self._location, topic=self._topic, - group_id=self._general_group) + self._enable_ssl = messagebus.enable_ssl + self._cert_path = messagebus.cert_path + kwargs = { + 'bootstrap_servers': self._location, + 'topic': self._topic, + 'group_id': self._general_group, + } + if self._enable_ssl: + kwargs.update(_prepare_kafka_ssl_kwargs(self._cert_path)) + self._offset_fetcher = OffsetsFetcherAsync(**kwargs) self._codec = messagebus.codec self._partitions = messagebus.spider_feed_partitions def consumer(self, partition_id): - c = Consumer(self._location, self._topic, self._general_group, partition_id) - assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions + c = Consumer(self._location, self._enable_ssl, self._cert_path, self._topic, self._general_group, partition_id) + assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions, \ + "Number of kafka topic partitions doesn't match value in config for spider feed" return c def available_partitions(self): @@ -186,7 +205,9 @@ def available_partitions(self): def producer(self): partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \ else FingerprintPartitioner(self._partitions) - return KeyedProducer(self._location, self._topic, partitioner, self._codec) + return KeyedProducer(self._location, self._enable_ssl, self._cert_path, self._topic, partitioner, self._codec, + batch_size=DEFAULT_BATCH_SIZE, + 
buffer_memory=DEFAULT_BUFFER_MEMORY) class ScoringLogStream(BaseScoringLogStream): @@ -195,12 +216,28 @@ def __init__(self, messagebus): self._group = messagebus.scoringlog_dbw_group self._location = messagebus.kafka_location self._codec = messagebus.codec + self._cert_path = messagebus.cert_path + self._enable_ssl = messagebus.enable_ssl def consumer(self): - return Consumer(self._location, self._topic, self._group, partition_id=None) + return Consumer(self._location, self._enable_ssl, self._cert_path, self._topic, self._group, partition_id=None) def producer(self): - return SimpleProducer(self._location, self._topic, self._codec) + return SimpleProducer(self._location, self._enable_ssl, self._cert_path, self._topic, self._codec, + batch_size=DEFAULT_BATCH_SIZE, + buffer_memory=DEFAULT_BUFFER_MEMORY) + + +class StatsLogStream(ScoringLogStream, BaseStatsLogStream): + """Stats log stream implementation for Kafka message bus. + + The interface is the same as for scoring log stream, so it's better + to reuse it with proper topic and group. + """ + def __init__(self, messagebus): + super(StatsLogStream, self).__init__(messagebus) + self._topic = messagebus.topic_stats + self._group = messagebus.statslog_reader_group class MessageBus(BaseMessageBus): @@ -208,16 +245,20 @@ def __init__(self, settings): self.topic_todo = settings.get('SPIDER_FEED_TOPIC') self.topic_done = settings.get('SPIDER_LOG_TOPIC') self.topic_scoring = settings.get('SCORING_LOG_TOPIC') + self.topic_stats = settings.get('STATS_LOG_TOPIC') self.spiderlog_dbw_group = settings.get('SPIDER_LOG_DBW_GROUP') self.spiderlog_sw_group = settings.get('SPIDER_LOG_SW_GROUP') self.scoringlog_dbw_group = settings.get('SCORING_LOG_DBW_GROUP') + self.statslog_reader_group = settings.get('STATS_LOG_READER_GROUP') self.spider_feed_group = settings.get('SPIDER_FEED_GROUP') self.spider_partition_id = settings.get('SPIDER_PARTITION_ID') self.max_next_requests = settings.MAX_NEXT_REQUESTS self.hostname_partitioning = settings.get('QUEUE_HOSTNAME_PARTITIONING') self.codec = settings.get('KAFKA_CODEC') self.kafka_location = settings.get('KAFKA_LOCATION') + self.enable_ssl = settings.get('KAFKA_ENABLE_SSL') + self.cert_path = settings.get('KAFKA_CERT_PATH') self.spider_log_partitions = settings.get('SPIDER_LOG_PARTITIONS') self.spider_feed_partitions = settings.get('SPIDER_FEED_PARTITIONS') @@ -229,3 +270,6 @@ def spider_feed(self): def scoring_log(self): return ScoringLogStream(self) + + def stats_log(self): + return StatsLogStream(self) diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py index ab1a56155..5f52295ec 100644 --- a/frontera/contrib/messagebus/zeromq/__init__.py +++ b/frontera/contrib/messagebus/zeromq/__init__.py @@ -8,7 +8,7 @@ import six from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseSpiderFeedStream, BaseScoringLogStream + BaseSpiderFeedStream, BaseScoringLogStream, BaseStatsLogStream, BaseStreamProducer from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner from frontera.contrib.messagebus.zeromq.socket_config import SocketConfig from six.moves import range @@ -61,7 +61,7 @@ def get_offset(self, partition_id): return self.counter -class Producer(object): +class Producer(BaseStreamProducer): def __init__(self, context, location, identity): self.identity = identity self.sender = context.zeromq.socket(zmq.PUB) @@ -98,7 +98,7 @@ def flush(self): pass def get_offset(self, partition_id): 
- return self.counters[partition_id] + return self.counters.get(partition_id, None) class SpiderLogProducer(Producer): @@ -123,9 +123,9 @@ def consumer(self, partition_id, type): return Consumer(self.context, location, partition_id, b'sl') -class UpdateScoreProducer(Producer): - def __init__(self, context, location): - super(UpdateScoreProducer, self).__init__(context, location, b'us') +class NonPartitionedProducer(Producer): + def __init__(self, context, location, identity): + super(NonPartitionedProducer, self).__init__(context, location, identity) def send(self, key, *messages): # Guarantee that msg is actually a list or tuple (should always be true) @@ -155,7 +155,7 @@ def consumer(self): return Consumer(self.context, self.out_location, None, b'us') def producer(self): - return UpdateScoreProducer(self.context, self.in_location) + return NonPartitionedProducer(self.context, self.in_location, b'us') class SpiderFeedProducer(Producer): @@ -194,6 +194,29 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) +class DevNullProducer(BaseStreamProducer): + def send(self, key, *messages): + pass + + def flush(self): + pass + + def get_offset(self, partition_id): + pass + + +class StatsLogStream(BaseStatsLogStream): + def __init__(self, messagebus): + self.context = messagebus.context + self.in_location = messagebus.socket_config.stats_out() + + def consumer(self): + pass + + def producer(self): + return DevNullProducer() + + class Context(object): zeromq = zmq.Context() @@ -221,3 +244,6 @@ def scoring_log(self): def spider_feed(self): return SpiderFeedStream(self) + + def stats_log(self): + return StatsLogStream(self) diff --git a/frontera/contrib/messagebus/zeromq/socket_config.py b/frontera/contrib/messagebus/zeromq/socket_config.py index 6ecddf842..097034c9f 100644 --- a/frontera/contrib/messagebus/zeromq/socket_config.py +++ b/frontera/contrib/messagebus/zeromq/socket_config.py @@ -61,3 +61,9 @@ def db_out(self): TCP socket for outgoing DW messages """ return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 5) + + def stats_out(self): + """ + TCP socket for outgoing stats + """ + return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 6) diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index c8a0f117a..8f81af184 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -90,14 +90,17 @@ def links_extracted(self, request, links): def request_error(self, request, error): return self._add_domain(request) + def create_request(self, request): + return self._add_domain(request) + def _add_domain(self, obj): - obj.meta[b'domain'] = self.parse_domain_info(obj.url, self.manager.test_mode) + obj.meta[b'domain'] = self._parse_domain_info(obj.url) if b'redirect_urls' in obj.meta: - obj.meta[b'redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode) + obj.meta[b'redirect_domains'] = [self._parse_domain_info(url) for url in obj.meta[b'redirect_urls']] return obj - def parse_domain_info(self, url, test_mode=False): + def _parse_domain_info(self, url, test_mode=False): if test_mode: match = re.match('([A-Z])\w+', url) netloc = name = to_bytes(match.groups()[0]) if match else b'?' 
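For illustration only, a minimal crawling strategy using the new create_request() middleware hook wired in above might look like the sketch below. It assumes the post-patch frontera.strategy.BaseCrawlingStrategy API (read_seeds, filter_extracted_links, schedule, create_request); the class name, scores and seed handling are hypothetical, not part of this patch.

    from frontera.strategy import BaseCrawlingStrategy


    class ExampleDiscoveryStrategy(BaseCrawlingStrategy):
        """Minimal sketch: create_request() runs the middleware pipeline
        (DomainMiddleware, UrlFingerprintMiddleware), so requests built here
        carry meta[b'domain'] and meta[b'fingerprint'] before scheduling."""

        def read_seeds(self, stream):
            # stream is a file-like object opened in binary mode
            for line in stream:
                url = line.strip().decode('utf-8')
                if url:
                    self.schedule(self.create_request(url), score=1.0)

        def filter_extracted_links(self, request, links):
            # request state is needed for every extracted link in this sketch
            return links

        def links_extracted(self, request, links):
            for link in links:
                self.schedule(link, score=0.5)

        def page_crawled(self, response):
            pass

        def request_error(self, request, error):
            pass

Note that with this hook the strategy no longer needs to instantiate UrlFingerprintMiddleware itself, as the old BaseCrawlingStrategy did; create_request() delegates to the manager, which applies the Middleware and CanonicalSolver stages of the pipeline.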
diff --git a/frontera/contrib/middlewares/fingerprint.py b/frontera/contrib/middlewares/fingerprint.py index bb08fca7c..ca79efc3a 100644 --- a/frontera/contrib/middlewares/fingerprint.py +++ b/frontera/contrib/middlewares/fingerprint.py @@ -41,6 +41,9 @@ def links_extracted(self, request, links): def request_error(self, request, error): return self._add_fingerprint(request) + def create_request(self, request): + return self._add_fingerprint(request) + def _add_fingerprint(self, obj): raise NotImplementedError diff --git a/frontera/contrib/scrapy/converters.py b/frontera/contrib/scrapy/converters.py index f569024c7..50615c8fb 100644 --- a/frontera/contrib/scrapy/converters.py +++ b/frontera/contrib/scrapy/converters.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from scrapy.http.request import Request as ScrapyRequest from scrapy.http.response import Response as ScrapyResponse +from scrapy.http.response.html import TextResponse from frontera.core.models import Request as FrontierRequest from frontera.core.models import Response as FrontierResponse @@ -40,8 +41,8 @@ def to_frontier(self, scrapy_request): b'scrapy_meta': scrapy_meta, b'origin_is_frontier': True, }) - if b'redirect_urls' in scrapy_meta: - meta[b'redirect_urls'] = scrapy_meta[b'redirect_urls'] + if 'redirect_urls' in scrapy_meta: + meta[b'redirect_urls'] = scrapy_meta['redirect_urls'] return FrontierRequest(url=scrapy_request.url, method=scrapy_request.method, headers=scrapy_request.headers, @@ -81,8 +82,10 @@ def to_frontier(self, scrapy_response): """response: Scrapy > Frontier""" frontier_request = scrapy_response.meta[b'frontier_request'] frontier_request.meta[b'scrapy_meta'] = scrapy_response.meta - if b'redirect_urls' in scrapy_response.meta: - frontier_request.meta[b'redirect_urls'] = scrapy_response.meta[b'redirect_urls'] + if 'redirect_urls' in scrapy_response.meta: + frontier_request.meta[b'redirect_urls'] = scrapy_response.meta['redirect_urls'] + if isinstance(scrapy_response, TextResponse): + frontier_request.meta[b'encoding'] = scrapy_response.encoding del scrapy_response.meta[b'frontier_request'] return FrontierResponse(url=scrapy_response.url, status_code=scrapy_response.status, @@ -92,11 +95,19 @@ def to_frontier(self, scrapy_response): def from_frontier(self, response): """response: Frontier > Scrapy""" - return ScrapyResponse(url=response.url, - status=response.status_code, - headers=response.headers, - body=response.body, - request=self._request_converter.from_frontier(response.request)) + if b'encoding' in response.meta: + return TextResponse(url=response.url, + status=response.status_code, + headers=response.headers, + body=response.body, + request=self._request_converter.from_frontier(response.request), + encoding=response.meta[b'encoding']) + else: + return ScrapyResponse(url=response.url, + status=response.status_code, + headers=response.headers, + body=response.body, + request=self._request_converter.from_frontier(response.request)) def _find_method(obj, func): diff --git a/frontera/contrib/scrapy/messagebus_stats.py b/frontera/contrib/scrapy/messagebus_stats.py new file mode 100644 index 000000000..c39f09e79 --- /dev/null +++ b/frontera/contrib/scrapy/messagebus_stats.py @@ -0,0 +1,79 @@ +import logging +from traceback import format_tb + +from scrapy import signals +from scrapy.exceptions import NotConfigured +from twisted.internet.task import LoopingCall + +from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter +from frontera.utils.misc import utc_timestamp, load_object 
+ + +logger = logging.getLogger(__name__) + +# scrapy stats ignored by the exporter by default +STATS_DEFAULT_BLACKLIST = [ + "start_time", +] + + +class StatsExporterToMessageBus(object): + """Export crawl stats to message bus.""" + + def __init__(self, crawler): + settings = ScrapySettingsAdapter(crawler.settings) + self.partition_id = settings.get('SPIDER_PARTITION_ID') + # XXX this can be improved later by reusing spider's producer + # (crawler->engine->slot->scheduler->frontier->manager-> backend->_producer) + # but the topic is hard-coded in the current scheme, so it requires some + # preliminary changes in Frontera itself. + message_bus = load_object(settings.get('MESSAGE_BUS'))(settings) + stats_log = message_bus.stats_log() + if not stats_log: + raise NotConfigured + self.stats_producer = stats_log.producer() + self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60) + codec_path = settings.get('MESSAGE_BUS_CODEC') + encoder_cls = load_object(codec_path + ".Encoder") + self._stats_encoder = encoder_cls(request_model=None) # no need to encode requests + self._export_stats_task = None + + @classmethod + def from_crawler(cls, crawler): + obj = cls(crawler) + crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed) + return obj + + def spider_opened(self, spider): + + def errback_export_stats(failure): + logger.exception(failure.value) + if failure.frames: + logger.critical(str("").join(format_tb(failure.getTracebackObject()))) + self._export_stats_task.start(self._stats_interval)\ + .addErrback(errback_export_stats) + + self._export_stats_task = LoopingCall(self.export_stats, spider) + self._export_stats_task.start(self._stats_interval)\ + .addErrback(errback_export_stats) + + def spider_closed(self, spider): + if self._export_stats_task: + self._export_stats_task.stop() + self._export_stats_task = None + self.stats_producer.flush() + self.stats_producer.close() + + def export_stats(self, spider): + all_stats = spider.crawler.stats.get_stats() + stats = {key: all_stats[key] for key in all_stats + if key not in STATS_DEFAULT_BLACKLIST} + if not stats: + return # no need to send empty stats + stats['_timestamp'] = utc_timestamp() + stats['_tags'] = {'source': 'spider', 'partition_id': self.partition_id} + key = 'spider-{}-{}'.format(self.partition_id, stats['_timestamp']) + encoded_msg = self._stats_encoder.encode_stats(stats) + self.stats_producer.send(key, encoded_msg) + logger.debug("Sent spider stats to message bus: %s", stats) diff --git a/frontera/contrib/scrapy/middlewares/seeds/__init__.py b/frontera/contrib/scrapy/middlewares/seeds/__init__.py deleted file mode 100644 index 09cd0b7cd..000000000 --- a/frontera/contrib/scrapy/middlewares/seeds/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from __future__ import absolute_import - - -class SeedLoader(object): - def __init__(self, crawler): - self.crawler = crawler - self.configure(crawler.settings) - - def configure(self, settings): - raise NotImplementedError - - @classmethod - def from_crawler(cls, crawler): - return cls(crawler) - - def process_start_requests(self, start_requests, spider): - urls = [url for url in self.load_seeds() if not url.startswith('#')] - return [spider.make_requests_from_url(url) for url in urls] - - def load_seeds(self): - raise NotImplementedError - - - diff --git a/frontera/contrib/scrapy/middlewares/seeds/file.py b/frontera/contrib/scrapy/middlewares/seeds/file.py deleted file mode 100644 index 
c70953de0..000000000 --- a/frontera/contrib/scrapy/middlewares/seeds/file.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import absolute_import -import codecs - -from scrapy.exceptions import NotConfigured - -from frontera.contrib.scrapy.middlewares.seeds import SeedLoader - - -class FileSeedLoader(SeedLoader): - def configure(self, settings): - self.seeds_source = settings.get('SEEDS_SOURCE') - if not self.seeds_source: - raise NotConfigured - - def load_seeds(self): - # TODO check if it's an existing file or a folder - return self.load_seeds_from_file(self.seeds_source) - - def load_seeds_from_file(self, file_path): - with codecs.open(file_path, 'rU') as f: - return self.load_seeds_from_data((f)) - - def load_seeds_from_data(self, data): - seeds = [] - for seed in data: - clean_seed = self.clean_seed(seed) - if clean_seed: - seeds.append(clean_seed) - return seeds - - def clean_seed(self, url): - return url.strip('\t\n\r') diff --git a/frontera/contrib/scrapy/middlewares/seeds/s3.py b/frontera/contrib/scrapy/middlewares/seeds/s3.py deleted file mode 100644 index abaf3c3b6..000000000 --- a/frontera/contrib/scrapy/middlewares/seeds/s3.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import absolute_import -from six.moves.urllib.parse import urlparse -from boto import connect_s3 -from scrapy.exceptions import NotConfigured - -from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader - - -class S3SeedLoader(FileSeedLoader): - def configure(self, settings): - source = settings.get('SEEDS_SOURCE') - u = urlparse(source) - if not u.hostname or not u.scheme == 's3': - raise NotConfigured - self.bucket_name = u.hostname - self.bucket_keys_prefix = u.path.lstrip('/') - self.s3_aws_access_key = settings.get('SEEDS_AWS_ACCESS_KEY') - self.s3_aws_secret_key = settings.get('SEEDS_AWS_SECRET_ACCESS_KEY') - - def load_seeds(self): - conn = connect_s3(self.s3_aws_access_key, - self.s3_aws_secret_key) - bucket = conn.get_bucket(self.bucket_name) - seeds = [] - for key in bucket.list(self.bucket_keys_prefix): - if key.name.endswith(".txt"): - data = key.get_contents_as_string(encoding='utf-8').split() - file_seeds = self.load_seeds_from_data(data) - seeds.extend(file_seeds) - return seeds diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index f83f08cfa..962ee8a36 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -89,14 +89,11 @@ def from_crawler(cls, crawler): return cls(crawler) def enqueue_request(self, request): - if not self._request_is_redirected(request): - self.frontier.add_seeds([request]) - self.stats_manager.add_seeds() - return True - elif self.redirect_enabled: + if self.redirect_enabled: self._add_pending_request(request) self.stats_manager.add_redirected_requests() return True + self.logger.warning("The enqueue_request failed on %s", request.url) return False def next_request(self): @@ -133,7 +130,7 @@ def open(self, spider): def close(self, reason): self.logger.info("Finishing frontier (%s)", reason) self.frontier.stop() - self.stats_manager.set_iterations(self.frontier.manager.iteration) + self.stats_manager.set_iterations(getattr(self.frontier.manager, 'iteration', 0)) self.stats_manager.set_pending_requests(len(self)) def __len__(self): @@ -163,12 +160,9 @@ def _get_pending_request(self): def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: + except Exception: return '?' 
- def _request_is_redirected(self, request): - return request.meta.get(b'redirect_times', 0) > 0 - def _get_downloader_info(self): downloader = self.crawler.engine.downloader info = { diff --git a/frontera/contrib/scrapy/schedulers/recording.py b/frontera/contrib/scrapy/schedulers/recording.py index bcaed25e3..29bd671e8 100644 --- a/frontera/contrib/scrapy/schedulers/recording.py +++ b/frontera/contrib/scrapy/schedulers/recording.py @@ -144,5 +144,5 @@ def process_exception(self, request, exception, spider): def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: + except Exception: return '?' diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py index 7623db480..4a490b6c8 100644 --- a/frontera/core/__init__.py +++ b/frontera/core/__init__.py @@ -2,6 +2,8 @@ from six.moves.urllib.parse import urlparse from socket import getaddrinfo from collections import defaultdict, deque +from logging import getLogger, DEBUG +from random import sample import six @@ -28,14 +30,17 @@ class OverusedBuffer(object): A buffering object for implementing the buffer of Frontera requests for overused domains/ips. It can be used when customizing backend to address efficient downloader pool usage. """ - def __init__(self, _get_func, log_func=None): + def __init__(self, _get_func, max_per_key, keep_per_key, max_keys, keep_keys): """ :param _get_func: reference to get_next_requests() method of binded class - :param log_func: optional logging function, for logging of internal state """ self._pending = defaultdict(deque) self._get = _get_func - self._log = log_func + self._log = getLogger("overusedbuffer") + self._max_per_key = max_per_key + self._keep_per_key = keep_per_key + self._max_keys = max_keys + self._keep_keys = keep_keys def _get_key(self, request, type): return get_slot_key(request, type) @@ -43,6 +48,9 @@ def _get_key(self, request, type): def _get_pending_count(self): return sum(six.moves.map(len, six.itervalues(self._pending))) + def _get_key_count(self): + return len(self._pending) + def _get_pending(self, max_n_requests, overused_set): pending = self._pending i, keys = 0, set(pending) - overused_set @@ -51,16 +59,39 @@ def _get_pending(self, max_n_requests, overused_set): for key in keys.copy(): try: yield pending[key].popleft() + self._check_and_purge(key) i += 1 except IndexError: keys.discard(key) del pending[key] - def get_next_requests(self, max_n_requests, **kwargs): - if self._log: - self._log("Overused keys: %s" % str(kwargs['overused_keys'])) - self._log("Pending: %d" % self._get_pending_count()) + def _check_and_purge(self, key): + pending = self._pending[key] + if self._max_per_key is not None and len(pending) > self._max_per_key: + self._log.warning("Purging of key %s, of size %d has started", key, + len(pending)) + purged = 0 + while len(pending) > self._keep_per_key: + pending.popleft() + purged += 1 + self._log.warning("%d requests purged", purged) + def _check_and_purge_keys(self): + if self._max_keys is not None and len(self._pending) > self._max_keys: + self._log.warning("Purging the keys") + new_keys = set(sample(self._pending.keys(), self._keep_keys)) + keys = set(self._pending.keys()) + while keys: + key = keys.pop() + if key not in new_keys: + del self._pending[key] + self._log.warning("Finished purging of keys") + + def get_next_requests(self, max_n_requests, **kwargs): + if self._log.isEnabledFor(DEBUG): + self._log.debug("Overused keys: %s", str(kwargs['overused_keys'])) + self._log.debug("Pending: %i", 
self._get_pending_count()) + self._check_and_purge_keys() overused_set = set(kwargs['overused_keys']) requests = list(self._get_pending(max_n_requests, overused_set)) @@ -71,6 +102,9 @@ def get_next_requests(self, max_n_requests, **kwargs): key = self._get_key(request, kwargs['key_type']) if key in overused_set: self._pending[key].append(request) + # contacts-crawler strategy related hack + if self._max_per_key: + self._check_and_purge(key) else: requests.append(request) return requests diff --git a/frontera/core/codec.py b/frontera/core/codec.py index 45f6e0068..a2fceb2bc 100644 --- a/frontera/core/codec.py +++ b/frontera/core/codec.py @@ -31,16 +31,6 @@ def decode_request(self, buffer): @six.add_metaclass(ABCMeta) class BaseEncoder(object): - @abstractmethod - def encode_add_seeds(self, seeds): - """ - Encodes add_seeds message - - :param list seeds: A list of frontier Request objects - :return: bytes encoded message - """ - pass - @abstractmethod def encode_page_crawled(self, response): """ @@ -118,3 +108,13 @@ def encode_offset(self, partition_id, offset): :return: bytes encoded message """ pass + + @abstractmethod + def encode_stats(self, stats): + """ + Encodes current crawl stats. + + :param stats: a dictionary with stats + :return: bytes encoded message + """ + pass \ No newline at end of file diff --git a/frontera/core/components.py b/frontera/core/components.py index 33529c7bc..8c29ca17f 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -1,6 +1,7 @@ from __future__ import absolute_import -from abc import ABCMeta, abstractmethod, abstractproperty + import six +from abc import ABCMeta, abstractmethod class StartStopMixin(object): @@ -22,15 +23,6 @@ class Metadata(StartStopMixin): """Interface definition for a frontier metadata class. This class is responsible for storing documents metadata, including content and optimized for write-only data flow.""" - @abstractmethod - def add_seeds(self, seeds): - """ - This method is called when new seeds are added to the frontier. - - :param list seeds: A list of :class:`Request ` objects. - """ - pass - @abstractmethod def page_crawled(self, response): """ @@ -99,8 +91,8 @@ def count(self): @six.add_metaclass(ABCMeta) class States(StartStopMixin): - """Interface definition for a document states management class. This class is responsible for providing actual - documents state, and persist the state changes in batch-oriented manner.""" + """Interface definition for a link states management class. This class is responsible for providing actual + link state, and persist the state changes in batch-oriented manner.""" NOT_CRAWLED = 0 QUEUED = 1 @@ -126,11 +118,9 @@ def set_states(self, objs): raise NotImplementedError @abstractmethod - def flush(self, force_clear): + def flush(self): """ Flushes internal cache to storage. - - :param force_clear: boolean, True - signals to clear cache after flush """ raise NotImplementedError @@ -144,6 +134,53 @@ def fetch(self, fingerprints): raise NotImplementedError +@six.add_metaclass(ABCMeta) +class DomainMetadata(StartStopMixin): + """ + Interface definition for a domain metadata storage. It's main purpose is to store the per-domain metadata using + Python-friendly structures. Meant to be used by crawling strategy to store counters and flags in low level + facilities provided by Backend. + """ + + @abstractmethod + def __setitem__(self, key, value): + """ + Puts key, value tuple in storage. 
+ + :param key: str + :param value: Any + """ + raise NotImplementedError + + @abstractmethod + def __getitem__(self, key): + """ + Retrieves the value associated with the storage. Raises KeyError if key is absent. + + :param key: str + :return value: Any + """ + raise NotImplementedError + + @abstractmethod + def __delitem__(self, key): + """ + Removes the tuple associated with key from storage. Raises KeyError if key is absent. + + :param key: str + """ + raise NotImplementedError + + def __contains__(self, key): + """ + Checks if key is present in the storage. + + :param key: str + :return: boolean + """ + raise NotImplementedError + + @six.add_metaclass(ABCMeta) class Component(Metadata): """ @@ -189,6 +226,15 @@ class Middleware(Component): """Interface definition for a Frontier Middlewares""" component_name = 'Base Middleware' + def create_request(self, request): + """ + Applying middleware logic on newly created request. + + :param reqeust: :class:`Request ` object + :return: an instance of :class:`Request ` object. + """ + pass + @six.add_metaclass(ABCMeta) class CanonicalSolver(Middleware): @@ -197,27 +243,38 @@ class CanonicalSolver(Middleware): class PropertiesMixin(object): - @abstractproperty + @property + @abstractmethod def queue(self): """ :return: associated :class:`Queue ` object """ raise NotImplementedError - @abstractproperty + @property + @abstractmethod def metadata(self): """ :return: associated :class:`Metadata ` object """ raise NotImplementedError - @abstractproperty + @property + @abstractmethod def states(self): """ :return: associated :class:`States ` object """ raise NotImplementedError + @property + @abstractmethod + def domain_metadata(self): + """ + :return: associated :class:`DomainMetadata ` object + """ + raise NotImplementedError + @six.add_metaclass(ABCMeta) class Backend(PropertiesMixin, Component): @@ -257,6 +314,21 @@ def strategy_worker(cls, manager): def db_worker(cls, manager): raise NotImplementedError + @classmethod + def local(cls, manager): + raise NotImplementedError + + def get_stats(self): + """ + Returns a dictionary with distributed backend stats. + + Depending on a backend type the method may return different stats to be sent to a message bus. + Called by :class:`StatsExportMixin ` for workers. + + :return: dict of stats key/values. 
+ """ + return None + class Partitioner(object): """ diff --git a/frontera/core/manager.py b/frontera/core/manager.py index c83bb780b..b7e6a31fd 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -1,15 +1,77 @@ from __future__ import absolute_import -from collections import OrderedDict +import logging +from abc import ABCMeta, abstractmethod +from collections import Iterable + +import six + +from frontera.core import models +from frontera.core.components import Backend, DistributedBackend, Middleware, CanonicalSolver from frontera.exceptions import NotConfigured +from frontera.settings import Settings from frontera.utils.misc import load_object -from frontera.settings import Settings, BaseSettings -from frontera.core.components import Backend, DistributedBackend, Middleware, CanonicalSolver -from frontera.core import models -import logging -class ComponentsPipelineMixin(object): +class BackendMixin(object): + def __init__(self, backend, db_worker=False, strategy_worker=False): + # Load backend + self._logger_components.debug("Loading backend '%s'", backend) + self._backend = self._load_backend(backend, db_worker, strategy_worker) + self._backend.frontier_start() + + def _load_backend(self, backend, db_worker, strategy_worker): + # FIXME remove obsolete + cls = load_object(backend) + assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ + if issubclass(cls, DistributedBackend): + if db_worker: + return cls.db_worker(self) + if strategy_worker: + return cls.strategy_worker(self) + return cls.local(self) + else: + assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ + "subclasses are allowed to use" + if hasattr(cls, 'from_manager'): + return cls.from_manager(self) + else: + return cls() + + @property + def backend(self): + """ + The :class:`Backend ` object to be used by the frontier. \ + Can be defined with :setting:`BACKEND` setting. 
+ """ + return self._backend + + def close(self): + self.backend.frontier_stop() + + +class StrategyMixin(object): + def __init__(self, strategy_class, strategy_args, scoring_stream): + self._scoring_stream = scoring_stream if scoring_stream else LocalUpdateScoreStream(self.backend.queue) + self._states_context = StatesContext(self.backend.states) + if isinstance(strategy_class, str): + strategy_class = load_object(strategy_class) + self._strategy = strategy_class.from_worker(self, strategy_args, self._scoring_stream, self._states_context) + + @property + def strategy(self): + return self._strategy + + @property + def states_context(self): + return self._states_context + + def close(self): + self.strategy.close() + self.states_context.flush() + + +class ComponentsPipelineMixin(BackendMixin): def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): self._logger_components = logging.getLogger("manager.components") @@ -18,13 +80,11 @@ def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=Fa # Load canonical solver self._logger_components.debug("Loading canonical url solver '%s'", canonicalsolver) - self._canonicalsolver = self._load_object(canonicalsolver) - assert isinstance(self.canonicalsolver, CanonicalSolver), \ - "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ - - # Load backend - self._logger_components.debug("Loading backend '%s'", backend) - self._backend = self._load_backend(backend, db_worker, strategy_worker) + if canonicalsolver: + self._canonicalsolver = self._load_object(canonicalsolver) + assert isinstance(self.canonicalsolver, CanonicalSolver), \ + "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ + BackendMixin.__init__(self, backend, db_worker, strategy_worker) @property def canonicalsolver(self): @@ -41,31 +101,6 @@ def middlewares(self): """ return self._middlewares - @property - def backend(self): - """ - The :class:`Backend ` object to be used by the frontier. \ - Can be defined with :setting:`BACKEND` setting. - """ - return self._backend - - def _load_backend(self, backend, db_worker, strategy_worker): - cls = load_object(backend) - assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ - if issubclass(cls, DistributedBackend): - if db_worker: - return cls.db_worker(self) - if strategy_worker: - return cls.strategy_worker(self) - raise RuntimeError("Distributed backends are meant to be used in workers.") - else: - assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ - "subclasses are allowed to use." 
- if hasattr(cls, 'from_manager'): - return cls.from_manager(self) - else: - return cls() - def _load_middlewares(self, middleware_names): # TO-DO: Use dict for middleware ordering mws = [] @@ -81,9 +116,11 @@ def _load_middlewares(self, middleware_names): return mws - def _process_components(self, method_name, obj=None, return_classes=None, **kwargs): + def _process_components(self, method_name, obj=None, return_classes=None, components=None, **kwargs): + pipeline = self._components_pipeline if components is None else \ + [self._components_pipeline[c] for c in components] return_obj = obj - for component_category, component, check_response in self._components_pipeline: + for component_category, component, check_response in pipeline: components = component if isinstance(component, list) else [component] for component in components: result = self._process_component(component=component, method_name=method_name, @@ -110,8 +147,22 @@ def _process_component(self, component, method_name, component_category, obj, re return_obj.__class__.__name__) return return_obj + def close(self): + BackendMixin.close(self) + super(ComponentsPipelineMixin, self).close() -class BaseManager(object): + +class StrategyComponentsPipelineMixin(ComponentsPipelineMixin, StrategyMixin): + def __init__(self, backend, strategy_class, strategy_args, scoring_stream, **kwargs): + super(StrategyComponentsPipelineMixin, self).__init__(backend, **kwargs) + StrategyMixin.__init__(self, strategy_class, strategy_args, scoring_stream) + + def close(self): + StrategyMixin.close(self) + super(StrategyComponentsPipelineMixin, self).close() + + +class BaseContext(object): def __init__(self, request_model, response_model, settings=None): # Settings @@ -121,7 +172,7 @@ def __init__(self, request_model, response_model, settings=None): self._logger = logging.getLogger("manager") # Log frontier manager starting - self._logger.info('-'*80) + self._logger.info('-' * 80) self._logger.info('Starting Frontier Manager...') # Load request model @@ -137,7 +188,7 @@ def __init__(self, request_model, response_model, settings=None): @classmethod def from_settings(cls, settings=None): manager_settings = Settings(settings) - return BaseManager(request_model=manager_settings.REQUEST_MODEL, + return BaseContext(request_model=manager_settings.REQUEST_MODEL, response_model=manager_settings.RESPONSE_MODEL, settings=manager_settings) @@ -179,15 +230,85 @@ def settings(self): return self._settings -class FrontierManager(BaseManager, ComponentsPipelineMixin): +class BaseManager(object): + def get_next_requests(self, max_next_requests=0, **kwargs): + """ + Returns a list of next requests to be crawled. Optionally a maximum number of pages can be passed. If no + value is passed, \ + :attr:`FrontierManager.max_next_requests ` + will be used instead. (:setting:`MAX_NEXT_REQUESTS` setting). + + :param int max_next_requests: Maximum number of requests to be returned by this method. + :param dict kwargs: Arbitrary arguments that will be passed to backend. + + :return: list of :class:`Request ` objects. + """ + + # log (in) + self._logger.debug('GET_NEXT_REQUESTS(in) max_next_requests=%s', max_next_requests) + + # get next requests + next_requests = self.backend.get_next_requests(max_next_requests, **kwargs) + + # log (out) + self._logger.debug('GET_NEXT_REQUESTS(out) returned_requests=%s', len(next_requests)) + return next_requests + + def page_crawled(self, response): + """ + Informs the frontier about the crawl result. 
+ + :param object response: The :class:`Response ` object for the crawled page. + + :return: None. + """ + self._logger.debug('PAGE_CRAWLED url=%s status=%s', response.url, response.status_code) + self._process_components(method_name='page_crawled', + obj=response, + return_classes=self.response_model) + + def links_extracted(self, request, links): + """ + Informs the frontier about extracted links for the request. + + :param object request: The :class:`Request ` object from which the links where crawled. + :param list links: A list of :class:`Request ` objects generated from the links \ + extracted for the request. + + :return: None. + """ + self._logger.debug('LINKS_EXTRACTED url=%s links=%d', request.url, len(links)) + self._process_components(method_name='links_extracted', + obj=request, + return_classes=self.request_model, + components=(0, 1), + links=links) + + def links_extracted_after(self, request, filtered): + self._process_components(method_name='links_extracted', + obj=request, + return_classes=self.request_model, + components=(2,), + links=filtered) + + def request_error(self, request, error): + self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) + return self._process_components(method_name='request_error', + obj=request, + return_classes=self.request_model, + error=error) + + +class LocalFrontierManager(BaseContext, StrategyComponentsPipelineMixin, BaseManager): """ The :class:`FrontierManager ` object encapsulates the whole frontier, providing an API to interact with. It's also responsible of loading and communicating all different frontier components. """ - def __init__(self, request_model, response_model, backend, middlewares=None, test_mode=False, max_requests=0, - max_next_requests=0, auto_start=True, settings=None, canonicalsolver=None, db_worker=False, - strategy_worker=False): + + def __init__(self, request_model, response_model, backend, strategy_class, strategy_args, middlewares=None, + test_mode=False, max_requests=0, max_next_requests=0, auto_start=True, settings=None, + canonicalsolver=None): """ :param object/string request_model: The :class:`Request ` object to be \ used by the frontier. @@ -217,13 +338,9 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes :param object/string canonicalsolver: The :class:`CanonicalSolver ` object to be used by frontier. 
- - :param bool db_worker: True if class is instantiated in DB worker environment - - :param bool strategy_worker: True if class is instantiated in strategy worker environment """ - BaseManager.__init__(self, request_model, response_model, settings=settings) + BaseContext.__init__(self, request_model, response_model, settings=settings) # Test mode self._test_mode = test_mode @@ -240,20 +357,21 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes # Manager finished flag self._finished = False - ComponentsPipelineMixin.__init__(self, backend=backend, middlewares=middlewares, - canonicalsolver=canonicalsolver, db_worker=db_worker, - strategy_worker=strategy_worker) + StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, None, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=False, strategy_worker=False) # Init frontier components pipeline + # Some code relies on the order, modify carefully self._components_pipeline = [ ('Middleware', self.middlewares, True), ('CanonicalSolver', self.canonicalsolver, False), - ('Backend', self.backend, False) + ('Strategy', self.strategy, False) ] # Log frontier manager start self._logger.info('Frontier Manager Started!') - self._logger.info('-'*80) + self._logger.info('-' * 80) # start/stop self._started = False @@ -270,18 +388,18 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False): :ref:`frontier default settings ` are used. """ manager_settings = Settings.object_from(settings) - return FrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - middlewares=manager_settings.MIDDLEWARES, - test_mode=manager_settings.TEST_MODE, - max_requests=manager_settings.MAX_REQUESTS, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - auto_start=manager_settings.AUTO_START, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - strategy_worker=strategy_worker) + return LocalFrontierManager(request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + strategy_class=manager_settings.STRATEGY, + strategy_args=manager_settings.STRATEGY_ARGS, + middlewares=manager_settings.MIDDLEWARES, + test_mode=manager_settings.TEST_MODE, + max_requests=manager_settings.MAX_REQUESTS, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + auto_start=manager_settings.AUTO_START, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER) @property def test_mode(self): @@ -337,7 +455,7 @@ def finished(self): Boolean value indicating if the frontier has finished. See :ref:`Finish conditions `. """ if not self._finished: - return self.backend.finished() + return self.strategy.finished() return True def start(self): @@ -362,26 +480,19 @@ def stop(self): self._check_startstop() self._logger.debug('STOP') self._process_components(method_name='frontier_stop') + StrategyComponentsPipelineMixin.close(self) self._stopped = True - def add_seeds(self, seeds): + def add_seeds(self, seeds_file): """ - Adds a list of seed requests (seed URLs) as entry point for the crawl. + Performs seeds addition procedure. Using file-like object, calls read_seeds method of crawling strategy. - :param list seeds: A list of :class:`Request ` objects. + :param file seeds_file: A file-like object opened in binary mode which will be passed to read_seeds :return: None. 
""" self._check_startstop() - # FIXME probably seeds should be a generator here - assert len(seeds), "Empty seeds list" - for seed in seeds: - assert isinstance(seed, self._request_model), "Seed objects must subclass '%s', '%s' found" % \ - (self._request_model.__name__, type(seed).__name__) - self._logger.debug('ADD_SEEDS urls_length=%d', len(seeds)) - self._process_components(method_name='add_seeds', - obj=seeds, - return_classes=(list,)) # TODO: Dar vuelta + self.strategy.read_seeds(seeds_file) def get_next_requests(self, max_next_requests=0, **kwargs): """ @@ -409,15 +520,11 @@ def get_next_requests(self, max_next_requests=0, **kwargs): if not max_next_requests: max_next_requests = self.max_requests - self.n_requests else: - if self.n_requests+max_next_requests > self.max_requests: + if self.n_requests + max_next_requests > self.max_requests: max_next_requests = self.max_requests - self.n_requests - # log (in) - self._logger.debug('GET_NEXT_REQUESTS(in) max_next_requests=%s n_requests=%s/%s', - max_next_requests, self.n_requests, self.max_requests or '-') - # get next requests - next_requests = self.backend.get_next_requests(max_next_requests, **kwargs) + next_requests = super(LocalFrontierManager, self).get_next_requests(max_next_requests, **kwargs) # Increment requests counter self._n_requests += len(next_requests) @@ -426,21 +533,10 @@ def get_next_requests(self, max_next_requests=0, **kwargs): if next_requests: self._iteration += 1 - # log (out) - self._logger.debug('GET_NEXT_REQUESTS(out) returned_requests=%s n_requests=%s/%s', - len(next_requests), self.n_requests, self.max_requests or '-') return next_requests def page_crawled(self, response): - """ - Informs the frontier about the crawl result. - - :param object response: The :class:`Response ` object for the crawled page. - - :return: None. - """ self._check_startstop() - self._logger.debug('PAGE_CRAWLED url=%s status=%s', response.url, response.status_code) assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ (self.response_model.__name__, type(response).__name__) assert hasattr(response, 'request') and response.request, "Empty response request" @@ -450,31 +546,28 @@ def page_crawled(self, response): type(response.request).__name__) assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ (self.response_model.__name__, type(response).__name__) - self._process_components(method_name='page_crawled', - obj=response, - return_classes=self.response_model) + self.states_context.to_fetch(response) + self.states_context.fetch() + self.states_context.states.set_states(response) + super(LocalFrontierManager, self).page_crawled(response) + self.states_context.states.update_cache(response) def links_extracted(self, request, links): - """ - Informs the frontier about extracted links for the request. - - :param object request: The :class:`Request ` object from which the links where crawled. - :param list links: A list of :class:`Request ` objects generated from the links \ - extracted for the request. - - :return: None. 
- """ self._check_startstop() - self._logger.debug('LINKS_EXTRACTED url=%s links=%d', request.url, len(links)) assert isinstance(request, self.request_model), "Request object must subclass '%s', '%s' found" % \ (self.request_model.__name__, type(request).__name__) for link in links: assert isinstance(link, self._request_model), "Link objects must subclass '%s', '%s' found" % \ (self._request_model.__name__, type(link).__name__) - self._process_components(method_name='links_extracted', - obj=request, - return_classes=self.request_model, - links=links) + super(LocalFrontierManager, self).links_extracted(request, links) + filtered = self.strategy.filter_extracted_links(request, links) + if filtered: + self.states_context.to_fetch(request) + self.states_context.to_fetch(filtered) + self.states_context.fetch() + self.states_context.states.set_states(filtered) + super(LocalFrontierManager, self).links_extracted_after(request, filtered) + self.states_context.states.update_cache(filtered) def request_error(self, request, error): """ @@ -486,13 +579,253 @@ def request_error(self, request, error): :return: None. """ self._check_startstop() - self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) - processed_page = self._process_components(method_name='request_error', - obj=request, - return_classes=self.request_model, - error=error) + self.states_context.to_fetch(request) + self.states_context.fetch() + self.states_context.states.set_states(request) + processed_page = super(LocalFrontierManager, self).request_error(request, error) + self.states_context.states.update_cache(request) return processed_page + def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): + """ + Creates request and applies middleware and canonical solver pipelines. + + :param url: str + :param method: bytes + :param headers: dict + :param cookies: dict + :param meta: dict + :param body: bytes + :return: :class:`Request ` object + """ + r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) + self._process_components('create_request', + obj=r, + return_classes=self.request_model, + components=(0, 1)) + return r + def _check_startstop(self): assert self._started, "Frontier not started!" assert not self._stopped, "Call to stopped frontier!" + + +class WorkerFrontierManager(BaseContext, StrategyComponentsPipelineMixin): + """ + The :class:`WorkerFrontierManager ` class role is to + instantiate the core components and is used mainly by workers. + """ + + def __init__(self, settings, request_model, response_model, backend, max_next_requests, strategy_class=None, + strategy_args=None, scoring_stream=None, middlewares=None, canonicalsolver=None, db_worker=False, + strategy_worker=False): + """ + :param object/string request_model: The :class:`Request ` object to be \ + used by the frontier. + + :param object/string response_model: The :class:`Response ` object to be \ + used by the frontier. + + :param object/string backend: The :class:`Backend ` object to be \ + used by the frontier. + + :param list middlewares: A list of :class:`Middleware ` \ + objects to be used by the frontier. + + :param int max_next_requests: Maximum number of requests returned by \ + :attr:`get_next_requests ` method. + + :param object/string settings: The :class:`Settings ` object used by \ + the frontier. + + :param object/string canonicalsolver: The :class:`CanonicalSolver ` + object to be used by frontier. 
+ :param object scoring_stream: Instance of :class:`UpdateScoreStream ` + for crawling strategy to send scheduled requests to. + + :param bool db_worker: True if class is instantiated in DB worker environment + + :param bool strategy_worker: True if class is instantiated in strategy worker environment + """ + + BaseContext.__init__(self, request_model, response_model, settings=settings) + + self._max_next_requests = max_next_requests + if strategy_worker: + StrategyComponentsPipelineMixin.__init__(self, backend, strategy_class, strategy_args, scoring_stream, + middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=db_worker, strategy_worker=strategy_worker) + # Init frontier components pipeline + # Some code relies on the order, modify carefully + self._components_pipeline = [ + ('Middleware', self.middlewares, True), + ('CanonicalSolver', self.canonicalsolver, False), + ] + if db_worker: + ComponentsPipelineMixin.__init__(self, backend, db_worker=db_worker, strategy_worker=strategy_worker) + + # Log frontier manager start + self._logger.info('Frontier Manager Started!') + self._logger.info('-' * 80) + + @classmethod + def from_settings(cls, settings=None, db_worker=False, strategy_worker=False, scoring_stream=None): + manager_settings = Settings.object_from(settings) + kwargs = { + 'request_model': manager_settings.REQUEST_MODEL, + 'response_model': manager_settings.RESPONSE_MODEL, + 'backend': manager_settings.BACKEND, + 'max_next_requests': manager_settings.MAX_NEXT_REQUESTS, + 'settings': manager_settings, + 'db_worker': db_worker, + 'strategy_worker': strategy_worker + } + if strategy_worker: + kwargs.update({ + 'strategy_class': manager_settings.STRATEGY, + 'strategy_args': manager_settings.STRATEGY_ARGS, + 'middlewares': manager_settings.MIDDLEWARES, + 'canonicalsolver': manager_settings.CANONICAL_SOLVER, + 'scoring_stream': scoring_stream + }) + return WorkerFrontierManager(**kwargs) + + @property + def test_mode(self): + return False + + def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): + """ + Creates request and applies middleware and canonical solver pipelines. 
+ + :param url: str + :param method: bytes + :param headers: dict + :param cookies: dict + :param meta: dict + :param body: bytes + :return: :class:`Request ` object + """ + r = self.request_model(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) + return self._process_components('create_request', + obj=r, + return_classes=self.request_model, + components=(0, 1)) + + +class SpiderFrontierManager(BaseContext, ComponentsPipelineMixin, BaseManager): + + def __init__(self, request_model, response_model, backend, middlewares, max_next_requests, settings, + canonicalsolver): + BaseContext.__init__(self, request_model, response_model, settings=settings) + ComponentsPipelineMixin.__init__(self, backend, middlewares=middlewares, canonicalsolver=canonicalsolver, + db_worker=False, strategy_worker=False) + + self.max_next_requests = max_next_requests + self._components_pipeline = [ + ('Middleware', self.middlewares, True), + ('CanonicalSolver', self.canonicalsolver, False), + ('Backend', self.backend, False) + ] + + @classmethod + def from_settings(cls, settings=None): + manager_settings = Settings.object_from(settings) + return SpiderFrontierManager(request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + middlewares=manager_settings.MIDDLEWARES, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER) + + @property + def test_mode(self): + return False + + @property + def auto_start(self): + return True + + def get_next_requests(self, max_next_requests=0, **kwargs): + return super(SpiderFrontierManager, self).get_next_requests(max_next_requests=max_next_requests or self.max_next_requests, **kwargs) + + def links_extracted(self, request, links): + super(SpiderFrontierManager, self).links_extracted(request, links) + super(SpiderFrontierManager, self).links_extracted_after(request, links) + + @property + def finished(self): + return False + + def start(self): + self._logger.debug('START') + self._process_components(method_name='frontier_start') + + def stop(self): + super(SpiderFrontierManager, self).close() + + +@six.add_metaclass(ABCMeta) +class UpdateScoreStream(object): + @abstractmethod + def send(self, request, score=1.0, dont_queue=False): + pass + + def flush(self): + pass + + +class MessageBusUpdateScoreStream(UpdateScoreStream): + def __init__(self, producer, encoder): + self._producer = producer + self._encoder = encoder + + def send(self, request, score=1.0, dont_queue=False): + encoded = self._encoder.encode_update_score( + request=request, + score=score, + schedule=not dont_queue + ) + self._producer.send(None, encoded) + + +class LocalUpdateScoreStream(UpdateScoreStream): + def __init__(self, queue): + self._queue = queue + + def send(self, request, score=1.0, dont_queue=False): + self._queue.schedule([(request.meta[b'fingerprint'], score, request, not dont_queue)]) + + +class StatesContext(object): + def __init__(self, states): + self._requests = [] + self.states = states + self._fingerprints = dict() + self.logger = logging.getLogger("states-context") + + def to_fetch(self, requests): + requests = requests if isinstance(requests, Iterable) else [requests] + for request in requests: + fingerprint = request.meta[b'fingerprint'] + self._fingerprints[fingerprint] = request + + def fetch(self): + self.states.fetch(self._fingerprints) + self._fingerprints.clear() + + def refresh_and_keep(self, requests): + 
self.to_fetch(requests) + self.fetch() + self.states.set_states(requests) + self._requests.extend(requests if isinstance(requests, Iterable) else [requests]) + + def release(self): + self.states.update_cache(self._requests) + self._requests = [] + + def flush(self): + self.logger.info("Flushing states") + self.states.flush() + self.logger.info("Flushing of states finished") diff --git a/frontera/core/messagebus.py b/frontera/core/messagebus.py index 3782f6c00..795495728 100644 --- a/frontera/core/messagebus.py +++ b/frontera/core/messagebus.py @@ -124,6 +124,27 @@ def producer(self): raise NotImplementedError +@six.add_metaclass(ABCMeta) +class BaseStatsLogStream(object): + """ + Stats log stream base class. This stream is transfering stats metrics from workers and spiders to external + data sources. This type of stream isn't requiring any partitioning. + """ + @abstractmethod + def consumer(self): + """ + :return: BaseStreamConsumer instance + """ + raise NotImplementedError + + @abstractmethod + def producer(self): + """ + :return: BaseStreamProducer instance + """ + raise NotImplementedError + + @six.add_metaclass(ABCMeta) class BaseSpiderFeedStream(object): """ @@ -204,3 +225,11 @@ def spider_feed(self): :return: instance of SpiderFeedStream """ raise NotImplementedError + + @abstractmethod + def stats_log(self): + """ + Create or return stats log stream. + :return: instance of StatsLogStream + """ + raise NotImplementedError \ No newline at end of file diff --git a/frontera/core/models.py b/frontera/core/models.py index c1c8de734..3266292f8 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -16,6 +16,7 @@ class Request(FrontierObject): :class:`Response ` object when crawled. """ + def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''): """ :param string url: URL to send. @@ -80,9 +81,12 @@ def body(self): def __str__(self): return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, - str(self.meta), str(self.body[:20]), + str(self.meta), str(self.body[:20]) if self.body is not None else None, str(self.cookies), str(self.headers)) + def __hash__(self): + return hash(self.meta[b'fingerprint']) + __repr__ = __str__ @@ -159,6 +163,6 @@ def __str__(self): return "<%s at 0x%0x %s %s meta=%s body=%s... 
headers=%s>" % (type(self).__name__, id(self), self.status_code, self.url, str(self.meta), - str(self.body[:20]), str(self.headers)) + str(self.body[:20]) if self.body is not None else None, str(self.headers)) __repr__ = __str__ diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b049e7bdc..1685ace2b 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -3,35 +3,53 @@ AUTO_START = True -BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND = 'frontera.contrib.backends.memory.MemoryDistributedBackend' BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' DELAY_ON_EMPTY = 5.0 +DISCOVERY_MAX_PAGES = 100 DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +DOMAIN_STATS_LOG_INTERVAL = 300 HBASE_THRIFT_HOST = 'localhost' HBASE_THRIFT_PORT = 9090 HBASE_NAMESPACE = 'crawler' HBASE_DROP_ALL_TABLES = False +HBASE_DOMAIN_METADATA_TABLE = 'domain_metadata' +HBASE_DOMAIN_METADATA_CACHE_SIZE = 1000 +HBASE_DOMAIN_METADATA_BATCH_SIZE = 100 HBASE_METADATA_TABLE = 'metadata' +HBASE_STATES_TABLE = 'states' HBASE_USE_SNAPPY = False HBASE_USE_FRAMED_COMPACT = False HBASE_BATCH_SIZE = 9216 HBASE_STATE_CACHE_SIZE_LIMIT = 3000000 +HBASE_STATE_WRITE_LOG_SIZE = 15000 HBASE_QUEUE_TABLE = 'queue' KAFKA_GET_TIMEOUT = 5.0 +LOCAL_MODE = True MAX_NEXT_REQUESTS = 64 MAX_REQUESTS = 0 MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' MESSAGE_BUS_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' MIDDLEWARES = [ - 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'frontera.contrib.middlewares.domain.DomainMiddleware', + 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware' ] NEW_BATCH_DELAY = 30.0 +DOMAINS_BLACKLIST = None OVERUSED_SLOT_FACTOR = 5.0 +OVERUSED_MAX_PER_KEY = None +OVERUSED_KEEP_PER_KEY = 1000 +OVERUSED_MAX_KEYS = None +OVERUSED_KEEP_KEYS = 100 QUEUE_HOSTNAME_PARTITIONING = False +REDIS_BACKEND_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' +REDIS_HOST = 'localhost' +REDIS_PORT = 6379 +REDIS_STATE_CACHE_SIZE_LIMIT = 0 REQUEST_MODEL = 'frontera.core.models.Request' RESPONSE_MODEL = 'frontera.core.models.Response' @@ -42,22 +60,27 @@ SPIDER_FEED_PARTITIONS = 1 SPIDER_PARTITION_ID = 0 SQLALCHEMYBACKEND_CACHE_SIZE = 10000 -SQLALCHEMYBACKEND_CLEAR_CONTENT = True -SQLALCHEMYBACKEND_DROP_ALL_TABLES = True +SQLALCHEMYBACKEND_CLEAR_CONTENT = False +SQLALCHEMYBACKEND_DROP_ALL_TABLES = False SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' SQLALCHEMYBACKEND_ENGINE_ECHO = False SQLALCHEMYBACKEND_MODELS = { 'MetadataModel': 'frontera.contrib.backends.sqlalchemy.models.MetadataModel', 'StateModel': 'frontera.contrib.backends.sqlalchemy.models.StateModel', - 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' + 'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel', + 'DomainMetadataModel': 'frontera.contrib.backends.sqlalchemy.models.DomainMetadataModel' } SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=1) STATE_CACHE_SIZE = 1000000 STATE_CACHE_SIZE_LIMIT = 0 STORE_CONTENT = False +STRATEGY = 'frontera.strategy.basic.BasicCrawlingStrategy' +STRATEGY_ARGS = {} +SW_FLUSH_INTERVAL = 300 TEST_MODE = False TLDEXTRACT_DOMAIN_INFO = False URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +USER_AGENT = 'FronteraDiscoveryBot' ZMQ_ADDRESS = '127.0.0.1' ZMQ_BASE_PORT = 5550 @@ -71,10 +94,14 @@ SPIDER_FEED_TOPIC = "frontier-todo" 
SPIDER_LOG_TOPIC = "frontier-done" SCORING_LOG_TOPIC = "frontier-score" +STATS_LOG_TOPIC = 'frontier-stats' SPIDER_LOG_DBW_GROUP = "dbw-spider-log" SPIDER_LOG_SW_GROUP = "sw-spider-log" SCORING_LOG_DBW_GROUP = "dbw-scoring-log" SPIDER_FEED_GROUP = "fetchers-spider-feed" +STATS_LOG_READER_GROUP = 'stats-reader-log' -KAFKA_CODEC = None \ No newline at end of file +KAFKA_CODEC = None +KAFKA_CERT_PATH = '/mnt/mesos/sandbox' +KAFKA_ENABLE_SSL = False diff --git a/frontera/worker/strategies/__init__.py b/frontera/strategy/__init__.py similarity index 51% rename from frontera/worker/strategies/__init__.py rename to frontera/strategy/__init__.py index 1b46f5d96..784f245a0 100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/strategy/__init__.py @@ -1,9 +1,5 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.core.models import Request -from frontera.contrib.middlewares.fingerprint import UrlFingerprintMiddleware - from abc import ABCMeta, abstractmethod + import six @@ -19,28 +15,36 @@ class BaseCrawlingStrategy(object): After exiting from all of these methods states from meta field are passed back and stored in the backend. """ - def __init__(self, manager, mb_stream, states_context): - self._mb_stream = mb_stream + def __init__(self, manager, args, scheduled_stream, states_context): + """ + Constructor of the crawling strategy. + + Args: + manager: is an instance of :class: `Backend ` instance + args: is a dict with command line arguments from :term:`strategy worker` + scheduled_stream: is a helper class for sending scheduled requests + states_context: a helper to operate with states for requests created in crawling strategy class + """ + self._scheduled_stream = scheduled_stream self._states_context = states_context - self.url_mw = UrlFingerprintMiddleware(manager) + self._manager = manager @classmethod - def from_worker(cls, manager, mb_stream, states_context): + def from_worker(cls, manager, args, scheduled_stream, states_context): """ Called on instantiation in strategy worker. - :param manager: :class: `Backend ` instance - :param mb_stream: :class: `UpdateScoreStream ` instance + see params for constructor :return: new instance """ - return cls(manager, mb_stream, states_context) + return cls(manager, args, scheduled_stream, states_context) @abstractmethod - def add_seeds(self, seeds): + def read_seeds(self, stream): """ - Called when add_seeds event is received from spider log. + Called when :term:`strategy worker` is run using add-seeds mode. - :param list seeds: A list of :class:`Request ` objects. + :param file stream: A file-like object containing seed content """ @abstractmethod @@ -52,22 +56,43 @@ def page_crawled(self, response): """ @abstractmethod - def links_extracted(self, request, links): + def request_error(self, request, error): """ - Called every time document was successfully crawled, and receiving page_crawled event from spider log. + Called every time there was error during page downloading. + + :param object request: The fetched with error :class:`Request ` object. + :param str error: A string identifier for the error. + """ + + @abstractmethod + def filter_extracted_links(self, request, links): + """ + Called every time on receiving links_extracted event by strategy worker. This call is preceding the call + to links_extracted handler and is aiming to filter unused links and return only those where states + information is needed. 
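        For illustration only, a minimal override might discard everything it is never
        going to schedule before any state lookup happens (a sketch, not part of this
        patch; the http/https check is just an example policy)::

            def filter_extracted_links(self, request, links):
                # keep only plain http(s) links; everything else never reaches the state storage
                return [link for link in links
                        if link.url.startswith(('http://', 'https://'))]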
+ + The motivation for having the filtration separated before the actual handler is to save on HBase state + retrieval. Every non-cached link is requested from HBase and it may slow down the cluster significantly + on discovery-intensive crawls. Please make sure you use this method to filter out all the links you're not + going to use in :method:`links_extracted + handler. :param object request: The :class:`Request ` object for the crawled page. :param list links: A list of :class:`Request ` objects generated from \ the links extracted for the crawled page. + + :return: A subset of :class:`Request ` input objects. """ @abstractmethod - def page_error(self, request, error): + def links_extracted(self, request, links): """ - Called every time there was error during page downloading. + Called every time a document was successfully crawled, on receiving the links_extracted event from spider log, + after the link states are fetched from the backend. Should be used to schedule links according to some rules. - :param object request: The fetched with error :class:`Request ` object. - :param str error: A string identifier for the error. + :param object request: The :class:`Request ` object for the crawled page. + :param list links: A list of :class:`Request ` objects generated from \ + the links extracted for the crawled page. """ def finished(self): @@ -83,7 +108,7 @@ def close(self): """ Called when strategy worker is about to close crawling strategy. """ - self._mb_stream.flush() + self._scheduled_stream.flush() self._states_context.release() def schedule(self, request, score=1.0, dont_queue=False): @@ -94,13 +119,12 @@ def schedule(self, request, score=1.0, dont_queue=False): :param score: float from 0.0 to 1.0 :param dont_queue: bool, True - if no need to schedule, only update the score """ - self._mb_stream.send(request, score, dont_queue) + self._scheduled_stream.send(request, score, dont_queue) def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): """ - Creates request with specified fields, with state fetched from backend. This method only creates request, but - isn't getting it's state from storage. Use self.refresh_states on a batch of requests to get their states - from storage. + Creates request with specified fields. This method only creates the request, but isn't getting its state + from storage. Use self.refresh_states on a batch of requests to get their states from storage. :param url: str :param method: str @@ -110,14 +134,18 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No :param body: str :return: :class:`Request ` """ - r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) - self.url_mw._add_fingerprint(r) - return r + return self._manager.create_request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) def refresh_states(self, requests): """ Retrieves states for all requests from storage. - - :param requests: list(:class:`Request `) + if requests is not a list of :class:`Request ` objects.
+ :param requests: list(:class:`Request `) or a single :class:`Request ` """ self._states_context.refresh_and_keep(requests) + + def frontier_start(self): + pass + + def frontier_stop(self): + pass diff --git a/frontera/strategy/basic.py b/frontera/strategy/basic.py new file mode 100644 index 000000000..b0e586f86 --- /dev/null +++ b/frontera/strategy/basic.py @@ -0,0 +1,25 @@ +from frontera.core.components import States +from frontera.strategy import BaseCrawlingStrategy + + +class BasicCrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self.create_request(url) + self.schedule(r) + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + self.schedule(link) + link.meta[b'state'] = States.QUEUED + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def request_error(self, request, error): + request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/frontera/strategy/depth.py b/frontera/strategy/depth.py new file mode 100644 index 000000000..8c83b852b --- /dev/null +++ b/frontera/strategy/depth.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from frontera.core.components import States +from frontera.strategy import BaseCrawlingStrategy + + +class BreadthFirstCrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, fh): + for url in fh: + url = url.strip() + req = self.create_request(url) + self.refresh_states(req) + if req.meta[b'state'] is States.NOT_CRAWLED: + req.meta[b'state'] = States.QUEUED + req.meta[b'depth'] = 0 + self.schedule(req) + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + link.meta[b'depth'] = request.meta[b'depth'] + 1 + if link.meta[b'state'] is States.NOT_CRAWLED: + link.meta[b'state'] = States.QUEUED + self.schedule(link, self.get_score(link)) + + def request_error(self, request, error): + request.meta[b'state'] = States.ERROR + self.schedule(request, score=0.0, dont_queue=True) + + def get_score(self, link): + depth = float(link.meta[b'depth']) + return 1.0 - (depth / (depth + 1.0)) + + +class DepthFirstCrawlingStrategy(BreadthFirstCrawlingStrategy): + def get_score(self, link): + depth = float(link.meta[b'depth']) + return depth / (depth + 1.0) \ No newline at end of file diff --git a/frontera/strategy/discovery/__init__.py b/frontera/strategy/discovery/__init__.py new file mode 100644 index 000000000..9f0e77f07 --- /dev/null +++ b/frontera/strategy/discovery/__init__.py @@ -0,0 +1,523 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function, absolute_import + +from math import floor +from time import time +from zlib import crc32 + +import codecs +import logging +import random +import six +import six.moves.urllib.robotparser as robotparser +from frontera.core.components import States, DomainMetadata +from frontera.strategy import BaseCrawlingStrategy +from frontera.strategy.discovery.sitemap import parse_sitemap +from publicsuffix import PublicSuffixList +from six.moves.urllib.parse import urljoin, urlsplit +from w3lib.util import to_bytes, to_native_str + + +MAX_SITEMAPS = 100 +MAX_SUBDOMAINS = 10 +MAX_DOMAINS_REDIRECTS_STORE = 100 +SITEMAP_DOWNLOAD_MAXSIZE = 50 * 1024 * 1024 # 50MB +DEFAULT_HOME_PATHS = 
[ + '/', 'index.html', 'index.htm', + 'default.htm', 'default.html', +] +DEFAULT_HEADERS = {b'Accept-Language:': b'en-US,en'} + + +def is_home_page_url(url): + parsed_url = urlsplit(url) + # XXX prevent exceeding hard limit with parametrized home links + return not parsed_url.query and ( + not parsed_url.path or parsed_url.path in DEFAULT_HOME_PATHS) + + +def is_accessible_domain(domain): + return 'fatal_error' not in domain + + +def is_domain_to_ignore(domain, max_pages): + return (not is_accessible_domain(domain) or 'banned' in domain or domain.setdefault('queued_pages', 0) >= max_pages) + + +def justify_request_score_by_hostname(hostname, score): + hostname_crc = crc32(to_bytes(hostname, 'utf-8', 'ignore')) + perhost_score = abs(hostname_crc / 2147483647.0) + return floor(perhost_score * 10) / 10 + max(0.01, score - 0.01) / 10.0 + + +def update_domain_with_parser_data(domain, parser, url, body=None): + """Helper to update a domain metadata in the cache. + Body param is optional and can be used to drop the field. + """ + domain['_rp'] = parser + domain['rp_timestamp'] = int(time()) + domain['rp_url'] = url + domain['rp_body'] = body + if body is None: + del domain['rp_body'] + + +def consume_randomly(iterable): + """Helper to consume from iterable in random fashion. + Note that it converts an iterable to a list and keeps it in memory. + """ + data = list(iterable) + size = len(data) + while size: + index = random.randrange(size) + yield data[index] + data[index] = data[size - 1] + size -= 1 + + +def is_valid_robotstxt(lines): + for raw_line in lines: + line = raw_line.strip(u'\ufeff').lower() # '\xef\xbb\xbf' in case of bytes + if line and not line.startswith("#"): + if line.startswith("user-agent:") or line.startswith("sitemap:"): + return True + else: + return False + return False + + +class DomainCacheProxyWeb(DomainMetadata): + def __init__(self, domain_metadata): + self._domain_metadata = domain_metadata + self._set_fields = {'subdomains', 'redirect_from', 'redirect_to'} + + def __setitem__(self, key, value): + self._domain_metadata[key] = value + + def __getitem__(self, key): + value = self._domain_metadata[key] + for k, v in six.iteritems(value): + if k in self._set_fields: + value[k] = set(value[k]) + if 'rp_url' in value and 'rp_body' in value: + value['_rp'] = robotparser.RobotFileParser(value['rp_url']) + value['_rp'].parse(value['rp_body'].splitlines()) + return value + + def __contains__(self, key): + return key in self._domain_metadata + + def __delitem__(self, key): + del self._domain_metadata[key] + + def flush(self): + if hasattr(self._domain_metadata, "flush"): + self._domain_metadata.flush() + + def setdefault(self, key, default=None): + if hasattr(self._domain_metadata, "setdefault"): + return self._domain_metadata.setdefault(key, default) + try: + value = self[key] + except KeyError: + value = default + self[key] = value + return value + + +class Discovery(BaseCrawlingStrategy): + + def __init__(self, manager, args, mb_stream, states_context): + self.logger = logging.getLogger("discovery") + backend = manager.backend + self.domain_cache = DomainCacheProxyWeb(backend.domain_metadata) + + try: + psl_file = codecs.open("public_suffix_list.dat", encoding='utf8') + except IOError: + self.logger.exception("Please get the public suffix file from https://publicsuffix.org/") + raise + self._suffix_list = PublicSuffixList(psl_file) + self._states_ctx = states_context + self.states = backend.states + + self.user_agent = to_native_str(manager.settings.get('USER_AGENT')) + 
self.max_pages = int(manager.settings.get('DISCOVERY_MAX_PAGES')) + super(Discovery, self).__init__(manager, args, mb_stream, states_context) + + @classmethod + def from_worker(cls, manager, args, mb_scheduler, states_context): + return cls(manager, args, mb_scheduler, states_context) + + def close(self): + self.domain_cache.flush() + super(Discovery, self).close() + + # Handling seeds logic + + def read_seeds(self, stream): + processed, scheduled = 0, 0 + requests = [] + for line in stream: + url = to_native_str(line.strip()) + if url.startswith("#"): + continue + if not url.startswith("http"): + url = "http://" + url + "/" + try: + request = self.create_request(url, meta={b'home': True}, headers=DEFAULT_HEADERS) + requests.append(request) + if len(requests) % 40000 == 0: + scheduled += self._schedule_batch(requests) + processed += len(requests) + self.logger.info("Processed %d, scheduled %d urls.", processed, scheduled) + requests = [] + except Exception: + self.logger.exception("Error during seeds addition") + if requests: + try: + scheduled += self._schedule_batch(requests) + except Exception: + self.logger.exception("Error during seeds addition") + processed += len(requests) + self.logger.info("Processed %d, and scheduled %d urls overall.", processed, scheduled) + + def _schedule_batch(self, requests): + self.refresh_states(requests) + scheduled = self.process_seeds(requests) + self._states_ctx.release() + return scheduled + + def process_seeds(self, seeds): + """Handle and schedule a batch with seeds urls. + + We call seeds only those URLs which were injected during the crawling + bootstrapping process. So seeds cannot be found during the crawling. + """ + robots_requests = set() + scheduled = 0 + for seed in seeds: + parsed_url = urlsplit(seed.url) + robots_url = "{url.scheme}://{url.netloc}/robots.txt".format(url=parsed_url) + meta = {b'netloc': parsed_url.netloc, + b'seed': seed.url, + b'robots': True} + request = self.create_request(robots_url, meta=meta, headers=DEFAULT_HEADERS) + robots_requests.add(request) + self.refresh_states(robots_requests) + for request in robots_requests: + if self._schedule_once(request, None, score=0.9): + scheduled += 1 + else: + self.logger.warning("The seed %s was already scheduled", request.url) + return scheduled + + # Strategy main handlers section. 
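    # Illustrative trace of the seed flow implemented above (example.com is a
    # made-up host, not part of the patch): read_seeds() normalises each input
    # line into a home page URL, and process_seeds() turns it into a robots.txt
    # request that is scheduled with score 0.9.
    from six.moves.urllib.parse import urlsplit

    parsed_url = urlsplit("http://example.com/")
    robots_url = "{url.scheme}://{url.netloc}/robots.txt".format(url=parsed_url)
    assert robots_url == "http://example.com/robots.txt"
    # the actual request is built with meta {b'netloc': ..., b'seed': ..., b'robots': True}
    # and DEFAULT_HEADERS, as in process_seeds() above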
+ + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + # if redirects, response.url always contains initial url + self.logger.debug("PC %s [%d] (seed: %s)", response.url, + response.status_code, response.meta.get(b'seed')) + self._log_redirects_if_defined(response.request) + is_succeeded = response.status_code in [200, 203, 206] + netloc, _, domain = self._get_domain_after_redirects(response.request) + if b'robots' in response.meta: + if is_succeeded: + self._process_robots_txt(response, domain) + else: + self._process_robots_txt_error(netloc, response.url, domain) + elif b'sitemap' in response.meta: + if is_succeeded: + self._process_sitemap(netloc, response.body, domain) + if is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + def filter_extracted_links(self, request, links): + netloc, level_2nd_name, domain = self._get_domain_after_redirects(request) + if is_domain_to_ignore(domain, max_pages=self.max_pages): + return [] + robotparser = domain.get('_rp') + chosen_links = [] + for link in links: + if not self._is_from_same_domain(level_2nd_name, link): + continue + # validate that robots.txt allows to parse it (if defined) + if robotparser and not robotparser.can_fetch(self.user_agent, link.url): + continue + chosen_links.append(link) + # maybe ban the domain if it's eligible for ban + link_netloc = urlsplit(link.url).netloc + link_hostname, _, _ = link_netloc.partition(':') + link_2nd_level, link_domain = self._get_domain(link_netloc) + subdomains = link_domain.setdefault('subdomains', set()) + subdomains.add(link_hostname) + return chosen_links + + def links_extracted(self, request, links): + # if redirects, request.url contains final url + self.logger.debug('LE %s (seed %s) %d extracted links', + request.url, request.meta.get(b'seed'), len(links)) + self._log_redirects_if_defined(request) + _, level_2nd_name, domain = self._get_domain_after_redirects(request) + for link in links: + link.headers.update(DEFAULT_HEADERS) + self._process_links(links, domain) + + def request_error(self, request, error): + request.meta[b'state'] = States.ERROR + # if redirects, request.url always contains initial url + self.logger.debug("PE %s error: %s (seed: %s)", + request.url, error, request.meta.get(b'seed')) + self._log_redirects_if_defined(request) + netloc, _, domain = self._get_domain_after_redirects(request) + if error == 'DNSLookupError': + # marking DNS lookup error as fatal, to continue without discovery + domain['fatal_error'] = error + if b'robots' in request.meta: + self._process_robots_txt_error(netloc, request.url, domain) + elif b'sitemap' in request.meta and is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + # Additional helper handlers for robots.txt and sitemap logic. + + def _process_robots_txt(self, response, domain): + """Handle robots.txt successful response. + + The main logic behind the method is to create a RobotFileParser instance + if it's possible to decode and read robots.txt content, and save it as a + property of domain to reuse it later when deciding about need to schedule + a domain page or not. 
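        Illustrative example (not part of the patch): a well-formed body passes the
        ``is_valid_robotstxt`` check defined above and yields a parser that can be
        cached on the domain and queried later::

            lines = [u"User-agent: *", u"Disallow: /private/"]
            assert is_valid_robotstxt(lines)
            parser = robotparser.RobotFileParser("http://example.com/robots.txt")
            parser.parse(lines)
            parser.can_fetch("FronteraDiscoveryBot", "http://example.com/private/page")  # -> False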
+ """ + netloc = response.meta[b'netloc'] + domain.setdefault('queued_pages', 0) + try: + body = response.body.decode('utf-8') # TODO: use encoding from response.meta.get(b'encoding', 'utf-8') + except UnicodeDecodeError: + self.logger.warning("Error during robots.txt decoding at %s", response.url) + update_domain_with_parser_data(domain, parser=None, url=response.url) + self._schedule_home_page(netloc, domain) + return + robots_lines = body.splitlines() + parser = robotparser.RobotFileParser(response.url) + try: + if not is_valid_robotstxt(robots_lines): + raise SyntaxError("Robots.txt isn't valid") + parser.parse(robots_lines) + except Exception: + self.logger.exception("Error during robots.txt parsing at %s", response.url) + update_domain_with_parser_data(domain, parser=None, url=response.url) + self._schedule_home_page(netloc, domain) + return + requests = set() + for line in robots_lines: + if line.startswith("Sitemap:"): + _, _, url = line.partition(':') + sitemap_url = urljoin(response.url, url.strip()) + meta = {b'seed': domain.get('seed'), b'sitemap': True, + b'scrapy_meta': {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE}} + requests.add(self.create_request(sitemap_url, meta=meta, headers=DEFAULT_HEADERS)) + self.refresh_states(requests) + # schedule sitemap requests + self._schedule_requests(requests, domain, score=0.9) + if not requests: + self.logger.debug("Sitemap in robots.txt wasn't found for url %s", response.url) + update_domain_with_parser_data(domain, parser=parser, url=response.url, body=body) + # also always schedule home page regardless of scheduled sitemaps + self._schedule_home_page(netloc, domain) + + def _process_robots_txt_error(self, netloc, url, domain): + """Handle robots.txt failure response.""" + update_domain_with_parser_data(domain, parser=None, url=url) + if is_accessible_domain(domain): + self._schedule_home_page(netloc, domain) + + def _process_sitemap(self, netloc, body, domain): + """Helper to process a sitemap request's response. + + Current logic is to split sitemap body content into sub-sitemaps and other + entries, and schedule it (sub-sitemaps could be scheduled as-is with higher score, + but other links should be processed differently exactly as links extracted from + crawled page - sub-domains homepages have more priority over others requests). 
+ """ + if is_domain_to_ignore(domain, self.max_pages): + return + + requests, sitemaps = set(), set() + sitemap_scrapy_meta = {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE} + for url, sub_sitemap in parse_sitemap(body): + try: + meta = {b'seed': domain.get('seed'), b'sitemap': True, + b'scrapy_meta': sitemap_scrapy_meta} if sub_sitemap else ( + {b'home': True} if is_home_page_url(url) else {}) + request = self.create_request(url, meta=meta, headers=DEFAULT_HEADERS) + except Exception: + self.logger.exception("Error on url %s", url) + continue + sitemaps.add(request) if sub_sitemap else requests.add(request) + # 1) handle sub-sitemaps + if len(sitemaps) > MAX_SITEMAPS: + # TODO global per-host counter of sitemaps scheduled + self.logger.warning('Amount of sub-sitemaps > %d for url %s', MAX_SITEMAPS, netloc) + sitemaps = set(random.sample(sitemaps, MAX_SITEMAPS)) + self.refresh_states(sitemaps) + self._schedule_requests(sitemaps, domain, score=0.9, count=False) + + # 2) handle separate entries + # current policy is to trust sitemap data, and don't verify hostname for links + to_sample = self.max_pages - domain.get('queued_pages', 0) + if to_sample > 0 and len(requests) > to_sample: + requests = random.sample(requests, to_sample) + self.refresh_states(requests) + self._process_links(requests, domain) + + def _process_links(self, links, domain): + """Helper to process and schedule extracted links. + + The method splits a given links set into 3 parts: + - home pages for domain/sub-domain to schedule with higher score + - links of interest + - other pages + (which is a string with domain name to check for inclusion). + After splitting, the method schedules the requests. + """ + if is_domain_to_ignore(domain, self.max_pages): + return + # at first schedule home pages with higher priority, and add others to a set + home_links, interest_links, other_links = set(), set(), set() + for link in links: + link.meta[b'seed'] = domain.get('seed') + if is_home_page_url(link.url): + # XXX it may look proper to tag such links with meta[b'home'] = True, + # but it would mean trusting to any home link found among extracted, + # and lead to infinite amount of domains to crawl and infinite crawl + home_links.add(link) + elif self.is_link_of_interest(link): + interest_links.add(link) + else: + other_links.add(link) + self._schedule_requests(home_links, domain, score=0.8) + self._schedule_requests(interest_links, domain, score=0.7) + self._schedule_requests(other_links, domain, score=0.5) + + def is_link_of_interest(self, link): + """Predicate helper to match important links. + To be implemented in a subclass.""" + + # Helpers to schedule different types of requests + + # The following 2 methods accept a dict with domain metadata and control amount + # of queued pages already scheduled for the domain, please schedule all needed + # requests only via the methods. Domain metadata also must contain seed field + # to track it when validating results. + + def _schedule_home_page(self, netloc, domain): + """Schedule a domain home page. + + The method enforces setting 'seed' meta field for the request. 
+ """ + if domain.setdefault('queued_pages', 0) >= self.max_pages: + return + home_page = "http://%s/" % netloc + meta = {b'seed': domain.get('seed'), b'home': True} + request = self.create_request(home_page, meta=meta, headers=DEFAULT_HEADERS) + self.refresh_states([request]) + if self._schedule_once(request, domain, score=0.8): + domain['queued_pages'] += 1 + self.logger.debug("Scheduled home page %s", request.url) + return True + return False + + def _schedule_requests(self, requests, domain, score, count=True): + """Helper to schedule a bunch of requests in random order. + + The method schedules requests as-is w/o any modifications (except for score), + make sure you have set all needed headers/metadata/etc before calling it. + """ + scheduled = 0 + if not is_accessible_domain(domain): + return scheduled + already_queued_pages = domain.setdefault('queued_pages', 0) + # XXX to avoid converting links set to a list if enough pages + if count and already_queued_pages >= self.max_pages: + return scheduled + for request in consume_randomly(requests): + # scheduling pages randomly if they fit within limits + if count and domain['queued_pages'] >= self.max_pages: + self.logger.debug('LIMIT REACHED pages (%d) for seed %s', + domain['queued_pages'], domain['seed']) + break + if self._schedule_once(request, domain, score=score): + self.logger.debug('IL Scheduled %s', request.url) + domain['queued_pages'] += 1 + scheduled += 1 + return scheduled + + def _schedule_once(self, request, domain, score=0.1): + """Accept a request object, justify its score and schedule it. + + The method schedules a request as-is w/o any modifications (except for score), + make sure you have set all needed headers/metadata/etc before calling it. + """ + robotparser = domain.get('_rp') if domain is not None else None + if robotparser and not robotparser.can_fetch(self.user_agent, request.url): + return False + if request.meta[b'state'] != States.NOT_CRAWLED: + return False + hostname = urlsplit(request.url).hostname # hostname is already lower-cased + if not hostname: + self.logger.warning("Can't parse hostname for '%s'", repr(request.url)) + return False + final_score = justify_request_score_by_hostname(hostname, score) + self.schedule(request, final_score) + request.meta[b'state'] = States.QUEUED + return True + + # Auxiliary helpers section + + def _get_domain_after_redirects(self, request): + seed = request.meta.get(b'seed') + redirect_urls = request.meta.get(b'redirect_urls', []) + origin_url = redirect_urls[0] if redirect_urls else request.url + origin_netloc = urlsplit(origin_url).netloc + origin_2nd_name, origin_domain = self._get_domain(origin_netloc) + + if redirect_urls and (b'robots' in request.meta or b'sitemap' in request.meta or b'home' in request.meta): + final_netloc = urlsplit(redirect_urls[-1]).netloc + if final_netloc != origin_netloc: + origin_redirects = origin_domain.setdefault('redirect_to', set()) + self._extend_redirects_list(origin_redirects, final_netloc) + final_2nd_name, final_domain = self._get_domain(final_netloc) + final_redirects = final_domain.setdefault('redirect_from', set()) + self._extend_redirects_list(final_redirects, origin_netloc) + final_domain['seed'] = seed + return final_netloc, final_2nd_name, final_domain + + origin_domain['seed'] = seed + return origin_netloc, origin_2nd_name, origin_domain + + def _log_redirects_if_defined(self, request): + redirect_urls = request.meta.get(b'redirect_urls', []) + for redirect_url in redirect_urls: + self.logger.debug("REDIR %s", 
redirect_url) + + def _extend_redirects_list(self, redirects, netloc): + """Helper to add a netloc to redirects list within limit.""" + if netloc not in redirects and len(redirects) < MAX_DOMAINS_REDIRECTS_STORE: + redirects.add(netloc) + + def _get_domain(self, netloc): + """Helper to get a 2nd level domain and corresponding meta for a given netloc. + Returns a tuple with a domain name and a metadata dict from domain cache. + """ + domain = self._get_2ndlevel_name(netloc) + return domain, self.domain_cache.setdefault(domain, {}) + + def _is_from_same_domain(self, domain_name, request): + """Helper to check if a request url points to the same domain.""" + return self._get_2ndlevel_name(urlsplit(request.url).netloc) == domain_name + + def _get_2ndlevel_name(self, netloc): + """Helper to extract a host from netloc and get its public suffix.""" + hostname, _, _ = netloc.partition(':') + return self._suffix_list.get_public_suffix(hostname) diff --git a/frontera/strategy/discovery/sitemap.py b/frontera/strategy/discovery/sitemap.py new file mode 100644 index 000000000..3d3c1859a --- /dev/null +++ b/frontera/strategy/discovery/sitemap.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +from bs4 import BeautifulSoup + + +def _process_sitemap(s): + soup = BeautifulSoup(s, "lxml") + result = [] + sub_sitemaps = [] + + for loc in soup.findAll('loc'): + if loc.parent.name == 'url': + result.append(loc.text.strip()) + continue + if loc.parent.name == 'sitemap': + sub_sitemaps.append(loc.text.strip()) + continue + return result, sub_sitemaps + + +def parse_sitemap(content): + sitemap, sub_sitemaps = _process_sitemap(content) + while sitemap: + yield (sitemap.pop(), False) + while sub_sitemaps: + yield (sub_sitemaps.pop(), True) diff --git a/frontera/utils/add_seeds.py b/frontera/utils/add_seeds.py new file mode 100644 index 000000000..bbee17b11 --- /dev/null +++ b/frontera/utils/add_seeds.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +from frontera.core.manager import LocalFrontierManager +from frontera.settings import Settings +from frontera.logger.handlers import CONSOLE +from argparse import ArgumentParser +import logging +from logging.config import fileConfig +from os.path import exists + + +logger = logging.getLogger(__name__) + + +def run_add_seeds(settings, seeds_file): + fh = open(seeds_file, "rb") + + logger.info("Starting local seeds addition from file %s", seeds_file) + + manager = LocalFrontierManager.from_settings(settings) + manager.add_seeds(fh) + manager.stop() + manager.close() + + logger.info("Seeds addition finished") + + +if __name__ == '__main__': + parser = ArgumentParser(description="Frontera local add seeds utility") + parser.add_argument('--config', type=str, required=True, + help='Settings module name, should be accessible by import') + parser.add_argument('--log-level', '-L', type=str, default='INFO', + help="Log level, for ex. 
DEBUG, INFO, WARN, ERROR, FATAL") + parser.add_argument('--seeds-file', type=str, required=True, help="Seeds file path") + args = parser.parse_args() + settings = Settings(module=args.config) + logging_config_path = settings.get("LOGGING_CONFIG") + if logging_config_path and exists(logging_config_path): + fileConfig(logging_config_path, disable_existing_loggers=False) + else: + logging.basicConfig(level=args.log_level) + logger.setLevel(args.log_level) + logger.addHandler(CONSOLE) + + run_add_seeds(settings, args.seeds_file) \ No newline at end of file diff --git a/frontera/utils/fingerprint.py b/frontera/utils/fingerprint.py index 97bb55385..b491ebcb6 100644 --- a/frontera/utils/fingerprint.py +++ b/frontera/utils/fingerprint.py @@ -1,11 +1,10 @@ from __future__ import absolute_import import hashlib -from six.moves.urllib.parse import urlparse from struct import pack from binascii import hexlify from frontera.utils.misc import get_crc32 from frontera.utils.url import parse_url -from w3lib.util import to_native_str, to_bytes +from w3lib.util import to_bytes def sha1(key): @@ -27,12 +26,11 @@ def hostname_local_fingerprint(key): :return: str 20 bytes hex string """ result = parse_url(key) - if not result.hostname: - return sha1(key) - host_checksum = get_crc32(result.hostname) - doc_uri_combined = result.path+';'+result.params+result.query+result.fragment + hostname = result.hostname if result.hostname else '-' + host_checksum = get_crc32(hostname) + combined = hostname+result.path+';'+result.params+result.query+result.fragment - doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore') - doc_fprint = hashlib.md5(doc_uri_combined).digest() + combined = to_bytes(combined, 'utf8', 'ignore') + doc_fprint = hashlib.md5(combined).digest() fprint = hexlify(pack(">i16s", host_checksum, doc_fprint)) - return fprint + return fprint \ No newline at end of file diff --git a/frontera/utils/graphs/data.py b/frontera/utils/graphs/data.py index f925cdf99..3822b597d 100644 --- a/frontera/utils/graphs/data.py +++ b/frontera/utils/graphs/data.py @@ -65,17 +65,17 @@ def __len__(self): SITE_A = CrawlSiteData( name='A', description='', - pages=create_test_site('A', 4, 2)) + pages=create_test_site('http://aaa.com/', 4, 2)) SITE_B = CrawlSiteData( name='B', description='', - pages=create_test_site('B', 4, 2)) + pages=create_test_site('http://bbb.com/', 4, 2)) SITE_C = CrawlSiteData( name='C', description='', - pages=create_test_site('C', 5, 2, self_link=True)) + pages=create_test_site('http://ccc.com/', 5, 2, self_link=True)) #----------------------------------------------------- diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 867f7a4f6..90479e819 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -1,11 +1,12 @@ from __future__ import absolute_import -from frontera.core.manager import FrontierManager +from frontera.core.manager import LocalFrontierManager, SpiderFrontierManager from .converters import BaseRequestConverter, BaseResponseConverter class FrontierManagerWrapper(object): def __init__(self, settings, manager=None): - manager = manager or FrontierManager + if manager is None: + manager = LocalFrontierManager if settings.get("LOCAL_MODE") is True else SpiderFrontierManager self.manager = manager.from_settings(settings) self.request_converter = None self.response_converter = None @@ -24,10 +25,6 @@ def start(self): def stop(self): self.manager.stop() - def add_seeds(self, seeds): - frontier_seeds = [self.request_converter.to_frontier(seed) for seed 
in seeds] - self.manager.add_seeds(seeds=frontier_seeds) - def get_next_requests(self, max_next_requests=0, **kwargs): frontier_requests = self.manager.get_next_requests(max_next_requests=max_next_requests, **kwargs) return [self.request_converter.from_frontier(frontier_request) for frontier_request in frontier_requests] diff --git a/frontera/utils/misc.py b/frontera/utils/misc.py index 15731195f..7ca830a21 100644 --- a/frontera/utils/misc.py +++ b/frontera/utils/misc.py @@ -1,9 +1,22 @@ from __future__ import absolute_import -from importlib import import_module + +import time +import logging +import calendar from zlib import crc32 +from timeit import default_timer +from importlib import import_module + +import six from six.moves import range from w3lib.util import to_bytes -import six + + +logger = logging.getLogger("utils.misc") + + +def utc_timestamp(): + return calendar.timegm(time.gmtime()) def load_object(path): @@ -72,4 +85,19 @@ def dict_to_unicode(obj): if isinstance(obj, list): return map(dict_to_unicode, obj) else: - return obj \ No newline at end of file + return obj + + +class time_elapsed(object): + """Useful context manager to measure elapsed time.""" + + def __init__(self, name): + self.name = name + + def __enter__(self): + self.start = default_timer() + + def __exit__(self, ty, val, tb): + end = default_timer() + logger.debug("%s : %0.3f seconds" % (self.name, end-self.start)) + return False diff --git a/frontera/utils/msgpack.py b/frontera/utils/msgpack.py new file mode 100644 index 000000000..5f77f8607 --- /dev/null +++ b/frontera/utils/msgpack.py @@ -0,0 +1,22 @@ +import six + + +def restruct_for_pack(obj): + """Recursively walk object's hierarchy.""" + if isinstance(obj, six.text_type): + return obj + if isinstance(obj, (bool, six.integer_types, float, six.binary_type)): + return obj + elif isinstance(obj, dict): + obj = obj.copy() + for key in obj: + obj[key] = restruct_for_pack(obj[key]) + return obj + elif isinstance(obj, list) or isinstance(obj, set): + return [restruct_for_pack(item) for item in obj] + elif isinstance(obj, tuple): + return tuple(restruct_for_pack([item for item in obj])) + elif hasattr(obj, '__dict__'): + return restruct_for_pack(obj.__dict__) + else: + return None \ No newline at end of file diff --git a/frontera/utils/ossignal.py b/frontera/utils/ossignal.py new file mode 100644 index 000000000..283a98a69 --- /dev/null +++ b/frontera/utils/ossignal.py @@ -0,0 +1,17 @@ +import signal +from twisted.internet import reactor + + +def install_shutdown_handlers(function, override_sigint=True): + """Install the given function as a signal handler for all common shutdown + signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the + SIGINT handler won't be install if there is already a handler in place + (e.g. 
Pdb) + """ + signal.signal(signal.SIGTERM, function) + if signal.getsignal(signal.SIGINT) == signal.default_int_handler or \ + override_sigint: + signal.signal(signal.SIGINT, function) + # Catch Ctrl-Break in windows + if hasattr(signal, "SIGBREAK"): + signal.signal(signal.SIGBREAK, function) \ No newline at end of file diff --git a/frontera/utils/s3.py b/frontera/utils/s3.py new file mode 100644 index 000000000..ed3bec4c6 --- /dev/null +++ b/frontera/utils/s3.py @@ -0,0 +1,35 @@ +from botocore.response import StreamingBody +from io import RawIOBase + + +class StreamingBodyIOBase(RawIOBase): + def __init__(self, streaming_body, *args, **kwargs): + assert isinstance(streaming_body, StreamingBody) + self._sb = streaming_body + super(StreamingBodyIOBase, self).__init__(*args, **kwargs) + + def close(self): + self._sb.close() + + def read(self, size=-1): + if size == -1: + size = None + return self._sb.read(size) + + def readable(self, *args, **kwargs): + return self._sb._amount_read < self._sb._content_length + + def tell(self): + return self._sb._amount_read + + def seekable(self, *args, **kwargs): + return False + + def writable(self, *args, **kwargs): + return False + + def isatty(self, *args, **kwargs): + return False + + + diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 29956406f..7dd107246 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -1,9 +1,13 @@ -from __future__ import absolute_import +from __future__ import absolute_import, print_function from collections import OrderedDict, deque -from six.moves.urllib.parse import urlparse + import six + +from io import BytesIO +from os import linesep from six.moves import range +from six.moves.urllib.parse import urlparse class FrontierTester(object): @@ -31,15 +35,24 @@ def run(self, add_all_pages=False): self.frontier.stop() def _add_seeds(self): - self.frontier.add_seeds([self._make_request(seed.url) for seed in self.graph_manager.seeds]) + stream = BytesIO() + for seed in self.graph_manager.seeds: + stream.write(seed.url.encode('utf8')) + stream.write(linesep.encode('utf8')) + stream.seek(0) + self.frontier.add_seeds(stream) def _add_all(self): + stream = BytesIO() for page in self.graph_manager.pages: - if page.is_seed: - self.frontier.add_seeds([self._make_request(page.url)]) + stream.write(page.url.encode('utf8')) if not page.has_errors: for link in page.links: - self.frontier.add_seeds([self._make_request(link.url)]) + stream.write(link.url.encode('utf8')) + stream.write(linesep.encode('utf8')) + stream.seek(0) + + self.frontier.add_seeds(stream) def _make_request(self, url): r = self.frontier.request_model(url=url, @@ -138,3 +151,5 @@ def downloader_info(self): def idle(self): return len(self.slots) == 0 + + diff --git a/frontera/utils/async.py b/frontera/utils/twisted_helpers.py similarity index 100% rename from frontera/utils/async.py rename to frontera/utils/twisted_helpers.py diff --git a/frontera/worker/components/__init__.py b/frontera/worker/components/__init__.py new file mode 100644 index 000000000..74c6a8132 --- /dev/null +++ b/frontera/worker/components/__init__.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import time + +import logging +from frontera.utils.twisted_helpers import CallLaterOnce +from twisted.internet import reactor, threads + + +class DBWorkerBaseComponent(object): + + NAME = None + + def __init__(self, worker, settings, stop_event): + self.worker = worker + self.settings = settings + self.stop_event = stop_event + 
self.logger = logging.getLogger('db-worker.{}'.format(self.NAME)) + + def schedule(self, delay=0): + """Schedule component start with optional delay. + The function must return None or Deferred. + """ + raise NotImplementedError + + def run(self): + """Iteration logic, must be implemented in a subclass.""" + raise NotImplementedError + + def close(self): + """Optional cleanup logic when component loop is stopped.""" + + +class DBWorkerPeriodicComponent(DBWorkerBaseComponent): + + def __init__(self, worker, settings, stop_event, *args, **kwargs): + super(DBWorkerPeriodicComponent, self).__init__(worker, settings, stop_event) + self.periodic_task = CallLaterOnce(self.run_and_reschedule) + self.periodic_task.setErrback(self.run_errback) + + def schedule(self, delay=0): + self.periodic_task.schedule(delay) + + def run_and_reschedule(self): + if not self.stopped: + self.run() + self.periodic_task.schedule() + + def run_errback(self, failure): + self.logger.error(failure.getTraceback()) + if not self.stopped: + self.periodic_task.schedule() + + @property + def stopped(self): + return self.stop_event.is_set() + + +class DBWorkerThreadComponent(DBWorkerBaseComponent): + """Base class for DB worker component running in a separate thread. + + The class defines a single interface for DB worker components: you should + mainly implement only .run() method representing a single component iteration. + """ + + def __init__(self, worker, settings, stop_event, *args, **kwargs): + super(DBWorkerThreadComponent, self).__init__(worker, settings, stop_event) + self.run_backoff = 0 # replace it with a proper value in subclass + + def schedule(self): + return threads.deferToThread(self.loop) + + def loop(self): + """Main entrypoint for the thread running loop.""" + while not self.stop_event.is_set(): + try: + is_backoff_needed = self.run() + except Exception: + self.logger.exception('Exception in the main loop') + else: + if is_backoff_needed and self.run_backoff: + delay_msg = 'Sleep for {} seconds before next run()' + self.logger.debug(delay_msg.format(self.run_backoff)) + time.sleep(self.run_backoff) + self.logger.debug("Main loop was stopped") + + def run(self): + """Logic for single iteration of the component. + + The method must return True-ish value if backoff is needed between iteration. + """ + raise NotImplementedError + + def update_stats(self, **kwargs): + """Helper to update worker stats.""" + if reactor.running: + reactor.callFromThread(self.worker.update_stats, **kwargs) + else: + # for testing purposes + self.worker.update_stats(**kwargs) diff --git a/frontera/worker/components/batch_generator.py b/frontera/worker/components/batch_generator.py new file mode 100644 index 000000000..78f97747a --- /dev/null +++ b/frontera/worker/components/batch_generator.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +import threading +from time import asctime, time +from collections import defaultdict +from logging import DEBUG + +from frontera.exceptions import NotConfigured +from frontera.utils.url import parse_domain_from_url_fast +from . 
import DBWorkerThreadComponent + + +class BatchGenerator(DBWorkerThreadComponent): + """Component to get data from backend and send it to spider feed log.""" + + NAME = 'batchgen' + + def __init__(self, worker, settings, stop_event, + no_batches=False, partitions=None, **kwargs): + super(BatchGenerator, self).__init__(worker, settings, stop_event, **kwargs) + if no_batches: + raise NotConfigured('BatchGenerator is disabled with --no-batches') + + self.run_backoff = settings.get('NEW_BATCH_DELAY') + self.backend = worker.backend + self.spider_feed = worker.message_bus.spider_feed() + self.spider_feed_producer = self.spider_feed.producer() + + self.get_key_function = self.get_fingerprint + if settings.get('QUEUE_HOSTNAME_PARTITIONING'): + self.get_key_function = self.get_hostname + + self.domains_blacklist = settings.get('DOMAINS_BLACKLIST') + self.max_next_requests = settings.MAX_NEXT_REQUESTS + self.partitions = partitions + # create an event to disable/enable batches generation via RPC + self.disabled_event = threading.Event() + + # domain statistics logging + self.domain_stats = dict([(partition_id, defaultdict(int)) for partition_id in self.partitions]) + self.domain_stats_interval = settings.get('DOMAIN_STATS_LOG_INTERVAL') + self.rotate_time = time() + self.domain_stats_interval + + def get_ready_partitions(self): + pending_partitions = self.spider_feed.available_partitions() + if not self.partitions: + return pending_partitions + return list(set(pending_partitions) & set(self.partitions)) + + def run(self): + if self.disabled_event.is_set(): + return True + if self.logger.isEnabledFor(DEBUG) and time() > self.rotate_time: + self.rotate_and_log_domain_stats() + + partitions = self.get_ready_partitions() + if not partitions: + return True + batch_count = sum(self._handle_partition(partition_id) + for partition_id in partitions) + if not batch_count: + return True + # let's count full batches in the same way as before + self.update_stats(increments={'batches_after_start': 1}, + replacements={'last_batch_size': batch_count, + 'last_batch_generated': asctime()}) + + def _handle_partition(self, partition_id): + self.logger.info("Getting new batches for partition %d", partition_id) + count = 0 + for request in self.backend.get_next_requests(self.max_next_requests, + partitions=[partition_id]): + if self._is_domain_blacklisted(request): + continue + try: + request.meta[b'jid'] = self.worker.job_id + eo = self.worker._encoder.encode_request(request) + except Exception as e: + self.logger.error("Encoding error, %s, fingerprint: %s, url: %s" % + (e, self.get_fingerprint(request), request.url)) + count += 1 # counts as a processed request + continue + try: + self.spider_feed_producer.send(self.get_key_function(request), eo) + except Exception: + self.logger.exception("Sending message error fingerprint: %s, url: %s" % + (self.get_fingerprint(request), request.url)) + finally: + count += 1 + hostname = self.get_hostname(request) + if self.logger.isEnabledFor(DEBUG): + self.domain_stats[partition_id][hostname] += 1 + self.update_stats(increments={'pushed_since_start': count}) + return count + + def _is_domain_blacklisted(self, request): + if not self.domains_blacklist: + return + if 'domain' in request.meta: + hostname = request.meta['domain'].get('name') + else: + _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) + if hostname: + hostname = hostname.lower() + if hostname in self.domains_blacklist: + self.logger.debug("Dropping black-listed hostname, URL %s", request.url) + return 
True + return False + + def close(self): + self.spider_feed_producer.close() + + def rotate_and_log_domain_stats(self): + self.logger.debug("Domain statistics of requests pushed to spider feed") + for partition_id, host_stats in sorted(self.domain_stats.items(), key=lambda x: x[0]): + self.logger.debug("PID %d =================================================================", partition_id) + for hostname, count in host_stats.items(): + self.logger.debug("%s\t%d", hostname, count) + + self.domain_stats[partition_id] = defaultdict(int) + self.rotate_time = time() + self.domain_stats_interval + + # --------------------------- Auxiliary tools -------------------------------- + + def get_fingerprint(self, request): + return request.meta[b'fingerprint'] + + def get_hostname(self, request): + return request.meta[b'domain'][b'name'] diff --git a/frontera/worker/components/incoming_consumer.py b/frontera/worker/components/incoming_consumer.py new file mode 100644 index 000000000..43b5f83f9 --- /dev/null +++ b/frontera/worker/components/incoming_consumer.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +from time import asctime +from collections import defaultdict + +from frontera.exceptions import NotConfigured +from . import DBWorkerPeriodicComponent + + +class IncomingConsumer(DBWorkerPeriodicComponent): + """Component to get data from spider log and handle it with backend.""" + + NAME = 'incoming' + + def __init__(self, worker, settings, stop_event, no_incoming=False, **kwargs): + super(IncomingConsumer, self).__init__(worker, settings, stop_event, **kwargs) + if no_incoming: + raise NotConfigured('IncomingConsumer is disabled with --no-incoming') + + spider_log = worker.message_bus.spider_log() + self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db') + self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') + + # spider-feed is required only to handle 'offset' messages: + # check lag to decide if mark feed producer as busy or ready + # XXX not implemented for kafka message bus + self.spider_feed = worker.message_bus.spider_feed() + self.spider_feed_producer = self.spider_feed.producer() + + self.backend = worker.backend + self.max_next_requests = settings.MAX_NEXT_REQUESTS + + def run(self): + consumed, stats = 0, defaultdict(int) + for m in self.spider_log_consumer.get_messages( + timeout=1.0, count=self.spider_log_consumer_batch_size): + try: + msg = self.worker._decoder.decode(m) + except (KeyError, TypeError) as e: + self.logger.error("Decoding error: %s", e) + else: + self._handle_message(msg, stats) + finally: + consumed += 1 + """ + # TODO: Think how it should be implemented in DB-worker only mode. 
+ if not self.strategy_disabled and self._backend.finished(): + logger.info("Crawling is finished.") + reactor.stop() + """ + stats_increments = {'consumed_since_start': consumed} + stats_increments.update(stats) + self.worker.update_stats(increments=stats_increments, + replacements={'last_consumed': consumed, + 'last_consumption_run': asctime()}) + + def _handle_message(self, msg, stats): + """Base logic to safely handle a message.""" + try: + self._handle_message_by_type(msg[0], msg, stats) + except Exception: + self.logger.exception("Error while handling a message") + self.logger.debug("Message caused the error %s", str(msg)) + + def _handle_message_by_type(self, msg_type, msg, stats): + if msg_type == 'add_seeds': + _, seeds = msg + self.logger.info('Adding %i seeds', len(seeds)) + for seed in seeds: + self.logger.debug('URL: %s', seed.url) + self.backend.add_seeds(seeds) + stats['consumed_add_seeds'] += 1 + + elif msg_type == 'page_crawled': + _, response = msg + self.logger.debug("Page crawled %s", response.url) + if b'jid' not in response.meta or response.meta[b'jid'] != self.worker.job_id: + return + self.backend.page_crawled(response) + stats['consumed_page_crawled'] += 1 + + elif msg_type == 'links_extracted': + _, request, links = msg + self.logger.debug("Links extracted %s (%d)", request.url, len(links)) + if b'jid' not in request.meta or request.meta[b'jid'] != self.worker.job_id: + return + self.backend.links_extracted(request, links) + stats['consumed_links_extracted'] += 1 + + elif msg_type == 'request_error': + _, request, error = msg + self.logger.debug("Request error %s", request.url) + if b'jid' not in request.meta or request.meta[b'jid'] != self.worker.job_id: + return + self.backend.request_error(request, error) + stats['consumed_request_error'] += 1 + + elif msg_type == 'offset': + _, partition_id, offset = msg + producer_offset = self.spider_feed_producer.get_offset(partition_id) + if producer_offset is None: + return + else: + lag = producer_offset - offset + if lag < 0: + # non-sense in general, happens when SW is restarted and + # not synced yet with Spiders. + return + if lag < self.max_next_requests or offset == 0: + self.spider_feed.mark_ready(partition_id) + else: + self.spider_feed.mark_busy(partition_id) + stats['consumed_offset'] += 1 + + else: + self.logger.debug('Unknown message type %s', msg[0]) + + def close(self): + self.spider_feed_producer.close() + self.spider_log_consumer.close() diff --git a/frontera/worker/components/scoring_consumer.py b/frontera/worker/components/scoring_consumer.py new file mode 100644 index 000000000..0fdd85244 --- /dev/null +++ b/frontera/worker/components/scoring_consumer.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +from time import asctime + +from frontera.exceptions import NotConfigured +from frontera.core.components import DistributedBackend +from . 
import DBWorkerPeriodicComponent + + +class ScoringConsumer(DBWorkerPeriodicComponent): + """Component to get data from scoring log and send it to backend queue.""" + + NAME = 'scoring' + + def __init__(self, worker, settings, stop_event, no_scoring=False, **kwargs): + super(ScoringConsumer, self).__init__(worker, settings, stop_event, **kwargs) + if no_scoring: + raise NotConfigured('ScoringConsumer is disabled with --no-scoring') + if not isinstance(worker.backend, DistributedBackend): + raise NotConfigured('Strategy is disabled for non-distributed backend') + + scoring_log = worker.message_bus.scoring_log() + self.scoring_log_consumer = scoring_log.consumer() + self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE') + self.backend_queue = worker.backend.queue + + def run(self): + consumed, seen, batch = 0, set(), [] + for m in self.scoring_log_consumer.get_messages( + count=self.scoring_log_consumer_batch_size): + try: + msg = self.worker._decoder.decode(m) + except (KeyError, TypeError): + self.logger.exception("Decoding error") + continue + else: + if msg[0] == 'update_score': + _, request, score, schedule = msg + if request.meta[b'fingerprint'] not in seen: + batch.append((request.meta[b'fingerprint'], + score, request, schedule)) + seen.add(request.meta[b'fingerprint']) + elif msg[0] == 'new_job_id': + self.worker.job_id = msg[1] + finally: + consumed += 1 + self.backend_queue.schedule(batch) + self.worker.update_stats(increments={'consumed_scoring_since_start': consumed}, + replacements={'last_consumed_scoring': consumed, + 'last_consumption_run_scoring': asctime()}) + + def close(self): + self.scoring_log_consumer.close() diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 6f9abad85..d8d3a347c 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -1,280 +1,216 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import + +import os import logging +import threading from traceback import format_stack from signal import signal, SIGUSR1 -from logging.config import fileConfig +from collections import defaultdict from argparse import ArgumentParser -from time import asctime -from os.path import exists +from logging.config import fileConfig -from twisted.internet import reactor, task -from frontera.core.components import DistributedBackend -from frontera.core.manager import FrontierManager -from frontera.utils.url import parse_domain_from_url_fast -from frontera.logger.handlers import CONSOLE +import six +from twisted.internet import reactor, task, defer +from twisted.internet.defer import Deferred from frontera.settings import Settings from frontera.utils.misc import load_object -from frontera.utils.async import CallLaterOnce -from .server import WorkerJsonRpcService -import six -from six.moves import map +from frontera.logger.handlers import CONSOLE +from frontera.exceptions import NotConfigured +from frontera.core.manager import WorkerFrontierManager +from frontera.worker.server import WorkerJsonRpcService +from frontera.utils.ossignal import install_shutdown_handlers +from frontera.worker.stats import StatsExportMixin + +from .components.incoming_consumer import IncomingConsumer +from .components.scoring_consumer import ScoringConsumer +from .components.batch_generator import BatchGenerator + + +ALL_COMPONENTS = [ScoringConsumer, IncomingConsumer, BatchGenerator] +LOGGING_TASK_INTERVAL = 30 logger = logging.getLogger("db-worker") class Slot(object): - def __init__(self, new_batch, consume_incoming, consume_scoring, 
no_batches, no_scoring_log, - new_batch_delay, no_spider_log): - self.new_batch = CallLaterOnce(new_batch) - self.new_batch.setErrback(self.error) + """Slot component to manage worker components. + + Slot is responsible for scheduling all the components, modify its behaviour + and stop them gracefully on worker's discretion. + """ + def __init__(self, worker, settings, **kwargs): + # single event to stop all the components at once + self.stop_event = threading.Event() + self.components = self._load_components(worker, settings, **kwargs) + self._setup_managing_batches() + self._deferred = None + + def _load_components(self, worker, settings, **kwargs): + # each component is stored as (cls, instance) pair + components = {} + for cls in ALL_COMPONENTS: + try: + component = cls(worker, settings, stop_event=self.stop_event, **kwargs) + except NotConfigured: + logger.info("Component {} is disabled".format(cls.NAME)) + else: + components[cls] = component + if not components: + raise NotConfigured("No components to run, please check your input args") + return components + + def schedule(self): + # component.schedule() function must return None or Deferred + scheduled = [component.schedule() for component in self.components.values()] + deferred = [result for result in scheduled if isinstance(result, Deferred)] + self._deferred = defer.DeferredList(deferred) if deferred else None + + def stop(self): + """Set stop flag and return a defferred connected with all running threads.""" + self.stop_event.set() + return self._deferred if self._deferred else None - self.consumption = CallLaterOnce(consume_incoming) - self.consumption.setErrback(self.error) + def close(self): + for component in self.components.values(): + component.close() - self.scheduling = CallLaterOnce(self.schedule) - self.scheduling.setErrback(self.error) + # Additional functions to manage specific components - self.scoring_consumption = CallLaterOnce(consume_scoring) - self.scoring_consumption.setErrback(self.error) + # XXX do we actually use this feature to disable/enable new batches? 
+ # it should be easier to just stop the batchgen component and start it again when needed - self.no_batches = no_batches - self.no_scoring_log = no_scoring_log - self.no_spider_log = no_spider_log - self.new_batch_delay = new_batch_delay + def _setup_managing_batches(self): + """Save batch-gen specific event to disable/enable it via RPC calls.""" + batchgen = self.components.get(BatchGenerator) + self.batches_disabled_event = batchgen.disabled_event if batchgen else None - def error(self, f): - logger.exception(f.value) - return f + def manage_new_batches(self, enable): + if self.batches_disabled_event: + self.batches_disabled_event.clear() if enable else self.batches_disabled_event.set() - def schedule(self, on_start=False): - if on_start and not self.no_batches: - self.new_batch.schedule(0) - if not self.no_spider_log: - self.consumption.schedule() - if not self.no_batches: - self.new_batch.schedule(self.new_batch_delay) - if not self.no_scoring_log: - self.scoring_consumption.schedule() - self.scheduling.schedule(5.0) +class BaseDBWorker(object): + """Base database worker class.""" + def __init__(self, settings, no_batches, no_incoming, no_scoring, **kwargs): -class DBWorker(object): - def __init__(self, settings, no_batches, no_incoming, no_scoring): messagebus = load_object(settings.get('MESSAGE_BUS')) - self.mb = messagebus(settings) - spider_log = self.mb.spider_log() + self.message_bus = messagebus(settings) - self.spider_feed = self.mb.spider_feed() - self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db') - self.spider_feed_producer = self.spider_feed.producer() + self._manager = WorkerFrontierManager.from_settings(settings, db_worker=True) + self.backend = self._manager.backend - self._manager = FrontierManager.from_settings(settings, db_worker=True) - self._backend = self._manager.backend codec_path = settings.get('MESSAGE_BUS_CODEC') encoder_cls = load_object(codec_path+".Encoder") decoder_cls = load_object(codec_path+".Decoder") self._encoder = encoder_cls(self._manager.request_model) self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) - if isinstance(self._backend, DistributedBackend) and not no_scoring: - scoring_log = self.mb.scoring_log() - self.scoring_log_consumer = scoring_log.consumer() - self.queue = self._backend.queue - self.strategy_disabled = False - else: - self.strategy_disabled = True - self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE') - self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' - self.max_next_requests = settings.MAX_NEXT_REQUESTS - self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, - self.strategy_disabled, settings.get('NEW_BATCH_DELAY'), no_incoming) + slot_kwargs = {'no_batches': no_batches, + 'no_incoming': no_incoming, + 'no_scoring': no_scoring} + slot_kwargs.update(**kwargs) + self.slot = Slot(self, settings, **slot_kwargs) + + self.stats = defaultdict(int) self.job_id = 0 - self.stats = { - 'consumed_since_start': 0, - 'consumed_scoring_since_start': 0, - 'pushed_since_start': 0 - } self._logging_task = task.LoopingCall(self.log_status) - def set_process_info(self, process_info): - self.process_info = process_info - def run(self): def debug(sig, frame): logger.critical("Signal received: printing stack trace") logger.critical(str("").join(format_stack(frame))) - 
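MESSAGE_BUS_CODEC above is a dotted module path; the worker appends ".Encoder" and ".Decoder" and resolves both classes before instantiating them with the request/response models. A rough standalone equivalent using importlib (the real code uses frontera.utils.misc.load_object; the msgpack codec path in the comment is only an assumed example value):

```python
# Rough equivalent of the codec wiring above: resolve "<path>.Encoder" and
# "<path>.Decoder" from a dotted module path, then instantiate them.
import importlib


def load_object(path):
    module_path, _, name = path.rpartition('.')
    return getattr(importlib.import_module(module_path), name)


def load_codec(codec_path, request_model, response_model):
    encoder_cls = load_object(codec_path + '.Encoder')
    decoder_cls = load_object(codec_path + '.Decoder')
    return encoder_cls(request_model), decoder_cls(request_model, response_model)


if __name__ == '__main__':
    print(load_object('json.JSONEncoder'))    # resolving a class from a dotted path
    # e.g. (assumed setting value):
    # encoder, decoder = load_codec('frontera.contrib.backends.remote.codecs.msgpack',
    #                               Request, Response)
```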
self.slot.schedule(on_start=True) - self._logging_task.start(30) + self.slot.schedule() + self._logging_task.start(LOGGING_TASK_INTERVAL) + install_shutdown_handlers(self._handle_shutdown) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) - reactor.run() + reactor.run(installSignalHandlers=False) - def stop(self): - logger.info("Stopping frontier manager.") - self._manager.stop() + # Auxiliary methods + + def update_stats(self, replacements=None, increments=None): + if replacements: + for key, value in replacements.items(): + self.stats[key] = value + if increments: + for key, value in increments.items(): + self.stats[key] += value + + def set_process_info(self, process_info): + self.process_info = process_info def log_status(self): for k, v in six.iteritems(self.stats): logger.info("%s=%s", k, v) - def disable_new_batches(self): - self.slot.no_batches = True + # Graceful shutdown - def enable_new_batches(self): - self.slot.no_batches = False + def _handle_shutdown(self, signum, _): + def call_shutdown(): + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) - def consume_incoming(self, *args, **kwargs): - consumed = 0 - for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size): - try: - msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: - logger.error("Decoding error: %s", e) - continue - else: - try: - type = msg[0] - if type == 'add_seeds': - _, seeds = msg - logger.info('Adding %i seeds', len(seeds)) - for seed in seeds: - logger.debug('URL: %s', seed.url) - self._backend.add_seeds(seeds) - continue - if type == 'page_crawled': - _, response = msg - logger.debug("Page crawled %s", response.url) - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: - continue - self._backend.page_crawled(response) - continue - if type == 'links_extracted': - _, request, links = msg - logger.debug("Links extracted %s (%d)", request.url, len(links)) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self._backend.links_extracted(request, links) - continue - if type == 'request_error': - _, request, error = msg - logger.debug("Request error %s", request.url) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self._backend.request_error(request, error) - continue - if type == 'offset': - _, partition_id, offset = msg - producer_offset = self.spider_feed_producer.get_offset(partition_id) - if producer_offset is None: - continue - else: - lag = producer_offset - offset - if lag < 0: - # non-sense in general, happens when SW is restarted and not synced yet with Spiders. - continue - if lag < self.max_next_requests or offset == 0: - self.spider_feed.mark_ready(partition_id) - else: - self.spider_feed.mark_busy(partition_id) - continue - logger.debug('Unknown message type %s', type) - except Exception as exc: - logger.exception(exc) - if logger.isEnabledFor(logging.DEBUG): - logger.debug("Message caused the error %s", str(msg)) - continue - finally: - consumed += 1 - """ - # TODO: Think how it should be implemented in DB-worker only mode. 
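update_stats() above treats `increments` as counters to add and `replacements` as gauge-like values to overwrite, on top of the defaultdict(int) created in __init__. A tiny sketch of that contract:

```python
# Tiny sketch of the update_stats() contract: 'increments' add to counters,
# 'replacements' overwrite the stored value, and defaultdict(int) lets unseen
# counters start from zero.
from collections import defaultdict
from time import asctime

stats = defaultdict(int)


def update_stats(replacements=None, increments=None):
    for key, value in (replacements or {}).items():
        stats[key] = value
    for key, value in (increments or {}).items():
        stats[key] += value


update_stats(increments={'consumed_since_start': 128},
             replacements={'last_consumed': 128,
                           'last_consumption_run': asctime()})
update_stats(increments={'consumed_since_start': 64})
print(dict(stats))    # consumed_since_start == 192, last_* keep their last values
```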
- if not self.strategy_disabled and self._backend.finished(): - logger.info("Crawling is finished.") - reactor.stop() - """ - self.stats['consumed_since_start'] += consumed - self.stats['last_consumed'] = consumed - self.stats['last_consumption_run'] = asctime() - self.slot.schedule() - return consumed + logger.info("Received shutdown signal %d, shutting down gracefully.", signum) + reactor.callFromThread(call_shutdown) - def consume_scoring(self, *args, **kwargs): - consumed = 0 - seen = set() - batch = [] - for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size): - try: - msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: - logger.error("Decoding error: %s", e) - continue - else: - if msg[0] == 'update_score': - _, request, score, schedule = msg - if request.meta[b'fingerprint'] not in seen: - batch.append((request.meta[b'fingerprint'], score, request, schedule)) - seen.add(request.meta[b'fingerprint']) - if msg[0] == 'new_job_id': - self.job_id = msg[1] - finally: - consumed += 1 - self.queue.schedule(batch) - - self.stats['consumed_scoring_since_start'] += consumed - self.stats['last_consumed_scoring'] = consumed - self.stats['last_consumption_run_scoring'] = asctime() - self.slot.schedule() + def stop_tasks(self): + logger.info("Stopping periodic tasks.") + self._logging_task.stop() - def new_batch(self, *args, **kwargs): - def get_hostname(request): - try: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url) - except Exception as e: - logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta[b'fingerprint'], - request.url)) - return None - else: - return name.encode('utf-8', 'ignore') + d = Deferred() + d.addBoth(self._stop_slot) + d.addBoth(self._close_slot) + d.addBoth(self._perform_shutdown) + d.addBoth(self._stop_reactor) + return d - def get_fingerprint(request): - return request.meta[b'fingerprint'] + def _stop_slot(self, _=None): + logger.info("Stopping DB worker slot.") + return self.slot.stop() - partitions = self.spider_feed.available_partitions() - logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions))) - if not partitions: - return 0 + def _close_slot(self, _=None): + logger.info('Closing DB worker slot resources.') + self.slot.close() - count = 0 - if self.spider_feed_partitioning == 'hostname': - get_key = get_hostname - elif self.spider_feed_partitioning == 'fingerprint': - get_key = get_fingerprint - else: - raise Exception("Unexpected value in self.spider_feed_partitioning") + def _perform_shutdown(self, _=None): + logger.info("Stopping frontier manager.") + self._manager.stop() - for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): - try: - request.meta[b'jid'] = self.job_id - eo = self._encoder.encode_request(request) - except Exception as e: - logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, - request.meta[b'fingerprint'], - request.url)) - continue - finally: - count += 1 - self.spider_feed_producer.send(get_key(request), eo) - - self.stats['pushed_since_start'] += count - self.stats['last_batch_size'] = count - self.stats.setdefault('batches_after_start', 0) - self.stats['batches_after_start'] += 1 - self.stats['last_batch_generated'] = asctime() - return count + def _stop_reactor(self, _=None): + logger.info("Stopping reactor.") + try: + reactor.stop() + except RuntimeError: # raised if already stopped or in shutdown stage + pass + + +class 
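stop_tasks() above chains the shutdown stages on a single Deferred with addBoth(), so each stage runs even if the previous one failed, and the chain is fired from the reactor via callLater(0, d.callback, None). A self-contained sketch of that shape, where the three stage functions are placeholders rather than the worker's methods:

```python
# Self-contained sketch of the staged shutdown shape used by stop_tasks().
from twisted.internet import reactor
from twisted.internet.defer import Deferred


def stop_components(_=None):
    print('stopping component threads')


def close_resources(_=None):
    print('closing backend and message bus resources')


def stop_reactor(_=None):
    print('stopping reactor')
    try:
        reactor.stop()
    except RuntimeError:      # already stopped or shutting down
        pass


def stop_tasks():
    d = Deferred()
    d.addBoth(stop_components)
    d.addBoth(close_resources)
    d.addBoth(stop_reactor)
    return d


if __name__ == '__main__':
    reactor.callLater(0, stop_tasks().callback, None)
    reactor.run()
```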
DBWorker(StatsExportMixin, BaseDBWorker): + """Main database worker class with useful extensions. + + The additional features are provided by using mixin classes: + - sending crawl stats to message bus + """ + def get_stats_tags(self, settings, no_batches, no_incoming, no_scoring, **kwargs): + if no_batches and no_scoring: + db_worker_type = 'linksdb' + elif no_batches and no_incoming: + db_worker_type = 'scoring' + elif no_incoming and no_scoring: + db_worker_type = 'batchgen' + else: + logger.warning("Can't identify DB worker type " + "(no-scoring {}, no-batches {}, no-incoming {})" + .format(no_scoring, no_batches, no_incoming)) + db_worker_type = 'none' + tags = {'source': 'dbw-{}'.format(db_worker_type)} + # add mesos task id as a tag if running via marathon + mesos_task_id = os.environ.get('MESOS_TASK_ID') + if mesos_task_id: + tags['mesos_task_id'] = mesos_task_id + return tags if __name__ == '__main__': @@ -285,6 +221,8 @@ def get_fingerprint(request): help='Disables spider log processing.') parser.add_argument('--no-scoring', action='store_true', help='Disables scoring log processing.') + parser.add_argument('--partitions', type=int, nargs='*', + help='Optional partitions range for batch generator') parser.add_argument('--config', type=str, required=True, help='Settings module name, should be accessible by import.') parser.add_argument('--log-level', '-L', type=str, default='INFO', @@ -297,14 +235,15 @@ def get_fingerprint(request): settings.set("JSONRPC_PORT", [args.port]) logging_config_path = settings.get("LOGGING_CONFIG") - if logging_config_path and exists(logging_config_path): - fileConfig(logging_config_path) + if logging_config_path and os.path.exists(logging_config_path): + fileConfig(logging_config_path, disable_existing_loggers=False) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) - worker = DBWorker(settings, args.no_batches, args.no_incoming, args.no_scoring) + worker = DBWorker(settings, args.no_batches, args.no_incoming, + args.no_scoring, partitions=args.partitions) server = WorkerJsonRpcService(worker, settings) server.start_listening() worker.run() diff --git a/frontera/worker/server.py b/frontera/worker/server.py index a77a49bae..dd05b2e60 100644 --- a/frontera/worker/server.py +++ b/frontera/worker/server.py @@ -7,7 +7,7 @@ from twisted.web import server, resource -from frontera.utils.async import listen_tcp +from frontera.utils.twisted_helpers import listen_tcp logger = getLogger("cf-server") @@ -79,9 +79,11 @@ def __init__(self, worker): JsonResource.__init__(self) def render_GET(self, txrequest): + batches_disabled_event = self.worker.slot.batches_disabled_event + disable_new_batches = batches_disabled_event.is_set() if batches_disabled_event else None return { - 'is_finishing': self.worker.slot.is_finishing, - 'disable_new_batches': self.worker.slot.no_batches, + 'is_finishing': self.worker.slot.stop_event.is_set(), + 'disable_new_batches': disable_new_batches, 'stats': self.worker.stats } @@ -116,11 +118,11 @@ def __init__(self, worker): def process_request(self, method, jrequest): if method == 'disable_new_batches': - self.worker.disable_new_batches() + self.worker.slot.manage_new_batches(enable=False) return jsonrpc_result(jrequest['id'], "success") if method == 'enable_new_batches': - self.worker.enable_new_batches() + self.worker.slot.manage_new_batches(enable=True) return jsonrpc_result(jrequest['id'], "success") raise JsonRpcError(400, "Unknown method") @@ -148,7 +150,7 @@ def __init__(self, 
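DBWorker above gets its stats behaviour purely from the mixin order: StatsExportMixin is listed first, so its __init__/run wrap BaseDBWorker's through super(), and the concrete class only has to supply get_stats_tags(). A stripped-down sketch of that composition (BaseWorker, StatsMixin and Worker are illustrative names, not the Frontera classes):

```python
# Stripped-down sketch of the mixin-over-base composition used by the workers.
class BaseWorker(object):
    def __init__(self, settings, **kwargs):
        self.settings = settings

    def run(self):
        print('base worker run loop')


class StatsMixin(object):
    def __init__(self, settings, **kwargs):
        super(StatsMixin, self).__init__(settings, **kwargs)
        self.stats_tags = self.get_stats_tags(settings, **kwargs)

    def run(self):
        print('starting stats export with tags', self.stats_tags)
        super(StatsMixin, self).run()

    def get_stats_tags(self, settings, **kwargs):
        raise NotImplementedError


class Worker(StatsMixin, BaseWorker):
    def get_stats_tags(self, settings, **kwargs):
        return {'source': 'dbw-batchgen'}


Worker(settings={}).run()
```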
root, settings): def start_listening(self): self.port = listen_tcp(self.portrange, self.host, self) h = self.port.getHost() - logger.info('Web service listening on %(host)s:%(port)d'.format(host=h.host, port=h.port)) + logger.info('Web service listening on {host}:{port}'.format(host=h.host, port=h.port)) def stop_listening(self): self.port.stopListening() diff --git a/frontera/worker/stats.py b/frontera/worker/stats.py new file mode 100644 index 000000000..f5dee4aed --- /dev/null +++ b/frontera/worker/stats.py @@ -0,0 +1,100 @@ +from logging import getLogger +from traceback import format_tb + +from twisted.internet.task import LoopingCall + +from frontera.utils.misc import load_object, utc_timestamp + +logger = getLogger("messagebus.stats") + + +class StatsExportMixin(object): + """Extending Frontera worker's logic by sending stats to message bus. + + This is a lightweight mixin class to be used with a base worker classes + by sending stats to message bus if configured. The mixin also allows + you to define your custom logic for get_stats_tags() logic in your child + classes to store a dictionary with tags as a part of your metrics. + """ + STATS_PREFIXES = ['consumed', 'pushed', 'dropped'] + + def __init__(self, settings, *args, **kwargs): + super(StatsExportMixin, self).__init__(settings, *args, **kwargs) + message_bus = load_object(settings.get('MESSAGE_BUS'))(settings) + stats_log = message_bus.stats_log() + # FIXME can be removed after implementing stats_log for ZeroMQ bus + if not stats_log: + return + self.stats_producer = stats_log.producer() + self._stats_tags = self.get_stats_tags(settings, *args, **kwargs) + self._stats_interval = settings.get('STATS_LOG_INTERVAL', 60) + self._export_stats_task = LoopingCall(self.export_stats) + + def run(self, *args, **kwargs): + + def errback_export_stats(failure): + logger.exception(failure.value) + if failure.frames: + logger.critical(str("").join(format_tb(failure.getTracebackObject()))) + self._export_stats_task.start(interval=self._stats_interval)\ + .addErrback(errback_export_stats) + + if self.stats_producer: + self._export_stats_task.start(interval=self._stats_interval)\ + .addErrback(errback_export_stats) + super(StatsExportMixin, self).run(*args, **kwargs) + + def get_stats_tags(self, *args, **kwargs): + """Get a tags dictionary for the metrics. + + Default implementation expects that this method will provide: + - 'source' - source type of the metric, one of ['sw', 'dbw', 'spider'] + - 'partition_id' (optionally) - specific partition id + """ + raise NotImplementedError("Please define the method in a child class") + + @property + def _stats_key_prefix(self): + """Build key prefix based on the given tags. + + Default implementation of the method relies on get_stats_tags() logic, + and existence of 'source'/'partition_id' tags. + """ + prefix = self._stats_tags.get('source') + if 'partition_id' in self._stats_tags: + prefix += '-{}'.format(self._stats_tags.get('partition_id')) + return prefix + + def export_stats(self): + """Export crawl stats to message bus. 
+ + Message is formed in the following way: + - key: a prefix from _stats_key_prefix() + stats timestamp + - value: a stats dictionary packed with self._encoder + """ + stats = self.get_stats() + if not stats: + return + stats_key = '{}-{}'.format(self._stats_key_prefix, stats['_timestamp']) + # self._encoder is defined as a part of worker initialization + encoded_msg = self._encoder.encode_stats(stats) + self.stats_producer.send(stats_key, encoded_msg) + logger.debug("Sent stats for {} to message bus: {}".format(stats_key, stats)) + + def get_stats(self): + """Return default stats with a timestamp. + + It's useful to have a default implementation of the method because both + strategy and db worker store stats this way, though this logic could be + modified in a child class to redefine/transform stats data. + """ + # report only stats with given prefixes, no need to push all of them + stats = {stats_key: self.stats[stats_key] + for stats_key in self.stats + if stats_key.split('_', 1)[0] in self.STATS_PREFIXES} + stats.update(self.backend.get_stats() or {}) + if not stats: + return + stats['_timestamp'] = utc_timestamp() + stats['_tags'] = self._stats_tags + return stats \ No newline at end of file diff --git a/frontera/worker/strategies/bfs.py b/frontera/worker/strategies/bfs.py deleted file mode 100644 index 838498d7f..000000000 --- a/frontera/worker/strategies/bfs.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from six.moves.urllib.parse import urlparse -from frontera.core.components import States -from frontera.worker.strategies import BaseCrawlingStrategy - - -class CrawlingStrategy(BaseCrawlingStrategy): - - def add_seeds(self, seeds): - for seed in seeds: - if seed.meta[b'state'] is States.NOT_CRAWLED: - seed.meta[b'state'] = States.QUEUED - self.schedule(seed) - - def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED - - def links_extracted(self, request, links): - for link in links: - if link.meta[b'state'] is States.NOT_CRAWLED: - link.meta[b'state'] = States.QUEUED - self.schedule(link, self.get_score(link.url)) - - def page_error(self, request, error): - request.meta[b'state'] = States.ERROR - self.schedule(request, score=0.0, dont_queue=True) - - def get_score(self, url): - url_parts = urlparse(url) - path_parts = url_parts.path.split('/') - return 1.0 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index a33008ca3..54cda77ce 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -1,220 +1,236 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from time import asctime + import logging -from traceback import format_stack, format_tb -from signal import signal, SIGUSR1 -from logging.config import fileConfig from argparse import ArgumentParser +from binascii import hexlify +from logging.config import fileConfig from os.path import exists -from frontera.utils.misc import load_object +from random import randint +from signal import signal, SIGUSR1 +from time import asctime +from traceback import format_stack, format_tb +from collections import defaultdict -from frontera.core.manager import FrontierManager -from frontera.logger.handlers import CONSOLE +import six +from six.moves.urllib.parse import urlparse +from twisted.internet import reactor, task +from twisted.internet.defer import Deferred from twisted.internet.task import LoopingCall -from twisted.internet import reactor +from 
frontera.core.manager import WorkerFrontierManager, MessageBusUpdateScoreStream +from frontera.logger.handlers import CONSOLE from frontera.settings import Settings -from collections import Iterable -from binascii import hexlify -import six - +from frontera.utils.misc import load_object +from frontera.utils.ossignal import install_shutdown_handlers +from frontera.worker.server import WorkerJsonRpcService +from frontera.worker.stats import StatsExportMixin logger = logging.getLogger("strategy-worker") -class UpdateScoreStream(object): - def __init__(self, encoder, scoring_log_producer, size): - self._encoder = encoder - self._buffer = [] - self._producer = scoring_log_producer - self._size = size - - def send(self, request, score=1.0, dont_queue=False): - encoded = self._encoder.encode_update_score( - request, - score, - not dont_queue - ) - self._buffer.append(encoded) - if len(self._buffer) > self._size: - self.flush() - - def flush(self): - if self._buffer: - self._producer.send(None, *self._buffer) - self._buffer = [] - - -class StatesContext(object): - - def __init__(self, states): - self._requests = [] - self._states = states - self._fingerprints = set() - - def to_fetch(self, requests): - if isinstance(requests, Iterable): - self._fingerprints.update(x.meta[b'fingerprint'] for x in requests) - return - self._fingerprints.add(requests.meta[b'fingerprint']) - - def fetch(self): - self._states.fetch(self._fingerprints) - self._fingerprints.clear() - - def refresh_and_keep(self, requests): - self.to_fetch(requests) - self.fetch() - self._states.set_states(requests) - self._requests.extend(requests) - - def release(self): - self._states.update_cache(self._requests) - self._requests = [] - - def flush(self): - logger.info("Flushing states") - self._states.flush(force_clear=False) - logger.info("Flushing of states finished") - - -class StrategyWorker(object): - def __init__(self, settings, strategy_class): +class BatchedWorkflow(object): + def __init__(self, manager, scoring_stream, stats, job_id): + self.strategy = manager.strategy + self.states_context = manager.states_context + self.scoring_stream = scoring_stream + self.stats = stats + self.job_id = job_id + self.manager = manager + + self._batch = [] + + def collection_start(self): + self._batch = [] + + def process(self): + self.states_context.fetch() + for event in self._batch: + typ = event[0] + try: + if typ == 'page_crawled': + _, response = event + if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: + continue + self._on_page_crawled(response) + self.stats['consumed_page_crawled'] += 1 + continue + if typ == 'links_extracted': + _, request, links = event + if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + continue + self._on_links_extracted(request, links) + self.stats['consumed_links_extracted'] += 1 + continue + if typ == 'request_error': + _, request, error = event + if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + continue + self._on_request_error(request, error) + self.stats['consumed_request_error'] += 1 + continue + self.on_unknown_event(event) + except Exception: + logger.exception("Exception during processing") + pass + self.scoring_stream.flush() + self.states_context.release() + + def collect(self, event): + typ = event[0] + self._batch.append(event) + try: + if typ == 'page_crawled': + _, response = event + self.states_context.to_fetch(response) + return + if typ == 'links_extracted': + _, request, links = event + self.states_context.to_fetch(request) + 
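BatchedWorkflow above splits handling into two passes: collect() only buffers events and the fingerprints whose state must be looked up, and process() replays the buffered batch after a single bulk state fetch. A toy sketch of that idea with an in-memory state dict (TinyWorkflow is illustrative, not the Frontera class):

```python
# Toy sketch of the two-phase batching: buffer events, then do one bulk state
# lookup before replaying the whole batch.
class TinyWorkflow(object):
    def __init__(self, states):
        self.states = states              # fingerprint -> state string
        self._batch = []
        self._to_fetch = set()

    def collect(self, event):             # event = (type, fingerprint)
        self._batch.append(event)
        self._to_fetch.add(event[1])

    def process(self):
        # one bulk fetch instead of one lookup per event
        fetched = {fp: self.states.get(fp, 'NOT_CRAWLED') for fp in self._to_fetch}
        for typ, fp in self._batch:
            if typ == 'page_crawled':
                fetched[fp] = 'CRAWLED'
        self.states.update(fetched)       # one bulk write-back, like update_cache()
        self._batch, self._to_fetch = [], set()


wf = TinyWorkflow(states={})
wf.collect(('page_crawled', b'fp1'))
wf.collect(('links_extracted', b'fp2'))
wf.process()
print(wf.states)    # fp1 is now 'CRAWLED', fp2 stays 'NOT_CRAWLED'
```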
filtered_links = self.strategy.filter_extracted_links(request, links) + if filtered_links: + # modify last message with a new links list + self._batch[-1] = (typ, request, filtered_links) + self.states_context.to_fetch(filtered_links) + else: + # drop last message if nothing to process + self._batch.pop() + self.stats['dropped_links_extracted'] += 1 + return + if typ == 'request_error': + _, request, error = event + self.states_context.to_fetch(request) + return + if typ == 'offset': + return + self.collect_unknown_event(event) + except Exception: + logger.exception("Error during event collection") + pass + + def collect_unknown_event(self, event): + logger.debug('Unknown message %s', event) + + def on_unknown_event(self, event): + pass + + def _on_page_crawled(self, response): + logger.debug("Page crawled %s", response.url) + self.states_context.states.set_states([response]) + self.strategy.page_crawled(response) + self.states_context.states.update_cache(response) + + def _on_links_extracted(self, request, links): + logger.debug("Links extracted %s (%d)", request.url, len(links)) + for link in links: + logger.debug("URL: %s", link.url) + self.states_context.states.set_states(links) + self.strategy.links_extracted(request, links) + self.states_context.states.update_cache(links) + + def _on_request_error(self, request, error): + logger.debug("Page error %s (%s)", request.url, error) + self.states_context.states.set_states(request) + self.strategy.request_error(request, error) + self.states_context.states.update_cache(request) + + +class BaseStrategyWorker(object): + """Base strategy worker class.""" + + def __init__(self, settings, is_add_seeds_mode): partition_id = settings.get('SCORING_PARTITION_ID') if partition_id is None or type(partition_id) != int: raise AttributeError("Scoring worker partition id isn't set.") messagebus = load_object(settings.get('MESSAGE_BUS')) mb = messagebus(settings) - spider_log = mb.spider_log() scoring_log = mb.scoring_log() - self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw') + self.add_seeds_mode = is_add_seeds_mode + if not self.add_seeds_mode: + spider_log = mb.spider_log() + self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw') + self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') self.scoring_log_producer = scoring_log.producer() - self._manager = FrontierManager.from_settings(settings, strategy_worker=True) codec_path = settings.get('MESSAGE_BUS_CODEC') - encoder_cls = load_object(codec_path+".Encoder") - decoder_cls = load_object(codec_path+".Decoder") - self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) - self._encoder = encoder_cls(self._manager.request_model) + encoder_cls = load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") - self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024) - self.states_context = StatesContext(self._manager.backend.states) + request_model = load_object(settings.get('REQUEST_MODEL')) + response_model = load_object(settings.get('RESPONSE_MODEL')) + self._decoder = decoder_cls(request_model, response_model) + self._encoder = encoder_cls(request_model) + + self.update_score = MessageBusUpdateScoreStream(self.scoring_log_producer, self._encoder) + manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True, scoring_stream=self.update_score) self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.strategy = 
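The hook used above lets the strategy see raw extracted links and return only the subset worth keeping; an empty result makes the worker drop the whole links_extracted message. A sketch of one possible filter_extracted_links() policy (SimpleLink and the same-host rule are illustrative, not Frontera classes):

```python
# Sketch of a same-host, http(s)-only link filter in the shape of
# filter_extracted_links(request, links).
from collections import namedtuple
from six.moves.urllib.parse import urlparse

SimpleLink = namedtuple('SimpleLink', ['url'])


def filter_extracted_links(request, links):
    origin = urlparse(request.url).hostname
    return [link for link in links
            if urlparse(link.url).scheme in ('http', 'https')
            and urlparse(link.url).hostname == origin]


request = SimpleLink('http://example.com/index')
links = [SimpleLink('http://example.com/a'),
         SimpleLink('https://other.org/b'),
         SimpleLink('mailto:admin@example.com')]
print(filter_extracted_links(request, links))   # keeps only http://example.com/a
```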
strategy_class.from_worker(self._manager, self.update_score, self.states_context) - self.states = self._manager.backend.states - self.stats = { - 'consumed_since_start': 0 - } - self.job_id = 0 + self.stats = defaultdict(int) + self.backend = manager.backend + self.workflow = BatchedWorkflow(manager, self.update_score, self.stats, 0) self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) + self._flush_interval = settings.get("SW_FLUSH_INTERVAL") logger.info("Strategy worker is initialized and consuming partition %d", partition_id) - def collect_unknown_message(self, msg): - logger.debug('Unknown message %s', msg) - - def on_unknown_message(self, msg): - pass - - def collect_batch(self): + def work(self): consumed = 0 - batch = [] + self.workflow.collection_start() for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): try: - msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: - logger.error("Decoding error:") - logger.exception(e) + event = self._decoder.decode(m) + except (KeyError, TypeError): + logger.exception("Decoding error") logger.debug("Message %s", hexlify(m)) continue else: - type = msg[0] - batch.append(msg) - try: - if type == 'add_seeds': - _, seeds = msg - self.states_context.to_fetch(seeds) - continue - if type == 'page_crawled': - _, response = msg - self.states_context.to_fetch(response) - continue - if type == 'links_extracted': - _, request, links = msg - self.states_context.to_fetch(request) - self.states_context.to_fetch(links) - continue - if type == 'request_error': - _, request, error = msg - self.states_context.to_fetch(request) - continue - if type == 'offset': - continue - self.collect_unknown_message(msg) - except Exception as exc: - logger.exception(exc) - pass + self.workflow.collect(event) finally: consumed += 1 - return (batch, consumed) - - def process_batch(self, batch): - for msg in batch: - type = msg[0] - try: - if type == 'add_seeds': - _, seeds = msg - for seed in seeds: - seed.meta[b'jid'] = self.job_id - self.on_add_seeds(seeds) - continue - if type == 'page_crawled': - _, response = msg - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: - continue - self.on_page_crawled(response) - continue - if type == 'links_extracted': - _, request, links = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self.on_links_extracted(request, links) - continue - if type == 'request_error': - _, request, error = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: - continue - self.on_request_error(request, error) - continue - self.on_unknown_message(msg) - except Exception as exc: - logger.exception(exc) - pass - - def work(self): - batch, consumed = self.collect_batch() - self.states_context.fetch() - self.process_batch(batch) - self.update_score.flush() - self.states_context.release() + self.workflow.process() # Exiting, if crawl is finished - if self.strategy.finished(): + if self.workflow.strategy.finished(): logger.info("Successfully reached the crawling goal.") - logger.info("Closing crawling strategy.") - self.strategy.close() logger.info("Finishing.") - reactor.callFromThread(reactor.stop) + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) self.stats['last_consumed'] = consumed self.stats['last_consumption_run'] = asctime() self.stats['consumed_since_start'] += consumed - def run(self): + def add_seeds(self, seeds_url): + 
logger.info("Seeds addition started from url %s", seeds_url) + strategy = self.workflow.strategy + if not seeds_url: + strategy.read_seeds(None) + else: + parsed = urlparse(seeds_url) + if parsed.scheme == "s3": + import boto3 + from frontera.utils.s3 import StreamingBodyIOBase + s3 = boto3.resource("s3") + path = parsed.path.lstrip("/") + obj = s3.Object(parsed.hostname, path) + response = obj.get() + fh = StreamingBodyIOBase(response['Body']) + elif parsed.scheme == "file": + fh = open(parsed.path, "rb") + else: + raise TypeError("Unsupported URL scheme") + strategy.read_seeds(fh) + try: + fh.close() + except Exception: + logger.exception("Error during closing of seeds stream") + pass + self.update_score.flush() + self.workflow.states_context.release() + + def run(self, seeds_url): def log_failure(failure): logger.exception(failure.value) if failure.frames: @@ -224,61 +240,95 @@ def errback_main(failure): log_failure(failure) self.task.start(interval=0).addErrback(errback_main) + def run_flush_states_task(): + (self._flush_states_task.start(interval=self._flush_interval) + .addErrback(errback_flush_states)) + def errback_flush_states(failure): log_failure(failure) - self._flush_states_task.start(interval=300).addErrback(errback_flush_states) + run_flush_states_task() def debug(sig, frame): logger.critical("Signal received: printing stack trace") logger.critical(str("").join(format_stack(frame))) - self.task.start(interval=0).addErrback(errback_main) - self._logging_task.start(interval=30) - self._flush_states_task.start(interval=300).addErrback(errback_flush_states) + install_shutdown_handlers(self._handle_shutdown) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) - reactor.run() + if self.add_seeds_mode: + self.add_seeds(seeds_url) + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) + else: + self.task.start(interval=0).addErrback(errback_main) + self._logging_task.start(interval=30) + # run flushing states LoopingCall with random delay + flush_states_task_delay = randint(0, self._flush_interval) + logger.info("Starting flush-states task in %d seconds", flush_states_task_delay) + task.deferLater(reactor, flush_states_task_delay, run_flush_states_task) + + reactor.run(installSignalHandlers=False) def log_status(self): for k, v in six.iteritems(self.stats): logger.info("%s=%s", k, v) def flush_states(self): - self.states_context.flush() - - def stop(self): - logger.info("Closing crawling strategy.") - self.strategy.close() - logger.info("Stopping frontier manager.") - self._manager.stop() - - def on_add_seeds(self, seeds): - logger.debug('Adding %i seeds', len(seeds)) - for seed in seeds: - logger.debug("URL: %s", seed.url) - self.states.set_states(seeds) - self.strategy.add_seeds(seeds) - self.states.update_cache(seeds) - - def on_page_crawled(self, response): - logger.debug("Page crawled %s", response.url) - self.states.set_states([response]) - self.strategy.page_crawled(response) - self.states.update_cache(response) - - def on_links_extracted(self, request, links): - logger.debug("Links extracted %s (%d)", request.url, len(links)) - for link in links: - logger.debug("URL: %s", link.url) - self.states.set_states(links) - self.strategy.links_extracted(request, links) - self.states.update_cache(links) - - def on_request_error(self, request, error): - logger.debug("Page error %s (%s)", request.url, error) - self.states.set_states(request) - self.strategy.page_error(request, error) - self.states.update_cache(request) + 
self.workflow.states_context.flush() + + def _handle_shutdown(self, signum, _): + def call_shutdown(): + d = self.stop_tasks() + reactor.callLater(0, d.callback, None) + + logger.info("Received shutdown signal %d, shutting down gracefully.", signum) + reactor.callFromThread(call_shutdown) + + def stop_tasks(self): + logger.info("Stopping periodic tasks.") + if self.task.running: + self.task.stop() + if self._flush_states_task.running: + self._flush_states_task.stop() + if self._logging_task.running: + self._logging_task.stop() + + d = Deferred() + d.addBoth(self._perform_shutdown) + d.addBoth(self._stop_reactor) + return d + + def _stop_reactor(self, _=None): + logger.info("Stopping reactor.") + try: + reactor.stop() + except RuntimeError: # raised if already stopped or in shutdown stage + pass + + def _perform_shutdown(self, _=None): + try: + self.flush_states() + logger.info("Stopping frontier manager.") + self.workflow.manager.close() + logger.info("Closing message bus.") + self.scoring_log_producer.close() + if not self.add_seeds_mode: + self.consumer.close() + except Exception: + logger.exception('Error on shutdown') + + def set_process_info(self, process_info): + self.process_info = process_info + + +class StrategyWorker(StatsExportMixin, BaseStrategyWorker): + """Main strategy worker class with useful extensions. + + The additional features are provided by using mixin classes: + - sending crawl stats to message bus + """ + + def get_stats_tags(self, settings, *args, **kwargs): + return {'source': 'sw', 'partition_id': settings.get('SCORING_PARTITION_ID')} def setup_environment(): @@ -291,13 +341,20 @@ def setup_environment(): help='Crawling strategy class path') parser.add_argument('--partition-id', type=int, help="Instance partition id.") + parser.add_argument('--port', type=int, help="Json Rpc service port to listen.") + parser.add_argument('--args', '-a', nargs='*', type=str, help="Optional arguments for crawling strategy, " + "in a form of key=value separated with space") + parser.add_argument('--add-seeds', action='store_true', help="Run in add seeds mode. Worker finishes after running " + "of strategy add_seeds method") + parser.add_argument('--seeds-url', type=str, help="Seeds url. S3 and native urlopen schemas are currently " + "supported, implies add seeds run mode") args = parser.parse_args() settings = Settings(module=args.config) - strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') + strategy_classpath = args.strategy if args.strategy else settings.get('STRATEGY') if not strategy_classpath: raise ValueError("Couldn't locate strategy class path. 
Please supply it either using command line option or " "settings file.") - strategy_class = load_object(strategy_classpath) + settings.set('STRATEGY', strategy_classpath) partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: @@ -305,17 +362,30 @@ def setup_environment(): partition_id) settings.set('SCORING_PARTITION_ID', partition_id) + if args.port: + settings.set('JSONRPC_PORT', args.port) + + strategy_args = {} + if args.args: + for arg in args.args: + key, _, value = arg.partition("=") + strategy_args[key] = value if value else None + settings.set("STRATEGY_ARGS", strategy_args) + logging_config_path = settings.get("LOGGING_CONFIG") if logging_config_path and exists(logging_config_path): - fileConfig(logging_config_path) + fileConfig(logging_config_path, disable_existing_loggers=False) else: logging.basicConfig(level=args.log_level) logger.setLevel(args.log_level) logger.addHandler(CONSOLE) - return settings, strategy_class + + return settings, args.add_seeds, args.seeds_url if __name__ == '__main__': - settings, strategy_class = setup_environment() - worker = StrategyWorker(settings, strategy_class) - worker.run() + settings, is_add_seeds_mode, seeds_url = setup_environment() + worker = StrategyWorker(settings, is_add_seeds_mode) + server = WorkerJsonRpcService(worker, settings) + server.start_listening() + worker.run(seeds_url) diff --git a/requirements.txt b/requirements.txt index 7c718e7af..ac8e2ab03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ six>=1.8.0 w3lib>=1.15.0 +cityhash>=0.1.7 diff --git a/requirements/tests.txt b/requirements/tests.txt index 0ac170f54..0fa21a808 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -1,3 +1,4 @@ +flaky pytest>=2.6.4 PyMySQL>=0.6.3 psycopg2>=2.5.4 @@ -6,10 +7,13 @@ scrapy>=0.24 SQLAlchemy>=1.0.0 cachetools pyzmq -msgpack-python -kafka-python>=1.0.0 +msgpack-python>=0.4 +kafka-python>=1.4.0 pytest-cov happybase>=1.0.0 mock boto>=2.42.0 -r logging.txt +redis>=2.10.5 +hiredis>=0.2 +cityhash>=0.1.7 diff --git a/setup.py b/setup.py index e498c97b0..11bff20f3 100644 --- a/setup.py +++ b/setup.py @@ -34,13 +34,16 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Software Development :: Libraries :: Application Frameworks', 'Topic :: Software Development :: Libraries :: Python Modules', ], install_requires=[ 'six>=1.8.0', - 'w3lib>=1.15.0' + 'w3lib>=1.15.0', + 'cityhash>=0.1.7' ], extras_require={ 'sql': [ @@ -60,17 +63,29 @@ 'tldextract>=1.5.1', ], 'hbase': [ - 'happybase>=1.0.0' + 'happybase>=1.0.0', + 'thriftpy2' ], 'zeromq': [ 'pyzmq', - 'msgpack-python' + 'msgpack-python>=0.4' ], 'kafka': [ - 'kafka-python>=1.0.0' + 'kafka-python>=1.4.0' ], 'distributed': [ 'Twisted' + ], + 's3': [ + 'boto3' + ], + 'redis': [ + 'redis>=2.10.5', + 'hiredis>=0.2' + ], + 'strategies': [ + 'beautifulsoup4', + 'publicsuffix' ] }, tests_require=[ @@ -79,11 +94,14 @@ "psycopg2>=2.5.4", "scrapy>=0.24", "tldextract>=1.5.1", + 'thriftpy2', "SQLAlchemy>=1.0.0", "cachetools", "mock", "boto>=2.42.0", "colorlog>=2.4.0", - "python-json-logger>=0.1.5" + "python-json-logger>=0.1.5", + "redis>=2.10.5", + "hiredis>=0.2" ] ) diff --git a/tests/backends.py b/tests/backends.py index f3cdab956..253855695 100644 --- 
a/tests/backends.py +++ b/tests/backends.py @@ -1,11 +1,78 @@ from __future__ import absolute_import import pytest -from frontera import FrontierManager, Settings, FrontierTester +from frontera.core.components import States +from frontera.core.manager import LocalFrontierManager +from frontera.strategy import BaseCrawlingStrategy +from frontera import Settings, FrontierTester from frontera.utils import graphs from frontera.utils.tester import BaseDownloaderSimulator +class BasicCrawlingStrategy(BaseCrawlingStrategy): + def __init__(self, manager, args, scheduled_stream, states_context): + super(BasicCrawlingStrategy, self).__init__(manager, args, scheduled_stream, states_context) + self._id = 0 + + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self._create_request(url) + self.schedule(r) + + def _create_request(self, url): + r = self.create_request(url=url, + headers={ + b'X-Important-Header': b'Frontera' + }, + method=b'POST', + cookies={b'currency': b'USD'}, + meta={b'this_param': b'should be passed over', + b'id': self._id}) + self._id += 1 + return r + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + self.schedule(self._create_request(link.url)) + link.meta[b'state'] = States.QUEUED + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def request_error(self, request, error): + request.meta[b'state'] = States.ERROR + + +class DFSCrawlingStrategy(BasicCrawlingStrategy): + def read_seeds(self, stream): + for url in stream: + url = url.strip() + r = self._create_request(url) + r.meta[b'depth'] = 0 + self.schedule(r, self._get_score(r.meta[b'depth'])) + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + r = self._create_request(link.url) + r.meta[b'depth'] = request.meta[b'depth'] + 1 + self.schedule(r, self._get_score(r.meta[b'depth'])) + link.meta[b'state'] = States.QUEUED + + def _get_score(self, depth): + return 1.0 / (depth + 1.0) + + +class BFSCrawlingStrategy(DFSCrawlingStrategy): + def _get_score(self, depth): + return float(depth) / 10.0 + + class BackendTest(object): """ A simple pytest base class with helper methods for @@ -38,14 +105,15 @@ def get_frontier(self): """ Returns frontierManager object """ - return FrontierManager.from_settings(self.get_settings()) + return LocalFrontierManager.from_settings(self.get_settings()) def get_settings(self): """ Returns backend settings """ return Settings(attributes={ - 'BACKEND': self.backend_class + 'BACKEND': self.backend_class, + 'STRATEGY': 'tests.backends.BasicCrawlingStrategy' }) @@ -115,7 +183,7 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): # Get sequence sequence = self.get_url_sequence(site_list, max_next_requests) - #print [str(n) for n in sequence] + #print ([str(n) for n in sequence]) # Assert sequence equals expected assert len(sequence) == len(expected_sequence) @@ -123,31 +191,30 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): class FIFOBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 
'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' - ], + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' + ] + , "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } + @pytest.mark.parametrize( ('site_list', 'max_next_requests', 'expected_sequence'), [ @@ -182,97 +249,97 @@ class LIFOBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A12', - 'A122', 'A1222', 'A1221', - 'A121', 'A1212', 'A1211', - 'A11', - 'A112', 'A1122', 'A1121', - 'A111', 'A1112', 'A1111' + 'http://aaa.com/1', + 'http://aaa.com/12', + 'http://aaa.com/122', 'http://aaa.com/1222', 'http://aaa.com/1221', + 'http://aaa.com/121', 'http://aaa.com/1212', 'http://aaa.com/1211', + 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/111', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_01_B": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221'], + 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', + 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221'], "SEQUENCE_01_C": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121' + 'http://aaa.com/1', + 
'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121' ], "SEQUENCE_02_A": [ - 'B1', - 'B12', 'B122', 'B1222', 'B1221', 'B121', 'B1212', 'B1211', - 'B11', 'B112', 'B1122', 'B1121', 'B111', 'B1112', 'B1111', - 'A1', - 'A12', 'A122', 'A1222', 'A1221', 'A121', 'A1212', 'A1211', - 'A11', 'A112', 'A1122', 'A1121', 'A111', 'A1112', 'A1111' + 'http://bbb.com/1', + 'http://bbb.com/12', 'http://bbb.com/122', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/121', 'http://bbb.com/1212', 'http://bbb.com/1211', + 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/1122', 'http://bbb.com/1121', 'http://bbb.com/111', 'http://bbb.com/1112', 'http://bbb.com/1111', + 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/122', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/121', 'http://aaa.com/1212', 'http://aaa.com/1211', + 'http://aaa.com/11', 'http://aaa.com/112', 'http://aaa.com/1122', 'http://aaa.com/1121', 'http://aaa.com/111', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_02_B": [ - 'B1', 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', - 'B12', 'B11', - 'B112', 'B111', - 'B1112', 'B1111', 'B1122', 'B1121', - 'B122', 'B121', - 'B1212', 'B1211', 'B1222', 'B1221' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', + 'http://aaa.com/112', 'http://aaa.com/111', + 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', + 'http://bbb.com/12', 'http://bbb.com/11', + 'http://bbb.com/112', 'http://bbb.com/111', + 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121', + 'http://bbb.com/122', 'http://bbb.com/121', + 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221' ], "SEQUENCE_02_C": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', - 'A1122', 'A1121', 'B1212', 'B1211', 'B1222', 'B1221', 'B1112', 'B1111', 'B1122', 'B1121', - 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 'http://aaa.com/11', 'http://bbb.com/12', 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/111', 'http://bbb.com/122', 'http://bbb.com/121', 'http://aaa.com/112', + 'http://aaa.com/1122', 'http://aaa.com/1121', 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121', + 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111' ], "SEQUENCE_02_D": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121', - 'B1212', 'B1211', 'B1222', 'B1221', 'B1112', 'B1111', 'B1122', 'B1121' + 'http://bbb.com/1', 'http://aaa.com/1', + 'http://aaa.com/12', 
'http://aaa.com/11', 'http://bbb.com/12', 'http://bbb.com/11', 'http://bbb.com/112', 'http://bbb.com/111', 'http://bbb.com/122', 'http://bbb.com/121', 'http://aaa.com/112', 'http://aaa.com/111', 'http://aaa.com/122', 'http://aaa.com/121', + 'http://aaa.com/1212', 'http://aaa.com/1211', 'http://aaa.com/1222', 'http://aaa.com/1221', 'http://aaa.com/1112', 'http://aaa.com/1111', 'http://aaa.com/1122', 'http://aaa.com/1121', + 'http://bbb.com/1212', 'http://bbb.com/1211', 'http://bbb.com/1222', 'http://bbb.com/1221', 'http://bbb.com/1112', 'http://bbb.com/1111', 'http://bbb.com/1122', 'http://bbb.com/1121' ], "SEQUENCE_03_A": [ - 'C1', 'C12', 'C122', 'C1222', 'C12222', 'C12221', 'C1221', 'C12212', 'C12211', - 'C121', 'C1212', 'C12122', 'C12121', 'C1211', 'C12112', 'C12111', - 'C11', 'C112', 'C1122', 'C11222', 'C11221', 'C1121', 'C11212', 'C11211', - 'C111', 'C1112', 'C11122', 'C11121', 'C1111', 'C11112', 'C11111' + 'http://ccc.com/1', 'http://ccc.com/12', 'http://ccc.com/122', 'http://ccc.com/1222', 'http://ccc.com/12222', 'http://ccc.com/12221', 'http://ccc.com/1221', 'http://ccc.com/12212', 'http://ccc.com/12211', + 'http://ccc.com/121', 'http://ccc.com/1212', 'http://ccc.com/12122', 'http://ccc.com/12121', 'http://ccc.com/1211', 'http://ccc.com/12112', 'http://ccc.com/12111', + 'http://ccc.com/11', 'http://ccc.com/112', 'http://ccc.com/1122', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/1121', 'http://ccc.com/11212', 'http://ccc.com/11211', + 'http://ccc.com/111', 'http://ccc.com/1112', 'http://ccc.com/11122', 'http://ccc.com/11121', 'http://ccc.com/1111', 'http://ccc.com/11112', 'http://ccc.com/11111' ], "SEQUENCE_03_B": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', - 'C1112', 'C1111', 'C11112', 'C11111', 'C11122', 'C11121', - 'C1122', 'C1121', 'C11212', 'C11211', 'C11222', 'C11221', - 'C122', 'C121', - 'C1212', 'C1211', 'C12112', 'C12111', 'C12122', 'C12121', - 'C1222', 'C1221', 'C12212', 'C12211', 'C12222', 'C12221' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', + 'http://ccc.com/1112', 'http://ccc.com/1111', 'http://ccc.com/11112', 'http://ccc.com/11111', 'http://ccc.com/11122', 'http://ccc.com/11121', + 'http://ccc.com/1122', 'http://ccc.com/1121', 'http://ccc.com/11212', 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', + 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/12112', 'http://ccc.com/12111', 'http://ccc.com/12122', 'http://ccc.com/12121', + 'http://ccc.com/1222', 'http://ccc.com/1221', 'http://ccc.com/12212', 'http://ccc.com/12211', 'http://ccc.com/12222', 'http://ccc.com/12221' ], "SEQUENCE_03_C": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', 'C1112', - 'C11122', 'C11121', 'C12212', 'C12211', - 'C12222', 'C12221', 'C12112', 'C12111', - 'C12122', 'C12121', - 'C1111', 'C1122', 'C1121', 'C11212', - 'C11211', 'C11222', 'C11221', 'C11112', 'C11111' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/1222', 'http://ccc.com/1221', 'http://ccc.com/1112', + 'http://ccc.com/11122', 'http://ccc.com/11121', 'http://ccc.com/12212', 'http://ccc.com/12211', + 'http://ccc.com/12222', 'http://ccc.com/12221', 'http://ccc.com/12112', 'http://ccc.com/12111', + 'http://ccc.com/12122', 'http://ccc.com/12121', + 
'http://ccc.com/1111', 'http://ccc.com/1122', 'http://ccc.com/1121', 'http://ccc.com/11212', + 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/11112', 'http://ccc.com/11111' ], "SEQUENCE_03_D": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', - 'C1112', 'C1111', 'C1122', 'C1121', - 'C11212', 'C11211', 'C11222', 'C11221', 'C11112', 'C11111', 'C11122', 'C11121', - 'C12212', 'C12211', 'C12222', 'C12221', 'C12112', 'C12111', 'C12122', 'C12121' + 'http://ccc.com/1', + 'http://ccc.com/12', 'http://ccc.com/11', + 'http://ccc.com/112', 'http://ccc.com/111', 'http://ccc.com/122', 'http://ccc.com/121', + 'http://ccc.com/1212', 'http://ccc.com/1211', 'http://ccc.com/1222', 'http://ccc.com/1221', + 'http://ccc.com/1112', 'http://ccc.com/1111', 'http://ccc.com/1122', 'http://ccc.com/1121', + 'http://ccc.com/11212', 'http://ccc.com/11211', 'http://ccc.com/11222', 'http://ccc.com/11221', 'http://ccc.com/11112', 'http://ccc.com/11111', 'http://ccc.com/11122', 'http://ccc.com/11121', + 'http://ccc.com/12212', 'http://ccc.com/12211', 'http://ccc.com/12222', 'http://ccc.com/12221', 'http://ccc.com/12112', 'http://ccc.com/12111', 'http://ccc.com/12122', 'http://ccc.com/12121' ], } @@ -310,108 +377,108 @@ class DFSBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A111', 'A1111', 'A1112', 'A112', 'A1121', 'A1122', - 'A12', 'A121', 'A1211', 'A1212', 'A122', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/111', 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/12', 'http://aaa.com/121', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/122', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_01_B": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_01_C": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', - 'A11', - 'A111', 'A1111', 'A1112', - 'A112', 'A1121', 'A1122', - 'A12', - 'A121', 'A1211', 'A1212', - 'A122', 'A1221', 'A1222', - 'B1', - 'B11', - 'B111', 'B1111', 'B1112', - 'B112', 'B1121', 'B1122', - 'B12', - 'B121', 'B1211', 'B1212', - 'B122', 'B1221', 'B1222' + 'http://aaa.com/1', + 'http://aaa.com/11', + 'http://aaa.com/111', 'http://aaa.com/1111', 'http://aaa.com/1112', + 'http://aaa.com/112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/12', + 'http://aaa.com/121', 'http://aaa.com/1211', 'http://aaa.com/1212', + 'http://aaa.com/122', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1', + 'http://bbb.com/11', + 
'http://bbb.com/111', 'http://bbb.com/1111', 'http://bbb.com/1112', + 'http://bbb.com/112', 'http://bbb.com/1121', 'http://bbb.com/1122', + 'http://bbb.com/12', + 'http://bbb.com/121', 'http://bbb.com/1211', 'http://bbb.com/1212', + 'http://bbb.com/122', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_B": [ - 'A1', 'B1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222', - 'B11', 'B12', - 'B111', 'B112', - 'B1111', 'B1112', 'B1121', 'B1122', - 'B121', 'B122', - 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/11', 'http://bbb.com/12', + 'http://bbb.com/111', 'http://bbb.com/112', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', + 'http://bbb.com/121', 'http://bbb.com/122', + 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_C": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', 'B1111', 'B1112', - 'B112', 'B121', 'B122', - 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', 'http://bbb.com/1111', 'http://bbb.com/1112', + 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_02_D": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', - 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_03_A": [ - 'C1', - 'C11', - 'C111', 'C1111', 'C11111', 'C11112', 'C1112', 'C11121', 'C11122', - 'C112', 'C1121', 'C11211', 'C11212', 'C1122', 'C11221', 'C11222', - 'C12', - 'C121', 'C1211', 'C12111', 'C12112', 'C1212', 'C12121', 'C12122', - 'C122', 'C1221', 'C12211', 'C12212', 'C1222', 'C12221', 'C12222' + 'http://ccc.com/1', + 
'http://ccc.com/11', + 'http://ccc.com/111', 'http://ccc.com/1111', 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/1112', 'http://ccc.com/11121', 'http://ccc.com/11122', + 'http://ccc.com/112', 'http://ccc.com/1121', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/1122', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12', + 'http://ccc.com/121', 'http://ccc.com/1211', 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/1212', 'http://ccc.com/12121', 'http://ccc.com/12122', + 'http://ccc.com/122', 'http://ccc.com/1221', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/1222', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_B": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', - 'C1111', 'C1112', - 'C11111', 'C11112', 'C11121', 'C11122', - 'C1121', 'C1122', - 'C11211', 'C11212', 'C11221', 'C11222', - 'C121', 'C122', - 'C1211', 'C1212', - 'C12111', 'C12112', 'C12121', 'C12122', - 'C1221', 'C1222', - 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', + 'http://ccc.com/1111', 'http://ccc.com/1112', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', + 'http://ccc.com/1121', 'http://ccc.com/1122', + 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1211', 'http://ccc.com/1212', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', + 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_C": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', 'C12111', 'C12112', - 'C1212', 'C1221', 'C1222', - 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', 'http://ccc.com/12111', 'http://ccc.com/12112', + 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], "SEQUENCE_03_D": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 
'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } @@ -444,31 +511,40 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): max_next_requests=max_next_requests, ) + def get_settings(self): + settings = super(DFSBackendTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 'tests.backends.DFSCrawlingStrategy' + return settings + class BFSBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A1211', 'A1212', 'A1221', 'A1222' + 'http://aaa.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', + 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222' ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + 'http://aaa.com/1', 'http://bbb.com/1', + 'http://aaa.com/11', 'http://aaa.com/12', 'http://bbb.com/11', 'http://bbb.com/12', + 'http://aaa.com/111', 'http://aaa.com/112', 'http://aaa.com/121', 'http://aaa.com/122', 'http://bbb.com/111', 'http://bbb.com/112', 'http://bbb.com/121', 'http://bbb.com/122', + 'http://aaa.com/1111', 'http://aaa.com/1112', 'http://aaa.com/1121', 'http://aaa.com/1122', 'http://aaa.com/1211', 'http://aaa.com/1212', 'http://aaa.com/1221', 'http://aaa.com/1222', + 'http://bbb.com/1111', 'http://bbb.com/1112', 'http://bbb.com/1121', 'http://bbb.com/1122', 'http://bbb.com/1211', 'http://bbb.com/1212', 'http://bbb.com/1221', 'http://bbb.com/1222' ], "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + 'http://ccc.com/1', + 'http://ccc.com/11', 'http://ccc.com/12', + 'http://ccc.com/111', 'http://ccc.com/112', 'http://ccc.com/121', 'http://ccc.com/122', + 'http://ccc.com/1111', 'http://ccc.com/1112', 'http://ccc.com/1121', 'http://ccc.com/1122', 'http://ccc.com/1211', 'http://ccc.com/1212', 'http://ccc.com/1221', 'http://ccc.com/1222', + 'http://ccc.com/11111', 'http://ccc.com/11112', 'http://ccc.com/11121', 'http://ccc.com/11122', 'http://ccc.com/11211', 'http://ccc.com/11212', 'http://ccc.com/11221', 'http://ccc.com/11222', + 'http://ccc.com/12111', 'http://ccc.com/12112', 'http://ccc.com/12121', 'http://ccc.com/12122', 'http://ccc.com/12211', 'http://ccc.com/12212', 'http://ccc.com/12221', 'http://ccc.com/12222' ], } @@ -500,6 +576,14 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): expected_sequence=self.EXPECTED_SEQUENCES[expected_sequence], 
max_next_requests=max_next_requests, ) + def get_settings(self): + settings = super(BFSBackendTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 'tests.backends.BFSCrawlingStrategy' + return settings class RANDOMBackendTest(BackendSequenceTest): diff --git a/tests/contrib/backends/hbase/test_domain_cache.py b/tests/contrib/backends/hbase/test_domain_cache.py new file mode 100644 index 000000000..34eb733c8 --- /dev/null +++ b/tests/contrib/backends/hbase/test_domain_cache.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +from frontera.contrib.backends.hbase.domaincache import DomainCache +from happybase import Connection +import logging +import unittest + + +class TestDomainCache(unittest.TestCase): + def setUp(self): + logging.basicConfig(level=logging.DEBUG) + self.conn = Connection(host="hbase-docker") + if b'domain_metadata' not in self.conn.tables(): + self.conn.create_table('domain_metadata', { + 'm': {'max_versions': 1, 'block_cache_enabled': 1,} + }) + t = self.conn.table('domain_metadata') + t.delete('d1') + t.delete('d2') + t.delete('d3') + t.delete('d4') + + def test_domain_cache_both_generations(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + + # eviction should happen + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc['d1'] == {'domain': 1} + assert dc['d2'] == {'domain': 2} + assert dc['d3'] == {'domain': [3, 2, 1]} + assert dc['d4'] == {'domain': 4} + + def test_domain_cache_get_with_default(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.get('d1', {}) == {'domain': 1} + assert dc.get('d3', {}) == {'domain': [3, 2, 1]} + + def test_domain_cache_setdefault(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.setdefault('d1', {}) == {'domain': 1} + assert dc.setdefault('d5', {'domain': 6}) == {'domain': 6} + dc.flush() + assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]} + + def test_domain_cache_setdefault_with_second_gen_flush(self): + dc = DomainCache(2, self.conn, 'domain_metadata', batch_size=3) + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + dc.setdefault('d1', {})['domain'] += 1 + + assert dc.setdefault('d1', {}) == {'domain': 2} + + def test_empty_key(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + with self.assertRaises(KeyError): + dc[''] = {'test':1} + + def test_deletion(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + with self.assertRaises(KeyError): + del dc['d1'] + + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + del dc['d1'] # second gen + del dc['d3'] # first gen + + dc.flush() + + del dc['d4'] # hbase + + def test_contains(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert 'd1' in dc # second gen + assert 'd3' in dc # first gen + + dc.flush() + + assert 'd4' in dc + + def test_pop(self): + dc = DomainCache(2, self.conn, 'domain_metadata') + 
dc['d1'] = {'domain': 1} + dc['d2'] = {'domain': 2} + dc['d3'] = {'domain': [3, 2, 1]} + dc['d4'] = {'domain': 4} + + assert dc.pop('d1') == {'domain': 1} + assert 'd1' not in dc + + assert dc.pop('d3') == {'domain': [3, 2, 1]} + assert 'd3' not in dc + + dc.flush() + + assert dc.pop('d4') == {'domain': 4} + assert 'd4' not in dc \ No newline at end of file diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index e0a039fd1..88e440267 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -1,13 +1,16 @@ from __future__ import absolute_import -from happybase import Connection + from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase -from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue -from frontera.core.models import Request, Response -from frontera.core.components import States from binascii import unhexlify from time import time -from w3lib.util import to_native_str + +import pytest +from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue +from frontera.core.components import States +from frontera.core.models import Request, Response +from happybase import Connection from tests import mock +from w3lib.util import to_native_str r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) @@ -40,20 +43,10 @@ def test_metadata(self): set([r1.url, r2.url, r3.url]) self.delete_rows(table, [b'10', b'11', b'12']) - def test_queue(self): - connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 2, b'queue', True) - batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), - ('12', 0.7, r3, True)] - queue.schedule(batch) - assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r3.url]) - assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r1.url, r2.url]) - + @pytest.mark.xfail def test_queue_with_delay(self): connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 1, b'queue', True) + queue = HBaseQueue(connection, 1, b'queue', use_snappy=False, drop=True) r5 = r3.copy() crawl_at = int(time()) + 1000 r5.meta[b'crawl_at'] = crawl_at @@ -67,44 +60,25 @@ def test_queue_with_delay(self): assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)]) == set([r5.url]) - def test_state(self): - connection = Connection(host='hbase-docker', port=9090) - state = HBaseState(connection, b'metadata', 300000) - state.set_states([r1, r2, r3]) - assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 - state.update_cache([r1, r2, r3]) - assert state._state_cache == {b'10': States.NOT_CRAWLED, - b'11': States.NOT_CRAWLED, - b'12': States.NOT_CRAWLED} - r1.meta[b'state'] = States.CRAWLED - r2.meta[b'state'] = States.CRAWLED - r3.meta[b'state'] = States.CRAWLED - state.update_cache([r1, r2, r3]) - state.flush(True) - assert state._state_cache == {} - state.fetch([b'10', b'11', b'12']) - assert state._state_cache == {b'10': States.CRAWLED, - b'11': States.CRAWLED, - b'12': States.CRAWLED} - r4.meta[b'state'] = States.ERROR - state.set_states([r1, r2, r4]) - assert r4.meta[b'state'] == States.CRAWLED - state.flush(True) - assert state._state_cache == {} - def 
test_drop_all_tables_when_table_name_is_str(self): connection = Connection(host='hbase-docker', port=9090) for table in connection.tables(): connection.delete_table(table, True) hbase_queue_table = 'queue' hbase_metadata_table = 'metadata' + hbase_states_table = 'states' connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}}) connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}}) + connection.create_table(hbase_states_table, {'f': {'max_versions': 1}}) tables = connection.tables() - assert set(tables) == set([b'metadata', b'queue']) # Failure of test itself + assert set(tables) == set([b'metadata', b'queue', b'states']) # Failure of test itself try: - HBaseQueue(connection=connection, partitions=1, table_name=hbase_queue_table, drop=True) + HBaseQueue(connection=connection, partitions=1, + table_name=hbase_queue_table, use_snappy=False, drop=True) HBaseMetadata(connection=connection, table_name=hbase_metadata_table, drop_all_tables=True, use_snappy=False, batch_size=300000, store_content=True) + HBaseState(connection, hbase_states_table, cache_size_limit=100, + write_log_size=10, drop_all_tables=True) except AlreadyExists: assert False, "failed to drop hbase tables" + diff --git a/tests/contrib/backends/memory/test_backend_memory.py b/tests/contrib/backends/memory/test_backend_memory.py deleted file mode 100644 index 4b1c6cf79..000000000 --- a/tests/contrib/backends/memory/test_backend_memory.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import absolute_import -from tests.test_overused_buffer import DFSOverusedBackendTest -from tests import backends - - -class TestFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.FIFO' - - -class TestLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.LIFO' - - -class TestDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.DFS' - - -class TestDFSOverused(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' - - -class TestDFSOverusedSimulation(DFSOverusedBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' - - -class TestBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.BFS' - - -class TestRANDOM(backends.RANDOMBackendTest): - backend_class = 'frontera.contrib.backends.memory.RANDOM' diff --git a/tests/contrib/backends/redis_backend/test_redis.py b/tests/contrib/backends/redis_backend/test_redis.py new file mode 100644 index 000000000..c924bd495 --- /dev/null +++ b/tests/contrib/backends/redis_backend/test_redis.py @@ -0,0 +1,513 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +from frontera.contrib.backends.redis_backend import FIELD_DOMAIN_FINGERPRINT, FIELD_ERROR, FIELD_STATE +from frontera.contrib.backends.redis_backend import FIELD_STATUS_CODE, FIELD_URL +from frontera.contrib.backends.redis_backend import RedisBackend, RedisMetadata, RedisQueue, RedisState +from frontera.core.manager import WorkerFrontierManager +from frontera.settings import Settings +from redis import ConnectionPool, StrictRedis +from time import time +from unittest import main, TestCase + +from logging import basicConfig, INFO + +basicConfig(level=INFO) + + +class Request: + def __init__(self, fingerprint, crawl_at, url, domain=None): + self.meta = { + b'crawl_at': crawl_at, + b'fingerprint': fingerprint + } + if domain: + self.meta[b'domain'] = {b'name': domain, b'fingerprint': "d_{}".format(fingerprint)} + 
self.url = url + self.method = 'https' + self.headers = {} + self.cookies = None + self.status_code = 200 + + +def get_pool(): + port = 6379 + host = 'localhost' + return ConnectionPool(host=host, port=port, db=0) + + +class RedisQueueTest(TestCase): + @staticmethod + def setup_subject(partitions): + settings = Settings(module='frontera.settings.default_settings') + return RedisQueue(WorkerFrontierManager.from_settings(settings), get_pool(), partitions, True) + + def test_scheduling_past_1part_5(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(3, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.hellan.me/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(0, subject.count()) + + def test_scheduling_past_1part_1(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertEqual(2, subject.count()) + + def test_scheduling_past_1part_2(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(2, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.hellan.me/' in urls) + self.assertEqual(1, subject.count()) + + def test_scheduling_past_2part_5(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + + requests = subject.get_next_requests(5, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + 
self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(1, subject.count()) + + requests = subject.get_next_requests(5, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.hellan.me/' in urls) + self.assertEqual(0, subject.count()) + + def test_scheduling_past_2part_2(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(2, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(1, subject.count()) + + requests = subject.get_next_requests(2, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.hellan.me/' in urls) + self.assertEqual(0, subject.count()) + + def test_scheduling_past_2part_1(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + + requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.hellan.me/' in urls) + self.assertEqual(1, subject.count()) + + def test_scheduling_past_2part_multiple(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + + requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertEqual(2, subject.count()) + + requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.hellan.me/' in 
urls) + self.assertEqual(1, subject.count()) + + requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(0, subject.count()) + + requests = subject.get_next_requests(1, partition_id=1, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(0, len(requests)) + + requests = subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(0, len(requests)) + + def test_scheduling_future(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) + 86400, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + + requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(0, len(requests)) + + def test_scheduling_mix(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + + requests = subject.get_next_requests(5, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(2, subject.count()) + + def test_scheduling_conflict_high_score_high_timestamp(self): + subject = self.setup_subject(1) + batch = [ + ("1", 1, Request("1", int(time()) + 86400, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("4", 0.7, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("5", 0.8, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ("6", 0.9, Request("3", int(time()) + 86400, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(6, subject.count()) + + requests = subject.get_next_requests(2, 0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(5, subject.count()) + + def test_get_next_requests_max_requests(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = 
subject.get_next_requests(1, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertEqual(2, subject.count()) + + def test_get_next_requests_min_hosts(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=2, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(1, subject.count()) + + def test_get_next_requests_min_hosts_high_number(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(3, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=5, min_requests=1, max_requests_per_host=5) + self.assertEqual(2, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.khellan.com/' in urls) + self.assertEqual(1, subject.count()) + + def test_get_next_requests_max_requests_2(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("1", 0.99, Request("1", int(time()) - 10, 'https://www.knuthellan.com/a', domain='knuthellan.com'), True), + ("1", 0.98, Request("1", int(time()) - 10, 'https://www.knuthellan.com/c', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.schedule(batch) + self.assertEqual(5, subject.count()) + requests = subject.get_next_requests(5, partition_id=0, min_hosts=1, min_requests=1, max_requests_per_host=2) + self.assertGreaterEqual(len(requests), 2) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertTrue('https://www.knuthellan.com/a' in urls) + self.assertFalse('https://www.knuthellan.com/c' in urls) + + def test_get_next_requests_few_items_few_hosts(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True) + ] + subject.schedule(batch) + self.assertEqual(1, subject.count()) + requests = subject.get_next_requests(1, partition_id=0, min_hosts=2, min_requests=1, max_requests_per_host=5) + self.assertEqual(1, len(requests)) + urls = [request.url for request in requests] + self.assertTrue('https://www.knuthellan.com/' in urls) + self.assertEqual(0, 
subject.count()) + + + +class RedisStateTest(TestCase): + def test_update_cache(self): + subject = RedisState(get_pool(), 10) + r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'a' + r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'b' + r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'c' + batch = [r1, r2, r3] + subject.update_cache(batch) + self.assertEqual(3, len(subject._cache)) + self.assertEqual(b'a', subject._cache["1"]) + self.assertEqual(b'b', subject._cache["2"]) + self.assertEqual(b'c', subject._cache["3"]) + + def test_set_states(self): + subject = RedisState(get_pool(), 10) + r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'a' + r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'b' + r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'c' + batch = [r1, r2, r3] + subject.update_cache(batch) + r4 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r5 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r6 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + batch2 = [r4, r5, r6] + subject.set_states(batch2) + self.assertEqual(b'a', r4.meta[b'state']) + self.assertEqual(b'b', r5.meta[b'state']) + self.assertEqual(b'c', r6.meta[b'state']) + + def test_flush_no_force(self): + pool = get_pool() + subject = RedisState(pool, 10) + r1 = Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'a' + r2 = Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'b' + r3 = Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'c' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(False) + self.assertEqual(3, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'a'}, connection.hgetall("1")) + self.assertEqual({FIELD_STATE: b'b'}, connection.hgetall("2")) + self.assertEqual({FIELD_STATE: b'c'}, connection.hgetall("3")) + + def test_flush_force(self): + pool = get_pool() + subject = RedisState(pool, 10) + r1 = Request("4", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'd' + r2 = Request("5", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'e' + r3 = Request("6", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'f' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(True) + self.assertEqual(0, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'd'}, connection.hgetall("4")) + self.assertEqual({FIELD_STATE: b'e'}, connection.hgetall("5")) + self.assertEqual({FIELD_STATE: b'f'}, connection.hgetall("6")) + + def test_flush_cache_overflow(self): + pool = get_pool() + subject = RedisState(pool, 1) + r1 = Request("4", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'd' + r2 = Request("5", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'e' + r3 
= Request("6", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'f' + batch = [r1, r2, r3] + subject.update_cache(batch) + subject.flush(False) + self.assertEqual(0, len(subject._cache)) + connection = StrictRedis(connection_pool=pool) + self.assertEqual({FIELD_STATE: b'd'}, connection.hgetall("4")) + self.assertEqual({FIELD_STATE: b'e'}, connection.hgetall("5")) + self.assertEqual({FIELD_STATE: b'f'}, connection.hgetall("6")) + + def test_fetch(self): + subject = RedisState(get_pool(), 1) + r1 = Request("7", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r1.meta[b'state'] = b'g' + r2 = Request("8", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r2.meta[b'state'] = b'h' + batch = [r1, r2] + subject.update_cache(batch) + subject.flush(True) + r3 = Request("9", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + r3.meta[b'state'] = b'i' + subject.update_cache(r3) + self.assertEqual(1, len(subject._cache)) + to_fetch = ["7", "9"] + subject.fetch(to_fetch) + self.assertEqual(2, len(subject._cache)) + self.assertEqual(b'g', subject._cache["7"]) + self.assertEqual(b'i', subject._cache["9"]) + + +class RedisMetadataTest(TestCase): + def test_add_seeds(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + r2 = Request("md2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + r3 = Request("md3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + seeds = [r1, r2, r3] + subject.add_seeds(seeds) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('md1', FIELD_URL)[0]) + self.assertEqual(b'd_md1', connection.hmget('md1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.khellan.com/', connection.hmget("md2", FIELD_URL)[0]) + self.assertEqual(b'd_md2', connection.hmget('md2', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.hellan.me/', connection.hmget("md3", FIELD_URL)[0]) + self.assertEqual(b'd_md3', connection.hmget('md3', FIELD_DOMAIN_FINGERPRINT)[0]) + + def test_request_error(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + subject.request_error(r1, 404) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('md1', FIELD_URL)[0]) + self.assertEqual(b'd_md1', connection.hmget('md1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'404', connection.hmget('md1', FIELD_ERROR)[0]) + + def test_page_crawled(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + r1 = Request("md1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + subject.page_crawled(r1) + connection = StrictRedis(connection_pool=pool) + self.assertEqual(b'200', connection.hmget('md1', FIELD_STATUS_CODE)[0]) + + def test_links_extracted(self): + pool = get_pool() + subject = RedisMetadata(pool, True) + l1 = Request("l1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com') + l2 = Request("l2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com') + l3 = Request("l3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me') + links = [l1, l2, l3] + subject.links_extracted(None, links) + connection = StrictRedis(connection_pool=pool) + 
self.assertEqual(b'https://www.knuthellan.com/', connection.hmget('l1', FIELD_URL)[0]) + self.assertEqual(b'd_l1', connection.hmget('l1', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.khellan.com/', connection.hmget("l2", FIELD_URL)[0]) + self.assertEqual(b'd_l2', connection.hmget('l2', FIELD_DOMAIN_FINGERPRINT)[0]) + self.assertEqual(b'https://www.hellan.me/', connection.hmget("l3", FIELD_URL)[0]) + self.assertEqual(b'd_l3', connection.hmget('l3', FIELD_DOMAIN_FINGERPRINT)[0]) + + +class RedisBackendTest(TestCase): + @staticmethod + def setup_subject(partitions): + settings = Settings(module='frontera.settings.default_settings') + settings.set('SPIDER_FEED_PARTITIONS', partitions) + settings.set('REDIS_DROP_ALL_TABLES', True) + return RedisBackend.db_worker(WorkerFrontierManager.from_settings(settings, db_worker=True)) + + def test_get_next_request(self): + subject = self.setup_subject(2) + requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) + self.assertEqual(0, len(requests)) + + def test_get_next_request_has_requests(self): + subject = self.setup_subject(2) + batch = [ + ("1", 1, Request("1", int(time()) - 10, 'https://www.knuthellan.com/', domain='knuthellan.com'), True), + ("2", 0.1, Request("2", int(time()) - 10, 'https://www.khellan.com/', domain='khellan.com'), True), + ("3", 0.5, Request("3", int(time()) - 10, 'https://www.hellan.me/', domain='hellan.me'), True), + ] + subject.queue.schedule(batch) + requests = subject.get_next_requests(max_next_requests=10, partitions=['0', '1']) + self.assertEqual(3, len(requests)) + + def test_close_manager(self): + settings = Settings(module='frontera.settings.default_settings') + settings.set('BACKEND', 'frontera.contrib.backends.redis_backend.RedisBackend') + manager = WorkerFrontierManager.from_settings(settings, strategy_worker=True) + self.assertEqual(RedisBackend, manager.backend.__class__) + manager.close() + + +if __name__ == '__main__': + main() diff --git a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py deleted file mode 100644 index 0dceaaa7d..000000000 --- a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py +++ /dev/null @@ -1,210 +0,0 @@ -from __future__ import absolute_import -import os - -import pymysql -from psycopg2 import connect -from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT - -from tests import backends -from tests.test_revisiting_backend import RevisitingBackendTest - - -#---------------------------------------------------- -# SQAlchemy base classes -#---------------------------------------------------- -class SQLAlchemyFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.FIFO' - - -class SQLAlchemyLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.LIFO' - - -class SQLAlchemyDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.DFS' - - -class SQLAlchemyBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.BFS' - - -class SQLAlchemyRevisiting(RevisitingBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' - - -#---------------------------------------------------- -# SQLite Memory -#---------------------------------------------------- -class SQLiteMemory(backends.BackendTest): - - def get_settings(self): - settings = super(SQLiteMemory, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 
'sqlite:///:memory:' - return settings - - -class TestSQLiteMemoryFIFO(SQLAlchemyFIFO, SQLiteMemory): - pass - - -class TestSQLiteMemoryLIFO(SQLAlchemyLIFO, SQLiteMemory): - pass - - -class TestSQLiteMemoryDFS(SQLAlchemyDFS, SQLiteMemory): - pass - - -class TestSQLiteMemoryBFS(SQLAlchemyBFS, SQLiteMemory): - pass - - -class TestSQLiteMemoryRevisiting(SQLAlchemyRevisiting): - pass - - -#---------------------------------------------------- -# SQLite File -#---------------------------------------------------- -class SQLiteFile(backends.BackendTest): - - SQLITE_DB_NAME = 'backend_test.db' - - def get_settings(self): - settings = super(SQLiteFile, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///' + self.SQLITE_DB_NAME - return settings - - def setup_backend(self, method): - self._delete_test_db() - - def teardown_backend(self, method): - self._delete_test_db() - - def _delete_test_db(self): - try: - os.remove(self.SQLITE_DB_NAME) - except OSError: - pass - - -class TestSQLiteFileFIFO(SQLAlchemyFIFO, SQLiteFile): - pass - - -class TestSQLiteFileLIFO(SQLAlchemyLIFO, SQLiteFile): - pass - - -class TestSQLiteFileDFS(SQLAlchemyDFS, SQLiteFile): - pass - - -class TestSQLiteFileBFS(SQLAlchemyBFS, SQLiteFile): - pass - - -#---------------------------------------------------- -# DB Backend test base -#---------------------------------------------------- -class DBBackendTest(object): - - DB_DATABASE = 'backend_test' - DB_ENGINE = None - DB_HOST = None - DB_USER = None - DB_PASSWORD = None - - def get_settings(self): - settings = super(DBBackendTest, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = self.DB_ENGINE - return settings - - def setup_backend(self, method): - self._delete_database() - self._create_database() - - def teardown_backend(self, method): - self._delete_database() - - def _delete_database(self): - self._execute_sql("DROP DATABASE IF EXISTS %s;" % self.DB_DATABASE) - - def _create_database(self): - self._execute_sql("CREATE DATABASE %s;" % self.DB_DATABASE) - - def _execute_sql(self, sql): - raise NotImplementedError - - -#---------------------------------------------------- -# Mysql -#---------------------------------------------------- -class Mysql(DBBackendTest): - - DB_ENGINE = 'mysql+pymysql://root:@localhost/backend_test' - DB_HOST = 'localhost' - DB_USER = 'root' - DB_PASSWORD = '' - - def _execute_sql(self, sql): - conn = pymysql.connect(host=self.DB_HOST, - user=self.DB_USER, - passwd=self.DB_PASSWORD) - cur = conn.cursor() - cur.execute(sql) - cur.close() - conn.close() - - -class TestMysqlFIFO(Mysql, SQLAlchemyFIFO): - pass - - -class TestMysqlLIFO(Mysql, SQLAlchemyLIFO): - pass - - -class TestMysqlDFS(Mysql, SQLAlchemyDFS): - pass - - -class TestMysqlBFS(Mysql, SQLAlchemyBFS): - pass - - -#---------------------------------------------------- -# Postgres -#---------------------------------------------------- -class Postgres(DBBackendTest): - - DB_ENGINE = 'postgres://postgres@localhost/backend_test' - DB_HOST = 'localhost' - DB_USER = 'postgres' - DB_PASSWORD = '' - - def _execute_sql(self, sql): - conn = connect(host=self.DB_HOST, - user=self.DB_USER, - password=self.DB_PASSWORD) - conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) - cur = conn.cursor() - cur.execute(sql) - cur.close() - conn.close() - - -class TestPostgresFIFO(Postgres, SQLAlchemyFIFO): - pass - - -class TestPostgresLIFO(Postgres, SQLAlchemyLIFO): - pass - - -class TestPostgresDFS(Postgres, SQLAlchemyDFS): - pass - - -class TestPostgresBFS(Postgres, SQLAlchemyBFS): - pass 
diff --git a/tests/contrib/backends/sqlalchemy/test_domain_metadata.py b/tests/contrib/backends/sqlalchemy/test_domain_metadata.py new file mode 100644 index 000000000..6f77f70a8 --- /dev/null +++ b/tests/contrib/backends/sqlalchemy/test_domain_metadata.py @@ -0,0 +1,42 @@ +from frontera.contrib.backends.sqlalchemy.components import DomainMetadata, DomainMetadataKV +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from unittest import TestCase +import random +import string + + +def random_string(N): + return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(N)) + + +class TestSqlAlchemyDomainMetadata(TestCase): + def setUp(self): + self.engine = create_engine("sqlite:///:memory:") + self.session_cls = sessionmaker() + self.session_cls.configure(bind=self.engine) + DomainMetadataKV.__table__.create(bind=self.engine) + + def test_basic(self): + dm = DomainMetadata(self.session_cls) + value = {"someint": 1, "somefloat": 1, "someblob": b"bytes"} + dm["test"] = value + assert "test" in dm + assert dm["test"] == value + del dm["test"] + assert "test" not in dm + + dm["test"] = 111 + assert "test" in dm + assert dm["test"] == 111 + + def test_many_items(self): + dm = DomainMetadata(self.session_cls) + for i in range(200): + dm["key%d" % i] = random_string(10) + + for i in range(200): + assert "key%d" % i in dm + assert len(dm["key%d" % i]) == 10 + del dm["key%d" % i] + diff --git a/tests/contrib/backends/test_backends.py b/tests/contrib/backends/test_backends.py new file mode 100644 index 000000000..dfdc3b798 --- /dev/null +++ b/tests/contrib/backends/test_backends.py @@ -0,0 +1,117 @@ +import pytest +from frontera.core.components import States +from frontera.core.models import Request +from happybase import Connection +from frontera.contrib.backends.hbase import HBaseState, HBaseQueue +from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates, Queue as SQLAlchemyQueue +from frontera.contrib.backends.sqlalchemy.models import StateModel, QueueModel +from frontera.contrib.backends.memory import MemoryStates, MemoryQueue +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + + +r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', + b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) +r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', + b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) +r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', + b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r4 = r3.copy() + + +hbase_connection = None + + +def get_hbase_connection(): + global hbase_connection + if hbase_connection is None: + hbase_connection = Connection(host='hbase-docker', port=9090) + return hbase_connection + + +@pytest.fixture(scope="module", params=["memory", "sqlalchemy", "hbase"]) +def states(request): + if request.param == "memory": + ms = MemoryStates(100) + yield ms + return + + if request.param == "sqlalchemy": + engine = create_engine('sqlite:///:memory:', echo=False) + session_cls = sessionmaker() + session_cls.configure(bind=engine) + StateModel.__table__.create(bind=engine) + sqla_states = SQLAlchemyStates(session_cls, StateModel, 100) + yield sqla_states + sqla_states.frontier_stop() + engine.dispose() + return + + if request.param == "hbase": + conn = get_hbase_connection() + states = HBaseState(conn, b'states', cache_size_limit=300000, + write_log_size=5000, drop_all_tables=True) + yield 
states + states.frontier_stop() + return + raise KeyError("Unknown backend param") + + +def test_states(states): + states.set_states([r1, r2, r3]) + assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.CRAWLED + r2.meta[b'state'] = States.ERROR + r3.meta[b'state'] = States.QUEUED + states.update_cache([r1, r2, r3]) + states.flush() + + r1.meta[b'state'] = States.NOT_CRAWLED + r2.meta[b'state'] = States.NOT_CRAWLED + r3.meta[b'state'] = States.NOT_CRAWLED + + states.fetch([b'83']) + states.set_states([r1, r2, r4]) + assert r4.meta[b'state'] == States.QUEUED + assert r1.meta[b'state'] == States.CRAWLED + assert r2.meta[b'state'] == States.ERROR + + +@pytest.fixture(scope="module", params=["memory", "sqlalchemy", "hbase"]) +def queue(request): + if request.param == "memory": + mq = MemoryQueue(2) + yield mq + return + + if request.param == "sqlalchemy": + engine = create_engine('sqlite:///:memory:', echo=False) + session_cls = sessionmaker() + session_cls.configure(bind=engine) + QueueModel.__table__.create(bind=engine) + sqla_queue = SQLAlchemyQueue(session_cls, QueueModel, 2) + yield sqla_queue + sqla_queue.frontier_stop() + engine.dispose() + return + + if request.param == "hbase": + conn = get_hbase_connection() + hq = HBaseQueue(conn, 2, b'queue') + yield hq + hq.frontier_stop() + return + raise KeyError("Unknown backend param") + + +def test_queue(queue): + batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), + ('12', 0.7, r3, True)] + queue.schedule(batch) + assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, + max_requests_per_host=10)]) == set([r3.url]) + assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, + max_requests_per_host=10)]) == set([r1.url, r2.url]) \ No newline at end of file diff --git a/tests/mocks/components.py b/tests/mocks/components.py index 801257f10..710a9f753 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -1,18 +1,21 @@ from __future__ import absolute_import from frontera.core.components import Backend, Middleware, CanonicalSolver, \ DistributedBackend, Queue +from frontera.contrib.backends.memory import MemoryStates from six.moves import range from frontera.core.models import Request +from frontera.strategy import BaseCrawlingStrategy +from frontera.core.components import States class FakeMiddleware(Middleware): def __init__(self): - self.seeds = [] + self.requests = [] self.responses = [] self.links = [] self.errors = [] - self.lists = [self.seeds, self.responses, self.links, self.errors] + self.lists = [self.requests, self.responses, self.links, self.errors] self._started = False self._stopped = False self.test_value = 'test' @@ -27,10 +30,9 @@ def frontier_start(self): def frontier_stop(self): self._stopped = True - def add_seeds(self, seeds): - for seed in seeds: - self.seeds.append(seed) - return seeds + def create_request(self, request): + self.requests.append(request) + return request def page_crawled(self, response): self.responses.append(response) @@ -66,15 +68,26 @@ def count(self): return len(self.requests) def schedule(self, batch): - for obj in batch: - if obj[3]: - self.requests.append(Request(obj[2].url, meta={b'fingerprint': obj[0], b'score': obj[1]})) + for fingerprint, score, request, is_schedule in batch: + if is_schedule: + self.requests.append(request) class FakeBackend(FakeMiddleware, Backend): - _finished = False - queue = FakeQueue() + 
def __init__(self): + self._finished = False + self._queue = FakeQueue() + self._states = MemoryStates(10000) + super(FakeBackend, self).__init__() + + @property + def queue(self): + return self._queue + + @property + def states(self): + return self._states def finished(self): return self._finished @@ -127,6 +140,9 @@ def links_extracted(self, request, links): def request_error(self, request, error): self.errors.append((request, error)) + def create_request(self, request): + self.requests.append(request) + class FakeMiddlewareModifySeeds(FakeMiddleware): @@ -179,3 +195,29 @@ def links_extracted(self, request, links): self.links.append(link) link.meta[b'test_links_canonical_solver'] = self.test_value return request + + +class CrawlingStrategy(BaseCrawlingStrategy): + def read_seeds(self, fh): + for url in fh: + url = url.strip() + req = self.create_request(url) + self.refresh_states(req) + if req.meta[b'state'] == States.NOT_CRAWLED: + req.meta[b'state'] = States.QUEUED + self.schedule(req) + + def page_crawled(self, response): + response.meta[b'state'] = States.CRAWLED + + def filter_extracted_links(self, request, links): + return links + + def links_extracted(self, request, links): + for link in links: + if link.meta[b'state'] == States.NOT_CRAWLED: + link.meta[b'state'] = States.QUEUED + self.schedule(link, 0.5) + + def request_error(self, request, error): + request.meta[b'state'] = States.ERROR \ No newline at end of file diff --git a/tests/mocks/message_bus.py b/tests/mocks/message_bus.py index f8b6f582b..6057e174c 100644 --- a/tests/mocks/message_bus.py +++ b/tests/mocks/message_bus.py @@ -1,5 +1,5 @@ from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseScoringLogStream, BaseSpiderFeedStream + BaseScoringLogStream, BaseSpiderFeedStream, BaseStatsLogStream class Consumer(BaseStreamConsumer): @@ -42,6 +42,9 @@ def flush(self): def get_offset(self, partition_id): return self.offset + def close(self): + pass + class ScoringLogStream(BaseScoringLogStream): @@ -88,6 +91,17 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) +class StatsLogStream(BaseStatsLogStream): + def __init__(self, messagebus): + pass + + def producer(self): + return Producer() + + def consumer(self): + return Consumer() + + class FakeMessageBus(BaseMessageBus): def __init__(self, settings): @@ -103,3 +117,6 @@ def scoring_log(self): def spider_feed(self): return SpiderFeedStream(self) + + def stats_log(self): + return StatsLogStream(self) \ No newline at end of file diff --git a/tests/scrapy_spider/frontera/settings.py b/tests/scrapy_spider/frontera/settings.py index fd2786d9c..4840327f6 100644 --- a/tests/scrapy_spider/frontera/settings.py +++ b/tests/scrapy_spider/frontera/settings.py @@ -1,7 +1,9 @@ #-------------------------------------------------------- # Frontier #-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' +SQLALCHEMYBACKEND_ENGINE = 'sqlite:///test.db' + MAX_REQUESTS = 5 MAX_NEXT_REQUESTS = 1 diff --git a/tests/scrapy_spider/spiders/example.py b/tests/scrapy_spider/spiders/example.py index 000c7c3b8..36bdf2b4a 100644 --- a/tests/scrapy_spider/spiders/example.py +++ b/tests/scrapy_spider/spiders/example.py @@ -5,7 +5,6 @@ class MySpider(CrawlSpider): name = 'example' - start_urls = ['http://www.dmoz.org'] callback_calls = 0 rules = [Rule(LinkExtractor(), diff --git a/tests/test_codecs.py 
b/tests/test_codecs.py
index 7e2aa55f6..20c511b2b 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -1,29 +1,64 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
-from frontera.contrib.backends.remote.codecs.json import Encoder as JsonEncoder, Decoder as JsonDecoder
+import json
+import unittest
+from frontera.contrib.backends.remote.codecs.json import (Encoder as JsonEncoder, Decoder as JsonDecoder,
+                                                          _convert_and_save_type, _convert_from_saved_type)
 from frontera.contrib.backends.remote.codecs.msgpack import Encoder as MsgPackEncoder, Decoder as MsgPackDecoder
 from frontera.core.models import Request, Response
 import pytest
 
 
+def _compare_dicts(dict1, dict2):
+    """
+    Compares two dicts
+    :return: True if both dicts are equal else False
+    """
+    if dict1 is None or dict2 is None:
+        return False
+
+    if type(dict1) is not dict or type(dict2) is not dict:
+        return False
+
+    shared_keys = set(dict1.keys()) & set(dict2.keys())
+
+    if not (len(shared_keys) == len(dict1.keys()) and len(shared_keys) == len(dict2.keys())):
+        return False
+
+    dicts_are_equal = True
+    for key in dict1.keys():
+        if type(dict1[key]) is dict:
+            dicts_are_equal = _compare_dicts(dict1[key], dict2[key])
+        else:
+            dicts_are_equal = (dict1[key] == dict2[key]) and (type(dict1[key]) == type(dict2[key]))
+
+        if not dicts_are_equal:
+            return False
+
+    return dicts_are_equal
+
+
+@pytest.mark.parametrize('send_body', [True, False])
 @pytest.mark.parametrize(
-    ('encoder', 'decoder'), [
-        (MsgPackEncoder, MsgPackDecoder),
-        (JsonEncoder, JsonDecoder)
+    ('encoder', 'decoder', 'invalid_value'), [
+        (MsgPackEncoder, MsgPackDecoder, b'\x91\xc4\x04test'),
+        (JsonEncoder, JsonDecoder, b'["dict", [[["bytes", "type"], ["bytes", "test"]]]]')
     ]
 )
-def test_codec(encoder, decoder):
+def test_codec(encoder, decoder, send_body, invalid_value):
     def check_request(req1, req2):
-        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers \
-               and req1.method == req2.method
+        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \
+               _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method
 
-    enc = encoder(Request, send_body=True)
+    enc = encoder(Request, send_body=send_body)
     dec = decoder(Request, Response)
-    req = Request(url="http://www.yandex.ru",method=b'GET', meta={b"test": b"shmest"}, headers={b'reqhdr': b'value'})
+    req = Request(url="http://www.yandex.ru", method=b'GET',
+                  meta={b'test': b'shmest', b'scrapy_meta': {'rule': 0, 'key': 'value'}}, headers={b'reqhdr': b'value'})
     req2 = Request(url="http://www.yandex.ru/search")
+    stats = {'_timestamp': 1499241748, 'tags': {'source': 'spider', 'partition_id': 0},
+             'crawled_pages_count': 2, 'links_extracted_count': 3}
     msgs = [
-        enc.encode_add_seeds([req]),
         enc.encode_page_crawled(Response(url="http://www.yandex.ru", body=b'SOME CONTENT', headers={b'hdr': b'value'}, request=req)),
         enc.encode_links_extracted(req, [req2]),
@@ -31,25 +66,23 @@ def check_request(req1, req2):
         enc.encode_update_score(req, 0.51, True),
         enc.encode_new_job_id(1),
         enc.encode_offset(0, 28796),
-        enc.encode_request(req)
+        enc.encode_request(req),
+        enc.encode_stats(stats),
+        invalid_value,
     ]
     it = iter(msgs)
 
-    o = dec.decode(next(it))
-    assert o[0] == 'add_seeds'
-    assert type(o[1]) == list
-    req_d = o[1][0]
-    check_request(req_d, req)
-    assert type(req_d) == Request
-
     o = dec.decode(next(it))
     assert o[0] == 'page_crawled'
     assert type(o[1]) == Response
-    assert o[1].url == req.url and o[1].body == b'SOME CONTENT' and o[1].meta == req.meta
+    assert o[1].url == req.url and o[1].meta == req.meta
+    if send_body:
+        assert o[1].body == b'SOME CONTENT'
+    else:
+        assert o[1].body is None
 
     o = dec.decode(next(it))
-    print(o)
     assert o[0] == 'links_extracted'
     assert type(o[1]) == Request
     assert o[1].url == req.url and o[1].meta == req.meta
@@ -80,3 +113,45 @@ def check_request(req1, req2):
 
     o = dec.decode_request(next(it))
     check_request(o, req)
+
+    o_type, decoded_stats = dec.decode(next(it))
+    assert o_type == 'stats'
+    assert decoded_stats == stats
+
+    with pytest.raises(TypeError):
+        dec.decode(next(it))
+
+
+class TestEncodeDecodeJson(unittest.TestCase):
+    """
+    Tests for `_convert_and_save_type` and `_convert_from_saved_type`, the helpers used by the json codec
+    """
+
+    def test_encode_decode_json_recursively(self):
+        _int = 1
+        _bytes = b'bytes'
+        _unicode = u'unicode'
+        _bool = True
+        _none = None
+        simple_dict = {'key': 'value'}
+        simple_list = ['item', 1]
+        simple_tuple = ('str', 2)
+        mixed_type_dict = {b'k1': 'v1', 'k2': b'v2', 'int': 1, b'none': None, 'bool': False}
+        mixed_type_list = [b'i1', 'i2', 23, None, True]
+        mixed_type_tuple = (b'i1', 'i2', 23, None, True)
+        nested_dict = {'k1': b'v1', 'lst': [b'i1', 1, ('str', 1, {'k2': b'v1', 'tup': (1, None)})]}
+        nested_list = [True, None, (1, 2, 3), {b'k1': b'v1', 'tup': ('a', b'b', [None, False])}]
+        nested_tuple = (1, None, ['a', 'b', True, {b'k1': 'v2', 'lst': ['a', False, (2, 3, 5)]}])
+        msgs = [_int, _bytes, _unicode, _bool, _none, simple_dict, simple_list, simple_tuple,
+                mixed_type_dict, mixed_type_list, mixed_type_tuple, nested_dict, nested_list, nested_tuple]
+        encoder = json.JSONEncoder()
+        decoder = json.JSONDecoder()
+        for original_msg in msgs:
+            encoded_msg_1 = _convert_and_save_type(original_msg)
+            encoded_msg_2 = encoder.encode(encoded_msg_1)
+            decoded_msg_2 = decoder.decode(encoded_msg_2)
+            decoded_msg_1 = _convert_from_saved_type(decoded_msg_2)
+            if isinstance(decoded_msg_1, dict):
+                self.assertDictEqual(decoded_msg_1, original_msg)
+            elif isinstance(decoded_msg_1, (list, tuple)):
+                self.assertSequenceEqual(decoded_msg_1, original_msg)
diff --git a/tests/test_core_overused_buffer.py b/tests/test_core_overused_buffer.py
index f08e32933..849a12be0 100644
--- a/tests/test_core_overused_buffer.py
+++ b/tests/test_core_overused_buffer.py
@@ -2,58 +2,88 @@
 from frontera.core import OverusedBuffer
 from frontera.core.models import Request
 from six.moves import range
+from itertools import cycle
+from random import choice
+from string import ascii_lowercase
 
-r1 = Request('http://www.example.com')
-r2 = Request('http://www.example.com/some/')
-r3 = Request('htttp://www.example.com/some/page/')
-r4 = Request('http://example.com')
-r5 = Request('http://example.com/some/page')
-r6 = Request('http://example1.com')
+r1 = Request('http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'})
+r2 = Request('http://www.example.com/some/', meta={b'fingerprint': b'9773afd9cb0f4ec3fd09d6d1fe2c742abf0621ec'})
+r3 = Request('htttp://www.example.com/some/page/', meta={b'fingerprint': b'7278fb7612670523a7e3e37d7c38871c73bcb0ea'})
+r4 = Request('http://example.com', meta={b'fingerprint': b'89dce6a446a69d6b9bdc01ac75251e4c322bcdff'})
+r5 = Request('http://example.com/some/page', meta={b'fingerprint': b'9dbd730bdce21e322a12c757753f26bbc95c3779'})
+r6 = Request('http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'})
 
 
 class TestOverusedBuffer(object):
-    requests = []
-    logs = []
+    requests = [r1, r2, r3, r4, r5, r6]
 
-    def get_func(self, 
max_n_requests, **kwargs): + def get_once(self, max_n_requests, **kwargs): lst = [] for _ in range(max_n_requests): - if self.requests: - lst.append(self.requests.pop()) + try: + lst.append(next(self.req_it)) + except StopIteration: + break return lst - def log_func(self, msg): - self.logs.append(msg) + def test_base(self): + self.req_it = iter(self.requests) + ob = OverusedBuffer(self.get_once, None, 100, None, 100) - def test(self): - ob = OverusedBuffer(self.get_func, self.log_func) - self.requests = [r1, r2, r3, r4, r5, r6] + assert ob._get_pending_count() == 0 assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'], key_type='domain')) == set([r4, r5]) - assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']", - "Pending: 0"]) - self.logs = [] - + assert ob._get_pending_count() == 4 assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [r6] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 4"]) - self.logs = [] + assert ob._get_pending_count() == 3 assert ob.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 3"]) - self.logs = [] + assert ob._get_pending_count() == 3 #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case. assert set(ob.get_next_requests(3, overused_keys=['example.com'], key_type='domain')) == set([r1, r2, r3]) - assert set(self.logs) == set(["Overused keys: ['example.com']", - "Pending: 3"]) - self.logs = [] + assert ob._get_pending_count() == 0 assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: []", "Pending: 0"]) + assert ob._get_pending_count() == 0 + + def test_purging_keys(self): + self.req_it = cycle(self.requests) + ob = OverusedBuffer(self.get_once, 10, 1, 100, 10) + ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"], + key_type="domain") + assert ob._get_pending_count() == 9 + ob.get_next_requests(10, overused_keys=["example.com", "www.example.com"], + key_type="domain") # purging of www.example.com + assert ob._get_pending_count() == 7 + + def generate_requests(self): + def get_random_host(): + return str("").join([choice(ascii_lowercase) for i in range(5)]) + + self.hosts = set() + for _ in range(21): + self.hosts.add(get_random_host()) + self.requests = [] + for host in self.hosts: + self.requests.append(Request("http://%s/" % (host))) + + + def test_purging_keys_set(self): + self.generate_requests() + self.req_it = cycle(self.requests) + ob = OverusedBuffer(self.get_once, 1000, 100, 10, 1) + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") + assert (ob._get_key_count()) == 10 + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") + assert (ob._get_key_count()) == 20 + + ob.get_next_requests(10, overused_keys=self.hosts, key_type="domain") # purging of keys set + assert (ob._get_key_count()) < 20 diff --git a/tests/test_domain_mware.py b/tests/test_domain_mware.py index ecf06169f..a18462b4d 100644 --- a/tests/test_domain_mware.py +++ b/tests/test_domain_mware.py @@ -1,7 +1,6 @@ from __future__ import absolute_import import unittest from frontera.contrib.middlewares.domain import DomainMiddleware -from frontera.core.manager import FrontierManager from frontera.core.models import Request diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 
f4b4ca33b..0ea37083f 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -32,11 +32,11 @@ def test_md5_unicode(self): assert md5(url3) == b'5abf5c9aa02d870756032bdec0bd6522' def test_local_hostname_fingerprint_bytes(self): - assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff5587d241e22865288133b37d63ab49e13' + assert hostname_local_fingerprint(to_bytes(url2)) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8' + assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bb1e215e68ef283a1939252734e84c3c76' def test_local_hostname_frongerprint_unicode(self): - assert hostname_local_fingerprint(url1) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(url2) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(url3) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert hostname_local_fingerprint(url1) == b'1be68ff5587d241e22865288133b37d63ab49e13' + assert hostname_local_fingerprint(url2) == b'97ddb3f898d2460d60d3f4d6cb7dbc5d0b8025f8' + assert hostname_local_fingerprint(url3) == b'2ed642bb1e215e68ef283a1939252734e84c3c76' \ No newline at end of file diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py index fe1b7a50a..197e7f326 100644 --- a/tests/test_frontera_scheduler.py +++ b/tests/test_frontera_scheduler.py @@ -30,17 +30,6 @@ class TestFronteraScheduler(object): - def test_enqueue_requests(self): - crawler = FakeCrawler() - fs = FronteraScheduler(crawler, manager=FakeFrontierManager) - fs.open(Spider) - assert fs.enqueue_request(r1) is True - assert fs.enqueue_request(r2) is True - assert fs.enqueue_request(r3) is True - assert set(seed.url for seed in fs.frontier.manager.seeds) == set([r1.url, r2.url, r3.url]) - assert all([isinstance(seed, FRequest) for seed in fs.frontier.manager.seeds]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 3 - def test_redirect_disabled_enqueue_requests(self): settings = Settings() settings['REDIRECT_ENABLED'] = False @@ -49,11 +38,7 @@ def test_redirect_disabled_enqueue_requests(self): fs.open(Spider) assert fs.enqueue_request(rr1) is False assert fs.enqueue_request(rr2) is False - assert fs.enqueue_request(rr3) is True - assert isinstance(fs.frontier.manager.seeds[0], FRequest) - assert len(fs.frontier.manager.seeds) == 1 - assert fs.frontier.manager.seeds[0].url == rr3.url - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 + assert fs.enqueue_request(rr3) is False def test_redirect_enabled_enqueue_requests(self): settings = Settings() @@ -64,13 +49,7 @@ def test_redirect_enabled_enqueue_requests(self): assert fs.enqueue_request(rr1) is True assert fs.enqueue_request(rr2) is True assert fs.enqueue_request(rr3) is True - assert len(fs.frontier.manager.seeds) == 1 - assert isinstance(fs.frontier.manager.seeds[0], FRequest) - assert fs.frontier.manager.seeds[0].url == rr3.url - assert set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url]) - assert all([isinstance(request, Request) for request in fs._pending_requests]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 - assert fs.stats_manager.stats.get_value('frontera/redirected_requests_count') == 2 + assert 
set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url, rr3.url]) def test_next_request(self): crawler = FakeCrawler() @@ -113,13 +92,19 @@ def test_next_request_overused_keys_info(self): def test_process_spider_output(self): i1 = {'name': 'item', 'item': 'i1'} i2 = {'name': 'item', 'item': 'i2'} - result = [r1, r2, r3, i1, i2] + items = [i1 , i2] + requests = [r1, r2, r3] + result = list(requests) + result.extend(items) resp = Response(fr1.url, request=Request(fr1.url, meta={b'frontier_request': fr1})) crawler = FakeCrawler() fs = FronteraScheduler(crawler, manager=FakeFrontierManager) - fs.open(Spider) - assert sorted(list(fs.process_spider_output(resp, result, Spider)), key=lambda i: sorted(i['item'])) == \ - sorted([i1, i2], key=lambda i: sorted(i['item'])) + spider = Spider(name="testing") + fs.open(spider) + out_items = list(fs.process_spider_output(resp, result, spider)) + assert len(out_items) == len(items) + assert set([r.url for r in fs.frontier.manager.links]) == set([r.url for r in requests]) + assert isinstance(fs.frontier.manager.responses[0], FResponse) assert fs.frontier.manager.responses[0].url == resp.url assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url]) diff --git a/tests/test_frontier_manager.py b/tests/test_frontier_manager.py index 60d57970e..233fc842d 100644 --- a/tests/test_frontier_manager.py +++ b/tests/test_frontier_manager.py @@ -1,32 +1,45 @@ from __future__ import absolute_import -from frontera.core.manager import FrontierManager +from frontera.core.manager import LocalFrontierManager from frontera.settings import Settings from frontera.core.models import Request, Response +from frontera.core.components import States from six.moves import range +from unittest import TestCase -r1 = Request('http://www.example.com') -r2 = Request('https://www.example.com/some/page') -r3 = Request('http://example1.com') +r1 = Request('http://www.example.com', meta={b'fingerprint': b'89e6a0649e06d83370cdf2cbfb05f363934a8d0c'}) +r2 = Request('https://www.example.com/some/page', meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'}) +r3 = Request('http://example1.com', meta={b'fingerprint': b'758293d800fc9672ae2c68bd083359b74ab9b6c2'}) +seeds_blob = b"""http://www.example.com +https://www.example.com/some/page +http://example1.com +""" +from io import BytesIO -class TestFrontierManager(object): +SEEDS_FILE = BytesIO(seeds_blob) + + +class TestFrontierManager(TestCase): def setup_frontier_manager(self, settings=None): settings = settings or Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', + settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware', + 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks'] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' - return FrontierManager.from_settings(settings) + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + return LocalFrontierManager.from_settings(settings) def test_start(self): fm = self.setup_frontier_manager() assert fm._started is True assert fm.backend._started is True - assert [mw._started for mw in fm.middlewares] == [True]*4 + assert [mw._started for mw in fm.middlewares[-4:]] == [True]*4 
assert fm.canonicalsolver._started is True def test_stop(self): @@ -34,7 +47,7 @@ def test_stop(self): fm.stop() assert fm._stopped is True assert fm.backend._stopped is True - assert [mw._stopped for mw in fm.middlewares] == [True]*4 + assert [mw._stopped for mw in fm.middlewares[-4:]] == [True]*4 assert fm.canonicalsolver._stopped is True def test_properties(self): @@ -48,24 +61,24 @@ def test_properties(self): def test_add_seeds(self): fm = self.setup_frontier_manager() - fm.add_seeds([r1, r2, r3]) + SEEDS_FILE.seek(0) + fm.add_seeds(SEEDS_FILE) + + fprints_set = set([r.meta[b'fingerprint'] for r in [r1, r2, r3]]) #seeds reached backend. - assert set([seed for seed in fm.backend.seeds]) == set([r1, r2, r3]) + assert set([r.meta[b'fingerprint'] for r in fm.backend.queue.requests]) == fprints_set #seeds reached canonicalsolver - assert set([seed for seed in fm.canonicalsolver.seeds]) == set([r1, r2, r3]) + assert set([r.meta[b'fingerprint'] for r in fm.canonicalsolver.requests]) == fprints_set #seeds reached the 4 middlewares. - assert [set([seed for seed in mw.seeds]) for mw in fm.middlewares] == [set([r1, r2, r3])]*4 - #seeds were modified. - assert [seed.meta[b'test_seeds'] for seed in [r1, r2, r3]] == ['test']*3 - assert [seed.meta[b'test_seeds_canonical_solver'] for seed in [r1, r2, r3]] == ['test']*3 + assert [set([r.meta[b'fingerprint'] for r in mw.requests]) for mw in fm.middlewares[-4:]] == [fprints_set]*4 def test_page_crawled(self): fm = self.setup_frontier_manager() response = Response(r1.url, request=r1) fm.page_crawled(response) - assert fm.backend.responses.pop() == response - assert [mw.responses.pop() for mw in fm.middlewares] == [response]*4 + assert response.meta[b'state'] == States.CRAWLED + assert [mw.responses.pop() for mw in fm.middlewares[-4:]] == [response]*4 assert fm.canonicalsolver.responses.pop() == response assert response.meta[b'test_response'] == 'test' @@ -73,9 +86,9 @@ def test_links_extracted(self): fm = self.setup_frontier_manager() response = Response(r1.url, request=r1) fm.links_extracted(r1, links=[r2, r3]) - assert set([link for link in fm.backend.links]) == set([r2, r3]) + assert set([link.meta[b'fingerprint'] for link in fm.backend.queue.requests]) == set([r.meta[b'fingerprint'] for r in [r2, r3]]) assert set([link for link in fm.canonicalsolver.links]) == set([r2, r3]) - assert [set([link for link in mw.links]) for mw in fm.middlewares] == [set([r2, r3])]*4 + assert [set([link for link in mw.links]) for mw in fm.middlewares[-4:]] == [set([r2, r3])]*4 assert [link.meta[b'test_links'] for link in [r2, r3]] == ['test']*2 assert [link.meta[b'test_links_canonical_solver'] for link in [r2, r3]] == ['test']*2 @@ -89,8 +102,8 @@ def test_get_next_requests(self): def test_request_error(self): fm = self.setup_frontier_manager() fm.request_error(r1, 'error') - assert fm.backend.errors.pop() == (r1, 'error') - assert [mw.errors.pop() for mw in fm.middlewares] == [(r1, 'error')]*4 + assert r1.meta[b'state'] == States.ERROR + assert [mw.errors.pop() for mw in fm.middlewares[-4:]] == [(r1, 'error')]*4 assert fm.canonicalsolver.errors.pop() == (r1, 'error') def test_max_requests_reached(self): @@ -106,14 +119,18 @@ def test_max_requests_reached(self): def test_blocking_middleware(self): settings = Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', + settings.MIDDLEWARES = ['frontera.contrib.middlewares.domain.DomainMiddleware', + 
'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareBlocking', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks'] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' - fm = FrontierManager.from_settings(settings) - fm.add_seeds([r1, r2, r3]) + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + fm = LocalFrontierManager.from_settings(settings) + SEEDS_FILE.seek(0) + fm.add_seeds(SEEDS_FILE) response = Response(r1.url, request=r1) fm.page_crawled(response) fm.links_extracted(r1, links=[r2]) @@ -122,9 +139,9 @@ def test_blocking_middleware(self): #the seeds, responses, links and errors have not reached the backend. assert [len(list) for list in fm.backend.lists] == [0]*4 #the 3 seeds reach the first three middlewares. - assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3]*3 + assert [len(fm.middlewares[i].requests) for i in range(2, 5)] == [3]*3 #the error, response and link reached the first three middlewares. - assert [[len(list) for list in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1]*3]*3 + assert [[len(list) for list in fm.middlewares[i].lists[1:]] for i in range(2, 5)] == [[1]*3]*3 #the values do not reach the bottom 2 middlewares and the canonical solver. - assert [[len(list) for list in fm.middlewares[i].lists] for i in range(3, 5)] == [[0]*4]*2 + assert [[len(list) for list in fm.middlewares[i].lists] for i in range(5, 7)] == [[0]*4]*2 assert [len(list) for list in fm.canonicalsolver.lists] == [0]*4 diff --git a/tests/test_message_bus.py b/tests/test_message_bus.py index b283a7405..1addc6ca3 100644 --- a/tests/test_message_bus.py +++ b/tests/test_message_bus.py @@ -2,8 +2,9 @@ from __future__ import absolute_import from frontera.settings import Settings from frontera.contrib.messagebus.zeromq import MessageBus as ZeroMQMessageBus -from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus, Consumer as KafkaConsumer +from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus from frontera.utils.fingerprint import sha1 +from flaky import flaky from kafka import KafkaClient from random import randint from time import sleep @@ -121,9 +122,11 @@ def setUp(self): logging.basicConfig() handler = logging.StreamHandler(stdout) logger = logging.getLogger("kafka") - logger.setLevel(logging.DEBUG) + logger.setLevel(logging.INFO) logger.addHandler(handler) + self.logger = logging.getLogger("tester") + self.logger.debug("setup started") kafka_location = "127.0.0.1:9092" client = KafkaClient(kafka_location) client.ensure_topic_exists("frontier-todo") @@ -155,8 +158,11 @@ def setUp(self): # spider self.sp_sl_p = spiderlog.producer() self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0)) + self.logger.debug("init is done") + def tearDown(self): + self.logger.debug("teardown") self.sw_us_p.close() self.db_sf_p.close() self.sp_sl_p.close() @@ -167,12 +173,14 @@ def tearDown(self): self.sp_sf_c.close() def spider_log_activity(self, messages): + self.logger.debug("spider log activity entered") for i in range(0, messages): if i % 2 == 0: self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://helloworld.com/way/to/the/sun/' + b'0') else: self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://way.to.the.sun' + b'0') self.sp_sl_p.flush() + self.logger.debug("spider log activity finished") def 
spider_feed_activity(self): sf_c = 0 @@ -227,6 +235,7 @@ def __init__(self): super(IPv6MessageBusTester, self).__init__(settings) +@flaky def test_zmq_message_bus(): """ Test MessageBus with default settings, IPv6 and Star as ZMQ_ADDRESS diff --git a/tests/test_message_bus_backend.py b/tests/test_message_bus_backend.py index d4753c52e..587a8c100 100644 --- a/tests/test_message_bus_backend.py +++ b/tests/test_message_bus_backend.py @@ -6,9 +6,9 @@ from frontera.core.models import Request, Response -r1 = Request('http://www.example.com/', meta={b'domain': {b'fingerprint': b'1'}}) -r2 = Request('http://www.scrapy.org/', meta={b'domain': {b'fingerprint': b'2'}}) -r3 = Request('http://www.test.com/some/page', meta={b'domain': {b'fingerprint': b'3'}}) +r1 = Request('http://www.example.com/', meta={b'domain': {b'fingerprint': b'1'}, b'fingerprint': b'abc'}) +r2 = Request('http://www.scrapy.org/', meta={b'domain': {b'fingerprint': b'2'}, b'fingerprint': b'012'}) +r3 = Request('http://www.test.com/some/page', meta={b'domain': {b'fingerprint': b'3'}, b'fingerprint': b'345'}) class TestMessageBusBackend(unittest.TestCase): @@ -38,18 +38,12 @@ def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zer settings.SPIDER_PARTITION_ID = -1 self.assertRaises(ValueError, self.mbb_setup, settings) - def test_add_seeds(self): - mbb = self.mbb_setup() - mbb.add_seeds([r1, r2, r3]) - seeds = [mbb._decoder.decode(m)[1][0] for m in mbb.spider_log_producer.messages] - self.assertEqual(set([seed.url for seed in seeds]), set([r1.url, r2.url, r3.url])) - def test_page_crawled(self): mbb = self.mbb_setup() resp = Response(r1.url, body='body', request=r1) mbb.page_crawled(resp) page = mbb._decoder.decode(mbb.spider_log_producer.messages[0])[1] - self.assertEqual((page.request.url, page.body), (resp.request.url, b'body')) + self.assertEqual((page.request.url, page.body), (resp.request.url, 'body')) def test_links_extracted(self): mbb = self.mbb_setup() diff --git a/tests/test_overused_buffer.py b/tests/test_overused_buffer.py index 96524a4c5..4213fe6b2 100644 --- a/tests/test_overused_buffer.py +++ b/tests/test_overused_buffer.py @@ -22,6 +22,15 @@ class DFSOverusedBackendTest(BackendSequenceTest): ] } + def get_settings(self): + settings = super(DFSOverusedBackendTest, self).get_settings() + settings.TEST_MODE = True + settings.LOGGING_MANAGER_ENABLED = False + settings.LOGGING_BACKEND_ENABLED = False + settings.LOGGING_DEBUGGING_ENABLED = False + settings.STRATEGY = 'tests.backends.DFSCrawlingStrategy' + return settings + def test_sequence1(self): sequence = self.get_sequence(TEST_SITES['SITE_09'], max_next_requests=5, downloader_simulator=DownloaderSimulator(rate=1)) diff --git a/tests/test_partitioners.py b/tests/test_partitioners.py index 61f52ada8..4e530ddc9 100644 --- a/tests/test_partitioners.py +++ b/tests/test_partitioners.py @@ -9,10 +9,10 @@ def test_fingerprint_partitioner(): fp = FingerprintPartitioner(partitions) key = '1be68ff556fd0bbe5802d1a100850da29f7f15b1' partition = fp.partition(key, partitions) - assert partition == 4 + assert partition == 2 partition = fp.partition(key, None) - assert partition == 4 + assert partition == 2 def test_crc32name_partitioner(): diff --git a/tests/test_scrapy.py b/tests/test_scrapy.py index e29608001..4090d716c 100644 --- a/tests/test_scrapy.py +++ b/tests/test_scrapy.py @@ -1,12 +1,23 @@ # -*- coding: utf-8 -*- - from __future__ import absolute_import -from frontera.contrib.scrapy.converters import RequestConverter, ResponseConverter + +import sys 
+ +from scrapy.core.spidermw import SpiderMiddlewareManager +from scrapy.http import Request, Response from scrapy.http.request import Request as ScrapyRequest from scrapy.http.response import Response as ScrapyResponse -from frontera.core.models import Request as FrontierRequest +from scrapy.spiders import Spider +from scrapy.utils.test import get_crawler +from twisted.internet.defer import Deferred +from twisted.trial import unittest from w3lib.util import to_bytes +from frontera.contrib.scrapy.converters import (RequestConverter, + ResponseConverter) +from frontera.core.models import Request as FrontierRequest +from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler + class TestSpider(object): def callback(self): @@ -75,3 +86,89 @@ def test_request_response_converters(): frontier_request = FrontierRequest(url) request_converted = rc.from_frontier(frontier_request) assert frontier_request.url == url + + +class TestFronteraMiddlewaresWithScrapy(unittest.TestCase): + + def init_smw(self, custom_settings): + class TestSpider(Spider): + name = 'test' + + self.spider = TestSpider + scrapy_default_middlewares = { + 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700 + } + + # monkey patch SPIDER_MIDDLEWARES_BASE to include only referer middleware + sys.modules['scrapy.settings.default_settings'].SPIDER_MIDDLEWARES_BASE = scrapy_default_middlewares + crawler = get_crawler(self.spider, custom_settings) + self.add_frontera_scheduler(crawler) + self.smw = SpiderMiddlewareManager.from_crawler(crawler) + + @staticmethod + def add_frontera_scheduler(crawler): + scheduler = FronteraScheduler(crawler) + + # mock these functions + scheduler.frontier.page_crawled = lambda x: x + scheduler.frontier.links_extracted = lambda x, y: x + scheduler.stats_manager.add_crawled_page = lambda x, y: x + + class Engine(object): + def __init__(self, scheduler): + self.slot = type('slot', (object,), {}) + self.slot.scheduler = scheduler + + crawler.engine = Engine(scheduler) + + def perform_test(self, output_func): + def request_callback(response): + yield Request('http://frontera.org') + + req = Request( + url='http://www.scrapy.org', + callback=request_callback, + meta={b'frontier_request': FrontierRequest('http://www.scrapy.org')} + ) + + res = Response(url='http://www.scrapy.org', request=req) + + def call_request_callback(result, request, spider): + dfd = Deferred() + dfd.addCallback(request.callback) + return dfd + + def test_failure(failure): + # work around for test to fail with detailed traceback + self._observer._errors.append(failure) + + dfd = self.smw.scrape_response(call_request_callback, res, req, self.spider) + + dfd.addCallback(output_func) + dfd.addErrback(test_failure) + + dfd.callback(res) + + def test_frontera_scheduler_spider_mw_with_referer_mw(self): + + def test_middleware_output(result): + out = list(result) + # Frontera swallows requests but passes items + self.assertEquals(len(out), 0) + + self.init_smw({ + 'SPIDER_MIDDLEWARES': {'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000} + }) + self.perform_test(test_middleware_output) + + def test_frontera_scheduler_spider_mw_without_referer_mw(self): + + def test_middleware_output(result): + out = list(result) + self.assertEquals(len(out), 1) + self.assertIsInstance(out[0], Request) + self.assertIn('Referer', out[0].headers) + self.assertEquals(out[0].headers['Referer'], to_bytes('http://www.scrapy.org')) + + self.init_smw({}) + self.perform_test(test_middleware_output) diff --git 
a/tests/test_scrapy_spider.py b/tests/test_scrapy_spider.py index 0b797ddcd..d1e40a24d 100644 --- a/tests/test_scrapy_spider.py +++ b/tests/test_scrapy_spider.py @@ -3,12 +3,37 @@ from twisted.internet import reactor from scrapy.crawler import Crawler from scrapy import signals -from scrapy.settings import Settings +from scrapy.settings import Settings as ScrapySettings from tests.scrapy_spider.spiders.example import MySpider +from frontera.settings import Settings as FronteraSettings +from frontera.utils import add_seeds +import pytest +from os import remove +from os.path import exists -def test_scrapy_spider(): - settings = Settings() +@pytest.fixture() +def seeds_file(): + fh = open("seeds.txt", "w") + fh.write("https://en.wikipedia.org/wiki/Main_Page") + fh.close() + yield "seeds.txt" + remove("seeds.txt") + + +@pytest.fixture() +def db_file(request): + def rm_file(): + if exists("test.db"): + remove("test.db") + rm_file() + request.addfinalizer(rm_file) + +@pytest.mark.skip("throws ReactorNotRestartable and requires some planning") +def test_scrapy_spider(seeds_file, db_file): + fs = FronteraSettings(module="tests.scrapy_spider.frontera.settings") + add_seeds.run_add_seeds(fs, seeds_file) + settings = ScrapySettings() settings.setmodule("tests.scrapy_spider.settings") crawler = Crawler(MySpider, settings=settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) diff --git a/tests/test_seed_loader.py b/tests/test_seed_loader.py deleted file mode 100644 index bc512e2a9..000000000 --- a/tests/test_seed_loader.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -import unittest -from shutil import rmtree -from tempfile import mkdtemp - -from scrapy.spiders import Spider - -from frontera.settings import Settings -from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader, NotConfigured -from frontera.contrib.scrapy.middlewares.seeds.s3 import S3SeedLoader - -from tests.mocks.boto import MockConnection -from tests import mock - - -class TestFileSeedLoader(unittest.TestCase): - - def setUp(self): - self.tmp_path = mkdtemp() - - def tearDown(self): - rmtree(self.tmp_path) - - def seed_loader_setup(self, seeds_content=None): - seed_path = os.path.join(self.tmp_path, 'seeds.txt') - default_content = """ -https://www.example.com -https://www.scrapy.org -""" - seeds_content = seeds_content or default_content - with open(seed_path, 'wb') as tmpl_file: - tmpl_file.write(seeds_content.encode('utf-8')) - assert os.path.isfile(seed_path) # Failure of test itself - settings = Settings() - settings.SEEDS_SOURCE = seed_path - crawler = type('crawler', (object,), {}) - crawler.settings = settings - return FileSeedLoader(crawler) - - def test_seeds_not_configured(self): - crawler = type('crawler', (object,), {}) - crawler.settings = Settings() - self.assertRaises(NotConfigured, FileSeedLoader, crawler) - - def test_load_seeds(self): - seed_loader = self.seed_loader_setup() - seeds = seed_loader.load_seeds() - self.assertEqual(seeds, ['https://www.example.com', 'https://www.scrapy.org']) - - def test_process_start_requests(self): - seed_loader = self.seed_loader_setup() - requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) - - def test_process_start_requests_ignore_comments(self): - seeds_content = """ -https://www.example.com -# https://www.dmoz.org -https://www.scrapy.org -# https://www.test.com -""" - seed_loader = self.seed_loader_setup(seeds_content) - 
requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) - - -class TestS3SeedLoader(unittest.TestCase): - - def setUp(self): - self.tmp_path = mkdtemp() - settings = Settings() - settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder' - settings.SEEDS_AWS_ACCESS_KEY = 'access_key' - settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key' - crawler = type('crawler', (object,), {}) - crawler.settings = settings - self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt') - self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt') - s1_content = """ -https://www.example.com -https://www.scrapy.org -""" - s2_content = """ -https://www.dmoz.org -https://www.test.com -""" - - with open(self.seed_path_1, 'wb') as tmpl_file: - tmpl_file.write(s1_content.encode('utf-8')) - with open(self.seed_path_2, 'wb') as tmpl_file: - tmpl_file.write(s2_content.encode('utf-8')) - self.seed_loader = S3SeedLoader(crawler) - - def tearDown(self): - rmtree(self.tmp_path) - - def test_invalid_s3_seed_source(self): - crawler = type('crawler', (object,), {}) - settings = Settings() - settings.SEEDS_SOURCE = 'invalid_url' - crawler.settings = settings - self.assertRaises(NotConfigured, S3SeedLoader, crawler) - - def test_process_start_requests(self): - urls = ['https://www.example.com', 'https://www.scrapy.org', - 'https://www.dmoz.org', 'https://www.test.com'] - self.check_request_urls(urls) - - def test_s3_loader_ignores_non_txt_files(self): - urls = [] - self.check_request_urls(urls, '.ini') - - def check_request_urls(self, urls, key_extension='.txt'): - with open(self.seed_path_1, 'rU') as s1: - with open(self.seed_path_2, 'rU') as s2: - conn = MockConnection() - bucket = conn.create_bucket('some-bucket') - bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1) - bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2) - - def mocked_connect_s3(*args, **kwargs): - return conn - - with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3', - side_effect=mocked_connect_s3): - requests = self.seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual(set([r.url for r in requests]), set(urls)) diff --git a/tests/test_strategy.py b/tests/test_strategy.py index 3c6e5dafc..bc8a23430 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -1,26 +1,28 @@ # -*- coding: utf-8 -*- -from frontera.worker.strategies import BaseCrawlingStrategy -from frontera.worker.strategy import StatesContext +from frontera.strategy import BaseCrawlingStrategy from frontera.settings import Settings -from tests.mocks.frontier_manager import FakeFrontierManager +from frontera.core.manager import WorkerFrontierManager, StatesContext from frontera.contrib.backends.memory import MemoryStates from frontera.core.components import States class DummyCrawlingStrategy(BaseCrawlingStrategy): - def add_seeds(self, seeds): + def read_seeds(self, seeds_file): pass def page_crawled(self, response): pass - def page_error(self, request, error): + def request_error(self, request, error): pass def links_extracted(self, request, links): pass + def filter_extracted_links(self, request, links): + pass + class MessageBusStream(object): def send(self, request, score=1.0, dont_queue=False): @@ -33,11 +35,13 @@ def flush(self): class TestCrawlingStrategy(object): def strategy(self): settings = Settings() - manager = FakeFrontierManager(settings) + settings.BACKEND = 
'frontera.contrib.backends.sqlalchemy.Distributed' + settings.STRATEGY = 'tests.test_strategy.DummyCrawlingStrategy' + manager = WorkerFrontierManager.from_settings(settings, db_worker=False, strategy_worker=True) stream = MessageBusStream() states = MemoryStates(10) states_ctx = StatesContext(states) - return DummyCrawlingStrategy.from_worker(manager, stream, states_ctx) + return manager.strategy def test_create_request(self): s = self.strategy() @@ -46,7 +50,7 @@ def test_create_request(self): def test_states_refresh(self): s = self.strategy() - states = s._states_context._states + states = s._states_context.states url = "http://test.com/someurl" req1 = s.create_request(url) req1.meta[b'state'] = States.CRAWLED @@ -55,4 +59,4 @@ def test_states_refresh(self): req2 = s.create_request(url) s.refresh_states([req2]) assert req2.meta[b'state'] == req1.meta[b'state'] - assert req2.meta[b'state'] == States.CRAWLED + assert req2.meta[b'state'] == States.CRAWLED \ No newline at end of file diff --git a/tests/test_utils_async.py b/tests/test_utils_async.py index bbf6d83fe..10c75326d 100644 --- a/tests/test_utils_async.py +++ b/tests/test_utils_async.py @@ -4,7 +4,7 @@ from twisted.test.proto_helpers import MemoryReactor from twisted.internet.protocol import Factory from twisted.internet.task import Clock -from frontera.utils.async import CallLaterOnce, listen_tcp +from frontera.utils.twisted_helpers import CallLaterOnce, listen_tcp class TestCallLaterOnce(object): diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index 05b91d0c2..8750b58d9 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -1,15 +1,16 @@ from frontera.core.models import Request, Response -from frontera.worker.db import DBWorker +from frontera.worker.db import DBWorker, ScoringConsumer, IncomingConsumer, BatchGenerator from frontera.settings import Settings from frontera.core.components import States +import unittest -r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0}) -r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0}) -r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0}) +r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.example.com'}}) +r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.scrapy.org'}}) +r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0, b'domain':{b'name':'www.dmoz.org'}}) -class TestDBWorker(object): +class TestDBWorker(unittest.TestCase): def dbw_setup(self, distributed=False): settings = Settings() @@ -19,71 +20,75 @@ def dbw_setup(self, distributed=False): settings.BACKEND = 'tests.mocks.components.FakeDistributedBackend' else: settings.BACKEND = 'tests.mocks.components.FakeBackend' - return DBWorker(settings, True, True, False) - - def test_add_seeds(self): - dbw = self.dbw_setup() - msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.seeds]) == set([r.url for r in [r1, r2, r3]]) + return DBWorker(settings, False, False, False, partitions=[0,1,2,3]) def test_page_crawled(self): dbw = self.dbw_setup() resp = Response(r1.url, request=r1) msg = dbw._encoder.encode_page_crawled(resp) - 
dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.responses]) == set([r1.url]) + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert set([r.url for r in incoming_consumer.backend.responses]) == set([r1.url]) def test_links_extracted(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_links_extracted(r1, [r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert set([r.url for r in dbw._backend.links]) == set([r2.url, r3.url]) + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert set([r.url for r in incoming_consumer.backend.links]) == set([r2.url, r3.url]) def test_request_error(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_request_error(r1, 'error') - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert dbw._backend.errors[0][0].url == r1.url - assert dbw._backend.errors[0][1] == 'error' + incoming_consumer = dbw.slot.components[IncomingConsumer] + incoming_consumer.spider_log_consumer.put_messages([msg]) + incoming_consumer.run() + assert incoming_consumer.backend.errors[0][0].url == r1.url + assert incoming_consumer.backend.errors[0][1] == 'error' def test_scoring(self): dbw = self.dbw_setup(True) - msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) - dbw.spider_log_consumer.put_messages([msg]) - dbw.consume_incoming() - assert dbw.new_batch() == 0 + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.run() + assert dbw.stats["last_batch_size"] == 0 msg1 = dbw._encoder.encode_update_score(r1, 0.5, True) msg2 = dbw._encoder.encode_update_score(r3, 0.6, True) - dbw.scoring_log_consumer.put_messages([msg1, msg2]) - dbw.consume_scoring() - assert set([r.url for r in dbw._backend.queue.requests]) == set([r1.url, r3.url]) - assert dbw.new_batch() == 2 + scoring_worker = dbw.slot.components[ScoringConsumer] + scoring_worker.scoring_log_consumer.put_messages([msg1, msg2]) + scoring_worker.run() + assert set([r.url for r in dbw.backend.queue.requests]) == set([r1.url, r3.url]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 2 def test_new_batch(self): dbw = self.dbw_setup(True) - dbw._backend.queue.put_requests([r1, r2, r3]) - assert dbw.new_batch() == 3 - assert set(dbw.spider_feed_producer.messages) == \ + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.backend.queue.put_requests([r1, r2, r3]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 3 + assert set(batch_gen.spider_feed_producer.messages) == \ set([dbw._encoder.encode_request(r) for r in [r1, r2, r3]]) def test_offset(self): dbw = self.dbw_setup(True) + incoming_worker = dbw.slot.components[IncomingConsumer] + batch_gen = dbw.slot.components[BatchGenerator] + batch_gen.spider_feed = incoming_worker.spider_feed + batch_gen.spider_feed_producer = incoming_worker.spider_feed_producer msg = dbw._encoder.encode_offset(2, 50) - dbw.spider_log_consumer.put_messages([msg]) - dbw.spider_feed_producer.offset = 100 - dbw.consume_incoming() - assert 2 in dbw.spider_feed.available_partitions() + incoming_worker.spider_log_consumer.put_messages([msg]) + incoming_worker.spider_feed_producer.offset = 100 + incoming_worker.run() + assert 2 in batch_gen.spider_feed.available_partitions() msg1 = dbw._encoder.encode_offset(2, 20) msg2 = dbw._encoder.encode_offset(3, 0) - 
dbw.spider_log_consumer.put_messages([msg1, msg2]) - dbw.consume_incoming() - assert 3 in dbw.spider_feed.available_partitions() - assert 2 not in dbw.spider_feed.available_partitions() - dbw._backend.queue.put_requests([r1, r2, r3]) - assert dbw.new_batch() == 3 - assert 3 in dbw._backend.partitions + incoming_worker.spider_log_consumer.put_messages([msg1, msg2]) + incoming_worker.run() + assert 3 in batch_gen.spider_feed.available_partitions() + assert 2 not in batch_gen.spider_feed.available_partitions() + batch_gen.backend.queue.put_requests([r1, r2, r3]) + batch_gen.run() + assert dbw.stats["last_batch_size"] == 3 + assert 3 in batch_gen.backend.partitions diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 7a2acd873..b35a160f8 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -1,9 +1,11 @@ from frontera.worker.strategy import StrategyWorker -from frontera.worker.strategies.bfs import CrawlingStrategy from frontera.settings import Settings from frontera.core.models import Request, Response from frontera.core.components import States - +from tests.mocks.components import CrawlingStrategy +from unittest import TestCase +from os import remove +from os.path import exists r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0}) r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0}) @@ -11,32 +13,59 @@ r4 = Request('http://www.test.com/some/page', meta={b'fingerprint': b'4', b'jid': 0}) -class TestStrategyWorker(object): +class FilteredLinksCrawlingStrategy(CrawlingStrategy): + def filter_extracted_links(self, request, links): + return [] + + +class TestStrategyWorker(TestCase): + def setUp(self): + settings = Settings() + settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 + self.sw = StrategyWorker(settings, False) + + def tearDown(self): + if exists("/tmp/test_urls.txt"): + remove("/tmp/test_urls.txt") + pass - def sw_setup(self): + def sw_setup_filtered_links(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.STRATEGY = 'tests.test_worker_strategy.FilteredLinksCrawlingStrategy' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 - return StrategyWorker(settings, CrawlingStrategy) + return StrategyWorker(settings, False) + + def sw_setup_add_seeds(self): + settings = Settings() + settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' + settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 + settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' + return StrategyWorker(settings, True) def test_add_seeds(self): - sw = self.sw_setup() - msg = sw._encoder.encode_add_seeds([r1, r2, r3, r4]) - sw.consumer.put_messages([msg]) - r2.meta[b'state'] = States.CRAWLED - sw.states.update_cache([r2]) - sw.work() + sw = self.sw_setup_add_seeds() + fh = open("/tmp/test_urls.txt", "wb") + fh.write(b"http://example1.com/\n") + fh.write(b"http://www.scrapy.org/\n") + fh.close() - r1.meta[b'state'] = States.QUEUED - r3.meta[b'state'] = States.QUEUED - r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set([sw._encoder.encode_update_score(r, 1.0, True) - for r in [r1, r3, r4]]) + 
sw.run("file:///tmp/test_urls.txt") + + assert sw.add_seeds_mode == True + produced = [sw._decoder.decode(msg) for msg in sw.update_score._producer.messages] + assert len(produced) == 2 + assert all(msg[0] == 'update_score' for msg in produced) + assert produced[0][1].url == "http://example1.com/" + assert produced[1][1].url == "http://www.scrapy.org/" def test_page_crawled(self): - sw = self.sw_setup() + sw = self.sw r1.meta[b'jid'] = 1 resp = Response(r1.url, request=r1) msg = sw._encoder.encode_page_crawled(resp) @@ -44,15 +73,33 @@ def test_page_crawled(self): sw.work() # response should be skipped if it's jid doesn't match the strategy worker's assert sw.scoring_log_producer.messages == [] - sw.job_id = 1 + sw.workflow.job_id = 1 sw.consumer.put_messages([msg]) sw.work() r1c = r1.copy() - sw.states.set_states(r1c) + sw.workflow.states_context.states.set_states(r1c) assert r1c.meta[b'state'] == States.CRAWLED def test_links_extracted(self): - sw = self.sw_setup() + sw = self.sw + sw.job_id = 0 + r1.meta[b'jid'] = 0 + msg = sw._encoder.encode_links_extracted(r1, [r3, r4]) + sw.consumer.put_messages([msg]) + sw.work() + r3.meta[b'state'] = States.QUEUED + r4.meta[b'state'] = States.QUEUED + + # decoding messages from scoring log + fprints = set() + for msg in sw.scoring_log_producer.messages: + typ, req, score, is_schedule = sw._decoder.decode(msg) + fprints.add(req.meta[b'fingerprint']) + + assert fprints == set([r.meta[b'fingerprint'] for r in [r3, r4]]) + + def test_filter_links_extracted(self): + sw = self.sw_setup_filtered_links() sw.job_id = 0 r1.meta[b'jid'] = 0 msg = sw._encoder.encode_links_extracted(r1, [r3, r4]) @@ -60,14 +107,13 @@ def test_links_extracted(self): sw.work() r3.meta[b'state'] = States.QUEUED r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set(sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) for r in [r3, r4]) + assert set(sw.scoring_log_producer.messages) == set() def test_request_error(self): - sw = self.sw_setup() + sw = self.sw msg = sw._encoder.encode_request_error(r4, 'error') sw.consumer.put_messages([msg]) sw.work() - r4.meta[b'state'] = States.ERROR - assert sw.scoring_log_producer.messages.pop() == \ - sw._encoder.encode_update_score(r4, 0.0, False) + sw.workflow.states_context.states.set_states(r4) + + assert r4.meta[b'state'] == States.ERROR \ No newline at end of file diff --git a/tox.ini b/tox.ini index 9e47f6eac..97b4627df 100644 --- a/tox.ini +++ b/tox.ini @@ -17,7 +17,7 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements/tests.txt commands = - py.test --cov-report=term --cov=frontera -s -v {posargs:tests} + py.test --cov-report=term --cov=frontera -v {posargs:tests} [testenv:flake8] changedir = {toxinidir} @@ -26,12 +26,13 @@ commands = flake8 setup.py frontera # Options for flake8 [flake8] -ignore = E265,E501,F401,W391,W292,E226 +ignore = E265,E501,F401,W391,W292,E226,W504,W605 exclude = frontera/_version.py,versioneer.py,docs/source/conf.py,frontera/contrib/backends/opic/discovery.py # Options for pytest [pytest] -addopts = -rsvXf +addopts = -rvXf testpaths = tests ignore=requirements +log_cli_level=INFO