From 0073e9cd5a69ae005f76407e5722f594e8368956 Mon Sep 17 00:00:00 2001
From: Batard Florent
Date: Mon, 19 Aug 2019 11:17:42 +0900
Subject: [PATCH] Try to optimize the crawling performance

---
 config/config.yml              | 24 ++++++++++++------------
 lib/modules/crawler/crawler.py | 27 ++++++++++++++-------------
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/config/config.yml b/config/config.yml
index 0f15549..23de87a 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -15,18 +15,18 @@ datastore: lib/data

 ## List of fingerprint plugins activated
 fingerprint_plugins:
-  - cms
-  - system
-  - framework
-  - frontend
-  - header
-  - lang
-  - server
-  - waf
+    - cms
+    - system
+    - framework
+    - frontend
+    - header
+    - lang
+    - server
+    - waf

 ## List of attacks plugins activated
 attack_plugins:
-  - bruteforce
-  - injection
-  - vulns
-  - other
+    - bruteforce  # Parallelized
+    - injection   # Parallelized
+    - vulns
+    - other
diff --git a/lib/modules/crawler/crawler.py b/lib/modules/crawler/crawler.py
index c1e83da..80e7272 100644
--- a/lib/modules/crawler/crawler.py
+++ b/lib/modules/crawler/crawler.py
@@ -8,7 +8,8 @@
 from lib.utils.container import Services


-urls = []
+urls = set()
+allowed_domains = []


 class SitadelSpider(CrawlSpider):
@@ -18,18 +19,16 @@ class SitadelSpider(CrawlSpider):
         Rule(
             LinkExtractor(canonicalize=True, unique=True),
             follow=True,
-            callback="parse_items",
+            process_links="parse_items",
         )
     ]

     # Method for parsing items
-    def parse_items(self, response):
-        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
+    def parse_items(self, links):
         for link in links:
-            for allowed_domain in super.allowed_domains:
-                if urlparse(link.url).netloc == allowed_domain:
-                    urls.append(link.url)
-                    yield scrapy.Request(link.url, callback=self.parse)
+            if urlparse(link.url).netloc in allowed_domains:
+                urls.add(link.url)
+                yield link


 def crawl(url, user_agent):
@@ -39,21 +38,23 @@ def crawl(url, user_agent):
     settings = get_project_settings()
     settings.set("USER_AGENT", user_agent)
     settings.set("LOG_LEVEL", "CRITICAL")
+    settings.set("RETRY_ENABLED", False)
+    settings.set("CONCURRENT_REQUESTS", 15)

     # Create the process that will perform the crawl
     output.info("Start crawling the target website")
     process = CrawlerProcess(settings)
-    domain = urlparse(url).hostname
-    process.crawl(SitadelSpider, start_urls=[str(url)], allowed_domains=[str(domain)])
+    allowed_domains.append(str(urlparse(url).hostname))
+    process.crawl(SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains)
     process.start()

     # Clean the results
-    clean_urls = set()
+    clean_urls = []
     for u in urls:
         try:
             new_url = urlparse(u).geturl()
-            clean_urls.add(new_url)
+            clean_urls.append(new_url)
         except ValueError:
             continue

-    return list(clean_urls)
+    return clean_urls
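
The heart of the change: the Rule's own LinkExtractor output is now filtered through a process_links hook instead of re-extracting links in a callback and yielding fresh Requests by hand, discovered URLs are deduplicated in a module-level set as they arrive, and retries are turned off while request concurrency is pinned in the Scrapy settings. What follows is a minimal, self-contained sketch of that flow for anyone who wants to try it outside Sitadel, assuming only that Scrapy is installed; DemoSpider, the example.com target, the user agent string and the __main__ wrapper are illustrative stand-ins, not code from this patch.

from urllib.parse import urlparse

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

urls = set()           # deduplicated as links are discovered
allowed_domains = []   # filled in by crawl() before the process starts


class DemoSpider(CrawlSpider):
    name = "demo"
    rules = [
        Rule(
            LinkExtractor(canonicalize=True, unique=True),
            follow=True,
            # process_links receives the links the extractor already found,
            # so no second LinkExtractor pass runs per response.
            process_links="parse_items",
        )
    ]

    def parse_items(self, links):
        # Record in-scope links and hand them back to the Rule so the
        # crawl keeps following them.
        for link in links:
            if urlparse(link.url).netloc in allowed_domains:
                urls.add(link.url)
                yield link


def crawl(url, user_agent):
    settings = {
        "USER_AGENT": user_agent,
        "LOG_LEVEL": "CRITICAL",
        "RETRY_ENABLED": False,     # skip retries, as the patch does
        "CONCURRENT_REQUESTS": 15,  # the patch's value; Scrapy defaults to 16
    }
    allowed_domains.append(str(urlparse(url).hostname))
    process = CrawlerProcess(settings)
    process.crawl(DemoSpider, start_urls=[str(url)], allowed_domains=allowed_domains)
    process.start()  # blocks until the crawl finishes
    return [urlparse(u).geturl() for u in urls]


if __name__ == "__main__":
    for found in crawl("http://example.com", "Mozilla/5.0"):
        print(found)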