Try to optimize the crawling performance
shenril committed Aug 19, 2019
1 parent d7f6772 commit 0073e9c
Showing 2 changed files with 26 additions and 25 deletions.
24 changes: 12 additions & 12 deletions config/config.yml
@@ -15,18 +15,18 @@ datastore: lib/data

## List of fingerprint plugins activated
fingerprint_plugins:
-  - cms
-  - system
-  - framework
-  - frontend
-  - header
-  - lang
-  - server
-  - waf
+  - cms
+  - system
+  - framework
+  - frontend
+  - header
+  - lang
+  - server
+  - waf

## List of attacks plugins activated
attack_plugins:
-  - bruteforce
-  - injection
-  - vulns
-  - other
+  - bruteforce # Parallelized
+  - injection # Parallelized
+  - vulns
+  - other
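For context on the "Parallelized" comments above: the two flagged attack plugins are meant to be dispatched concurrently while the remaining ones run sequentially. The sketch below only illustrates that idea; run_attack(plugin_name, target) is a hypothetical entry point and not Sitadel's actual plugin runner.

from concurrent.futures import ThreadPoolExecutor

PARALLEL = {"bruteforce", "injection"}  # plugins flagged "# Parallelized" in config.yml

def run_attack(plugin_name, target):
    # Hypothetical placeholder for an attack plugin entry point.
    print(f"running {plugin_name} against {target}")

def run_attacks(plugins, target, workers=4):
    # Flagged plugins run concurrently in a thread pool; the others run one by one
    # once the pool has drained (the with-block waits for submitted tasks).
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for name in plugins:
            if name in PARALLEL:
                pool.submit(run_attack, name, target)
    for name in plugins:
        if name not in PARALLEL:
            run_attack(name, target)

run_attacks(["bruteforce", "injection", "vulns", "other"], "https://example.com")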
27 changes: 14 additions & 13 deletions lib/modules/crawler/crawler.py
@@ -8,7 +8,8 @@

from lib.utils.container import Services

-urls = []
+urls = set()
+allowed_domains = []


class SitadelSpider(CrawlSpider):
@@ -18,18 +19,16 @@ class SitadelSpider(CrawlSpider):
        Rule(
            LinkExtractor(canonicalize=True, unique=True),
            follow=True,
-           callback="parse_items",
+           process_links="parse_items",
        )
    ]

    # Method for parsing items
-   def parse_items(self, response):
-       links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
+   def parse_items(self, links):
        for link in links:
-           for allowed_domain in super.allowed_domains:
-               if urlparse(link.url).netloc == allowed_domain:
-                   urls.append(link.url)
-                   yield scrapy.Request(link.url, callback=self.parse)
+           if urlparse(link.url).netloc in allowed_domains:
+               urls.add(link.url)
+               yield link


def crawl(url, user_agent):
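The switch from callback= to process_links= in the hunk above means Scrapy hands the Link objects it has already extracted to parse_items before scheduling any requests, instead of downloading each page and running a second LinkExtractor pass inside a callback (the removed loop over super.allowed_domains would in fact have raised an AttributeError, since super there is the builtin type, not the spider). A minimal standalone sketch of the same pattern, with placeholder spider and domain names that are not part of this commit:

from urllib.parse import urlparse

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ScopedSpider(CrawlSpider):
    # Placeholder spider; example.com stands in for the scanned target.
    name = "scoped_example"
    start_urls = ["https://example.com"]
    in_scope = {"example.com"}

    rules = [
        Rule(
            LinkExtractor(canonicalize=True, unique=True),
            follow=True,
            # process_links receives the extracted Link objects and yields
            # only the ones the crawl is allowed to follow.
            process_links="keep_in_scope",
        )
    ]

    def keep_in_scope(self, links):
        for link in links:
            if urlparse(link.url).netloc in self.in_scope:
                yield link

Such a spider would be driven by a CrawlerProcess exactly as crawl() does in the next hunk.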
@@ -39,21 +38,23 @@ def crawl(url, user_agent):
    settings = get_project_settings()
    settings.set("USER_AGENT", user_agent)
    settings.set("LOG_LEVEL", "CRITICAL")
+   settings.set("RETRY_ENABLED", False)
+   settings.set("CONCURRENT_REQUESTS", 15)

    # Create the process that will perform the crawl
    output.info("Start crawling the target website")
    process = CrawlerProcess(settings)
-   domain = urlparse(url).hostname
-   process.crawl(SitadelSpider, start_urls=[str(url)], allowed_domains=[str(domain)])
+   allowed_domains.append(str(urlparse(url).hostname))
+   process.crawl(SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains)
    process.start()

    # Clean the results
-   clean_urls = set()
+   clean_urls = []
    for u in urls:
        try:
            new_url = urlparse(u).geturl()
-           clean_urls.add(new_url)
+           clean_urls.append(new_url)
        except ValueError:
            continue

-   return list(clean_urls)
+   return clean_urls
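Assuming the surrounding module is otherwise unchanged, the optimized crawler is called the same way as before; a small usage sketch (the target URL, user agent, and the assumption that the module is importable from the project root are placeholders):

from lib.modules.crawler.crawler import crawl

# RETRY_ENABLED=False and CONCURRENT_REQUESTS=15 trade retry robustness for
# crawl speed; process.start() blocks until the crawl finishes.
found = crawl("https://example.com", "Mozilla/5.0 (compatible; Sitadel)")
print(f"{len(found)} unique URLs collected")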
