Add execution time limit
VincentLanglet committed Dec 11, 2024
1 parent b9fb6cf commit f5ac838
Showing 3 changed files with 111 additions and 2 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -224,16 +224,19 @@ Crawler::create()
    ->setConcurrency(1) // now all urls will be crawled one by one
```

## Defining Crawl Limits
## Defining Crawl and Time Limits

By default, the crawler continues until it has crawled every page it can find. This behavior might cause issues in constrained environments, such as serverless platforms that cap execution time.

The crawl behavior can be controlled with the following options:

- **Total Crawl Limit** (`setTotalCrawlLimit`): This limit defines the maximum number of URLs to crawl.
- **Current Crawl Limit** (`setCurrentCrawlLimit`): This defines how many URLs are processed during the current crawl.
- **Total Execution Time Limit** (`setTotalExecutionTimeLimit`): This limit defines the maximum execution time of the crawl, in seconds.
- **Current Execution Time Limit** (`setCurrentExecutionTimeLimit`): This limits the execution time of the current crawl, in seconds.

Let's take a look at some examples to clarify the difference between these two methods.
Let's take a look at some examples to clarify the difference between `setTotalCrawlLimit` and `setCurrentCrawlLimit`.
The same distinction applies to `setTotalExecutionTimeLimit` and `setCurrentExecutionTimeLimit`; a short sketch follows.
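
A minimal sketch combining the two new limits (the `120`/`30` values are illustrative, not defaults):

```php
Crawler::create()
    ->setTotalExecutionTimeLimit(120) // stop once 120 seconds have been spent across all crawls
    ->setCurrentExecutionTimeLimit(30) // stop the current crawl after 30 seconds
    ->startCrawling($url);
```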

### Example 1: Using the total crawl limit

68 changes: 68 additions & 0 deletions src/Crawler.php
@@ -43,6 +43,14 @@ class Crawler

    protected ?int $currentCrawlLimit = null;

    protected ?int $startedAt = null;

    protected int $executionTime = 0;

    protected ?int $totalExecutionTimeLimit = null;

    protected ?int $currentExecutionTimeLimit = null;

    protected int $maximumResponseSize = 1024 * 1024 * 2;

    protected ?int $maximumDepth = null;
@@ -174,6 +182,44 @@ public function getCurrentCrawlCount(): int
        return $this->currentUrlCount;
    }

    public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
    {
        $this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;

        return $this;
    }

    public function getTotalExecutionTimeLimit(): ?int
    {
        return $this->totalExecutionTimeLimit;
    }

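    /**
     * Total crawl time in seconds: time accumulated by previous
     * startCrawling() runs plus the time of the run in progress.
     */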
    public function getTotalExecutionTime(): int
    {
        return $this->executionTime + $this->getCurrentExecutionTime();
    }

    public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
    {
        $this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;

        return $this;
    }

    public function getCurrentExecutionTimeLimit(): ?int
    {
        return $this->currentExecutionTimeLimit;
    }

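    /**
     * Seconds elapsed since the current startCrawling() call,
     * or 0 when no crawl is in progress.
     */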
    public function getCurrentExecutionTime(): int
    {
        if (is_null($this->startedAt)) {
            return 0;
        }

        return time() - $this->startedAt;
    }

    public function setMaximumDepth(int $maximumDepth): self
    {
        $this->maximumDepth = $maximumDepth;
@@ -412,6 +458,8 @@ public function getBaseUrl(): UriInterface

    public function startCrawling(UriInterface|string $baseUrl)
    {
        $this->startedAt = time();

        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }
@@ -445,6 +493,9 @@ public function startCrawling(UriInterface|string $baseUrl)
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }

        $this->executionTime += time() - $this->startedAt;
        $this->startedAt = null; // To reset currentExecutionTime
    }

    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
@@ -480,6 +531,7 @@ protected function startCrawlingQueue(): void
    {
        while (
            $this->reachedCrawlLimits() === false &&
            $this->reachedTimeLimits() === false &&
            $this->crawlQueue->hasPendingUrls()
        ) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
@@ -504,6 +556,7 @@ protected function getCrawlRequests(): Generator
    {
        while (
            $this->reachedCrawlLimits() === false &&
            $this->reachedTimeLimits() === false &&
            $crawlUrl = $this->crawlQueue->getPendingUrl()
        ) {
            if (
@@ -556,4 +609,19 @@ public function reachedCrawlLimits(): bool

        return false;
    }

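    /**
     * Returns true once either the total execution time (across all crawls)
     * or the current crawl's execution time reaches its configured limit.
     */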
    public function reachedTimeLimits(): bool
    {
        $totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
        if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
            return true;
        }

        $currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
        if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
            return true;
        }

        return false;
    }
}
38 changes: 38 additions & 0 deletions tests/CrawlerTest.php
@@ -517,6 +517,44 @@ function ($url) {
    assertCrawledUrlCount(3);
});

it('respects the total execution time limit', function () {
    $baseUrl = 'http://localhost:8080';

    $crawler = createCrawler()
        ->setMaximumDepth(2)
        ->setDelayBetweenRequests(500) // 500ms
        ->setTotalExecutionTimeLimit(2)
        ->setCrawlProfile(new CrawlSubdomains($baseUrl));

    $crawler->startCrawling($baseUrl);

    // At a 500ms delay per URL, only four URLs can be crawled in 2 seconds.
    assertCrawledUrlCount(4);

    $crawler->startCrawling($baseUrl);

    // The total limit spans both runs, so the second run adds nothing.
    assertCrawledUrlCount(4);
});

it('respects the current execution time limit', function () {
    $baseUrl = 'http://localhost:8080';

    $crawler = createCrawler()
        ->setMaximumDepth(2)
        ->setDelayBetweenRequests(500) // 500ms
        ->setCurrentExecutionTimeLimit(2)
        ->setCrawlProfile(new CrawlSubdomains($baseUrl));

    $crawler->startCrawling($baseUrl);

    // At a 500ms delay per URL, only four URLs can be crawled in 2 seconds.
    assertCrawledUrlCount(4);

    $crawler->startCrawling($baseUrl);

    // The current limit resets for each startCrawling() call, so the second run crawls more URLs.
    assertCrawledUrlCount(11);
});

function javascriptInjectedUrls(): array
{
    return [[
