
Commit c1d2b5b

Merge pull request #1060 from Norconex/feature/CU-86888fmx3/crawler-stability

JavaDoc + better serialization of DelayRange + renaming of FeaturedIm…
essiembre authored Sep 14, 2024
2 parents 0a03d59 + e544471 commit c1d2b5b
Showing 81 changed files with 1,438 additions and 3,154 deletions.
@@ -26,7 +26,7 @@ public final class CrawlDocMetadata {
     //TODO use the same prefix for both crawler and importer...
     // all "document." ? In any case, no longer make it "collector."
 
-    public static final String PREFIX = "collector.";
+    public static final String PREFIX = "crawler.";
 
     public static final String DEPTH = PREFIX + "depth";
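The practical effect of this rename is that crawler-managed metadata fields previously exposed under "collector." (e.g., "collector.depth") now use the "crawler." prefix (e.g., "crawler.depth"). As a minimal sketch of reading the renamed field, assuming Properties-style metadata accessors as seen elsewhere in this diff (the "doc" variable is a placeholder for a crawled document):

    // Sketch only: CrawlDocMetadata.DEPTH now resolves to "crawler.depth"
    // rather than "collector.depth".
    int depth = doc.getMetadata().getInteger(CrawlDocMetadata.DEPTH);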
@@ -230,7 +230,7 @@ void testLifeCycle() {
                 1,
                 new Condition<>(
                         req -> req.getMetadata().getBoolean(
-                                "collector.is-crawl-new"),
+                                "crawler.is-crawl-new"),
                         ""))
                 .map(CommitterRequest::getReference)
                 // ref1 is last because orphans are processed last
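Downstream test code checking this flag only needs the new prefix; the access pattern itself is unchanged. A minimal sketch, assuming a memory-based committer that captures requests as in the test above (the "committer" variable and its getAllRequests() accessor are placeholders, not necessarily this project's actual API):

    // Sketch: keep references of requests flagged as newly crawled documents.
    var newRefs = committer.getAllRequests().stream()
            .filter(req -> req.getMetadata().getBoolean("crawler.is-crawl-new"))
            .map(CommitterRequest::getReference)
            .toList();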
7 changes: 7 additions & 0 deletions crawler/web/pom.xml
@@ -214,6 +214,13 @@
       <artifactId>selenium</artifactId>
       <scope>test</scope>
     </dependency>
+<!--
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <scope>test</scope>
+    </dependency>
+-->
     <!-- TODO: Implement using this, or remove:
     <dependency>
       <groupId>com.norconex.commons</groupId>
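Should the commented-out dependency above ever be enabled, a typical mockito-junit-jupiter test wires mocks through the JUnit 5 extension. The following is a generic, self-contained sketch; the types involved are illustrative placeholders, not classes from this project:

    import static org.junit.jupiter.api.Assertions.assertEquals;
    import static org.mockito.Mockito.when;

    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.api.extension.ExtendWith;
    import org.mockito.Mock;
    import org.mockito.junit.jupiter.MockitoExtension;

    @ExtendWith(MockitoExtension.class)
    class GreeterTest {

        // Placeholder collaborator, here only to have something to mock.
        interface Greeter {
            String greet(String name);
        }

        @Mock
        private Greeter greeter; // initialized by MockitoExtension

        @Test
        void returnsStubbedGreeting() {
            when(greeter.greet("world")).thenReturn("hello, world");
            assertEquals("hello, world", greeter.greet("world"));
        }
    }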
60 changes: 36 additions & 24 deletions crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java
@@ -19,15 +19,19 @@
 
 import com.norconex.crawler.core.Crawler;
 import com.norconex.crawler.core.CrawlerBuilder;
+import com.norconex.crawler.core.CrawlerException;
 import com.norconex.crawler.core.cli.CliCrawlerLauncher;
 import com.norconex.crawler.web.callbacks.WebCrawlerCallbacks;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.pipelines.WebDocPipelines;
 import com.norconex.crawler.web.fetch.HttpFetcherProvider;
 
-public class WebCrawler {
+/**
+ * Facade for launching or obtaining a Web Crawler.
+ */
+public final class WebCrawler {
 
-    protected static final Supplier<CrawlerBuilder> crawlerBuilderSupplier =
+    private static final Supplier<CrawlerBuilder> crawlerBuilderSupplier =
             () -> Crawler
                     .builder()
                     .configuration(new WebCrawlerConfig())
@@ -37,6 +41,9 @@ public class WebCrawler {
                     .docContextType(WebCrawlDocContext.class)
                     .context(new WebCrawlerContext());
 
+    private WebCrawler() {
+    }
+
     /**
      * Invokes the Web Crawler from the command line.
      * You can invoke it without any arguments to get a list of command-line
@@ -52,33 +59,38 @@ public static void main(String[] args) {
         }
     }
 
+    /**
+     * Launches the Web Crawler. Similar to {@link #main(String[])}, but
+     * does not call {@link System#exit(int)} and returns the execution
+     * status code instead. It will throw a runtime exception upon failure
+     * (typically a {@link CrawlerException}).
+     * @param args command line arguments
+     * @return execution status code
+     */
     public static int launch(String... args) {
-        return CliCrawlerLauncher.launch(crawlerBuilderSupplier.get(), args);
+        return CliCrawlerLauncher.launch(builder(), args);
     }
 
+    /**
+     * Creates a Web Crawler instance.
+     * @param crawlerConfig Web Crawler configuration
+     * @return crawler
+     */
     public static Crawler create(WebCrawlerConfig crawlerConfig) {
-        return crawlerBuilderSupplier
-                .get()
-                .configuration(
-                        Optional.ofNullable(crawlerConfig)
-                                .orElseGet(WebCrawlerConfig::new))
+        return builder()
+                .configuration(Optional.ofNullable(crawlerConfig)
+                        .orElseGet(WebCrawlerConfig::new))
                 .build();
     }
 
-    //    static CrawlSessionImpl initCrawlSessionImpl(
-    //            CrawlSessionConfig sessionConfig) {
-    //        return CrawlSessionImpl
-    //                .builder()
-    //                .crawlerConfigClass(WebCrawlerConfig.class)
-    //                .crawlerFactory(
-    //                        (sess, cfg) -> Crawler.builder()
-    //                                .crawlSession(sess)
-    //                                .crawlerConfig(cfg)
-    //                                .crawlerImpl(WebCrawlerImplFactory.create())
-    //                                .build()
-    //                )
-    //                .beanMapper(Web.beanMapper())
-    //                .crawlSessionConfig(sessionConfig)
-    //                .build();
-    //    }
+    /**
+     * Gets the builder used to create a Web Crawler. To get a web crawler
+     * instance, it is best to call {@link #create(WebCrawlerConfig)}.
+     * This method is typically for internal use, unless you know what you
+     * are doing and want to create your own crawler based on this one.
+     * @return crawler builder
+     */
+    public static CrawlerBuilder builder() {
+        return crawlerBuilderSupplier.get();
+    }
 }
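For illustration, the refactored facade can be used roughly as follows. This is a sketch based only on the signatures visible in this diff; configuration details and how the returned Crawler is subsequently run are out of scope here:

    // Obtain a crawler instance from a configuration (a null config
    // falls back to a default WebCrawlerConfig):
    Crawler crawler = WebCrawler.create(new WebCrawlerConfig());

    // Or run it CLI-style without terminating the JVM; unlike main(),
    // launch() returns the execution status code instead of calling
    // System.exit(). Per the javadoc, no arguments prints the options.
    int status = WebCrawler.launch();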