
Commit c1d2b5b

Merge pull request #1060 from Norconex/feature/CU-86888fmx3/crawler-stability

JavaDoc + better serialization of DelayRange + renaming of FeaturedIm…
essiembre authored Sep 14, 2024
2 parents 0a03d59 + e544471 commit c1d2b5b
Showing 81 changed files with 1,438 additions and 3,154 deletions.
@@ -26,7 +26,7 @@ public final class CrawlDocMetadata {
     //TODO use the same prefix for both crawler and importer...
     // all "document." ? In any case, no longer make it "collector."
 
-    public static final String PREFIX = "collector.";
+    public static final String PREFIX = "crawler.";
 
     public static final String DEPTH = PREFIX + "depth";
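The practical effect of this rename is that crawler-managed metadata fields previously exposed under "collector." (e.g., "collector.depth") now use the "crawler." prefix (e.g., "crawler.depth"). As a minimal sketch of reading the renamed field, assuming Properties-style metadata accessors as seen elsewhere in this diff (the "doc" variable is a placeholder for a crawled document):

    // Sketch only: CrawlDocMetadata.DEPTH now resolves to "crawler.depth"
    // rather than "collector.depth".
    int depth = doc.getMetadata().getInteger(CrawlDocMetadata.DEPTH);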
@@ -230,7 +230,7 @@ void testLifeCycle() {
                 1,
                 new Condition<>(
                         req -> req.getMetadata().getBoolean(
-                                "collector.is-crawl-new"),
+                                "crawler.is-crawl-new"),
                         ""))
                 .map(CommitterRequest::getReference)
                 // ref1 is last because orphans are processed last
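Downstream test code checking this flag only needs the new prefix; the access pattern itself is unchanged. A minimal sketch, assuming a memory-based committer that captures requests as in the test above (the "committer" variable and its getAllRequests() accessor are placeholders, not necessarily this project's actual API):

    // Sketch: keep references of requests flagged as newly crawled documents.
    var newRefs = committer.getAllRequests().stream()
            .filter(req -> req.getMetadata().getBoolean("crawler.is-crawl-new"))
            .map(CommitterRequest::getReference)
            .toList();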
7 changes: 7 additions & 0 deletions crawler/web/pom.xml
@@ -214,6 +214,13 @@
       <artifactId>selenium</artifactId>
       <scope>test</scope>
     </dependency>
+<!--
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <scope>test</scope>
+    </dependency>
+-->
     <!-- TODO: Implement using this, or remove:
     <dependency>
       <groupId>com.norconex.commons</groupId>
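Should the commented-out dependency above ever be enabled, a typical mockito-junit-jupiter test wires mocks through the JUnit 5 extension. The following is a generic, self-contained sketch; the types involved are illustrative placeholders, not classes from this project:

    import static org.junit.jupiter.api.Assertions.assertEquals;
    import static org.mockito.Mockito.when;

    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.api.extension.ExtendWith;
    import org.mockito.Mock;
    import org.mockito.junit.jupiter.MockitoExtension;

    @ExtendWith(MockitoExtension.class)
    class GreeterTest {

        // Placeholder collaborator, here only to have something to mock.
        interface Greeter {
            String greet(String name);
        }

        @Mock
        private Greeter greeter; // initialized by MockitoExtension

        @Test
        void returnsStubbedGreeting() {
            when(greeter.greet("world")).thenReturn("hello, world");
            assertEquals("hello, world", greeter.greet("world"));
        }
    }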
60 changes: 36 additions & 24 deletions crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java
@@ -19,15 +19,19 @@
 
 import com.norconex.crawler.core.Crawler;
 import com.norconex.crawler.core.CrawlerBuilder;
+import com.norconex.crawler.core.CrawlerException;
 import com.norconex.crawler.core.cli.CliCrawlerLauncher;
 import com.norconex.crawler.web.callbacks.WebCrawlerCallbacks;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.pipelines.WebDocPipelines;
 import com.norconex.crawler.web.fetch.HttpFetcherProvider;
 
-public class WebCrawler {
+/**
+ * Facade for launching or obtaining a Web Crawler.
+ */
+public final class WebCrawler {
 
-    protected static final Supplier<CrawlerBuilder> crawlerBuilderSupplier =
+    private static final Supplier<CrawlerBuilder> crawlerBuilderSupplier =
             () -> Crawler
                     .builder()
                     .configuration(new WebCrawlerConfig())
@@ -37,6 +41,9 @@ public class WebCrawler {
                     .docContextType(WebCrawlDocContext.class)
                     .context(new WebCrawlerContext());
 
+    private WebCrawler() {
+    }
+
     /**
      * Invokes the Web Crawler from the command line.
      * You can invoke it without any arguments to get a list of command-line
@@ -52,33 +59,38 @@ public static void main(String[] args) {
         }
     }
 
+    /**
+     * Launches the Web Crawler. Similar to {@link #main(String[])}, but
+     * does not call {@link System#exit(int)} and returns the execution
+     * status code instead. It will throw a runtime exception upon failure
+     * (typically a {@link CrawlerException}).
+     * @param args command line arguments
+     * @return execution status code
+     */
     public static int launch(String... args) {
-        return CliCrawlerLauncher.launch(crawlerBuilderSupplier.get(), args);
+        return CliCrawlerLauncher.launch(builder(), args);
     }
 
+    /**
+     * Creates a Web Crawler instance.
+     * @param crawlerConfig Web Crawler configuration
+     * @return crawler
+     */
     public static Crawler create(WebCrawlerConfig crawlerConfig) {
-        return crawlerBuilderSupplier
-                .get()
-                .configuration(
-                        Optional.ofNullable(crawlerConfig)
-                                .orElseGet(WebCrawlerConfig::new))
+        return builder()
+                .configuration(Optional.ofNullable(crawlerConfig)
+                        .orElseGet(WebCrawlerConfig::new))
                 .build();
     }
 
-    //    static CrawlSessionImpl initCrawlSessionImpl(
-    //            CrawlSessionConfig sessionConfig) {
-    //        return CrawlSessionImpl
-    //                .builder()
-    //                .crawlerConfigClass(WebCrawlerConfig.class)
-    //                .crawlerFactory(
-    //                        (sess, cfg) -> Crawler.builder()
-    //                                .crawlSession(sess)
-    //                                .crawlerConfig(cfg)
-    //                                .crawlerImpl(WebCrawlerImplFactory.create())
-    //                                .build()
-    //                )
-    //                .beanMapper(Web.beanMapper())
-    //                .crawlSessionConfig(sessionConfig)
-    //                .build();
-    //    }
+    /**
+     * Gets the builder used to create a Web Crawler. To get a web crawler
+     * instance, it is best to call {@link #create(WebCrawlerConfig)}.
+     * This method is typically for internal use, unless you know what you
+     * are doing and want to create your own crawler based on this one.
+     * @return crawler builder
+     */
+    public static CrawlerBuilder builder() {
+        return crawlerBuilderSupplier.get();
+    }
 }
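For illustration, the refactored facade can be used roughly as follows. This is a sketch based only on the signatures visible in this diff; configuration details and how the returned Crawler is subsequently run are out of scope here:

    // Obtain a crawler instance from a configuration (a null config
    // falls back to a default WebCrawlerConfig):
    Crawler crawler = WebCrawler.create(new WebCrawlerConfig());

    // Or run it CLI-style without terminating the JVM; unlike main(),
    // launch() returns the execution status code instead of calling
    // System.exit(). Per the javadoc, no arguments prints the options.
    int status = WebCrawler.launch();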