From d8d6d653344f58b4ae02e130588b8bf049414ecb Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
- * The following should be shared across concrete implementations
- * (which can add more configurable attributes and tags).
- *
- * XML configuration usage:
- *
- *
- * The following should be shared across concrete implementations
- * (which can add more configurable attributes and tags).
- *
- * {@nx.xml
- * @param ignoreRobotsCrawlDelay true if ignoring
- *     robots.txt crawl delay
- * @return true if ignoring robots.txt crawl delay
+ * file. Not applicable when robots.txt are ignored.
*/
private boolean ignoreRobotsCrawlDelay = false;
/**
- * Gets the delay scope.
- * @param scope one of "crawler", "site", or "thread".
- * @return delay scope
+ * Gets the delay scope. See class documentation for a description
+ * of supported scopes.
*/
private DelayResolverScope scope = DEFAULT_SCOPE;
}
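For illustration, a minimal usage sketch of the delay options above, assuming the Lombok-generated chained setters implied by the surrounding configuration classes and that these fields surface on GenericDelayResolverConfig. The DelayResolverScope constants are inferred from the "crawler", "site", and "thread" scopes named in the removed Javadoc and are assumptions, not part of this patch:

    import com.norconex.crawler.web.doc.operations.delay.impl.GenericDelayResolverConfig;
    // DelayResolverScope package is not shown in this patch; import assumed.

    class DelayConfigSketch {
        // Sketch only: setter names derive from the fields shown above.
        static GenericDelayResolverConfig siteScopedDelay() {
            return new GenericDelayResolverConfig()
                    .setScope(DelayResolverScope.SITE)      // apply delay per site
                    .setIgnoreRobotsCrawlDelay(true);       // skip robots.txt crawl-delay
        }
    }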
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
index fb8a72f4f..4a5e1ff96 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
@@ -18,9 +18,14 @@
import com.norconex.commons.lang.Sleeper;
+import lombok.EqualsAndHashCode;
+import lombok.ToString;
+
/**
* It is assumed there will be one instance of this class per crawler defined.
*/
+@EqualsAndHashCode(onlyExplicitlyIncluded = true)
+@ToString(onlyExplicitlyIncluded = true)
public class CrawlerDelay extends AbstractDelay {
private MutableLong lastHitEpochNanos = new MutableLong(-1);
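The crawler-wide behaviour implied here — every thread consulting the same last-hit timestamp so the minimum delay applies across the whole crawler — can be sketched roughly as follows. This is an illustrative sketch only, not the actual CrawlerDelay implementation from this patch:

    import org.apache.commons.lang3.mutable.MutableLong;

    // Every thread consults and updates the same timestamp, so the minimum
    // gap between hits is enforced across the whole crawler.
    class SharedDelaySketch {
        private final MutableLong lastHitEpochNanos = new MutableLong(-1);

        void delay(long minGapNanos) throws InterruptedException {
            long sleepNanos;
            synchronized (lastHitEpochNanos) {
                long now = System.nanoTime();
                long last = lastHitEpochNanos.longValue();
                sleepNanos = last < 0 ? 0 : Math.max(0, minGapNanos - (now - last));
                // Reserve the next allowed hit time for whoever comes next.
                lastHitEpochNanos.setValue(now + sleepNanos);
            }
            if (sleepNanos > 0) {
                Thread.sleep(sleepNanos / 1_000_000L, (int) (sleepNanos % 1_000_000L));
            }
        }
    }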
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
index 3d791d873..c1157cac0 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
@@ -18,11 +18,13 @@
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
+import lombok.experimental.FieldNameConstants;
@Data
@Accessors(chain = true)
@AllArgsConstructor
@NoArgsConstructor
+@FieldNameConstants
public class DelayRange
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- * The above example set the minimum delay between each document download
- * on a given site to 5 seconds, no matter what the crawler robots.txt may
- * say, except on weekend, where it is more agressive (1 second).
- *
 */
@EqualsAndHashCode
@ToString
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
index 456e54095..0b04c9da6 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
@@ -19,80 +19,13 @@
import java.util.List;
import com.norconex.commons.lang.collection.CollectionUtil;
-import com.norconex.commons.lang.time.DurationParser;
import lombok.Data;
import lombok.experimental.Accessors;
/**
*
- * Default implementation for creating voluntary delays between URL downloads.
- * There are a few ways the actual delay value can be defined (in order):
- *
- *
- * In a delay schedule, the days of weeks are spelled out (in English):
- * Monday, Tuesday, etc. Time ranges are using the 24h format.
- *
- *
- * One of these following scope dictates how the delay is applied, listed
- * in order from the best behaved to the least.
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- * The above example set the minimum delay between each document download
- * on a given site to 5 seconds, no matter what the crawler robots.txt may
- * say, except on weekend, where it is more agressive (1 second).
+ * Configuration for {@link GenericDelayResolver}.
*
*/
@Data
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
index ed16d5727..89bae0f9f 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
@@ -16,8 +16,6 @@
import java.time.Duration;
-import com.norconex.commons.lang.time.DurationParser;
-
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.ToString;
@@ -54,37 +52,6 @@
* any given thread. The more threads you have the less of an
* impact the delay will have.
*
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- *
- *
- * }
- *
- * The above examlpe will increase the delay to 10 seconds when encountering
- * PDFs from a default of 3 seconds.
- *
- *
* @since 2.5.0
*/
@EqualsAndHashCode
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
index c06a2aeaf..9ecde8f5a 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
@@ -19,74 +19,14 @@
import java.util.List;
import com.norconex.commons.lang.collection.CollectionUtil;
-import com.norconex.commons.lang.time.DurationParser;
import lombok.Data;
import lombok.experimental.Accessors;
/**
*
- * Introduces different delays between document downloads based on matching
- * document reference (URL) patterns.
- * There are a few ways the actual delay value can be defined (in order):
+ * Configuration for {@link ReferenceDelayResolver}.
*
- *- *
- * - Takes the delay specify by a robots.txt file.
- *   Only applicable if robots.txt files and its robots crawl delays
- *   are not ignored.
- * - Takes the delay matching a reference pattern, if any (picks the first
- *   one matching).
- * - Used the specified default delay or 3 seconds, if none is
- *   specified.
- *
- * One of these following scope dictates how the delay is applied, listed
- * in order from the best behaved to the least.
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- *
- *
- * }
- *
- * The above examlpe will increase the delay to 10 seconds when encountering
- * PDFs from a default of 3 seconds.
- *
- * * @since 2.5.0 */ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java index 62fcffbda..190650609 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java @@ -52,24 +52,6 @@ * Whenduplicate
istrue
, it will count the maximum * number of duplicate segments found. * - * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(a regex identifying segment separator) - *- * } - * - * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *
- * * @since 1.2 * @see Pattern */ diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java index 74bec03fe..230faa072 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java @@ -14,8 +14,6 @@ */ package com.norconex.crawler.web.doc.operations.filter.impl; -import java.util.regex.Pattern; - import com.norconex.crawler.core.doc.operations.filter.OnMatch; import lombok.Data; @@ -23,40 +21,9 @@ /** *- * Filters URL based based on the number of URL segments. A URL with - * a number of segments equal or more than the specified count will either - * be included or excluded, as specified. - *
- *- * By default - * segments are obtained by breaking the URL text at each forward slashes - * (/), starting after the host name. You can define different or - * additional segment separator characters. + * Configuration for {@link SegmentCountUrlFilter}. *
- *- * When
- * - * {@nx.xml.usage - *duplicate
istrue
, it will count the maximum - * number of duplicate segments found. - *- * - * } - * - * {@nx.xml.example - *(a regex identifying segment separator) - *- * } - * - * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *
- * * @since 1.2 - * @see Pattern */ @Data @Accessors(chain = true) @@ -67,8 +34,24 @@ public class SegmentCountUrlFilterConfig { /** Default segment count. */ public static final int DEFAULT_SEGMENT_COUNT = 10; + /** + * Number of segments after which this filter is considered a match. + * Default is {@value #DEFAULT_SEGMENT_COUNT} + */ private int count = DEFAULT_SEGMENT_COUNT; + /** + * Whether the configured segment count represents the number of + * duplicated segments for this filter to be considered a match. + */ private boolean duplicate; + /** + * Segment separator. Default is + * {@value #DEFAULT_SEGMENT_SEPARATOR_PATTERN}. + */ private String separator = DEFAULT_SEGMENT_SEPARATOR_PATTERN; + + /** + * Action to undertake when there is a match. + */ private OnMatch onMatch; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java deleted file mode 100644 index aa657f570..000000000 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2017-2024 Norconex Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.norconex.crawler.web.doc.operations.image.impl; - -import java.awt.Dimension; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import org.imgscalr.Scalr.Method; - -import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.core.doc.CrawlDocMetadata; - -import lombok.Data; -import lombok.Getter; -import lombok.experimental.Accessors; - -/** - *- * Document processor that extract the "main" image from HTML pages. - * Since HTML is expected, this class should only be used at - * pre-import processor. It is possible for this processor to not find any - * image. - *
- * - *Finding the image
- *- * By default this class will get the first image (<img>) matching - * the minimum size. You can specify you want the largest of all matching - * ones instead. In addition, if you know your images to be defined - * in a special way (e.g., all share the same CSS class), then you can use - * the "domSelector" to limit to one or a few images. See - * - * JSoup selector-syntax for how to build the "domSelector". - *
- * - *Storing the image
- *- * One or more storage method can be specified. Here are - * the possible storage options: - *
- *
collector.featured-image-url
field.
- * When only this option is set, scaling options and image format
- * have no effect.
- * collector.featured-image-inline
field.
- * The string is ready to be
- * used inline, in a <img src="..."> tag.
- * collector.featured-image-path
field.
- * [width]x[height]
- * or a single value. When a single value is used, that value represents both
- * the width and height (i.e., a square).
- *
- * {@nx.xml.example
- * - * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *
- * - * @since 2.8.0 - */ -@SuppressWarnings("javadoc") -@Data -@Accessors(chain = true) -public class FeaturedImageProcessorConfig { - - public static final String COLLECTOR_FEATURED_IMAGE_URL = - CrawlDocMetadata.PREFIX + "featured-image-url"; - public static final String COLLECTOR_FEATURED_IMAGE_PATH = - CrawlDocMetadata.PREFIX + "featured-image-path"; - public static final String COLLECTOR_FEATURED_IMAGE_INLINE = - CrawlDocMetadata.PREFIX + "featured-image-inline"; - - public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = - "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; - public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; - - /** - * Default image cache directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_IMAGE_CACHE_DIR = - "featuredImageCache"; - /** - * Default featured image directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_STORAGE_DISK_DIR = - "featuredImages"; - - public static final String DEFAULT_IMAGE_FORMAT = "png"; - public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); - public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); - public static final Storage DEFAULT_STORAGE = Storage.URL; - public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = - StorageDiskStructure.URL2PATH; - - public enum Storage { - URL, INLINE, DISK - } - - public enum StorageDiskStructure { - URL2PATH, DATE, DATETIME - } - - public enum Quality { - AUTO(Method.AUTOMATIC), - LOW(Method.SPEED), - MEDIUM(Method.BALANCED), - HIGH(Method.QUALITY), - MAX(Method.ULTRA_QUALITY); - - @Getter - private final Method scalrMethod; - - Quality(Method scalrMethod) { - this.scalrMethod = scalrMethod; - } - } - - private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN; - private String domSelector; - private Dimension minDimensions = DEFAULT_MIN_SIZE; - private Dimension scaleDimensions = DEFAULT_SCALE_SIZE; - private boolean scaleStretch; - private String imageFormat = DEFAULT_IMAGE_FORMAT; - private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE; - - private Path imageCacheDir; - private boolean largest; - private final List* Document processor that extract the "main" image from HTML pages. - * Since HTML is expected, this class should only be used at + * Since HTML is expected, this class should only be used as a * pre-import processor. It is possible for this processor to not find any * image. *
@@ -93,135 +93,21 @@ * *- * One or more storage method can be specified. Here are - * the possible storage options: + * When identified, the featured image can be stored either on local disk, + * or as a metadata field in Base64 format, or simply as a URL pointing + * to its remote location. See {@link FeaturedImageResolverConfig} for details. *
- *collector.featured-image-url
field.
- * When only this option is set, scaling options and image format
- * have no effect.
- * collector.featured-image-inline
field.
- * The string is ready to be
- * used inline, in a <img src="..."> tag.
- * collector.featured-image-path
field.
- * [width]x[height]
- * or a single value. When a single value is used, that value represents both
- * the width and height (i.e., a square).
- *
- * {@nx.xml.example
- * - * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *
- * * @since 2.8.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString @Slf4j -public class FeaturedImageProcessor +public class FeaturedImageResolver extends CrawlerLifeCycleListener implements DocumentConsumer, - Configurable+ * Configuration for {@link FeaturedImageResolver}. + *
+ * @since 2.8.0 + */ +@Data +@Accessors(chain = true) +public class FeaturedImageResolverConfig { + + public static final String FEATURED_IMAGE_URL_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-url"; + public static final String FEATURED_IMAGE_PATH_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-path"; + public static final String FEATURED_IMAGE_INLINE_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-inline"; + + public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = + "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; + public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; + + /** + * Default image cache directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_IMAGE_CACHE_DIR = + "featuredImageCache"; + /** + * Default featured image directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_STORAGE_DISK_DIR = + "featuredImages"; + + public static final String DEFAULT_IMAGE_FORMAT = "png"; + public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); + public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); + public static final Storage DEFAULT_STORAGE = Storage.URL; + public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = + StorageDiskStructure.URL2PATH; + + /** + * Type of featured image storages. + */ + public enum Storage { + /** + * Default storages. The absolute image URL is stored in a + * {@value #FEATURED_IMAGE_URL_FIELD} metadata field. + * When only this storages option is set, scaling options and image + * format have no effect. + */ + URL, + /** + * Stores a Base64 string of the scaled image, in the format + * specified, in a {@value #FEATURED_IMAGE_INLINE_FIELD} metadata + * field. The string is ready to be used inline, in a + * <img src="..."> tag (as an example). + */ + INLINE, + /** + * Stores the scaled image on the file system, in the format + * and directory specified. A reference to the file on disk is stored + * in a {@value #FEATURED_IMAGE_PATH_FIELD} metadata field. + */ + DISK + } + + /** + * Directory structure when storing images on disk. + */ + public enum StorageDiskStructure { + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g.,2000/12/31/
).
+ */
+ DATE,
+ /**
+ * Create directories for each date and time, up to seconds
+ * (e.g., 2000/12/31/13/34/12/
).
+ */
+ DATETIME
+ }
+
+ public enum Quality {
+ AUTO(Method.AUTOMATIC),
+ LOW(Method.SPEED),
+ MEDIUM(Method.BALANCED),
+ HIGH(Method.QUALITY),
+ MAX(Method.ULTRA_QUALITY);
+
+ @Getter
+ private final Method scalrMethod;
+
+ Quality(Method scalrMethod) {
+ this.scalrMethod = scalrMethod;
+ }
+ }
+
+ /**
+ * Optional regex to overwrite default matching of HTML pages.
+ * Default is {@value #DEFAULT_PAGE_CONTENT_TYPE_PATTERN}
+ */
+ private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN;
+ /**
+ * Optional CSS-like path matching one or more image elements.
+ */
+ private String domSelector;
+ /**
+ * Minimum pixel size for an image to be considered. Default is 400x400.
+ */
+ private Dimension minDimensions = DEFAULT_MIN_SIZE;
+ /**
+ * Target pixel size the featured image should be scaled to.
+ * Default is 150x150.
+ */
+ private Dimension scaleDimensions = DEFAULT_SCALE_SIZE;
+ /**
+ * Whether to stretch to match scale size. Default keeps aspect ratio.
+ */
+ private boolean scaleStretch;
+ /**
+ * Target format of stored image. E.g., "jpg", "png", "gif", "bmp", ...
+ * Default is {@value #DEFAULT_IMAGE_FORMAT}
+ */
+ private String imageFormat = DEFAULT_IMAGE_FORMAT;
+ /**
+ * Maximum number of images to cache on the local file system for faster
+ * processing.
+ * Set to 0 to disable caching. Default is
+ * {@value #DEFAULT_IMAGE_CACHE_SIZE}.
+ */
+ private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE;
+
+ /**
+ * Directory where to cache the images. Defaults to
+ * {@value #DEFAULT_IMAGE_CACHE_DIR}
+ */
+ private Path imageCacheDir;
+ /**
+ * When more than one featured image is found, whether to return the
+ * largest of them all (as opposed to the first one encountered).
+ */
+ private boolean largest;
+ /**
+ * One or more type of physical storages for the image.
+ */
+ private final ListWhen used before importing this class attempts to detect the content * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. + * was specified using + * {@link DomLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. + * Since document parsing converts content to UTF-8, UTF-8 is always assumed + * when used as a post-parse handler. *
* - *You can specify which parser to use when reading + *
You can specify which DOM parser to use when reading
* documents. The default is "html" and will normalize the content
* as HTML. This is generally a desired behavior, but this can sometimes
* have your selector fail. If you encounter this
@@ -122,7 +123,8 @@
* That information gets stored as metadata in the target document.
* If you want to limit the quantity of information extracted/stored,
* you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
+ * {@link DomLinkExtractorConfig#setIgnoreLinkData(boolean)} to
+ * true
.
*
http
, https
, and ftp
. You can
* specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
+ * {@link DomLinkExtractorConfig#setSchemes(java.util.List)}.
*
*
* - * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *
* {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} @@ -147,57 +149,11 @@ * won't be extracted (e.g. *<a href="x.html" rel="nofollow" ...>
).
* To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
+ * {@link DomLinkExtractorConfig#setIgnoreNofollow(boolean)} to true
.
*
*
- * {@nx.xml.usage
- *
- * The above example will extract URLs found in custom element attributes named
- * data-myurl
.
- *
- * Extracts links from a Document Object Model (DOM) representation of an - * HTML, XHTML, or XML document content based on values of matching - * elements and attributes. - *
- *- * In order to construct a DOM tree, text is loaded entirely - * into memory. It uses the document content by default, but it can also - * come from specified metadata fields. - * Use this filter with caution if you know you'll need to parse - * huge files. Use the {@link HtmlLinkExtractor} instead if this is a - * concern. - *
- *- * The jsoup parser library is used to load a - * document content into a DOM tree. Elements are referenced using a - * - * CSS or JQuery-like syntax. - *
- *- * This link extractor is normally used before importing. - *
- * - *When used before importing this class attempts to detect the content - * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. - *
- * - *You can specify which parser to use when reading - * documents. The default is "html" and will normalize the content - * as HTML. This is generally a desired behavior, but this can sometimes - * have your selector fail. If you encounter this - * problem, try switching to "xml" parser, which does not attempt normalization - * on the content. The drawback with "xml" is you may not get all HTML-specific - * selector options to work. If you know you are dealing with XML to begin - * with, specifying "xml" should be a good option. - *
- * - *- * You can define as many JSoup "selectors" as desired. All values matched - * by a selector will be extracted as a URL. - *
- *- * It is possible to control what gets extracted - * exactly for matching purposes thanks to the "extract" argument expected - * with every selector. Possible values are: - *
- * - * {@nx.include com.norconex.importer.util.DomUtil#extract} - * - *- * When not specified, the default is "text". - *
- * - *The default selectors / extract strategies are:
- *
- * For any extracted link values, this extractor will perform minimal
- * heuristics to clean extra content not part of a regular URL. For instance,
- * it will only keep what is after url=
when dealing with
- * <meta http-equiv
refresh URLs. It will also trim white
- * spaces.
- *
- * By default, contextual information is kept about the HTML/XML mark-up
- * tag from which a link is extracted (e.g., tag name and attributes).
- * That information gets stored as metadata in the target document.
- * If you want to limit the quantity of information extracted/stored,
- * you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
- *
Only valid
- *
- * schemes are extracted for absolute URLs. By default, those are
- * http
, https
, and ftp
. You can
- * specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
- *
- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} - * - *
- * By default, a regular HTML link having the "rel" attribute set to "nofollow"
- * won't be extracted (e.g.
- * <a href="x.html" rel="nofollow" ...>
).
- * To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
- *
- * The above example will extract URLs found in custom element attributes named
- * data-myurl
.
+ * Configuration for {@link DomLinkExtractor}.
*
html
or xml
.
- * @return html
(default) or xml
.
*/
private String parser = DomUtil.PARSER_HTML;
private boolean ignoreNofollow;
/**
* Whether to ignore extra data associated with a link.
- * @param ignoreLinkData true
to ignore.
- * @return true
to ignore.
*/
private boolean ignoreLinkData;
private final List- * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *
* {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} ** You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. + * {@link HtmlLinkExtractorConfig#setContentTypeMatcher(com.norconex.commons.lang.text.TextMatcher)}. * Make sure they represent a file with HTML-like markup tags containing URLs. * For documents that are just * too different, consider implementing your own {@link LinkExtractor} instead. @@ -108,7 +107,8 @@ *
* The meta.http-equiv
is treated differently. Only if the
* "http-equiv" value is "refresh" and a "content" attribute with a URL exist
- * that it will be extracted. "object" and "applet" can have multiple URLs.
+ * that it will be extracted. The "object" and "applet" tags can have
+ * multiple URLs.
*
@@ -124,7 +124,7 @@ * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. *
*- * The referrer data is always stored (was optional before). + * The referrer data is always stored. *
* *<a href="x.html" rel="nofollow" ...>
).
* To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
+ * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)} to
+ * true
.
*
*
* This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. + *
While extractor preserves hashtag characters (#) found + * in URLs and every characters after it, the default URL normalizer + * ({@link GenericUrlNormalizer}) will strip it by default. *
* *
@@ -171,7 +170,8 @@
* That information gets stored as metadata in the target document.
* If you want to limit the quantity of information extracted/stored,
* you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
+ * {@link HtmlLinkExtractorConfig#setIgnoreLinkData(boolean)} to
+ * true
.
*
http
, https
, and ftp
. You can
* specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
+ * {@link HtmlLinkExtractorConfig#setSchemes(List)}.
*
*
* URLs found in <!-- comments --> are no longer + *
URLs found in <!-- comments --> are not * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} + * {@link HtmlLinkExtractorConfig#setCommentsEnabled(boolean)} *
* *You can identify portions of a document where links * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. + * {@link HtmlLinkExtractorConfig#setExtractBetweens(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractBetweens(List)}. Eligible + * content for link extraction is identified first, and content to exclude is + * done on that subset. *
*You can further limit link extraction to specific * area by using * selector-syntax * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *
- * - * {@nx.xml.usage - *- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * {@link HtmlLinkExtractorConfig#setExtractSelectors(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractSelectors(List)}. *
*/ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -286,6 +221,7 @@ public class HtmlLinkExtractor private final HtmlLinkExtractorConfig configuration = new HtmlLinkExtractorConfig(); + // @formatter:off // NOTE: When this predicate is invoked the tag name is always lower case // and known to have been identified as a target tag name in configuration. // For each predicate, returning true won't try following predicates @@ -293,129 +229,69 @@ public class HtmlLinkExtractor @ToString.Exclude private final BiPredicate- * A memory efficient HTML link extractor. - *
- *- * This link extractor uses regular expressions to extract links. It does - * so on a chunk of text at a time, so that large files are not fully loaded - * into memory. If you prefer a more flexible implementation that loads the - * DOM model in memory to perform link extraction, consider using - * {@link DomLinkExtractor}. - *
- * - *- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} - *- * You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. - * Make sure they represent a file with HTML-like markup tags containing URLs. - * For documents that are just - * too different, consider implementing your own {@link LinkExtractor} instead. - * Removing the default values and define no content types will have for effect - * to try to extract URLs from all files (usually a bad idea). - *
- * - *- * a.href, frame.src, iframe.src, img.src, meta.http-equiv - *- * You can specify your own set of tags and attributes to have - * different ones used for extracting URLs. For an elaborated set, you can - * combine the above with your own list or use any of the following - * suggestions (tag.attribute): - *
- * applet.archive, applet.codebase, area.href, audio.src, - * base.href, blockquote.cite, body.background, button.formaction, - * command.icon, del.cite, embed.src, form.action, - * frame.longdesc, head.profile, html.manifest, iframe.longdesc, - * img.longdesc, img.usemap, input.formaction, input.src, - * input.usemap, ins.cite, link.href, object.archive, - * object.classid, object.codebase, object.data, object.usemap, - * q.cite, script.src, source.src, video.poster, - * video.src - *- *
- * The meta.http-equiv
is treated differently. Only if the
- * "http-equiv" value is "refresh" and a "content" attribute with a URL exist
- * that it will be extracted. "object" and "applet" can have multiple URLs.
- *
- * It is possible to identify a tag only as the holder of - * a URL (without attributes). The tag body value will be used as the URL. - *
- * - *- * Some "referrer" information is derived from the each link and stored as - * metadata in the document they point to. - * These may vary for each link, but they are normally prefixed with - * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. - *
- *- * The referrer data is always stored (was optional before). - *
- * - *This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *
- * - *
- * By default, a regular HTML link having the "rel" attribute set to "nofollow"
- * won't be extracted (e.g.
- * <a href="x.html" rel="nofollow" ...>
).
- * To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
- *
This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. - *
- * - *- * The URL specification says hashtags - * are used to represent fragments only. That is, to quickly jump to a specific - * section of the page the URL represents. Under normal circumstances, - * keeping the URL fragments usually leads to duplicates documents being fetched - * (same URL but different fragment) and they should be stripped. Unfortunately, - * there are sites not following the URL standard and using hashtags as a - * regular part of a URL (i.e. different hashtags point to different web pages). - * It may be essential when crawling these sites to keep the URL fragments. - * This can be done by making sure the URL normalizer does not strip them. - *
- * - *
- * By default, contextual information is kept about the HTML/XML mark-up
- * tag from which a link is extracted (e.g., tag name and attributes).
- * That information gets stored as metadata in the target document.
- * If you want to limit the quantity of information extracted/stored,
- * you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
- *
Only valid
- *
- * schemes are extracted for absolute URLs. By default, those are
- * http
, https
, and ftp
. You can
- * specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
- *
URLs found in <!-- comments --> are no longer - * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} - *
- * - *You can identify portions of a document where links - * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. - *
- *You can further limit link extraction to specific - * area by using - * selector-syntax - * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *
- * - * {@nx.xml.usage - *- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * Configuration for {@link HtmlLinkExtractor}. *
*/ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HtmlLinkExtractorConfig { @@ -266,8 +53,6 @@ public class HtmlLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.htmlContentTypes(); @@ -275,15 +60,11 @@ public class HtmlLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); /** * The maximum supported URL length. Longer URLs are ignored. - * @param maxURLLength maximum URL length - * @return maximum URL length */ private int maxURLLength = DEFAULT_MAX_URL_LENGTH; @@ -295,15 +76,11 @@ public class HtmlLinkExtractorConfig { * By default this link won't be crawled. * </a> * - * @param ignoreNofollow whether to ignore "nofollow" directives - * @returntrue
if ignoring "nofollow" directives
*/
private boolean ignoreNofollow;
/**
* Gets whether to ignore extra data associated with a link.
- * @param ignoreLinkData true
to ignore.
- * @return true
to ignore.
*/
private boolean ignoreLinkData;
@@ -311,8 +88,6 @@ public class HtmlLinkExtractorConfig {
* The character set to use for pages on which link extraction is performed.
* When null
(default), character set detection will be
* attempted.
- * @param charset character set to use, or null
- * @return character set to use, or null
*/
private Charset charset;
@@ -324,7 +99,6 @@ public class HtmlLinkExtractorConfig {
* <a href="https://yoursite.com/somepage.html">Some URL</a>
* -->
*
- * @return true
if links should be extracted from comments.
*/
private boolean commentsEnabled;
@@ -357,6 +131,7 @@ public List- * Web Crawler configuration. + * Web Crawler configuration, adding more options to the base + * {@link CrawlerConfig}. *
*@@ -90,7 +94,7 @@ * Scope: To limit crawling to specific web domains, and avoid creating * many filters to that effect, you can tell the crawler to "stay" within * the web site "scope" with - * {@link #setUrlCrawlScopeStrategy(GenericUrlScopeResolver)}. + * {@link #setUrlScopeResolver(UrlScopeResolver)}. *
* ** You can tell the crawler how it should handle HTTP GET and HEAD requests - * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport) and + * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport)} and * {@link #setMetadataFetchSupport(FetchDirectiveSupport)} respectively. * For each, the options are: *
@@ -302,15 +306,18 @@ * Metadata filters: Applies filtering on a document metadata fields. * *
- * If {@link #isFetchHttpHead()} returns true
, these filters
- * will be invoked after the crawler performs a distinct HTTP HEAD request.
+ * If {@link #getMetadataFetchSupport()} value forces a distinct call
+ * for fetching metadata, these filters will be invoked after the crawler
+ * performs an HTTP HEAD request.
* It gives you the opportunity to filter documents based on the HTTP HEAD
* response to potentially save a more expensive HTTP GET request for
* download (but results in two HTTP requests for valid documents --
- * HEAD and GET). Filtering occurs before URLs are extracted.
+ * HEAD and GET). Filtering occurs before URLs are extracted (since
+ * no content is downloaded.
*
- * When {@link #isFetchHttpHead()} is false
, these filters
+ * When {@link #getMetadataFetchSupport()} does not invoke making a
+ * distinct call for metadata, these filters
* will be invoked on the metadata of the HTTP response
* obtained from an HTTP GET request (as the document is downloaded).
* Filtering occurs after URLs are extracted.
@@ -326,10 +333,11 @@
* Importer filters: The Importer module also offers document
* filtering options. At that point a document is already downloaded
* and its links extracted. There are two types of filtering offered
- * by the Importer: before and after document parsing. Use
- * filters before parsing if you need to filter on raw content or
- * want to prevent an expensive parsing. Use filters after parsing
- * when you need to read the content as plain text.
+ * by the Importer: before and after document parsing (assuming you
+ * configured at least one parser). Use filters before parsing if you
+ * need to filter on raw content or want to avoid parsing some documents.
+ * Use filters after parsing when you need to read the content
+ * as plain text.
*
*
*
@@ -362,7 +370,7 @@
* HTML "nofollow": Most HTML-oriented link extractors support
* the rel="nofollow"
attribute set on HTML links and offer
* a way to disable this instruction. E.g.,
- * {@link HtmlLinkExtractor#setIgnoreNofollow(boolean)}.
+ * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)}.
*
*
null
via
- * {@link #setSitemapResolver(SitemapResolver_OLD) effectively disables
+ * {@link #setSitemapResolver(SitemapResolver)} effectively disables
* sitemap support altogether, and is thus incompatible with sitemaps
* specified as start references.
* null
via
- * {@link #setCanonicalLinkDetector(CanonicalLinkDetector) to disable
+ * {@link #setCanonicalLinkDetector(CanonicalLinkDetector)} to disable
* support canonical links (increasing the chance of getting duplicates).
*
*
- * EXPERIMENTAL:
* The crawler can attempt to detect and reject documents considered as
* duplicates within a crawler session. A document will be considered
* duplicate if there was already a document processed with the same
@@ -434,7 +441,7 @@
* {@link #setMetadataDeduplicate(boolean)} and/or
* {@link #setDocumentDeduplicate(boolean)} to true
. Setting
* those will have no effect if the corresponding checksummers are
- * null
.
+ * null
or checksums are otherwise not are being generated.
*
* Deduplication can impact crawl performance. It is recommended you @@ -461,99 +468,10 @@ * URLs in that field will become eligible for crawling. * See {@link #setPostImportLinks(TextMatcher)}. *
- * - * {@nx.xml.usage - *true
if keeping
- * @return true
if keeping
+ * Whether to keep the Importer-populated fields
+ * from {@link #getPostImportLinks()}. By default, those are deleted
+ * from a document when the URLs they contain are queued for processing
+ * or otherwise evaluated.
* @see #setPostImportLinks(TextMatcher)
*/
private boolean postImportLinksKeep;
@@ -620,8 +530,6 @@ public enum ReferencedLinkType {
* The provider of robots.txt rules for a site (if applicable).
* Defaults to {@link StandardRobotsTxtProvider}.
* Set to null
to disable.
- * @param robotsTxtProvider robots.txt provider
- * @return robots.txt provider
* @see #setIgnoreRobotsTxt(boolean)
*/
private RobotsTxtProvider robotsTxtProvider =
@@ -631,8 +539,6 @@ public enum ReferencedLinkType {
* The provider of robots metadata rules for a page (if applicable).
* Defaults to {@link StandardRobotsMetaProvider}.
* Set to null
to disable.
- * @param robotsMetaProvider robots metadata rules
- * @return robots metadata rules r
* @see #setIgnoreRobotsMeta(boolean)
*/
private RobotsMetaProvider robotsMetaProvider =
@@ -643,8 +549,6 @@ public enum ReferencedLinkType {
* Defaults to {@link GenericSitemapResolver}.
* Set to null
to disable all sitemap support, or
* see class documentation to disable sitemap detection only.
- * @param sitemapResolver sitemap resolver
- * @return sitemap resolver
* @see SitemapLocator
*/
private SitemapResolver sitemapResolver = new GenericSitemapResolver();
@@ -654,8 +558,6 @@ public enum ReferencedLinkType {
* Defaults to {@link GenericSitemapLocator}.
* Set to null
to disable locating sitemaps
* (relying on sitemaps defined as start reference, if any).
- * @param sitemapLocator sitemap locator
- * @return sitemap locator
* @see SitemapResolver
*/
private SitemapLocator sitemapLocator = new GenericSitemapLocator();
@@ -665,8 +567,6 @@ public enum ReferencedLinkType {
* crawled by a new crawl session. Usually amounts to checking if enough
* time has passed between two crawl sessions.
* Defaults to {@link GenericRecrawlableResolver}.
- * @param robotsMetaProvider recrawlable resolver
- * @return recrawlableResolver recrawlable resolver
*/
private RecrawlableResolver recrawlableResolver =
new GenericRecrawlableResolver();
@@ -690,6 +590,7 @@ public List- * You can specify your own restrictions using {@link #setRestrictions(List)}, + * You can specify your own restrictions using + * {@link RegexLinkExtractorConfig#getRestrictions()}, * but make sure they represent text files. *
* @@ -74,45 +74,11 @@ * detect the encoding of the a page when extracting links and * referrer information. If no charset could be detected, it falls back to * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - * - * - * {@nx.xml.usage - *- * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. + * {@link RegexLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. *
* * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class RegexLinkExtractor @@ -144,10 +110,8 @@ public Set extractLinks(CrawlDoc doc) throws IOException { doc.getMetadata() .matchKeys(configuration.getFieldMatcher()) .valueList() - .forEach( - val -> extractLinks( - links, val, - doc.getReference())); + .forEach(val -> extractLinks( + links, val, doc.getReference())); } else { // Body var sb = new StringBuilder(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java index 193618830..ed0a3534b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java @@ -22,7 +22,7 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; +import com.norconex.importer.handler.CommonMatchers; import lombok.AllArgsConstructor; import lombok.Data; @@ -31,81 +31,10 @@ /** *- * Link extractor using regular expressions to extract links found in text - * documents. Relative links are resolved to the document URL. - * For HTML documents, it is best advised to use the - * {@link HtmlLinkExtractor} or {@link DomLinkExtractor}, - * which addresses many cases specific to HTML. + * Configuration for {@link RegexLinkExtractor}. *
- * - *- * By default, this extractor will extract URLs only in documents having - * their content type matching this regular expression: - *
- *- * text/.* - *- *
- * You can specify your own restrictions using {@link #setRestrictions(List)}, - * but make sure they represent text files. - *
- * - *- * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *
- *This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *
- * - * {@nx.xml.usage - *- * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class RegexLinkExtractorConfig { @@ -127,19 +56,22 @@ public static class ExtractionPattern { /** * The maximum supported URL length. * Default is {@value #DEFAULT_MAX_URL_LENGTH}. - * @param maxUrlLength maximum URL length - * @return maximum URL length */ private int maxUrlLength = DEFAULT_MAX_URL_LENGTH; /** * Gets the character set of pages on which link extraction is performed. * Default isnull
(charset detection will be attempted).
- * @param charset character set to use, or null
- * @return character set to use, or null
*/
private Charset charset;
+ /**
+ * The matcher of content types to apply link extraction on. No attempt to
+ * extract links from any other content types will be made. Default
+ * matches all content types
+ */
+ private final TextMatcher contentTypeMatcher = CommonMatchers.all();
+
private final List* The configuration of content-types, storing the referrer data, and ignoring @@ -57,16 +59,8 @@ * pre-defined set of link attributes, when available (title, type, * uri, text, rel). *
- * - * {@nx.xml.usage - *- * Implementation of {@link LinkExtractor} using - * Apache Tika to perform URL - * extractions from HTML documents. - * This is an alternative to the {@link HtmlLinkExtractor}. + * Configuration for {@link TikaLinkExtractor}. *
- *- * The configuration of content-types, storing the referrer data, and ignoring - * "nofollow" and ignoring link data are the same as in - * {@link HtmlLinkExtractor}. For link data, this parser only keeps a - * pre-defined set of link attributes, when available (title, type, - * uri, text, rel). - *
- * - * {@nx.xml.usage - *true
to ignore.
- * @return true
to ignore.
* @since 3.0.0
*/
private boolean ignoreLinkData;
@@ -62,17 +41,15 @@ public class TikaLinkExtractorConfig {
* The matcher of content types to apply link extraction on. No attempt to
* extract links from any other content types will be made. Default is
* {@link CommonMatchers#HTML_CONTENT_TYPES}.
- * @param contentTypeMatcher content type matcher
- * @return content type matcher
*/
private final TextMatcher contentTypeMatcher =
CommonMatchers.htmlContentTypes();
+ private final PropertyMatchers restrictions = new PropertyMatchers();
+
/**
* Matcher of one or more fields to use as the source of content to
* extract links from, instead of the document content.
- * @param fieldMatcher field matcher
- * @return field matcher
*/
private final TextMatcher fieldMatcher = new TextMatcher();
@@ -85,11 +62,26 @@ public TikaLinkExtractorConfig setFieldMatcher(TextMatcher fieldMatcher) {
* The matcher of content types to apply link extraction on. No attempt to
* extract links from any other content types will be made. Default is
* {@link CommonMatchers#HTML_CONTENT_TYPES}.
- * @param contentTypeMatcher content type matcher
+ * @param matcher content type matcher
* @return this
*/
public TikaLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) {
contentTypeMatcher.copyFrom(matcher);
return this;
}
+
+ /**
+ * Clears all restrictions.
+ */
+ public void clearRestrictions() {
+ restrictions.clear();
+ }
+
+ /**
+ * Gets all restrictions
+ * @return the restrictions
+ */
+ public PropertyMatchers getRestrictions() {
+ return restrictions;
+ }
}
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
index 88bf838b1..c9270e521 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
@@ -18,6 +18,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -68,26 +69,8 @@
* {@link WebDocMetadata#REFERRER_REFERENCE}.- * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class XmlFeedLinkExtractor @@ -106,6 +89,12 @@ public Set extractLinks(CrawlDoc doc) throws IOException { return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + var refererUrl = doc.getReference(); Set links = new HashSet<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java index 794469625..da5047974 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java @@ -14,9 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.link.impl; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; -import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.importer.handler.CommonMatchers; import lombok.Data; @@ -24,53 +23,10 @@ /** *- * Link extractor for extracting links out of - * RSS and - * Atom XML feeds. - * It extracts the content of <link> tags. If you need more complex - * extraction, consider using {@link RegexLinkExtractor} or creating your own - * {@link LinkExtractor} implementation. + * Configuration for {@link XmlFeedLinkExtractor}. *
- * - *- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * - * {@nx.include com.norconex.importer.handler.CommonMatchers#xmlFeedContentTypes} - * - *- * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *
- *- * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class XmlFeedLinkExtractorConfig { @@ -78,8 +34,6 @@ public class XmlFeedLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.xmlFeedContentTypes(); @@ -87,11 +41,11 @@ public class XmlFeedLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + public XmlFeedLinkExtractorConfig setFieldMatcher( TextMatcher fieldMatcher) { this.fieldMatcher.copyFrom(fieldMatcher); @@ -102,7 +56,7 @@ public XmlFeedLinkExtractorConfig setFieldMatcher( * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public XmlFeedLinkExtractorConfig setContentTypeMatcher( @@ -110,4 +64,19 @@ public XmlFeedLinkExtractorConfig setContentTypeMatcher( contentTypeMatcher.copyFrom(matcher); return this; } + + /** + * Clears all restrictions. + */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java index 02676a1e7..52bb5eac0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java @@ -14,6 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.recrawl.impl; +import static java.util.Optional.ofNullable; + import java.time.Duration; import java.time.ZonedDateTime; import java.time.temporal.ChronoField; @@ -29,6 +31,7 @@ import com.norconex.crawler.web.doc.WebCrawlDocContext; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency; +import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport; import com.norconex.crawler.web.sitemap.SitemapChangeFrequency; @@ -51,67 +54,21 @@ ** By default, existing sitemap directives take precedence over custom ones. * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} + * sitemap directives using the + * {@link GenericRecrawlableResolverConfig#setSitemapSupport(SitemapSupport)} * method. *
* ** You can chose to have some of your crawled documents be re-crawled less - * frequently than others by specifying custom minimum frequencies - * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are - * processed in the order specified and must each have to following: - *
- *- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *
- * - * {@nx.xml.usage - *- * The above example ensures PDFs are re-crawled no more frequently than - * once a month, while HTML news can be re-crawled as fast at every half hour. - * For the rest, it relies on the website sitemap directives (if any). + * frequently than others by specifying custom minimum frequencies with + * ({@link GenericRecrawlableResolverConfig#setMinFrequencies(Collection)}). + * Minimum frequencies are processed in the order specified. *
* * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -157,15 +114,11 @@ public boolean isRecrawlable(WebCrawlDocContext prevData) { private MinFrequency getMatchingMinFrequency(WebCrawlDocContext prevData) { for (MinFrequency f : configuration.getMinFrequencies()) { - var applyTo = f.getApplyTo(); - if (StringUtils.isBlank(applyTo)) { - applyTo = "reference"; - } - if (("reference".equalsIgnoreCase(applyTo) - && f.getMatcher().matches(prevData.getReference()) - || ("contentType".equalsIgnoreCase(applyTo) - && f.getMatcher().matches( - prevData.getContentType().toString())))) { + var applyTo = ofNullable(f.getApplyTo()).orElse(ApplyTo.REFERENCE); + var matchMe = applyTo == ApplyTo.REFERENCE + ? prevData.getReference() + : prevData.getContentType().toString(); + if (f.getMatcher().matches(matchMe)) { return f; } } @@ -243,17 +196,16 @@ private boolean isRecrawlableFromSitemap(WebCrawlDocContext prevData) { lastModified, prevData.getReference()); if (lastModified.isAfter(lastCrawled)) { if (LOG.isDebugEnabled()) { - LOG.debug( - "Recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug(""" + Recrawlable according to sitemap directive \ + (last modified '{}' > last crawled '{}'): {}""", lastModified, lastCrawled, prevData.getReference()); } return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - "Not recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug("Not recrawlable according to sitemap directive " + + "(last modified '{}' > last crawled '{}'): {}", lastModified, lastCrawled, prevData.getReference()); } return false; @@ -275,8 +227,7 @@ private boolean isRecrawlableFromFrequency( } if (LOG.isDebugEnabled()) { - LOG.debug( - "The {} change frequency is {} for: {}", + LOG.debug("The {} change frequency is {} for: {}", context, cf, prevData.getReference()); } if (cf == SitemapChangeFrequency.ALWAYS) { @@ -324,16 +275,15 @@ private boolean isRecrawlableFromFrequency( return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - String.format(""" - Not recrawlable according to {} directive\s\ - (required elapsed time '{}'\s\ - >= actual elapsed time '{}' since '{}'): {}""", - context, - formatDuration(lastCrawlDate, minCrawlDate), - formatDuration(lastCrawlDate, now), - lastCrawlDate, - prevData.getReference())); + LOG.debug(String.format(""" + Not recrawlable according to {} directive\s\ + (required elapsed time '{}'\s\ + >= actual elapsed time '{}' since '{}'): {}""", + context, + formatDuration(lastCrawlDate, minCrawlDate), + formatDuration(lastCrawlDate, now), + lastCrawlDate, + prevData.getReference())); } return false; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java index 8a3b212b3..5a7a2d81f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java @@ -23,87 +23,17 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.commons.lang.time.DurationParser; import lombok.Data; import lombok.NoArgsConstructor; import lombok.experimental.Accessors; /** - *Relies on both sitemap 
directives and custom instructions for - * establishing the minimum frequency between each document recrawl. - *
- * - *- * Provided crawler support for sitemaps has not been disabled, - * this class tries to honor last modified and frequency directives found - * in sitemap files. - *
- *- * By default, existing sitemap directives take precedence over custom ones. - * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} - * method. - *
- * - *- * You can chose to have some of your crawled documents be re-crawled less - * frequently than others by specifying custom minimum frequencies - * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are - * processed in the order specified and must each have to following: - *
- *- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). + * Configuration for {@link GenericRecrawlableResolver}. *
- * - * {@nx.xml.usage - *- * The above example ensures PDFs are re-crawled no more frequently than - * once a month, while HTML news can be re-crawled as fast at every half hour. - * For the rest, it relies on the website sitemap directives (if any). - *
- * * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericRecrawlableResolverConfig { @@ -127,8 +57,6 @@ public static SitemapSupport getSitemapSupport(String sitemapSupport) { /** * The sitemap support strategy. Anull
value
* is equivalent to specifying the default {@link SitemapSupport#FIRST}.
- * @param sitemapSupport sitemap support strategy
- * @return sitemap support strategy
*/
private SitemapSupport sitemapSupport = SitemapSupport.FIRST;
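For illustration only (not part of this patch), a minimal sketch of how the resolver above might be wired. It assumes a no-arg constructor, a getConfiguration() accessor following the Configurable pattern used elsewhere in this diff, and the Lombok chained setters implied by the config class declaration:

    import com.norconex.crawler.web.doc.WebCrawlDocContext;
    import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolver;
    import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport;

    class RecrawlSketch {
        // Decides whether a previously crawled page is due for a re-crawl,
        // letting sitemap directives take precedence (the default strategy).
        static boolean dueForRecrawl(WebCrawlDocContext previouslyCrawled) {
            var resolver = new GenericRecrawlableResolver();
            resolver.getConfiguration().setSitemapSupport(SitemapSupport.FIRST);
            return resolver.isRecrawlable(previouslyCrawled);
        }
    }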
@@ -154,11 +82,30 @@ public void setMinFrequencies(CollectionBy default a crawler will try to follow all links it discovers. You can + *
+ * By default a crawler will try to follow all links it discovers. You can * define your own filters to limit the scope of the pages being crawled. * When you have multiple URLs defined as start URLs, it can be tricky to * perform global filtering that apply to each URLs without causing @@ -47,7 +48,6 @@ *
* @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @EqualsAndHashCode @ToString @Slf4j diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java index 746855946..bd2ec7ef2 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java @@ -18,24 +18,13 @@ import lombok.experimental.Accessors; /** - *By default a crawler will try to follow all links it discovers. You can - * define your own filters to limit the scope of the pages being crawled. - * When you have multiple URLs defined as start URLs, it can be tricky to - * perform global filtering that apply to each URLs without causing - * URL filtering conflicts. This class offers an easy way to address - * a frequent URL filtering need: to "stay on target". That is, - * when following a page and extracting URLs found in it, make sure to - * only keep URLs that are on the same site as the page URL we are on. - *
*- * By default this class does not request to stay on a site. + * Configuration for {@link GenericUrlScopeResolver}. *
* @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class GenericUrlScopeResolverConfig { /** @@ -43,16 +32,12 @@ public class GenericUrlScopeResolverConfig { * the domain for each URL specified as a start URL. By default (false) * the crawler will try follow any discovered links not otherwise rejected * by other settings (like regular filtering rules you may have). - * @param stayOnDomaintrue
for the crawler to stay on domain
- * @return true
if the crawler should stay on a domain
*/
private boolean stayOnDomain;
/**
* Whether sub-domains are considered to be the same as a URL domain.
* Only applicable when "stayOnDomain" is true
.
- * @param includeSubdomains true
to include sub-domains
- * @return true
if including sub-domains
* @since 2.9.0
*/
private boolean includeSubdomains;
@@ -62,8 +47,6 @@ public class GenericUrlScopeResolverConfig {
* the port for each URL specified as a start URL. By default (false)
* the crawler will try follow any discovered links not otherwise rejected
* by other settings (like regular filtering rules you may have).
- * @param stayOnPort true
for the crawler to stay on port
- * @return true
if the crawler should stay on a port
*/
private boolean stayOnPort;
@@ -72,9 +55,6 @@ public class GenericUrlScopeResolverConfig {
* the protocol for each URL specified as a start URL. By default (false)
* the crawler will try follow any discovered links not otherwise rejected
* by other settings (like regular filtering rules you may have).
- * @param stayOnProtocol
- * true
for the crawler to stay on protocol
- * @return true
if the crawler should stay on protocol
*/
private boolean stayOnProtocol = false;
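For illustration only (not part of this patch), a "stay on site" sketch using the Lombok-generated chained setters implied by the four fields above:

    import com.norconex.crawler.web.doc.operations.scope.impl.GenericUrlScopeResolverConfig;

    class StayOnSiteSketch {
        // Follow only links on the same domain (sub-domains included) as the
        // start URL, while still allowing any port or protocol.
        static GenericUrlScopeResolverConfig stayOnDomain() {
            return new GenericUrlScopeResolverConfig()
                    .setStayOnDomain(true)
                    .setIncludeSubdomains(true)
                    .setStayOnPort(false)
                    .setStayOnProtocol(false);
        }
    }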
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
index bba4f4884..5e650b019 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
@@ -98,51 +98,6 @@
* In addition, this class allows you to specify any number of URL
* value replacements using regular expressions.
*
- *
- * {@nx.xml.usage
- * - * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *
- * - * {@nx.xml.example - *- * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". - *
*/ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java index a990bc186..a5a1656e7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java @@ -24,8 +24,6 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.convert.GenericConverter; import com.norconex.commons.lang.url.UrlNormalizer; -import com.norconex.crawler.web.WebCrawlerConfig; -import com.norconex.crawler.web.doc.operations.url.WebUrlNormalizer; import lombok.Data; import lombok.Getter; @@ -33,117 +31,7 @@ /** *- * Generic implementation of {@link WebUrlNormalizer} that should satisfy - * most URL normalization needs. This implementation relies on - * {@link UrlNormalizer}. Please refer to it for complete documentation and - * examples. - *
- *
- * This class is in effect by default. To skip its usage, you
- * can explicitly set the URL Normalizer to null
in the
- * {@link WebCrawlerConfig}.
- *
- * By default, this class removes the URL fragment and applies these - * RFC 3986 - * normalizations: - *
- *- * To overwrite this default, you have to specify a new list of normalizations - * to apply, via the {@link #setNormalizations(List)} method, - * or via XML configuration. Each - * normalizations is identified by a code name. The following is the - * complete code name list for supported normalizations. Click on any code - * name to get a full description from {@link WebUrlNormalizer}: - *
- *- * In addition, this class allows you to specify any number of URL - * value replacements using regular expressions. - *
- * - * {@nx.xml.usage - *- * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *
- * - * {@nx.xml.example - *- * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". + * Configuration for {@link GenericUrlNormalizer}. *
*/ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java index 5fdb6d37d..591a9c438 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java @@ -100,31 +100,8 @@ * using a custom link extractor. * * - * {@nx.xml.usage - *- * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *
- * * @since 2.2.0 */ - @EqualsAndHashCode @ToString @Slf4j @@ -223,8 +200,8 @@ private void resolveStatusCodeRange( var end = toInt(endPoints[1]); if (start >= end) { throw new IllegalArgumentException( - "Invalid statusCode range: " + range - + ". Start value must be higher than end value."); + "Invalid statusCode range: %s. Start value must be " + + "higher than end value.".formatted(range)); } while (start <= end) { parsedCodes.add(start); @@ -274,9 +251,8 @@ private int toInt(String num) { return Integer.parseInt(num.trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( - "The statusCodes attribute " - + "can only contain valid numbers. This number is invalid: " - + num); + "The statusCodes attribute can only contain valid numbers. " + + "This number is invalid: " + num); } } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java index af5c0eaee..9b573f7d7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java @@ -15,106 +15,18 @@ package com.norconex.crawler.web.event.listeners; import java.nio.file.Path; -import java.util.List; - -import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractor; -import com.norconex.crawler.web.doc.operations.link.impl.TikaLinkExtractor; import lombok.Data; import lombok.experimental.Accessors; /** *- * Store on file all URLs that were "fetched", along with their HTTP response - * code. Useful for reporting purposes (e.g. finding broken links). A short - * summary of all HTTP status codes can be found - * here. - *
- * - *- * By default, the status of all fetched URLs are stored by this listener, - * regardless what were those statuses. This can generate very lengthy reports - * on large crawls. If you are only interested in certain status codes, you can - * listen only for those using the {@link #setStatusCodes(String)} method - * or XML configuration equivalent. You specify the codes you want to listen - * for as coma-separated values. Ranges are also supported: specify two range - * values (both inclusive) separated by an hyphen. For instance, if you want - * to store all "bad" URLs, you can quickly specify all codes except - * 200 (OK) this way: - *
- *100-199,201-599- * - *
- * By default one generated report is created for each crawler, stored
- * in crawler-specific directories under the collector working directory.
- * The collector working directory can be overwritten using
- * {@link #setOutputDir(Path)}.
- * If {@link #isCombined()} is true
, status from all crawlers
- * defined will be written to a unique file in the collector working directory.
- *
- * By default, the file generated will use this naming pattern: - *
- *- * urlstatuses-[timestamp].csv - *- *
- * The filename prefix can be changed from "urlstatuses-" to anything else - * using {@link #setFileNamePrefix(String)}. + * Configuration for {@link UrlStatusCrawlerEventListener}. *
- * - *- * By default all crawlers will have their URL fetch statuses recorded when - * using this event listener. To only do so for some crawlers, you can - * use {@link #setCrawlerIds(List)} to identify them. - *
- * - *- * To capture the referring pages you have to use a link extractor that - * extracts referrer information. The default link extractor - * {@link HtmlLinkExtractor} properly extracts this information. Same with - * {@link TikaLinkExtractor}. This is only a consideration when - * using a custom link extractor. - *
- * - * {@nx.xml.usage - *- * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *
- * * @since 2.2.0 */ @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class UrlStatusCrawlerEventListenerConfig { public static final String DEFAULT_FILENAME_PREFIX = "urlstatuses-"; @@ -123,32 +35,24 @@ public class UrlStatusCrawlerEventListenerConfig { * The coma-separated list of status codes to listen to. * Default isnull
(listens for all status codes).
* See class documentation for how to specify code ranges.
- * @param statusCode HTTP status codes
- * @return status codes
*/
private String statusCodes;
/**
* The local directory where this listener report will be written.
* Default uses the collector working directory.
- * @param outputDir directory path
- * @return directory path
*/
private Path outputDir;
/**
* The generated report file name prefix. See class documentation
* for default prefix.
- * @param fileNamePrefix file name prefix
- * @return file name prefix
*/
private String fileNamePrefix = DEFAULT_FILENAME_PREFIX;
/**
* Whether to add a timestamp to the file name, to ensure
* a new one is created with each run.
- * @param timestamped true
if timestamped
- * @return true
if timestamped
*/
private boolean timestamped;
}
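For illustration only (not part of this patch), a broken-link report sketch based on the fields above. Accessor names are assumed from the Lombok declarations; the "404" value and the range syntax (e.g., "100-199,201-599") come from the class documentation, while the file name prefix is arbitrary:

    import java.nio.file.Path;

    import com.norconex.crawler.web.event.listeners.UrlStatusCrawlerEventListenerConfig;

    class BrokenLinkReportSketch {
        // Record only 404 responses, writing a timestamped CSV report
        // under the given directory.
        static UrlStatusCrawlerEventListenerConfig brokenLinks(Path reportDir) {
            return new UrlStatusCrawlerEventListenerConfig()
                    .setStatusCodes("404")
                    .setOutputDir(reportDir)
                    .setFileNamePrefix("brokenlinks-")
                    .setTimestamped(true);
        }
    }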
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
index 4e68c96cc..6ce57d161 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
@@ -22,63 +22,5 @@
*/
public interface HttpFetcher
extends Fetcher- // * Performs an HTTP request for the supplied document reference - // * and HTTP method. - // *
- // *- // * For each HTTP method supported, implementors should - // * do their best to populate the document and its {@link CrawlDocRecord} - // * with as much information they can. - // *
- // *
- // * Unsupported HTTP methods should return an HTTP response with the
- // * {@link CrawlDocState#UNSUPPORTED} state. To prevent users having to
- // * configure multiple HTTP clients, implementors should try to support
- // * both the GET
and HEAD
methods.
- // * POST is only used in special cases and is often not used during a
- // * crawl session.
- // *
- // * A null
method is treated as a GET
.
- // *
- * XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *
- * ** Upon first encountering a secure site, this fetcher will check whether the @@ -163,10 +156,9 @@ *
** If you want to convert non-secure URLs secure ones regardless of website - * HSTS support, use - * {@link GenericUrlNormalizer.Normalization#secureScheme} instead. + * HSTS support, use {@link Normalization#SECURE_SCHEME} instead. * To disable HSTS support, use - * {@link GenericHttpFetcherConfig#setDisableHSTS(boolean)}. + * {@link GenericHttpFetcherConfig#setHstsDisabled(boolean)}. *
* ** These settings have no effect for web servers not supporting them. *
* - * {@nx.xml.usage - *- * The above example will authenticate the crawler to a web site before - * crawling. The website uses an HTML form with a username and password - * fields called "loginUser" and "loginPwd". - *
- * * @since 3.0.0 (Merged from GenericDocumentFetcher and * GenericHttpClientFactory) */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode(onlyExplicitlyIncluded = true) @ToString(onlyExplicitlyIncluded = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java index d3e910930..abd670f46 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java @@ -38,10 +38,7 @@ /** * Generic HTTP Fetcher configuration. - * @since 3.0.0 (adapted from GenericHttpClientFactory and - * GenericDocumentFetcher from version 2.x) */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericHttpFetcherConfig extends BaseFetcherConfig { @@ -61,23 +58,26 @@ public enum CookieSpec { RELAXED, STRICT, IGNORE } + /** + * HTTP status codes considered "valid". Defaults to 200. + */ private final Listnull
+ * Optional prefix prepended to captured HTTP response fields. A
+ * null
value (default) won't add any prefix.
*/
private String headersPrefix;
/**
* Whether content type is detected instead of relying on
* returned Content-Type
HTTP response header.
- * @param forceContentTypeDetection true
to enable detection
- * @return true
to enable detection
*/
private boolean forceContentTypeDetection;
@@ -85,33 +85,28 @@ public enum CookieSpec {
* Whether character encoding is detected instead of relying on
* the charset sometimes found in the Content-Type
HTTP
* response header.
- * @param forceCharsetDetection true
to enable detection
- * @return true
to enable detection
*/
private boolean forceCharsetDetection;
/**
* Authentication configuration for sites requiring it. Default
* is null
.
- * @param authentication authentication configuration
- * @return authentication configuration
*/
private HttpAuthConfig authentication;
/**
* Cookie specification to use when fetching documents. Default is relaxed.
- * @param cookieSpec cookie specification name
- * @return the cookieSpec cookie specification name
*/
private CookieSpec cookieSpec = CookieSpec.RELAXED;
+ /**
+ * An optional HTTP proxy.
+ */
private final ProxySettings proxySettings = new ProxySettings();
/**
* The connection timeout for a connection to be established.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param connectionTimeout connection timeout
- * @return connection timeout
*/
private Duration connectionTimeout = DEFAULT_TIMEOUT;
@@ -119,32 +114,24 @@ public enum CookieSpec {
* Gets the maximum period of inactivity between two consecutive data
* packets.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param socketTimeout socket timeout
- * @return socket timeout
*/
private Duration socketTimeout = DEFAULT_TIMEOUT;
/**
* Gets the timeout when requesting a connection.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param connectionRequestTimeout connection request timeout
- * @return connection request timeout
*/
private Duration connectionRequestTimeout = DEFAULT_TIMEOUT;
/**
* The local address, which may be useful when working with multiple
* network interfaces.
- * @param localAddress locale address
- * @return local address
*/
private String localAddress;
/**
* Whether 'Expect: 100-continue' handshake is enabled.
* See {@link RequestConfig#isExpectContinueEnabled()}
- * @param expectContinueEnabled true
if enabled
- * @return true
if enabled
*/
private boolean expectContinueEnabled;
@@ -152,8 +139,6 @@ public enum CookieSpec {
* The maximum number of redirects to be followed. This can help
* prevent infinite loops. A value of zero effectively disables
* redirects. Default is {@link #DEFAULT_MAX_REDIRECT}.
- * @param maxRedirects maximum number of redirects to be followed
- * @return maximum number of redirects to be followed
*/
private int maxRedirects = DEFAULT_MAX_REDIRECT;
@@ -161,16 +146,12 @@ public enum CookieSpec {
* The maximum number of connections that can be created. Typically,
* you would have at least the same amount as threads.
* Default is {@link #DEFAULT_MAX_CONNECTIONS}.
- * @param maxConnections maximum number of connections
- * @return number of connections
*/
private int maxConnections = DEFAULT_MAX_CONNECTIONS;
/**
* The maximum number of connections that can be used per route.
* Default is {@link #DEFAULT_MAX_CONNECTIONS_PER_ROUTE}.
- * @param maxConnectionsPerRoute maximum number of connections per route
- * @return number of connections per route
*/
private int maxConnectionsPerRoute = DEFAULT_MAX_CONNECTIONS_PER_ROUTE;
@@ -178,9 +159,6 @@ public enum CookieSpec {
* Sets the period of time after which to evict idle
* connections from the connection pool.
* Default is {@link #DEFAULT_MAX_IDLE_TIME}.
- * @param maxConnectionIdleTime amount of time after which to evict idle
- * connections
- * @return amount of time after which to evict idle connections
*/
private Duration maxConnectionIdleTime = DEFAULT_MAX_IDLE_TIME;
@@ -188,11 +166,12 @@ public enum CookieSpec {
* Sets the period of time a connection must be inactive
* to be checked in case it became stalled. Default is 0 (not pro-actively
* checked).
- * @param maxConnectionInactiveTime period of time in milliseconds
- * @return period of time in milliseconds
*/
private Duration maxConnectionInactiveTime;
+ /**
+ * Headers to send with every HTTP request.
+ */
private final Maptrue
if disabled
- * @return true
if disabled
*/
private boolean ifModifiedSinceDisabled;
@@ -211,28 +188,25 @@ public enum CookieSpec {
* Servers supporting this header will only return the requested document
* if the ETag value has changed, indicating a more recent version is
* available.
- * @param eTagDisabled true
if disabled
- * @return true
if disabled
*/
private boolean eTagDisabled;
/**
* The user-agent used when identifying the crawler to targeted web sites.
* It is highly recommended to always identify yourself.
- * @param userAgent user agent
- * @return user agent
*/
private String userAgent;
/**
* The redirect URL provider.
* Defaults to {@link GenericRedirectUrlProvider}.
- * @param redirectUrlProvider redirect URL provider
- * @return the redirect URL provider
*/
private RedirectUrlProvider redirectUrlProvider =
new GenericRedirectUrlProvider();
+ /**
+ * List of supported HTTP methods.
+ */
private final Listtrue
if trusting all SSL
- * certificates
- * @return true
if trusting all SSL certificates
*/
private boolean trustAllSSLCertificates;
/**
* Sets whether Server Name Indication (SNI) is disabled.
- * @param sniDisabled true
if disabled
- * @return true
if disabled
*/
private boolean sniDisabled;
+ /**
+ * Supported security protocols.
+ */
private final ListStrict-Transport-Security
policy
* (obtained from HTTP response header).
- * @param hstsDisabled true
if disabled
- * @return true
if disabled
*/
private boolean hstsDisabled;
@@ -280,6 +250,7 @@ public List* Generic HTTP Fetcher authentication configuration. *
- * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "authentication"). - *
* @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) @FieldNameConstants public class HttpAuthConfig { /** - *- * The authentication method. Valid values are (case insensitive): - *
- *null
(default value) indicates "any host" for the
* scope.
* Used for BASIC and DIGEST authentication.
- * @param host host for the scope
- * @return host for the scope
*/
private Host host;
@@ -151,51 +85,40 @@ public class HttpAuthConfig {
* The realm name for the current authentication scope.
* null
(default) indicates "any realm" for the scope.
* Used for BASIC and DIGEST authentication.
- * @param realm reaml name for the scope
- * @return realm name for the scope
*/
private String realm;
- //form
/**
* The authentication form character set for the form field values.
* Default is UTF-8.
- * @param formCharset authentication form character set
- * @return authentication form character set
*/
private Charset formCharset = StandardCharsets.UTF_8;
/**
- * The CSS selelector that identifies the form in a login page.
+ * The CSS selector that identifies the form in a login page.
* When set, requires {@link #getUrl()} to be pointing to a login
* page containing a login form.
- * @param formSelector form selector
- * @return form selector
*/
private String formSelector;
+ /**
+ * Additional form parameters possibly expected by the login form.
+ */
private final Maptrue
to perform preemptive authentication
- * @return true
to perform preemptive authentication
*/
private boolean preemptive;
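For illustration only (not part of this patch), a form-authentication sketch limited to the fields visible above. The import location, the CSS selector, and the extra form parameter are hypothetical; setters are assumed Lombok-generated and chained:

    import java.nio.charset.StandardCharsets;

    import com.norconex.crawler.web.fetch.impl.HttpAuthConfig;

    class FormAuthSketch {
        // Point the authenticator at the login form and add one extra
        // form parameter some sites expect.
        static HttpAuthConfig loginForm() {
            var auth = new HttpAuthConfig()
                    .setFormSelector("form#login") // hypothetical selector
                    .setFormCharset(StandardCharsets.UTF_8);
            auth.getFormParams().put("rememberMe", "true");
            return auth;
        }
    }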
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
index 3a4e75965..8008b3185 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
@@ -45,9 +45,8 @@
*
* * EXPERIMENTAL: The use of this class is experimental. - * It is known to not be supported properly - * with some web drivers and/or browsers. It can even be ignored altogether - * by some web drivers. + * It is known to not be supported properly with some web drivers and/or + * browsers. It can even be ignored altogether by some web drivers. *
* * @since 3.0.0 @@ -129,17 +128,12 @@ void start(MutableCapabilities options) { new ResponseFilterAdapter.FilterSource( (response, contents, messageInfo) -> { // sniff only if original URL is being tracked - var trackedResponse = - trackedUrlResponses - .get(messageInfo.getOriginalUrl()); - + var trackedResponse = trackedUrlResponses + .get(messageInfo.getOriginalUrl()); if (trackedResponse != null) { - response.headers() - .forEach( - en -> trackedResponse.headers - .put( - en.getKey(), - en.getValue())); + response.headers().forEach( + en -> trackedResponse.headers.put( + en.getKey(), en.getValue())); trackedResponse.statusCode = response.status().code(); trackedResponse.reasonPhrase = diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java index 3975b622d..5f1716ff0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java @@ -27,37 +27,8 @@ ** Configuration for {@link HttpSniffer}. *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "httpSniffer"). - *
- * - * @author Pascal Essiembre * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HttpSnifferConfig { @@ -65,24 +36,48 @@ public class HttpSnifferConfig { public static final int DEFAULT_MAX_BUFFER_SIZE = DataUnit.MB.toBytes(10).intValue(); + /** + * The host name passed to the browser pointing to the sniffer proxy. + * Defaults to 0 (random free port). + */ private int port; /** * The host name passed to the browser pointing to the sniffer proxy. * Defaults to "localhost". - * @param host host name - * @return host name - * @since 3.1.0 */ private String host; + /** + * Optionally overwrite browser user agent. + */ private String userAgent; private final Map- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). - *
- * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @ToString @EqualsAndHashCode @Slf4j @@ -92,9 +78,8 @@ public void takeScreenshot(WebDriver driver, Doc doc) { imageHandler.setConfiguration(configuration); try (InputStream in = streamFactory.newInputStream( - new ByteArrayInputStream( - ((TakesScreenshot) driver) - .getScreenshotAs(OutputType.BYTES)))) { + new ByteArrayInputStream(((TakesScreenshot) driver) + .getScreenshotAs(OutputType.BYTES)))) { // If wanting a specific web element: if (StringUtils.isNotBlank(configuration.getCssSelector())) { @@ -107,24 +92,19 @@ public void takeScreenshot(WebDriver driver, Doc doc) { location.x, location.y, size.width, size.height); var img = new MutableImage(in); img.crop(rectangle); - imageHandler.handleImage( - img.toInputStream( - ofNullable( - getConfiguration() - .getImageFormat()) - .orElse("png")), + imageHandler.handleImage(img.toInputStream( + ofNullable(getConfiguration().getImageFormat()) + .orElse("png")), doc); } else { imageHandler.handleImage(in, doc); } } catch (Exception e) { if (LOG.isDebugEnabled()) { - LOG.error( - "Could not take screenshot of: {}", + LOG.error("Could not take screenshot of: {}", doc.getReference(), e); } else { - LOG.error( - "Could not take screenshot of: {}. Error:\n{}", + LOG.error("Could not take screenshot of: {}. Error:\n{}", doc.getReference(), ExceptionUtil.getFormattedMessages(e)); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java index a8632b22f..2302ced57 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java @@ -17,8 +17,6 @@ import java.nio.file.Path; import java.nio.file.Paths; -import org.openqa.selenium.WebDriver; - import com.norconex.crawler.core.doc.CrawlDocMetadata; import com.norconex.crawler.web.fetch.util.DocImageHandlerConfig; @@ -27,27 +25,10 @@ /** *- * Takes screenshot of pages using a Selenium {@link WebDriver}. - * Either the entire page, or a specific DOM element. - * Screenshot images can be stored in a document metadata/field or - * in a local directory. - *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). + * Configuration for {@link ScreenshotHandler}. *
- * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class ScreenshotHandlerConfig extends DocImageHandlerConfig { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java index 1507eff02..804841eee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java @@ -14,7 +14,6 @@ */ package com.norconex.crawler.web.fetch.impl.webdriver; -import static java.time.Duration.ofMillis; import static java.util.Optional.ofNullable; import java.io.InputStream; @@ -92,93 +91,6 @@ * Browsers/WebDriver implementations. * * - * {@nx.xml.usage - *The above example will use Firefox to crawl dynamically generated - * pages using a specific web driver. - *
- * * @since 3.0.0 */ @SuppressWarnings("javadoc") @@ -334,9 +246,8 @@ public HttpFetchResponse fetch(HttpFetchRequest req) .builder() .crawlDocState(CrawlDocState.NEW) .statusCode(200) - .reasonPhrase( - "No exception thrown, but real status code " - + "unknown. Capture headers for real status code.") + .reasonPhrase("No exception thrown, but real status code " + + "unknown. Capture headers for real status code.") .userAgent(getUserAgent()) .build(); } @@ -387,39 +298,30 @@ protected InputStream fetchDocumentContent(String url) { } var timeouts = driver.manage().timeouts(); - if (configuration.getPageLoadTimeout() != 0) { - timeouts.pageLoadTimeout( - ofMillis(configuration.getPageLoadTimeout())); + if (configuration.getPageLoadTimeout() != null) { + timeouts.pageLoadTimeout(configuration.getPageLoadTimeout()); } - if (configuration.getImplicitlyWait() != 0) { - timeouts.implicitlyWait( - ofMillis(configuration.getImplicitlyWait())); + if (configuration.getImplicitlyWait() != null) { + timeouts.implicitlyWait(configuration.getImplicitlyWait()); } - if (configuration.getScriptTimeout() != 0) { - timeouts.scriptTimeout( - ofMillis(configuration.getScriptTimeout())); + if (configuration.getScriptTimeout() != null) { + timeouts.scriptTimeout(configuration.getScriptTimeout()); } - if (configuration.getWaitForElementTimeout() != 0 + if (configuration.getWaitForElementTimeout() != null && StringUtils.isNotBlank( configuration.getWaitForElementSelector())) { var elType = ObjectUtils.defaultIfNull( configuration.getWaitForElementType(), WaitElementType.TAGNAME); - LOG.debug( - "Waiting for element '{}' of type '{}' for '{}'.", + LOG.debug("Waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); var wait = new WebDriverWait( - driver, ofMillis(configuration.getWaitForElementTimeout())); - wait.until( - ExpectedConditions.presenceOfElementLocated( - elType.getBy( - configuration - .getWaitForElementSelector()))); - - LOG.debug( - "Done waiting for element '{}' of type '{}' for '{}'.", + driver, configuration.getWaitForElementTimeout()); + wait.until(ExpectedConditions.presenceOfElementLocated( + elType.getBy(configuration.getWaitForElementSelector()))); + LOG.debug("Done waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); } @@ -428,8 +330,8 @@ protected InputStream fetchDocumentContent(String url) { configuration.getLatePageScript()); } - if (configuration.getThreadWait() != 0) { - Sleeper.sleepMillis(configuration.getThreadWait()); + if (configuration.getThreadWait() != null) { + Sleeper.sleepMillis(configuration.getThreadWait().toMillis()); } var pageSource = driver.getPageSource(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java index d7a027955..47d9cf6ee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java @@ -17,6 +17,7 @@ import java.awt.Dimension; import java.net.URL; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -64,11 +65,32 @@ By getBy(String selector) { } } + /** + * The browser used for crawling. Also defines which WebDriver to use. 
+ * Default is Firefox. + */ private Browser browser = Browser.FIREFOX; - // Default will try to detect driver installation on OS + /** + * Local path to driver executable ornull
to attempt
+ * automatic detection of the driver path.
+ * See the web driver vendor documentation for the expected driver
+ * location that enables automatic detection.
+ * Use {@link #setRemoteURL(URL)} instead when using
+ * a remote web driver cluster.
+ */
private Path driverPath;
- // Default will try to detect browser installation on OS
+ /**
+ * Local path to browser executable or null
to attempt
+ * automatic browser path detection. See the browser vendor documentation
+ * for the expected browser installation location.
+ * Use {@link #setRemoteURL(URL)} instead when using
+ * a remote web driver cluster.
+ */
private Path browserPath;
+ /**
+ * URL of a remote WebDriver cluster. Alternative to using a local
+ * browser and local web driver.
+ */
private URL remoteURL;
/**
@@ -81,31 +103,95 @@ By getBy(String selector) {
*/
private boolean useHtmlUnit;
+ /**
+ * Optionally set up an HTTP proxy that lets you set and capture HTTP
+ * headers. For advanced use only.
+ */
private HttpSniffer httpSniffer;
+
+ /**
+ * When configured, takes a screenshot of each web page.
+ */
private ScreenshotHandler screenshotHandler;
+ /**
+ * Optional capabilities (configuration options) for the web driver.
+ * Many are specific to each browser or web driver. Refer to vendor
+ * documentation.
+ */
private final Map- * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. - *
* @since 3.0.0 */ @Slf4j @@ -70,42 +53,8 @@ public class DocImageHandler implements Configurable- * Handles images associated with a document (which is different than a document - * being itself an image). Examples can be screenshots, featured image, etc. - * Images can be stored in a document metadata/field or in a local directory. - *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. + * Configuration for {@link DocImageHandler}. *
* @since 3.0.0 */ @@ -54,11 +35,34 @@ public class DocImageHandlerConfig { public enum Target { - METADATA, DIRECTORY + /** + * Store image in metadata field. + */ + METADATA, + /** + * Store image on local directory. + */ + DIRECTORY } + /** + * Directory structure when storing images on disk. + */ public enum DirStructure { - URL2PATH, DATE, DATETIME + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g.,2000/12/31/
).
+ */
+ DATE,
+ /**
+ * Create directories for each date and time, up to seconds
+ * (e.g., 2000/12/31/13/34/12/
).
+ */
+ DATETIME
}
public static final String DEFAULT_IMAGE_FORMAT = "png";
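For illustration only (not part of this patch), a small sketch of the directory layouts the DATE and DATETIME constants above describe; the helper class and method names are hypothetical:

    import java.time.LocalDateTime;
    import java.time.format.DateTimeFormatter;

    class DirStructureSketch {
        // DATE -> e.g. 2000/12/31/   DATETIME -> e.g. 2000/12/31/13/34/12/
        static String dateDir(LocalDateTime t) {
            return t.format(DateTimeFormatter.ofPattern("yyyy/MM/dd")) + "/";
        }
        static String dateTimeDir(LocalDateTime t) {
            return t.format(DateTimeFormatter.ofPattern("yyyy/MM/dd/HH/mm/ss")) + "/";
        }
    }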
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
index e6f30ff5f..15dd31f04 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
@@ -14,11 +14,15 @@
*/
package com.norconex.crawler.web.fetch.util;
-import java.io.UnsupportedEncodingException;
+import static com.norconex.crawler.web.fetch.util.GenericRedirectUrlProviderConfig.DEFAULT_FALLBACK_CHARSET;
+import static java.util.Optional.ofNullable;
+import static org.apache.commons.lang3.StringUtils.substringAfterLast;
+import static org.apache.commons.lang3.StringUtils.trimToNull;
+
import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
+import java.nio.charset.Charset;
-import org.apache.commons.lang3.StringUtils;
+import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
@@ -26,11 +30,11 @@
import org.apache.hc.core5.http.protocol.HttpCoreContext;
import org.apache.tika.utils.CharsetUtils;
+import com.norconex.commons.lang.config.Configurable;
import com.norconex.commons.lang.url.HttpURL;
-import com.norconex.commons.lang.xml.Xml;
-import com.norconex.commons.lang.xml.XmlConfigurable;
import lombok.Data;
+import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
/**
@@ -80,34 +84,18 @@
*
*
*
- * {@nx.xml.usage
- * - *- * } - * - * The above example sets the default character encoding to be "ISO-8859-1" - * when it could not be detected. - *
- * * @since 2.4.0 */ @Slf4j @Data -public class GenericRedirectUrlProvider - implements RedirectUrlProvider, XmlConfigurable { - - public static final String DEFAULT_FALLBACK_CHARSET = - StandardCharsets.UTF_8.toString(); +public class GenericRedirectUrlProvider implements + RedirectUrlProvider, Configurable{ private static final int ASCII_MAX_CODEPOINT = 128; - private String fallbackCharset = DEFAULT_FALLBACK_CHARSET; + @Getter + private final GenericRedirectUrlProviderConfig configuration = + new GenericRedirectUrlProviderConfig(); @Override public String provideRedirectURL( @@ -127,30 +115,15 @@ public String provideRedirectURL( var hl = response.getLastHeader(HttpHeaders.LOCATION); if (hl == null) { //TODO should throw exception instead? - LOG.error( - "Redirect detected to a null Location for: {}", + LOG.error("Redirect detected to a null Location for: {}", originalURL); return null; } var redirectLocation = hl.getValue(); - //--- Charset --- - String charset = null; - var hc = response.getLastHeader("Content-Type"); - if (hc != null) { - var contentType = hc.getValue(); - if (contentType.contains(";")) { - charset = StringUtils.substringAfterLast( - contentType, "charset="); - } - } - if (StringUtils.isBlank(charset)) { - charset = fallbackCharset; - } - //--- Build/fix redirect URL --- var targetURL = HttpURL.toAbsolute(originalURL, redirectLocation); - targetURL = resolveRedirectURL(targetURL, charset); + targetURL = resolveRedirectURL(response, targetURL); if (LOG.isDebugEnabled()) { LOG.debug("URL redirect: {} -> {}", originalURL, targetURL); @@ -158,16 +131,17 @@ public String provideRedirectURL( return targetURL; } - //TODO is there value in moving this method to somewhere re-usable? + //MAYBE: is there value in moving this method to somewhere re-usable? private String resolveRedirectURL( - final String redirectURL, final String nonAsciiCharset) { + HttpResponse response, String redirectURL) { var url = redirectURL; // Is string containing only ASCII as it should? var isAscii = true; final var length = url.length(); - for (var offset = 0; offset < length;) { + var offset = 0; + while (offset < length) { final var codepoint = url.codePointAt(offset); if (codepoint > ASCII_MAX_CODEPOINT) { isAscii = false; @@ -184,30 +158,29 @@ private String resolveRedirectURL( Will try to fix. Redirect URL: {}""", redirectURL); // try to fix if non ascii charset is non UTF8. - if (StringUtils.isNotBlank(nonAsciiCharset)) { - var charset = CharsetUtils.clean(nonAsciiCharset); - if (!StandardCharsets.UTF_8.toString().equals(charset)) { - try { - return new String(url.getBytes(charset)); - } catch (UnsupportedEncodingException e) { - LOG.warn( - "Could not fix badly encoded URL with charset " - + "\"{}\". 
Redirect URL: {}", - charset, redirectURL, e); - } - } - } - - return new String(url.getBytes(StandardCharsets.UTF_8)); + return new String(url.getBytes(resolveCharset(response, redirectURL))); } - @Override - public void loadFromXML(Xml xml) { - setFallbackCharset(xml.getString("@fallbackCharset", fallbackCharset)); - } - - @Override - public void saveToXML(Xml xml) { - xml.setAttribute("fallbackCharset", fallbackCharset); + // Detect charset from response header or use fallback + private Charset resolveCharset(HttpResponse response, String redirectUrl) { + return ofNullable(response.getLastHeader("Content-Type")) + .map(Header::getValue) + .filter(ct -> ct.contains(";")) + .map(ct -> trimToNull(substringAfterLast(ct, "charset="))) + .map(chset -> { + try { + return CharsetUtils.forName(chset); + } catch (RuntimeException e) { + var charset = + ofNullable(configuration.getFallbackCharset()) + .orElse(DEFAULT_FALLBACK_CHARSET); + LOG.warn(""" + Could not fix badly encoded URL with charset \ + "{}". Redirect URL: "{}". Will try with \ + fallback charset: {}""", + charset, redirectUrl, charset); + return charset; + } + }).get(); } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java new file mode 100644 index 000000000..17bc2e59b --- /dev/null +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java @@ -0,0 +1,34 @@ +/* Copyright 2015-2024 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.norconex.crawler.web.fetch.util; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import lombok.Data; +import lombok.experimental.Accessors; + +/** + * Configuration for {@link GenericRedirectUrlProvider}. + */ +@Data +@Accessors(chain = true) +public class GenericRedirectUrlProviderConfig { + + public static final Charset DEFAULT_FALLBACK_CHARSET = + StandardCharsets.UTF_8; + + private Charset fallbackCharset = DEFAULT_FALLBACK_CHARSET; +} diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java index ef0a7e002..9560ec923 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java @@ -51,20 +51,6 @@ * If robots instructions are provided in both the HTML page and * HTTP header, the ones in HTML page will take precedence, and the * ones in HTTP header will be ignored.
- * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(string prefixing headers) - *- * } - * - * The above example ignores robot meta information. - *
*/ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java index 9ee56acf0..4bf348029 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java @@ -14,43 +14,11 @@ */ package com.norconex.crawler.web.robot.impl; -import com.norconex.crawler.web.robot.RobotsMetaProvider; - import lombok.Data; import lombok.experimental.Accessors; /** - *Implementation of {@link RobotsMetaProvider} as per X-Robots-Tag - * and ROBOTS standards. - * Extracts robots information from "ROBOTS" meta tag in an HTML page - * or "X-Robots-Tag" tag in the HTTP header (see - * - * https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag - * and - * - * http://www.robotstxt.org/meta.html). - *
- * - *If you specified a prefix for the HTTP headers, make sure to specify it - * again here or the robots meta tags will not be found.
- * - *If robots instructions are provided in both the HTML page and - * HTTP header, the ones in HTML page will take precedence, and the - * ones in HTTP header will be ignored.
- * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(string prefixing headers) - *- * } - * - * The above example ignores robot meta information. - *
+ * Configuration for {@link StandardRobotsMetaProvider}. */ @Data @Accessors(chain = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java index e104ecf72..4994e5726 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java @@ -60,18 +60,6 @@ * described at * http://www.robotstxt.org/robotstxt.html. * - * {@nx.xml.usage - *- * } - * - * {@nx.xml.example - * - *- * } - * - * The above example ignores "robots.txt" files present on web sites. - *
*/ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java index e992ffeb1..2b3f19694 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java @@ -41,23 +41,6 @@ * Default paths are:/sitemap.xml
and */sitemap_index.xml
* - * - * {@nx.xml.usage - *- * - * } */ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java index ea9ff1920..7eabace5b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java @@ -19,40 +19,13 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.web.robot.RobotsTxtProvider; import lombok.Data; import lombok.experimental.Accessors; import lombok.experimental.FieldNameConstants; /** - *- * - * - *- * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - *- * If there is a sitemap defined as a start reference for the same URL web site, - * this locator is not used. Otherwise, it tells the crawler to - * use the sitemap as defined in the web site "robots.txt" file (provided - * the web site defines one and {@link RobotsTxtProvider} is enabled). - * If no sitemap resolution was possible from "robots.txt", an attempt will - * be made to retrieve a sitemap using the configured sitemap paths. - * Default paths are:
- * - * {@nx.xml.usage - */sitemap.xml
and - */sitemap_index.xml
- *- * - * } + * Configuration for {@link GenericSitemapLocator}. */ @Data @Accessors(chain = true) @@ -62,8 +35,18 @@ public class GenericSitemapLocatorConfig { public static final List- * - * - *- * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - *DEFAULT_PATHS = List.of("/sitemap.xml", "/sitemap_index.xml"); + /** + * The domain-relative URL paths where to look for sitemaps when not + * supplied as start reference or part of a web site robots.txt file. + * Defaults to /sitemap.xml
and + */sitemap_index.xml
. + */ private final Listpaths = new ArrayList<>(DEFAULT_PATHS); + /** + * Whether to disable checking for the sitemap locations in a web site + * robots.txt file. + */ private boolean robotsTxtSitemapDisabled; /** diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java index 481b5767f..92cc053ab 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java @@ -55,9 +55,8 @@ List parse( Xml.stream(is) .takeWhile(c -> { if (stopping.isTrue()) { - LOG.debug( - "Sitemap not entirely parsed due to " - + "crawler being stopped."); + LOG.debug("Sitemap not entirely parsed due to " + + "crawler being stopped."); return false; } return true; @@ -72,13 +71,11 @@ List parse( } }); } catch (XmlException e) { - LOG.error( - "Cannot fetch sitemap: {} -- Likely an invalid sitemap " - + "XML format causing a parsing error (actual error:{}).", + LOG.error("Cannot fetch sitemap: {} -- Likely an invalid sitemap " + + "XML format causing a parsing error (actual error:{}).", location, e.getMessage()); } catch (IOException e) { - LOG.error( - "Cannot fetch sitemap: {} ({})", + LOG.error("Cannot fetch sitemap: {} ({})", location, e.getMessage(), e); } return children; @@ -104,9 +101,8 @@ private Optional toDocRecord( // Is URL valid? if (StringUtils.isBlank(url) || (!lenient && !url.startsWith(sitemapLocationDir))) { - LOG.debug( - "Sitemap URL invalid for location directory." - + " URL: {} Location directory: {}", + LOG.debug("Sitemap URL invalid for location directory." + + " URL: {} Location directory: {}", url, sitemapLocationDir); return Optional.empty(); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java index af86a6100..70e2470f9 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java @@ -73,10 +73,6 @@ static boolean shouldProcessSitemap( || cacheModifDate.isBefore(newRec.getLastModified()); } - // static ZonedDateTime now() { - // return ZonedDateTime.now(ZoneOffset.UTC); - // } - static SitemapRecord toSitemapRecord(CrawlDoc doc) { var indexRec = new SitemapRecord(); var docRec = Web.docContext(doc); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java index 2d717cb30..995461b5d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java @@ -35,7 +35,6 @@ import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector; import com.norconex.crawler.web.doc.operations.delay.DelayResolver; -import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver; @@ -62,7 +61,6 @@ public MultiValuedMap , Class>> getPolymorphicTypes() { addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum"); 
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
index 2d717cb30..995461b5d 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
@@ -35,7 +35,6 @@
 import com.norconex.crawler.web.WebCrawlerConfig;
 import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector;
 import com.norconex.crawler.web.doc.operations.delay.DelayResolver;
-import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange;
 import com.norconex.crawler.web.doc.operations.link.LinkExtractor;
 import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver;
 import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver;
@@ -62,7 +61,6 @@ public MultiValuedMap<Class<?>, Class<?>> getPolymorphicTypes() {
         addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum");
         addPolyType(map, EventListener.class, "event.listeners");
         addPolyType(map, DelayResolver.class);
-        addPolyType(map, DelayRange.class);
         addPolyType(
                 map, DocumentFilter.class, "doc.operations.filter"); //NOSONAR
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
index 64f7ceaef..60413b5e7 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
@@ -16,8 +16,6 @@
 import static org.apache.commons.lang3.StringUtils.substring;
 
-import java.util.Collection;
-import java.util.List;
 import java.util.Optional;
 import java.util.regex.Pattern;
 
@@ -27,7 +25,6 @@
 import com.norconex.crawler.core.Crawler;
 import com.norconex.crawler.core.doc.CrawlDoc;
 import com.norconex.crawler.core.event.CrawlerEvent;
-import com.norconex.crawler.core.fetch.Fetcher;
 import com.norconex.crawler.web.WebCrawlerConfig;
 import com.norconex.crawler.web.WebCrawlerContext;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
@@ -48,38 +45,17 @@ public static void fireIfUrlOutOfScope(
             WebCrawlDocContext docContext,
             UrlScope urlScope) {
         if (!urlScope.isInScope()) {
-            crawler.fire(
-                    CrawlerEvent
-                            .builder()
-                            .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE)
-                            .source(crawler)
-                            .subject(Web.config(crawler).getUrlScopeResolver())
-                            .docContext(docContext)
-                            .message(urlScope.outOfScopeReason())
-                            .build());
+            crawler.fire(CrawlerEvent
+                    .builder()
+                    .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE)
+                    .source(crawler)
+                    .subject(Web.config(crawler).getUrlScopeResolver())
+                    .docContext(docContext)
+                    .message(urlScope.outOfScopeReason())
+                    .build());
         }
     }
 
-    //    private static final BeanMapper BEAN_MAPPER =
-    //            CrawlSessionBeanMapperFactory.create(
-    //                    WebCrawlerConfig.class, b ->
-    //                            b.unboundPropertyMapping(
-    //                                    "crawler", WebCrawlerMixIn.class));
-    //    private static class WebCrawlerMixIn {
-    //        @JsonDeserialize(as = WebCrawlerConfig.class)
-    //        private CrawlerConfig configuration;
-    //    }
-
-    //    public static BeanMapper beanMapper() {
-    //        return BEAN_MAPPER;
-    //    }
-
-    //    public static WebCrawlerConfig config(CrawlerConfig cfg) {
-    //        return (WebCrawlerConfig) cfg;
-    //    }
-    //    public static WebCrawlerConfig config(AbstractPipelineContext ctx) {
-    //        return (WebCrawlerConfig) Web.config(ctx.getCrawler());
-    //    }
     public static WebCrawlerConfig config(Crawler crawler) {
         return (WebCrawlerConfig) crawler.getConfiguration();
     }
@@ -88,32 +64,6 @@ public static WebCrawlerContext crawlerContext(Crawler crawler) {
         return (WebCrawlerContext) crawler.getContext();
     }
 
-    //    public static WebImporterPipelineContext importerContext(
-    //            AbstractPipelineContext ctx) {
-    //        return (WebImporterPipelineContext) ctx;
-    //    }
-
-    //    //TODO move this one to core?
-    //    public static void fire(
-    //            Crawler crawler,
-    //            @NonNull
-    //            Consumer<CrawlerEventBuilder<?, ?>> c) {
-    //        if (crawler != null) {
-    //            var builder = CrawlerEvent.builder();
-    //            c.accept(builder);
-    //            crawler.getEventManager().fire(builder.build());
-    //        }
-    //    }
-
-    //TODO could probably move this where needed since generically,
-    // we would get the fetcher wrapper directly from crawler.
-    public static List<HttpFetcher> toHttpFetcher(
-            @NonNull Collection<? extends Fetcher<?, ?>> fetchers) {
-        return fetchers.stream()
-                .map(HttpFetcher.class::cast)
-                .toList();
-    }
-
     public static HttpFetcher fetcher(Crawler crawler) {
         return (HttpFetcher) crawler.getFetcher();
     }
@@ -130,10 +80,9 @@ public static WebCrawlDocContext cachedDocContext(
 
     public static RobotsTxt robotsTxt(Crawler crawler, String reference) {
         var cfg = Web.config(crawler);
         return Optional.ofNullable(cfg.getRobotsTxtProvider())
-                .map(
-                        rb -> rb.getRobotsTxt(
-                                (HttpFetcher) crawler.getFetcher(),
-                                reference))
+                .map(rb -> rb.getRobotsTxt(
+                        (HttpFetcher) crawler.getFetcher(),
+                        reference))
                 .orElse(null);
     }
 
@@ -199,15 +148,14 @@ public static Properties parseDomAttributes(
         if (StringUtils.isBlank(attribsStr)) {
             return props;
         }
-        doParseDomAttributes(
-                attribsStr
-                        // strip before and after angle brackets as separate steps,
-                        // in case of weird mark-up
-                        .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1")
-                        .replaceFirst("(?s)^(.*?)>.*$", "$1")
-                        .replaceAll("\\s+", " ")
-                        .replace(" =", "=")
-                        .replace("= ", "="),
+        doParseDomAttributes(attribsStr
+                // strip before and after angle brackets as separate steps,
+                // in case of weird mark-up
+                .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1")
+                .replaceFirst("(?s)^(.*?)>.*$", "$1")
+                .replaceAll("\\s+", " ")
+                .replace(" =", "=")
+                .replace("= ", "="),
                 props);
        return props;
    }
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
index 073be458c..ea8041132 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
@@ -134,7 +134,7 @@ public HttpFetchResponse fetch(HttpFetchRequest req)
         mem.getUpsertRequests().forEach(req -> {
             assertThat(
                     req.getMetadata().getInteger(
-                            "collector.depth")).isZero();
+                            "crawler.depth")).isZero();
             assertThat(req.getReference()).containsAnyOf(
                     page1Path,
                     page2Path,
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html
similarity index 100%
rename from crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html
rename to crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
index 5ea5a0557..b0d93500f 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
@@ -32,6 +32,7 @@
 import com.norconex.commons.lang.text.TextMatcher;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency;
+import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo;
 import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport;
 
 import lombok.extern.slf4j.Slf4j;
@@ -45,10 +46,10 @@ void testWriteRead() {
         r.getConfiguration().setSitemapSupport(SitemapSupport.LAST);
 
         var f1 = new MinFrequency(
-                "reference", "monthly",
+                ApplyTo.REFERENCE, "monthly",
                 TextMatcher.regex(".*\\.pdf").ignoreCase());
         var f2 = new MinFrequency(
-                "contentType", "1234",
+                ApplyTo.CONTENT_TYPE, "1234",
                 TextMatcher.regex(".*"));
 
         r.getConfiguration().setMinFrequencies(List.of(f1, f2));
@@ -72,13 +73,14 @@ void testCustomFrequency() {
         prevCrawl.setCrawlDate(prevCrawlDate);
 
         var f = new MinFrequency(
-                "reference", "120 days", TextMatcher.regex(".*"));
+                ApplyTo.REFERENCE, "120 days", TextMatcher.regex(".*"));
 
         r.getConfiguration().setMinFrequencies(List.of(f));
         Assertions.assertFalse(r.isRecrawlable(prevCrawl));
 
         // Delay has passed
-        f = new MinFrequency("reference", "5 days", TextMatcher.regex(".*"));
+        f = new MinFrequency(
+                ApplyTo.REFERENCE, "5 days", TextMatcher.regex(".*"));
         r.getConfiguration().setMinFrequencies(List.of(f));
         Assertions.assertTrue(r.isRecrawlable(prevCrawl));
     }
@@ -163,10 +165,10 @@ void testIsRecrawlable(
         var matcher = "reference".equals(minFreqApplyTo)
                 ? TextMatcher.basic(url)
                 : TextMatcher.basic("text/html");
-        resolver.getConfiguration().setMinFrequencies(
-                List.of(
-                        new MinFrequency(
-                                minFreqApplyTo, minFreqValue, matcher)));
+        resolver.getConfiguration().setMinFrequencies(List.of(
+                new MinFrequency("reference".equals(minFreqApplyTo)
+                        ? ApplyTo.REFERENCE
+                        : ApplyTo.CONTENT_TYPE, minFreqValue, matcher)));
 
         assertThat(resolver.isRecrawlable(prevRec)).isEqualTo(expected);
     }
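For reference, the enum-based MinFrequency introduced above is used along these lines. This is a sketch following the exact calls in the updated tests; the "30 days" value is an arbitrary example.

    // Do not recrawl matching PDFs more often than every 30 days,
    // favoring the last sitemap hint when available.
    var r = new GenericRecrawlableResolver();
    r.getConfiguration().setSitemapSupport(SitemapSupport.LAST);
    r.getConfiguration().setMinFrequencies(List.of(
            new MinFrequency(ApplyTo.REFERENCE, "30 days",
                    TextMatcher.regex(".*\\.pdf").ignoreCase())));
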
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
index 405dc0529..3aea1ee01 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
@@ -25,6 +25,7 @@
 import java.io.UncheckedIOException;
 import java.net.ServerSocket;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.List;
 
 import org.apache.commons.lang3.RandomStringUtils;
@@ -282,11 +283,11 @@ void testResolvingUserAgent(ClientAndServer client) {
                     // test setting a bunch of other params
                     fetcher.getConfiguration()
                             .setWindowSize(new java.awt.Dimension(640, 480))
-                            .setPageLoadTimeout(10_1000)
-                            .setImplicitlyWait(1000)
-                            .setScriptTimeout(10_000)
+                            .setPageLoadTimeout(Duration.ofSeconds(10))
+                            .setImplicitlyWait(Duration.ofSeconds(1))
+                            .setScriptTimeout(Duration.ofSeconds(10))
                             .setWaitForElementSelector("p")
-                            .setWaitForElementTimeout(10_000);
+                            .setWaitForElementTimeout(Duration.ofSeconds(10));
                     cfg.setStartReferences(List.of(hostUrl(client, path)));
                 });
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
index b0be4a0db..7ca59db96 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
@@ -21,6 +21,7 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.file.Paths;
+import java.time.Duration;
 import java.util.List;
 
 import org.junit.jupiter.api.Test;
@@ -45,13 +46,13 @@ void testWriteReadFetcher() throws MalformedURLException {
         c.setBrowserPath(Paths.get("/some/browser/path"));
         c.setDriverPath(Paths.get("/some/driver/path"));
         c.setRemoteURL(new URL("http://example.com"));
-        c.setImplicitlyWait(4000);
+        c.setImplicitlyWait(Duration.ofSeconds(4));
         c.setEarlyPageScript("alert('hello init!');");
-        c.setPageLoadTimeout(5000);
+        c.setPageLoadTimeout(Duration.ofSeconds(5));
         c.setLatePageScript("alert('hello page!');");
-        c.setScriptTimeout(6000);
+        c.setScriptTimeout(Duration.ofSeconds(6));
         c.setWaitForElementSelector("#header");
-        c.setWaitForElementTimeout(1234);
+        c.setWaitForElementTimeout(Duration.ofMillis(1234));
         c.setWaitForElementType(WaitElementType.ID);
         c.setWindowSize(new Dimension(666, 999));
         c.setCapabilities(
@@ -69,12 +70,9 @@ void testWriteReadFetcher() throws MalformedURLException {
                         "rh2", "hrval2"));
         c.setHttpSniffer(snif);
 
-        c.setReferenceFilters(
-                List.of(
-                        configure(
-                                new GenericReferenceFilter(), cfg -> cfg
-                                        .setValueMatcher(
-                                                TextMatcher.regex("test.*")))));
+        c.setReferenceFilters(List.of(configure(
+                new GenericReferenceFilter(), cfg -> cfg
+                        .setValueMatcher(TextMatcher.regex("test.*")))));
 
         var sh = new ScreenshotHandler();
         sh.getConfiguration()
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
index f85159170..6ea9d6162 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
@@ -27,7 +27,7 @@ class GenericRedirectUrlProviderTest {
     @Test
     void testWriteRead() {
         var p = new GenericRedirectUrlProvider();
-        p.setFallbackCharset(StandardCharsets.UTF_8.toString());
+        p.getConfiguration().setFallbackCharset(StandardCharsets.UTF_8);
         assertThatNoException()
                 .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(p));
     }
diff --git a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
index e2e838f8f..59021df4b 100644
--- a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
+++ b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
@@ -232,7 +232,7 @@
 
-
+
 
@@ -352,14 +352,17 @@
 
-
+
           text/html
           dom
           dom
           425x312
           true
           1234
           /some/path
-          url, inline
+
+            url
+            inline
+
           medium
           25
           true
diff --git a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
index 135fd09a0..eb1ba3fa1 100644
--- a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
+++ b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
@@ -214,6 +214,16 @@ public static TextMatcher imageIOStandardContentTypes() {
         return csv(IMAGE_IO_CONTENT_TYPES);
     }
 
+    /**
+     * <p>
+     * Matches all content types.
+     * </p>
+     * @return text matcher
+     */
+    public static TextMatcher all() {
+        return TextMatcher.regex(".*");
+    }
+
     private static TextMatcher csv(Set<String> values) {
         return TextMatcher
                 .csv(StringUtils.join(values, ','))
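The millisecond-to-Duration migration exercised by the test changes above amounts to the following kind of usage. Sketch only: instantiating the config directly is an assumption, and the setters are the ones shown in the tests.

    // WebDriver timeouts now take java.time.Duration values instead of
    // millisecond integers.
    var c = new WebDriverHttpFetcherConfig(); // direct instantiation assumed
    c.setPageLoadTimeout(Duration.ofSeconds(10));
    c.setImplicitlyWait(Duration.ofSeconds(1));
    c.setScriptTimeout(Duration.ofSeconds(10));
    c.setWaitForElementSelector("#header");
    c.setWaitForElementTimeout(Duration.ofMillis(1234));
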
From 0f195992ace10c18e68a25d7839a8e124c62f113 Mon Sep 17 00:00:00 2001
From: essiembre
Date: Sun, 8 Sep 2024 06:20:13 +0000
Subject: [PATCH 05/10] Apply Copyright year changes

---
 .../web/robot/impl/StandardRobotsMetaProviderConfig.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
index 4bf348029..7900cbf5d 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
@@ -1,4 +1,4 @@
-/* Copyright 2010-2023 Norconex Inc.
+/* Copyright 2010-2024 Norconex Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 1cee93bdf8accd9927401283aa19ce4f10d53c79 Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
Date: Tue, 10 Sep 2024 22:22:29 -0400
Subject: [PATCH 06/10] More code coverage.

---
 .../link/impl/RegexLinkExtractor.java         |  4 +-
 .../delay/impl/GenericDelayResolverTest.java  | 22 +++++++
 .../link/impl/RegexLinkExtractorTest.java     | 60 +++++++++++++++++++
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
index 235cf3a6a..7ff6146f0 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
@@ -86,9 +86,9 @@ public class RegexLinkExtractor
 
     //TODO make buffer size and overlap size configurable
     //1MB: make configurable
-    private static final int MAX_BUFFER_SIZE = 1024 * 1024;
+    static final int MAX_BUFFER_SIZE = 1024 * 1024;
     // max url leng is 2048 x 2 bytes x 2 for anchor attributes.
-    private static final int OVERLAP_SIZE = 2 * 2 * 2048;
+    static final int OVERLAP_SIZE = 2 * 2 * 2048;
 
     @Getter
     private final RegexLinkExtractorConfig configuration =
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
index c7211a68e..ff4881865 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
@@ -27,6 +27,7 @@
 import com.norconex.commons.lang.bean.BeanMapper;
 import com.norconex.crawler.web.doc.operations.delay.impl.BaseDelayResolverConfig.DelayResolverScope;
+import com.norconex.crawler.web.robot.RobotsTxt;
 
 class GenericDelayResolverTest {
 
@@ -57,6 +58,27 @@ void testWriteRead() {
                 .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(r));
     }
 
+    @Test
+    void testNullDelays() {
+        var r = new GenericDelayResolver();
+        r.getConfiguration()
+                .setScope(null);
+        assertThatNoException().isThrownBy(
+                () -> r.delay(null, "http://somewhere.com"));
+
+    }
+
+    @Test
+    void testWithRobotsTxt() {
+        var r = new GenericDelayResolver();
+        //        r.getConfiguration()
+        //                .setScope(null);
+        var robotsTxt = RobotsTxt.builder().crawlDelay(1000f).build();
+        assertThatNoException().isThrownBy(
+                () -> r.delay(robotsTxt, "http://somewhere.com"));
+
+    }
+
     @Test
     void testDelayScheduleBoundaries() {
         //FYI: Jan 1, 2000 was a Saturday
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
index e7111051e..0599806b7 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
@@ -15,9 +15,11 @@
 package com.norconex.crawler.web.doc.operations.link.impl;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatNoException;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -25,6 +27,7 @@
 import java.util.List;
 import java.util.Set;
 
+import org.apache.commons.io.input.NullInputStream;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
@@ -32,6 +35,8 @@
 import com.norconex.commons.lang.bean.BeanMapper.Format;
 import com.norconex.commons.lang.file.ContentType;
 import com.norconex.commons.lang.io.CachedInputStream;
+import com.norconex.commons.lang.map.PropertyMatcher;
+import com.norconex.commons.lang.text.TextMatcher;
 import com.norconex.crawler.core.doc.CrawlDoc;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.operations.link.Link;
@@ -141,6 +146,61 @@ void testGenericWriteRead() {
                 () -> BeanMapper.DEFAULT.assertWriteRead(extractor));
     }
 
+    @Test
+    void testFromFieldAndRestrictions() throws IOException {
+        var extractor = new RegexLinkExtractor();
+        var cfg = extractor.getConfiguration();
+        cfg.setPatterns(
+                List.of(new ExtractionPattern("http:.*?\\.html", null)));
+        cfg.getRestrictions().add(
+                new PropertyMatcher(TextMatcher.regex(".*")));
+        cfg.getFieldMatcher().setPattern("myfield");
+
+        var doc = toCrawlDoc("n/a",
+                ContentType.TEXT,
+                NullInputStream.nullInputStream());
+        doc.getMetadata().set("myfield",
+                "http://one.com/1.html|http://two.com/2.html|NOT_ME");
+        var links = extractor.extractLinks(doc);
+        assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder(
+                "http://one.com/1.html", "http://two.com/2.html");
+
+        cfg.clearPatterns();
+        cfg.clearRestrictions();
+        cfg.setContentTypeMatcher(TextMatcher.basic("application/pdf"));
+        links = extractor.extractLinks(doc);
+        assertThat(links).isEmpty();
+    }
+
+    @Test
+    void testNoRestrictionMatch() throws IOException {
+        var extractor = new RegexLinkExtractor();
+        var cfg = extractor.getConfiguration();
+        cfg.getRestrictions().add(
+                new PropertyMatcher(TextMatcher.regex("NOPE")));
+
+        var doc = toCrawlDoc("n/a",
+                ContentType.TEXT,
+                NullInputStream.nullInputStream());
+        var links = extractor.extractLinks(doc);
+        assertThat(links).isEmpty();
+    }
+
+    @Test
+    void testLargeContent() throws IOException {
+        var doc = toCrawlDoc("n/a", ContentType.TEXT, new ByteArrayInputStream(
+                ("http://one.com/1.html"
+                        + "X".repeat(RegexLinkExtractor.MAX_BUFFER_SIZE)
+                        + "http://two.com/2.html" + "X".repeat(
+                                RegexLinkExtractor.MAX_BUFFER_SIZE))
+                        .getBytes()));
+        var extractor = new RegexLinkExtractor();
+        extractor.getConfiguration().setPatterns(
+                List.of(new ExtractionPattern("http:.*?\\.html", null)));
+        var links = extractor.extractLinks(doc);
+        assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder(
+                "http://one.com/1.html", "http://two.com/2.html");
+    }
+
     private boolean contains(Set<Link> links, String url) {
         for (Link link : links) {
             if (url.equals(link.getUrl())) {
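The new tests above boil down to this kind of RegexLinkExtractor setup. Sketch only: the calls mirror those in the tests, and "doc" stands for any CrawlDoc prepared the way the tests prepare one.

    // Extract links matching a pattern from the "myfield" metadata field
    // rather than from the document body.
    var extractor = new RegexLinkExtractor();
    var cfg = extractor.getConfiguration();
    cfg.setPatterns(List.of(new ExtractionPattern("http:.*?\\.html", null)));
    cfg.getFieldMatcher().setPattern("myfield");
    var links = extractor.extractLinks(doc); // "doc" built as in the tests
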
From ba0c2a61c3a9fbae1b0e873edbf6d165b84433ad Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
Date: Wed, 11 Sep 2024 00:06:45 -0400
Subject: [PATCH 07/10] Code coverage.

---
 crawler/web/pom.xml                           |  7 ++
 .../com/norconex/crawler/web/WebCrawler.java  | 40 ++++++++--
 .../UrlStatusCrawlerEventListener.java        |  4 +-
 .../norconex/crawler/web/WebCrawlerTest.java  | 33 +++++++++
 .../impl/HtmlDomTikaLinkExtractorTest.java    | 24 ++++++
 .../UrlStatusCrawlerEventListenerTest.java    | 74 +++++++++++++++----
 .../crawler/web/stubs/CrawlerStubs.java       | 11 +--
 7 files changed, 161 insertions(+), 32 deletions(-)
 create mode 100644 crawler/web/src/test/java/com/norconex/crawler/web/WebCrawlerTest.java

diff --git a/crawler/web/pom.xml b/crawler/web/pom.xml
index d3074e379..85cdf08df 100644
--- a/crawler/web/pom.xml
+++ b/crawler/web/pom.xml
@@ -214,6 +214,13 @@
       selenium
       test
+