From d8d6d653344f58b4ae02e130588b8bf049414ecb Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Sat, 7 Sep 2024 00:24:32 -0400 Subject: [PATCH 01/10] JavaDoc + better serialization of DelayRange + renaming of FeaturedImageProcessor to FeaturedImageResolver. --- .../crawler/core/doc/CrawlDocMetadata.java | 2 +- .../delay/impl/AbstractDelayResolver.java | 20 +- .../delay/impl/BaseDelayResolverConfig.java | 35 +-- .../operations/delay/impl/CrawlerDelay.java | 5 + .../doc/operations/delay/impl/DelayRange.java | 2 + .../operations/delay/impl/DelaySchedule.java | 7 +- .../delay/impl/GenericDelayResolver.java | 36 --- .../impl/GenericDelayResolverConfig.java | 69 +---- .../delay/impl/ReferenceDelayResolver.java | 33 --- .../impl/ReferenceDelayResolverConfig.java | 62 +--- .../filter/impl/SegmentCountUrlFilter.java | 18 -- .../impl/SegmentCountUrlFilterConfig.java | 51 ++-- .../impl/FeaturedImageProcessorConfig.java | 269 ------------------ ...cessor.java => FeaturedImageResolver.java} | 165 ++--------- .../impl/FeaturedImageResolverConfig.java | 242 ++++++++++++++++ .../link/impl/DomLinkExtractor.java | 64 +---- .../link/impl/DomLinkExtractorConfig.java | 178 ++---------- .../crawler/web/spi/CrawlerWebPtProvider.java | 2 + .../com/norconex/crawler/web/WebTestUtil.java | 4 +- ...st.java => FeaturedImageResolverTest.java} | 20 +- 20 files changed, 349 insertions(+), 935 deletions(-) delete mode 100644 crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java rename crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/{FeaturedImageProcessor.java => FeaturedImageResolver.java} (76%) create mode 100644 crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java rename crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/{FeaturedImageProcessorTest.java => FeaturedImageResolverTest.java} (94%) diff --git a/crawler/core/src/main/java/com/norconex/crawler/core/doc/CrawlDocMetadata.java b/crawler/core/src/main/java/com/norconex/crawler/core/doc/CrawlDocMetadata.java index 6c16323a8..d9ed19851 100644 --- a/crawler/core/src/main/java/com/norconex/crawler/core/doc/CrawlDocMetadata.java +++ b/crawler/core/src/main/java/com/norconex/crawler/core/doc/CrawlDocMetadata.java @@ -26,7 +26,7 @@ public final class CrawlDocMetadata { //TODO use the same prefix for both crawler and importer... // all "document." ? In any case, no longer make it "collector." - public static final String PREFIX = "collector."; + public static final String PREFIX = "crawler."; public static final String DEPTH = PREFIX + "depth"; diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/AbstractDelayResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/AbstractDelayResolver.java index 13d16e5c1..b7f941c98 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/AbstractDelayResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/AbstractDelayResolver.java @@ -61,20 +61,7 @@ * any given thread. The more threads you have the less of an * impact the delay will have. * - *

- * XML configuration usage: - *

- *

- * The following should be shared across concrete implementations - * (which can add more configurable attributes and tags). - *

- * {@nx.xml - * - * - * } + * @param type of configuration class * @since 2.5.0 */ @Slf4j @@ -101,9 +88,8 @@ public void delay(RobotsTxt robotsTxt, String url) { } var delay = delays.get(getConfiguration().getScope()); if (delay == null) { - LOG.warn( - "Unspecified or unsupported delay scope: {}. " - + "Using {} scope.", + LOG.warn("Unspecified or unsupported delay scope: {}. " + + "Using {} scope.", getConfiguration().getScope(), BaseDelayResolverConfig.DEFAULT_SCOPE); delay = delays.get(BaseDelayResolverConfig.DEFAULT_SCOPE); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/BaseDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/BaseDelayResolverConfig.java index e6fb3f720..34d19021c 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/BaseDelayResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/BaseDelayResolverConfig.java @@ -38,37 +38,22 @@ * in order from the best behaved to the least. *

* - *

- * XML configuration usage: - *

- *

- * The following should be shared across concrete implementations - * (which can add more configurable attributes and tags). - *

- * {@nx.xml - * - * - * } * @since 2.5.0 */ @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class BaseDelayResolverConfig { public enum DelayResolverScope { @@ -81,25 +66,19 @@ public enum DelayResolverScope { DelayResolverScope.CRAWLER; /** - * The default delay in milliseconds. - * @param defaultDelay default deleay - * @return default delay + * The default delay. */ private Duration defaultDelay = DEFAULT_DELAY; /** * Whether to ignore crawl delays specified in a site robots.txt - * file. Not applicable when robots.txt are ignored. - * @param ignoreRobotsCrawlDelay true if ignoring - * robots.txt crawl delay - * @return true if ignoring robots.txt crawl delay + * file. Not applicable when robots.txt are ignored. */ private boolean ignoreRobotsCrawlDelay = false; /** - * Gets the delay scope. - * @param scope one of "crawler", "site", or "thread". - * @return delay scope + * Gets the delay scope. See class documentation for a description + * of supported scopes. */ private DelayResolverScope scope = DEFAULT_SCOPE; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java index fb8a72f4f..4a5e1ff96 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java @@ -18,9 +18,14 @@ import com.norconex.commons.lang.Sleeper; +import lombok.EqualsAndHashCode; +import lombok.ToString; + /** * It is assumed there will be one instance of this class per crawler defined. */ +@EqualsAndHashCode(onlyExplicitlyIncluded = true) +@ToString(onlyExplicitlyIncluded = true) public class CrawlerDelay extends AbstractDelay { private MutableLong lastHitEpochNanos = new MutableLong(-1); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java index 3d791d873..c1157cac0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java @@ -18,11 +18,13 @@ import lombok.Data; import lombok.NoArgsConstructor; import lombok.experimental.Accessors; +import lombok.experimental.FieldNameConstants; @Data @Accessors(chain = true) @AllArgsConstructor @NoArgsConstructor +@FieldNameConstants public class DelayRange { private T start; private T end; diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelaySchedule.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelaySchedule.java index 2eb7e6bf7..8d9a6465c 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelaySchedule.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelaySchedule.java @@ -41,9 +41,10 @@ public enum DOW { private DelayRange timeRange; private Duration delay; - // For Jackson serialization + //--- Serialization helpers ------------------------------------------------ + @JsonSetter(value = "timeRange") - void setTimeRangeFromString(DelayRange range) { + void setTimeRangeSerial(DelayRange range) { if (range == null) { timeRange = null; return; @@ -54,7 +55,7 @@ void setTimeRangeFromString(DelayRange range) { } 
@JsonGetter(value = "timeRange") - DelayRange getTimeRangeAsString() { + DelayRange getTimeRangeSerial() { if (timeRange == null) { return null; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolver.java index e565cb25e..9afc3698d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolver.java @@ -25,7 +25,6 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.norconex.commons.lang.CircularRange; import com.norconex.commons.lang.event.EventListener; -import com.norconex.commons.lang.time.DurationParser; import com.norconex.crawler.core.event.CrawlerEvent; import lombok.EqualsAndHashCode; @@ -67,41 +66,6 @@ * any given thread. The more threads you have the less of an * impact the delay will have. * - * - *

- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

- * - * {@nx.xml.usage - * - * - * (delay in milliseconds) - * - * - * (... repeat schedule tag as needed ...) - * - * } - * - * {@nx.xml.example - * - * 1 second - * - * } - * - *

- * The above example set the minimum delay between each document download - * on a given site to 5 seconds, no matter what the crawler robots.txt may - * say, except on weekend, where it is more agressive (1 second). - *

*/ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java index 456e54095..0b04c9da6 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java @@ -19,80 +19,13 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.commons.lang.time.DurationParser; import lombok.Data; import lombok.experimental.Accessors; /** *

- * Default implementation for creating voluntary delays between URL downloads. - * There are a few ways the actual delay value can be defined (in order): - *

- *
    - *
  1. Takes the delay specify by a robots.txt file. - * Only applicable if robots.txt files and its robots crawl delays - * are not ignored.
  2. - *
  3. Takes an explicitly scheduled delay, if any (picks the first - * one matching).
  4. - *
  5. Use the specified default delay or 3 seconds, if none is - * specified.
  6. - *
- *

- * In a delay schedule, the days of weeks are spelled out (in English): - * Monday, Tuesday, etc. Time ranges are using the 24h format. - *

- *

- * One of these following scope dictates how the delay is applied, listed - * in order from the best behaved to the least. - *

- *
    - *
  • crawler: the delay is applied between each URL download - * within a crawler instance, regardless how many threads are defined - * within that crawler, or whether URLs are from the - * same site or not. This is the default scope.
  • - *
  • site: the delay is applied between each URL download - * from the same site within a crawler instance, regardless how many - * threads are defined. A site is defined by a URL protocol and its - * domain (e.g. http://example.com).
  • - *
  • thread: the delay is applied between each URL download from - * any given thread. The more threads you have the less of an - * impact the delay will have.
  • - *
- * - *

- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

- * - * {@nx.xml.usage - * - * - * (delay in milliseconds) - * - * - * (... repeat schedule tag as needed ...) - * - * } - * - * {@nx.xml.example - * - * 1 second - * - * } - * - *

- * The above example set the minimum delay between each document download - * on a given site to 5 seconds, no matter what the crawler robots.txt may - * say, except on weekend, where it is more agressive (1 second). + * Configuration for {@link GenericDelayResolver}. *
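
The delay resolution rules described above boil down to a small amount of configuration. A minimal Java sketch, assuming the chainable Lombok accessors visible on BaseDelayResolverConfig (defaultDelay, scope, ignoreRobotsCrawlDelay), the getConfiguration() accessor shown in AbstractDelayResolver, and a SITE scope constant (not shown in this patch):

    // Roughly the intent of the removed XML example: a 5-second default
    // delay between downloads from the same site, still honoring any
    // robots.txt crawl delay. (The removed example also lowered the delay
    // to 1 second on weekends via a schedule; that API is not shown here.)
    var resolver = new GenericDelayResolver();
    resolver.getConfiguration()
            .setDefaultDelay(Duration.ofSeconds(5))   // java.time.Duration
            .setScope(DelayResolverScope.SITE)        // assumed enum constant
            .setIgnoreRobotsCrawlDelay(false);
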

*/ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java index ed16d5727..89bae0f9f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java @@ -16,8 +16,6 @@ import java.time.Duration; -import com.norconex.commons.lang.time.DurationParser; - import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.ToString; @@ -54,37 +52,6 @@ * any given thread. The more threads you have the less of an * impact the delay will have. * - * - *

- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

- * - * {@nx.xml.usage - * - * - * (regular expression applied against document reference) - * - * - * (... repeat pattern tag as needed ...) - * - * } - * - * {@nx.xml.example - *
- * 
- *     .*\.pdf
- * 
- * }
- * 

- * The above examlpe will increase the delay to 10 seconds when encountering - * PDFs from a default of 3 seconds. - *

- * * @since 2.5.0 */ @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java index c06a2aeaf..9ecde8f5a 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java @@ -19,74 +19,14 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.commons.lang.time.DurationParser; import lombok.Data; import lombok.experimental.Accessors; /** *

- * Introduces different delays between document downloads based on matching - * document reference (URL) patterns. - * There are a few ways the actual delay value can be defined (in order): + * Configuration for {@link ReferenceDelayResolver}. *
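
The XML example removed below gave PDFs a 10-second delay against a 3-second default. Only setDefaultDelay() is visible in this patch; the pattern list setter and the DelayReferencePattern type in this sketch are hypothetical placeholders for whatever the real API exposes:

    var resolver = new ReferenceDelayResolver();
    var cfg = resolver.getConfiguration();
    cfg.setDefaultDelay(Duration.ofSeconds(3));
    // Hypothetical call -- the reference-pattern API is not shown in this patch:
    cfg.setDelayReferencePatterns(List.of(
            new DelayReferencePattern(".*\\.pdf", Duration.ofSeconds(10))));
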

- *
    - *
  1. Takes the delay specify by a robots.txt file. - * Only applicable if robots.txt files and its robots crawl delays - * are not ignored.
  2. - *
  3. Takes the delay matching a reference pattern, if any (picks the first - * one matching).
  4. - *
  5. Used the specified default delay or 3 seconds, if none is - * specified.
  6. - *
- *

- * One of these following scope dictates how the delay is applied, listed - * in order from the best behaved to the least. - *

- *
    - *
  • crawler: the delay is applied between each URL download - * within a crawler instance, regardless how many threads are defined - * within that crawler, or whether URLs are from the - * same site or not. This is the default scope.
  • - *
  • site: the delay is applied between each URL download - * from the same site within a crawler instance, regardless how many - * threads are defined. A site is defined by a URL protocol and its - * domain (e.g. http://example.com).
  • - *
  • thread: the delay is applied between each URL download from - * any given thread. The more threads you have the less of an - * impact the delay will have.
  • - *
- * - *

- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

- * - * {@nx.xml.usage - * - * - * (regular expression applied against document reference) - * - * - * (... repeat pattern tag as needed ...) - * - * } - * - * {@nx.xml.example - *
- * 
- *     .*\.pdf
- * 
- * }
- * 

- * The above examlpe will increase the delay to 10 seconds when encountering - * PDFs from a default of 3 seconds. - *

- * * @since 2.5.0 */ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java index 62fcffbda..190650609 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java @@ -52,24 +52,6 @@ * When duplicate is true, it will count the maximum * number of duplicate segments found. *

- * - * {@nx.xml.usage - * - * (a regex identifying segment separator) - * - * } - * - * {@nx.xml.example - * - * } - *

- * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *
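
In Java terms, the intent of the removed example (reject URLs with more than five segments after the domain) maps onto the new configuration accessors roughly as follows; getConfiguration() is assumed to be the filter's Configurable accessor, as with the other components in this patch:

    var filter = new SegmentCountUrlFilter();
    filter.getConfiguration()
            .setCount(5)
            .setOnMatch(OnMatch.EXCLUDE);  // reject matching URLs
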

- * * @since 1.2 * @see Pattern */ diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java index 74bec03fe..230faa072 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java @@ -14,8 +14,6 @@ */ package com.norconex.crawler.web.doc.operations.filter.impl; -import java.util.regex.Pattern; - import com.norconex.crawler.core.doc.operations.filter.OnMatch; import lombok.Data; @@ -23,40 +21,9 @@ /** *

- * Filters URL based based on the number of URL segments. A URL with - * a number of segments equal or more than the specified count will either - * be included or excluded, as specified. - *

- *

- * By default - * segments are obtained by breaking the URL text at each forward slashes - * (/), starting after the host name. You can define different or - * additional segment separator characters. + * Configuration for {@link SegmentCountUrlFilter}. *

- *

- * When duplicate is true, it will count the maximum - * number of duplicate segments found. - *

- * - * {@nx.xml.usage - * - * (a regex identifying segment separator) - * - * } - * - * {@nx.xml.example - * - * } - *

- * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *

- * * @since 1.2 - * @see Pattern */ @Data @Accessors(chain = true) @@ -67,8 +34,24 @@ public class SegmentCountUrlFilterConfig { /** Default segment count. */ public static final int DEFAULT_SEGMENT_COUNT = 10; + /** + * Number of segments after which this filter is considered a match. + * Default is {@value #DEFAULT_SEGMENT_COUNT} + */ private int count = DEFAULT_SEGMENT_COUNT; + /** + * Whether the configured segment count represents the number of + * duplicated segments for this filter to be considered a match. + */ private boolean duplicate; + /** + * Segment separator. Default is + * {@value #DEFAULT_SEGMENT_SEPARATOR_PATTERN}. + */ private String separator = DEFAULT_SEGMENT_SEPARATOR_PATTERN; + + /** + * Action to undertake when there is a match. + */ private OnMatch onMatch; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java deleted file mode 100644 index aa657f570..000000000 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2017-2024 Norconex Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.norconex.crawler.web.doc.operations.image.impl; - -import java.awt.Dimension; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import org.imgscalr.Scalr.Method; - -import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.core.doc.CrawlDocMetadata; - -import lombok.Data; -import lombok.Getter; -import lombok.experimental.Accessors; - -/** - *

- * Document processor that extract the "main" image from HTML pages. - * Since HTML is expected, this class should only be used at - * pre-import processor. It is possible for this processor to not find any - * image. - *

- * - *

Finding the image

- *

- * By default this class will get the first image (<img>) matching - * the minimum size. You can specify you want the largest of all matching - * ones instead. In addition, if you know your images to be defined - * in a special way (e.g., all share the same CSS class), then you can use - * the "domSelector" to limit to one or a few images. See - * - * JSoup selector-syntax for how to build the "domSelector". - *

- * - *

Storing the image

- *

- * One or more storage method can be specified. Here are - * the possible storage options: - *

- *
    - *
  • - * url: Default. The absolute image URL is stored in a - * collector.featured-image-url field. - * When only this option is set, scaling options and image format - * have no effect. - *
  • - *
  • - * inline: Stores a Base64 string of the scaled image, in the format - * specified, in a collector.featured-image-inline field. - * The string is ready to be - * used inline, in a <img src="..."> tag. - *
  • - *
  • - * disk: Stores the scaled image on the file system, in the format - * and directory specified. A reference to the file on disk is stored - * in a collector.featured-image-path field. - *
  • - *
- * - * {@nx.xml.usage - * - * - * - * (Optional regex to overwrite default matching of HTML pages) - * - * - * - * (Optional CSS-like path matching one or more image elements) - * - * - * (Minimum pixel size for an image to be considered. - * Default is 400x400). - * - * [false|true] - * - * - * (Maximum number of images to cache for faster processing. - * Set to 0 to disable caching.) - * - * - * (Directory where to create the image cache) - * - * - * - * [url|inline|disk] - * (One or more, comma-separated. Default is "url".) - * - * - * - * - * (Target pixel size the featured image should be scaled to. - * Default is 150x150.) - * - * - * [false|true] - * (Whether to stretch to match scale size. Default keeps aspect ratio.) - * - * - * [auto|low|medium|high|max] - * (Default is "auto", which tries the best balance between quality - * and speed based on image size. The lower the quality the faster - * it is to scale images.) - * - * - * (Target format of stored image. E.g., "jpg", "png", "gif", "bmp", ... - * Default is "png") - * - * - * - * - * (Path to directory where to store images on disk.) - * - * - * (Overwrite default field where to store the image path. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_PATH}.) - * - * - * - * - * (Overwrite default field where to store the inline image. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_INLINE}.) - * - * - * - * - * (Overwrite default field where to store the image URL. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_URL}.) - * - * - * - * } - * - * When specifying an image size, the format is [width]x[height] - * or a single value. When a single value is used, that value represents both - * the width and height (i.e., a square). - * - * {@nx.xml.example - * - * - * 300x400 - * 50 - * jpg - * max - * inline - * - * - * } - *

- * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *

- * - * @since 2.8.0 - */ -@SuppressWarnings("javadoc") -@Data -@Accessors(chain = true) -public class FeaturedImageProcessorConfig { - - public static final String COLLECTOR_FEATURED_IMAGE_URL = - CrawlDocMetadata.PREFIX + "featured-image-url"; - public static final String COLLECTOR_FEATURED_IMAGE_PATH = - CrawlDocMetadata.PREFIX + "featured-image-path"; - public static final String COLLECTOR_FEATURED_IMAGE_INLINE = - CrawlDocMetadata.PREFIX + "featured-image-inline"; - - public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = - "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; - public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; - - /** - * Default image cache directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_IMAGE_CACHE_DIR = - "featuredImageCache"; - /** - * Default featured image directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_STORAGE_DISK_DIR = - "featuredImages"; - - public static final String DEFAULT_IMAGE_FORMAT = "png"; - public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); - public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); - public static final Storage DEFAULT_STORAGE = Storage.URL; - public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = - StorageDiskStructure.URL2PATH; - - public enum Storage { - URL, INLINE, DISK - } - - public enum StorageDiskStructure { - URL2PATH, DATE, DATETIME - } - - public enum Quality { - AUTO(Method.AUTOMATIC), - LOW(Method.SPEED), - MEDIUM(Method.BALANCED), - HIGH(Method.QUALITY), - MAX(Method.ULTRA_QUALITY); - - @Getter - private final Method scalrMethod; - - Quality(Method scalrMethod) { - this.scalrMethod = scalrMethod; - } - } - - private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN; - private String domSelector; - private Dimension minDimensions = DEFAULT_MIN_SIZE; - private Dimension scaleDimensions = DEFAULT_SCALE_SIZE; - private boolean scaleStretch; - private String imageFormat = DEFAULT_IMAGE_FORMAT; - private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE; - - private Path imageCacheDir; - private boolean largest; - private final List storage = - new ArrayList<>(Arrays.asList(DEFAULT_STORAGE)); - - private Path storageDiskDir; - private StorageDiskStructure storageDiskStructure; - private Quality scaleQuality = Quality.AUTO; - - private String storageDiskField = COLLECTOR_FEATURED_IMAGE_PATH; - private String storageInlineField = COLLECTOR_FEATURED_IMAGE_INLINE; - private String storageUrlField = COLLECTOR_FEATURED_IMAGE_URL; - - /** - * Gets the storage mechanisms. - * @return storage mechanisms - */ - public List getStorage() { - return Collections.unmodifiableList(storage); - } - - /** - * Sets the storage mechanisms. 
- * @param storage storage mechanisms - */ - public FeaturedImageProcessorConfig setStorage(List storage) { - CollectionUtil.setAll(this.storage, storage); - return this; - } -} diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java similarity index 76% rename from crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessor.java rename to crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java index 996f991d6..58bc45d24 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java @@ -14,11 +14,11 @@ */ package com.norconex.crawler.web.doc.operations.image.impl; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.COLLECTOR_FEATURED_IMAGE_INLINE; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.COLLECTOR_FEATURED_IMAGE_PATH; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.COLLECTOR_FEATURED_IMAGE_URL; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.DEFAULT_IMAGE_CACHE_DIR; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.DEFAULT_STORAGE_DISK_DIR; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.DEFAULT_IMAGE_CACHE_DIR; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.DEFAULT_STORAGE_DISK_DIR; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.FEATURED_IMAGE_INLINE_FIELD; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.FEATURED_IMAGE_PATH_FIELD; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.FEATURED_IMAGE_URL_FIELD; import static java.util.Optional.ofNullable; import static org.apache.commons.lang3.StringUtils.endsWithIgnoreCase; import static org.apache.commons.lang3.StringUtils.isNotBlank; @@ -60,8 +60,8 @@ import com.norconex.crawler.core.fetch.FetchResponse; import com.norconex.crawler.core.fetch.Fetcher; import com.norconex.crawler.web.doc.WebCrawlDocContext; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Storage; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.StorageDiskStructure; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.StorageDiskStructure; import com.norconex.crawler.web.fetch.HttpFetchRequest; import com.norconex.crawler.web.fetch.HttpFetcher; import com.norconex.crawler.web.fetch.HttpMethod; @@ -75,7 +75,7 @@ /** *

 * Document processor that extracts the "main" image from HTML pages.
- * Since HTML is expected, this class should only be used at
+ * Since HTML is expected, this class should only be used as a
 * pre-import processor. It is possible for this processor to not find any
 * image.
 * </p>

@@ -93,135 +93,21 @@ * *

Storing the image

*

- * One or more storage method can be specified. Here are - * the possible storage options: + * When identified, the featured image can be stored either on local disk, + * or as a metadata field in Base64 format, or simply as a URL pointing + * to its remote location. See {@link FeaturedImageResolverConfig} for details. *

- *
    - *
  • - * url: Default. The absolute image URL is stored in a - * collector.featured-image-url field. - * When only this option is set, scaling options and image format - * have no effect. - *
  • - *
  • - * inline: Stores a Base64 string of the scaled image, in the format - * specified, in a collector.featured-image-inline field. - * The string is ready to be - * used inline, in a <img src="..."> tag. - *
  • - *
  • - * disk: Stores the scaled image on the file system, in the format - * and directory specified. A reference to the file on disk is stored - * in a collector.featured-image-path field. - *
  • - *
- * - * {@nx.xml.usage - * - * - * - * (Optional regex to overwrite default matching of HTML pages) - * - * - * - * (Optional CSS-like path matching one or more image elements) - * - * - * (Minimum pixel size for an image to be considered. - * Default is 400x400). - * - * [false|true] - * - * - * (Maximum number of images to cache for faster processing. - * Set to 0 to disable caching.) - * - * - * (Directory where to create the image cache) - * - * - * - * [url|inline|disk] - * (One or more, comma-separated. Default is "url".) - * - * - * - * - * (Target pixel size the featured image should be scaled to. - * Default is 150x150.) - * - * - * [false|true] - * (Whether to stretch to match scale size. Default keeps aspect ratio.) - * - * - * [auto|low|medium|high|max] - * (Default is "auto", which tries the best balance between quality - * and speed based on image size. The lower the quality the faster - * it is to scale images.) - * - * - * (Target format of stored image. E.g., "jpg", "png", "gif", "bmp", ... - * Default is "png") - * - * - * - * - * (Path to directory where to store images on disk.) - * - * - * (Overwrite default field where to store the image path. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_PATH}.) - * - * - * - * - * (Overwrite default field where to store the inline image. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_INLINE}.) - * - * - * - * - * (Overwrite default field where to store the image URL. - * Default is {@value #COLLECTOR_FEATURED_IMAGE_URL}.) - * - * - * - * } - * - * When specifying an image size, the format is [width]x[height] - * or a single value. When a single value is used, that value represents both - * the width and height (i.e., a square). - * - * {@nx.xml.example - * - * - * 300x400 - * 50 - * jpg - * max - * inline - * - * - * } - *

- * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *
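
A rough Java equivalent of the removed XML example, using only accessors visible on the new FeaturedImageResolverConfig (Storage and Quality are its nested enums; sizes are java.awt.Dimension). Treat it as a sketch rather than canonical usage:

    // First image of at least 300x400, scaled to 50x50 at maximum quality,
    // stored inline (Base64 JPEG) in the document metadata.
    var resolver = new FeaturedImageResolver();
    resolver.getConfiguration()
            .setMinDimensions(new Dimension(300, 400))
            .setScaleDimensions(new Dimension(50, 50))
            .setScaleQuality(Quality.MAX)
            .setImageFormat("jpg")
            .setStorages(List.of(Storage.INLINE));
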

- * * @since 2.8.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString @Slf4j -public class FeaturedImageProcessor +public class FeaturedImageResolver extends CrawlerLifeCycleListener implements DocumentConsumer, - Configurable { + Configurable { //TODO add ability to extract from popular HTML for // featured image @@ -229,8 +115,8 @@ public class FeaturedImageProcessor //TODO add option to process embedded images (base 64) @Getter - private final FeaturedImageProcessorConfig configuration = - new FeaturedImageProcessorConfig(); + private final FeaturedImageResolverConfig configuration = + new FeaturedImageResolverConfig(); private static final Map IMG_CACHES = new HashMap<>(); @@ -258,7 +144,8 @@ protected void onCrawlerRunBegin(CrawlerEvent event) { // Initialize image cache directory if (configuration.getImageCacheSize() > 0) { resolvedImageCacheDir = ofNullable(configuration.getImageCacheDir()) - .orElseGet(() -> workDir.resolve(DEFAULT_IMAGE_CACHE_DIR)); + .orElseGet(() -> workDir.resolve( + DEFAULT_IMAGE_CACHE_DIR)); try { Files.createDirectories(resolvedImageCacheDir); LOG.info( @@ -275,7 +162,7 @@ protected void onCrawlerRunBegin(CrawlerEvent event) { } // Initialize image directory - if (configuration.getStorage().contains(Storage.DISK)) { + if (configuration.getStorages().contains(Storage.DISK)) { resolvedStorageDiskDir = ofNullable( configuration.getStorageDiskDir()).orElseGet( () -> workDir.resolve(DEFAULT_STORAGE_DISK_DIR)); @@ -338,21 +225,21 @@ public void accept(Fetcher f, CrawlDoc doc) { private void storeImage(FeaturedImage img, Doc doc) throws IOException { var imgFormat = configuration.getImageFormat(); - if (configuration.getStorage().contains(Storage.URL)) { + if (configuration.getStorages().contains(Storage.URL)) { doc.getMetadata().add( Objects.toString( configuration.getStorageUrlField(), - COLLECTOR_FEATURED_IMAGE_URL), + FEATURED_IMAGE_URL_FIELD), img.getUrl()); } - if (configuration.getStorage().contains(Storage.INLINE)) { + if (configuration.getStorages().contains(Storage.INLINE)) { doc.getMetadata().add( Objects.toString( configuration.getStorageInlineField(), - COLLECTOR_FEATURED_IMAGE_INLINE), + FEATURED_IMAGE_INLINE_FIELD), img.toHTMLInlineString(imgFormat)); } - if (configuration.getStorage().contains(Storage.DISK)) { + if (configuration.getStorages().contains(Storage.DISK)) { Path imageFile = null; if (configuration .getStorageDiskStructure() == StorageDiskStructure.DATE) { @@ -390,14 +277,14 @@ private void storeImage(FeaturedImage img, Doc doc) doc.getMetadata().add( Objects.toString( configuration.getStorageDiskField(), - COLLECTOR_FEATURED_IMAGE_PATH), + FEATURED_IMAGE_PATH_FIELD), imageFile.toFile().getAbsolutePath()); } } private boolean savingImage() { - return configuration.getStorage().contains(Storage.INLINE) - || configuration.getStorage().contains(Storage.DISK); + return configuration.getStorages().contains(Storage.INLINE) + || configuration.getStorages().contains(Storage.DISK); } private FeaturedImage findFeaturedImage( diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java new file mode 100644 index 000000000..f931dae7e --- /dev/null +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java @@ -0,0 +1,242 @@ +/* Copyright 2017-2024 Norconex Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.norconex.crawler.web.doc.operations.image.impl; + +import java.awt.Dimension; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.imgscalr.Scalr.Method; + +import com.norconex.commons.lang.collection.CollectionUtil; +import com.norconex.crawler.core.doc.CrawlDocMetadata; + +import lombok.Data; +import lombok.Getter; +import lombok.experimental.Accessors; + +/** + *

+ * Configuration for {@link FeaturedImageResolver}. + *

+ * @since 2.8.0 + */ +@Data +@Accessors(chain = true) +public class FeaturedImageResolverConfig { + + public static final String FEATURED_IMAGE_URL_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-url"; + public static final String FEATURED_IMAGE_PATH_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-path"; + public static final String FEATURED_IMAGE_INLINE_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-inline"; + + public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = + "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; + public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; + + /** + * Default image cache directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_IMAGE_CACHE_DIR = + "featuredImageCache"; + /** + * Default featured image directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_STORAGE_DISK_DIR = + "featuredImages"; + + public static final String DEFAULT_IMAGE_FORMAT = "png"; + public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); + public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); + public static final Storage DEFAULT_STORAGE = Storage.URL; + public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = + StorageDiskStructure.URL2PATH; + + /** + * Type of featured image storages. + */ + public enum Storage { + /** + * Default storages. The absolute image URL is stored in a + * {@value #FEATURED_IMAGE_URL_FIELD} metadata field. + * When only this storages option is set, scaling options and image + * format have no effect. + */ + URL, + /** + * Stores a Base64 string of the scaled image, in the format + * specified, in a {@value #FEATURED_IMAGE_INLINE_FIELD} metadata + * field. The string is ready to be used inline, in a + * <img src="..."> tag (as an example). + */ + INLINE, + /** + * Stores the scaled image on the file system, in the format + * and directory specified. A reference to the file on disk is stored + * in a {@value #FEATURED_IMAGE_PATH_FIELD} metadata field. + */ + DISK + } + + /** + * Directory structure when storing images on disk. + */ + public enum StorageDiskStructure { + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g., 2000/12/31/). + */ + DATE, + /** + * Create directories for each date and time, up to seconds + * (e.g., 2000/12/31/13/34/12/). + */ + DATETIME + } + + public enum Quality { + AUTO(Method.AUTOMATIC), + LOW(Method.SPEED), + MEDIUM(Method.BALANCED), + HIGH(Method.QUALITY), + MAX(Method.ULTRA_QUALITY); + + @Getter + private final Method scalrMethod; + + Quality(Method scalrMethod) { + this.scalrMethod = scalrMethod; + } + } + + /** + * Optional regex to overwrite default matching of HTML pages. + * Default is {@value #DEFAULT_PAGE_CONTENT_TYPE_PATTERN} + */ + private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN; + /** + * Optional CSS-like path matching one or more image elements. + */ + private String domSelector; + /** + * Minimum pixel size for an image to be considered. Default is 400x400. + */ + private Dimension minDimensions = DEFAULT_MIN_SIZE; + /** + * Target pixel size the featured image should be scaled to. + * Default is 150x150. + */ + private Dimension scaleDimensions = DEFAULT_SCALE_SIZE; + /** + * Whether to stretch to match scale size. Default keeps aspect ratio. 
+ */ + private boolean scaleStretch; + /** + * Target format of stored image. E.g., "jpg", "png", "gif", "bmp", ... + * Default is {@value #DEFAULT_IMAGE_FORMAT} + */ + private String imageFormat = DEFAULT_IMAGE_FORMAT; + /** + * Maximum number of images to cache on the local file system for faster + * processing. + * Set to 0 to disable caching. Default is + * {@value #DEFAULT_IMAGE_CACHE_SIZE}. + */ + private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE; + + /** + * Directory where to cache the images. Defaults to + * {@value #DEFAULT_IMAGE_CACHE_DIR} + */ + private Path imageCacheDir; + /** + * When more than one featured image is found, whether to return the + * largest of them all (as opposed to the first one encountered). + */ + private boolean largest; + /** + * One or more type of physical storages for the image. + */ + private final List storages = + new ArrayList<>(Arrays.asList(DEFAULT_STORAGE)); + + /** + * Path to directory where to store images on disk. Only applicable + * when one of the values of {@link #getStorages()} is {@link Storage#DISK}. + */ + private Path storageDiskDir; + /** + * The type of directory structure to create when one of the + * values of of {@link #getStorages()} is {@link Storage#DISK}. + */ + private StorageDiskStructure storageDiskStructure; + /** + * Desired scaling quality. Default is {@link Quality#AUTO}, which tries + * the best balance between quality and speed based on image size. The + * lower the quality the faster it is to scale images. + */ + private Quality scaleQuality = Quality.AUTO; + + /** + * Name of metadata field where to store the local path to an image. + * Only applicable if one of the {@link #getStorages()} values + * is {@link Storage#DISK}. + * Default is {@value #FEATURED_IMAGE_PATH_FIELD} + */ + private String storageDiskField = FEATURED_IMAGE_PATH_FIELD; + /** + * Name of metadata field where to store the Base64 image. + * Only applicable if one of the {@link #getStorages()} values + * is {@link Storage#INLINE}. + * Default is {@value #FEATURED_IMAGE_INLINE_FIELD} + */ + private String storageInlineField = FEATURED_IMAGE_INLINE_FIELD; + /** + * Name of metadata field where to store the remote image URL. + * Only applicable if one of the {@link #getStorages()} values + * is {@link Storage#URL}. + * Default is {@value #FEATURED_IMAGE_URL_FIELD} + */ + private String storageUrlField = FEATURED_IMAGE_URL_FIELD; + + /** + * Gets the storages mechanisms. + * @return storages mechanisms + */ + public List getStorages() { + return Collections.unmodifiableList(storages); + } + + /** + * Sets the storages mechanisms. + * @param storages storages mechanisms + * @return this + */ + public FeaturedImageResolverConfig setStorages(List storages) { + CollectionUtil.setAll(storages, storages); + return this; + } +} diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java index 1ec5bee23..ae46be6da 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java @@ -68,12 +68,13 @@ * *

When used before importing this class attempts to detect the content * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. + * was specified using + * {@link DomLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. + * Since document parsing converts content to UTF-8, UTF-8 is always assumed + * when used as a post-parse handler. *

* - *

You can specify which parser to use when reading + *

You can specify which DOM parser to use when reading * documents. The default is "html" and will normalize the content * as HTML. This is generally a desired behavior, but this can sometimes * have your selector fail. If you encounter this @@ -122,7 +123,8 @@ * That information gets stored as metadata in the target document. * If you want to limit the quantity of information extracted/stored, * you can disable this feature by setting - * {@link #ignoreLinkData} to true. + * {@link DomLinkExtractorConfig#setIgnoreLinkData(boolean)} to + * true. *

* *

URL Schemes

@@ -131,12 +133,12 @@ * schemes are extracted for absolute URLs. By default, those are * http, https, and ftp. You can * specify your own list of supported protocols with - * {@link #setSchemes(String[])}. + * {@link DomLinkExtractorConfig#setSchemes(java.util.List)}. *

* *

Applicable documents

*

- * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *

* {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} @@ -147,57 +149,11 @@ * won't be extracted (e.g. * <a href="x.html" rel="nofollow" ...>). * To force its extraction (and ensure it is followed) you can set - * {@link #setIgnoreNofollow(boolean)} to true. + * {@link DomLinkExtractorConfig#setIgnoreNofollow(boolean)} to true. *

* - * {@nx.xml.usage - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * (CSV list of URI scheme for which to perform link extraction. - * leave blank or remove tag to use defaults.) - * - * - * - * - * (selector syntax) - * - * - * - * (selector syntax) - * - * - * (selector syntax) - * - * - * } - * - * {@nx.xml.example - * - * a[href] - * [src] - * link[href] - * meta[http-equiv='refresh'] - * - * [data-myurl] - * - * } - * - *

- * The above example will extract URLs found in custom element attributes named - * data-myurl. - *
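
The removed example added a custom data-myurl attribute on top of the default selectors. A hedged Java equivalent, assuming the extractor exposes getConfiguration() and that the two-argument addLinkSelector(selector, extract) visible on DomLinkExtractorConfig accepts the same attr(...) extract syntax:

    var extractor = new DomLinkExtractor();
    extractor.getConfiguration()
            .addLinkSelector("[data-myurl]", "attr(data-myurl)");
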

* @since 3.0.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class DomLinkExtractor diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java index 23777c312..3df4b2286 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java @@ -36,160 +36,10 @@ /** *

- * Extracts links from a Document Object Model (DOM) representation of an - * HTML, XHTML, or XML document content based on values of matching - * elements and attributes. - *

- *

- * In order to construct a DOM tree, text is loaded entirely - * into memory. It uses the document content by default, but it can also - * come from specified metadata fields. - * Use this filter with caution if you know you'll need to parse - * huge files. Use the {@link HtmlLinkExtractor} instead if this is a - * concern. - *

- *

- * The jsoup parser library is used to load a - * document content into a DOM tree. Elements are referenced using a - * - * CSS or JQuery-like syntax. - *

- *

- * This link extractor is normally used before importing. - *

- * - *

When used before importing this class attempts to detect the content - * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. - *

- * - *

You can specify which parser to use when reading - * documents. The default is "html" and will normalize the content - * as HTML. This is generally a desired behavior, but this can sometimes - * have your selector fail. If you encounter this - * problem, try switching to "xml" parser, which does not attempt normalization - * on the content. The drawback with "xml" is you may not get all HTML-specific - * selector options to work. If you know you are dealing with XML to begin - * with, specifying "xml" should be a good option. - *

- * - *

Matching links

- *

- * You can define as many JSoup "selectors" as desired. All values matched - * by a selector will be extracted as a URL. - *

- *

- * It is possible to control what gets extracted - * exactly for matching purposes thanks to the "extract" argument expected - * with every selector. Possible values are: - *

- * - * {@nx.include com.norconex.importer.util.DomUtil#extract} - * - *

- * When not specified, the default is "text". - *

- * - *

The default selectors / extract strategies are:

- *
    - *
  • a[href] / attr(href)
  • - *
  • [src] / attr(src)
  • - *
  • link[href] / attr(href)
  • - *
  • meta[http-equiv='refresh'] / attr(content)
  • - *
- *

- * For any extracted link values, this extractor will perform minimal - * heuristics to clean extra content not part of a regular URL. For instance, - * it will only keep what is after url= when dealing with - * <meta http-equiv refresh URLs. It will also trim white - * spaces. - *

- * - *

Ignoring link data

- *

- * By default, contextual information is kept about the HTML/XML mark-up - * tag from which a link is extracted (e.g., tag name and attributes). - * That information gets stored as metadata in the target document. - * If you want to limit the quantity of information extracted/stored, - * you can disable this feature by setting - * {@link #ignoreLinkData} to true. - *

- * - *

URL Schemes

- *

Only valid - * - * schemes are extracted for absolute URLs. By default, those are - * http, https, and ftp. You can - * specify your own list of supported protocols with - * {@link #setSchemes(String[])}. - *

- * - *

Applicable documents

- *

- * By default, this extractor only will be applied on documents matching - * one of these content types: - *

- * {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} - * - *

"nofollow"

- *

- * By default, a regular HTML link having the "rel" attribute set to "nofollow" - * won't be extracted (e.g. - * <a href="x.html" rel="nofollow" ...>). - * To force its extraction (and ensure it is followed) you can set - * {@link #setIgnoreNofollow(boolean)} to true. - *

- * - * {@nx.xml.usage - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * (CSV list of URI scheme for which to perform link extraction. - * leave blank or remove tag to use defaults.) - * - * - * - * - * (selector syntax) - * - * - * - * (selector syntax) - * - * - * (selector syntax) - * - * - * } - * - * {@nx.xml.example - * - * a[href] - * [src] - * link[href] - * meta[http-equiv='refresh'] - * - * [data-myurl] - * - * } - * - *

- * The above example will extract URLs found in custom element attributes named - * data-myurl. + * Configuration for {@link DomLinkExtractor}. *

* @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class DomLinkExtractorConfig { @@ -209,8 +59,6 @@ public static class LinkSelector { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#DOM_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.domContentTypes(); @@ -218,8 +66,6 @@ public static class LinkSelector { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); @@ -229,21 +75,15 @@ public static class LinkSelector { /** * The assumed source character encoding. - * @param charset character encoding of the source content - * @return character encoding of the source content */ private Charset charset; /** * The parser to use when creating the DOM-tree. - * @param parser html or xml. - * @return html (default) or xml. */ private String parser = DomUtil.PARSER_HTML; private boolean ignoreNofollow; /** * Whether to ignore extra data associated with a link. - * @param ignoreLinkData true to ignore. - * @return true to ignore. */ private boolean ignoreLinkData; private final List schemes = new ArrayList<>(DEFAULT_SCHEMES); @@ -264,7 +104,7 @@ public DomLinkExtractorConfig setFieldMatcher(TextMatcher fieldMatcher) { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public DomLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { @@ -275,6 +115,7 @@ public DomLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { /** * Adds a new link selector extracting the "text" from matches. * @param selector JSoup selector + * @return this */ public DomLinkExtractorConfig addLinkSelector(String selector) { addLinkSelector(selector, null); @@ -305,6 +146,12 @@ public List getExtractSelectors() { return Collections.unmodifiableList(extractSelectors); } + /** + * Only apply link selectors to portions of a document + * matching the supplied selectors. + * @param selectors the CSS selectors + * @return this + */ public DomLinkExtractorConfig setExtractSelectors(List selectors) { CollectionUtil.setAll(extractSelectors, selectors); return this; @@ -319,6 +166,12 @@ public List getNoExtractSelectors() { return Collections.unmodifiableList(noExtractSelectors); } + /** + * Do not apply link selectors to portions of a document + * matching the supplied selectors. + * @param selectors the CSS selectors + * @return this + */ public DomLinkExtractorConfig setNoExtractSelectors( List selectors) { CollectionUtil.setAll(noExtractSelectors, selectors); @@ -342,6 +195,7 @@ public List getSchemes() { /** * Sets the schemes to be extracted. 
* @param schemes schemes to be extracted + * @return this */ public DomLinkExtractorConfig setSchemes(List schemes) { CollectionUtil.setAll(this.schemes, schemes); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java index 995461b5d..2d717cb30 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java @@ -35,6 +35,7 @@ import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector; import com.norconex.crawler.web.doc.operations.delay.DelayResolver; +import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver; @@ -61,6 +62,7 @@ public MultiValuedMap, Class> getPolymorphicTypes() { addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum"); addPolyType(map, EventListener.class, "event.listeners"); addPolyType(map, DelayResolver.class); + addPolyType(map, DelayRange.class); addPolyType( map, DocumentFilter.class, "doc.operations.filter"); //NOSONAR diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/WebTestUtil.java b/crawler/web/src/test/java/com/norconex/crawler/web/WebTestUtil.java index 47720edac..7799ac5f6 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/WebTestUtil.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/WebTestUtil.java @@ -68,7 +68,7 @@ import com.norconex.crawler.web.doc.operations.delay.DelayResolver; import com.norconex.crawler.web.doc.operations.delay.impl.BaseDelayResolverConfig.DelayResolverScope; import com.norconex.crawler.web.doc.operations.delay.impl.GenericDelayResolver; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessor; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolver; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.link.impl.DomLinkExtractor; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; @@ -153,7 +153,7 @@ public final class WebTestUtil { .excludeType(DataStore.class::equals) .excludeType(SitemapResolver.class::equals) .excludeType(DocumentConsumer.class::equals) - .excludeType(FeaturedImageProcessor.class::equals) + .excludeType(FeaturedImageResolver.class::equals) .excludeType(RecrawlableResolver.class::equals) .excludeType(ReferencesProvider.class::equals) .excludeType(BiPredicate.class::equals) diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java similarity index 94% rename from crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.java rename to crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java index 54af0a205..95957b6ab 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.java +++ 
b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java @@ -17,9 +17,9 @@ import static com.norconex.crawler.web.TestResource.IMG_160X120_PNG; import static com.norconex.crawler.web.TestResource.IMG_320X240_PNG; import static com.norconex.crawler.web.TestResource.IMG_640X480_PNG; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Storage.DISK; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Storage.INLINE; -import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Storage.URL; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage.DISK; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage.INLINE; +import static com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage.URL; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; @@ -42,14 +42,14 @@ import com.norconex.crawler.core.doc.CrawlDoc; import com.norconex.crawler.core.event.CrawlerEvent; import com.norconex.crawler.web.WebsiteMock; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Quality; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.Storage; -import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageProcessorConfig.StorageDiskStructure; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Quality; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage; +import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.StorageDiskStructure; import com.norconex.crawler.web.junit.WithCrawlerTest; import com.norconex.crawler.web.stubs.CrawlDocStubs; @MockServerSettings -class FeaturedImageProcessorTest { +class FeaturedImageResolverTest { private @TempDir Path tempDir; @@ -66,7 +66,7 @@ void testProcessFeaturedImage( var fetcher = crawler.getFetcher(); - var fip = new FeaturedImageProcessor(); + var fip = new FeaturedImageResolver(); fip.getConfiguration() .setStorage(List.of(INLINE, URL, DISK)) .setStorageDiskDir(tempDir.resolve("imageStorage")) @@ -122,7 +122,7 @@ void testProcessFeaturedImage( @Test void testWriteRead() { - var p = new FeaturedImageProcessor(); + var p = new FeaturedImageResolver(); // All settings p.getConfiguration() @@ -171,7 +171,7 @@ void testWriteRead() { // // set everything null by default? // // var read = BeanMapper.DEFAULT.writeRead(p, Format.XML); - // assertThat(read).isEqualTo(new FeaturedImageProcessor()); + // assertThat(read).isEqualTo(new FeaturedImageResolver()); // //// assertThatNoException().isThrownBy( //// () -> BeanMapper.DEFAULT.assertWriteRead(p)); From 78eede970b9ffaae013e61792a0d5500313b9715 Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Sat, 7 Sep 2024 01:29:16 -0400 Subject: [PATCH 02/10] JavaDoc + Unit test. 
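As a side note for readers following the rename above: the sketch below is illustrative only (it is not part of the patch) and shows how the renamed resolver could be configured, using only accessors exercised by FeaturedImageResolverTest; the directory values are made up.

// Illustrative sketch (not part of this patch): configuring the renamed
// FeaturedImageResolver with accessors exercised by FeaturedImageResolverTest.
// Directory values are hypothetical examples.
import java.nio.file.Path;
import java.util.List;

import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolver;
import com.norconex.crawler.web.doc.operations.image.impl.FeaturedImageResolverConfig.Storage;

public class FeaturedImageResolverExample {
    public static void main(String[] args) {
        var resolver = new FeaturedImageResolver();
        resolver.getConfiguration()
                .setStorages(List.of(Storage.URL, Storage.DISK))
                .setStorageDiskDir(Path.of("work/featured-images"))
                .setImageCacheDir(Path.of("work/featured-image-cache"));
    }
}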
--- .../norconex/crawler/core/CrawlerTest.java | 2 +- .../link/impl/HtmlLinkExtractor.java | 305 +++++------------- .../link/impl/HtmlLinkExtractorConfig.java | 239 +------------- .../link/impl/HtmlExtractorTest.java | 1 + 4 files changed, 102 insertions(+), 445 deletions(-) diff --git a/crawler/core/src/test/java/com/norconex/crawler/core/CrawlerTest.java b/crawler/core/src/test/java/com/norconex/crawler/core/CrawlerTest.java index e3f2b7a82..cba585d53 100644 --- a/crawler/core/src/test/java/com/norconex/crawler/core/CrawlerTest.java +++ b/crawler/core/src/test/java/com/norconex/crawler/core/CrawlerTest.java @@ -230,7 +230,7 @@ void testLifeCycle() { 1, new Condition<>( req -> req.getMetadata().getBoolean( - "collector.is-crawl-new"), + "crawler.is-crawl-new"), "")) .map(CommitterRequest::getReference) // ref1 is last because orphans are processed last diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java index 2b2d17d26..9804138c3 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java @@ -47,7 +47,6 @@ import com.norconex.crawler.web.doc.operations.link.Link; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractorConfig.RegexPair; -import com.norconex.crawler.web.doc.operations.url.WebUrlNormalizer; import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizer; import com.norconex.crawler.web.util.Web; @@ -70,13 +69,13 @@ * *

Applicable documents

*

- * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *

* {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} *

* You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. + * {@link HtmlLinkExtractorConfig#setContentTypeMatcher(com.norconex.commons.lang.text.TextMatcher)}. * Make sure they represent a file with HTML-like markup tags containing URLs. * For documents that are just * too different, consider implementing your own {@link LinkExtractor} instead. @@ -108,7 +107,8 @@ *

* The meta.http-equiv is treated differently. Only if the * "http-equiv" value is "refresh" and a "content" attribute with a URL exist - * that it will be extracted. "object" and "applet" can have multiple URLs. + * that it will be extracted. The "object" and "applet" tags can have + * multiple URLs. *

* *

@@ -124,7 +124,7 @@ * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. *

*

- * The referrer data is always stored (was optional before). + * The referrer data is always stored. *
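As a hypothetical illustration of the referrer paragraph above (not part of the patch), the stored referrer fields can be read back from a crawled document's metadata by matching on that prefix; the method name and printing are examples only.

// Sketch only: prints the referrer-related fields stored on a crawled
// document under the WebDocMetadata.REFERRER_LINK_PREFIX prefix.
import com.norconex.commons.lang.text.TextMatcher;
import com.norconex.crawler.core.doc.CrawlDoc;
import com.norconex.crawler.web.doc.WebDocMetadata;

public final class ReferrerMetadataExample {
    static void printReferrerFields(CrawlDoc doc) {
        doc.getMetadata()
                .matchKeys(TextMatcher.wildcard(
                        WebDocMetadata.REFERRER_LINK_PREFIX + "*"))
                .forEach((field, values) ->
                        System.out.println(field + "=" + values));
    }
}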

* *

Character encoding

@@ -132,7 +132,7 @@ * detect the encoding of the a page when extracting links and * referrer information. If no charset could be detected, it falls back to * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. + * {@link HtmlLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. *

* *

"nofollow"

@@ -141,15 +141,14 @@ * won't be extracted (e.g. * <a href="x.html" rel="nofollow" ...>). * To force its extraction (and ensure it is followed) you can set - * {@link #setIgnoreNofollow(boolean)} to true. + * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)} to + * true. *

* *

URL Fragments

- *

This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. + *

While extractor preserves hashtag characters (#) found + * in URLs and every characters after it, the default URL normalizer + * ({@link GenericUrlNormalizer}) will strip it by default. *

* *

@@ -171,7 +170,8 @@ * That information gets stored as metadata in the target document. * If you want to limit the quantity of information extracted/stored, * you can disable this feature by setting - * {@link #ignoreLinkData} to true. + * {@link HtmlLinkExtractorConfig#setIgnoreLinkData(boolean)} to + * true. *

* *

URL Schemes

@@ -180,96 +180,31 @@ * schemes are extracted for absolute URLs. By default, those are * http, https, and ftp. You can * specify your own list of supported protocols with - * {@link #setSchemes(String[])}. + * {@link HtmlLinkExtractorConfig#setSchemes(List)}. *

* *

HTML/XML Comments

- *

URLs found in <!-- comments --> are no longer + *

URLs found in <!-- comments --> are not * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} + * {@link HtmlLinkExtractorConfig#setCommentsEnabled(boolean)} *

* *

Extract links in certain parts only

*

You can identify portions of a document where links * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. + * {@link HtmlLinkExtractorConfig#setExtractBetweens(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractBetweens(List)}. Eligible + * content for link extraction is identified first, and content to exclude is + * done on that subset. *

*

You can further limit link extraction to specific * area by using * selector-syntax * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *

- * - * {@nx.xml.usage - * - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * (CSV list of URI scheme for which to perform link extraction. - * leave blank or remove tag to use defaults.) - * - * - * - * - * - * - * - * - * - * - * (regex) - * (regex) - * - * - * - * - * - * (regex) - * (regex) - * - * - * - * - * (selector) - * - * - * - * (selector) - * - * - * - * } - * - * {@nx.xml.example - * - * - * - * - * - * - * - * - * - * - * } - * - *

- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * {@link HtmlLinkExtractorConfig#setExtractSelectors(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractSelectors(List)}. *

*/ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -286,6 +221,7 @@ public class HtmlLinkExtractor private final HtmlLinkExtractorConfig configuration = new HtmlLinkExtractorConfig(); + // @formatter:off // NOTE: When this predicate is invoked the tag name is always lower case // and known to have been identified as a target tag name in configuration. // For each predicate, returning true won't try following predicates @@ -293,129 +229,69 @@ public class HtmlLinkExtractor @ToString.Exclude private final BiPredicate> tagLinksExtractor = - //--- From tag body --- - // When no attributes configured for a tag name, we take the body - // value as the URL. - ((BiPredicate>) (tag, links) -> Optional.of(tag) - .filter(t -> t.configAttribNames.isEmpty()) - .filter(t -> isNotBlank(t.bodyText)) - .map( - t -> toCleanAbsoluteURL( - t.referrer, - tag.bodyText.trim())) - .map(url -> addAsLink(links, url, tag, null)) + //--- From tag body --- + // When no attributes configured for a tag name, we take the body + // value as the URL. + ((BiPredicate>) (tag, links) -> Optional.of(tag) + .filter(t -> t.configAttribNames.isEmpty()) + .filter(t -> isNotBlank(t.bodyText)) + .map(t -> toCleanAbsoluteURL(t.referrer, tag.bodyText.trim())) + .map(url -> addAsLink(links, url, tag, null)) + .filter(Boolean::valueOf) + .orElse(false)) + //--- From meta http-equiv tag --- + // E.g.: : + .or((tag, links) -> Optional.of(tag) + .filter(t -> "meta".equals(t.name)) + .filter(t -> t.configAttribNames.contains(HTTP_EQUIV)) + .filter(t -> t.attribs.getStrings(HTTP_EQUIV) + .contains("refresh")) + .filter(t -> t.attribs.containsKey(CONTENT)) + // very unlikely that we have more than one + // redirect directives, but loop just in case + .map(t -> t.attribs + .getStrings(CONTENT) + .stream() + .map(LinkUtil::extractHttpEquivRefreshContentUrl) + .map(url -> toCleanAbsoluteURL(tag.referrer, url)) + .findFirst() + .map(url -> addAsLink(links, url, tag, CONTENT)) + .filter(Boolean::valueOf) + .orElse(false)) .filter(Boolean::valueOf) .orElse(false)) - //--- From meta http-equiv tag --- - // E.g.: : - .or( - (tag, links) -> Optional.of(tag) - .filter(t -> "meta".equals(t.name)) - .filter( - t -> t.configAttribNames - .contains( - HTTP_EQUIV)) - .filter( - t -> t.attribs - .getStrings( - HTTP_EQUIV) - .contains( - "refresh")) - .filter( - t -> t.attribs.containsKey( - CONTENT)) - // very unlikely that we have more than one redirect directives, - // but loop just in case - .map( - t -> t.attribs - .getStrings(CONTENT) - .stream() - .map( - LinkUtil::extractHttpEquivRefreshContentUrl) - .map( - url -> toCleanAbsoluteURL( - tag.referrer, - url)) - .findFirst() - .map( - url -> addAsLink( - links, - url, - tag, - CONTENT)) - .filter( - Boolean::valueOf) - .orElse(false)) - .filter(Boolean::valueOf) - .orElse(false)) - - //--- From anchor tag --- - // E.g.: ... 
- .or( - (tag, links) -> Optional.of(tag) - .filter(t -> "a".equals(t.name)) - .filter( - t -> t.configAttribNames - .contains("href")) - .filter( - t -> t.attribs - .containsKey( - "href")) - .filter( - t -> !hasActiveDoNotFollow( - t)) - .map( - t -> toCleanAbsoluteURL( - t.referrer, - t.attribs.getString( - "href"))) - .map( - url -> addAsLink( - links, url, tag, - "href")) - .filter(Boolean::valueOf) - .orElse(hasActiveDoNotFollow(tag)) // skip others if no follow - ) - - //--- From other matching attributes for tag --- - .or( - (tag, links) -> tag.configAttribNames - .stream() - .map( - cfgAttr -> Optional - .ofNullable( - tag.attribs - .getString( - cfgAttr)) - .map( - urlStr -> (EqualsUtil - .equalsAny( - tag.name, - "object", - "applet") - ? List.of( - StringUtils - .split( - urlStr, - ", ")) - : List.of( - urlStr)) - .stream() - .map( - url -> toCleanAbsoluteURL( - tag.referrer, - url)) - .map( - url -> addAsLink( - links, - url, - tag, - cfgAttr)) - .anyMatch( - Boolean::valueOf))) - .flatMap(Optional::stream) - .anyMatch(Boolean::valueOf)); + //--- From anchor tag --- + // E.g.: ... + .or((tag, links) -> Optional.of(tag) + .filter(t -> "a".equals(t.name)) + .filter(t -> t.configAttribNames.contains("href")) + .filter(t -> t.attribs.containsKey("href")) + .filter(t -> !hasActiveDoNotFollow(t)) + .map(t -> toCleanAbsoluteURL( + t.referrer, t.attribs.getString("href"))) + .map(url -> addAsLink(links, url, tag, "href")) + .filter(Boolean::valueOf) + // skip others if no follow + .orElse(hasActiveDoNotFollow(tag)) + ) + + //--- From other matching attributes for tag --- + .or((tag, links) -> tag.configAttribNames + .stream() + .map(cfgAttr -> Optional.ofNullable( + tag.attribs.getString(cfgAttr)) + .map(urlStr -> (EqualsUtil.equalsAny( + tag.name, "object", "applet") + ? List.of(StringUtils.split(urlStr, ", ")) + : List.of(urlStr)) + .stream() + .map(url -> toCleanAbsoluteURL(tag.referrer, url)) + .map(url -> addAsLink(links, url, tag, cfgAttr)) + .anyMatch(Boolean::valueOf))) + .flatMap(Optional::stream) + .anyMatch(Boolean::valueOf)); + // @formatter:on @Override public Set extractLinks(CrawlDoc doc) throws IOException { @@ -434,10 +310,8 @@ public Set extractLinks(CrawlDoc doc) throws IOException { doc.getMetadata() .matchKeys(configuration.getFieldMatcher()) .valueList() - .forEach( - val -> extractLinksFromText( - links, val, refererUrl, - true)); + .forEach(val -> extractLinksFromText( + links, val, refererUrl, true)); } else { // Body try (var r = new TextReader( @@ -720,11 +594,10 @@ private String toCleanAbsoluteURL( if (url.length() > configuration.getMaxURLLength()) { if (LOG.isDebugEnabled()) { - LOG.debug( - """ - URL length ({}) exceeding maximum length allowed\s\ - ({}) to be extracted. URL (showing first {} chars):\s\ - {}...""", + LOG.debug(""" + URL length ({}) exceeding maximum length allowed\s\ + ({}) to be extracted. 
URL (showing first {} chars):\s\ + {}...""", url.length(), configuration.getMaxURLLength(), LOGGING_MAX_URL_LENGTH, diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java index e1ab3e558..019f5198d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java @@ -24,10 +24,6 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.map.Properties; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; -import com.norconex.crawler.web.doc.operations.link.LinkExtractor; -import com.norconex.crawler.web.doc.operations.url.WebUrlNormalizer; -import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizer; import com.norconex.importer.handler.CommonMatchers; import lombok.Data; @@ -35,218 +31,9 @@ /** *

- * A memory efficient HTML link extractor. - *

- *

- * This link extractor uses regular expressions to extract links. It does - * so on a chunk of text at a time, so that large files are not fully loaded - * into memory. If you prefer a more flexible implementation that loads the - * DOM model in memory to perform link extraction, consider using - * {@link DomLinkExtractor}. - *

- * - *

Applicable documents

- *

- * By default, this extractor only will be applied on documents matching - * one of these content types: - *

- * {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} - *

- * You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. - * Make sure they represent a file with HTML-like markup tags containing URLs. - * For documents that are just - * too different, consider implementing your own {@link LinkExtractor} instead. - * Removing the default values and define no content types will have for effect - * to try to extract URLs from all files (usually a bad idea). - *

- * - *

Tags attributes

- * URLs are assumed to be contained within valid tags or tag attributes. - * The default tags and attributes used are (tag.attribute): - *
- * a.href, frame.src, iframe.src, img.src, meta.http-equiv
- * 
- * You can specify your own set of tags and attributes to have - * different ones used for extracting URLs. For an elaborated set, you can - * combine the above with your own list or use any of the following - * suggestions (tag.attribute): - *
- * applet.archive,   applet.codebase,  area.href,         audio.src,
- * base.href,        blockquote.cite,  body.background,   button.formaction,
- * command.icon,     del.cite,         embed.src,         form.action,
- * frame.longdesc,   head.profile,     html.manifest,     iframe.longdesc,
- * img.longdesc,     img.usemap,       input.formaction,  input.src,
- * input.usemap,     ins.cite,         link.href,         object.archive,
- * object.classid,   object.codebase,  object.data,       object.usemap,
- * q.cite,           script.src,       source.src,        video.poster,
- * video.src
- * 
- *

- * The meta.http-equiv is treated differently. Only if the - * "http-equiv" value is "refresh" and a "content" attribute with a URL exist - * that it will be extracted. "object" and "applet" can have multiple URLs. - *

- * - *

- * It is possible to identify a tag only as the holder of - * a URL (without attributes). The tag body value will be used as the URL. - *

- * - *

Referrer data

- *

- * Some "referrer" information is derived from the each link and stored as - * metadata in the document they point to. - * These may vary for each link, but they are normally prefixed with - * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. - *

- *

- * The referrer data is always stored (was optional before). - *

- * - *

Character encoding

- *

This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *

- * - *

"nofollow"

- *

- * By default, a regular HTML link having the "rel" attribute set to "nofollow" - * won't be extracted (e.g. - * <a href="x.html" rel="nofollow" ...>). - * To force its extraction (and ensure it is followed) you can set - * {@link #setIgnoreNofollow(boolean)} to true. - *

- * - *

URL Fragments

- *

This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. - *

- * - *

- * The URL specification says hashtags - * are used to represent fragments only. That is, to quickly jump to a specific - * section of the page the URL represents. Under normal circumstances, - * keeping the URL fragments usually leads to duplicates documents being fetched - * (same URL but different fragment) and they should be stripped. Unfortunately, - * there are sites not following the URL standard and using hashtags as a - * regular part of a URL (i.e. different hashtags point to different web pages). - * It may be essential when crawling these sites to keep the URL fragments. - * This can be done by making sure the URL normalizer does not strip them. - *

- * - *

Ignoring link data

- *

- * By default, contextual information is kept about the HTML/XML mark-up - * tag from which a link is extracted (e.g., tag name and attributes). - * That information gets stored as metadata in the target document. - * If you want to limit the quantity of information extracted/stored, - * you can disable this feature by setting - * {@link #ignoreLinkData} to true. - *

- * - *

URL Schemes

- *

Only valid - * - * schemes are extracted for absolute URLs. By default, those are - * http, https, and ftp. You can - * specify your own list of supported protocols with - * {@link #setSchemes(String[])}. - *

- * - *

HTML/XML Comments

- *

URLs found in <!-- comments --> are no longer - * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} - *

- * - *

Extract links in certain parts only

- *

You can identify portions of a document where links - * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. - *

- *

You can further limit link extraction to specific - * area by using - * selector-syntax - * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *

- * - * {@nx.xml.usage - * - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * (CSV list of URI scheme for which to perform link extraction. - * leave blank or remove tag to use defaults.) - * - * - * - * - * - * - * - * - * - * - * (regex) - * (regex) - * - * - * - * - * - * (regex) - * (regex) - * - * - * - * - * (selector) - * - * - * - * (selector) - * - * - * - * } - * - * {@nx.xml.example - * - * - * - * - * - * - * - * - * - * - * } - * - *

- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * Configuration for {@link HtmlLinkExtractor}. *

*/ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HtmlLinkExtractorConfig { @@ -266,8 +53,6 @@ public class HtmlLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.htmlContentTypes(); @@ -275,15 +60,11 @@ public class HtmlLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); /** * The maximum supported URL length. Longer URLs are ignored. - * @param maxURLLength maximum URL length - * @return maximum URL length */ private int maxURLLength = DEFAULT_MAX_URL_LENGTH; @@ -295,15 +76,11 @@ public class HtmlLinkExtractorConfig { * By default this link won't be crawled. * </a> *
- * @param ignoreNofollow whether to ignore "nofollow" directives - * @return true if ignoring "nofollow" directives */ private boolean ignoreNofollow; /** * Gets whether to ignore extra data associated with a link. - * @param ignoreLinkData true to ignore. - * @return true to ignore. */ private boolean ignoreLinkData; @@ -311,8 +88,6 @@ public class HtmlLinkExtractorConfig { * The character set to use for pages on which link extraction is performed. * When null (default), character set detection will be * attempted. - * @param charset character set to use, or null - * @return character set to use, or null */ private Charset charset; @@ -324,7 +99,6 @@ public class HtmlLinkExtractorConfig { * <a href="https://yoursite.com/somepage.html">Some URL</a> * --> *
- * @return true if links should be extracted from comments. */ private boolean commentsEnabled; @@ -357,6 +131,7 @@ public List getExtractBetweens() { * Sets the patterns delimiting the portions of a document to be considered * for link extraction. * @param betweens extract between patterns + * @return this */ public HtmlLinkExtractorConfig setExtractBetweens( List betweens) { @@ -370,6 +145,7 @@ public HtmlLinkExtractorConfig setExtractBetweens( * @param start pattern matching start of text portion * @param end pattern matching end of text portion * @param ignoreCase whether the patterns are case sensitive or not + * @return this */ public HtmlLinkExtractorConfig addExtractBetween( String start, String end, boolean ignoreCase) { @@ -390,6 +166,7 @@ public List getNoExtractBetweens() { * Sets the patterns delimiting the portions of a document to be excluded * from link extraction. * @param betweens extract between patterns + * @return this */ public HtmlLinkExtractorConfig setNoExtractBetweens( List betweens) { @@ -403,6 +180,7 @@ public HtmlLinkExtractorConfig setNoExtractBetweens( * @param start pattern matching start of text portion * @param end pattern matching end of text portion * @param ignoreCase whether the patterns are case sensitive or not + * @return this */ public HtmlLinkExtractorConfig addNoExtractBetween( String start, String end, boolean ignoreCase) { @@ -423,6 +201,7 @@ public List getExtractSelectors() { * Sets the selectors matching the portions of a document to be considered * for link extraction. * @param selectors selectors + * @return this */ public HtmlLinkExtractorConfig setExtractSelectors( List selectors) { @@ -434,6 +213,7 @@ public HtmlLinkExtractorConfig setExtractSelectors( * Adds selectors matching the portions of a document to be considered * for link extraction. * @param selectors selectors + * @return this */ public HtmlLinkExtractorConfig addExtractSelectors( List selectors) { @@ -454,6 +234,7 @@ public List getNoExtractSelectors() { * Sets the selectors matching the portions of a document to be excluded * from link extraction. * @param selectors selectors + * @return this */ public HtmlLinkExtractorConfig setNoExtractSelectors( List selectors) { @@ -465,6 +246,7 @@ public HtmlLinkExtractorConfig setNoExtractSelectors( * Adds selectors matching the portions of a document to be excluded * from link extraction. * @param selectors selectors + * @return this */ public HtmlLinkExtractorConfig addNoExtractSelectors( List selectors) { @@ -483,6 +265,7 @@ public List getSchemes() { /** * Sets the schemes to be extracted. * @param schemes schemes to be extracted + * @return this */ public HtmlLinkExtractorConfig setSchemes(List schemes) { CollectionUtil.setAll(this.schemes, schemes); @@ -498,7 +281,7 @@ public HtmlLinkExtractorConfig setFieldMatcher(TextMatcher fieldMatcher) { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. 
- * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public HtmlLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlExtractorTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlExtractorTest.java index f07dabbc3..e037b1732 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlExtractorTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlExtractorTest.java @@ -176,6 +176,7 @@ void testHtmlWriteRead() { htmlExtractor.getConfiguration() .setIgnoreNofollow(true) .addLinkTag("food", "chocolate") + .addLinkTag("food", "candy") .addLinkTag("friend", "Thor") .addExtractBetween("start1", "end1", true) .addExtractBetween("start2", "end2", false) From a9b20684118040b848cb3a7215bc90c2bed01550 Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Sat, 7 Sep 2024 13:18:08 -0400 Subject: [PATCH 03/10] Update FeaturedImageResolverTest.java --- .../doc/operations/image/impl/FeaturedImageResolverTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java index 95957b6ab..6be8c60fa 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.java @@ -68,7 +68,7 @@ void testProcessFeaturedImage( var fip = new FeaturedImageResolver(); fip.getConfiguration() - .setStorage(List.of(INLINE, URL, DISK)) + .setStorages(List.of(INLINE, URL, DISK)) .setStorageDiskDir(tempDir.resolve("imageStorage")) .setImageCacheDir(tempDir.resolve("imageCache")) .setStorageInlineField("image-inline") @@ -136,7 +136,7 @@ void testWriteRead() { .setScaleQuality(Quality.LOW) .setScaleDimensions(new Dimension(50, 50)) .setScaleStretch(true) - .setStorage(List.of(Storage.URL, Storage.INLINE, Storage.DISK)) + .setStorages(List.of(Storage.URL, Storage.INLINE, Storage.DISK)) .setStorageDiskDir(Paths.get("c:\\someotherdir")) .setStorageDiskStructure(StorageDiskStructure.DATETIME) .setStorageDiskField("diskField") From f01f7591c0d4fd825598a23b900d626ae5f6f471 Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Sun, 8 Sep 2024 02:19:48 -0400 Subject: [PATCH 04/10] JavaDoc + Unit tests. 
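For context on the JavaDoc reworked above, here is a small illustrative sketch (not part of the patch) that configures HtmlLinkExtractor through its configuration class, using only accessors that appear in this patch; the tag, attribute, scheme, and delimiter values are made up.

// Illustrative sketch (not part of this patch). All values are examples.
import java.util.List;

import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractor;

public class HtmlLinkExtractorExample {
    public static void main(String[] args) {
        var extractor = new HtmlLinkExtractor();
        extractor.getConfiguration()
                // follow links even when rel="nofollow" is present
                .setIgnoreNofollow(true)
                // only extract absolute URLs with these schemes
                .setSchemes(List.of("http", "https"))
                // also treat <script src="..."> as a link holder
                .addLinkTag("script", "src")
                // limit extraction to a marked portion of the page
                .addExtractBetween(
                        "<!-- links-start -->", "<!-- links-end -->", true);
    }
}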
--- .../com/norconex/crawler/web/WebCrawler.java | 22 +-- .../crawler/web/WebCrawlerConfig.java | 164 ++++-------------- .../crawler/web/doc/WebCrawlDocContext.java | 12 -- .../crawler/web/doc/WebDocMetadata.java | 8 - .../image/impl/FeaturedImageResolver.java | 7 +- .../impl/FeaturedImageResolverConfig.java | 5 +- .../doc/operations/link/LinkExtractor.java | 4 +- .../link/impl/DomLinkExtractor.java | 7 + .../link/impl/DomLinkExtractorConfig.java | 18 ++ .../link/impl/HtmlLinkExtractor.java | 7 + .../link/impl/HtmlLinkExtractorConfig.java | 18 +- .../link/impl/RegexLinkExtractor.java | 46 +---- .../link/impl/RegexLinkExtractorConfig.java | 100 +++-------- .../link/impl/TikaLinkExtractor.java | 18 +- .../link/impl/TikaLinkExtractorConfig.java | 48 +++-- .../link/impl/XmlFeedLinkExtractor.java | 25 +-- .../link/impl/XmlFeedLinkExtractorConfig.java | 71 +++----- .../impl/GenericRecrawlableResolver.java | 106 +++-------- .../GenericRecrawlableResolverConfig.java | 97 +++-------- .../scope/impl/GenericUrlScopeResolver.java | 4 +- .../impl/GenericUrlScopeResolverConfig.java | 22 +-- .../url/impl/GenericUrlNormalizer.java | 45 ----- .../url/impl/GenericUrlNormalizerConfig.java | 114 +----------- .../UrlStatusCrawlerEventListener.java | 32 +--- .../UrlStatusCrawlerEventListenerConfig.java | 98 +---------- .../crawler/web/fetch/HttpFetcher.java | 58 ------- .../web/fetch/HttpFetcherProvider.java | 5 +- .../web/fetch/impl/GenericHttpFetcher.java | 108 +----------- .../fetch/impl/GenericHttpFetcherConfig.java | 80 +++------ .../web/fetch/impl/HttpAuthConfig.java | 99 ++--------- .../web/fetch/impl/webdriver/HttpSniffer.java | 20 +-- .../impl/webdriver/HttpSnifferConfig.java | 60 +++---- .../impl/webdriver/ScreenshotHandler.java | 34 +--- .../webdriver/ScreenshotHandlerConfig.java | 21 +-- .../impl/webdriver/WebDriverHttpFetcher.java | 130 ++------------ .../webdriver/WebDriverHttpFetcherConfig.java | 104 ++++++++++- .../web/fetch/util/DocImageHandler.java | 51 ------ .../web/fetch/util/DocImageHandlerConfig.java | 48 ++--- .../util/GenericRedirectUrlProvider.java | 111 +++++------- .../GenericRedirectUrlProviderConfig.java | 34 ++++ .../impl/StandardRobotsMetaProvider.java | 14 -- .../StandardRobotsMetaProviderConfig.java | 34 +--- .../robot/impl/StandardRobotsTxtProvider.java | 12 -- .../sitemap/impl/GenericSitemapLocator.java | 17 -- .../impl/GenericSitemapLocatorConfig.java | 39 ++--- .../web/sitemap/impl/SitemapParser.java | 18 +- .../crawler/web/sitemap/impl/SitemapUtil.java | 4 - .../crawler/web/spi/CrawlerWebPtProvider.java | 2 - .../com/norconex/crawler/web/util/Web.java | 90 ++-------- .../web/cases/feature/StayOnSitemapTest.java | 2 +- ...st.html => FeaturedImageResolverTest.html} | 0 .../impl/GenericRecrawlableResolverTest.java | 18 +- .../AbstractWebDriverHttpFetcherTest.java | 9 +- .../WebDriverHttpFetcherConfigTest.java | 18 +- .../util/GenericRedirectUrlProviderTest.java | 2 +- .../validation/web-crawl-session-large.xml | 9 +- .../importer/handler/CommonMatchers.java | 10 ++ 57 files changed, 616 insertions(+), 1743 deletions(-) create mode 100644 crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java rename crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/{FeaturedImageProcessorTest.html => FeaturedImageResolverTest.html} (100%) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java b/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java index 0453dc015..cca040ae0 100644 --- 
a/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawler.java @@ -59,26 +59,8 @@ public static int launch(String... args) { public static Crawler create(WebCrawlerConfig crawlerConfig) { return crawlerBuilderSupplier .get() - .configuration( - Optional.ofNullable(crawlerConfig) - .orElseGet(WebCrawlerConfig::new)) + .configuration(Optional.ofNullable(crawlerConfig) + .orElseGet(WebCrawlerConfig::new)) .build(); } - - // static CrawlSessionImpl initCrawlSessionImpl( - // CrawlSessionConfig sessionConfig) { - // return CrawlSessionImpl - // .builder() - // .crawlerConfigClass(WebCrawlerConfig.class) - // .crawlerFactory( - // (sess, cfg) -> Crawler.builder() - // .crawlSession(sess) - // .crawlerConfig(cfg) - // .crawlerImpl(WebCrawlerImplFactory.create()) - // .build() - // ) - // .beanMapper(Web.beanMapper()) - // .crawlSessionConfig(sessionConfig) - // .build(); - // } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawlerConfig.java index 6685ca5a3..5436d97e4 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/WebCrawlerConfig.java @@ -28,8 +28,10 @@ import com.norconex.crawler.core.doc.operations.checksum.DocumentChecksummer; import com.norconex.crawler.core.doc.operations.checksum.MetadataChecksummer; import com.norconex.crawler.core.doc.operations.checksum.impl.Md5DocumentChecksummer; +import com.norconex.crawler.core.doc.operations.spoil.SpoiledReferenceStrategizer; import com.norconex.crawler.core.doc.pipelines.queue.ReferencesProvider; import com.norconex.crawler.core.fetch.FetchDirectiveSupport; +import com.norconex.crawler.core.store.DataStoreEngine; import com.norconex.crawler.core.store.impl.mvstore.MvStoreDataStoreEngine; import com.norconex.crawler.web.doc.WebDocMetadata; import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector; @@ -39,6 +41,7 @@ import com.norconex.crawler.web.doc.operations.delay.impl.GenericDelayResolver; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractor; +import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractorConfig; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolver; import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver; @@ -64,7 +67,8 @@ /** *

- * Web Crawler configuration. + * Web Crawler configuration, adding more options to the base + * {@link CrawlerConfig}. *

*

Start URLs

*

@@ -90,7 +94,7 @@ * Scope: To limit crawling to specific web domains, and avoid creating * many filters to that effect, you can tell the crawler to "stay" within * the web site "scope" with - * {@link #setUrlCrawlScopeStrategy(GenericUrlScopeResolver)}. + * {@link #setUrlScopeResolver(UrlScopeResolver)}. *
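As a hypothetical illustration of the renamed accessor above (not part of the patch), and assuming GenericUrlScopeResolver exposes a stay-on-domain switch on its configuration, limiting a crawl to the start URL domains might look like this:

// Sketch only: keeps the crawl within the domains of the start URLs.
// Assumes GenericUrlScopeResolverConfig offers setStayOnDomain(boolean).
import com.norconex.crawler.web.WebCrawlerConfig;
import com.norconex.crawler.web.doc.operations.scope.impl.GenericUrlScopeResolver;

public class UrlScopeExample {
    public static void main(String[] args) {
        var scopeResolver = new GenericUrlScopeResolver();
        scopeResolver.getConfiguration().setStayOnDomain(true); // assumption
        var config = new WebCrawlerConfig();
        config.setUrlScopeResolver(scopeResolver);
    }
}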

* *

URL Normalization

@@ -195,7 +199,7 @@ * (see {@link MvStoreDataStoreEngine}). While very capable and suitable * for most sites, if you need a larger storage system, you can change * the default implementation or provide your own - * with {@link #setDataStoreEngine(IDataStoreEngine)}. + * with {@link #setDataStoreEngine(DataStoreEngine)}. *

* *

Document Importing

@@ -214,7 +218,7 @@ * and are suddenly failing on a subsequent crawl are considered "spoiled". * You can decide whether to grace (retry next time), delete, or ignore * those spoiled documents with - * {@link #setSpoiledReferenceStrategizer(ISpoiledReferenceStrategizer)}. + * {@link #setSpoiledReferenceStrategizer(SpoiledReferenceStrategizer)}. *

* *

Committing Documents

@@ -238,7 +242,7 @@ * is needed. For instance, JavaScript-generated web pages are often best * handled by web browsers. In such case you can use the * {@link WebDriverHttpFetcher}. You can also use - * {@link #setHttpFetchers(List)} to supply own fetcher implementation. + * {@link #setFetchers(List)} to supply your own fetcher implementation. *

* *

HTTP Methods

@@ -250,7 +254,7 @@ *

*

* You can tell the crawler how it should handle HTTP GET and HEAD requests - * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport) and + * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport)} and * {@link #setMetadataFetchSupport(FetchDirectiveSupport)} respectively. * For each, the options are: *

@@ -302,15 +306,18 @@ * Metadata filters: Applies filtering on a document metadata fields. *

*

- * If {@link #isFetchHttpHead()} returns true, these filters - * will be invoked after the crawler performs a distinct HTTP HEAD request. + * If {@link #getMetadataFetchSupport()} value forces a distinct call + * for fetching metadata, these filters will be invoked after the crawler + * performs an HTTP HEAD request. * It gives you the opportunity to filter documents based on the HTTP HEAD * response to potentially save a more expensive HTTP GET request for * download (but results in two HTTP requests for valid documents -- - * HEAD and GET). Filtering occurs before URLs are extracted. + * HEAD and GET). Filtering occurs before URLs are extracted (since + * no content is downloaded. *

*

- * When {@link #isFetchHttpHead()} is false, these filters + * When {@link #getMetadataFetchSupport()} does not invoke making a + * distinct call for metadata, these filters * will be invoked on the metadata of the HTTP response * obtained from an HTTP GET request (as the document is downloaded). * Filtering occurs after URLs are extracted. @@ -326,10 +333,11 @@ * Importer filters: The Importer module also offers document * filtering options. At that point a document is already downloaded * and its links extracted. There are two types of filtering offered - * by the Importer: before and after document parsing. Use - * filters before parsing if you need to filter on raw content or - * want to prevent an expensive parsing. Use filters after parsing - * when you need to read the content as plain text. + * by the Importer: before and after document parsing (assuming you + * configured at least one parser). Use filters before parsing if you + * need to filter on raw content or want to avoid parsing some documents. + * Use filters after parsing when you need to read the content + * as plain text. * * * @@ -362,7 +370,7 @@ * HTML "nofollow": Most HTML-oriented link extractors support * the rel="nofollow" attribute set on HTML links and offer * a way to disable this instruction. E.g., - * {@link HtmlLinkExtractor#setIgnoreNofollow(boolean)}. + * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)}. * *

  • * Sitemap: Sitemaps XML files contain as listing of @@ -373,7 +381,7 @@ * offers support for disabling sitemap detection to rely only * on sitemap start references. * Setting it to null via - * {@link #setSitemapResolver(SitemapResolver_OLD) effectively disables + * {@link #setSitemapResolver(SitemapResolver)} effectively disables * sitemap support altogether, and is thus incompatible with sitemaps * specified as start references. *
  • @@ -383,7 +391,7 @@ * HTTP response instructions. * Defaults to {@link GenericCanonicalLinkDetector}. * Set to null via - * {@link #setCanonicalLinkDetector(CanonicalLinkDetector) to disable + * {@link #setCanonicalLinkDetector(CanonicalLinkDetector)} to disable * support canonical links (increasing the chance of getting duplicates). * *
  • @@ -426,7 +434,6 @@ * *

    Deduplication

    *

    - * EXPERIMENTAL: * The crawler can attempt to detect and reject documents considered as * duplicates within a crawler session. A document will be considered * duplicate if there was already a document processed with the same @@ -434,7 +441,7 @@ * {@link #setMetadataDeduplicate(boolean)} and/or * {@link #setDocumentDeduplicate(boolean)} to true. Setting * those will have no effect if the corresponding checksummers are - * null. + * null or checksums are otherwise not are being generated. *

    *

    * Deduplication can impact crawl performance. It is recommended you @@ -461,99 +468,10 @@ * URLs in that field will become eligible for crawling. * See {@link #setPostImportLinks(TextMatcher)}. *

    - * - * {@nx.xml.usage - * - * - * - * - * (a URL) - * (local path to a file containing URLs) - * (URL to a sitemap XML) - * - * - * - * - * - * - * - * (maximum crawl depth) - * [INSCOPE|OUTSCOPE|MAXDEPTH] - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#init} - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#directive-meta} - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#directive-doc} - * - * - * - * - * - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#pipeline-queue} - * - * - * - * - * - * - * - * - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#checksum-meta} - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#dedup-meta} - * - * - * - * - * - * - * - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#pipeline-import} - * - * - * - * - * - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#import} - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#checksum-doc} - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#dedup-doc} - * - * - * - * - * - * - * - * - * - * - * {@nx.include com.norconex.crawler.core.crawler.CrawlerConfig#pipeline-committer} - * - * } */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) @FieldNameConstants - -//TODO Given we don't need @schema here to pick up javadoc -// when we have the maven-compiler-plugin setup properly... -// do we need to include all compile maven dependencies we are including for open api? -// maybe need to add to maven-compiler plugin the swagger stuff, like in core? -//@Schema //(description = "Web crawler configuration.") public class WebCrawlerConfig extends CrawlerConfig { /** @@ -573,23 +491,17 @@ public enum ReferencedLinkType { /** * The strategy to use to determine if a URL is in scope. - * @param genericUrlScopeResolver strategy to use - * @return the strategy */ private UrlScopeResolver urlScopeResolver = new GenericUrlScopeResolver(); /** * The URL normalizer. Defaults to {@link GenericUrlNormalizer}. - * @param urlNormalizer URL normalizer - * @return URL normalizer */ private WebUrlNormalizer urlNormalizer = new GenericUrlNormalizer(); /** * The delay resolver dictating the minimum amount of time to wait * between web requests. Defaults to {@link GenericDelayResolver}. - * @param delayResolver delay resolver - * @return delay resolver */ private DelayResolver delayResolver = new GenericDelayResolver(); @@ -597,8 +509,6 @@ public enum ReferencedLinkType { * The canonical link detector. To disable canonical link detection, * use {@link #setIgnoreCanonicalLinks(boolean)}. * Defaults to {@link GenericCanonicalLinkDetector}. - * @param canonicalLinkDetector the canonical link detector - * @return the canonical link detector */ private CanonicalLinkDetector canonicalLinkDetector = new GenericCanonicalLinkDetector(); @@ -608,10 +518,10 @@ public enum ReferencedLinkType { private TextMatcher postImportLinks = new TextMatcher(); /** - * Whether to keep the Importer-generated field holding URLs to queue - * for further crawling. - * @param postImportLinksKeep true if keeping - * @return true if keeping + * Whether to keep the Importer-populated fields + * from {@link #getPostImportLinks()}. By default, those are deleted + * from a document when the URLs they contain are queued for processing + * or otherwise evaluated. 
* @see #setPostImportLinks(TextMatcher) */ private boolean postImportLinksKeep; @@ -620,8 +530,6 @@ public enum ReferencedLinkType { * The provider of robots.txt rules for a site (if applicable). * Defaults to {@link StandardRobotsTxtProvider}. * Set to null to disable. - * @param robotsTxtProvider robots.txt provider - * @return robots.txt provider * @see #setIgnoreRobotsTxt(boolean) */ private RobotsTxtProvider robotsTxtProvider = @@ -631,8 +539,6 @@ public enum ReferencedLinkType { * The provider of robots metadata rules for a page (if applicable). * Defaults to {@link StandardRobotsMetaProvider}. * Set to null to disable. - * @param robotsMetaProvider robots metadata rules - * @return robots metadata rules r * @see #setIgnoreRobotsMeta(boolean) */ private RobotsMetaProvider robotsMetaProvider = @@ -643,8 +549,6 @@ public enum ReferencedLinkType { * Defaults to {@link GenericSitemapResolver}. * Set to null to disable all sitemap support, or * see class documentation to disable sitemap detection only. - * @param sitemapResolver sitemap resolver - * @return sitemap resolver * @see SitemapLocator */ private SitemapResolver sitemapResolver = new GenericSitemapResolver(); @@ -654,8 +558,6 @@ public enum ReferencedLinkType { * Defaults to {@link GenericSitemapLocator}. * Set to null to disable locating sitemaps * (relying on sitemaps defined as start reference, if any). - * @param sitemapLocator sitemap locator - * @return sitemap locator * @see SitemapResolver */ private SitemapLocator sitemapLocator = new GenericSitemapLocator(); @@ -665,8 +567,6 @@ public enum ReferencedLinkType { * crawled by a new crawl session. Usually amounts to checking if enough * time has passed between two crawl sessions. * Defaults to {@link GenericRecrawlableResolver}. - * @param robotsMetaProvider recrawlable resolver - * @return recrawlableResolver recrawlable resolver */ private RecrawlableResolver recrawlableResolver = new GenericRecrawlableResolver(); @@ -690,6 +590,7 @@ public List getStartReferencesSitemaps() { /** * Sets the sitemap URLs used as starting points for crawling. * @param startReferencesSitemaps sitemap URLs + * @return this * @since 3.0.0 */ public WebCrawlerConfig setStartReferencesSitemaps( @@ -716,6 +617,7 @@ public Set getKeepReferencedLinks() { * Those links are URLs extracted by link extractors. See class * documentation for more details. * @param keepReferencedLinks option for keeping links + * @return this * @since 3.0.0 */ public WebCrawlerConfig setKeepReferencedLinks( @@ -735,6 +637,7 @@ public List getLinkExtractors() { /** * Sets link extractors. * @param linkExtractors link extractors + * @return this * @since 3.0.0 */ public WebCrawlerConfig setLinkExtractors( @@ -757,6 +660,7 @@ public TextMatcher getPostImportLinks() { * Set a field matcher used to identify post-import metadata fields * holding URLs to consider for crawling. * @param fieldMatcher field matcher + * @return this * @since 3.0.0 */ public WebCrawlerConfig setPostImportLinks(TextMatcher fieldMatcher) { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebCrawlDocContext.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebCrawlDocContext.java index a12348bbe..026e3e51b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebCrawlDocContext.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebCrawlDocContext.java @@ -47,26 +47,17 @@ public class WebCrawlDocContext extends CrawlDocContext { /** * The document last modified date according to sitemap. 
- * @param sitemapLastMod document last modified date - * @return document last modified date */ - @SuppressWarnings("javadoc") private ZonedDateTime sitemapLastMod; /** * The document change frequency according to sitemap. - * @param sitemapChangeFreq document change frequency - * @return document change frequency */ - @SuppressWarnings("javadoc") private String sitemapChangeFreq; /** * The document priority according to sitemap. - * @param sitemapPriority document priority - * @return document priority */ - @SuppressWarnings("javadoc") private Float sitemapPriority; private String referrerReference; @@ -74,11 +65,8 @@ public class WebCrawlDocContext extends CrawlDocContext { /** * The HTTP ETag. - * @return etag - * @param etag the HTTP ETag * @since 3.0.0 */ - @SuppressWarnings("javadoc") private String etag; private final List referencedUrls = new ArrayList<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebDocMetadata.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebDocMetadata.java index 181fcb2c9..d573f421e 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebDocMetadata.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/WebDocMetadata.java @@ -16,7 +16,6 @@ import static com.norconex.crawler.core.doc.CrawlDocMetadata.PREFIX; -import com.norconex.crawler.core.doc.CrawlDocMetadata; import com.norconex.importer.doc.DocMetadata; /** @@ -42,13 +41,6 @@ public final class WebDocMetadata { /** @since 3.0.0 */ public static final String ORIGINAL_REFERENCE = PREFIX + "original-reference"; - /** - * @since 3.0.0 - * @deprecated since 4.0.0, {@link CrawlDocMetadata#FETCHER} is used instead - */ - @Deprecated - public static final String HTTP_FETCHER = - PREFIX + "http-fetcher"; private WebDocMetadata() { } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java index 58bc45d24..41ce6eaa4 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolver.java @@ -103,11 +103,8 @@ @ToString @Slf4j public class FeaturedImageResolver - extends - CrawlerLifeCycleListener - implements - DocumentConsumer, - Configurable { + extends CrawlerLifeCycleListener + implements DocumentConsumer, Configurable { //TODO add ability to extract from popular HTML for // featured image diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java index f931dae7e..e8d408c58 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverConfig.java @@ -40,6 +40,9 @@ @Accessors(chain = true) public class FeaturedImageResolverConfig { + //TODO consider taking advantage of DocImageHandlerConfig since there + // are overlaps + public static final String FEATURED_IMAGE_URL_FIELD = CrawlDocMetadata.PREFIX + "featured-image-url"; public static final String FEATURED_IMAGE_PATH_FIELD = @@ -236,7 +239,7 @@ public List getStorages() { * @return this */ public FeaturedImageResolverConfig setStorages(List 
storages) { - CollectionUtil.setAll(storages, storages); + CollectionUtil.setAll(this.storages, storages); return this; } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/LinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/LinkExtractor.java index eec2f5cfa..b502bb19f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/LinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/LinkExtractor.java @@ -32,8 +32,8 @@ */ public interface LinkExtractor { - //TODO have ability to return any number of extra info with a link - // that could be added to target URL as extra metadata. e.g., store as json. + //MAYBE have ability to return any number of extra info with a link + //that could be added to target URL as extra metadata. Store as JSON? Set extractLinks(CrawlDoc doc) throws IOException; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java index ae46be6da..492c18113 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractor.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.InputStreamReader; +import java.util.Collections; import java.util.HashSet; import java.util.Set; import java.util.regex.Pattern; @@ -172,6 +173,12 @@ public Set extractLinks(CrawlDoc doc) throws IOException { return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + Set links = new HashSet<>(); var parser = DomUtil.toJSoupParser(configuration.getParser()); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java index 3df4b2286..6392d7c0f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/DomLinkExtractorConfig.java @@ -25,6 +25,7 @@ import java.util.Objects; import com.norconex.commons.lang.collection.CollectionUtil; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; import com.norconex.importer.handler.CommonMatchers; import com.norconex.importer.util.DomUtil; @@ -63,6 +64,8 @@ public static class LinkSelector { private final TextMatcher contentTypeMatcher = CommonMatchers.domContentTypes(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. @@ -201,4 +204,19 @@ public DomLinkExtractorConfig setSchemes(List schemes) { CollectionUtil.setAll(this.schemes, schemes); return this; } + + /** + * Clears all restrictions. 
+ */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java index 9804138c3..875578ac4 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractor.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Optional; @@ -302,6 +303,12 @@ public Set extractLinks(CrawlDoc doc) throws IOException { return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + var refererUrl = doc.getReference(); Set links = new HashSet<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java index 019f5198d..0bc185467 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/HtmlLinkExtractorConfig.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.map.Properties; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; import com.norconex.importer.handler.CommonMatchers; @@ -57,6 +58,8 @@ public class HtmlLinkExtractorConfig { private final TextMatcher contentTypeMatcher = CommonMatchers.htmlContentTypes(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. @@ -289,7 +292,20 @@ public HtmlLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { return this; } - //--- Public methods ------------------------------------------------------- + /** + * Clears all restrictions. 
+ */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } public synchronized HtmlLinkExtractorConfig addLinkTag( String tagName, String attribute) { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java index ada0f10a9..235cf3a6a 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java @@ -18,7 +18,6 @@ import java.io.InputStreamReader; import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Set; import java.util.regex.Pattern; @@ -54,7 +53,8 @@ * text/.* * *

    - * You can specify your own restrictions using {@link #setRestrictions(List)}, + * You can specify your own restrictions using + * {@link RegexLinkExtractorConfig#getRestrictions()}, * but make sure they represent text files. *
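For illustration only (not part of this patch), a minimal sketch of defining such a restriction through the PropertyMatchers now exposed by the extractor configurations. The metadata field name is hypothetical, the PropertyMatcher constructor is assumed from Norconex Commons Lang, and the extractor is assumed to expose its configuration via getConfiguration() like the other extractors touched in this diff:

import com.norconex.commons.lang.map.PropertyMatcher;
import com.norconex.commons.lang.text.TextMatcher;
import com.norconex.crawler.web.doc.operations.link.impl.RegexLinkExtractor;

public class LinkExtractionRestrictionSketch {
    public static void main(String[] args) {
        var extractor = new RegexLinkExtractor();
        // Hypothetical field name: only documents whose "document.contentFamily"
        // metadata value is "text" should have links extracted from them.
        extractor.getConfiguration().getRestrictions().add(
                new PropertyMatcher(
                        TextMatcher.basic("document.contentFamily"),
                        TextMatcher.basic("text")));
    }
}

An empty restrictions list means no filtering, which matches the isEmpty() guards added to the other link extractors in this patch.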

    * @@ -74,45 +74,11 @@ * detect the encoding of the a page when extracting links and * referrer information. If no charset could be detected, it falls back to * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *

    - * - * {@nx.xml.usage - * - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * - * - * (regular expression) - * (optional regex replacement) - * - * - * - * - * } - * - * {@nx.xml.example - * - * - * - * \[(\d+)\] - * http://www.example.com/page?id=$1 - * - * - * - * } - *

    - * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. + * {@link RegexLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. *
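As a hedged usage sketch of the charset option mentioned above (chained setters assumed from the Lombok annotations on RegexLinkExtractorConfig):

import java.nio.charset.StandardCharsets;

import com.norconex.crawler.web.doc.operations.link.impl.RegexLinkExtractor;

public class RegexLinkExtractorCharsetSketch {
    public static void main(String[] args) {
        var extractor = new RegexLinkExtractor();
        extractor.getConfiguration()
                // Skip charset detection and read pages as ISO-8859-1.
                .setCharset(StandardCharsets.ISO_8859_1)
                // Ignore extracted URLs longer than 2048 characters.
                .setMaxUrlLength(2048);
    }
}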

    * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class RegexLinkExtractor @@ -144,10 +110,8 @@ public Set extractLinks(CrawlDoc doc) throws IOException { doc.getMetadata() .matchKeys(configuration.getFieldMatcher()) .valueList() - .forEach( - val -> extractLinks( - links, val, - doc.getReference())); + .forEach(val -> extractLinks( + links, val, doc.getReference())); } else { // Body var sb = new StringBuilder(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java index 193618830..ed0a3534b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java @@ -22,7 +22,7 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; +import com.norconex.importer.handler.CommonMatchers; import lombok.AllArgsConstructor; import lombok.Data; @@ -31,81 +31,10 @@ /** *

    - * Link extractor using regular expressions to extract links found in text - * documents. Relative links are resolved to the document URL. - * For HTML documents, it is best advised to use the - * {@link HtmlLinkExtractor} or {@link DomLinkExtractor}, - * which addresses many cases specific to HTML. + * Configuration for {@link RegexLinkExtractor}. *

    - * - *

    Applicable documents

    - *

    - * By default, this extractor will extract URLs only in documents having - * their content type matching this regular expression: - *

    - *
    - * text/.*
    - * 
    - *

    - * You can specify your own restrictions using {@link #setRestrictions(List)}, - * but make sure they represent text files. - *

    - * - *

    Referrer data

    - *

    - * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *

    - *
      - *
- *   • Referrer reference: The reference (URL) of the page where the
- *     link to a document was found. Metadata value is
- *     {@link WebDocMetadata#REFERRER_REFERENCE}.
    - * - *

    Character encoding

    - *

    This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *

    - * - * {@nx.xml.usage - * - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * - * - * - * (regular expression) - * (optional regex replacement) - * - * - * - * - * } - * - * {@nx.xml.example - * - * - * - * \[(\d+)\] - * http://www.example.com/page?id=$1 - * - * - * - * } - *

    - * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. - *

    - * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class RegexLinkExtractorConfig { @@ -127,19 +56,22 @@ public static class ExtractionPattern { /** * The maximum supported URL length. * Default is {@value #DEFAULT_MAX_URL_LENGTH}. - * @param maxUrlLength maximum URL length - * @return maximum URL length */ private int maxUrlLength = DEFAULT_MAX_URL_LENGTH; /** * Gets the character set of pages on which link extraction is performed. * Default is null (charset detection will be attempted). - * @param charset character set to use, or null - * @return character set to use, or null */ private Charset charset; + /** + * The matcher of content types to apply link extraction on. No attempt to + * extract links from any other content types will be made. Default + * matches all content types + */ + private final TextMatcher contentTypeMatcher = CommonMatchers.all(); + private final List patterns = new ArrayList<>(); private final PropertyMatchers restrictions = new PropertyMatchers(); @@ -147,8 +79,6 @@ public static class ExtractionPattern { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); @@ -167,6 +97,18 @@ public RegexLinkExtractorConfig clearPatterns() { return this; } + /** + * The matcher of content types to apply link extraction on. No attempt to + * extract links from any other content types will be made. Default matches + * all content types. + * @param matcher content type matcher + * @return this + */ + public RegexLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { + contentTypeMatcher.copyFrom(matcher); + return this; + } + /** * Clears all restrictions. */ diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractor.java index 90d71d2ea..2cd45eb46 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractor.java @@ -19,6 +19,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Collections; import java.util.HashSet; import java.util.Optional; import java.util.Set; @@ -48,7 +49,8 @@ * Implementation of {@link LinkExtractor} using * Apache Tika to perform URL * extractions from HTML documents. - * This is an alternative to the {@link HtmlLinkExtractor}. + * This is an alternative to the {@link HtmlLinkExtractor} or even + * {@link DomLinkExtractor}. *

    *

    * The configuration of content-types, storing the referrer data, and ignoring @@ -57,16 +59,8 @@ * pre-defined set of link attributes, when available (title, type, * uri, text, rel). *
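A hedged sketch of the equivalent programmatic configuration; setter names are assumed from the Lombok-generated accessors on TikaLinkExtractorConfig shown later in this diff:

import com.norconex.crawler.web.doc.operations.link.impl.TikaLinkExtractor;

public class TikaLinkExtractorSketch {
    public static void main(String[] args) {
        var extractor = new TikaLinkExtractor();
        extractor.getConfiguration()
                .setIgnoreNofollow(true)   // still extract links marked rel="nofollow"
                .setIgnoreLinkData(true);  // keep only the URL, drop title/type/uri/text/rel
    }
}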

    - * - * {@nx.xml.usage - * - * {@nx.include com.norconex.importer.handler.AbstractImporterHandler#restrictTo} - * - * } * @see HtmlLinkExtractor */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class TikaLinkExtractor @@ -90,6 +84,12 @@ public Set extractLinks( return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + var refererUrl = doc.getReference(); Set nxLinks = new HashSet<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractorConfig.java index 1a99cfe20..ba01ae372 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/TikaLinkExtractorConfig.java @@ -14,8 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.link.impl; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.importer.handler.CommonMatchers; import lombok.Data; @@ -23,28 +23,9 @@ /** *

    - * Implementation of {@link LinkExtractor} using - * Apache Tika to perform URL - * extractions from HTML documents. - * This is an alternative to the {@link HtmlLinkExtractor}. + * Configuration for {@link TikaLinkExtractor}. *

    - *

    - * The configuration of content-types, storing the referrer data, and ignoring - * "nofollow" and ignoring link data are the same as in - * {@link HtmlLinkExtractor}. For link data, this parser only keeps a - * pre-defined set of link attributes, when available (title, type, - * uri, text, rel). - *

    - * - * {@nx.xml.usage - * - * {@nx.include com.norconex.importer.handler.AbstractImporterHandler#restrictTo} - * - * } - * @see HtmlLinkExtractor */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class TikaLinkExtractorConfig { @@ -52,8 +33,6 @@ public class TikaLinkExtractorConfig { private boolean ignoreNofollow; /** * Whether to ignore extra data associated with a link. - * @param ignoreLinkData true to ignore. - * @return true to ignore. * @since 3.0.0 */ private boolean ignoreLinkData; @@ -62,17 +41,15 @@ public class TikaLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.htmlContentTypes(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); @@ -85,11 +62,26 @@ public TikaLinkExtractorConfig setFieldMatcher(TextMatcher fieldMatcher) { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public TikaLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) { contentTypeMatcher.copyFrom(matcher); return this; } + + /** + * Clears all restrictions. + */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java index 88bf838b1..c9270e521 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java @@ -18,6 +18,7 @@ import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; +import java.util.Collections; import java.util.HashSet; import java.util.Set; @@ -68,26 +69,8 @@ * {@link WebDocMetadata#REFERRER_REFERENCE}.
  • * * - * {@nx.xml.usage - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * } - * - * {@nx.xml.example - * - * .*rss$ - * - * } - *

    - * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *

    - * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class XmlFeedLinkExtractor @@ -106,6 +89,12 @@ public Set extractLinks(CrawlDoc doc) throws IOException { return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + var refererUrl = doc.getReference(); Set links = new HashSet<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java index 794469625..da5047974 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java @@ -14,9 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.link.impl; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; -import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.importer.handler.CommonMatchers; import lombok.Data; @@ -24,53 +23,10 @@ /** *

    - * Link extractor for extracting links out of - * RSS and - * Atom XML feeds. - * It extracts the content of <link> tags. If you need more complex - * extraction, consider using {@link RegexLinkExtractor} or creating your own - * {@link LinkExtractor} implementation. + * Configuration for {@link XmlFeedLinkExtractor}. *

    - * - *

    Applicable documents

    - *

    - * By default, this extractor only will be applied on documents matching - * one of these content types: - *

    - * - * {@nx.include com.norconex.importer.handler.CommonMatchers#xmlFeedContentTypes} - * - *

    Referrer data

    - *

    - * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *

    - *
      - *
- *   • Referrer reference: The reference (URL) of the page where the
- *     link to a document was found. Metadata value is
- *     {@link WebDocMetadata#REFERRER_REFERENCE}.
    - * - * {@nx.xml.usage - * - * {@nx.include com.norconex.crawler.web.doc.operations.link.AbstractTextLinkExtractor@nx.xml.usage} - * - * } - * - * {@nx.xml.example - * - * .*rss$ - * - * } - *

    - * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *

    - * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class XmlFeedLinkExtractorConfig { @@ -78,8 +34,6 @@ public class XmlFeedLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.xmlFeedContentTypes(); @@ -87,11 +41,11 @@ public class XmlFeedLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + public XmlFeedLinkExtractorConfig setFieldMatcher( TextMatcher fieldMatcher) { this.fieldMatcher.copyFrom(fieldMatcher); @@ -102,7 +56,7 @@ public XmlFeedLinkExtractorConfig setFieldMatcher( * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public XmlFeedLinkExtractorConfig setContentTypeMatcher( @@ -110,4 +64,19 @@ public XmlFeedLinkExtractorConfig setContentTypeMatcher( contentTypeMatcher.copyFrom(matcher); return this; } + + /** + * Clears all restrictions. + */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java index 02676a1e7..52bb5eac0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java @@ -14,6 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.recrawl.impl; +import static java.util.Optional.ofNullable; + import java.time.Duration; import java.time.ZonedDateTime; import java.time.temporal.ChronoField; @@ -29,6 +31,7 @@ import com.norconex.crawler.web.doc.WebCrawlDocContext; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency; +import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport; import com.norconex.crawler.web.sitemap.SitemapChangeFrequency; @@ -51,67 +54,21 @@ *

    * By default, existing sitemap directives take precedence over custom ones. * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} + * sitemap directives using the + * {@link GenericRecrawlableResolverConfig#setSitemapSupport(SitemapSupport)} * method. *

    * *

    Custom recrawl frequencies:

    *

 * You can choose to have some of your crawled documents be re-crawled less
- * frequently than others by specifying custom minimum frequencies
- * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are
- * processed in the order specified and must each have to following:

    - *
      - *
- *   • applyTo: Either "reference" or "contentType" (defaults to "reference").
- *   • pattern: A regular expression.
- *   • value: one of "always", "hourly", "daily", "weekly", "monthly",
- *     "yearly", "never", or a numeric value in milliseconds.
    - * - *

    - * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

    - * - * {@nx.xml.usage - * - * - * - * - * (Matcher for the reference or content type.) - * - * - * (... repeat frequency tag as needed ...) - * - * } - * - * {@nx.xml.example - * - * - * application/pdf - * - * - * .*latest-news.*\.html - * - * - * } - *

- * The above example ensures PDFs are re-crawled no more frequently than
- * once a month, while HTML news can be re-crawled as fast at every half hour.
- * For the rest, it relies on the website sitemap directives (if any).
+ * frequently than others by specifying custom minimum frequencies
+ * ({@link GenericRecrawlableResolverConfig#setMinFrequencies(Collection)}).
+ * Minimum frequencies are processed in the order specified.
 *
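As an illustrative sketch only (not part of this patch), the new ApplyTo enum and MinFrequency constructor shown further down in this diff could be used like this, assuming the resolver exposes its configuration via getConfiguration():

import java.util.List;

import com.norconex.commons.lang.text.TextMatcher;
import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolver;
import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency;
import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo;

public class MinFrequencySketch {
    public static void main(String[] args) {
        var resolver = new GenericRecrawlableResolver();
        resolver.getConfiguration().setMinFrequencies(List.of(
                // PDFs: recrawl no more than monthly, matched on content type.
                new MinFrequency(ApplyTo.CONTENT_TYPE, "monthly",
                        TextMatcher.basic("application/pdf")),
                // News pages: allow a recrawl every 30 minutes (milliseconds),
                // matched on the document reference (URL).
                new MinFrequency(ApplyTo.REFERENCE, "1800000",
                        TextMatcher.regex(".*latest-news.*\\.html"))));
    }
}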

    * * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -157,15 +114,11 @@ public boolean isRecrawlable(WebCrawlDocContext prevData) { private MinFrequency getMatchingMinFrequency(WebCrawlDocContext prevData) { for (MinFrequency f : configuration.getMinFrequencies()) { - var applyTo = f.getApplyTo(); - if (StringUtils.isBlank(applyTo)) { - applyTo = "reference"; - } - if (("reference".equalsIgnoreCase(applyTo) - && f.getMatcher().matches(prevData.getReference()) - || ("contentType".equalsIgnoreCase(applyTo) - && f.getMatcher().matches( - prevData.getContentType().toString())))) { + var applyTo = ofNullable(f.getApplyTo()).orElse(ApplyTo.REFERENCE); + var matchMe = applyTo == ApplyTo.REFERENCE + ? prevData.getReference() + : prevData.getContentType().toString(); + if (f.getMatcher().matches(matchMe)) { return f; } } @@ -243,17 +196,16 @@ private boolean isRecrawlableFromSitemap(WebCrawlDocContext prevData) { lastModified, prevData.getReference()); if (lastModified.isAfter(lastCrawled)) { if (LOG.isDebugEnabled()) { - LOG.debug( - "Recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug(""" + Recrawlable according to sitemap directive \ + (last modified '{}' > last crawled '{}'): {}""", lastModified, lastCrawled, prevData.getReference()); } return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - "Not recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug("Not recrawlable according to sitemap directive " + + "(last modified '{}' > last crawled '{}'): {}", lastModified, lastCrawled, prevData.getReference()); } return false; @@ -275,8 +227,7 @@ private boolean isRecrawlableFromFrequency( } if (LOG.isDebugEnabled()) { - LOG.debug( - "The {} change frequency is {} for: {}", + LOG.debug("The {} change frequency is {} for: {}", context, cf, prevData.getReference()); } if (cf == SitemapChangeFrequency.ALWAYS) { @@ -324,16 +275,15 @@ private boolean isRecrawlableFromFrequency( return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - String.format(""" - Not recrawlable according to {} directive\s\ - (required elapsed time '{}'\s\ - >= actual elapsed time '{}' since '{}'): {}""", - context, - formatDuration(lastCrawlDate, minCrawlDate), - formatDuration(lastCrawlDate, now), - lastCrawlDate, - prevData.getReference())); + LOG.debug(String.format(""" + Not recrawlable according to {} directive\s\ + (required elapsed time '{}'\s\ + >= actual elapsed time '{}' since '{}'): {}""", + context, + formatDuration(lastCrawlDate, minCrawlDate), + formatDuration(lastCrawlDate, now), + lastCrawlDate, + prevData.getReference())); } return false; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java index 8a3b212b3..5a7a2d81f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java @@ -23,87 +23,17 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.commons.lang.time.DurationParser; import lombok.Data; import lombok.NoArgsConstructor; import lombok.experimental.Accessors; /** - *

    Relies on both sitemap directives and custom instructions for - * establishing the minimum frequency between each document recrawl. - *

    - * - *

    Sitemap support:

    - *

    - * Provided crawler support for sitemaps has not been disabled, - * this class tries to honor last modified and frequency directives found - * in sitemap files. - *

    - *

    - * By default, existing sitemap directives take precedence over custom ones. - * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} - * method. - *

    - * - *

    Custom recrawl frequencies:

    - *

    - * You can chose to have some of your crawled documents be re-crawled less - * frequently than others by specifying custom minimum frequencies - * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are - * processed in the order specified and must each have to following: - *

    - *
      - *
- *   • applyTo: Either "reference" or "contentType" (defaults to "reference").
- *   • pattern: A regular expression.
- *   • value: one of "always", "hourly", "daily", "weekly", "monthly",
- *     "yearly", "never", or a numeric value in milliseconds.
    - * *

    - * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). + * Configuration for {@link GenericRecrawlableResolver}. *

    - * - * {@nx.xml.usage - * - * - * - * - * (Matcher for the reference or content type.) - * - * - * (... repeat frequency tag as needed ...) - * - * } - * - * {@nx.xml.example - * - * - * application/pdf - * - * - * .*latest-news.*\.html - * - * - * } - *

    - * The above example ensures PDFs are re-crawled no more frequently than - * once a month, while HTML news can be re-crawled as fast at every half hour. - * For the rest, it relies on the website sitemap directives (if any). - *

    - * * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericRecrawlableResolverConfig { @@ -127,8 +57,6 @@ public static SitemapSupport getSitemapSupport(String sitemapSupport) { /** * The sitemap support strategy. A null value * is equivalent to specifying the default {@link SitemapSupport#FIRST}. - * @param sitemapSupport sitemap support strategy - * @return sitemap support strategy */ private SitemapSupport sitemapSupport = SitemapSupport.FIRST; @@ -154,11 +82,30 @@ public void setMinFrequencies(Collection minFrequencies) { @Accessors(chain = true) @NoArgsConstructor public static class MinFrequency { - private String applyTo; + public enum ApplyTo { + CONTENT_TYPE, REFERENCE + } + + /** + * Whether to apply this minimum frequency to matching content type + * or document reference. Default to {@link ApplyTo#REFERENCE}. + */ + private ApplyTo applyTo = ApplyTo.REFERENCE; + /** + * String representation of a frequency. Can be one of "always", + * "hourly", "daily", "weekly", "monthly", "yearly", "never", or a + * numeric value in milliseconds. + */ private String value; + + /** + * A matcher applied to either a document reference or content type, + * based on {@link #getApplyTo()}. + */ private final TextMatcher matcher = new TextMatcher(); - public MinFrequency(String applyTo, String value, TextMatcher matcher) { + public MinFrequency( + ApplyTo applyTo, String value, TextMatcher matcher) { this.applyTo = applyTo; this.value = value; this.matcher.copyFrom(matcher); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolver.java index 7ed12d244..cb9e64048 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolver.java @@ -33,7 +33,8 @@ import lombok.extern.slf4j.Slf4j; /** - *

    By default a crawler will try to follow all links it discovers. You can + *

    + * By default a crawler will try to follow all links it discovers. You can * define your own filters to limit the scope of the pages being crawled. * When you have multiple URLs defined as start URLs, it can be tricky to * perform global filtering that apply to each URLs without causing @@ -47,7 +48,6 @@ *
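A minimal, hypothetical sketch of a "stay on site" setup, assuming the Lombok-generated chained setters on GenericUrlScopeResolverConfig and a getConfiguration() accessor on the resolver:

import com.norconex.crawler.web.doc.operations.scope.impl.GenericUrlScopeResolver;

public class StayOnSiteSketch {
    public static void main(String[] args) {
        var resolver = new GenericUrlScopeResolver();
        resolver.getConfiguration()
                .setStayOnDomain(true)       // stay on each start URL's domain
                .setIncludeSubdomains(true)  // treat sub-domains as the same site
                .setStayOnProtocol(false);   // allow switching between http and https
    }
}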

    * @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @EqualsAndHashCode @ToString @Slf4j diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java index 746855946..bd2ec7ef2 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java @@ -18,24 +18,13 @@ import lombok.experimental.Accessors; /** - *

    By default a crawler will try to follow all links it discovers. You can - * define your own filters to limit the scope of the pages being crawled. - * When you have multiple URLs defined as start URLs, it can be tricky to - * perform global filtering that apply to each URLs without causing - * URL filtering conflicts. This class offers an easy way to address - * a frequent URL filtering need: to "stay on target". That is, - * when following a page and extracting URLs found in it, make sure to - * only keep URLs that are on the same site as the page URL we are on. - *

    *

    - * By default this class does not request to stay on a site. + * Configuration for {@link GenericUrlScopeResolver}. *

    * @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class GenericUrlScopeResolverConfig { /** @@ -43,16 +32,12 @@ public class GenericUrlScopeResolverConfig { * the domain for each URL specified as a start URL. By default (false) * the crawler will try follow any discovered links not otherwise rejected * by other settings (like regular filtering rules you may have). - * @param stayOnDomain true for the crawler to stay on domain - * @return true if the crawler should stay on a domain */ private boolean stayOnDomain; /** * Whether sub-domains are considered to be the same as a URL domain. * Only applicable when "stayOnDomain" is true. - * @param includeSubdomains true to include sub-domains - * @return true if including sub-domains * @since 2.9.0 */ private boolean includeSubdomains; @@ -62,8 +47,6 @@ public class GenericUrlScopeResolverConfig { * the port for each URL specified as a start URL. By default (false) * the crawler will try follow any discovered links not otherwise rejected * by other settings (like regular filtering rules you may have). - * @param stayOnPort true for the crawler to stay on port - * @return true if the crawler should stay on a port */ private boolean stayOnPort; @@ -72,9 +55,6 @@ public class GenericUrlScopeResolverConfig { * the protocol for each URL specified as a start URL. By default (false) * the crawler will try follow any discovered links not otherwise rejected * by other settings (like regular filtering rules you may have). - * @param stayOnProtocol - * true for the crawler to stay on protocol - * @return true if the crawler should stay on protocol */ private boolean stayOnProtocol = false; diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java index bba4f4884..5e650b019 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java @@ -98,51 +98,6 @@ * In addition, this class allows you to specify any number of URL * value replacements using regular expressions. *
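A hedged sketch of overriding the default normalizations; only SECURE_SCHEME is confirmed elsewhere in this patch, the other enum constant names are assumptions following the same naming pattern:

import java.util.List;

import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizer;
import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizerConfig.Normalization;

public class UrlNormalizationSketch {
    public static void main(String[] args) {
        var normalizer = new GenericUrlNormalizer();
        normalizer.getConfiguration().setNormalizations(List.of(
                Normalization.REMOVE_FRAGMENT,        // assumed constant name
                Normalization.LOWER_CASE_SCHEME_HOST, // assumed constant name
                Normalization.SECURE_SCHEME));        // referenced in this patch
    }
}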

    - * - * {@nx.xml.usage - * - * - * (normalization code names, coma separated) - * - * - * - * (regex pattern to match) - * (optional replacement value, default to blank) - * - * (... repeat replace tag as needed ...) - * - * - * } - *

    - * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *

    - * - * {@nx.xml.example - * - * - * removeFragment, lowerCaseSchemeHost, upperCaseEscapeSequence, - * decodeUnreservedCharacters, removeDefaultPort, - * encodeNonURICharacters, addWWW - * - * - * &amp;view=print - * - * (&amp;type=)(summary) - * $1full - * - * - * - * } - *

    - * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". - *

    */ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java index a990bc186..a5a1656e7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java @@ -24,8 +24,6 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.convert.GenericConverter; import com.norconex.commons.lang.url.UrlNormalizer; -import com.norconex.crawler.web.WebCrawlerConfig; -import com.norconex.crawler.web.doc.operations.url.WebUrlNormalizer; import lombok.Data; import lombok.Getter; @@ -33,117 +31,7 @@ /** *

    - * Generic implementation of {@link WebUrlNormalizer} that should satisfy - * most URL normalization needs. This implementation relies on - * {@link UrlNormalizer}. Please refer to it for complete documentation and - * examples. - *

    - *

    - * This class is in effect by default. To skip its usage, you - * can explicitly set the URL Normalizer to null in the - * {@link WebCrawlerConfig}. - *

    - *

    - * By default, this class removes the URL fragment and applies these - * RFC 3986 - * normalizations: - *

    - *
      - *
- *   • Converting the scheme and host to lower case
- *   • Capitalizing letters in escape sequences
- *   • Decoding percent-encoded unreserved characters
- *   • Removing the default port
- *   • Encoding non-URI characters
    - *

    - * To overwrite this default, you have to specify a new list of normalizations - * to apply, via the {@link #setNormalizations(List)} method, - * or via XML configuration. Each - * normalizations is identified by a code name. The following is the - * complete code name list for supported normalizations. Click on any code - * name to get a full description from {@link WebUrlNormalizer}: - *

    - *
      - *
- *   • {@link UrlNormalizer#addDirectoryTrailingSlash() addDirectoryTrailingSlash} (since 2.6.0)
- *   • {@link UrlNormalizer#addDomainTrailingSlash() addDomainTrailingSlash} (since 2.6.1)
- *   • {@link UrlNormalizer#addWWW() addWWW}
- *   • {@link UrlNormalizer#decodeUnreservedCharacters() decodeUnreservedCharacters}
- *   • {@link UrlNormalizer#encodeNonURICharacters() encodeNonURICharacters}
- *   • {@link UrlNormalizer#encodeSpaces() encodeSpaces}
- *   • {@link UrlNormalizer#lowerCase() lowerCase} (since 2.9.0)
- *   • {@link UrlNormalizer#lowerCasePath() lowerCasePath} (since 2.9.0)
- *   • {@link UrlNormalizer#lowerCaseQuery() lowerCaseQuery} (since 2.9.0)
- *   • {@link UrlNormalizer#lowerCaseQueryParameterNames() lowerCaseQueryParameterNames} (since 2.9.0)
- *   • {@link UrlNormalizer#lowerCaseQueryParameterValues() lowerCaseQueryParameterValues} (since 2.9.0)
- *   • {@link UrlNormalizer#lowerCaseSchemeHost() lowerCaseSchemeHost}
- *   • {@link UrlNormalizer#removeDefaultPort() removeDefaultPort}
- *   • {@link UrlNormalizer#removeDirectoryIndex() removeDirectoryIndex}
- *   • {@link UrlNormalizer#removeDotSegments() removeDotSegments}
- *   • {@link UrlNormalizer#removeDuplicateSlashes() removeDuplicateSlashes}
- *   • {@link UrlNormalizer#removeEmptyParameters() removeEmptyParameters}
- *   • {@link UrlNormalizer#removeFragment() removeFragment}
- *   • {@link UrlNormalizer#removeQueryString() removeQueryString} (since 2.9.0)
- *   • {@link UrlNormalizer#removeSessionIds() removeSessionIds}
- *   • {@link UrlNormalizer#removeTrailingQuestionMark() removeTrailingQuestionMark}
- *   • {@link UrlNormalizer#removeTrailingSlash() removeTrailingSlash} (since 2.6.0)
- *   • {@link UrlNormalizer#removeTrailingHash() removeTrailingHash} (since 2.7.0)
- *   • {@link UrlNormalizer#removeWWW() removeWWW}
- *   • {@link UrlNormalizer#replaceIPWithDomainName() replaceIPWithDomainName}
- *   • {@link UrlNormalizer#secureScheme() secureScheme}
- *   • {@link UrlNormalizer#sortQueryParameters() sortQueryParameters}
- *   • {@link UrlNormalizer#unsecureScheme() unsecureScheme}
- *   • {@link UrlNormalizer#upperCaseEscapeSequence() upperCaseEscapeSequence}
    - *

    - * In addition, this class allows you to specify any number of URL - * value replacements using regular expressions. - *

    - * - * {@nx.xml.usage - * - * - * (normalization code names, coma separated) - * - * - * - * (regex pattern to match) - * (optional replacement value, default to blank) - * - * (... repeat replace tag as needed ...) - * - * - * } - *

    - * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *

    - * - * {@nx.xml.example - * - * - * removeFragment, lowerCaseSchemeHost, upperCaseEscapeSequence, - * decodeUnreservedCharacters, removeDefaultPort, - * encodeNonURICharacters, addWWW - * - * - * &amp;view=print - * - * (&amp;type=)(summary) - * $1full - * - * - * - * } - *

    - * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". + * Configuration for {@link GenericUrlNormalizer}. *

    */ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java index 5fdb6d37d..591a9c438 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java @@ -100,31 +100,8 @@ * using a custom link extractor. *

    * - * {@nx.xml.usage - * - * (CSV list of status codes) - * (path to a directory of your choice) - * (report file name prefix) - * [false|true] - * - * } - * - * {@nx.xml.example - * - * 404 - * /report/path/ - * brokenLinks - * - * } - *

    - * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *

    - * * @since 2.2.0 */ - @EqualsAndHashCode @ToString @Slf4j @@ -223,8 +200,8 @@ private void resolveStatusCodeRange( var end = toInt(endPoints[1]); if (start >= end) { throw new IllegalArgumentException( - "Invalid statusCode range: " + range - + ". Start value must be higher than end value."); + "Invalid statusCode range: %s. Start value must be " + + "higher than end value.".formatted(range)); } while (start <= end) { parsedCodes.add(start); @@ -274,9 +251,8 @@ private int toInt(String num) { return Integer.parseInt(num.trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( - "The statusCodes attribute " - + "can only contain valid numbers. This number is invalid: " - + num); + "The statusCodes attribute can only contain valid numbers. " + + "This number is invalid: " + num); } } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java index af5c0eaee..9b573f7d7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java @@ -15,106 +15,18 @@ package com.norconex.crawler.web.event.listeners; import java.nio.file.Path; -import java.util.List; - -import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractor; -import com.norconex.crawler.web.doc.operations.link.impl.TikaLinkExtractor; import lombok.Data; import lombok.experimental.Accessors; /** *

    - * Store on file all URLs that were "fetched", along with their HTTP response - * code. Useful for reporting purposes (e.g. finding broken links). A short - * summary of all HTTP status codes can be found - * here. - *

    - * - *

    Filter by status codes

    - *

    - * By default, the status of all fetched URLs are stored by this listener, - * regardless what were those statuses. This can generate very lengthy reports - * on large crawls. If you are only interested in certain status codes, you can - * listen only for those using the {@link #setStatusCodes(String)} method - * or XML configuration equivalent. You specify the codes you want to listen - * for as coma-separated values. Ranges are also supported: specify two range - * values (both inclusive) separated by an hyphen. For instance, if you want - * to store all "bad" URLs, you can quickly specify all codes except - * 200 (OK) this way: - *

    - *
    100-199,201-599
    - * - *

    Output location

    - *

    - * By default one generated report is created for each crawler, stored - * in crawler-specific directories under the collector working directory. - * The collector working directory can be overwritten using - * {@link #setOutputDir(Path)}. - * If {@link #isCombined()} is true, status from all crawlers - * defined will be written to a unique file in the collector working directory. - *
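For illustration only, a sketch of a broken-links style setup using the fields kept on UrlStatusCrawlerEventListenerConfig; the getConfiguration() accessor on the listener is assumed and the values are placeholders:

import java.nio.file.Path;

import com.norconex.crawler.web.event.listeners.UrlStatusCrawlerEventListener;

public class BrokenLinksReportSketch {
    public static void main(String[] args) {
        var listener = new UrlStatusCrawlerEventListener();
        listener.getConfiguration()
                .setStatusCodes("404, 500-599")     // codes and inclusive ranges
                .setOutputDir(Path.of("./reports")) // defaults to the working directory
                .setFileNamePrefix("brokenLinks")
                .setTimestamped(true);              // new file on each run
    }
}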

    - * - *

    File naming

    - *

    - * By default, the file generated will use this naming pattern: - *

    - *
    - *   urlstatuses-[timestamp].csv
    - * 
    - *

    - * The filename prefix can be changed from "urlstatuses-" to anything else - * using {@link #setFileNamePrefix(String)}. + * Configuration for {@link UrlStatusCrawlerEventListener}. *

    - * - *

    Filter which crawler to record URL statuses

    - *

    - * By default all crawlers will have their URL fetch statuses recorded when - * using this event listener. To only do so for some crawlers, you can - * use {@link #setCrawlerIds(List)} to identify them. - *

    - * - *

    Referring/parent URLs and custom link extractor

    - *

    - * To capture the referring pages you have to use a link extractor that - * extracts referrer information. The default link extractor - * {@link HtmlLinkExtractor} properly extracts this information. Same with - * {@link TikaLinkExtractor}. This is only a consideration when - * using a custom link extractor. - *

    - * - * {@nx.xml.usage - * - * (CSV list of status codes) - * - * - * (existing crawler ID) - * - * (path to a directory of your choice) - * (report file name prefix) - * [false|true] - * [false|true] - * - * } - * - * {@nx.xml.example - * - * 404 - * /report/path/ - * brokenLinks - * - * } - *

    - * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *

    - * * @since 2.2.0 */ @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class UrlStatusCrawlerEventListenerConfig { public static final String DEFAULT_FILENAME_PREFIX = "urlstatuses-"; @@ -123,32 +35,24 @@ public class UrlStatusCrawlerEventListenerConfig { * The coma-separated list of status codes to listen to. * Default is null (listens for all status codes). * See class documentation for how to specify code ranges. - * @param statusCode HTTP status codes - * @return status codes */ private String statusCodes; /** * The local directory where this listener report will be written. * Default uses the collector working directory. - * @param outputDir directory path - * @return directory path */ private Path outputDir; /** * The generated report file name prefix. See class documentation * for default prefix. - * @param fileNamePrefix file name prefix - * @return file name prefix */ private String fileNamePrefix = DEFAULT_FILENAME_PREFIX; /** * Whether to add a timestamp to the file name, to ensure * a new one is created with each run. - * @param timestamped true if timestamped - * @return true if timestamped */ private boolean timestamped; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java index 4e68c96cc..6ce57d161 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java @@ -22,63 +22,5 @@ */ public interface HttpFetcher extends Fetcher { - // extends AbstractFetcher { - // - // @Override - // public HttpFetchResponse fetch(HttpFetchRequest fetchRequest) - // throws FetchException { - // // TODO Auto-generated method stub - // return null; - // } - // - // @Override - // protected void loadFetcherFromXML(XML xml) { - // // TODO Auto-generated method stub - // - // } - // - // @Override - // protected void saveFetcherToXML(XML xml) { - // // TODO Auto-generated method stub - // - // } - - //TODO do we need this class? Depends if we need this method: - - //User agent is not obtained part of http fetch response... so no need - // really. - // String getUserAgent(); - // - // boolean accept(Doc doc, HttpMethod httpMethod); - // - // /** - // *

    - // * Performs an HTTP request for the supplied document reference - // * and HTTP method. - // *

    - // *

    - // * For each HTTP method supported, implementors should - // * do their best to populate the document and its {@link CrawlDocRecord} - // * with as much information they can. - // *

    - // *

    - // * Unsupported HTTP methods should return an HTTP response with the - // * {@link CrawlDocState#UNSUPPORTED} state. To prevent users having to - // * configure multiple HTTP clients, implementors should try to support - // * both the GET and HEAD methods. - // * POST is only used in special cases and is often not used during a - // * crawl session. - // *

    - // *

    - // * A null method is treated as a GET. - // *

    - // * @param doc document to fetch or to use to make the request. - // * @param httpMethod HTTP method - // * @return an HTTP response - // * @throws HttpFetchException problem when fetching the document - // * @see HttpFetchResponseBuilder#unsupported() - // */ - // IHttpFetchResponse fetch(CrawlDoc doc, HttpMethod httpMethod) - // throws HttpFetchException; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcherProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcherProvider.java index 1ce1738ce..abe4c5c8b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcherProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcherProvider.java @@ -20,7 +20,6 @@ import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.fetch.impl.GenericHttpFetchResponse; import com.norconex.crawler.web.fetch.impl.GenericHttpFetcher; -import com.norconex.crawler.web.util.Web; public class HttpFetcherProvider implements Function { @@ -31,7 +30,9 @@ public HttpMultiFetcher apply(Crawler crawler) { var cfg = (WebCrawlerConfig) crawler.getConfiguration(); //TODO really convert here? and this way? - var fetchers = Web.toHttpFetcher(cfg.getFetchers()); + var fetchers = cfg.getFetchers().stream() + .map(HttpFetcher.class::cast) + .toList(); if (fetchers.isEmpty()) { fetchers.add(new GenericHttpFetcher()); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcher.java index bd980bd3e..8c5f818fa 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcher.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcher.java @@ -87,14 +87,13 @@ import org.apache.hc.core5.util.Timeout; import com.norconex.commons.lang.encrypt.EncryptionUtil; -import com.norconex.commons.lang.time.DurationParser; import com.norconex.crawler.core.Crawler; import com.norconex.crawler.core.CrawlerException; import com.norconex.crawler.core.doc.CrawlDocState; import com.norconex.crawler.core.fetch.AbstractFetcher; import com.norconex.crawler.core.fetch.FetchException; import com.norconex.crawler.web.doc.WebCrawlDocContext; -import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizer; +import com.norconex.crawler.web.doc.operations.url.impl.GenericUrlNormalizerConfig.Normalization; import com.norconex.crawler.web.fetch.HttpFetchRequest; import com.norconex.crawler.web.fetch.HttpFetchResponse; import com.norconex.crawler.web.fetch.HttpFetcher; @@ -146,12 +145,6 @@ * * {@nx.include com.norconex.commons.lang.security.Credentials#doc} * - *

    - * XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *

    - * *

    HSTS Support

    *

    * Upon first encountering a secure site, this fetcher will check whether the @@ -163,10 +156,9 @@ *

    *

 * If you want to convert non-secure URLs to secure ones regardless of website
- * HSTS support, use
- * {@link GenericUrlNormalizer.Normalization#secureScheme} instead.
+ * HSTS support, use {@link Normalization#SECURE_SCHEME} instead.
 * To disable HSTS support, use
- * {@link GenericHttpFetcherConfig#setDisableHSTS(boolean)}.
+ * {@link GenericHttpFetcherConfig#setHstsDisabled(boolean)}.
 *

    * *

    Pro-active change detection

    @@ -183,106 +175,16 @@ * supporting servers we only want to download a document if it was modified * since our last request. * To disable support for pro-active change detection, you can use - * {@link GenericHttpFetcherConfig#setDisableIfModifiedSince(boolean)} and - * {@link GenericHttpFetcherConfig#setDisableETag(boolean)}. + * {@link GenericHttpFetcherConfig#setIfModifiedSinceDisabled(boolean)} and + * {@link GenericHttpFetcherConfig#setETagDisabled(boolean)}. *

    *

    * These settings have no effect for web servers not supporting them. *
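A brief sketch using the renamed setters referenced above; the user agent and other values are illustrative only:

import com.norconex.crawler.web.fetch.impl.GenericHttpFetcherConfig;

public class FetcherChangeDetectionSketch {
    public static void main(String[] args) {
        var cfg = new GenericHttpFetcherConfig();
        cfg.setUserAgent("MyCrawler/1.0 (+https://example.com/bot)")
                .setHstsDisabled(true)             // do not force http URLs to https
                .setIfModifiedSinceDisabled(true)  // do not send If-Modified-Since
                .setETagDisabled(true);            // disable ETag-based change detection
    }
}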

    * - * {@nx.xml.usage - * - * - * (identify yourself!) - * [RELAXED|STRICT|IGNORE] - * (milliseconds) - * (milliseconds) - * (milliseconds) - * [false|true] - * ... - * (implementation handling redirects) - * ... - * ... - * ... - * (milliseconds) - * (milliseconds) - * - * - * [false|true] - * - * - * (coma-separated list) - * - * - * [false|true] - * - * - * [false|true] - * - * - * - * - * - * {@nx.include com.norconex.commons.lang.net.ProxySettings@nx.xml.usage} - * - * - * - * - *
    (header value)
    - * - *
    - * - * - * [false|true] - * - * - * [false|true] - * - * - * - * {@nx.include com.norconex.crawler.web.fetch.impl.HttpAuthConfig@nx.xml.usage} - * - * - * (defaults to 200) - * (defaults to 404) - * (string to prefix headers) - * - * - * [false|true] - * [false|true] - * - * {@nx.include com.norconex.crawler.core.fetch.AbstractFetcher#referenceFilters} - * - * - * (defaults to: GET, HEAD) - * - *
    - * } - * - * {@nx.xml.example - * - * - * form - * - * joeUser - * joePasword - * - * loginUser - * loginPwd - * http://www.example.com/login/submit - * - * - * } - *

    - * The above example will authenticate the crawler to a web site before - * crawling. The website uses an HTML form with a username and password - * fields called "loginUser" and "loginPwd". - *

    - * * @since 3.0.0 (Merged from GenericDocumentFetcher and * GenericHttpClientFactory) */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode(onlyExplicitlyIncluded = true) @ToString(onlyExplicitlyIncluded = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java index d3e910930..abd670f46 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java @@ -38,10 +38,7 @@ /** * Generic HTTP Fetcher configuration. - * @since 3.0.0 (adapted from GenericHttpClientFactory and - * GenericDocumentFetcher from version 2.x) */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericHttpFetcherConfig extends BaseFetcherConfig { @@ -61,23 +58,26 @@ public enum CookieSpec { RELAXED, STRICT, IGNORE } + /** + * HTTP status codes considered "valid". Defaults to 200. + */ private final List validStatusCodes = new ArrayList<>(DEFAULT_VALID_STATUS_CODES); + /** + * HTTP status codes considered "not found". Defaults to 404. + */ private final List notFoundStatusCodes = new ArrayList<>(DEFAULT_NOT_FOUND_STATUS_CODES); /** - * Optional prefix prepended to captured HTTP response fields. - * @param headersPrefix optional prefix - * @return prefix or null + * Optional prefix prepended to captured HTTP response fields. A + * null value (default) won't add any prefix. */ private String headersPrefix; /** * Whether content type is detected instead of relying on * returned Content-Type HTTP response header. - * @param forceContentTypeDetection true to enable detection - * @return true to enable detection */ private boolean forceContentTypeDetection; @@ -85,33 +85,28 @@ public enum CookieSpec { * Whether character encoding is detected instead of relying on * the charset sometimes found in the Content-Type HTTP * response header. - * @param forceCharsetDetection true to enable detection - * @return true to enable detection */ private boolean forceCharsetDetection; /** * Authentication configuration for sites requiring it. Default * is null. - * @param authentication authentication configuration - * @return authentication configuration */ private HttpAuthConfig authentication; /** * Cookie specification to use when fetching documents. Default is relaxed. - * @param cookieSpec cookie specification name - * @return the cookieSpec cookie specification name */ private CookieSpec cookieSpec = CookieSpec.RELAXED; + /** + * An optional HTTP proxy. + */ private final ProxySettings proxySettings = new ProxySettings(); /** * The connection timeout for a connection to be established. * Default is {@link #DEFAULT_TIMEOUT}. - * @param connectionTimeout connection timeout - * @return connection timeout */ private Duration connectionTimeout = DEFAULT_TIMEOUT; @@ -119,32 +114,24 @@ public enum CookieSpec { * Gets the maximum period of inactivity between two consecutive data * packets. * Default is {@link #DEFAULT_TIMEOUT}. - * @param socketTimeout socket timeout - * @return socket timeout */ private Duration socketTimeout = DEFAULT_TIMEOUT; /** * Gets the timeout when requesting a connection. * Default is {@link #DEFAULT_TIMEOUT}. 
- * @param connectionRequestTimeout connection request timeout - * @return connection request timeout */ private Duration connectionRequestTimeout = DEFAULT_TIMEOUT; /** * The local address, which may be useful when working with multiple * network interfaces. - * @param localAddress locale address - * @return local address */ private String localAddress; /** * Whether 'Expect: 100-continue' handshake is enabled. * See {@link RequestConfig#isExpectContinueEnabled()} - * @param expectContinueEnabled true if enabled - * @return true if enabled */ private boolean expectContinueEnabled; @@ -152,8 +139,6 @@ public enum CookieSpec { * The maximum number of redirects to be followed. This can help * prevent infinite loops. A value of zero effectively disables * redirects. Default is {@link #DEFAULT_MAX_REDIRECT}. - * @param maxRedirects maximum number of redirects to be followed - * @return maximum number of redirects to be followed */ private int maxRedirects = DEFAULT_MAX_REDIRECT; @@ -161,16 +146,12 @@ public enum CookieSpec { * The maximum number of connections that can be created. Typically, * you would have at least the same amount as threads. * Default is {@link #DEFAULT_MAX_CONNECTIONS}. - * @param maxConnections maximum number of connections - * @return number of connections */ private int maxConnections = DEFAULT_MAX_CONNECTIONS; /** * The maximum number of connections that can be used per route. * Default is {@link #DEFAULT_MAX_CONNECTIONS_PER_ROUTE}. - * @param maxConnectionsPerRoute maximum number of connections per route - * @return number of connections per route */ private int maxConnectionsPerRoute = DEFAULT_MAX_CONNECTIONS_PER_ROUTE; @@ -178,9 +159,6 @@ public enum CookieSpec { * Sets the period of time after which to evict idle * connections from the connection pool. * Default is {@link #DEFAULT_MAX_IDLE_TIME}. - * @param maxConnectionIdleTime amount of time after which to evict idle - * connections - * @return amount of time after which to evict idle connections */ private Duration maxConnectionIdleTime = DEFAULT_MAX_IDLE_TIME; @@ -188,11 +166,12 @@ public enum CookieSpec { * Sets the period of time a connection must be inactive * to be checked in case it became stalled. Default is 0 (not pro-actively * checked). - * @param maxConnectionInactiveTime period of time in milliseconds - * @return period of time in milliseconds */ private Duration maxConnectionInactiveTime; + /** + * Headers to send with every HTTP request. + */ private final Map requestHeaders = new HashMap<>(); /** @@ -200,8 +179,6 @@ public enum CookieSpec { * header is disabled. * Servers supporting this header will only return the requested document * if it was last modified since the supplied date. - * @param ifModifiedSinceDisabled true if disabled - * @return true if disabled */ private boolean ifModifiedSinceDisabled; @@ -211,28 +188,25 @@ public enum CookieSpec { * Servers supporting this header will only return the requested document * if the ETag value has changed, indicating a more recent version is * available. - * @param eTagDisabled true if disabled - * @return true if disabled */ private boolean eTagDisabled; /** * The user-agent used when identifying the crawler to targeted web sites. * It is highly recommended to always identify yourself. - * @param userAgent user agent - * @return user agent */ private String userAgent; /** * The redirect URL provider. * Defaults to {@link GenericRedirectUrlProvider}. 
- * @param redirectUrlProvider redirect URL provider - * @return the redirect URL provider */ private RedirectUrlProvider redirectUrlProvider = new GenericRedirectUrlProvider(); + /** + * List of supported HTTP methods. + */ private final List httpMethods = new ArrayList<>( Arrays.asList( HttpMethod.GET, HttpMethod.HEAD)); @@ -241,31 +215,27 @@ public enum CookieSpec { /** * Sets whether to trust all SSL certificate (affects only "https" - * connections). This is typically a bad - * idea (favors man-in-the-middle attacks). Try to install a SSL + * connections). This is typically a bad idea if you care to avoid + * "man-in-the-middle" attacks. Try to install a SSL * certificate locally to ensure a proper certificate exchange instead. * @since 1.3.0 - * @param trustAllSSLCertificates true if trusting all SSL - * certificates - * @return true if trusting all SSL certificates */ private boolean trustAllSSLCertificates; /** * Sets whether Server Name Indication (SNI) is disabled. - * @param sniDisabled true if disabled - * @return true if disabled */ private boolean sniDisabled; + /** + * Supported security protocols. + */ private final List sslProtocols = new ArrayList<>(); /** * Gets whether the forcing of non secure URLs to secure ones is disabled, * according to the URL domain Strict-Transport-Security policy * (obtained from HTTP response header). - * @param hstsDisabled true if disabled - * @return true if disabled */ private boolean hstsDisabled; @@ -280,6 +250,7 @@ public List getValidStatusCodes() { /** * Gets valid HTTP response status codes. * @param validStatusCodes valid status codes + * @return this */ public GenericHttpFetcherConfig setValidStatusCodes( List validStatusCodes) { @@ -299,6 +270,7 @@ public List getNotFoundStatusCodes() { /** * Sets HTTP status codes to be considered as "Not found" state. * @param notFoundStatusCodes "Not found" codes + * @return this */ public final GenericHttpFetcherConfig setNotFoundStatusCodes( List notFoundStatusCodes) { @@ -312,6 +284,7 @@ public final GenericHttpFetcherConfig setNotFoundStatusCodes( * may already provide. * @param name HTTP request header name * @param value HTTP request header value + * @return this */ public GenericHttpFetcherConfig setRequestHeader( String name, String value) { @@ -324,6 +297,7 @@ public GenericHttpFetcherConfig setRequestHeader( * Those are in addition to any default request headers Apache HttpClient * may already provide. * @param headers map of header names and values + * @return this */ public GenericHttpFetcherConfig setRequestHeaders( Map headers) { @@ -388,6 +362,7 @@ public List getSslProtocols() { * and TLSv1.2. Note that specifying a protocol not supported by * your underlying Java platform will not work. * @param sslProtocols SSL/TLS protocols supported + * @return this */ public GenericHttpFetcherConfig setSslProtocols( List sslProtocols) { @@ -408,6 +383,7 @@ public List getHttpMethods() { * Sets the list of HTTP methods to be accepted by this fetcher. * Defaults are {@link HttpMethod#GET} and {@link HttpMethod#HEAD}. 
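For illustration, the options documented above are plain chained accessors. A minimal sketch of building a fetcher configuration, assuming the Lombok-generated chained setters and an Integer element type for the status-code lists (not spelled out verbatim in this patch):

    import java.time.Duration;
    import java.util.List;

    import com.norconex.crawler.web.fetch.impl.GenericHttpFetcherConfig;

    // Sketch only: relies on the chained accessors documented in this patch.
    public class HttpFetcherConfigSketch {
        public static void main(String[] args) {
            var cfg = new GenericHttpFetcherConfig();
            cfg.setUserAgent("MyCrawler/1.0 (+https://example.com/bot)") // hypothetical identity
                    .setConnectionTimeout(Duration.ofSeconds(30))
                    .setSocketTimeout(Duration.ofSeconds(30))
                    .setMaxRedirects(5);
            cfg.setValidStatusCodes(List.of(200, 201, 202)); // Integer elements assumed
            cfg.setRequestHeader("Accept-Language", "en");   // sent with every request
            System.out.println(cfg.getConnectionTimeout());
        }
    }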
* @param httpMethods HTTP methods + * @return this */ public GenericHttpFetcherConfig setHttpMethods( List httpMethods) { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/HttpAuthConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/HttpAuthConfig.java index cdb7b188c..364980f31 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/HttpAuthConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/HttpAuthConfig.java @@ -35,106 +35,42 @@ *

    * Generic HTTP Fetcher authentication configuration. *

    - * {@nx.xml.usage - * [form|basic|digest|ntlm|spnego|kerberos] - * - * - * - * {@nx.include com.norconex.commons.lang.security.Credentials@nx.xml.usage} - * - * - * - * ... - * ... - * - * (Either a login form's action target URL or the URL of a page containing - * a login form if a "formSelector" is specified.) - * - * ... - * - * - * (param value) - * - * - * - * (CSS selector identifying the login page. E.g., "form") - * - * - * - * - * {@nx.include com.norconex.commons.lang.net.Host@nx.xml.usage} - * - * - * ... - * - * - * [false|true] - * - * - * - * {@nx.include com.norconex.commons.lang.net.Host@nx.xml.usage} - * - * ... - * ... - * } - * - *

    - * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "authentication"). - *
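As a rough code equivalent of the removed XML example, form-based authentication can now be expressed directly on the configuration objects. A sketch only, assuming a FORM constant on HttpAuthMethod and the usual Credentials username/password setters (neither is spelled out in this patch), with the hypothetical field names "loginUser" and "loginPwd" reused from the old example:

    import com.norconex.crawler.web.fetch.impl.GenericHttpFetcherConfig;
    import com.norconex.crawler.web.fetch.impl.HttpAuthConfig;
    import com.norconex.crawler.web.fetch.impl.HttpAuthMethod; // package assumed

    // Sketch only: accessor names follow the fields introduced below.
    public class FormAuthSketch {
        public static void main(String[] args) {
            var auth = new HttpAuthConfig()
                    .setMethod(HttpAuthMethod.FORM)      // FORM constant assumed
                    .setUrl("https://example.com/login") // hypothetical login form URL
                    .setFormUsernameField("loginUser")
                    .setFormPasswordField("loginPwd");
            auth.getCredentials().setUsername("joe");    // hypothetical credentials
            auth.getCredentials().setPassword("secret");

            var fetcherCfg = new GenericHttpFetcherConfig().setAuthentication(auth);
            System.out.println(fetcherCfg.getAuthentication().getUrl());
        }
    }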

    * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) @FieldNameConstants public class HttpAuthConfig { /** - *

- * The authentication method. Valid values are (case insensitive):
- *
- *   - form
- *   - basic
- *   - digest
- *   - ntlm
- *   - spnego
- *   - kerberos
- *
    - * @return authentication method - * @param method authentication method + * The authentication method. */ private HttpAuthMethod method; /** * The URL for "form" authentication. - * The username and password will be POSTed to this URL. + * The username and password will be POSTed to this URL unless + * {@link #setFormSelector(String)} is set, then it is assumed to be + * the URL of the page containing the form. * This is used only for "form" authentication. - * @param url "form" authentication URL - * @return "form" authentication URL */ private String url; - //TODO consider taking those out in favor of 'formParams'? /** * The name of the HTML field where the username is set. * This is used only for "form" authentication. - * @param formUsernameField name of the HTML field - * @return username name of the HTML field */ private String formUsernameField; /** * The name of the HTML field where the password is set. * This is used only for "form" authentication. - * @param formPasswordField name of the HTML field - * @return name of the HTML field */ private String formPasswordField; + /** + * User name and password. + */ private final Credentials credentials = new Credentials(); /** @@ -142,8 +78,6 @@ public class HttpAuthConfig { * null (default value) indicates "any host" for the * scope. * Used for BASIC and DIGEST authentication. - * @param host host for the scope - * @return host for the scope */ private Host host; @@ -151,51 +85,40 @@ public class HttpAuthConfig { * The realm name for the current authentication scope. * null (default) indicates "any realm" for the scope. * Used for BASIC and DIGEST authentication. - * @param realm reaml name for the scope - * @return realm name for the scope */ private String realm; - //form /** * The authentication form character set for the form field values. * Default is UTF-8. - * @param formCharset authentication form character set - * @return authentication form character set */ private Charset formCharset = StandardCharsets.UTF_8; /** - * The CSS selelector that identifies the form in a login page. + * The CSS selector that identifies the form in a login page. * When set, requires {@link #getUrl()} to be pointing to a login * page containing a login form. - * @param formSelector form selector - * @return form selector */ private String formSelector; + /** + * Additional form parameters possibly expected by the login form. + */ private final Map formParams = new HashMap<>(); /** * The NTLM authentication workstation name. - * @param workstation workstation name - * @return workstation name */ private String workstation; /** * Gets the NTLM authentication domain. - * @param domain authentication domain - * @return authentication domain */ private String domain; /** * Whether to perform preemptive authentication * (valid for "basic" authentication method). - * @param preemptive - * true to perform preemptive authentication - * @return true to perform preemptive authentication */ private boolean preemptive; diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java index 3a4e75965..8008b3185 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java @@ -45,9 +45,8 @@ *

    *

    * EXPERIMENTAL: The use of this class is experimental. - * It is known to not be supported properly - * with some web drivers and/or browsers. It can even be ignored altogether - * by some web drivers. + * It is known to not be supported properly with some web drivers and/or + * browsers. It can even be ignored altogether by some web drivers. *

    * * @since 3.0.0 @@ -129,17 +128,12 @@ void start(MutableCapabilities options) { new ResponseFilterAdapter.FilterSource( (response, contents, messageInfo) -> { // sniff only if original URL is being tracked - var trackedResponse = - trackedUrlResponses - .get(messageInfo.getOriginalUrl()); - + var trackedResponse = trackedUrlResponses + .get(messageInfo.getOriginalUrl()); if (trackedResponse != null) { - response.headers() - .forEach( - en -> trackedResponse.headers - .put( - en.getKey(), - en.getValue())); + response.headers().forEach( + en -> trackedResponse.headers.put( + en.getKey(), en.getValue())); trackedResponse.statusCode = response.status().code(); trackedResponse.reasonPhrase = diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java index 3975b622d..5f1716ff0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java @@ -27,37 +27,8 @@ *

    * Configuration for {@link HttpSniffer}. *

    - * - * {@nx.xml.usage - * (default is 0 = random free port) - * (default is "localhost") - * (optionally overwrite browser user agent) - * - * (Maximum byte size before a request/response content is considered - * too large. Can be specified using notations, e.g., 25MB. Default is 10MB) - * - * - * - * - *
    (header value)
    - *
    - * - * - * {@nx.include com.norconex.commons.lang.net.ProxySettings@nx.xml.usage} - * - * } - * - *

    - * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "httpSniffer"). - *
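The same options are now carried by HttpSnifferConfig itself. A minimal sketch, assuming the chained accessors generated for the fields shown below; all values are placeholders:

    import java.util.Map;

    import com.norconex.crawler.web.fetch.impl.webdriver.HttpSnifferConfig;

    // Sketch only: placeholder values; defaults are 0 (random free port) and a 10MB buffer.
    public class HttpSnifferConfigSketch {
        public static void main(String[] args) {
            var cfg = new HttpSnifferConfig()
                    .setPort(0)                          // 0 = random free port
                    .setHost("localhost")
                    .setUserAgent("MyCrawler/1.0")       // hypothetical override
                    .setMaxBufferSize(25 * 1024 * 1024); // ~25MB
            cfg.setRequestHeaders(Map.of("Accept-Language", "en"));
            cfg.getChainedProxy(); // configure when the sniffer itself must go through a proxy
        }
    }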

    - * - * @author Pascal Essiembre * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HttpSnifferConfig { @@ -65,24 +36,48 @@ public class HttpSnifferConfig { public static final int DEFAULT_MAX_BUFFER_SIZE = DataUnit.MB.toBytes(10).intValue(); + /** + * The host name passed to the browser pointing to the sniffer proxy. + * Defaults to 0 (random free port). + */ private int port; /** * The host name passed to the browser pointing to the sniffer proxy. * Defaults to "localhost". - * @param host host name - * @return host name - * @since 3.1.0 */ private String host; + /** + * Optionally overwrite browser user agent. + */ private String userAgent; private final Map requestHeaders = new HashMap<>(); + /** + * Maximum byte size before a request/response content is considered too + * large. Can be specified using notations, e.g., 25MB. Default is + * {@value #DEFAULT_MAX_BUFFER_SIZE}. + */ private int maxBufferSize = DEFAULT_MAX_BUFFER_SIZE; + + /** + * Chained proxy for cases where the HTTP Sniffer itself needs to use a + * proxy. + * @since 3.1.0 + */ private final ProxySettings chainedProxy = new ProxySettings(); + /** + * Gets the request headers to add to every HTTP request. + * @return map of request headers + */ public Map getRequestHeaders() { return requestHeaders; } + /** + * Sets the request headers to add to every HTTP request. + * @param requestHeaders map of request headers + * @return this + */ public HttpSnifferConfig setRequestHeaders( Map requestHeaders) { this.requestHeaders.clear(); @@ -104,6 +99,7 @@ public ProxySettings getChainedProxy() { * Sets chained proxy settings, if any. That is, when the sniffer proxy * has to itself use a proxy. * @param chainedProxy chained proxy settings + * @return this * @since 3.1.0 */ public HttpSnifferConfig setChainedProxy(ProxySettings chainedProxy) { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandler.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandler.java index d09c593dc..07f00cde7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandler.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandler.java @@ -47,22 +47,8 @@ * Screenshot images can be stored in a document metadata/field or * in a local directory. *

    - * - * {@nx.xml.usage - * (Optional selector of element to capture.) - * {@nx.include com.norconex.crawler.web.fetch.util.DocImageHandler@nx.xml.usage} - * } - * - *

    - * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). - *
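In code, the screenshot options are reached through the handler's configuration object. A sketch, assuming the chained setters for the cssSelector and imageFormat fields referenced elsewhere in this patch:

    import com.norconex.crawler.web.fetch.impl.webdriver.ScreenshotHandler;

    // Sketch only: getConfiguration() and the setters below follow this patch's usage.
    public class ScreenshotHandlerSketch {
        public static void main(String[] args) {
            var handler = new ScreenshotHandler();
            handler.getConfiguration()
                    .setCssSelector("#main-content") // hypothetical element to capture
                    .setImageFormat("png");
        }
    }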

    - * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @ToString @EqualsAndHashCode @Slf4j @@ -92,9 +78,8 @@ public void takeScreenshot(WebDriver driver, Doc doc) { imageHandler.setConfiguration(configuration); try (InputStream in = streamFactory.newInputStream( - new ByteArrayInputStream( - ((TakesScreenshot) driver) - .getScreenshotAs(OutputType.BYTES)))) { + new ByteArrayInputStream(((TakesScreenshot) driver) + .getScreenshotAs(OutputType.BYTES)))) { // If wanting a specific web element: if (StringUtils.isNotBlank(configuration.getCssSelector())) { @@ -107,24 +92,19 @@ public void takeScreenshot(WebDriver driver, Doc doc) { location.x, location.y, size.width, size.height); var img = new MutableImage(in); img.crop(rectangle); - imageHandler.handleImage( - img.toInputStream( - ofNullable( - getConfiguration() - .getImageFormat()) - .orElse("png")), + imageHandler.handleImage(img.toInputStream( + ofNullable(getConfiguration().getImageFormat()) + .orElse("png")), doc); } else { imageHandler.handleImage(in, doc); } } catch (Exception e) { if (LOG.isDebugEnabled()) { - LOG.error( - "Could not take screenshot of: {}", + LOG.error("Could not take screenshot of: {}", doc.getReference(), e); } else { - LOG.error( - "Could not take screenshot of: {}. Error:\n{}", + LOG.error("Could not take screenshot of: {}. Error:\n{}", doc.getReference(), ExceptionUtil.getFormattedMessages(e)); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java index a8632b22f..2302ced57 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java @@ -17,8 +17,6 @@ import java.nio.file.Path; import java.nio.file.Paths; -import org.openqa.selenium.WebDriver; - import com.norconex.crawler.core.doc.CrawlDocMetadata; import com.norconex.crawler.web.fetch.util.DocImageHandlerConfig; @@ -27,27 +25,10 @@ /** *

    - * Takes screenshot of pages using a Selenium {@link WebDriver}. - * Either the entire page, or a specific DOM element. - * Screenshot images can be stored in a document metadata/field or - * in a local directory. - *

    - * - * {@nx.xml.usage - * (Optional selector of element to capture.) - * {@nx.include com.norconex.crawler.web.fetch.util.DocImageHandler@nx.xml.usage} - * } - * - *

    - * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). + * Configuration for {@link ScreenshotHandler}. *

    - * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class ScreenshotHandlerConfig extends DocImageHandlerConfig { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java index 1507eff02..804841eee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java @@ -14,7 +14,6 @@ */ package com.norconex.crawler.web.fetch.impl.webdriver; -import static java.time.Duration.ofMillis; import static java.util.Optional.ofNullable; import java.io.InputStream; @@ -92,93 +91,6 @@ * Browsers/WebDriver implementations. *

    * - * {@nx.xml.usage - * - * - * [chrome|edge|firefox|opera|safari] - * - * - * (browser executable or blank to detect) - * (driver executable or blank to detect) - * - * - * (URL of the remote web driver cluster) - * - * - * - * (capability value) - * - * - * - * - * - * {@nx.include com.norconex.crawler.web.fetch.impl.webdriver.ScreenshotHandler@nx.xml.usage} - * - * - * (Optional. Browser window dimensions. E.g., 640x480) - * - * - * (Optional JavaScript code to be run the moment a page is requested.) - * - * - * (Optional JavaScript code to be run after we are done - * waiting for a page.) - * - * - * - * - * (Web driver max wait time for a page to load.) - * - * - * (Web driver max wait time for an element to appear. See - * "waitForElement".) - * - * - * (Web driver max wait time for a scripts to execute.) - * - * - * (Max wait time for an element to show up in browser before returning. - * Default 'type' is 'tagName'.) - * - * - * (Makes the current thread sleep for the specified duration, to - * give the web driver enough time to load the page. - * Sometimes necessary for some web driver implementations if the above - * options do not work.) - * - * - * {@nx.include com.norconex.crawler.core.fetch.AbstractFetcher#referenceFilters} - * - * - * - * {@nx.include com.norconex.crawler.web.fetch.impl.webdriver.HttpSnifferConfig@nx.xml.usage} - * - * - * - * } - * - * {@nx.xml.example - * - * firefox - * /drivers/geckodriver.exe - * - * - * .*dynamic.*$ - * - * - * - * } - * - *

    The above example will use Firefox to crawl dynamically generated - * pages using a specific web driver. - *

    - * * @since 3.0.0 */ @SuppressWarnings("javadoc") @@ -334,9 +246,8 @@ public HttpFetchResponse fetch(HttpFetchRequest req) .builder() .crawlDocState(CrawlDocState.NEW) .statusCode(200) - .reasonPhrase( - "No exception thrown, but real status code " - + "unknown. Capture headers for real status code.") + .reasonPhrase("No exception thrown, but real status code " + + "unknown. Capture headers for real status code.") .userAgent(getUserAgent()) .build(); } @@ -387,39 +298,30 @@ protected InputStream fetchDocumentContent(String url) { } var timeouts = driver.manage().timeouts(); - if (configuration.getPageLoadTimeout() != 0) { - timeouts.pageLoadTimeout( - ofMillis(configuration.getPageLoadTimeout())); + if (configuration.getPageLoadTimeout() != null) { + timeouts.pageLoadTimeout(configuration.getPageLoadTimeout()); } - if (configuration.getImplicitlyWait() != 0) { - timeouts.implicitlyWait( - ofMillis(configuration.getImplicitlyWait())); + if (configuration.getImplicitlyWait() != null) { + timeouts.implicitlyWait(configuration.getImplicitlyWait()); } - if (configuration.getScriptTimeout() != 0) { - timeouts.scriptTimeout( - ofMillis(configuration.getScriptTimeout())); + if (configuration.getScriptTimeout() != null) { + timeouts.scriptTimeout(configuration.getScriptTimeout()); } - if (configuration.getWaitForElementTimeout() != 0 + if (configuration.getWaitForElementTimeout() != null && StringUtils.isNotBlank( configuration.getWaitForElementSelector())) { var elType = ObjectUtils.defaultIfNull( configuration.getWaitForElementType(), WaitElementType.TAGNAME); - LOG.debug( - "Waiting for element '{}' of type '{}' for '{}'.", + LOG.debug("Waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); var wait = new WebDriverWait( - driver, ofMillis(configuration.getWaitForElementTimeout())); - wait.until( - ExpectedConditions.presenceOfElementLocated( - elType.getBy( - configuration - .getWaitForElementSelector()))); - - LOG.debug( - "Done waiting for element '{}' of type '{}' for '{}'.", + driver, configuration.getWaitForElementTimeout()); + wait.until(ExpectedConditions.presenceOfElementLocated( + elType.getBy(configuration.getWaitForElementSelector()))); + LOG.debug("Done waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); } @@ -428,8 +330,8 @@ protected InputStream fetchDocumentContent(String url) { configuration.getLatePageScript()); } - if (configuration.getThreadWait() != 0) { - Sleeper.sleepMillis(configuration.getThreadWait()); + if (configuration.getThreadWait() != null) { + Sleeper.sleepMillis(configuration.getThreadWait().toMillis()); } var pageSource = driver.getPageSource(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java index d7a027955..47d9cf6ee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java @@ -17,6 +17,7 @@ import java.awt.Dimension; import java.net.URL; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -64,11 +65,32 @@ By getBy(String selector) { } } + /** + * The browser used for crawling. Also defines which WebDriver to use. 
+ * Default is Firefox. + */ private Browser browser = Browser.FIREFOX; - // Default will try to detect driver installation on OS + /** + * Local path to driver executable or null to attempt + * automatic detection of the driver path. + * See web driver vendor documentation for the location facilitating + * detection. + * Use {@link #setRemoteURL(URL)} instead when using + * a remote web driver cluster. + */ private Path driverPath; - // Default will try to detect browser installation on OS + /** + * Local path to browser executable or null to attempt + * automatic browser path detection. See browser vendor documentation + * for the expected browser installed location. + * Use {@link #setRemoteURL(URL)} instead when using + * a remote web driver cluster. + */ private Path browserPath; + /** + * URL of a remote WebDriver cluster. Alternative to using a local + * browser and local web driver. + */ private URL remoteURL; /** @@ -81,31 +103,95 @@ By getBy(String selector) { */ private boolean useHtmlUnit; + /** + * Optionally setup an HTTP proxy that allows to set and capture HTTP + * headers. For advanced use only. + */ private HttpSniffer httpSniffer; + + /** + * When configured, takes screenshots of each web pages. + */ private ScreenshotHandler screenshotHandler; + /** + * Optional capabilities (configuration options) for the web driver. + * Many are specific to each browser or web driver. Refer to vendor + * documentation. + */ private final Map capabilities = new HashMap<>(); + /** + * Optional command-line arguments supported by some web driver or browser. + */ private final List arguments = new ArrayList<>(); + /** + * Optionally set the browser window dimensions. E.g., 640x480. + */ private Dimension windowSize; + /** + * Optional JavaScript code to be run the moment a page is requested. + */ private String earlyPageScript; + /** + * Optional JavaScript code to be run after we are done waiting for a page. + */ private String latePageScript; - private long pageLoadTimeout; - private long implicitlyWait; - private long scriptTimeout; - private long threadWait; + /** + * Web driver max wait time for a page to load. + */ + private Duration pageLoadTimeout; + /** + * Web driver max wait time for an element to appear. See + * {@link #getWaitForElementSelector()}. + */ + private Duration implicitlyWait; + /** + * Web driver max wait time for a scripts to execute. + */ + private Duration scriptTimeout; + /** + * Makes the current thread sleep for the specified duration, to + * give the web driver enough time to load the page. + * Sometimes necessary for some web driver implementations when preferable + * options fail. + */ + private Duration threadWait; + /** + * The type of reference to use when waiting for an element. + */ private WaitElementType waitForElementType; + /** + * Reference to an element to wait for. The nature of the reference itself + * is defined by {@link #getWaitForElementType()}. + */ private String waitForElementSelector; - private long waitForElementTimeout; + /** + * Max wait time for an element to show up in browser before returning. + * Default 'type' is 'tagName'. + */ + private Duration waitForElementTimeout; - public Map getCapabilities( - Map capabilities) { + /** + * Gets optional capabilities (configuration options) for the web driver. + * Many are specific to each browser or web driver. Refer to vendor + * documentation. 
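With the switch from millisecond longs to java.time.Duration, the wait-related settings can be expressed as below. A sketch only, with arbitrary values and accessors assumed from the Duration fields above:

    import java.time.Duration;

    import com.norconex.crawler.web.fetch.impl.webdriver.WebDriverHttpFetcherConfig;

    // Sketch only: arbitrary values; accessor names follow the fields in this patch.
    public class WebDriverConfigSketch {
        public static void main(String[] args) {
            var cfg = new WebDriverHttpFetcherConfig()
                    .setPageLoadTimeout(Duration.ofSeconds(30))
                    .setImplicitlyWait(Duration.ofSeconds(1))
                    .setScriptTimeout(Duration.ofSeconds(10))
                    .setWaitForElementSelector("p")     // default element type is tagName
                    .setWaitForElementTimeout(Duration.ofSeconds(10))
                    .setThreadWait(Duration.ofMillis(500));
            System.out.println(cfg.getPageLoadTimeout());
        }
    }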
+ * @return capabilities + */ + public Map getCapabilities() { return Collections.unmodifiableMap(capabilities); } + /** + * Sets optional capabilities (configuration options) for the web driver. + * Many are specific to each browser or web driver. Refer to vendor + * documentation. + * @param capabilities web driver capabilities + * @return this + */ public WebDriverHttpFetcherConfig setCapabilities( Map capabilities) { CollectionUtil.setAll(this.capabilities, capabilities); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandler.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandler.java index 413138da9..05569de46 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandler.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandler.java @@ -41,23 +41,6 @@ * being itself an image). Examples can be screenshots, featured image, etc. * Images can be stored in a document metadata/field or in a local directory. *
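For the directory target, a typical setup might look like the following sketch. The setter names (targetDir, targetDirStructure, imageFormat) are assumed from the fields referenced in the removed code and documentation, not confirmed by this patch:

    import java.nio.file.Paths;

    import com.norconex.crawler.web.fetch.util.DocImageHandler;
    import com.norconex.crawler.web.fetch.util.DocImageHandlerConfig.DirStructure;

    // Sketch only: setter names assumed from the fields in the removed code/documentation.
    public class DocImageHandlerSketch {
        public static void main(String[] args) {
            var handler = new DocImageHandler();
            handler.getConfiguration()
                    .setTargetDir(Paths.get("./images"))          // hypothetical output directory
                    .setTargetDirStructure(DirStructure.DATETIME) // one folder per date/time
                    .setImageFormat("png");
        }
    }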

    - * - * {@nx.xml.usage - * [metadata|directory] (One or both, separated by comma.) - * (Image format. Default is "png".) - * - * - * (Local directory where to save images.) - * - * - * (Document field where to store the image.) - * } - *

    - * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. - *

    * @since 3.0.0 */ @Slf4j @@ -70,42 +53,8 @@ public class DocImageHandler implements Configurable { @NonNull private DocImageHandlerConfig configuration = new DocImageHandlerConfig(); - // public enum Target { METADATA, DIRECTORY } - // public enum DirStructure { URL2PATH, DATE, DATETIME } - // public static final String DEFAULT_IMAGE_FORMAT = "png"; - // - // protected static final List DEFAULT_TYPES = - // List.of(Target.DIRECTORY) ; - // - //// @EqualsAndHashCode.Exclude - //// @ToString.Exclude - // private final List targets = new ArrayList<>(DEFAULT_TYPES); - // private Path targetDir; - // private String targetDirField; - // private DirStructure targetDirStructure = DirStructure.DATETIME; - // private String targetMetaField; - // private String imageFormat = DEFAULT_IMAGE_FORMAT; - private final ImageTransformer imgTransformer = new ImageTransformer(); - // public DocImageHandler( - // Path defaultDir, - // String defaultDirField, - // String defaultMetaField) { - // targetDir = defaultDir; - // targetDirField = defaultDirField; - // targetMetaField = defaultMetaField; - // } - // - // public DocImageHandler() {} - // - // public List getTargets() { - // return Collections.unmodifiableList(targets); - // } - // public void setTargets(List targets) { - // CollectionUtil.setAll(this.targets, targets); - // } - public void handleImage(InputStream imageStream, Doc doc) { //TODO check for null and: diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java index 4b5cfc8e7..eea58f01d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/DocImageHandlerConfig.java @@ -26,26 +26,7 @@ /** *

    - * Handles images associated with a document (which is different than a document - * being itself an image). Examples can be screenshots, featured image, etc. - * Images can be stored in a document metadata/field or in a local directory. - *

    - * - * {@nx.xml.usage - * [metadata|directory] (One or both, separated by comma.) - * (Image format. Default is "png".) - * - * - * (Local directory where to save images.) - * - * - * (Document field where to store the image.) - * } - *

    - * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. + * Configuration for {@link DocImageHandler}. *

    * @since 3.0.0 */ @@ -54,11 +35,34 @@ public class DocImageHandlerConfig { public enum Target { - METADATA, DIRECTORY + /** + * Store image in metadata field. + */ + METADATA, + /** + * Store image on local directory. + */ + DIRECTORY } + /** + * Directory structure when storing images on disk. + */ public enum DirStructure { - URL2PATH, DATE, DATETIME + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g., 2000/12/31/). + */ + DATE, + /** + * Create directories for each date and time, up to seconds + * (e.g., 2000/12/31/13/34/12/). + */ + DATETIME } public static final String DEFAULT_IMAGE_FORMAT = "png"; diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java index e6f30ff5f..15dd31f04 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java @@ -14,11 +14,15 @@ */ package com.norconex.crawler.web.fetch.util; -import java.io.UnsupportedEncodingException; +import static com.norconex.crawler.web.fetch.util.GenericRedirectUrlProviderConfig.DEFAULT_FALLBACK_CHARSET; +import static java.util.Optional.ofNullable; +import static org.apache.commons.lang3.StringUtils.substringAfterLast; +import static org.apache.commons.lang3.StringUtils.trimToNull; + import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; +import java.nio.charset.Charset; -import org.apache.commons.lang3.StringUtils; +import org.apache.hc.core5.http.Header; import org.apache.hc.core5.http.HttpHeaders; import org.apache.hc.core5.http.HttpRequest; import org.apache.hc.core5.http.HttpResponse; @@ -26,11 +30,11 @@ import org.apache.hc.core5.http.protocol.HttpCoreContext; import org.apache.tika.utils.CharsetUtils; +import com.norconex.commons.lang.config.Configurable; import com.norconex.commons.lang.url.HttpURL; -import com.norconex.commons.lang.xml.Xml; -import com.norconex.commons.lang.xml.XmlConfigurable; import lombok.Data; +import lombok.Getter; import lombok.extern.slf4j.Slf4j; /** @@ -80,34 +84,18 @@ * * * - * {@nx.xml.usage - * - * } - * - * {@nx.xml.example - *
    - * 
    - * }
    - * 

    - * The above example sets the default character encoding to be "ISO-8859-1" - * when it could not be detected. - *
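The fallback character set is now set through the provider's configuration object, mirroring the updated test further down in this patch. A minimal sketch:

    import java.nio.charset.StandardCharsets;

    import com.norconex.crawler.web.fetch.util.GenericRedirectUrlProvider;

    // Sketch: fallback charset used when none can be detected from the redirect response.
    public class RedirectUrlProviderSketch {
        public static void main(String[] args) {
            var provider = new GenericRedirectUrlProvider();
            provider.getConfiguration()
                    .setFallbackCharset(StandardCharsets.ISO_8859_1);
        }
    }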

    - * * @since 2.4.0 */ @Slf4j @Data -public class GenericRedirectUrlProvider - implements RedirectUrlProvider, XmlConfigurable { - - public static final String DEFAULT_FALLBACK_CHARSET = - StandardCharsets.UTF_8.toString(); +public class GenericRedirectUrlProvider implements + RedirectUrlProvider, Configurable { private static final int ASCII_MAX_CODEPOINT = 128; - private String fallbackCharset = DEFAULT_FALLBACK_CHARSET; + @Getter + private final GenericRedirectUrlProviderConfig configuration = + new GenericRedirectUrlProviderConfig(); @Override public String provideRedirectURL( @@ -127,30 +115,15 @@ public String provideRedirectURL( var hl = response.getLastHeader(HttpHeaders.LOCATION); if (hl == null) { //TODO should throw exception instead? - LOG.error( - "Redirect detected to a null Location for: {}", + LOG.error("Redirect detected to a null Location for: {}", originalURL); return null; } var redirectLocation = hl.getValue(); - //--- Charset --- - String charset = null; - var hc = response.getLastHeader("Content-Type"); - if (hc != null) { - var contentType = hc.getValue(); - if (contentType.contains(";")) { - charset = StringUtils.substringAfterLast( - contentType, "charset="); - } - } - if (StringUtils.isBlank(charset)) { - charset = fallbackCharset; - } - //--- Build/fix redirect URL --- var targetURL = HttpURL.toAbsolute(originalURL, redirectLocation); - targetURL = resolveRedirectURL(targetURL, charset); + targetURL = resolveRedirectURL(response, targetURL); if (LOG.isDebugEnabled()) { LOG.debug("URL redirect: {} -> {}", originalURL, targetURL); @@ -158,16 +131,17 @@ public String provideRedirectURL( return targetURL; } - //TODO is there value in moving this method to somewhere re-usable? + //MAYBE: is there value in moving this method to somewhere re-usable? private String resolveRedirectURL( - final String redirectURL, final String nonAsciiCharset) { + HttpResponse response, String redirectURL) { var url = redirectURL; // Is string containing only ASCII as it should? var isAscii = true; final var length = url.length(); - for (var offset = 0; offset < length;) { + var offset = 0; + while (offset < length) { final var codepoint = url.codePointAt(offset); if (codepoint > ASCII_MAX_CODEPOINT) { isAscii = false; @@ -184,30 +158,29 @@ private String resolveRedirectURL( Will try to fix. Redirect URL: {}""", redirectURL); // try to fix if non ascii charset is non UTF8. - if (StringUtils.isNotBlank(nonAsciiCharset)) { - var charset = CharsetUtils.clean(nonAsciiCharset); - if (!StandardCharsets.UTF_8.toString().equals(charset)) { - try { - return new String(url.getBytes(charset)); - } catch (UnsupportedEncodingException e) { - LOG.warn( - "Could not fix badly encoded URL with charset " - + "\"{}\". 
Redirect URL: {}", - charset, redirectURL, e); - } - } - } - - return new String(url.getBytes(StandardCharsets.UTF_8)); + return new String(url.getBytes(resolveCharset(response, redirectURL))); } - @Override - public void loadFromXML(Xml xml) { - setFallbackCharset(xml.getString("@fallbackCharset", fallbackCharset)); - } - - @Override - public void saveToXML(Xml xml) { - xml.setAttribute("fallbackCharset", fallbackCharset); + // Detect charset from response header or use fallback + private Charset resolveCharset(HttpResponse response, String redirectUrl) { + return ofNullable(response.getLastHeader("Content-Type")) + .map(Header::getValue) + .filter(ct -> ct.contains(";")) + .map(ct -> trimToNull(substringAfterLast(ct, "charset="))) + .map(chset -> { + try { + return CharsetUtils.forName(chset); + } catch (RuntimeException e) { + var charset = + ofNullable(configuration.getFallbackCharset()) + .orElse(DEFAULT_FALLBACK_CHARSET); + LOG.warn(""" + Could not fix badly encoded URL with charset \ + "{}". Redirect URL: "{}". Will try with \ + fallback charset: {}""", + charset, redirectUrl, charset); + return charset; + } + }).get(); } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java new file mode 100644 index 000000000..17bc2e59b --- /dev/null +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java @@ -0,0 +1,34 @@ +/* Copyright 2015-2024 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.norconex.crawler.web.fetch.util; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import lombok.Data; +import lombok.experimental.Accessors; + +/** + * Configuration for {@link GenericRedirectUrlProvider}. + */ +@Data +@Accessors(chain = true) +public class GenericRedirectUrlProviderConfig { + + public static final Charset DEFAULT_FALLBACK_CHARSET = + StandardCharsets.UTF_8; + + private Charset fallbackCharset = DEFAULT_FALLBACK_CHARSET; +} diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java index ef0a7e002..9560ec923 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java @@ -51,20 +51,6 @@ *

    If robots instructions are provided in both the HTML page and * HTTP header, the ones in HTML page will take precedence, and the * ones in HTTP header will be ignored.

    - * - * {@nx.xml.usage - * - * (string prefixing headers) - * - * } - * - * {@nx.xml.example - * - * } - *

    - * The above example ignores robot meta information. - *

    */ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java index 9ee56acf0..4bf348029 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java @@ -14,43 +14,11 @@ */ package com.norconex.crawler.web.robot.impl; -import com.norconex.crawler.web.robot.RobotsMetaProvider; - import lombok.Data; import lombok.experimental.Accessors; /** - *

    Implementation of {@link RobotsMetaProvider} as per X-Robots-Tag - * and ROBOTS standards. - * Extracts robots information from "ROBOTS" meta tag in an HTML page - * or "X-Robots-Tag" tag in the HTTP header (see - * - * https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag - * and - * - * http://www.robotstxt.org/meta.html). - *

    - * - *

    If you specified a prefix for the HTTP headers, make sure to specify it - * again here or the robots meta tags will not be found.

    - * - *

    If robots instructions are provided in both the HTML page and - * HTTP header, the ones in HTML page will take precedence, and the - * ones in HTTP header will be ignored.

    - * - * {@nx.xml.usage - * - * (string prefixing headers) - * - * } - * - * {@nx.xml.example - * - * } - *

    - * The above example ignores robot meta information. - *

    + * Configuration for {@link StandardRobotsMetaProvider}. */ @Data @Accessors(chain = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java index e104ecf72..4994e5726 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java @@ -60,18 +60,6 @@ * described at * http://www.robotstxt.org/robotstxt.html. *

    - * {@nx.xml.usage - * - * } - * - * {@nx.xml.example - *
    - * 
    - * }
    - * 

    - * The above example ignores "robots.txt" files present on web sites. - *

    */ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java index e992ffeb1..2b3f19694 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java @@ -41,23 +41,6 @@ * Default paths are: /sitemap.xml and * /sitemap_index.xml *

    - * - * {@nx.xml.usage - * - * - * - * - * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - * - * - * } */ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java index ea9ff1920..7eabace5b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java @@ -19,40 +19,13 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.web.robot.RobotsTxtProvider; import lombok.Data; import lombok.experimental.Accessors; import lombok.experimental.FieldNameConstants; /** - *

    - * If there is a sitemap defined as a start reference for the same URL web site, - * this locator is not used. Otherwise, it tells the crawler to - * use the sitemap as defined in the web site "robots.txt" file (provided - * the web site defines one and {@link RobotsTxtProvider} is enabled). - * If no sitemap resolution was possible from "robots.txt", an attempt will - * be made to retrieve a sitemap using the configured sitemap paths. - * Default paths are: /sitemap.xml and - * /sitemap_index.xml - *

    - * - * {@nx.xml.usage - * - * - * - * - * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - * - * - * } + * Configuration for {@link GenericSitemapLocator}. */ @Data @Accessors(chain = true) @@ -62,8 +35,18 @@ public class GenericSitemapLocatorConfig { public static final List DEFAULT_PATHS = List.of("/sitemap.xml", "/sitemap_index.xml"); + /** + * The domain-relative URL paths where to look for sitemaps when not + * supplied as start reference or part of a web site robots.txt file. + * Defaults to /sitemap.xml and + * /sitemap_index.xml. + */ private final List paths = new ArrayList<>(DEFAULT_PATHS); + /** + * Whether to disable checking for the sitemap locations in a web site + * robots.txt file. + */ private boolean robotsTxtSitemapDisabled; /** diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java index 481b5767f..92cc053ab 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java @@ -55,9 +55,8 @@ List parse( Xml.stream(is) .takeWhile(c -> { if (stopping.isTrue()) { - LOG.debug( - "Sitemap not entirely parsed due to " - + "crawler being stopped."); + LOG.debug("Sitemap not entirely parsed due to " + + "crawler being stopped."); return false; } return true; @@ -72,13 +71,11 @@ List parse( } }); } catch (XmlException e) { - LOG.error( - "Cannot fetch sitemap: {} -- Likely an invalid sitemap " - + "XML format causing a parsing error (actual error:{}).", + LOG.error("Cannot fetch sitemap: {} -- Likely an invalid sitemap " + + "XML format causing a parsing error (actual error:{}).", location, e.getMessage()); } catch (IOException e) { - LOG.error( - "Cannot fetch sitemap: {} ({})", + LOG.error("Cannot fetch sitemap: {} ({})", location, e.getMessage(), e); } return children; @@ -104,9 +101,8 @@ private Optional toDocRecord( // Is URL valid? if (StringUtils.isBlank(url) || (!lenient && !url.startsWith(sitemapLocationDir))) { - LOG.debug( - "Sitemap URL invalid for location directory." - + " URL: {} Location directory: {}", + LOG.debug("Sitemap URL invalid for location directory." 
+ + " URL: {} Location directory: {}", url, sitemapLocationDir); return Optional.empty(); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java index af86a6100..70e2470f9 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java @@ -73,10 +73,6 @@ static boolean shouldProcessSitemap( || cacheModifDate.isBefore(newRec.getLastModified()); } - // static ZonedDateTime now() { - // return ZonedDateTime.now(ZoneOffset.UTC); - // } - static SitemapRecord toSitemapRecord(CrawlDoc doc) { var indexRec = new SitemapRecord(); var docRec = Web.docContext(doc); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java index 2d717cb30..995461b5d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java @@ -35,7 +35,6 @@ import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector; import com.norconex.crawler.web.doc.operations.delay.DelayResolver; -import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver; @@ -62,7 +61,6 @@ public MultiValuedMap, Class> getPolymorphicTypes() { addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum"); addPolyType(map, EventListener.class, "event.listeners"); addPolyType(map, DelayResolver.class); - addPolyType(map, DelayRange.class); addPolyType( map, DocumentFilter.class, "doc.operations.filter"); //NOSONAR diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java index 64f7ceaef..60413b5e7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java @@ -16,8 +16,6 @@ import static org.apache.commons.lang3.StringUtils.substring; -import java.util.Collection; -import java.util.List; import java.util.Optional; import java.util.regex.Pattern; @@ -27,7 +25,6 @@ import com.norconex.crawler.core.Crawler; import com.norconex.crawler.core.doc.CrawlDoc; import com.norconex.crawler.core.event.CrawlerEvent; -import com.norconex.crawler.core.fetch.Fetcher; import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.WebCrawlerContext; import com.norconex.crawler.web.doc.WebCrawlDocContext; @@ -48,38 +45,17 @@ public static void fireIfUrlOutOfScope( WebCrawlDocContext docContext, UrlScope urlScope) { if (!urlScope.isInScope()) { - crawler.fire( - CrawlerEvent - .builder() - .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE) - .source(crawler) - .subject(Web.config(crawler).getUrlScopeResolver()) - .docContext(docContext) - .message(urlScope.outOfScopeReason()) - .build()); + crawler.fire(CrawlerEvent + .builder() + .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE) + .source(crawler) + .subject(Web.config(crawler).getUrlScopeResolver()) + .docContext(docContext) + .message(urlScope.outOfScopeReason()) + .build()); } } - // 
private static final BeanMapper BEAN_MAPPER = - // CrawlSessionBeanMapperFactory.create( - // WebCrawlerConfig.class, b -> - // b.unboundPropertyMapping( - // "crawler", WebCrawlerMixIn.class)); - // private static class WebCrawlerMixIn { - // @JsonDeserialize(as = WebCrawlerConfig.class) - // private CrawlerConfig configuration; - // } - - // public static BeanMapper beanMapper() { - // return BEAN_MAPPER; - // } - - // public static WebCrawlerConfig config(CrawlerConfig cfg) { - // return (WebCrawlerConfig) cfg; - // } - // public static WebCrawlerConfig config(AbstractPipelineContext ctx) { - // return (WebCrawlerConfig) Web.config(ctx.getCrawler()); - // } public static WebCrawlerConfig config(Crawler crawler) { return (WebCrawlerConfig) crawler.getConfiguration(); } @@ -88,32 +64,6 @@ public static WebCrawlerContext crawlerContext(Crawler crawler) { return (WebCrawlerContext) crawler.getContext(); } - // public static WebImporterPipelineContext importerContext( - // AbstractPipelineContext ctx) { - // return (WebImporterPipelineContext) ctx; - // } - - // //TODO move this one to core? - // public static void fire( - // Crawler crawler, - // @NonNull - // Consumer> c) { - // if (crawler != null) { - // var builder = CrawlerEvent.builder(); - // c.accept(builder); - // crawler.getEventManager().fire(builder.build()); - // } - // } - - //TODO could probably move this where needed since generically, - // we would get the fetcher wrapper directly from crawler. - public static List toHttpFetcher( - @NonNull Collection> fetchers) { - return fetchers.stream() - .map(HttpFetcher.class::cast) - .toList(); - } - public static HttpFetcher fetcher(Crawler crawler) { return (HttpFetcher) crawler.getFetcher(); } @@ -130,10 +80,9 @@ public static WebCrawlDocContext cachedDocContext( public static RobotsTxt robotsTxt(Crawler crawler, String reference) { var cfg = Web.config(crawler); return Optional.ofNullable(cfg.getRobotsTxtProvider()) - .map( - rb -> rb.getRobotsTxt( - (HttpFetcher) crawler.getFetcher(), - reference)) + .map(rb -> rb.getRobotsTxt( + (HttpFetcher) crawler.getFetcher(), + reference)) .orElse(null); } @@ -199,15 +148,14 @@ public static Properties parseDomAttributes( if (StringUtils.isBlank(attribsStr)) { return props; } - doParseDomAttributes( - attribsStr - // strip before and after angle brackets as separate steps, - // in case of weird mark-up - .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1") - .replaceFirst("(?s)^(.*?)>.*$", "$1") - .replaceAll("\\s+", " ") - .replace(" =", "=") - .replace("= ", "="), + doParseDomAttributes(attribsStr + // strip before and after angle brackets as separate steps, + // in case of weird mark-up + .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1") + .replaceFirst("(?s)^(.*?)>.*$", "$1") + .replaceAll("\\s+", " ") + .replace(" =", "=") + .replace("= ", "="), props); return props; } diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java index 073be458c..ea8041132 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java @@ -134,7 +134,7 @@ public HttpFetchResponse fetch(HttpFetchRequest req) mem.getUpsertRequests().forEach(req -> { assertThat( req.getMetadata().getInteger( - "collector.depth")).isZero(); + "crawler.depth")).isZero(); assertThat(req.getReference()).containsAnyOf( 
page1Path, page2Path, diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html similarity index 100% rename from crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html rename to crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java index 5ea5a0557..b0d93500f 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java @@ -32,6 +32,7 @@ import com.norconex.commons.lang.text.TextMatcher; import com.norconex.crawler.web.doc.WebCrawlDocContext; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency; +import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport; import lombok.extern.slf4j.Slf4j; @@ -45,10 +46,10 @@ void testWriteRead() { r.getConfiguration().setSitemapSupport(SitemapSupport.LAST); var f1 = new MinFrequency( - "reference", "monthly", + ApplyTo.REFERENCE, "monthly", TextMatcher.regex(".*\\.pdf").ignoreCase()); var f2 = new MinFrequency( - "contentType", "1234", + ApplyTo.CONTENT_TYPE, "1234", TextMatcher.regex(".*")); r.getConfiguration().setMinFrequencies(List.of(f1, f2)); @@ -72,13 +73,14 @@ void testCustomFrequency() { prevCrawl.setCrawlDate(prevCrawlDate); var f = new MinFrequency( - "reference", "120 days", TextMatcher.regex(".*")); + ApplyTo.REFERENCE, "120 days", TextMatcher.regex(".*")); r.getConfiguration().setMinFrequencies(List.of(f)); Assertions.assertFalse(r.isRecrawlable(prevCrawl)); // Delay has passed - f = new MinFrequency("reference", "5 days", TextMatcher.regex(".*")); + f = new MinFrequency( + ApplyTo.REFERENCE, "5 days", TextMatcher.regex(".*")); r.getConfiguration().setMinFrequencies(List.of(f)); Assertions.assertTrue(r.isRecrawlable(prevCrawl)); } @@ -163,10 +165,10 @@ void testIsRecrawlable( var matcher = "reference".equals(minFreqApplyTo) ? TextMatcher.basic(url) : TextMatcher.basic("text/html"); - resolver.getConfiguration().setMinFrequencies( - List.of( - new MinFrequency( - minFreqApplyTo, minFreqValue, matcher))); + resolver.getConfiguration().setMinFrequencies(List.of( + new MinFrequency("reference".equals(minFreqApplyTo) + ? 
ApplyTo.REFERENCE + : ApplyTo.CONTENT_TYPE, minFreqValue, matcher))); assertThat(resolver.isRecrawlable(prevRec)).isEqualTo(expected); } diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java index 405dc0529..3aea1ee01 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java @@ -25,6 +25,7 @@ import java.io.UncheckedIOException; import java.net.ServerSocket; import java.nio.file.Path; +import java.time.Duration; import java.util.List; import org.apache.commons.lang3.RandomStringUtils; @@ -282,11 +283,11 @@ void testResolvingUserAgent(ClientAndServer client) { // test setting a bunch of other params fetcher.getConfiguration() .setWindowSize(new java.awt.Dimension(640, 480)) - .setPageLoadTimeout(10_1000) - .setImplicitlyWait(1000) - .setScriptTimeout(10_000) + .setPageLoadTimeout(Duration.ofSeconds(10)) + .setImplicitlyWait(Duration.ofSeconds(1)) + .setScriptTimeout(Duration.ofSeconds(10)) .setWaitForElementSelector("p") - .setWaitForElementTimeout(10_000); + .setWaitForElementTimeout(Duration.ofSeconds(10)); cfg.setStartReferences(List.of(hostUrl(client, path))); }); diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java index b0be4a0db..7ca59db96 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java @@ -21,6 +21,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Paths; +import java.time.Duration; import java.util.List; import org.junit.jupiter.api.Test; @@ -45,13 +46,13 @@ void testWriteReadFetcher() throws MalformedURLException { c.setBrowserPath(Paths.get("/some/browser/path")); c.setDriverPath(Paths.get("/some/driver/path")); c.setRemoteURL(new URL("http://example.com")); - c.setImplicitlyWait(4000); + c.setImplicitlyWait(Duration.ofSeconds(4)); c.setEarlyPageScript("alert('hello init!');"); - c.setPageLoadTimeout(5000); + c.setPageLoadTimeout(Duration.ofSeconds(5)); c.setLatePageScript("alert('hello page!');"); - c.setScriptTimeout(6000); + c.setScriptTimeout(Duration.ofSeconds(6)); c.setWaitForElementSelector("#header"); - c.setWaitForElementTimeout(1234); + c.setWaitForElementTimeout(Duration.ofMillis(1234)); c.setWaitForElementType(WaitElementType.ID); c.setWindowSize(new Dimension(666, 999)); c.setCapabilities( @@ -69,12 +70,9 @@ void testWriteReadFetcher() throws MalformedURLException { "rh2", "hrval2")); c.setHttpSniffer(snif); - c.setReferenceFilters( - List.of( - configure( - new GenericReferenceFilter(), cfg -> cfg - .setValueMatcher( - TextMatcher.regex("test.*"))))); + c.setReferenceFilters(List.of(configure( + new GenericReferenceFilter(), cfg -> cfg + .setValueMatcher(TextMatcher.regex("test.*"))))); var sh = new ScreenshotHandler(); sh.getConfiguration() diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java 
b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java index f85159170..6ea9d6162 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java @@ -27,7 +27,7 @@ class GenericRedirectUrlProviderTest { @Test void testWriteRead() { var p = new GenericRedirectUrlProvider(); - p.setFallbackCharset(StandardCharsets.UTF_8.toString()); + p.getConfiguration().setFallbackCharset(StandardCharsets.UTF_8); assertThatNoException() .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(p)); } diff --git a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml index e2e838f8f..59021df4b 100644 --- a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml +++ b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml @@ -232,7 +232,7 @@ - + @@ -352,14 +352,17 @@ - + text/html dom dom 425x312 true 1234 /some/path - url, inline + + url + inline + medium 25 true diff --git a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java index 135fd09a0..eb1ba3fa1 100644 --- a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java +++ b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java @@ -214,6 +214,16 @@ public static TextMatcher imageIOStandardContentTypes() { return csv(IMAGE_IO_CONTENT_TYPES); } + /** + *
+     * Matches all content types.
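+     * A minimal usage sketch (assumed, not part of the original patch):
+     * {@code CommonMatchers.all()} simply returns
+     * {@code TextMatcher.regex(".*")}, so it matches any content type value.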
    + * @return text matcher + */ + public static TextMatcher all() { + return TextMatcher.regex(".*"); + } + private static TextMatcher csv(Set values) { return TextMatcher .csv(StringUtils.join(values, ',')) From 0f195992ace10c18e68a25d7839a8e124c62f113 Mon Sep 17 00:00:00 2001 From: essiembre Date: Sun, 8 Sep 2024 06:20:13 +0000 Subject: [PATCH 05/10] Apply Copyright year changes --- .../web/robot/impl/StandardRobotsMetaProviderConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java index 4bf348029..7900cbf5d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java @@ -1,4 +1,4 @@ -/* Copyright 2010-2023 Norconex Inc. +/* Copyright 2010-2024 Norconex Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 1cee93bdf8accd9927401283aa19ce4f10d53c79 Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Tue, 10 Sep 2024 22:22:29 -0400 Subject: [PATCH 06/10] More code coverage. --- .../link/impl/RegexLinkExtractor.java | 4 +- .../delay/impl/GenericDelayResolverTest.java | 22 +++++++ .../link/impl/RegexLinkExtractorTest.java | 60 +++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java index 235cf3a6a..7ff6146f0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java @@ -86,9 +86,9 @@ public class RegexLinkExtractor //TODO make buffer size and overlap size configurable //1MB: make configurable - private static final int MAX_BUFFER_SIZE = 1024 * 1024; + static final int MAX_BUFFER_SIZE = 1024 * 1024; // max url leng is 2048 x 2 bytes x 2 for anchor attributes. 
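// Hedged reading of the comment above (not in the original patch): roughly
// 2048 URL characters x 2 bytes x 2 for surrounding anchor markup = 8192,
// which is the OVERLAP_SIZE below, so a link split across two buffered
// reads of MAX_BUFFER_SIZE is still captured whole in one of them.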
- private static final int OVERLAP_SIZE = 2 * 2 * 2048; + static final int OVERLAP_SIZE = 2 * 2 * 2048; @Getter private final RegexLinkExtractorConfig configuration = diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java index c7211a68e..ff4881865 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java @@ -27,6 +27,7 @@ import com.norconex.commons.lang.bean.BeanMapper; import com.norconex.crawler.web.doc.operations.delay.impl.BaseDelayResolverConfig.DelayResolverScope; +import com.norconex.crawler.web.robot.RobotsTxt; class GenericDelayResolverTest { @@ -57,6 +58,27 @@ void testWriteRead() { .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(r)); } + @Test + void testNullDelays() { + var r = new GenericDelayResolver(); + r.getConfiguration() + .setScope(null); + assertThatNoException().isThrownBy( + () -> r.delay(null, "http://somewhere.com")); + + } + + @Test + void testWithRobotsTxt() { + var r = new GenericDelayResolver(); + // r.getConfiguration() + // .setScope(null); + var robotsTxt = RobotsTxt.builder().crawlDelay(1000f).build(); + assertThatNoException().isThrownBy( + () -> r.delay(robotsTxt, "http://somewhere.com")); + + } + @Test void testDelayScheduleBoundaries() { //FYI: Jan 1, 2000 was a Saturday diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java index e7111051e..0599806b7 100644 --- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java +++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java @@ -15,9 +15,11 @@ package com.norconex.crawler.web.doc.operations.link.impl; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatNoException; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -25,6 +27,7 @@ import java.util.List; import java.util.Set; +import org.apache.commons.io.input.NullInputStream; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -32,6 +35,8 @@ import com.norconex.commons.lang.bean.BeanMapper.Format; import com.norconex.commons.lang.file.ContentType; import com.norconex.commons.lang.io.CachedInputStream; +import com.norconex.commons.lang.map.PropertyMatcher; +import com.norconex.commons.lang.text.TextMatcher; import com.norconex.crawler.core.doc.CrawlDoc; import com.norconex.crawler.web.doc.WebCrawlDocContext; import com.norconex.crawler.web.doc.operations.link.Link; @@ -141,6 +146,61 @@ void testGenericWriteRead() { () -> BeanMapper.DEFAULT.assertWriteRead(extractor)); } + @Test + void testFromFieldAndRestrictions() throws IOException { + var extractor = new RegexLinkExtractor(); + var cfg = extractor.getConfiguration(); + cfg.setPatterns( + List.of(new ExtractionPattern("http:.*?\\.html", null))); + cfg.getRestrictions().add(new 
PropertyMatcher(TextMatcher.regex(".*"))); + cfg.getFieldMatcher().setPattern("myfield"); + + var doc = toCrawlDoc("n/a", + ContentType.TEXT, + NullInputStream.nullInputStream()); + doc.getMetadata().set("myfield", + "http://one.com/1.html|http://two.com/2.html|NOT_ME"); + var links = extractor.extractLinks(doc); + assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder( + "http://one.com/1.html", "http://two.com/2.html"); + + cfg.clearPatterns(); + cfg.clearRestrictions(); + cfg.setContentTypeMatcher(TextMatcher.basic("application/pdf")); + links = extractor.extractLinks(doc); + assertThat(links).isEmpty(); + } + + @Test + void testNoRestrictionMatch() throws IOException { + var extractor = new RegexLinkExtractor(); + var cfg = extractor.getConfiguration(); + cfg.getRestrictions().add( + new PropertyMatcher(TextMatcher.regex("NOPE"))); + + var doc = toCrawlDoc("n/a", + ContentType.TEXT, + NullInputStream.nullInputStream()); + var links = extractor.extractLinks(doc); + assertThat(links).isEmpty(); + } + + @Test + void testLargeContent() throws IOException { + var doc = toCrawlDoc("n/a", ContentType.TEXT, new ByteArrayInputStream( + ("http://one.com/1.html" + + "X".repeat(RegexLinkExtractor.MAX_BUFFER_SIZE) + + "http://two.com/2.html" + "X".repeat( + RegexLinkExtractor.MAX_BUFFER_SIZE)) + .getBytes())); + var extractor = new RegexLinkExtractor(); + extractor.getConfiguration().setPatterns( + List.of(new ExtractionPattern("http:.*?\\.html", null))); + var links = extractor.extractLinks(doc); + assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder( + "http://one.com/1.html", "http://two.com/2.html"); + } + private boolean contains(Set links, String url) { for (Link link : links) { if (url.equals(link.getUrl())) { From ba0c2a61c3a9fbae1b0e873edbf6d165b84433ad Mon Sep 17 00:00:00 2001 From: Pascal Essiembre Date: Wed, 11 Sep 2024 00:06:45 -0400 Subject: [PATCH 07/10] Code coverage. --- crawler/web/pom.xml | 7 ++ .../com/norconex/crawler/web/WebCrawler.java | 40 ++++++++-- .../UrlStatusCrawlerEventListener.java | 4 +- .../norconex/crawler/web/WebCrawlerTest.java | 33 +++++++++ .../impl/HtmlDomTikaLinkExtractorTest.java | 24 ++++++ .../UrlStatusCrawlerEventListenerTest.java | 74 +++++++++++++++---- .../crawler/web/stubs/CrawlerStubs.java | 11 +-- 7 files changed, 161 insertions(+), 32 deletions(-) create mode 100644 crawler/web/src/test/java/com/norconex/crawler/web/WebCrawlerTest.java diff --git a/crawler/web/pom.xml b/crawler/web/pom.xml index d3074e379..85cdf08df 100644 --- a/crawler/web/pom.xml +++ b/crawler/web/pom.xml @@ -214,6 +214,13 @@ selenium test +