From d8d6d653344f58b4ae02e130588b8bf049414ecb Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
- * The following should be shared across concrete implementations
- * (which can add more configurable attributes and tags).
- *
- * XML configuration usage:
- *
- *
- * The following should be shared across concrete implementations
- * (which can add more configurable attributes and tags).
- *
- * {@nx.xml
- * @param ignoreRobotsCrawlDelay true if ignoring
- *     robots.txt crawl delay
- * @return true if ignoring robots.txt crawl delay
+ * file. Not applicable when robots.txt are ignored.
*/
private boolean ignoreRobotsCrawlDelay = false;
/**
- * Gets the delay scope.
- * @param scope one of "crawler", "site", or "thread".
- * @return delay scope
+ * Gets the delay scope. See class documentation for a description
+ * of supported scopes.
*/
private DelayResolverScope scope = DEFAULT_SCOPE;
}
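For illustration, a minimal usage sketch of the delay options above, assuming the Lombok-generated chained setters implied by the surrounding configuration classes and that these fields surface on GenericDelayResolverConfig. The DelayResolverScope constants are inferred from the "crawler", "site", and "thread" scopes named in the removed Javadoc and are assumptions, not part of this patch:

    import com.norconex.crawler.web.doc.operations.delay.impl.GenericDelayResolverConfig;
    // DelayResolverScope package is not shown in this patch; import assumed.

    class DelayConfigSketch {
        // Sketch only: setter names derive from the fields shown above.
        static GenericDelayResolverConfig siteScopedDelay() {
            return new GenericDelayResolverConfig()
                    .setScope(DelayResolverScope.SITE)      // apply delay per site
                    .setIgnoreRobotsCrawlDelay(true);       // skip robots.txt crawl-delay
        }
    }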
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
index fb8a72f4f..4a5e1ff96 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/CrawlerDelay.java
@@ -18,9 +18,14 @@
import com.norconex.commons.lang.Sleeper;
+import lombok.EqualsAndHashCode;
+import lombok.ToString;
+
/**
* It is assumed there will be one instance of this class per crawler defined.
*/
+@EqualsAndHashCode(onlyExplicitlyIncluded = true)
+@ToString(onlyExplicitlyIncluded = true)
public class CrawlerDelay extends AbstractDelay {
private MutableLong lastHitEpochNanos = new MutableLong(-1);
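The crawler-wide behaviour implied here — every thread consulting the same last-hit timestamp so the minimum delay applies across the whole crawler — can be sketched roughly as follows. This is an illustrative sketch only, not the actual CrawlerDelay implementation from this patch:

    import org.apache.commons.lang3.mutable.MutableLong;

    // Every thread consults and updates the same timestamp, so the minimum
    // gap between hits is enforced across the whole crawler.
    class SharedDelaySketch {
        private final MutableLong lastHitEpochNanos = new MutableLong(-1);

        void delay(long minGapNanos) throws InterruptedException {
            long sleepNanos;
            synchronized (lastHitEpochNanos) {
                long now = System.nanoTime();
                long last = lastHitEpochNanos.longValue();
                sleepNanos = last < 0 ? 0 : Math.max(0, minGapNanos - (now - last));
                // Reserve the next allowed hit time for whoever comes next.
                lastHitEpochNanos.setValue(now + sleepNanos);
            }
            if (sleepNanos > 0) {
                Thread.sleep(sleepNanos / 1_000_000L, (int) (sleepNanos % 1_000_000L));
            }
        }
    }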
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
index 3d791d873..c1157cac0 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/DelayRange.java
@@ -18,11 +18,13 @@
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
+import lombok.experimental.FieldNameConstants;
@Data
@Accessors(chain = true)
@AllArgsConstructor
@NoArgsConstructor
+@FieldNameConstants
public class DelayRange
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- * The above example set the minimum delay between each document download
- * on a given site to 5 seconds, no matter what the crawler robots.txt may
- * say, except on weekend, where it is more agressive (1 second).
- *
 */
@EqualsAndHashCode
@ToString
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
index 456e54095..0b04c9da6 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverConfig.java
@@ -19,80 +19,13 @@
import java.util.List;
import com.norconex.commons.lang.collection.CollectionUtil;
-import com.norconex.commons.lang.time.DurationParser;
import lombok.Data;
import lombok.experimental.Accessors;
/**
*
- * Default implementation for creating voluntary delays between URL downloads.
- * There are a few ways the actual delay value can be defined (in order):
- *
- *
- * In a delay schedule, the days of weeks are spelled out (in English):
- * Monday, Tuesday, etc. Time ranges are using the 24h format.
- *
- *
- * One of these following scope dictates how the delay is applied, listed
- * in order from the best behaved to the least.
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- * The above example set the minimum delay between each document download
- * on a given site to 5 seconds, no matter what the crawler robots.txt may
- * say, except on weekend, where it is more agressive (1 second).
+ * Configuration for {@link GenericDelayResolver}.
*
*/
@Data
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
index ed16d5727..89bae0f9f 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolver.java
@@ -16,8 +16,6 @@
import java.time.Duration;
-import com.norconex.commons.lang.time.DurationParser;
-
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.ToString;
@@ -54,37 +52,6 @@
* any given thread. The more threads you have the less of an
* impact the delay will have.
*
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- *
- *
- * }
- *
- * The above examlpe will increase the delay to 10 seconds when encountering
- * PDFs from a default of 3 seconds.
- *
- *
* @since 2.5.0
*/
@EqualsAndHashCode
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
index c06a2aeaf..9ecde8f5a 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/delay/impl/ReferenceDelayResolverConfig.java
@@ -19,74 +19,14 @@
import java.util.List;
import com.norconex.commons.lang.collection.CollectionUtil;
-import com.norconex.commons.lang.time.DurationParser;
import lombok.Data;
import lombok.experimental.Accessors;
/**
*
- * Introduces different delays between document downloads based on matching
- * document reference (URL) patterns.
- * There are a few ways the actual delay value can be defined (in order):
+ * Configuration for {@link ReferenceDelayResolver}.
*
- *- *
- * - Takes the delay specify by a robots.txt file.
- *   Only applicable if robots.txt files and its robots crawl delays
- *   are not ignored.
- * - Takes the delay matching a reference pattern, if any (picks the first
- *   one matching).
- * - Used the specified default delay or 3 seconds, if none is
- *   specified.
- *
- * One of these following scope dictates how the delay is applied, listed
- * in order from the best behaved to the least.
- *
- *
- * As of 2.7.0, XML configuration entries expecting millisecond durations
- * can be provided in human-readable format (English only), as per
- * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s").
- *
- *
- * {@nx.xml.usage
- *
- *
- *
- * }
- *
- * The above examlpe will increase the delay to 10 seconds when encountering
- * PDFs from a default of 3 seconds.
- *
- * * @since 2.5.0 */ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java index 62fcffbda..190650609 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilter.java @@ -52,24 +52,6 @@ * Whenduplicate
istrue
, it will count the maximum * number of duplicate segments found. * - * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(a regex identifying segment separator) - *- * } - * - * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *
- * * @since 1.2 * @see Pattern */ diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java index 74bec03fe..230faa072 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/filter/impl/SegmentCountUrlFilterConfig.java @@ -14,8 +14,6 @@ */ package com.norconex.crawler.web.doc.operations.filter.impl; -import java.util.regex.Pattern; - import com.norconex.crawler.core.doc.operations.filter.OnMatch; import lombok.Data; @@ -23,40 +21,9 @@ /** *- * Filters URL based based on the number of URL segments. A URL with - * a number of segments equal or more than the specified count will either - * be included or excluded, as specified. - *
- *- * By default - * segments are obtained by breaking the URL text at each forward slashes - * (/), starting after the host name. You can define different or - * additional segment separator characters. + * Configuration for {@link SegmentCountUrlFilter}. *
- *- * When
- * - * {@nx.xml.usage - *duplicate
istrue
, it will count the maximum - * number of duplicate segments found. - *- * - * } - * - * {@nx.xml.example - *(a regex identifying segment separator) - *- * } - * - * The above example will reject URLs with more than 5 forward slashes after - * the domain. - *
- * * @since 1.2 - * @see Pattern */ @Data @Accessors(chain = true) @@ -67,8 +34,24 @@ public class SegmentCountUrlFilterConfig { /** Default segment count. */ public static final int DEFAULT_SEGMENT_COUNT = 10; + /** + * Number of segments after which this filter is considered a match. + * Default is {@value #DEFAULT_SEGMENT_COUNT} + */ private int count = DEFAULT_SEGMENT_COUNT; + /** + * Whether the configured segment count represents the number of + * duplicated segments for this filter to be considered a match. + */ private boolean duplicate; + /** + * Segment separator. Default is + * {@value #DEFAULT_SEGMENT_SEPARATOR_PATTERN}. + */ private String separator = DEFAULT_SEGMENT_SEPARATOR_PATTERN; + + /** + * Action to undertake when there is a match. + */ private OnMatch onMatch; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java deleted file mode 100644 index aa657f570..000000000 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorConfig.java +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2017-2024 Norconex Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.norconex.crawler.web.doc.operations.image.impl; - -import java.awt.Dimension; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import org.imgscalr.Scalr.Method; - -import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.core.doc.CrawlDocMetadata; - -import lombok.Data; -import lombok.Getter; -import lombok.experimental.Accessors; - -/** - *- * Document processor that extract the "main" image from HTML pages. - * Since HTML is expected, this class should only be used at - * pre-import processor. It is possible for this processor to not find any - * image. - *
- * - *Finding the image
- *- * By default this class will get the first image (<img>) matching - * the minimum size. You can specify you want the largest of all matching - * ones instead. In addition, if you know your images to be defined - * in a special way (e.g., all share the same CSS class), then you can use - * the "domSelector" to limit to one or a few images. See - * - * JSoup selector-syntax for how to build the "domSelector". - *
- * - *Storing the image
- *- * One or more storage method can be specified. Here are - * the possible storage options: - *
- *
collector.featured-image-url
field.
- * When only this option is set, scaling options and image format
- * have no effect.
- * collector.featured-image-inline
field.
- * The string is ready to be
- * used inline, in a <img src="..."> tag.
- * collector.featured-image-path
field.
- * [width]x[height]
- * or a single value. When a single value is used, that value represents both
- * the width and height (i.e., a square).
- *
- * {@nx.xml.example
- * - * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *
- * - * @since 2.8.0 - */ -@SuppressWarnings("javadoc") -@Data -@Accessors(chain = true) -public class FeaturedImageProcessorConfig { - - public static final String COLLECTOR_FEATURED_IMAGE_URL = - CrawlDocMetadata.PREFIX + "featured-image-url"; - public static final String COLLECTOR_FEATURED_IMAGE_PATH = - CrawlDocMetadata.PREFIX + "featured-image-path"; - public static final String COLLECTOR_FEATURED_IMAGE_INLINE = - CrawlDocMetadata.PREFIX + "featured-image-inline"; - - public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = - "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; - public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; - - /** - * Default image cache directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_IMAGE_CACHE_DIR = - "featuredImageCache"; - /** - * Default featured image directory, relative to the crawler working - * directory. - */ - public static final String DEFAULT_STORAGE_DISK_DIR = - "featuredImages"; - - public static final String DEFAULT_IMAGE_FORMAT = "png"; - public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); - public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); - public static final Storage DEFAULT_STORAGE = Storage.URL; - public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = - StorageDiskStructure.URL2PATH; - - public enum Storage { - URL, INLINE, DISK - } - - public enum StorageDiskStructure { - URL2PATH, DATE, DATETIME - } - - public enum Quality { - AUTO(Method.AUTOMATIC), - LOW(Method.SPEED), - MEDIUM(Method.BALANCED), - HIGH(Method.QUALITY), - MAX(Method.ULTRA_QUALITY); - - @Getter - private final Method scalrMethod; - - Quality(Method scalrMethod) { - this.scalrMethod = scalrMethod; - } - } - - private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN; - private String domSelector; - private Dimension minDimensions = DEFAULT_MIN_SIZE; - private Dimension scaleDimensions = DEFAULT_SCALE_SIZE; - private boolean scaleStretch; - private String imageFormat = DEFAULT_IMAGE_FORMAT; - private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE; - - private Path imageCacheDir; - private boolean largest; - private final List* Document processor that extract the "main" image from HTML pages. - * Since HTML is expected, this class should only be used at + * Since HTML is expected, this class should only be used as a * pre-import processor. It is possible for this processor to not find any * image. *
@@ -93,135 +93,21 @@ * *- * One or more storage method can be specified. Here are - * the possible storage options: + * When identified, the featured image can be stored either on local disk, + * or as a metadata field in Base64 format, or simply as a URL pointing + * to its remote location. See {@link FeaturedImageResolverConfig} for details. *
- *collector.featured-image-url
field.
- * When only this option is set, scaling options and image format
- * have no effect.
- * collector.featured-image-inline
field.
- * The string is ready to be
- * used inline, in a <img src="..."> tag.
- * collector.featured-image-path
field.
- * [width]x[height]
- * or a single value. When a single value is used, that value represents both
- * the width and height (i.e., a square).
- *
- * {@nx.xml.example
- * - * The above example extracts the first image being 300x400 or larger, scaling - * it down to be 50x50 and storing it as an inline JPEG in a document field, - * preserving aspect ratio and using the best quality possible. - *
- * * @since 2.8.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString @Slf4j -public class FeaturedImageProcessor +public class FeaturedImageResolver extends CrawlerLifeCycleListener implements DocumentConsumer, - Configurable+ * Configuration for {@link FeaturedImageResolver}. + *
+ * @since 2.8.0 + */ +@Data +@Accessors(chain = true) +public class FeaturedImageResolverConfig { + + public static final String FEATURED_IMAGE_URL_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-url"; + public static final String FEATURED_IMAGE_PATH_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-path"; + public static final String FEATURED_IMAGE_INLINE_FIELD = + CrawlDocMetadata.PREFIX + "featured-image-inline"; + + public static final String DEFAULT_PAGE_CONTENT_TYPE_PATTERN = + "text/html|application/(xhtml\\+xml|vnd\\.wap.xhtml\\+xml|x-asp)"; + public static final int DEFAULT_IMAGE_CACHE_SIZE = 1000; + + /** + * Default image cache directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_IMAGE_CACHE_DIR = + "featuredImageCache"; + /** + * Default featured image directory, relative to the crawler working + * directory. + */ + public static final String DEFAULT_STORAGE_DISK_DIR = + "featuredImages"; + + public static final String DEFAULT_IMAGE_FORMAT = "png"; + public static final Dimension DEFAULT_MIN_SIZE = new Dimension(400, 400); + public static final Dimension DEFAULT_SCALE_SIZE = new Dimension(150, 150); + public static final Storage DEFAULT_STORAGE = Storage.URL; + public static final StorageDiskStructure DEFAULT_STORAGE_DISK_STRUCTURE = + StorageDiskStructure.URL2PATH; + + /** + * Type of featured image storages. + */ + public enum Storage { + /** + * Default storages. The absolute image URL is stored in a + * {@value #FEATURED_IMAGE_URL_FIELD} metadata field. + * When only this storages option is set, scaling options and image + * format have no effect. + */ + URL, + /** + * Stores a Base64 string of the scaled image, in the format + * specified, in a {@value #FEATURED_IMAGE_INLINE_FIELD} metadata + * field. The string is ready to be used inline, in a + * <img src="..."> tag (as an example). + */ + INLINE, + /** + * Stores the scaled image on the file system, in the format + * and directory specified. A reference to the file on disk is stored + * in a {@value #FEATURED_IMAGE_PATH_FIELD} metadata field. + */ + DISK + } + + /** + * Directory structure when storing images on disk. + */ + public enum StorageDiskStructure { + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g.,2000/12/31/
).
+ */
+ DATE,
+ /**
+ * Create directories for each date and time, up to seconds
+ * (e.g., 2000/12/31/13/34/12/
).
+ */
+ DATETIME
+ }
+
+ public enum Quality {
+ AUTO(Method.AUTOMATIC),
+ LOW(Method.SPEED),
+ MEDIUM(Method.BALANCED),
+ HIGH(Method.QUALITY),
+ MAX(Method.ULTRA_QUALITY);
+
+ @Getter
+ private final Method scalrMethod;
+
+ Quality(Method scalrMethod) {
+ this.scalrMethod = scalrMethod;
+ }
+ }
+
+ /**
+ * Optional regex to overwrite default matching of HTML pages.
+ * Default is {@value #DEFAULT_PAGE_CONTENT_TYPE_PATTERN}
+ */
+ private String pageContentTypePattern = DEFAULT_PAGE_CONTENT_TYPE_PATTERN;
+ /**
+ * Optional CSS-like path matching one or more image elements.
+ */
+ private String domSelector;
+ /**
+ * Minimum pixel size for an image to be considered. Default is 400x400.
+ */
+ private Dimension minDimensions = DEFAULT_MIN_SIZE;
+ /**
+ * Target pixel size the featured image should be scaled to.
+ * Default is 150x150.
+ */
+ private Dimension scaleDimensions = DEFAULT_SCALE_SIZE;
+ /**
+ * Whether to stretch to match scale size. Default keeps aspect ratio.
+ */
+ private boolean scaleStretch;
+ /**
+ * Target format of stored image. E.g., "jpg", "png", "gif", "bmp", ...
+ * Default is {@value #DEFAULT_IMAGE_FORMAT}
+ */
+ private String imageFormat = DEFAULT_IMAGE_FORMAT;
+ /**
+ * Maximum number of images to cache on the local file system for faster
+ * processing.
+ * Set to 0 to disable caching. Default is
+ * {@value #DEFAULT_IMAGE_CACHE_SIZE}.
+ */
+ private int imageCacheSize = DEFAULT_IMAGE_CACHE_SIZE;
+
+ /**
+ * Directory where to cache the images. Defaults to
+ * {@value #DEFAULT_IMAGE_CACHE_DIR}
+ */
+ private Path imageCacheDir;
+ /**
+ * When more than one featured image is found, whether to return the
+ * largest of them all (as opposed to the first one encountered).
+ */
+ private boolean largest;
+ /**
+ * One or more type of physical storages for the image.
+ */
+ private final ListWhen used before importing this class attempts to detect the content * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. + * was specified using + * {@link DomLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. + * Since document parsing converts content to UTF-8, UTF-8 is always assumed + * when used as a post-parse handler. *
* - *You can specify which parser to use when reading + *
You can specify which DOM parser to use when reading
* documents. The default is "html" and will normalize the content
* as HTML. This is generally a desired behavior, but this can sometimes
* have your selector fail. If you encounter this
@@ -122,7 +123,8 @@
* That information gets stored as metadata in the target document.
* If you want to limit the quantity of information extracted/stored,
* you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
+ * {@link DomLinkExtractorConfig#setIgnoreLinkData(boolean)} to
+ * true
.
*
http
, https
, and ftp
. You can
* specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
+ * {@link DomLinkExtractorConfig#setSchemes(java.util.List)}.
*
*
* - * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *
* {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} @@ -147,57 +149,11 @@ * won't be extracted (e.g. *<a href="x.html" rel="nofollow" ...>
).
* To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
+ * {@link DomLinkExtractorConfig#setIgnoreNofollow(boolean)} to true
.
*
*
- * {@nx.xml.usage
- *
- * The above example will extract URLs found in custom element attributes named
- * data-myurl
.
- *
- * Extracts links from a Document Object Model (DOM) representation of an - * HTML, XHTML, or XML document content based on values of matching - * elements and attributes. - *
- *- * In order to construct a DOM tree, text is loaded entirely - * into memory. It uses the document content by default, but it can also - * come from specified metadata fields. - * Use this filter with caution if you know you'll need to parse - * huge files. Use the {@link HtmlLinkExtractor} instead if this is a - * concern. - *
- *- * The jsoup parser library is used to load a - * document content into a DOM tree. Elements are referenced using a - * - * CSS or JQuery-like syntax. - *
- *- * This link extractor is normally used before importing. - *
- * - *When used before importing this class attempts to detect the content - * character encoding unless the character encoding - * was specified using {@link #setCharset(String)}. Since document - * parsing converts content to UTF-8, UTF-8 is always assumed when - * used as a post-parse handler. - *
- * - *You can specify which parser to use when reading - * documents. The default is "html" and will normalize the content - * as HTML. This is generally a desired behavior, but this can sometimes - * have your selector fail. If you encounter this - * problem, try switching to "xml" parser, which does not attempt normalization - * on the content. The drawback with "xml" is you may not get all HTML-specific - * selector options to work. If you know you are dealing with XML to begin - * with, specifying "xml" should be a good option. - *
- * - *- * You can define as many JSoup "selectors" as desired. All values matched - * by a selector will be extracted as a URL. - *
- *- * It is possible to control what gets extracted - * exactly for matching purposes thanks to the "extract" argument expected - * with every selector. Possible values are: - *
- * - * {@nx.include com.norconex.importer.util.DomUtil#extract} - * - *- * When not specified, the default is "text". - *
- * - *The default selectors / extract strategies are:
- *
- * For any extracted link values, this extractor will perform minimal
- * heuristics to clean extra content not part of a regular URL. For instance,
- * it will only keep what is after url=
when dealing with
- * <meta http-equiv
refresh URLs. It will also trim white
- * spaces.
- *
- * By default, contextual information is kept about the HTML/XML mark-up
- * tag from which a link is extracted (e.g., tag name and attributes).
- * That information gets stored as metadata in the target document.
- * If you want to limit the quantity of information extracted/stored,
- * you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
- *
Only valid
- *
- * schemes are extracted for absolute URLs. By default, those are
- * http
, https
, and ftp
. You can
- * specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
- *
- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * {@nx.include com.norconex.importer.handler.CommonMatchers#domContentTypes} - * - *
- * By default, a regular HTML link having the "rel" attribute set to "nofollow"
- * won't be extracted (e.g.
- * <a href="x.html" rel="nofollow" ...>
).
- * To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
- *
- * The above example will extract URLs found in custom element attributes named
- * data-myurl
.
+ * Configuration for {@link DomLinkExtractor}.
*
html
or xml
.
- * @return html
(default) or xml
.
*/
private String parser = DomUtil.PARSER_HTML;
private boolean ignoreNofollow;
/**
* Whether to ignore extra data associated with a link.
- * @param ignoreLinkData true
to ignore.
- * @return true
to ignore.
*/
private boolean ignoreLinkData;
private final List- * By default, this extractor only will be applied on documents matching + * By default, this extractor will only be applied on documents matching * one of these content types: *
* {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} ** You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. + * {@link HtmlLinkExtractorConfig#setContentTypeMatcher(com.norconex.commons.lang.text.TextMatcher)}. * Make sure they represent a file with HTML-like markup tags containing URLs. * For documents that are just * too different, consider implementing your own {@link LinkExtractor} instead. @@ -108,7 +107,8 @@ *
* The meta.http-equiv
is treated differently. Only if the
* "http-equiv" value is "refresh" and a "content" attribute with a URL exist
- * that it will be extracted. "object" and "applet" can have multiple URLs.
+ * that it will be extracted. The "object" and "applet" tags can have
+ * multiple URLs.
*
@@ -124,7 +124,7 @@ * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. *
*- * The referrer data is always stored (was optional before). + * The referrer data is always stored. *
* *<a href="x.html" rel="nofollow" ...>
).
* To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
+ * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)} to
+ * true
.
*
*
* This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. + *
While extractor preserves hashtag characters (#) found + * in URLs and every characters after it, the default URL normalizer + * ({@link GenericUrlNormalizer}) will strip it by default. *
* *
@@ -171,7 +170,8 @@
* That information gets stored as metadata in the target document.
* If you want to limit the quantity of information extracted/stored,
* you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
+ * {@link HtmlLinkExtractorConfig#setIgnoreLinkData(boolean)} to
+ * true
.
*
http
, https
, and ftp
. You can
* specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
+ * {@link HtmlLinkExtractorConfig#setSchemes(List)}.
*
*
* URLs found in <!-- comments --> are no longer + *
URLs found in <!-- comments --> are not * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} + * {@link HtmlLinkExtractorConfig#setCommentsEnabled(boolean)} *
* *You can identify portions of a document where links * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. + * {@link HtmlLinkExtractorConfig#setExtractBetweens(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractBetweens(List)}. Eligible + * content for link extraction is identified first, and content to exclude is + * done on that subset. *
*You can further limit link extraction to specific * area by using * selector-syntax * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *
- * - * {@nx.xml.usage - *- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * {@link HtmlLinkExtractorConfig#setExtractSelectors(List)} and + * {@link HtmlLinkExtractorConfig#setNoExtractSelectors(List)}. *
*/ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -286,6 +221,7 @@ public class HtmlLinkExtractor private final HtmlLinkExtractorConfig configuration = new HtmlLinkExtractorConfig(); + // @formatter:off // NOTE: When this predicate is invoked the tag name is always lower case // and known to have been identified as a target tag name in configuration. // For each predicate, returning true won't try following predicates @@ -293,129 +229,69 @@ public class HtmlLinkExtractor @ToString.Exclude private final BiPredicate- * A memory efficient HTML link extractor. - *
- *- * This link extractor uses regular expressions to extract links. It does - * so on a chunk of text at a time, so that large files are not fully loaded - * into memory. If you prefer a more flexible implementation that loads the - * DOM model in memory to perform link extraction, consider using - * {@link DomLinkExtractor}. - *
- * - *- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * {@nx.include com.norconex.importer.handler.CommonRestrictions#htmlContentTypes} - *- * You can specify your own content types or other restrictions with - * {@link #setRestrictions(List)}. - * Make sure they represent a file with HTML-like markup tags containing URLs. - * For documents that are just - * too different, consider implementing your own {@link LinkExtractor} instead. - * Removing the default values and define no content types will have for effect - * to try to extract URLs from all files (usually a bad idea). - *
- * - *- * a.href, frame.src, iframe.src, img.src, meta.http-equiv - *- * You can specify your own set of tags and attributes to have - * different ones used for extracting URLs. For an elaborated set, you can - * combine the above with your own list or use any of the following - * suggestions (tag.attribute): - *
- * applet.archive, applet.codebase, area.href, audio.src, - * base.href, blockquote.cite, body.background, button.formaction, - * command.icon, del.cite, embed.src, form.action, - * frame.longdesc, head.profile, html.manifest, iframe.longdesc, - * img.longdesc, img.usemap, input.formaction, input.src, - * input.usemap, ins.cite, link.href, object.archive, - * object.classid, object.codebase, object.data, object.usemap, - * q.cite, script.src, source.src, video.poster, - * video.src - *- *
- * The meta.http-equiv
is treated differently. Only if the
- * "http-equiv" value is "refresh" and a "content" attribute with a URL exist
- * that it will be extracted. "object" and "applet" can have multiple URLs.
- *
- * It is possible to identify a tag only as the holder of - * a URL (without attributes). The tag body value will be used as the URL. - *
- * - *- * Some "referrer" information is derived from the each link and stored as - * metadata in the document they point to. - * These may vary for each link, but they are normally prefixed with - * {@link WebDocMetadata#REFERRER_LINK_PREFIX}. - *
- *- * The referrer data is always stored (was optional before). - *
- * - *This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *
- * - *
- * By default, a regular HTML link having the "rel" attribute set to "nofollow"
- * won't be extracted (e.g.
- * <a href="x.html" rel="nofollow" ...>
).
- * To force its extraction (and ensure it is followed) you can set
- * {@link #setIgnoreNofollow(boolean)} to true
.
- *
This extractor preserves hashtag characters (#) found - * in URLs and every characters after it. It relies on the implementation - * of {@link WebUrlNormalizer} to strip it if need be. - * {@link GenericUrlNormalizer} is now always invoked by default, and the - * default set of rules defined for it will remove fragments. - *
- * - *- * The URL specification says hashtags - * are used to represent fragments only. That is, to quickly jump to a specific - * section of the page the URL represents. Under normal circumstances, - * keeping the URL fragments usually leads to duplicates documents being fetched - * (same URL but different fragment) and they should be stripped. Unfortunately, - * there are sites not following the URL standard and using hashtags as a - * regular part of a URL (i.e. different hashtags point to different web pages). - * It may be essential when crawling these sites to keep the URL fragments. - * This can be done by making sure the URL normalizer does not strip them. - *
- * - *
- * By default, contextual information is kept about the HTML/XML mark-up
- * tag from which a link is extracted (e.g., tag name and attributes).
- * That information gets stored as metadata in the target document.
- * If you want to limit the quantity of information extracted/stored,
- * you can disable this feature by setting
- * {@link #ignoreLinkData} to true
.
- *
Only valid
- *
- * schemes are extracted for absolute URLs. By default, those are
- * http
, https
, and ftp
. You can
- * specify your own list of supported protocols with
- * {@link #setSchemes(String[])}.
- *
URLs found in <!-- comments --> are no longer - * extracted by default. To enable URL extraction from comments, use - * {@link #setCommentsEnabled(boolean)} - *
- * - *You can identify portions of a document where links - * should be extracted or ignored with - * {@link #setExtractBetweens(List)} and - * {@link #setNoExtractBetweens(List)}. Eligible content for link - * extraction is identified first, and content to exclude is done on that - * subset. - *
- *You can further limit link extraction to specific - * area by using - * selector-syntax - * to do so, with - * {@link #setExtractSelectors(List)} and - * {@link #setNoExtractSelectors(List)}. - *
- * - * {@nx.xml.usage - *- * The above example adds URLs to JavaScript files to the list of URLs to be - * extracted. + * Configuration for {@link HtmlLinkExtractor}. *
*/ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HtmlLinkExtractorConfig { @@ -266,8 +53,6 @@ public class HtmlLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#HTML_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.htmlContentTypes(); @@ -275,15 +60,11 @@ public class HtmlLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); /** * The maximum supported URL length. Longer URLs are ignored. - * @param maxURLLength maximum URL length - * @return maximum URL length */ private int maxURLLength = DEFAULT_MAX_URL_LENGTH; @@ -295,15 +76,11 @@ public class HtmlLinkExtractorConfig { * By default this link won't be crawled. * </a> * - * @param ignoreNofollow whether to ignore "nofollow" directives - * @returntrue
if ignoring "nofollow" directives
*/
private boolean ignoreNofollow;
/**
* Gets whether to ignore extra data associated with a link.
- * @param ignoreLinkData true
to ignore.
- * @return true
to ignore.
*/
private boolean ignoreLinkData;
@@ -311,8 +88,6 @@ public class HtmlLinkExtractorConfig {
* The character set to use for pages on which link extraction is performed.
* When null
(default), character set detection will be
* attempted.
- * @param charset character set to use, or null
- * @return character set to use, or null
*/
private Charset charset;
@@ -324,7 +99,6 @@ public class HtmlLinkExtractorConfig {
* <a href="https://yoursite.com/somepage.html">Some URL</a>
* -->
*
- * @return true
if links should be extracted from comments.
*/
private boolean commentsEnabled;
@@ -357,6 +131,7 @@ public List- * Web Crawler configuration. + * Web Crawler configuration, adding more options to the base + * {@link CrawlerConfig}. *
*@@ -90,7 +94,7 @@ * Scope: To limit crawling to specific web domains, and avoid creating * many filters to that effect, you can tell the crawler to "stay" within * the web site "scope" with - * {@link #setUrlCrawlScopeStrategy(GenericUrlScopeResolver)}. + * {@link #setUrlScopeResolver(UrlScopeResolver)}. *
* ** You can tell the crawler how it should handle HTTP GET and HEAD requests - * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport) and + * using using {@link #setDocumentFetchSupport(FetchDirectiveSupport)} and * {@link #setMetadataFetchSupport(FetchDirectiveSupport)} respectively. * For each, the options are: *
@@ -302,15 +306,18 @@ * Metadata filters: Applies filtering on a document metadata fields. * *
- * If {@link #isFetchHttpHead()} returns true
, these filters
- * will be invoked after the crawler performs a distinct HTTP HEAD request.
+ * If {@link #getMetadataFetchSupport()} value forces a distinct call
+ * for fetching metadata, these filters will be invoked after the crawler
+ * performs an HTTP HEAD request.
* It gives you the opportunity to filter documents based on the HTTP HEAD
* response to potentially save a more expensive HTTP GET request for
* download (but results in two HTTP requests for valid documents --
- * HEAD and GET). Filtering occurs before URLs are extracted.
+ * HEAD and GET). Filtering occurs before URLs are extracted (since
+ * no content is downloaded.
*
- * When {@link #isFetchHttpHead()} is false
, these filters
+ * When {@link #getMetadataFetchSupport()} does not invoke making a
+ * distinct call for metadata, these filters
* will be invoked on the metadata of the HTTP response
* obtained from an HTTP GET request (as the document is downloaded).
* Filtering occurs after URLs are extracted.
@@ -326,10 +333,11 @@
* Importer filters: The Importer module also offers document
* filtering options. At that point a document is already downloaded
* and its links extracted. There are two types of filtering offered
- * by the Importer: before and after document parsing. Use
- * filters before parsing if you need to filter on raw content or
- * want to prevent an expensive parsing. Use filters after parsing
- * when you need to read the content as plain text.
+ * by the Importer: before and after document parsing (assuming you
+ * configured at least one parser). Use filters before parsing if you
+ * need to filter on raw content or want to avoid parsing some documents.
+ * Use filters after parsing when you need to read the content
+ * as plain text.
*
*
*
@@ -362,7 +370,7 @@
* HTML "nofollow": Most HTML-oriented link extractors support
* the rel="nofollow"
attribute set on HTML links and offer
* a way to disable this instruction. E.g.,
- * {@link HtmlLinkExtractor#setIgnoreNofollow(boolean)}.
+ * {@link HtmlLinkExtractorConfig#setIgnoreNofollow(boolean)}.
*
*
null
via
- * {@link #setSitemapResolver(SitemapResolver_OLD) effectively disables
+ * {@link #setSitemapResolver(SitemapResolver)} effectively disables
* sitemap support altogether, and is thus incompatible with sitemaps
* specified as start references.
* null
via
- * {@link #setCanonicalLinkDetector(CanonicalLinkDetector) to disable
+ * {@link #setCanonicalLinkDetector(CanonicalLinkDetector)} to disable
* support canonical links (increasing the chance of getting duplicates).
*
*
- * EXPERIMENTAL:
* The crawler can attempt to detect and reject documents considered as
* duplicates within a crawler session. A document will be considered
* duplicate if there was already a document processed with the same
@@ -434,7 +441,7 @@
* {@link #setMetadataDeduplicate(boolean)} and/or
* {@link #setDocumentDeduplicate(boolean)} to true
. Setting
* those will have no effect if the corresponding checksummers are
- * null
.
+ * null
or checksums are otherwise not are being generated.
*
* Deduplication can impact crawl performance. It is recommended you @@ -461,99 +468,10 @@ * URLs in that field will become eligible for crawling. * See {@link #setPostImportLinks(TextMatcher)}. *
- * - * {@nx.xml.usage - *true
if keeping
- * @return true
if keeping
+ * Whether to keep the Importer-populated fields
+ * from {@link #getPostImportLinks()}. By default, those are deleted
+ * from a document when the URLs they contain are queued for processing
+ * or otherwise evaluated.
* @see #setPostImportLinks(TextMatcher)
*/
private boolean postImportLinksKeep;
@@ -620,8 +530,6 @@ public enum ReferencedLinkType {
* The provider of robots.txt rules for a site (if applicable).
* Defaults to {@link StandardRobotsTxtProvider}.
* Set to null
to disable.
- * @param robotsTxtProvider robots.txt provider
- * @return robots.txt provider
* @see #setIgnoreRobotsTxt(boolean)
*/
private RobotsTxtProvider robotsTxtProvider =
@@ -631,8 +539,6 @@ public enum ReferencedLinkType {
* The provider of robots metadata rules for a page (if applicable).
* Defaults to {@link StandardRobotsMetaProvider}.
* Set to null
to disable.
- * @param robotsMetaProvider robots metadata rules
- * @return robots metadata rules r
* @see #setIgnoreRobotsMeta(boolean)
*/
private RobotsMetaProvider robotsMetaProvider =
@@ -643,8 +549,6 @@ public enum ReferencedLinkType {
* Defaults to {@link GenericSitemapResolver}.
* Set to null
to disable all sitemap support, or
* see class documentation to disable sitemap detection only.
- * @param sitemapResolver sitemap resolver
- * @return sitemap resolver
* @see SitemapLocator
*/
private SitemapResolver sitemapResolver = new GenericSitemapResolver();
@@ -654,8 +558,6 @@ public enum ReferencedLinkType {
* Defaults to {@link GenericSitemapLocator}.
* Set to null
to disable locating sitemaps
* (relying on sitemaps defined as start reference, if any).
- * @param sitemapLocator sitemap locator
- * @return sitemap locator
* @see SitemapResolver
*/
private SitemapLocator sitemapLocator = new GenericSitemapLocator();
@@ -665,8 +567,6 @@ public enum ReferencedLinkType {
* crawled by a new crawl session. Usually amounts to checking if enough
* time has passed between two crawl sessions.
* Defaults to {@link GenericRecrawlableResolver}.
- * @param robotsMetaProvider recrawlable resolver
- * @return recrawlableResolver recrawlable resolver
*/
private RecrawlableResolver recrawlableResolver =
new GenericRecrawlableResolver();
@@ -690,6 +590,7 @@ public List- * You can specify your own restrictions using {@link #setRestrictions(List)}, + * You can specify your own restrictions using + * {@link RegexLinkExtractorConfig#getRestrictions()}, * but make sure they represent text files. *
* @@ -74,45 +74,11 @@ * detect the encoding of the a page when extracting links and * referrer information. If no charset could be detected, it falls back to * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - * - * - * {@nx.xml.usage - *- * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. + * {@link RegexLinkExtractorConfig#setCharset(java.nio.charset.Charset)}. *
* * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class RegexLinkExtractor @@ -144,10 +110,8 @@ public Set extractLinks(CrawlDoc doc) throws IOException { doc.getMetadata() .matchKeys(configuration.getFieldMatcher()) .valueList() - .forEach( - val -> extractLinks( - links, val, - doc.getReference())); + .forEach(val -> extractLinks( + links, val, doc.getReference())); } else { // Body var sb = new StringBuilder(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java index 193618830..ed0a3534b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorConfig.java @@ -22,7 +22,7 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; +import com.norconex.importer.handler.CommonMatchers; import lombok.AllArgsConstructor; import lombok.Data; @@ -31,81 +31,10 @@ /** *- * Link extractor using regular expressions to extract links found in text - * documents. Relative links are resolved to the document URL. - * For HTML documents, it is best advised to use the - * {@link HtmlLinkExtractor} or {@link DomLinkExtractor}, - * which addresses many cases specific to HTML. + * Configuration for {@link RegexLinkExtractor}. *
- * - *- * By default, this extractor will extract URLs only in documents having - * their content type matching this regular expression: - *
- *- * text/.* - *- *
- * You can specify your own restrictions using {@link #setRestrictions(List)}, - * but make sure they represent text files. - *
- * - *- * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *
- *This extractor will by default attempt to - * detect the encoding of the a page when extracting links and - * referrer information. If no charset could be detected, it falls back to - * UTF-8. It is also possible to dictate which encoding to use with - * {@link #setCharset(String)}. - *
- * - * {@nx.xml.usage - *- * The above example extracts page "ids" contained in square brackets and - * add them to a custom URL. - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class RegexLinkExtractorConfig { @@ -127,19 +56,22 @@ public static class ExtractionPattern { /** * The maximum supported URL length. * Default is {@value #DEFAULT_MAX_URL_LENGTH}. - * @param maxUrlLength maximum URL length - * @return maximum URL length */ private int maxUrlLength = DEFAULT_MAX_URL_LENGTH; /** * Gets the character set of pages on which link extraction is performed. * Default isnull
(charset detection will be attempted).
- * @param charset character set to use, or null
- * @return character set to use, or null
*/
private Charset charset;
+ /**
+ * The matcher of content types to apply link extraction on. No attempt to
+ * extract links from any other content types will be made. Default
+ * matches all content types
+ */
+ private final TextMatcher contentTypeMatcher = CommonMatchers.all();
+
private final List* The configuration of content-types, storing the referrer data, and ignoring @@ -57,16 +59,8 @@ * pre-defined set of link attributes, when available (title, type, * uri, text, rel). *
- * - * {@nx.xml.usage - *- * Implementation of {@link LinkExtractor} using - * Apache Tika to perform URL - * extractions from HTML documents. - * This is an alternative to the {@link HtmlLinkExtractor}. + * Configuration for {@link TikaLinkExtractor}. *
- *- * The configuration of content-types, storing the referrer data, and ignoring - * "nofollow" and ignoring link data are the same as in - * {@link HtmlLinkExtractor}. For link data, this parser only keeps a - * pre-defined set of link attributes, when available (title, type, - * uri, text, rel). - *
- * - * {@nx.xml.usage - *true
to ignore.
- * @return true
to ignore.
* @since 3.0.0
*/
private boolean ignoreLinkData;
@@ -62,17 +41,15 @@ public class TikaLinkExtractorConfig {
* The matcher of content types to apply link extraction on. No attempt to
* extract links from any other content types will be made. Default is
* {@link CommonMatchers#HTML_CONTENT_TYPES}.
- * @param contentTypeMatcher content type matcher
- * @return content type matcher
*/
private final TextMatcher contentTypeMatcher =
CommonMatchers.htmlContentTypes();
+ private final PropertyMatchers restrictions = new PropertyMatchers();
+
/**
* Matcher of one or more fields to use as the source of content to
* extract links from, instead of the document content.
- * @param fieldMatcher field matcher
- * @return field matcher
*/
private final TextMatcher fieldMatcher = new TextMatcher();
@@ -85,11 +62,26 @@ public TikaLinkExtractorConfig setFieldMatcher(TextMatcher fieldMatcher) {
* The matcher of content types to apply link extraction on. No attempt to
* extract links from any other content types will be made. Default is
* {@link CommonMatchers#HTML_CONTENT_TYPES}.
- * @param contentTypeMatcher content type matcher
+ * @param matcher content type matcher
* @return this
*/
public TikaLinkExtractorConfig setContentTypeMatcher(TextMatcher matcher) {
contentTypeMatcher.copyFrom(matcher);
return this;
}
+
+ /**
+ * Clears all restrictions.
+ */
+ public void clearRestrictions() {
+ restrictions.clear();
+ }
+
+ /**
+ * Gets all restrictions
+ * @return the restrictions
+ */
+ public PropertyMatchers getRestrictions() {
+ return restrictions;
+ }
}
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
index 88bf838b1..c9270e521 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractor.java
@@ -18,6 +18,7 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -68,26 +69,8 @@
* {@link WebDocMetadata#REFERRER_REFERENCE}.- * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @EqualsAndHashCode @ToString public class XmlFeedLinkExtractor @@ -106,6 +89,12 @@ public Set extractLinks(CrawlDoc doc) throws IOException { return Set.of(); } + if (!getConfiguration().getRestrictions().isEmpty() + && !getConfiguration().getRestrictions().matches( + doc.getMetadata())) { + return Collections.emptySet(); + } + var refererUrl = doc.getReference(); Set links = new HashSet<>(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java index 794469625..da5047974 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/XmlFeedLinkExtractorConfig.java @@ -14,9 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.link.impl; +import com.norconex.commons.lang.map.PropertyMatchers; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.crawler.web.doc.WebDocMetadata; -import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.importer.handler.CommonMatchers; import lombok.Data; @@ -24,53 +23,10 @@ /** *- * Link extractor for extracting links out of - * RSS and - * Atom XML feeds. - * It extracts the content of <link> tags. If you need more complex - * extraction, consider using {@link RegexLinkExtractor} or creating your own - * {@link LinkExtractor} implementation. + * Configuration for {@link XmlFeedLinkExtractor}. *
- * - *- * By default, this extractor only will be applied on documents matching - * one of these content types: - *
- * - * {@nx.include com.norconex.importer.handler.CommonMatchers#xmlFeedContentTypes} - * - *- * The following referrer information is stored as metadata in each document - * represented by the extracted URLs: - *
- *- * The above example specifies this extractor should only apply on documents - * that have their URL ending with "rss" (in addition to the default - * content types supported). - *
- * * @since 2.7.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class XmlFeedLinkExtractorConfig { @@ -78,8 +34,6 @@ public class XmlFeedLinkExtractorConfig { * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher - * @return content type matcher */ private final TextMatcher contentTypeMatcher = CommonMatchers.xmlFeedContentTypes(); @@ -87,11 +41,11 @@ public class XmlFeedLinkExtractorConfig { /** * Matcher of one or more fields to use as the source of content to * extract links from, instead of the document content. - * @param fieldMatcher field matcher - * @return field matcher */ private final TextMatcher fieldMatcher = new TextMatcher(); + private final PropertyMatchers restrictions = new PropertyMatchers(); + public XmlFeedLinkExtractorConfig setFieldMatcher( TextMatcher fieldMatcher) { this.fieldMatcher.copyFrom(fieldMatcher); @@ -102,7 +56,7 @@ public XmlFeedLinkExtractorConfig setFieldMatcher( * The matcher of content types to apply link extraction on. No attempt to * extract links from any other content types will be made. Default is * {@link CommonMatchers#XML_FEED_CONTENT_TYPES}. - * @param contentTypeMatcher content type matcher + * @param matcher content type matcher * @return this */ public XmlFeedLinkExtractorConfig setContentTypeMatcher( @@ -110,4 +64,19 @@ public XmlFeedLinkExtractorConfig setContentTypeMatcher( contentTypeMatcher.copyFrom(matcher); return this; } + + /** + * Clears all restrictions. + */ + public void clearRestrictions() { + restrictions.clear(); + } + + /** + * Gets all restrictions + * @return the restrictions + */ + public PropertyMatchers getRestrictions() { + return restrictions; + } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java index 02676a1e7..52bb5eac0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolver.java @@ -14,6 +14,8 @@ */ package com.norconex.crawler.web.doc.operations.recrawl.impl; +import static java.util.Optional.ofNullable; + import java.time.Duration; import java.time.ZonedDateTime; import java.time.temporal.ChronoField; @@ -29,6 +31,7 @@ import com.norconex.crawler.web.doc.WebCrawlDocContext; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency; +import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo; import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport; import com.norconex.crawler.web.sitemap.SitemapChangeFrequency; @@ -51,67 +54,21 @@ ** By default, existing sitemap directives take precedence over custom ones. * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} + * sitemap directives using the + * {@link GenericRecrawlableResolverConfig#setSitemapSupport(SitemapSupport)} * method. *
* ** You can chose to have some of your crawled documents be re-crawled less - * frequently than others by specifying custom minimum frequencies - * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are - * processed in the order specified and must each have to following: - *
- *- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *
- * - * {@nx.xml.usage - *- * The above example ensures PDFs are re-crawled no more frequently than - * once a month, while HTML news can be re-crawled as fast at every half hour. - * For the rest, it relies on the website sitemap directives (if any). + * frequently than others by specifying custom minimum frequencies with + * ({@link GenericRecrawlableResolverConfig#setMinFrequencies(Collection)}). + * Minimum frequencies are processed in the order specified. *
* * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode @ToString @@ -157,15 +114,11 @@ public boolean isRecrawlable(WebCrawlDocContext prevData) { private MinFrequency getMatchingMinFrequency(WebCrawlDocContext prevData) { for (MinFrequency f : configuration.getMinFrequencies()) { - var applyTo = f.getApplyTo(); - if (StringUtils.isBlank(applyTo)) { - applyTo = "reference"; - } - if (("reference".equalsIgnoreCase(applyTo) - && f.getMatcher().matches(prevData.getReference()) - || ("contentType".equalsIgnoreCase(applyTo) - && f.getMatcher().matches( - prevData.getContentType().toString())))) { + var applyTo = ofNullable(f.getApplyTo()).orElse(ApplyTo.REFERENCE); + var matchMe = applyTo == ApplyTo.REFERENCE + ? prevData.getReference() + : prevData.getContentType().toString(); + if (f.getMatcher().matches(matchMe)) { return f; } } @@ -243,17 +196,16 @@ private boolean isRecrawlableFromSitemap(WebCrawlDocContext prevData) { lastModified, prevData.getReference()); if (lastModified.isAfter(lastCrawled)) { if (LOG.isDebugEnabled()) { - LOG.debug( - "Recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug(""" + Recrawlable according to sitemap directive \ + (last modified '{}' > last crawled '{}'): {}""", lastModified, lastCrawled, prevData.getReference()); } return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - "Not recrawlable according to sitemap directive " - + "(last modified '{}' > last crawled '{}'): {}", + LOG.debug("Not recrawlable according to sitemap directive " + + "(last modified '{}' > last crawled '{}'): {}", lastModified, lastCrawled, prevData.getReference()); } return false; @@ -275,8 +227,7 @@ private boolean isRecrawlableFromFrequency( } if (LOG.isDebugEnabled()) { - LOG.debug( - "The {} change frequency is {} for: {}", + LOG.debug("The {} change frequency is {} for: {}", context, cf, prevData.getReference()); } if (cf == SitemapChangeFrequency.ALWAYS) { @@ -324,16 +275,15 @@ private boolean isRecrawlableFromFrequency( return true; } if (LOG.isDebugEnabled()) { - LOG.debug( - String.format(""" - Not recrawlable according to {} directive\s\ - (required elapsed time '{}'\s\ - >= actual elapsed time '{}' since '{}'): {}""", - context, - formatDuration(lastCrawlDate, minCrawlDate), - formatDuration(lastCrawlDate, now), - lastCrawlDate, - prevData.getReference())); + LOG.debug(String.format(""" + Not recrawlable according to {} directive\s\ + (required elapsed time '{}'\s\ + >= actual elapsed time '{}' since '{}'): {}""", + context, + formatDuration(lastCrawlDate, minCrawlDate), + formatDuration(lastCrawlDate, now), + lastCrawlDate, + prevData.getReference())); } return false; } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java index 8a3b212b3..5a7a2d81f 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverConfig.java @@ -23,87 +23,17 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.text.TextMatcher; -import com.norconex.commons.lang.time.DurationParser; import lombok.Data; import lombok.NoArgsConstructor; import lombok.experimental.Accessors; /** - *Relies on both sitemap 
directives and custom instructions for - * establishing the minimum frequency between each document recrawl. - *
- * - *- * Provided crawler support for sitemaps has not been disabled, - * this class tries to honor last modified and frequency directives found - * in sitemap files. - *
- *- * By default, existing sitemap directives take precedence over custom ones. - * You chose to have sitemap directives be considered last or even disable - * sitemap directives using the {@link #setSitemapSupport(SitemapSupport)} - * method. - *
- * - *- * You can chose to have some of your crawled documents be re-crawled less - * frequently than others by specifying custom minimum frequencies - * ({@link #setMinFrequencies(Collection)}). Minimum frequencies are - * processed in the order specified and must each have to following: - *
- *- * As of 2.7.0, XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). + * Configuration for {@link GenericRecrawlableResolver}. *
- * - * {@nx.xml.usage - *- * The above example ensures PDFs are re-crawled no more frequently than - * once a month, while HTML news can be re-crawled as fast at every half hour. - * For the rest, it relies on the website sitemap directives (if any). - *
- * * @since 2.5.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericRecrawlableResolverConfig { @@ -127,8 +57,6 @@ public static SitemapSupport getSitemapSupport(String sitemapSupport) { /** * The sitemap support strategy. Anull
value
* is equivalent to specifying the default {@link SitemapSupport#FIRST}.
- * @param sitemapSupport sitemap support strategy
- * @return sitemap support strategy
*/
private SitemapSupport sitemapSupport = SitemapSupport.FIRST;
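For illustration only (not part of this patch), a minimal sketch of how the resolver above might be wired. It assumes a no-arg constructor, a getConfiguration() accessor following the Configurable pattern used elsewhere in this diff, and the Lombok chained setters implied by the config class declaration:

    import com.norconex.crawler.web.doc.WebCrawlDocContext;
    import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolver;
    import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport;

    class RecrawlSketch {
        // Decides whether a previously crawled page is due for a re-crawl,
        // letting sitemap directives take precedence (the default strategy).
        static boolean dueForRecrawl(WebCrawlDocContext previouslyCrawled) {
            var resolver = new GenericRecrawlableResolver();
            resolver.getConfiguration().setSitemapSupport(SitemapSupport.FIRST);
            return resolver.isRecrawlable(previouslyCrawled);
        }
    }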
@@ -154,11 +82,30 @@ public void setMinFrequencies(CollectionBy default a crawler will try to follow all links it discovers. You can + *
+ * By default a crawler will try to follow all links it discovers. You can * define your own filters to limit the scope of the pages being crawled. * When you have multiple URLs defined as start URLs, it can be tricky to * perform global filtering that apply to each URLs without causing @@ -47,7 +48,6 @@ *
* @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @EqualsAndHashCode @ToString @Slf4j diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java index 746855946..bd2ec7ef2 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/scope/impl/GenericUrlScopeResolverConfig.java @@ -18,24 +18,13 @@ import lombok.experimental.Accessors; /** - *By default a crawler will try to follow all links it discovers. You can - * define your own filters to limit the scope of the pages being crawled. - * When you have multiple URLs defined as start URLs, it can be tricky to - * perform global filtering that apply to each URLs without causing - * URL filtering conflicts. This class offers an easy way to address - * a frequent URL filtering need: to "stay on target". That is, - * when following a page and extracting URLs found in it, make sure to - * only keep URLs that are on the same site as the page URL we are on. - *
*- * By default this class does not request to stay on a site. + * Configuration for {@link GenericUrlScopeResolver}. *
* @since 2.3.0 */ -//TODO make this an interface so developers can provide their own? @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class GenericUrlScopeResolverConfig { /** @@ -43,16 +32,12 @@ public class GenericUrlScopeResolverConfig { * the domain for each URL specified as a start URL. By default (false) * the crawler will try follow any discovered links not otherwise rejected * by other settings (like regular filtering rules you may have). - * @param stayOnDomaintrue
for the crawler to stay on domain
- * @return true
if the crawler should stay on a domain
*/
private boolean stayOnDomain;
/**
* Whether sub-domains are considered to be the same as a URL domain.
* Only applicable when "stayOnDomain" is true
.
- * @param includeSubdomains true
to include sub-domains
- * @return true
if including sub-domains
* @since 2.9.0
*/
private boolean includeSubdomains;
@@ -62,8 +47,6 @@ public class GenericUrlScopeResolverConfig {
* the port for each URL specified as a start URL. By default (false)
* the crawler will try follow any discovered links not otherwise rejected
* by other settings (like regular filtering rules you may have).
- * @param stayOnPort true
for the crawler to stay on port
- * @return true
if the crawler should stay on a port
*/
private boolean stayOnPort;
@@ -72,9 +55,6 @@ public class GenericUrlScopeResolverConfig {
* the protocol for each URL specified as a start URL. By default (false)
* the crawler will try follow any discovered links not otherwise rejected
* by other settings (like regular filtering rules you may have).
- * @param stayOnProtocol
- * true
for the crawler to stay on protocol
- * @return true
if the crawler should stay on protocol
*/
private boolean stayOnProtocol = false;
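For illustration only (not part of this patch), a "stay on site" sketch using the Lombok-generated chained setters implied by the four fields above:

    import com.norconex.crawler.web.doc.operations.scope.impl.GenericUrlScopeResolverConfig;

    class StayOnSiteSketch {
        // Follow only links on the same domain (sub-domains included) as the
        // start URL, while still allowing any port or protocol.
        static GenericUrlScopeResolverConfig stayOnDomain() {
            return new GenericUrlScopeResolverConfig()
                    .setStayOnDomain(true)
                    .setIncludeSubdomains(true)
                    .setStayOnPort(false)
                    .setStayOnProtocol(false);
        }
    }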
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
index bba4f4884..5e650b019 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizer.java
@@ -98,51 +98,6 @@
* In addition, this class allows you to specify any number of URL
* value replacements using regular expressions.
*
- *
- * {@nx.xml.usage
- * - * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *
- * - * {@nx.xml.example - *- * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". - *
*/ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java index a990bc186..a5a1656e7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/url/impl/GenericUrlNormalizerConfig.java @@ -24,8 +24,6 @@ import com.norconex.commons.lang.collection.CollectionUtil; import com.norconex.commons.lang.convert.GenericConverter; import com.norconex.commons.lang.url.UrlNormalizer; -import com.norconex.crawler.web.WebCrawlerConfig; -import com.norconex.crawler.web.doc.operations.url.WebUrlNormalizer; import lombok.Data; import lombok.Getter; @@ -33,117 +31,7 @@ /** *- * Generic implementation of {@link WebUrlNormalizer} that should satisfy - * most URL normalization needs. This implementation relies on - * {@link UrlNormalizer}. Please refer to it for complete documentation and - * examples. - *
- *
- * This class is in effect by default. To skip its usage, you
- * can explicitly set the URL Normalizer to null
in the
- * {@link WebCrawlerConfig}.
- *
- * By default, this class removes the URL fragment and applies these - * RFC 3986 - * normalizations: - *
- *- * To overwrite this default, you have to specify a new list of normalizations - * to apply, via the {@link #setNormalizations(List)} method, - * or via XML configuration. Each - * normalizations is identified by a code name. The following is the - * complete code name list for supported normalizations. Click on any code - * name to get a full description from {@link WebUrlNormalizer}: - *
- *- * In addition, this class allows you to specify any number of URL - * value replacements using regular expressions. - *
- * - * {@nx.xml.usage - *- * Since 2.7.2, having an empty "normalizations" tag will effectively remove - * any normalizations rules previously set (like default ones). - * Not having the tag - * at all will keep existing/default normalizations. - *
- * - * {@nx.xml.example - *- * The following adds a normalization to add "www." to URL domains when - * missing, to the default set of normalizations. It also add custom - * URL "search-and-replace" to remove any "&view=print" strings from URLs - * as well as replace "&type=summary" with "&type=full". + * Configuration for {@link GenericUrlNormalizer}. *
*/ @Data diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java index 5fdb6d37d..591a9c438 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListener.java @@ -100,31 +100,8 @@ * using a custom link extractor. * * - * {@nx.xml.usage - *- * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *
- * * @since 2.2.0 */ - @EqualsAndHashCode @ToString @Slf4j @@ -223,8 +200,8 @@ private void resolveStatusCodeRange( var end = toInt(endPoints[1]); if (start >= end) { throw new IllegalArgumentException( - "Invalid statusCode range: " + range - + ". Start value must be higher than end value."); + "Invalid statusCode range: %s. Start value must be " + + "higher than end value.".formatted(range)); } while (start <= end) { parsedCodes.add(start); @@ -274,9 +251,8 @@ private int toInt(String num) { return Integer.parseInt(num.trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( - "The statusCodes attribute " - + "can only contain valid numbers. This number is invalid: " - + num); + "The statusCodes attribute can only contain valid numbers. " + + "This number is invalid: " + num); } } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java index af5c0eaee..9b573f7d7 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/event/listeners/UrlStatusCrawlerEventListenerConfig.java @@ -15,106 +15,18 @@ package com.norconex.crawler.web.event.listeners; import java.nio.file.Path; -import java.util.List; - -import com.norconex.crawler.web.doc.operations.link.impl.HtmlLinkExtractor; -import com.norconex.crawler.web.doc.operations.link.impl.TikaLinkExtractor; import lombok.Data; import lombok.experimental.Accessors; /** *- * Store on file all URLs that were "fetched", along with their HTTP response - * code. Useful for reporting purposes (e.g. finding broken links). A short - * summary of all HTTP status codes can be found - * here. - *
- * - *- * By default, the status of all fetched URLs are stored by this listener, - * regardless what were those statuses. This can generate very lengthy reports - * on large crawls. If you are only interested in certain status codes, you can - * listen only for those using the {@link #setStatusCodes(String)} method - * or XML configuration equivalent. You specify the codes you want to listen - * for as coma-separated values. Ranges are also supported: specify two range - * values (both inclusive) separated by an hyphen. For instance, if you want - * to store all "bad" URLs, you can quickly specify all codes except - * 200 (OK) this way: - *
- *100-199,201-599- * - *
- * By default one generated report is created for each crawler, stored
- * in crawler-specific directories under the collector working directory.
- * The collector working directory can be overwritten using
- * {@link #setOutputDir(Path)}.
- * If {@link #isCombined()} is true
, status from all crawlers
- * defined will be written to a unique file in the collector working directory.
- *
- * By default, the file generated will use this naming pattern: - *
- *- * urlstatuses-[timestamp].csv - *- *
- * The filename prefix can be changed from "urlstatuses-" to anything else - * using {@link #setFileNamePrefix(String)}. + * Configuration for {@link UrlStatusCrawlerEventListener}. *
- * - *- * By default all crawlers will have their URL fetch statuses recorded when - * using this event listener. To only do so for some crawlers, you can - * use {@link #setCrawlerIds(List)} to identify them. - *
- * - *- * To capture the referring pages you have to use a link extractor that - * extracts referrer information. The default link extractor - * {@link HtmlLinkExtractor} properly extracts this information. Same with - * {@link TikaLinkExtractor}. This is only a consideration when - * using a custom link extractor. - *
- * - * {@nx.xml.usage - *- * The above example will generate a broken links report by recording - * 404 status codes (from HTTP response). - *
- * * @since 2.2.0 */ @Data @Accessors(chain = true) -@SuppressWarnings("javadoc") public class UrlStatusCrawlerEventListenerConfig { public static final String DEFAULT_FILENAME_PREFIX = "urlstatuses-"; @@ -123,32 +35,24 @@ public class UrlStatusCrawlerEventListenerConfig { * The coma-separated list of status codes to listen to. * Default isnull
(listens for all status codes).
* See class documentation for how to specify code ranges.
- * @param statusCode HTTP status codes
- * @return status codes
*/
private String statusCodes;
/**
* The local directory where this listener report will be written.
* Default uses the collector working directory.
- * @param outputDir directory path
- * @return directory path
*/
private Path outputDir;
/**
* The generated report file name prefix. See class documentation
* for default prefix.
- * @param fileNamePrefix file name prefix
- * @return file name prefix
*/
private String fileNamePrefix = DEFAULT_FILENAME_PREFIX;
/**
* Whether to add a timestamp to the file name, to ensure
* a new one is created with each run.
- * @param timestamped true
if timestamped
- * @return true
if timestamped
*/
private boolean timestamped;
}
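For illustration only (not part of this patch), a broken-link report sketch based on the fields above. Accessor names are assumed from the Lombok declarations; the "404" value and the range syntax (e.g., "100-199,201-599") come from the class documentation, while the file name prefix is arbitrary:

    import java.nio.file.Path;

    import com.norconex.crawler.web.event.listeners.UrlStatusCrawlerEventListenerConfig;

    class BrokenLinkReportSketch {
        // Record only 404 responses, writing a timestamped CSV report
        // under the given directory.
        static UrlStatusCrawlerEventListenerConfig brokenLinks(Path reportDir) {
            return new UrlStatusCrawlerEventListenerConfig()
                    .setStatusCodes("404")
                    .setOutputDir(reportDir)
                    .setFileNamePrefix("brokenlinks-")
                    .setTimestamped(true);
        }
    }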
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
index 4e68c96cc..6ce57d161 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/HttpFetcher.java
@@ -22,63 +22,5 @@
*/
public interface HttpFetcher
extends Fetcher- // * Performs an HTTP request for the supplied document reference - // * and HTTP method. - // *
- // *- // * For each HTTP method supported, implementors should - // * do their best to populate the document and its {@link CrawlDocRecord} - // * with as much information they can. - // *
- // *
- // * Unsupported HTTP methods should return an HTTP response with the
- // * {@link CrawlDocState#UNSUPPORTED} state. To prevent users having to
- // * configure multiple HTTP clients, implementors should try to support
- // * both the GET
and HEAD
methods.
- // * POST is only used in special cases and is often not used during a
- // * crawl session.
- // *
- // * A null
method is treated as a GET
.
- // *
- * XML configuration entries expecting millisecond durations - * can be provided in human-readable format (English only), as per - * {@link DurationParser} (e.g., "5 minutes and 30 seconds" or "5m30s"). - *
- * ** Upon first encountering a secure site, this fetcher will check whether the @@ -163,10 +156,9 @@ *
** If you want to convert non-secure URLs secure ones regardless of website - * HSTS support, use - * {@link GenericUrlNormalizer.Normalization#secureScheme} instead. + * HSTS support, use {@link Normalization#SECURE_SCHEME} instead. * To disable HSTS support, use - * {@link GenericHttpFetcherConfig#setDisableHSTS(boolean)}. + * {@link GenericHttpFetcherConfig#setHstsDisabled(boolean)}. *
* ** These settings have no effect for web servers not supporting them. *
* - * {@nx.xml.usage - *- * The above example will authenticate the crawler to a web site before - * crawling. The website uses an HTML form with a username and password - * fields called "loginUser" and "loginPwd". - *
- * * @since 3.0.0 (Merged from GenericDocumentFetcher and * GenericHttpClientFactory) */ -@SuppressWarnings("javadoc") @Slf4j @EqualsAndHashCode(onlyExplicitlyIncluded = true) @ToString(onlyExplicitlyIncluded = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java index d3e910930..abd670f46 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/GenericHttpFetcherConfig.java @@ -38,10 +38,7 @@ /** * Generic HTTP Fetcher configuration. - * @since 3.0.0 (adapted from GenericHttpClientFactory and - * GenericDocumentFetcher from version 2.x) */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class GenericHttpFetcherConfig extends BaseFetcherConfig { @@ -61,23 +58,26 @@ public enum CookieSpec { RELAXED, STRICT, IGNORE } + /** + * HTTP status codes considered "valid". Defaults to 200. + */ private final Listnull
+ * Optional prefix prepended to captured HTTP response fields. A
+ * null
value (default) won't add any prefix.
*/
private String headersPrefix;
/**
* Whether content type is detected instead of relying on
* returned Content-Type
HTTP response header.
- * @param forceContentTypeDetection true
to enable detection
- * @return true
to enable detection
*/
private boolean forceContentTypeDetection;
@@ -85,33 +85,28 @@ public enum CookieSpec {
* Whether character encoding is detected instead of relying on
* the charset sometimes found in the Content-Type
HTTP
* response header.
- * @param forceCharsetDetection true
to enable detection
- * @return true
to enable detection
*/
private boolean forceCharsetDetection;
/**
* Authentication configuration for sites requiring it. Default
* is null
.
- * @param authentication authentication configuration
- * @return authentication configuration
*/
private HttpAuthConfig authentication;
/**
* Cookie specification to use when fetching documents. Default is relaxed.
- * @param cookieSpec cookie specification name
- * @return the cookieSpec cookie specification name
*/
private CookieSpec cookieSpec = CookieSpec.RELAXED;
+ /**
+ * An optional HTTP proxy.
+ */
private final ProxySettings proxySettings = new ProxySettings();
/**
* The connection timeout for a connection to be established.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param connectionTimeout connection timeout
- * @return connection timeout
*/
private Duration connectionTimeout = DEFAULT_TIMEOUT;
@@ -119,32 +114,24 @@ public enum CookieSpec {
* Gets the maximum period of inactivity between two consecutive data
* packets.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param socketTimeout socket timeout
- * @return socket timeout
*/
private Duration socketTimeout = DEFAULT_TIMEOUT;
/**
* Gets the timeout when requesting a connection.
* Default is {@link #DEFAULT_TIMEOUT}.
- * @param connectionRequestTimeout connection request timeout
- * @return connection request timeout
*/
private Duration connectionRequestTimeout = DEFAULT_TIMEOUT;
/**
* The local address, which may be useful when working with multiple
* network interfaces.
- * @param localAddress locale address
- * @return local address
*/
private String localAddress;
/**
* Whether 'Expect: 100-continue' handshake is enabled.
* See {@link RequestConfig#isExpectContinueEnabled()}
- * @param expectContinueEnabled true
if enabled
- * @return true
if enabled
*/
private boolean expectContinueEnabled;
@@ -152,8 +139,6 @@ public enum CookieSpec {
* The maximum number of redirects to be followed. This can help
* prevent infinite loops. A value of zero effectively disables
* redirects. Default is {@link #DEFAULT_MAX_REDIRECT}.
- * @param maxRedirects maximum number of redirects to be followed
- * @return maximum number of redirects to be followed
*/
private int maxRedirects = DEFAULT_MAX_REDIRECT;
@@ -161,16 +146,12 @@ public enum CookieSpec {
* The maximum number of connections that can be created. Typically,
* you would have at least the same amount as threads.
* Default is {@link #DEFAULT_MAX_CONNECTIONS}.
- * @param maxConnections maximum number of connections
- * @return number of connections
*/
private int maxConnections = DEFAULT_MAX_CONNECTIONS;
/**
* The maximum number of connections that can be used per route.
* Default is {@link #DEFAULT_MAX_CONNECTIONS_PER_ROUTE}.
- * @param maxConnectionsPerRoute maximum number of connections per route
- * @return number of connections per route
*/
private int maxConnectionsPerRoute = DEFAULT_MAX_CONNECTIONS_PER_ROUTE;
@@ -178,9 +159,6 @@ public enum CookieSpec {
* Sets the period of time after which to evict idle
* connections from the connection pool.
* Default is {@link #DEFAULT_MAX_IDLE_TIME}.
- * @param maxConnectionIdleTime amount of time after which to evict idle
- * connections
- * @return amount of time after which to evict idle connections
*/
private Duration maxConnectionIdleTime = DEFAULT_MAX_IDLE_TIME;
@@ -188,11 +166,12 @@ public enum CookieSpec {
* Sets the period of time a connection must be inactive
* to be checked in case it became stalled. Default is 0 (not pro-actively
* checked).
- * @param maxConnectionInactiveTime period of time in milliseconds
- * @return period of time in milliseconds
*/
private Duration maxConnectionInactiveTime;
+ /**
+ * Headers to send with every HTTP request.
+ */
private final Maptrue
if disabled
- * @return true
if disabled
*/
private boolean ifModifiedSinceDisabled;
@@ -211,28 +188,25 @@ public enum CookieSpec {
* Servers supporting this header will only return the requested document
* if the ETag value has changed, indicating a more recent version is
* available.
- * @param eTagDisabled true
if disabled
- * @return true
if disabled
*/
private boolean eTagDisabled;
/**
* The user-agent used when identifying the crawler to targeted web sites.
* It is highly recommended to always identify yourself.
- * @param userAgent user agent
- * @return user agent
*/
private String userAgent;
/**
* The redirect URL provider.
* Defaults to {@link GenericRedirectUrlProvider}.
- * @param redirectUrlProvider redirect URL provider
- * @return the redirect URL provider
*/
private RedirectUrlProvider redirectUrlProvider =
new GenericRedirectUrlProvider();
+ /**
+ * List of supported HTTP methods.
+ */
private final Listtrue
if trusting all SSL
- * certificates
- * @return true
if trusting all SSL certificates
*/
private boolean trustAllSSLCertificates;
/**
* Sets whether Server Name Indication (SNI) is disabled.
- * @param sniDisabled true
if disabled
- * @return true
if disabled
*/
private boolean sniDisabled;
+ /**
+ * Supported security protocols.
+ */
private final ListStrict-Transport-Security
policy
* (obtained from HTTP response header).
- * @param hstsDisabled true
if disabled
- * @return true
if disabled
*/
private boolean hstsDisabled;
@@ -280,6 +250,7 @@ public List* Generic HTTP Fetcher authentication configuration. *
- * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "authentication"). - *
* @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) @FieldNameConstants public class HttpAuthConfig { /** - *- * The authentication method. Valid values are (case insensitive): - *
- *null
(default value) indicates "any host" for the
* scope.
* Used for BASIC and DIGEST authentication.
- * @param host host for the scope
- * @return host for the scope
*/
private Host host;
@@ -151,51 +85,40 @@ public class HttpAuthConfig {
* The realm name for the current authentication scope.
* null
(default) indicates "any realm" for the scope.
* Used for BASIC and DIGEST authentication.
- * @param realm reaml name for the scope
- * @return realm name for the scope
*/
private String realm;
- //form
/**
* The authentication form character set for the form field values.
* Default is UTF-8.
- * @param formCharset authentication form character set
- * @return authentication form character set
*/
private Charset formCharset = StandardCharsets.UTF_8;
/**
- * The CSS selelector that identifies the form in a login page.
+ * The CSS selector that identifies the form in a login page.
* When set, requires {@link #getUrl()} to be pointing to a login
* page containing a login form.
- * @param formSelector form selector
- * @return form selector
*/
private String formSelector;
+ /**
+ * Additional form parameters possibly expected by the login form.
+ */
private final Maptrue
to perform preemptive authentication
- * @return true
to perform preemptive authentication
*/
private boolean preemptive;
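For illustration only (not part of this patch), a form-authentication sketch limited to the fields visible above. The import location, the CSS selector, and the extra form parameter are hypothetical; setters are assumed Lombok-generated and chained:

    import java.nio.charset.StandardCharsets;

    import com.norconex.crawler.web.fetch.impl.HttpAuthConfig;

    class FormAuthSketch {
        // Point the authenticator at the login form and add one extra
        // form parameter some sites expect.
        static HttpAuthConfig loginForm() {
            var auth = new HttpAuthConfig()
                    .setFormSelector("form#login") // hypothetical selector
                    .setFormCharset(StandardCharsets.UTF_8);
            auth.getFormParams().put("rememberMe", "true");
            return auth;
        }
    }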
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
index 3a4e75965..8008b3185 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSniffer.java
@@ -45,9 +45,8 @@
*
* * EXPERIMENTAL: The use of this class is experimental. - * It is known to not be supported properly - * with some web drivers and/or browsers. It can even be ignored altogether - * by some web drivers. + * It is known to not be supported properly with some web drivers and/or + * browsers. It can even be ignored altogether by some web drivers. *
* * @since 3.0.0 @@ -129,17 +128,12 @@ void start(MutableCapabilities options) { new ResponseFilterAdapter.FilterSource( (response, contents, messageInfo) -> { // sniff only if original URL is being tracked - var trackedResponse = - trackedUrlResponses - .get(messageInfo.getOriginalUrl()); - + var trackedResponse = trackedUrlResponses + .get(messageInfo.getOriginalUrl()); if (trackedResponse != null) { - response.headers() - .forEach( - en -> trackedResponse.headers - .put( - en.getKey(), - en.getValue())); + response.headers().forEach( + en -> trackedResponse.headers.put( + en.getKey(), en.getValue())); trackedResponse.statusCode = response.status().code(); trackedResponse.reasonPhrase = diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java index 3975b622d..5f1716ff0 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/HttpSnifferConfig.java @@ -27,37 +27,8 @@ ** Configuration for {@link HttpSniffer}. *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "httpSniffer"). - *
- * - * @author Pascal Essiembre * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class HttpSnifferConfig { @@ -65,24 +36,48 @@ public class HttpSnifferConfig { public static final int DEFAULT_MAX_BUFFER_SIZE = DataUnit.MB.toBytes(10).intValue(); + /** + * The host name passed to the browser pointing to the sniffer proxy. + * Defaults to 0 (random free port). + */ private int port; /** * The host name passed to the browser pointing to the sniffer proxy. * Defaults to "localhost". - * @param host host name - * @return host name - * @since 3.1.0 */ private String host; + /** + * Optionally overwrite browser user agent. + */ private String userAgent; private final Map- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). - *
- * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @ToString @EqualsAndHashCode @Slf4j @@ -92,9 +78,8 @@ public void takeScreenshot(WebDriver driver, Doc doc) { imageHandler.setConfiguration(configuration); try (InputStream in = streamFactory.newInputStream( - new ByteArrayInputStream( - ((TakesScreenshot) driver) - .getScreenshotAs(OutputType.BYTES)))) { + new ByteArrayInputStream(((TakesScreenshot) driver) + .getScreenshotAs(OutputType.BYTES)))) { // If wanting a specific web element: if (StringUtils.isNotBlank(configuration.getCssSelector())) { @@ -107,24 +92,19 @@ public void takeScreenshot(WebDriver driver, Doc doc) { location.x, location.y, size.width, size.height); var img = new MutableImage(in); img.crop(rectangle); - imageHandler.handleImage( - img.toInputStream( - ofNullable( - getConfiguration() - .getImageFormat()) - .orElse("png")), + imageHandler.handleImage(img.toInputStream( + ofNullable(getConfiguration().getImageFormat()) + .orElse("png")), doc); } else { imageHandler.handleImage(in, doc); } } catch (Exception e) { if (LOG.isDebugEnabled()) { - LOG.error( - "Could not take screenshot of: {}", + LOG.error("Could not take screenshot of: {}", doc.getReference(), e); } else { - LOG.error( - "Could not take screenshot of: {}. Error:\n{}", + LOG.error("Could not take screenshot of: {}. Error:\n{}", doc.getReference(), ExceptionUtil.getFormattedMessages(e)); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java index a8632b22f..2302ced57 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/ScreenshotHandlerConfig.java @@ -17,8 +17,6 @@ import java.nio.file.Path; import java.nio.file.Paths; -import org.openqa.selenium.WebDriver; - import com.norconex.crawler.core.doc.CrawlDocMetadata; import com.norconex.crawler.web.fetch.util.DocImageHandlerConfig; @@ -27,27 +25,10 @@ /** *- * Takes screenshot of pages using a Selenium {@link WebDriver}. - * Either the entire page, or a specific DOM element. - * Screenshot images can be stored in a document metadata/field or - * in a local directory. - *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a supporting parent - * tag of any name. - * The expected parent tag name is defined by the consuming classes - * (e.g. "screenshot"). + * Configuration for {@link ScreenshotHandler}. *
- * * @since 3.0.0 */ -@SuppressWarnings("javadoc") @Data @Accessors(chain = true) public class ScreenshotHandlerConfig extends DocImageHandlerConfig { diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java index 1507eff02..804841eee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcher.java @@ -14,7 +14,6 @@ */ package com.norconex.crawler.web.fetch.impl.webdriver; -import static java.time.Duration.ofMillis; import static java.util.Optional.ofNullable; import java.io.InputStream; @@ -92,93 +91,6 @@ * Browsers/WebDriver implementations. * * - * {@nx.xml.usage - *The above example will use Firefox to crawl dynamically generated - * pages using a specific web driver. - *
- * * @since 3.0.0 */ @SuppressWarnings("javadoc") @@ -334,9 +246,8 @@ public HttpFetchResponse fetch(HttpFetchRequest req) .builder() .crawlDocState(CrawlDocState.NEW) .statusCode(200) - .reasonPhrase( - "No exception thrown, but real status code " - + "unknown. Capture headers for real status code.") + .reasonPhrase("No exception thrown, but real status code " + + "unknown. Capture headers for real status code.") .userAgent(getUserAgent()) .build(); } @@ -387,39 +298,30 @@ protected InputStream fetchDocumentContent(String url) { } var timeouts = driver.manage().timeouts(); - if (configuration.getPageLoadTimeout() != 0) { - timeouts.pageLoadTimeout( - ofMillis(configuration.getPageLoadTimeout())); + if (configuration.getPageLoadTimeout() != null) { + timeouts.pageLoadTimeout(configuration.getPageLoadTimeout()); } - if (configuration.getImplicitlyWait() != 0) { - timeouts.implicitlyWait( - ofMillis(configuration.getImplicitlyWait())); + if (configuration.getImplicitlyWait() != null) { + timeouts.implicitlyWait(configuration.getImplicitlyWait()); } - if (configuration.getScriptTimeout() != 0) { - timeouts.scriptTimeout( - ofMillis(configuration.getScriptTimeout())); + if (configuration.getScriptTimeout() != null) { + timeouts.scriptTimeout(configuration.getScriptTimeout()); } - if (configuration.getWaitForElementTimeout() != 0 + if (configuration.getWaitForElementTimeout() != null && StringUtils.isNotBlank( configuration.getWaitForElementSelector())) { var elType = ObjectUtils.defaultIfNull( configuration.getWaitForElementType(), WaitElementType.TAGNAME); - LOG.debug( - "Waiting for element '{}' of type '{}' for '{}'.", + LOG.debug("Waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); var wait = new WebDriverWait( - driver, ofMillis(configuration.getWaitForElementTimeout())); - wait.until( - ExpectedConditions.presenceOfElementLocated( - elType.getBy( - configuration - .getWaitForElementSelector()))); - - LOG.debug( - "Done waiting for element '{}' of type '{}' for '{}'.", + driver, configuration.getWaitForElementTimeout()); + wait.until(ExpectedConditions.presenceOfElementLocated( + elType.getBy(configuration.getWaitForElementSelector()))); + LOG.debug("Done waiting for element '{}' of type '{}' for '{}'.", configuration.getWaitForElementSelector(), elType, url); } @@ -428,8 +330,8 @@ protected InputStream fetchDocumentContent(String url) { configuration.getLatePageScript()); } - if (configuration.getThreadWait() != 0) { - Sleeper.sleepMillis(configuration.getThreadWait()); + if (configuration.getThreadWait() != null) { + Sleeper.sleepMillis(configuration.getThreadWait().toMillis()); } var pageSource = driver.getPageSource(); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java index d7a027955..47d9cf6ee 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfig.java @@ -17,6 +17,7 @@ import java.awt.Dimension; import java.net.URL; import java.nio.file.Path; +import java.time.Duration; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -64,11 +65,32 @@ By getBy(String selector) { } } + /** + * The browser used for crawling. Also defines which WebDriver to use. 
+ * Default is Firefox. + */ private Browser browser = Browser.FIREFOX; - // Default will try to detect driver installation on OS + /** + * Local path to driver executable ornull
to attempt
+ * automatic detection of the driver path.
+ * See the web driver vendor documentation for the expected driver
+ * location that enables automatic detection.
+ * Use {@link #setRemoteURL(URL)} instead when using
+ * a remote web driver cluster.
+ */
private Path driverPath;
- // Default will try to detect browser installation on OS
+ /**
+ * Local path to browser executable or null
to attempt
+ * automatic browser path detection. See the browser vendor documentation
+ * for the expected browser installation location.
+ * Use {@link #setRemoteURL(URL)} instead when using
+ * a remote web driver cluster.
+ */
private Path browserPath;
+ /**
+ * URL of a remote WebDriver cluster. Alternative to using a local
+ * browser and local web driver.
+ */
private URL remoteURL;
/**
@@ -81,31 +103,95 @@ By getBy(String selector) {
*/
private boolean useHtmlUnit;
+ /**
+ * Optionally set up an HTTP proxy that lets you set and capture HTTP
+ * headers. For advanced use only.
+ */
private HttpSniffer httpSniffer;
+
+ /**
+ * When configured, takes a screenshot of each web page.
+ */
private ScreenshotHandler screenshotHandler;
+ /**
+ * Optional capabilities (configuration options) for the web driver.
+ * Many are specific to each browser or web driver. Refer to vendor
+ * documentation.
+ */
private final Map- * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. - *
* @since 3.0.0 */ @Slf4j @@ -70,42 +53,8 @@ public class DocImageHandler implements Configurable- * Handles images associated with a document (which is different than a document - * being itself an image). Examples can be screenshots, featured image, etc. - * Images can be stored in a document metadata/field or in a local directory. - *
- * - * {@nx.xml.usage - *- * The above XML configurable options can be nested in a parent tag of any name. - * The expected parent tag name is defined by the consuming classes. + * Configuration for {@link DocImageHandler}. *
* @since 3.0.0 */ @@ -54,11 +35,34 @@ public class DocImageHandlerConfig { public enum Target { - METADATA, DIRECTORY + /** + * Store image in metadata field. + */ + METADATA, + /** + * Store image on local directory. + */ + DIRECTORY } + /** + * Directory structure when storing images on disk. + */ public enum DirStructure { - URL2PATH, DATE, DATETIME + /** + * Create directories for each URL segments, with handling + * of special characters. + */ + URL2PATH, + /** + * Create directories for each date (e.g.,2000/12/31/
).
+ */
+ DATE,
+ /**
+ * Create directories for each date and time, up to seconds
+ * (e.g., 2000/12/31/13/34/12/
).
+ */
+ DATETIME
}
public static final String DEFAULT_IMAGE_FORMAT = "png";
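For illustration only (not part of this patch), a small sketch of the directory layouts the DATE and DATETIME constants above describe; the helper class and method names are hypothetical:

    import java.time.LocalDateTime;
    import java.time.format.DateTimeFormatter;

    class DirStructureSketch {
        // DATE -> e.g. 2000/12/31/   DATETIME -> e.g. 2000/12/31/13/34/12/
        static String dateDir(LocalDateTime t) {
            return t.format(DateTimeFormatter.ofPattern("yyyy/MM/dd")) + "/";
        }
        static String dateTimeDir(LocalDateTime t) {
            return t.format(DateTimeFormatter.ofPattern("yyyy/MM/dd/HH/mm/ss")) + "/";
        }
    }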
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
index e6f30ff5f..15dd31f04 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProvider.java
@@ -14,11 +14,15 @@
*/
package com.norconex.crawler.web.fetch.util;
-import java.io.UnsupportedEncodingException;
+import static com.norconex.crawler.web.fetch.util.GenericRedirectUrlProviderConfig.DEFAULT_FALLBACK_CHARSET;
+import static java.util.Optional.ofNullable;
+import static org.apache.commons.lang3.StringUtils.substringAfterLast;
+import static org.apache.commons.lang3.StringUtils.trimToNull;
+
import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
+import java.nio.charset.Charset;
-import org.apache.commons.lang3.StringUtils;
+import org.apache.hc.core5.http.Header;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.HttpRequest;
import org.apache.hc.core5.http.HttpResponse;
@@ -26,11 +30,11 @@
import org.apache.hc.core5.http.protocol.HttpCoreContext;
import org.apache.tika.utils.CharsetUtils;
+import com.norconex.commons.lang.config.Configurable;
import com.norconex.commons.lang.url.HttpURL;
-import com.norconex.commons.lang.xml.Xml;
-import com.norconex.commons.lang.xml.XmlConfigurable;
import lombok.Data;
+import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
/**
@@ -80,34 +84,18 @@
*
*
*
- * {@nx.xml.usage
- * - *- * } - * - * The above example sets the default character encoding to be "ISO-8859-1" - * when it could not be detected. - *
- * * @since 2.4.0 */ @Slf4j @Data -public class GenericRedirectUrlProvider - implements RedirectUrlProvider, XmlConfigurable { - - public static final String DEFAULT_FALLBACK_CHARSET = - StandardCharsets.UTF_8.toString(); +public class GenericRedirectUrlProvider implements + RedirectUrlProvider, Configurable{ private static final int ASCII_MAX_CODEPOINT = 128; - private String fallbackCharset = DEFAULT_FALLBACK_CHARSET; + @Getter + private final GenericRedirectUrlProviderConfig configuration = + new GenericRedirectUrlProviderConfig(); @Override public String provideRedirectURL( @@ -127,30 +115,15 @@ public String provideRedirectURL( var hl = response.getLastHeader(HttpHeaders.LOCATION); if (hl == null) { //TODO should throw exception instead? - LOG.error( - "Redirect detected to a null Location for: {}", + LOG.error("Redirect detected to a null Location for: {}", originalURL); return null; } var redirectLocation = hl.getValue(); - //--- Charset --- - String charset = null; - var hc = response.getLastHeader("Content-Type"); - if (hc != null) { - var contentType = hc.getValue(); - if (contentType.contains(";")) { - charset = StringUtils.substringAfterLast( - contentType, "charset="); - } - } - if (StringUtils.isBlank(charset)) { - charset = fallbackCharset; - } - //--- Build/fix redirect URL --- var targetURL = HttpURL.toAbsolute(originalURL, redirectLocation); - targetURL = resolveRedirectURL(targetURL, charset); + targetURL = resolveRedirectURL(response, targetURL); if (LOG.isDebugEnabled()) { LOG.debug("URL redirect: {} -> {}", originalURL, targetURL); @@ -158,16 +131,17 @@ public String provideRedirectURL( return targetURL; } - //TODO is there value in moving this method to somewhere re-usable? + //MAYBE: is there value in moving this method to somewhere re-usable? private String resolveRedirectURL( - final String redirectURL, final String nonAsciiCharset) { + HttpResponse response, String redirectURL) { var url = redirectURL; // Is string containing only ASCII as it should? var isAscii = true; final var length = url.length(); - for (var offset = 0; offset < length;) { + var offset = 0; + while (offset < length) { final var codepoint = url.codePointAt(offset); if (codepoint > ASCII_MAX_CODEPOINT) { isAscii = false; @@ -184,30 +158,29 @@ private String resolveRedirectURL( Will try to fix. Redirect URL: {}""", redirectURL); // try to fix if non ascii charset is non UTF8. - if (StringUtils.isNotBlank(nonAsciiCharset)) { - var charset = CharsetUtils.clean(nonAsciiCharset); - if (!StandardCharsets.UTF_8.toString().equals(charset)) { - try { - return new String(url.getBytes(charset)); - } catch (UnsupportedEncodingException e) { - LOG.warn( - "Could not fix badly encoded URL with charset " - + "\"{}\". 
Redirect URL: {}", - charset, redirectURL, e); - } - } - } - - return new String(url.getBytes(StandardCharsets.UTF_8)); + return new String(url.getBytes(resolveCharset(response, redirectURL))); } - @Override - public void loadFromXML(Xml xml) { - setFallbackCharset(xml.getString("@fallbackCharset", fallbackCharset)); - } - - @Override - public void saveToXML(Xml xml) { - xml.setAttribute("fallbackCharset", fallbackCharset); + // Detect charset from response header or use fallback + private Charset resolveCharset(HttpResponse response, String redirectUrl) { + return ofNullable(response.getLastHeader("Content-Type")) + .map(Header::getValue) + .filter(ct -> ct.contains(";")) + .map(ct -> trimToNull(substringAfterLast(ct, "charset="))) + .map(chset -> { + try { + return CharsetUtils.forName(chset); + } catch (RuntimeException e) { + var charset = + ofNullable(configuration.getFallbackCharset()) + .orElse(DEFAULT_FALLBACK_CHARSET); + LOG.warn(""" + Could not fix badly encoded URL with charset \ + "{}". Redirect URL: "{}". Will try with \ + fallback charset: {}""", + charset, redirectUrl, charset); + return charset; + } + }).get(); } } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java new file mode 100644 index 000000000..17bc2e59b --- /dev/null +++ b/crawler/web/src/main/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderConfig.java @@ -0,0 +1,34 @@ +/* Copyright 2015-2024 Norconex Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.norconex.crawler.web.fetch.util; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import lombok.Data; +import lombok.experimental.Accessors; + +/** + * Configuration for {@link GenericRedirectUrlProvider}. + */ +@Data +@Accessors(chain = true) +public class GenericRedirectUrlProviderConfig { + + public static final Charset DEFAULT_FALLBACK_CHARSET = + StandardCharsets.UTF_8; + + private Charset fallbackCharset = DEFAULT_FALLBACK_CHARSET; +} diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java index ef0a7e002..9560ec923 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProvider.java @@ -51,20 +51,6 @@ * If robots instructions are provided in both the HTML page and * HTTP header, the ones in HTML page will take precedence, and the * ones in HTTP header will be ignored.
- * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(string prefixing headers) - *- * } - * - * The above example ignores robot meta information. - *
*/ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java index 9ee56acf0..4bf348029 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java @@ -14,43 +14,11 @@ */ package com.norconex.crawler.web.robot.impl; -import com.norconex.crawler.web.robot.RobotsMetaProvider; - import lombok.Data; import lombok.experimental.Accessors; /** - *Implementation of {@link RobotsMetaProvider} as per X-Robots-Tag - * and ROBOTS standards. - * Extracts robots information from "ROBOTS" meta tag in an HTML page - * or "X-Robots-Tag" tag in the HTTP header (see - * - * https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag - * and - * - * http://www.robotstxt.org/meta.html). - *
- * - *If you specified a prefix for the HTTP headers, make sure to specify it - * again here or the robots meta tags will not be found.
- * - *If robots instructions are provided in both the HTML page and - * HTTP header, the ones in HTML page will take precedence, and the - * ones in HTTP header will be ignored.
- * - * {@nx.xml.usage - *- * - * } - * - * {@nx.xml.example - *(string prefixing headers) - *- * } - * - * The above example ignores robot meta information. - *
+ * Configuration for {@link StandardRobotsMetaProvider}. */ @Data @Accessors(chain = true) diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java index e104ecf72..4994e5726 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsTxtProvider.java @@ -60,18 +60,6 @@ * described at * http://www.robotstxt.org/robotstxt.html. * - * {@nx.xml.usage - *- * } - * - * {@nx.xml.example - * - *- * } - * - * The above example ignores "robots.txt" files present on web sites. - *
*/ @Slf4j @EqualsAndHashCode diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java index e992ffeb1..2b3f19694 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocator.java @@ -41,23 +41,6 @@ * Default paths are:/sitemap.xml
and */sitemap_index.xml
* - * - * {@nx.xml.usage - *- * - * } */ @EqualsAndHashCode @ToString diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java index ea9ff1920..7eabace5b 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/GenericSitemapLocatorConfig.java @@ -19,40 +19,13 @@ import java.util.List; import com.norconex.commons.lang.collection.CollectionUtil; -import com.norconex.crawler.web.robot.RobotsTxtProvider; import lombok.Data; import lombok.experimental.Accessors; import lombok.experimental.FieldNameConstants; /** - *- * - * - *- * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - *- * If there is a sitemap defined as a start reference for the same URL web site, - * this locator is not used. Otherwise, it tells the crawler to - * use the sitemap as defined in the web site "robots.txt" file (provided - * the web site defines one and {@link RobotsTxtProvider} is enabled). - * If no sitemap resolution was possible from "robots.txt", an attempt will - * be made to retrieve a sitemap using the configured sitemap paths. - * Default paths are:
- * - * {@nx.xml.usage - */sitemap.xml
and - */sitemap_index.xml
- *- * - * } + * Configuration for {@link GenericSitemapLocator}. */ @Data @Accessors(chain = true) @@ -62,8 +35,18 @@ public class GenericSitemapLocatorConfig { public static final List- * - * - *- * (Sitemap URL path relative to web site domain. - * Overwriting default when specified.) - * - *DEFAULT_PATHS = List.of("/sitemap.xml", "/sitemap_index.xml"); + /** + * The domain-relative URL paths where to look for sitemaps when not + * supplied as start reference or part of a web site robots.txt file. + * Defaults to /sitemap.xml
and + */sitemap_index.xml
. + */ private final Listpaths = new ArrayList<>(DEFAULT_PATHS); + /** + * Whether to disable checking for the sitemap locations in a web site + * robots.txt file. + */ private boolean robotsTxtSitemapDisabled; /** diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java index 481b5767f..92cc053ab 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapParser.java @@ -55,9 +55,8 @@ List parse( Xml.stream(is) .takeWhile(c -> { if (stopping.isTrue()) { - LOG.debug( - "Sitemap not entirely parsed due to " - + "crawler being stopped."); + LOG.debug("Sitemap not entirely parsed due to " + + "crawler being stopped."); return false; } return true; @@ -72,13 +71,11 @@ List parse( } }); } catch (XmlException e) { - LOG.error( - "Cannot fetch sitemap: {} -- Likely an invalid sitemap " - + "XML format causing a parsing error (actual error:{}).", + LOG.error("Cannot fetch sitemap: {} -- Likely an invalid sitemap " + + "XML format causing a parsing error (actual error:{}).", location, e.getMessage()); } catch (IOException e) { - LOG.error( - "Cannot fetch sitemap: {} ({})", + LOG.error("Cannot fetch sitemap: {} ({})", location, e.getMessage(), e); } return children; @@ -104,9 +101,8 @@ private Optional toDocRecord( // Is URL valid? if (StringUtils.isBlank(url) || (!lenient && !url.startsWith(sitemapLocationDir))) { - LOG.debug( - "Sitemap URL invalid for location directory." - + " URL: {} Location directory: {}", + LOG.debug("Sitemap URL invalid for location directory." + + " URL: {} Location directory: {}", url, sitemapLocationDir); return Optional.empty(); } diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java index af86a6100..70e2470f9 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/sitemap/impl/SitemapUtil.java @@ -73,10 +73,6 @@ static boolean shouldProcessSitemap( || cacheModifDate.isBefore(newRec.getLastModified()); } - // static ZonedDateTime now() { - // return ZonedDateTime.now(ZoneOffset.UTC); - // } - static SitemapRecord toSitemapRecord(CrawlDoc doc) { var indexRec = new SitemapRecord(); var docRec = Web.docContext(doc); diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java index 2d717cb30..995461b5d 100644 --- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java +++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java @@ -35,7 +35,6 @@ import com.norconex.crawler.web.WebCrawlerConfig; import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector; import com.norconex.crawler.web.doc.operations.delay.DelayResolver; -import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange; import com.norconex.crawler.web.doc.operations.link.LinkExtractor; import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver; import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver; @@ -62,7 +61,6 @@ public MultiValuedMap , Class>> getPolymorphicTypes() { addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum"); 
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
index 2d717cb30..995461b5d 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/spi/CrawlerWebPtProvider.java
@@ -35,7 +35,6 @@
 import com.norconex.crawler.web.WebCrawlerConfig;
 import com.norconex.crawler.web.doc.operations.canon.CanonicalLinkDetector;
 import com.norconex.crawler.web.doc.operations.delay.DelayResolver;
-import com.norconex.crawler.web.doc.operations.delay.impl.DelayRange;
 import com.norconex.crawler.web.doc.operations.link.LinkExtractor;
 import com.norconex.crawler.web.doc.operations.recrawl.RecrawlableResolver;
 import com.norconex.crawler.web.doc.operations.scope.UrlScopeResolver;
@@ -62,7 +61,6 @@ public MultiValuedMap<Class<?>, Class<?>> getPolymorphicTypes() {
         addPolyType(map, MetadataChecksummer.class, "doc.operations.checksum");
         addPolyType(map, EventListener.class, "event.listeners");
         addPolyType(map, DelayResolver.class);
-        addPolyType(map, DelayRange.class);
         addPolyType(
                 map, DocumentFilter.class, "doc.operations.filter"); //NOSONAR
diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
index 64f7ceaef..60413b5e7 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/util/Web.java
@@ -16,8 +16,6 @@
 import static org.apache.commons.lang3.StringUtils.substring;
 
-import java.util.Collection;
-import java.util.List;
 import java.util.Optional;
 import java.util.regex.Pattern;
 
@@ -27,7 +25,6 @@
 import com.norconex.crawler.core.Crawler;
 import com.norconex.crawler.core.doc.CrawlDoc;
 import com.norconex.crawler.core.event.CrawlerEvent;
-import com.norconex.crawler.core.fetch.Fetcher;
 import com.norconex.crawler.web.WebCrawlerConfig;
 import com.norconex.crawler.web.WebCrawlerContext;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
@@ -48,38 +45,17 @@ public static void fireIfUrlOutOfScope(
             WebCrawlDocContext docContext,
             UrlScope urlScope) {
         if (!urlScope.isInScope()) {
-            crawler.fire(
-                    CrawlerEvent
-                            .builder()
-                            .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE)
-                            .source(crawler)
-                            .subject(Web.config(crawler).getUrlScopeResolver())
-                            .docContext(docContext)
-                            .message(urlScope.outOfScopeReason())
-                            .build());
+            crawler.fire(CrawlerEvent
+                    .builder()
+                    .name(WebCrawlerEvent.REJECTED_OUT_OF_SCOPE)
+                    .source(crawler)
+                    .subject(Web.config(crawler).getUrlScopeResolver())
+                    .docContext(docContext)
+                    .message(urlScope.outOfScopeReason())
+                    .build());
         }
     }
 
-    //    private static final BeanMapper BEAN_MAPPER =
-    //            CrawlSessionBeanMapperFactory.create(
-    //                    WebCrawlerConfig.class, b ->
-    //                            b.unboundPropertyMapping(
-    //                                    "crawler", WebCrawlerMixIn.class));
-    //    private static class WebCrawlerMixIn {
-    //        @JsonDeserialize(as = WebCrawlerConfig.class)
-    //        private CrawlerConfig configuration;
-    //    }
-
-    //    public static BeanMapper beanMapper() {
-    //        return BEAN_MAPPER;
-    //    }
-
-    //    public static WebCrawlerConfig config(CrawlerConfig cfg) {
-    //        return (WebCrawlerConfig) cfg;
-    //    }
-    //    public static WebCrawlerConfig config(AbstractPipelineContext ctx) {
-    //        return (WebCrawlerConfig) Web.config(ctx.getCrawler());
-    //    }
     public static WebCrawlerConfig config(Crawler crawler) {
         return (WebCrawlerConfig) crawler.getConfiguration();
     }
@@ -88,32 +64,6 @@ public static WebCrawlerContext crawlerContext(Crawler crawler) {
         return (WebCrawlerContext) crawler.getContext();
     }
 
-    //    public static WebImporterPipelineContext importerContext(
-    //            AbstractPipelineContext ctx) {
-    //        return (WebImporterPipelineContext) ctx;
-    //    }
-
-    //    //TODO move this one to core?
-    //    public static void fire(
-    //            Crawler crawler,
-    //            @NonNull
-    //            Consumer<CrawlerEventBuilder<?, ?>> c) {
-    //        if (crawler != null) {
-    //            var builder = CrawlerEvent.builder();
-    //            c.accept(builder);
-    //            crawler.getEventManager().fire(builder.build());
-    //        }
-    //    }
-
-    //TODO could probably move this where needed since generically,
-    // we would get the fetcher wrapper directly from crawler.
-    public static List<HttpFetcher> toHttpFetcher(
-            @NonNull Collection<? extends Fetcher<?, ?>> fetchers) {
-        return fetchers.stream()
-                .map(HttpFetcher.class::cast)
-                .toList();
-    }
-
     public static HttpFetcher fetcher(Crawler crawler) {
         return (HttpFetcher) crawler.getFetcher();
     }
@@ -130,10 +80,9 @@ public static WebCrawlDocContext cachedDocContext(
 
     public static RobotsTxt robotsTxt(Crawler crawler, String reference) {
         var cfg = Web.config(crawler);
         return Optional.ofNullable(cfg.getRobotsTxtProvider())
-                .map(
-                        rb -> rb.getRobotsTxt(
-                                (HttpFetcher) crawler.getFetcher(),
-                                reference))
+                .map(rb -> rb.getRobotsTxt(
+                        (HttpFetcher) crawler.getFetcher(),
+                        reference))
                 .orElse(null);
     }
 
@@ -199,15 +148,14 @@ public static Properties parseDomAttributes(
         if (StringUtils.isBlank(attribsStr)) {
             return props;
         }
-        doParseDomAttributes(
-                attribsStr
-                        // strip before and after angle brackets as separate steps,
-                        // in case of weird mark-up
-                        .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1")
-                        .replaceFirst("(?s)^(.*?)>.*$", "$1")
-                        .replaceAll("\\s+", " ")
-                        .replace(" =", "=")
-                        .replace("= ", "="),
+        doParseDomAttributes(attribsStr
+                // strip before and after angle brackets as separate steps,
+                // in case of weird mark-up
+                .replaceFirst("(?s)^.*<\\s*[\\w-]+\\s*(.*)$", "$1")
+                .replaceFirst("(?s)^(.*?)>.*$", "$1")
+                .replaceAll("\\s+", " ")
+                .replace(" =", "=")
+                .replace("= ", "="),
                 props);
        return props;
    }
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
index 073be458c..ea8041132 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/cases/feature/StayOnSitemapTest.java
@@ -134,7 +134,7 @@ public HttpFetchResponse fetch(HttpFetchRequest req)
         mem.getUpsertRequests().forEach(req -> {
             assertThat(
                     req.getMetadata().getInteger(
-                            "collector.depth")).isZero();
+                            "crawler.depth")).isZero();
             assertThat(req.getReference()).containsAnyOf(
                     page1Path,
                     page2Path,
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html
similarity index 100%
rename from crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageProcessorTest.html
rename to crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/image/impl/FeaturedImageResolverTest.html
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
index 5ea5a0557..b0d93500f 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/recrawl/impl/GenericRecrawlableResolverTest.java
@@ -32,6 +32,7 @@
 import com.norconex.commons.lang.text.TextMatcher;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency;
+import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.MinFrequency.ApplyTo;
 import com.norconex.crawler.web.doc.operations.recrawl.impl.GenericRecrawlableResolverConfig.SitemapSupport;
 
 import lombok.extern.slf4j.Slf4j;
@@ -45,10 +46,10 @@ void testWriteRead() {
         r.getConfiguration().setSitemapSupport(SitemapSupport.LAST);
 
         var f1 = new MinFrequency(
-                "reference", "monthly",
+                ApplyTo.REFERENCE, "monthly",
                 TextMatcher.regex(".*\\.pdf").ignoreCase());
         var f2 = new MinFrequency(
-                "contentType", "1234",
+                ApplyTo.CONTENT_TYPE, "1234",
                 TextMatcher.regex(".*"));
 
         r.getConfiguration().setMinFrequencies(List.of(f1, f2));
@@ -72,13 +73,14 @@ void testCustomFrequency() {
         prevCrawl.setCrawlDate(prevCrawlDate);
 
         var f = new MinFrequency(
-                "reference", "120 days", TextMatcher.regex(".*"));
+                ApplyTo.REFERENCE, "120 days", TextMatcher.regex(".*"));
 
         r.getConfiguration().setMinFrequencies(List.of(f));
         Assertions.assertFalse(r.isRecrawlable(prevCrawl));
 
         // Delay has passed
-        f = new MinFrequency("reference", "5 days", TextMatcher.regex(".*"));
+        f = new MinFrequency(
+                ApplyTo.REFERENCE, "5 days", TextMatcher.regex(".*"));
         r.getConfiguration().setMinFrequencies(List.of(f));
         Assertions.assertTrue(r.isRecrawlable(prevCrawl));
     }
@@ -163,10 +165,10 @@ void testIsRecrawlable(
         var matcher = "reference".equals(minFreqApplyTo)
                 ? TextMatcher.basic(url)
                 : TextMatcher.basic("text/html");
-        resolver.getConfiguration().setMinFrequencies(
-                List.of(
-                        new MinFrequency(
-                                minFreqApplyTo, minFreqValue, matcher)));
+        resolver.getConfiguration().setMinFrequencies(List.of(
+                new MinFrequency("reference".equals(minFreqApplyTo)
+                        ? ApplyTo.REFERENCE
+                        : ApplyTo.CONTENT_TYPE, minFreqValue, matcher)));
 
         assertThat(resolver.isRecrawlable(prevRec)).isEqualTo(expected);
     }
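For reference, the enum-based MinFrequency introduced above is used along these lines. This is a sketch following the exact calls in the updated tests; the "30 days" value is an arbitrary example.

    // Do not recrawl matching PDFs more often than every 30 days,
    // favoring the last sitemap hint when available.
    var r = new GenericRecrawlableResolver();
    r.getConfiguration().setSitemapSupport(SitemapSupport.LAST);
    r.getConfiguration().setMinFrequencies(List.of(
            new MinFrequency(ApplyTo.REFERENCE, "30 days",
                    TextMatcher.regex(".*\\.pdf").ignoreCase())));
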
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
index 405dc0529..3aea1ee01 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/AbstractWebDriverHttpFetcherTest.java
@@ -25,6 +25,7 @@
 import java.io.UncheckedIOException;
 import java.net.ServerSocket;
 import java.nio.file.Path;
+import java.time.Duration;
 import java.util.List;
 
 import org.apache.commons.lang3.RandomStringUtils;
@@ -282,11 +283,11 @@ void testResolvingUserAgent(ClientAndServer client) {
                     // test setting a bunch of other params
                     fetcher.getConfiguration()
                             .setWindowSize(new java.awt.Dimension(640, 480))
-                            .setPageLoadTimeout(10_1000)
-                            .setImplicitlyWait(1000)
-                            .setScriptTimeout(10_000)
+                            .setPageLoadTimeout(Duration.ofSeconds(10))
+                            .setImplicitlyWait(Duration.ofSeconds(1))
+                            .setScriptTimeout(Duration.ofSeconds(10))
                             .setWaitForElementSelector("p")
-                            .setWaitForElementTimeout(10_000);
+                            .setWaitForElementTimeout(Duration.ofSeconds(10));
                     cfg.setStartReferences(List.of(hostUrl(client, path)));
                 });
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
index b0be4a0db..7ca59db96 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/impl/webdriver/WebDriverHttpFetcherConfigTest.java
@@ -21,6 +21,7 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.file.Paths;
+import java.time.Duration;
 import java.util.List;
 
 import org.junit.jupiter.api.Test;
@@ -45,13 +46,13 @@ void testWriteReadFetcher() throws MalformedURLException {
         c.setBrowserPath(Paths.get("/some/browser/path"));
         c.setDriverPath(Paths.get("/some/driver/path"));
         c.setRemoteURL(new URL("http://example.com"));
-        c.setImplicitlyWait(4000);
+        c.setImplicitlyWait(Duration.ofSeconds(4));
         c.setEarlyPageScript("alert('hello init!');");
-        c.setPageLoadTimeout(5000);
+        c.setPageLoadTimeout(Duration.ofSeconds(5));
         c.setLatePageScript("alert('hello page!');");
-        c.setScriptTimeout(6000);
+        c.setScriptTimeout(Duration.ofSeconds(6));
         c.setWaitForElementSelector("#header");
-        c.setWaitForElementTimeout(1234);
+        c.setWaitForElementTimeout(Duration.ofMillis(1234));
         c.setWaitForElementType(WaitElementType.ID);
         c.setWindowSize(new Dimension(666, 999));
         c.setCapabilities(
@@ -69,12 +70,9 @@ void testWriteReadFetcher() throws MalformedURLException {
                         "rh2", "hrval2"));
         c.setHttpSniffer(snif);
 
-        c.setReferenceFilters(
-                List.of(
-                        configure(
-                                new GenericReferenceFilter(), cfg -> cfg
-                                        .setValueMatcher(
-                                                TextMatcher.regex("test.*")))));
+        c.setReferenceFilters(List.of(configure(
+                new GenericReferenceFilter(), cfg -> cfg
+                        .setValueMatcher(TextMatcher.regex("test.*")))));
 
         var sh = new ScreenshotHandler();
         sh.getConfiguration()
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
index f85159170..6ea9d6162 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/fetch/util/GenericRedirectUrlProviderTest.java
@@ -27,7 +27,7 @@ class GenericRedirectUrlProviderTest {
     @Test
     void testWriteRead() {
         var p = new GenericRedirectUrlProvider();
-        p.setFallbackCharset(StandardCharsets.UTF_8.toString());
+        p.getConfiguration().setFallbackCharset(StandardCharsets.UTF_8);
         assertThatNoException()
                 .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(p));
     }
diff --git a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
index e2e838f8f..59021df4b 100644
--- a/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
+++ b/crawler/web/src/test/resources/validation/web-crawl-session-large.xml
@@ -232,7 +232,7 @@
 
-
+
 
@@ -352,14 +352,17 @@
 
-
+
           text/html
           dom
           dom
           425x312
           true
           1234
           /some/path
-          url, inline
+
+            url
+            inline
+
           medium
           25
           true
diff --git a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
index 135fd09a0..eb1ba3fa1 100644
--- a/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
+++ b/importer/src/main/java/com/norconex/importer/handler/CommonMatchers.java
@@ -214,6 +214,16 @@ public static TextMatcher imageIOStandardContentTypes() {
         return csv(IMAGE_IO_CONTENT_TYPES);
     }
 
+    /**
+     * <p>
+     * Matches all content types.
+     * </p>
+     * @return text matcher
+     */
+    public static TextMatcher all() {
+        return TextMatcher.regex(".*");
+    }
+
     private static TextMatcher csv(Set<String> values) {
         return TextMatcher
                 .csv(StringUtils.join(values, ','))
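The millisecond-to-Duration migration exercised by the test changes above amounts to the following kind of usage. Sketch only: instantiating the config directly is an assumption, and the setters are the ones shown in the tests.

    // WebDriver timeouts now take java.time.Duration values instead of
    // millisecond integers.
    var c = new WebDriverHttpFetcherConfig(); // direct instantiation assumed
    c.setPageLoadTimeout(Duration.ofSeconds(10));
    c.setImplicitlyWait(Duration.ofSeconds(1));
    c.setScriptTimeout(Duration.ofSeconds(10));
    c.setWaitForElementSelector("#header");
    c.setWaitForElementTimeout(Duration.ofMillis(1234));
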
From 0f195992ace10c18e68a25d7839a8e124c62f113 Mon Sep 17 00:00:00 2001
From: essiembre
Date: Sun, 8 Sep 2024 06:20:13 +0000
Subject: [PATCH 05/10] Apply Copyright year changes

---
 .../web/robot/impl/StandardRobotsMetaProviderConfig.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
index 4bf348029..7900cbf5d 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/robot/impl/StandardRobotsMetaProviderConfig.java
@@ -1,4 +1,4 @@
-/* Copyright 2010-2023 Norconex Inc.
+/* Copyright 2010-2024 Norconex Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 1cee93bdf8accd9927401283aa19ce4f10d53c79 Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
Date: Tue, 10 Sep 2024 22:22:29 -0400
Subject: [PATCH 06/10] More code coverage.

---
 .../link/impl/RegexLinkExtractor.java         |  4 +-
 .../delay/impl/GenericDelayResolverTest.java  | 22 +++++++
 .../link/impl/RegexLinkExtractorTest.java     | 60 +++++++++++++++++++
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
index 235cf3a6a..7ff6146f0 100644
--- a/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
+++ b/crawler/web/src/main/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractor.java
@@ -86,9 +86,9 @@ public class RegexLinkExtractor
 
     //TODO make buffer size and overlap size configurable
     //1MB: make configurable
-    private static final int MAX_BUFFER_SIZE = 1024 * 1024;
+    static final int MAX_BUFFER_SIZE = 1024 * 1024;
     // max url leng is 2048 x 2 bytes x 2 for anchor attributes.
-    private static final int OVERLAP_SIZE = 2 * 2 * 2048;
+    static final int OVERLAP_SIZE = 2 * 2 * 2048;
 
     @Getter
     private final RegexLinkExtractorConfig configuration =
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
index c7211a68e..ff4881865 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/delay/impl/GenericDelayResolverTest.java
@@ -27,6 +27,7 @@
 import com.norconex.commons.lang.bean.BeanMapper;
 import com.norconex.crawler.web.doc.operations.delay.impl.BaseDelayResolverConfig.DelayResolverScope;
+import com.norconex.crawler.web.robot.RobotsTxt;
 
 class GenericDelayResolverTest {
 
@@ -57,6 +58,27 @@ void testWriteRead() {
                 .isThrownBy(() -> BeanMapper.DEFAULT.assertWriteRead(r));
     }
 
+    @Test
+    void testNullDelays() {
+        var r = new GenericDelayResolver();
+        r.getConfiguration()
+                .setScope(null);
+        assertThatNoException().isThrownBy(
+                () -> r.delay(null, "http://somewhere.com"));
+
+    }
+
+    @Test
+    void testWithRobotsTxt() {
+        var r = new GenericDelayResolver();
+        //        r.getConfiguration()
+        //                .setScope(null);
+        var robotsTxt = RobotsTxt.builder().crawlDelay(1000f).build();
+        assertThatNoException().isThrownBy(
+                () -> r.delay(robotsTxt, "http://somewhere.com"));
+
+    }
+
     @Test
     void testDelayScheduleBoundaries() {
         //FYI: Jan 1, 2000 was a Saturday
diff --git a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
index e7111051e..0599806b7 100644
--- a/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
+++ b/crawler/web/src/test/java/com/norconex/crawler/web/doc/operations/link/impl/RegexLinkExtractorTest.java
@@ -15,9 +15,11 @@
 package com.norconex.crawler.web.doc.operations.link.impl;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatNoException;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -25,6 +27,7 @@
 import java.util.List;
 import java.util.Set;
 
+import org.apache.commons.io.input.NullInputStream;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
@@ -32,6 +35,8 @@
 import com.norconex.commons.lang.bean.BeanMapper.Format;
 import com.norconex.commons.lang.file.ContentType;
 import com.norconex.commons.lang.io.CachedInputStream;
+import com.norconex.commons.lang.map.PropertyMatcher;
+import com.norconex.commons.lang.text.TextMatcher;
 import com.norconex.crawler.core.doc.CrawlDoc;
 import com.norconex.crawler.web.doc.WebCrawlDocContext;
 import com.norconex.crawler.web.doc.operations.link.Link;
@@ -141,6 +146,61 @@ void testGenericWriteRead() {
                 () -> BeanMapper.DEFAULT.assertWriteRead(extractor));
     }
 
+    @Test
+    void testFromFieldAndRestrictions() throws IOException {
+        var extractor = new RegexLinkExtractor();
+        var cfg = extractor.getConfiguration();
+        cfg.setPatterns(
+                List.of(new ExtractionPattern("http:.*?\\.html", null)));
+        cfg.getRestrictions().add(
+                new PropertyMatcher(TextMatcher.regex(".*")));
+        cfg.getFieldMatcher().setPattern("myfield");
+
+        var doc = toCrawlDoc("n/a",
+                ContentType.TEXT,
+                NullInputStream.nullInputStream());
+        doc.getMetadata().set("myfield",
+                "http://one.com/1.html|http://two.com/2.html|NOT_ME");
+        var links = extractor.extractLinks(doc);
+        assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder(
+                "http://one.com/1.html", "http://two.com/2.html");
+
+        cfg.clearPatterns();
+        cfg.clearRestrictions();
+        cfg.setContentTypeMatcher(TextMatcher.basic("application/pdf"));
+        links = extractor.extractLinks(doc);
+        assertThat(links).isEmpty();
+    }
+
+    @Test
+    void testNoRestrictionMatch() throws IOException {
+        var extractor = new RegexLinkExtractor();
+        var cfg = extractor.getConfiguration();
+        cfg.getRestrictions().add(
+                new PropertyMatcher(TextMatcher.regex("NOPE")));
+
+        var doc = toCrawlDoc("n/a",
+                ContentType.TEXT,
+                NullInputStream.nullInputStream());
+        var links = extractor.extractLinks(doc);
+        assertThat(links).isEmpty();
+    }
+
+    @Test
+    void testLargeContent() throws IOException {
+        var doc = toCrawlDoc("n/a", ContentType.TEXT, new ByteArrayInputStream(
+                ("http://one.com/1.html"
+                        + "X".repeat(RegexLinkExtractor.MAX_BUFFER_SIZE)
+                        + "http://two.com/2.html" + "X".repeat(
+                                RegexLinkExtractor.MAX_BUFFER_SIZE))
+                        .getBytes()));
+        var extractor = new RegexLinkExtractor();
+        extractor.getConfiguration().setPatterns(
+                List.of(new ExtractionPattern("http:.*?\\.html", null)));
+        var links = extractor.extractLinks(doc);
+        assertThat(links).map(Link::getUrl).containsExactlyInAnyOrder(
+                "http://one.com/1.html", "http://two.com/2.html");
+    }
+
     private boolean contains(Set<Link> links, String url) {
         for (Link link : links) {
             if (url.equals(link.getUrl())) {
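The new tests above boil down to this kind of RegexLinkExtractor setup. Sketch only: the calls mirror those in the tests, and "doc" stands for any CrawlDoc prepared the way the tests prepare one.

    // Extract links matching a pattern from the "myfield" metadata field
    // rather than from the document body.
    var extractor = new RegexLinkExtractor();
    var cfg = extractor.getConfiguration();
    cfg.setPatterns(List.of(new ExtractionPattern("http:.*?\\.html", null)));
    cfg.getFieldMatcher().setPattern("myfield");
    var links = extractor.extractLinks(doc); // "doc" built as in the tests
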
From ba0c2a61c3a9fbae1b0e873edbf6d165b84433ad Mon Sep 17 00:00:00 2001
From: Pascal Essiembre
Date: Wed, 11 Sep 2024 00:06:45 -0400
Subject: [PATCH 07/10] Code coverage.

---
 crawler/web/pom.xml                           |  7 ++
 .../com/norconex/crawler/web/WebCrawler.java  | 40 ++++++++--
 .../UrlStatusCrawlerEventListener.java        |  4 +-
 .../norconex/crawler/web/WebCrawlerTest.java  | 33 +++++++++
 .../impl/HtmlDomTikaLinkExtractorTest.java    | 24 ++++++
 .../UrlStatusCrawlerEventListenerTest.java    | 74 +++++++++++++++----
 .../crawler/web/stubs/CrawlerStubs.java       | 11 +--
 7 files changed, 161 insertions(+), 32 deletions(-)
 create mode 100644 crawler/web/src/test/java/com/norconex/crawler/web/WebCrawlerTest.java

diff --git a/crawler/web/pom.xml b/crawler/web/pom.xml
index d3074e379..85cdf08df 100644
--- a/crawler/web/pom.xml
+++ b/crawler/web/pom.xml
@@ -214,6 +214,13 @@
       selenium
       test
+