From 23c9eddbb2993b1fca5d57f54242bff42878f5f0 Mon Sep 17 00:00:00 2001 From: Marko Galevski Date: Fri, 11 Oct 2024 13:28:00 +0200 Subject: [PATCH 1/2] Ignore textContent links in html nodes This fixes issue #1462 by removing plaintext URI parsing in html5ever and pruning attribute-less URIs in html5gum --- lychee-lib/src/collector.rs | 4 ++- lychee-lib/src/extract/html/html5ever.rs | 23 +++++++++++-- lychee-lib/src/extract/html/html5gum.rs | 41 +++++++++++++++++------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 40c24ce542..406eb23255 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -144,7 +144,9 @@ mod tests { // Helper function to run the collector on the given inputs async fn collect(inputs: Vec, base: Option) -> HashSet { - let responses = Collector::new(base).collect_links(inputs); + let responses = Collector::new(base) + .include_verbatim(true) + .collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await } diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index d10ac4e00e..9bc0214767 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -28,9 +28,11 @@ impl TokenSink for LinkExtractor { if self.current_verbatim_element_name.borrow().is_some() { return TokenSinkResult::Continue; } - self.links - .borrow_mut() - .extend(extract_raw_uri_from_plaintext(&raw)); + if self.include_verbatim { + self.links + .borrow_mut() + .extend(extract_raw_uri_from_plaintext(&raw)); + } } Token::TagToken(tag) => { let Tag { @@ -413,4 +415,19 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_ignore_text_content_links() { + let input = r#" + https://ignoreme.com + "#; + let expected = vec![RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: 
Some("href".to_string()), + }]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index 276d2e0e86..3fa80aa155 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -4,6 +4,15 @@ use std::collections::{HashMap, HashSet}; use super::{is_email_link, is_verbatim_elem, srcset}; use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri}; +#[derive(Clone, Default, Debug)] +struct Element { + /// Current element name being processed. + /// This is called a tag in html5gum. + name: String, + /// Whether the current element is a closing tag. + is_closing: bool, +} + /// Extract links from HTML documents. /// /// This is the main driver for the html5gum tokenizer. @@ -16,7 +25,7 @@ use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw: /// /// The `links` vector contains all links extracted from the HTML document and /// the `fragments` set contains all fragments extracted from the HTML document. -#[derive(Clone, Default)] +#[derive(Clone, Default, Debug)] struct LinkExtractor { /// Links extracted from the HTML document. links: Vec, @@ -39,15 +48,6 @@ struct LinkExtractor { verbatim_stack: Vec, } -#[derive(Clone, Default)] -struct Element { - /// Current element name being processed. - /// This is called a tag in html5gum. - name: String, - /// Whether the current element is a closing tag. - is_closing: bool, -} - impl LinkExtractor { /// Create a new `LinkExtractor`. 
/// @@ -325,7 +325,11 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { let mut extractor = LinkExtractor::new(include_verbatim); let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); assert!(tokenizer.next().is_none()); - extractor.links + extractor + .links + .into_iter() + .filter(|link| link.attribute.is_some() || include_verbatim) + .collect() } /// Extract fragments from id attributes within a HTML string. @@ -607,4 +611,19 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_ignore_text_content_links() { + let input = r#" + https://ignoreme.com + "#; + let expected = vec![RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } From eae951bad8419123ebfdeb80f5eef4d4daf05302 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 00:18:15 +0200 Subject: [PATCH 2/2] formatting --- lychee-lib/src/collector.rs | 2 ++ lychee-lib/src/extract/html/html5gum.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index c4a90ca6e3..341bb21dba 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -147,6 +147,8 @@ mod tests { let responses = Collector::new(base).collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await } + + // Helper function for collecting verbatim links async fn collect_verbatim(inputs: Vec, base: Option) -> HashSet { let responses = Collector::new(base) .include_verbatim(true) diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index bd23417a2f..be78a3116a 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -627,6 +627,7 @@ mod tests { let uris = extract_html(input, false); assert_eq!(uris, expected); } + 
#[test] fn test_skip_dns_prefetch() { let input = r#"