From 23c9eddbb2993b1fca5d57f54242bff42878f5f0 Mon Sep 17 00:00:00 2001 From: Marko Galevski Date: Fri, 11 Oct 2024 13:28:00 +0200 Subject: [PATCH 1/2] Ignore textContent links in html nodes This fixes issue #1462 by removing plaintext URI parsing in html5ever and pruning attribute-less URIs in html5gum --- lychee-lib/src/collector.rs | 4 ++- lychee-lib/src/extract/html/html5ever.rs | 23 +++++++++++-- lychee-lib/src/extract/html/html5gum.rs | 41 +++++++++++++++++------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 40c24ce542..406eb23255 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -144,7 +144,9 @@ mod tests { // Helper function to run the collector on the given inputs async fn collect(inputs: Vec, base: Option) -> HashSet { - let responses = Collector::new(base).collect_links(inputs); + let responses = Collector::new(base) + .include_verbatim(true) + .collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await } diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index d10ac4e00e..9bc0214767 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -28,9 +28,11 @@ impl TokenSink for LinkExtractor { if self.current_verbatim_element_name.borrow().is_some() { return TokenSinkResult::Continue; } - self.links - .borrow_mut() - .extend(extract_raw_uri_from_plaintext(&raw)); + if self.include_verbatim { + self.links + .borrow_mut() + .extend(extract_raw_uri_from_plaintext(&raw)); + } } Token::TagToken(tag) => { let Tag { @@ -413,4 +415,19 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_ignore_text_content_links() { + let input = r#" + https://ignoreme.com + "#; + let expected = vec![RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: 
Some("href".to_string()), + }]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index 276d2e0e86..3fa80aa155 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -4,6 +4,15 @@ use std::collections::{HashMap, HashSet}; use super::{is_email_link, is_verbatim_elem, srcset}; use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri}; +#[derive(Clone, Default, Debug)] +struct Element { + /// Current element name being processed. + /// This is called a tag in html5gum. + name: String, + /// Whether the current element is a closing tag. + is_closing: bool, +} + /// Extract links from HTML documents. /// /// This is the main driver for the html5gum tokenizer. @@ -16,7 +25,7 @@ use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw: /// /// The `links` vector contains all links extracted from the HTML document and /// the `fragments` set contains all fragments extracted from the HTML document. -#[derive(Clone, Default)] +#[derive(Clone, Default, Debug)] struct LinkExtractor { /// Links extracted from the HTML document. links: Vec, @@ -39,15 +48,6 @@ struct LinkExtractor { verbatim_stack: Vec, } -#[derive(Clone, Default)] -struct Element { - /// Current element name being processed. - /// This is called a tag in html5gum. - name: String, - /// Whether the current element is a closing tag. - is_closing: bool, -} - impl LinkExtractor { /// Create a new `LinkExtractor`. 
/// @@ -325,7 +325,11 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { let mut extractor = LinkExtractor::new(include_verbatim); let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible(); assert!(tokenizer.next().is_none()); - extractor.links + extractor + .links + .into_iter() + .filter(|link| link.attribute.is_some() || include_verbatim) + .collect() } /// Extract fragments from id attributes within a HTML string. @@ -607,4 +611,19 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_ignore_text_content_links() { + let input = r#" + https://ignoreme.com + "#; + let expected = vec![RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } From eae951bad8419123ebfdeb80f5eef4d4daf05302 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 00:18:15 +0200 Subject: [PATCH 2/2] formatting --- lychee-lib/src/collector.rs | 2 ++ lychee-lib/src/extract/html/html5gum.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index c4a90ca6e3..341bb21dba 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -147,6 +147,8 @@ mod tests { let responses = Collector::new(base).collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await } + + // Helper function for collecting verbatim links async fn collect_verbatim(inputs: Vec, base: Option) -> HashSet { let responses = Collector::new(base) .include_verbatim(true) diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index bd23417a2f..be78a3116a 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -627,6 +627,7 @@ mod tests { let uris = extract_html(input, false); assert_eq!(uris, expected); } + 
#[test] fn test_skip_dns_prefetch() { let input = r#"