Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore textContent links in html nodes #1528

Merged
merged 3 commits into from
Oct 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ mod tests {
responses.map(|r| r.unwrap().uri).collect().await
}

/// Collect links from the given inputs with verbatim extraction enabled,
/// unwrapping every response and returning the set of extracted URIs.
async fn collect_verbatim(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
    Collector::new(base)
        .include_verbatim(true)
        .collect_links(inputs)
        .map(|response| response.unwrap().uri)
        .collect()
        .await
}

const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
Expand Down Expand Up @@ -233,7 +241,7 @@ mod tests {
},
];

let links = collect(inputs, None).await;
let links = collect_verbatim(inputs, None).await;

let expected_links = HashSet::from_iter([
website(TEST_STRING),
Expand Down
23 changes: 20 additions & 3 deletions lychee-lib/src/extract/html/html5ever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ impl TokenSink for LinkExtractor {
if self.current_verbatim_element_name.borrow().is_some() {
return TokenSinkResult::Continue;
}
self.links
.borrow_mut()
.extend(extract_raw_uri_from_plaintext(&raw));
if self.include_verbatim {
self.links
.borrow_mut()
.extend(extract_raw_uri_from_plaintext(&raw));
}
}
Token::TagToken(tag) => {
let Tag {
Expand Down Expand Up @@ -414,6 +416,21 @@ mod tests {
assert!(uris.is_empty());
}

#[test]
fn test_ignore_text_content_links() {
    // With verbatim extraction disabled, the URL appearing as the anchor's
    // text content must be skipped; only the `href` attribute is extracted.
    let input = r#"
<a href="https://example.com">https://ignoreme.com</a>
"#;

    let uris = extract_html(input, false);

    assert_eq!(
        uris,
        vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
        }]
    );
}

#[test]
fn test_skip_dns_prefetch() {
let input = r#"
Expand Down
41 changes: 30 additions & 11 deletions lychee-lib/src/extract/html/html5gum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ use std::collections::{HashMap, HashSet};
use super::{is_email_link, is_verbatim_elem, srcset};
use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri};

/// A single HTML element as seen by the tokenizer, tracking the tag name
/// and whether the token being processed is a closing (end) tag.
#[derive(Clone, Default, Debug)]
struct Element {
    /// Current element name being processed.
    /// This is called a tag in html5gum.
    name: String,
    /// Whether the current element is a closing tag.
    is_closing: bool,
}

/// Extract links from HTML documents.
///
/// This is the main driver for the html5gum tokenizer.
Expand All @@ -16,7 +25,7 @@ use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw:
///
/// The `links` vector contains all links extracted from the HTML document and
/// the `fragments` set contains all fragments extracted from the HTML document.
#[derive(Clone, Default)]
#[derive(Clone, Default, Debug)]
struct LinkExtractor {
/// Links extracted from the HTML document.
links: Vec<RawUri>,
Expand All @@ -39,15 +48,6 @@ struct LinkExtractor {
verbatim_stack: Vec<String>,
}

#[derive(Clone, Default)]
struct Element {
/// Current element name being processed.
/// This is called a tag in html5gum.
name: String,
/// Whether the current element is a closing tag.
is_closing: bool,
}

impl LinkExtractor {
/// Create a new `LinkExtractor`.
///
Expand Down Expand Up @@ -326,7 +326,11 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new(include_verbatim);
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
assert!(tokenizer.next().is_none());
extractor.links
extractor
.links
.into_iter()
.filter(|link| link.attribute.is_some() || include_verbatim)
.collect()
}

/// Extract fragments from id attributes within a HTML string.
Expand Down Expand Up @@ -609,6 +613,21 @@ mod tests {
assert!(uris.is_empty());
}

#[test]
fn test_ignore_text_content_links() {
    // A URL that only occurs as an anchor's text content is ignored when
    // `include_verbatim` is false; the `href` link is still reported.
    let html = r#"
<a href="https://example.com">https://ignoreme.com</a>
"#;
    let expected = vec![RawUri {
        text: "https://example.com".to_string(),
        element: Some("a".to_string()),
        attribute: Some("href".to_string()),
    }];

    assert_eq!(extract_html(html, false), expected);
}

#[test]
fn test_skip_dns_prefetch() {
let input = r#"
Expand Down
Loading