diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e950b3..557c834 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,9 +27,11 @@ * If a URL points to a directory, check if index.html exists in that directory. [PR#90] * Treat absolute paths as absolute with respect to the `base_url`, not with respect to the file system. [PR#91] +* Check link fragments, with special handling for Rustdoc ranged fragments to highlight source code lines [PR#94] [PR#90]: https://github.com/deadlinks/cargo-deadlinks/pull/90 [PR#91]: https://github.com/deadlinks/cargo-deadlinks/pull/91 +[PR#94]: https://github.com/deadlinks/cargo-deadlinks/pull/94 #### Fixes diff --git a/Cargo.lock b/Cargo.lock index 93afec3..a5e4bde 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,6 +9,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "assert-json-diff" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4259cbe96513d2f1073027a259fc2ca917feb3026a5a8d984e3628e490255cc0" +dependencies = [ + "extend", + "serde", + "serde_json", +] + [[package]] name = "assert_cmd" version = "1.0.1" @@ -75,19 +86,32 @@ version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +[[package]] +name = "cached" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c20611fb80d002056306ce7ec754bb7485caa30ac2fbe25341ef470ce3c76aa" +dependencies = [ + "once_cell", +] + [[package]] name = "cargo-deadlinks" version = "0.5.0" dependencies = [ "assert_cmd", + "cached", "cargo_metadata", "docopt", "env_logger", "log", "lol_html", + "mockito", "num_cpus", + "once_cell", "predicates", "rayon", + "regex", "serde", "serde_derive", "serde_json", @@ -141,6 +165,17 @@ dependencies = [ "bitflags", ] +[[package]] +name = "colored" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f4ffc801dacf156c5854b9df4f425a626539c3a6ef7893cc0c5084a23f0b6c59" +dependencies = [ + "atty", + "lazy_static", + "winapi", +] + [[package]] name = "const_fn" version = "0.4.2" @@ -293,6 +328,18 @@ dependencies = [ "termcolor", ] +[[package]] +name = "extend" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f47da3a72ec598d9c8937a7ebca8962a5c7a1f28444e38c2b33c771ba3f55f05" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "float-cmp" version = "0.8.0" @@ -317,6 +364,17 @@ dependencies = [ "byteorder", ] +[[package]] +name = "getrandom" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "wasi", +] + [[package]] name = "hermit-abi" version = "0.1.17" @@ -326,6 +384,12 @@ dependencies = [ "libc", ] +[[package]] +name = "httparse" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9" + [[package]] name = "humantime" version = "2.0.1" @@ -430,6 +494,24 @@ dependencies = [ "autocfg 1.0.1", ] +[[package]] +name = "mockito" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36a0eb7e686b49b02c1cb87c14b8e2a05de0d36c6eee0293653d0a875906d499" +dependencies = [ + "assert-json-diff", + "colored", + "difference", + "httparse", + "lazy_static", + "log", + "rand 0.7.3", + "regex", + "serde_json", + "serde_urlencoded", +] + [[package]] name = "nodrop" version = "0.1.14" @@ -463,9 +545,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.4.1" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "260e51e7efe62b592207e9e13a68e43692a7a279171d6ba57abd208bf23645ad" +checksum = 
"f53cef67919d7d247eb9a2f128ca9e522789967ef1eb4ccd8c71a95a8aedf596" [[package]] name = "percent-encoding" @@ -499,7 +581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" dependencies = [ "phf_shared", - "rand", + "rand 0.6.5", ] [[package]] @@ -511,6 +593,12 @@ dependencies = [ "siphasher", ] +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + [[package]] name = "precomputed-hash" version = "0.1.1" @@ -546,6 +634,30 @@ dependencies = [ "treeline", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro2" version = "1.0.24" @@ -587,9 +699,9 @@ checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" dependencies = [ "autocfg 0.1.7", "libc", - "rand_chacha", + "rand_chacha 0.1.1", "rand_core 0.4.2", - "rand_hc", + "rand_hc 0.1.0", "rand_isaac", "rand_jitter", "rand_os", @@ -598,6 +710,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + [[package]] name = "rand_chacha" 
version = "0.1.1" @@ -608,6 +733,16 @@ dependencies = [ "rand_core 0.3.1", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_core" version = "0.3.1" @@ -623,6 +758,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + [[package]] name = "rand_hc" version = "0.1.0" @@ -632,6 +776,15 @@ dependencies = [ "rand_core 0.3.1", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_isaac" version = "0.1.1" @@ -868,6 +1021,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" +dependencies = [ + "dtoa", + "itoa", + "serde", + "url", +] + [[package]] name = "servo_arc" version = "0.1.1" @@ -1036,6 +1201,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -1056,6 +1227,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = 
"0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasm-bindgen" version = "0.2.68" diff --git a/Cargo.toml b/Cargo.toml index 73cfc17..b9b21be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ cargo = ["cargo_metadata", "serde_json"] default = ["cargo"] [dependencies] +cached = { version = "0.20.0", default-features = false } cargo_metadata = { version = "0.9", optional = true } serde_json = { version = "1.0.34", optional = true } docopt = "1" @@ -29,7 +30,9 @@ env_logger = "0.8" lol_html = "0.2" log = "0.4" num_cpus = "1.8" +once_cell = "1.5.1" rayon = "1.0" +regex = { version = "1", default-features = false, features = ["std", "perf"] } ureq = { version = "1.5.2", features = ["tls"], default-features = false } serde = "1.0" serde_derive = "1.0" @@ -39,3 +42,4 @@ walkdir = "2.1" [dev-dependencies] assert_cmd = "1.0" predicates = "1.0.0" +mockito = "0.28.0" diff --git a/src/check.rs b/src/check.rs index 6e6eeff..68292fa 100644 --- a/src/check.rs +++ b/src/check.rs @@ -1,37 +1,86 @@ //! Provides functionality for checking the availablility of URLs. 
-use std::{fmt, path::PathBuf}; +use std::collections::HashSet; +use std::fmt; +use std::fs::read_to_string; +use std::path::{Path, PathBuf}; use log::debug; +use once_cell::sync::Lazy; +use regex::Regex; use url::Url; +#[cfg(not(test))] +use {cached::cached_key_result, cached::SizedCache}; + use super::CheckContext; +use crate::parse::parse_fragments; + const PREFIX_BLACKLIST: [&str; 1] = ["https://doc.rust-lang.org"]; #[derive(Debug)] -pub enum HttpError { - UnexpectedStatus(Url, ureq::Response), - Fetch(Url, ureq::Error), +pub enum IoError { + HttpUnexpectedStatus(Url, ureq::Response), + HttpFetch(Url, ureq::Error), + FileIo(String, std::io::Error), } -impl fmt::Display for HttpError { +impl fmt::Display for IoError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - HttpError::UnexpectedStatus(url, resp) => write!( + IoError::HttpUnexpectedStatus(url, resp) => write!( f, "Unexpected HTTP status fetching {}: {}", url, resp.status_text() ), - HttpError::Fetch(url, e) => write!(f, "Error fetching {}: {}", url, e), + IoError::HttpFetch(url, e) => write!(f, "Error fetching {}: {}", url, e), + IoError::FileIo(url, e) => write!(f, "Error fetching {}: {}", url, e), + } + } +} + +#[derive(Debug, Clone)] +pub enum Link { + File(String), + Http(Url), +} + +impl fmt::Display for Link { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Link::File(path) => f.write_str(path), + Link::Http(url) => f.write_str(url.as_str()), + } + } +} + +#[cfg(not(test))] +impl Link { + /// Removes the fragment + fn without_fragment(&self) -> Link { + match self { + Link::Http(url) => { + let mut url = url.clone(); + url.set_fragment(None); + + Link::Http(url) + } + _ => self.clone(), } } } #[derive(Debug)] pub enum CheckError { + /// A relatively linked file did not exist File(PathBuf), - Http(Box), + /// A linked HTTP URL did not exist + Http(Url), + /// The linked file existed, but was missing the linked HTML anchor + Fragment(Link, String, 
Option<Vec<String>>), + /// An error occurred while trying to find whether the file or URL existed + Io(Box<IoError>), } impl fmt::Display for CheckError { @@ -40,7 +89,19 @@ impl fmt::Display for CheckError { CheckError::File(path) => { write!(f, "Linked file at path {} does not exist!", path.display()) } - CheckError::Http(err) => err.fmt(f), + CheckError::Http(url) => write!(f, "Linked URL {} does not exist!", url), + CheckError::Fragment(link, fragment, missing_parts) => match missing_parts { + Some(missing_parts) => write!( + f, + "Fragments #{} as expected by ranged fragment #{} at {} do not exist!\n\ + This is likely a bug in rustdoc itself.", + missing_parts.join(", #"), + fragment, + link + ), + None => write!(f, "Fragment #{} at {} does not exist!", fragment, link), + }, + CheckError::Io(err) => err.fmt(f), } } } @@ -61,16 +122,147 @@ pub fn is_available(url: &Url, ctx: &CheckContext) -> Result<(), CheckError> { } } +#[cfg(not(test))] +cached_key_result! { + CHECK_FILE: SizedCache<String, HashSet<String>> = SizedCache::with_size(100); + Key = { link.without_fragment().to_string() }; + // `fetch_html` is different depending on whether the link is being + // loaded from disk or from the network.
+ fn fragments_from( + link: &Link, + fetch_html: impl Fn() -> Result<String, CheckError> + ) -> Result<HashSet<String>, CheckError> = { + fragments_from_inner(fetch_html) + } +} + +#[cfg(test)] +fn fragments_from( + _link: &Link, + fetch_html: impl Fn() -> Result<String, CheckError>, +) -> Result<HashSet<String>, CheckError> { + fragments_from_inner(fetch_html) +} + +fn fragments_from_inner( + fetch_html: impl Fn() -> Result<String, CheckError>, +) -> Result<HashSet<String>, CheckError> { + fetch_html().map(|html| parse_fragments(&html)) +} + +fn is_fragment_available( + link: &Link, + fragment: &str, + fetch_html: impl Fn() -> Result<String, CheckError>, +) -> Result<(), CheckError> { + let fragments = fragments_from(link, fetch_html)?; + + if fragments.contains(fragment) { + return Ok(()); + } + + // Rust documentation uses `#n-m` fragments and JavaScript to highlight + // a range of lines in HTML of source code, an element with `id` + // attribute of (literal) "#n-m" will not exist, but elements with + // `id`s n through m should, this parses the ranged n-m anchor and + // checks if elements with `id`s n through m do exist + static RUST_LINE_HIGLIGHT_RX: Lazy<Regex> = + Lazy::new(|| Regex::new(r#"^(?P<start>[0-9]+)-(?P<end>[0-9]+)$"#).unwrap()); + match RUST_LINE_HIGLIGHT_RX.captures(fragment) { + Some(capture) => match (capture.name("start"), capture.name("end")) { + (Some(start_str), Some(end_str)) => { + // NOTE: assumes there are less than 2.pow(32) lines in a source file + let start = start_str.as_str().parse::<u32>().unwrap(); + let end = end_str.as_str().parse::<u32>().unwrap(); + let missing = (start..=end) + .map(|i| i.to_string()) + .filter(|i| !fragments.contains(i)) + .collect::<Vec<_>>(); + if !missing.is_empty() { + Err(CheckError::Fragment( + link.clone(), + fragment.to_string(), + Some(missing), + )) + } else { + Ok(()) + } + } + _ => unreachable!("if the regex matches, it should have capture groups"), + }, + None => Err(CheckError::Fragment( + link.clone(), + fragment.to_string(), + None, + )), + } +} + /// Check a URL with the "file" scheme for availability. Returns `false` if it is unavailable.
fn check_file_url(url: &Url, _ctx: &CheckContext) -> Result<(), CheckError> { let path = url.to_file_path().unwrap(); - if path.is_file() || path.join("index.html").is_file() { - debug!("Linked file at path {} does exist.", path.display()); - Ok(()) + // determine the full path by looking if the path points to a directory, + // and if so append `index.html`, this is needed as we'll try to read + // the file, so `expanded_path` should point to a file not a directory + let index_html; + let expanded_path = if path.is_file() { + &path + } else if path.is_dir() && path.join("index.html").is_file() { + index_html = path.join("index.html"); + &index_html } else { debug!("Linked file at path {} does not exist!", path.display()); - Err(CheckError::File(path)) + return Err(CheckError::File(path)); + }; + + // The URL might contain a fragment. In that case we need a full GET + // request to check if the fragment exists. + match url.fragment() { + Some(fragment) => check_file_fragment(&path, expanded_path, fragment), + None => Ok(()), + } +} + +fn check_file_fragment( + path: &Path, + expanded_path: &Path, + fragment: &str, +) -> Result<(), CheckError> { + debug!( + "Checking fragment {} of file {}.", + fragment, + expanded_path.display() + ); + let fetch_html = || { + read_to_string(expanded_path).map_err(|err| { + CheckError::Io(Box::new(IoError::FileIo( + expanded_path.to_string_lossy().to_string(), + err, + ))) + }) + }; + + is_fragment_available( + &Link::File(path.to_str().unwrap().to_string()), + fragment, + fetch_html, + ) +} + +fn handle_response(url: &Url, resp: ureq::Response) -> Result { + if resp.synthetic() { + Err(CheckError::Io(Box::new(IoError::HttpFetch( + url.clone(), + resp.into_synthetic_error().unwrap(), + )))) + } else if resp.ok() { + Ok(resp) + } else { + Err(CheckError::Io(Box::new(IoError::HttpUnexpectedStatus( + url.clone(), + resp, + )))) } } @@ -94,42 +286,234 @@ fn check_http_url(url: &Url, ctx: &CheckContext) -> Result<(), CheckError> { } } - 
let resp = ureq::head(url.as_str()).call(); - if resp.synthetic() { - Err(CheckError::Http(Box::new(HttpError::Fetch( - url.clone(), - resp.into_synthetic_error().unwrap(), - )))) - } else if resp.ok() { - Ok(()) + // The URL might contain a fragment. In that case we need a full GET + // request to check if the fragment exists. + if url.fragment().is_none() { + let resp = ureq::head(url.as_str()).call(); + + handle_response(url, resp).map(|_: ureq::Response| ()) } else { - Err(CheckError::Http(Box::new(HttpError::UnexpectedStatus( - url.clone(), - resp, - )))) + // the URL might contain a fragment, in that case we need to check if + // the fragment exists, this issues a GET request + check_http_fragment(url, url.fragment().unwrap()) } } +fn check_http_fragment(url: &Url, fragment: &str) -> Result<(), CheckError> { + debug!("Checking fragment {} of URL {}.", fragment, url.as_str()); + + let fetch_html = || { + let resp = ureq::get(url.as_str()).call(); + handle_response(&url, resp).map(|resp| resp.into_string().unwrap()) + }; + + is_fragment_available(&Link::Http(url.clone()), fragment, fetch_html) +} + #[cfg(test)] mod test { - use super::{check_file_url, CheckContext}; + use super::{check_file_url, is_available, CheckContext, CheckError, Link}; + use mockito::{self, mock}; use std::env; use url::Url; - fn test_check_file_url(path: &str) { + fn url_for(path: &str) -> Url { let cwd = env::current_dir().unwrap(); - let url = Url::from_file_path(cwd.join(path)).unwrap(); + let mut parts = path.split('#'); + let file_path = parts.next().unwrap(); + + let mut url = if file_path.ends_with("/") { + Url::from_directory_path(cwd.join(file_path)) + } else { + Url::from_file_path(cwd.join(file_path)) + } + .unwrap(); + + url.set_fragment(parts.next()); + assert_eq!(parts.count(), 0); // make sure the anchor was valid, not `a.html#x#y` + + url + } - check_file_url(&url, &CheckContext::default()).unwrap(); + fn test_check_file_url(path: &str) -> Result<(), CheckError> { + 
check_file_url( + &url_for(path), + &CheckContext { + verbose: false, + check_http: false, + }, + ) } #[test] fn test_file_path() { - test_check_file_url("tests/html/index.html"); + test_check_file_url("tests/html/index.html").unwrap(); } #[test] fn test_directory_path() { - test_check_file_url("tests/html/"); + test_check_file_url("tests/html/").unwrap(); + } + + #[test] + fn test_anchors() { + test_check_file_url("tests/html/anchors.html#h1").unwrap(); + } + + #[test] + fn test_missing_anchors() { + match test_check_file_url("tests/html/anchors.html#nonexistent") { + Err(CheckError::Fragment(Link::File(path), fragment, None)) => { + assert!(path.ends_with("tests/html/anchors.html")); + assert_eq!("nonexistent", fragment); + } + x => panic!( + "Expected to report missing anchor (Err(CheckError::FileAnchor)), got {:?}", + x + ), + } + } + + #[test] + fn test_range_anchor() { + test_check_file_url("tests/html/range.html#2-4").unwrap(); + } + + #[test] + fn test_missing_range_anchor() { + match test_check_file_url("tests/html/range.html#4-6") { + Err(CheckError::Fragment(Link::File(path), fragment, Some(missing_parts))) => { + assert!(path.ends_with("tests/html/range.html")); + assert_eq!("4-6", fragment); + assert_eq!(missing_parts.len(), 1); + assert!(missing_parts.contains(&"6".to_string())); + } + x => panic!( + "Expected to report missing anchor (Err(CheckError::FileAnchorRange)), got {:?}", + x + ), + } + } + + #[test] + fn test_is_available_file_path() { + is_available( + &url_for("tests/html/index.html#i1"), + &CheckContext { + verbose: false, + check_http: false, + }, + ) + .unwrap(); + } + + #[test] + fn test_is_available_directory_path() { + is_available( + &url_for("tests/html/#i1"), + &CheckContext { + verbose: false, + check_http: false, + }, + ) + .unwrap(); + } + + #[test] + fn test_missing_dir_index_fragment() { + match is_available( + &url_for("tests/html/missing_index/#i1"), + &CheckContext { + verbose: false, + check_http: false, + }, + ) { + 
Err(CheckError::File(path)) => assert!(path.ends_with("tests/html/missing_index")), + x => panic!( + "Expected to report missing anchor (Err(CheckError::File)), got {:?}", + x + ), + } + } + + #[test] + fn test_http_check() { + let root = mock("HEAD", "/").with_status(200).create(); + + let mut url = mockito::server_url(); + url.push_str("/"); + + is_available( + &Url::parse(&url).unwrap(), + &CheckContext { + verbose: false, + check_http: true, + }, + ) + .unwrap(); + + root.assert(); + } + + #[test] + fn test_http_check_fragment() { + let root = mock("GET", "/") + .with_status(200) + .with_header("content-type", "text/html") + .with_body( + r#" + + + "#, + ) + .create(); + + let mut url = mockito::server_url(); + url.push_str("/#r1"); + + is_available( + &Url::parse(&url).unwrap(), + &CheckContext { + verbose: false, + check_http: true, + }, + ) + .unwrap(); + + root.assert(); + } + + #[test] + fn test_missing_http_fragment() { + let root = mock("GET", "/") + .with_status(200) + .with_header("content-type", "text/html") + .with_body( + r#" + "#, + ) + .create(); + + let mut url = mockito::server_url(); + url.push_str("/#missing"); + + match is_available( + &Url::parse(&url).unwrap(), + &CheckContext { + verbose: false, + check_http: true, + }, + ) { + Err(CheckError::Fragment(Link::Http(url), fragment, None)) => { + assert_eq!("http://127.0.0.1:1234/#missing", url.to_string()); + assert_eq!("missing", fragment); + } + x => panic!( + "Expected to report missing anchor (Err(CheckError::File)), got {:?}", + x + ), + } + + root.assert(); } } diff --git a/src/lib.rs b/src/lib.rs index a9e621b..3df205f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ use walkdir::{DirEntry, WalkDir}; use check::is_available; use parse::parse_html_file; -pub use check::{CheckError, HttpError}; +pub use check::{CheckError, IoError}; mod check; mod parse; @@ -70,13 +70,18 @@ impl FileError { use CheckError::*; match e { - Http(_) => ret.push_str(&format!("\n\t{}", e)), 
File(epath) => { let epath = epath.strip_prefix(&prefix).unwrap_or(&epath); ret.push_str("\n\tLinked file at path "); ret.push_str(&epath.display().to_string()); ret.push_str(" does not exist!"); } + Http(_) => ret.push_str(&format!("\n\t{}", e)), + Fragment(_, _, _) => { + ret.push_str("\n\t"); + ret.push_str(e.to_string().as_str()); + } + Io(_) => ret.push_str(&format!("\n\t{}", e)), } } ret diff --git a/src/parse.rs b/src/parse.rs index d40474d..7774715 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -51,9 +51,28 @@ fn parse_a_hrefs(html: &str, root_url: Url, base_url: Url) -> HashSet { urls } +/// Parses the given string as HTML and returns values of all element's id attributes +pub(crate) fn parse_fragments(html: &str) -> HashSet { + let mut fragments = HashSet::new(); + lol_html::rewrite_str( + html, + RewriteStrSettings { + element_content_handlers: vec![element!("*[id]", |el| { + let id = el.get_attribute("id").unwrap(); + fragments.insert(id); + Ok(()) + })], + ..RewriteStrSettings::default() + }, + ) + .expect("html rewriting failed"); + + fragments +} + #[cfg(test)] mod test { - use super::parse_a_hrefs; + use super::{parse_a_hrefs, parse_fragments}; use url::Url; #[test] @@ -99,4 +118,21 @@ mod test { assert!(urls.contains(&Url::from_file_path("/root/b/c.html").unwrap())); assert!(urls.contains(&Url::from_file_path("/root/d.html").unwrap())); } + + #[test] + fn test_parse_fragments() { + let html = r#" + + + + a +

h1

+ + "#; + + let fragments = parse_fragments(html); + + assert!(fragments.contains("a")); + assert!(fragments.contains("h1")); + } } diff --git a/tests/html/anchors.html b/tests/html/anchors.html new file mode 100644 index 0000000..fb63d2a --- /dev/null +++ b/tests/html/anchors.html @@ -0,0 +1,10 @@ + + + + Test HTML file + + +

h1

+ Go to h1 + + diff --git a/tests/html/index.html b/tests/html/index.html index 50af157..7ce63f9 100644 --- a/tests/html/index.html +++ b/tests/html/index.html @@ -3,7 +3,8 @@ Test HTML file - +

Hi there

+ to anchors h1 diff --git a/tests/html/missing_index/.gitkeep b/tests/html/missing_index/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/html/range.html b/tests/html/range.html new file mode 100644 index 0000000..ef7a2c1 --- /dev/null +++ b/tests/html/range.html @@ -0,0 +1,15 @@ + + + + Test HTML file + + +
    +
  1. 1
  2. +
  3. 2
  4. +
  5. 3
  6. +
  7. 4
  8. +
  9. 5
  10. +
+ +