From 0959c9cc5d3573c7807e4dc9ede3616a0a4381fb Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Tue, 29 Oct 2024 04:14:17 +0100 Subject: [PATCH] add callbacks emitter and update readme (#91) --- .github/dependabot.yml | 6 + Cargo.toml | 6 + README.md | 32 +- examples/build_tree.rs | 4 +- examples/callback_emitter.rs | 53 +++ examples/scraper.rs | 4 +- fuzz/Cargo.lock | 569 +++++++++++++----------------- fuzz/Cargo.toml | 14 +- fuzz/README.md | 1 + fuzz/src/testcase/html5ever.rs | 8 +- fuzz/src/testcase/old_html5gum.rs | 24 +- src/callbacks.rs | 497 ++++++++++++++++++++++++++ src/default_emitter.rs | 266 ++++++++++++++ src/emitter.rs | 446 ----------------------- src/html5ever_emitter.rs | 272 +++++++------- src/htmlstring.rs | 130 +++++++ src/lib.rs | 9 +- src/tokenizer.rs | 29 +- tests/html5lib_tree_builder.rs | 6 +- 19 files changed, 1428 insertions(+), 948 deletions(-) create mode 100644 examples/callback_emitter.rs create mode 100644 src/callbacks.rs create mode 100644 src/default_emitter.rs create mode 100644 src/htmlstring.rs diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 093f76c..4508697 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,6 +10,12 @@ updates: schedule: interval: daily open-pull-requests-limit: 10 + +- package-ecosystem: cargo + directory: "/fuzz" + schedule: + interval: daily + open-pull-requests-limit: 10 - package-ecosystem: gitsubmodule directory: "/" diff --git a/Cargo.toml b/Cargo.toml index 279eea5..414bb21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -59,6 +59,12 @@ harness = false name = "build_tree" required-features = ["tree-builder"] +[[example]] +name = "custom_emitter" + +[[example]] +name = "callback_emitter" + [[example]] name = "scraper" required-features = ["tree-builder"] diff --git a/README.md b/README.md index 4890712..9c7562e 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,13 @@ for token in Tokenizer::new(html).infallible() { assert_eq!(new_html, "hello world"); ``` 
+`html5gum` provides multiple kinds of APIs: + +* Iterating over tokens as shown above. +* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example](examples/custom_emitter.rs). +* A callbacks-based API for a middle ground between convenience and performance, see [the `callback_emitter.rs` example](examples/callback_emitter.rs). +* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example](examples/scraper.rs). + ## What a tokenizer does and what it does not do `html5gum` fully implements [13.2.5 of the WHATWG HTML @@ -42,9 +49,6 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S gracefully from invalid UTF-8. * `html5gum` **does not** [correct mis-nested tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser) -* `html5gum` **does not** recognize implicitly self-closing elements like - ``, as a tokenizer it will simply emit a start token. It does however - emit a self-closing tag for ``. * `html5gum` doesn't implement the DOM, and unfortunately in the HTML spec, constructing the DOM ("tree construction") influences how tokenization is done. For an example of which problems this causes see [this example @@ -54,23 +58,9 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S 21](https://github.com/untitaker/html5gum/issues/21). With those caveats in mind, `html5gum` can pretty much ~parse~ _tokenize_ -anything that browsers can. - -## The `Emitter` trait - -A distinguishing feature of `html5gum` is that you can bring your own token -datastructure and hook into token creation by implementing the `Emitter` trait. -This allows you to: - -* Rewrite all per-HTML-tag allocations to use a custom allocator or datastructure. - -* Efficiently filter out uninteresting categories data without ever allocating - for it. 
For example if any plaintext between tokens is not of interest to - you, you can implement the respective trait methods as noop and therefore - avoid any overhead creating plaintext tokens. - -See [the `custom_emitter` example][examples/custom_emitter.rs] for how this -looks like in practice. +anything that browsers can. However, using the experimental `tree-builder` +feature, html5gum can be integrated with `html5ever` and `scraper`. See [the +`scraper.rs` example](examples/scraper.rs). ## Other features @@ -116,3 +106,5 @@ Licensed under the MIT license, see [`./LICENSE`][LICENSE]. [LICENSE]: ./LICENSE [examples/tokenize_with_state_switches.rs]: ./examples/tokenize_with_state_switches.rs [examples/custom_emitter.rs]: ./examples/custom_emitter.rs +[examples/callback_emitter.rs]: ./examples/callback_emitter.rs +[examples/scraper.rs]: ./examples/scraper.rs diff --git a/examples/build_tree.rs b/examples/build_tree.rs index 4a567a2..97ca5b7 100644 --- a/examples/build_tree.rs +++ b/examples/build_tree.rs @@ -2,8 +2,6 @@ /// building logic and DOM implementation. The result is a technically complete HTML5 parser. /// /// You may want to refer to `examples/scraper.rs` for better ergonomics. -use std::iter::repeat; - use html5ever::tree_builder::TreeBuilder; use html5gum::{Html5everEmitter, IoReader, Tokenizer}; use markup5ever_rcdom::{Handle, NodeData, RcDom}; @@ -11,7 +9,7 @@ use markup5ever_rcdom::{Handle, NodeData, RcDom}; fn walk(indent: usize, handle: &Handle) { let node = handle; // FIXME: don't allocate - print!("{}", repeat(" ").take(indent).collect::()); + print!("{}", " ".repeat(indent)); match node.data { NodeData::Document => println!("#Document"), diff --git a/examples/callback_emitter.rs b/examples/callback_emitter.rs new file mode 100644 index 0000000..de392d6 --- /dev/null +++ b/examples/callback_emitter.rs @@ -0,0 +1,53 @@ +//! A slightly simpler, but less performant version of the link extractor that can be found in +//! `examples/custom_emitter.rs`. 
+//! +//! ```text +//! printf '<h1>Hello world!</h1><a href="foo">bar</a>' | cargo run --example=callback_emitter +//! ``` +//! +//! Output: +//! +//! ```text +//! link: foo +//! ``` +use html5gum::callbacks::{CallbackEmitter, CallbackEvent}; +use html5gum::{Emitter, IoReader, Tokenizer}; + +fn get_emitter() -> impl Emitter { + let mut is_anchor_tag = false; + let mut is_href_attr = false; + + CallbackEmitter::new(move |event: CallbackEvent<'_>| match event { + CallbackEvent::OpenStartTag { name } => { + is_anchor_tag = name == b"a"; + is_href_attr = false; + None + } + CallbackEvent::AttributeName { name } => { + is_href_attr = name == b"href"; + None + } + CallbackEvent::AttributeValue { value } if is_anchor_tag && is_href_attr => { + Some(String::from_utf8_lossy(value).into_owned()) + } + _ => None, + }) +} + +fn main() { + for token in + Tokenizer::new_with_emitter(IoReader::new(std::io::stdin().lock()), get_emitter()).flatten() + { + println!("link: {}", token); + } +} + +#[test] +fn basic() { + let tokens: Vec<_> = + Tokenizer::new_with_emitter("

Hello world

bar", get_emitter()) + .flatten() + .collect(); + + assert_eq!(tokens, vec!["foo".to_owned()]); +} diff --git a/examples/scraper.rs b/examples/scraper.rs index d6b476f..fd3a04d 100644 --- a/examples/scraper.rs +++ b/examples/scraper.rs @@ -6,7 +6,9 @@ /// echo '

Hello

' | cargo run --all-features --example scraper /// ``` /// -/// Essentially, your HTML parsing will be powered by a combination of html5gum and html5ever. +/// Essentially, your HTML parsing will be powered by a combination of html5gum and html5ever. This +/// has no immediate benefit over using scraper normally and is mostly done as a transitionary step +/// until html5gum has its own implementation of tree building and the DOM. /// /// Requires the tree-builder feature. use std::io::{stdin, Read}; diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 8bb0df3..b4743d2 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -14,12 +14,11 @@ dependencies = [ [[package]] name = "afl" -version = "0.11.1" +version = "0.15.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c0f4180b6d7095a1c3130f1aadead4ece7fb3094dc724c701adb30a92a95228" +checksum = "80bb240a3b9ff18002142c1a736e98046461d51a694d687c3e7329b456ab0fe4" dependencies = [ - "cc", - "clap", + "home", "libc", "rustc_version", "xdg", @@ -27,75 +26,54 @@ dependencies = [ [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", - "getrandom 0.2.10", "once_cell", "version_check", + "zerocopy", ] [[package]] name = "aho-corasick" -version = "1.0.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] -[[package]] -name = "ansi_term" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" -dependencies = [ - 
"winapi", -] - [[package]] name = "arbitrary" -version = "1.3.0" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d098ff73c1ca148721f37baad5ea6a465a13f9573aba8641fbbbae8164a54e" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" [[package]] name = "ast_node" -version = "0.9.5" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c09c69dffe06d222d072c878c3afe86eee2179806f20503faec97250268b4c24" +checksum = "f9184f2b369b3e8625712493c89b785881f27eedc6cde480a81883cef78868b2" dependencies = [ - "pmutil", "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.26", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", + "syn 2.0.85", ] [[package]] name = "autocfg" -version = "1.1.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "better_scoped_tls" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "794edcc9b3fb07bb4aecaa11f093fd45663b4feadb782d68303a2268bc2701de" +checksum = "297b153aa5e573b5863108a6ddc9d5c968bd0b20e75cc614ee9821d2f45679c7" dependencies = [ "scoped-tls", ] @@ -108,29 +86,31 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.3.3" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = 
"byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.4.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "cc" -version = "1.0.79" +version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", + "libc", + "shlex", ] [[package]] @@ -139,21 +119,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "ansi_term", - "atty", - "bitflags 1.3.2", - "strsim", - "textwrap", - "unicode-width", - "vec_map", -] - [[package]] name = "convert_case" version = "0.4.0" @@ -184,20 +149,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "derive_more" -version = "0.99.17" +version = "0.99.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" 
dependencies = [ "convert_case", "proc-macro2", "quote", "rustc_version", - "syn 1.0.109", + "syn 2.0.85", ] [[package]] @@ -214,47 +179,46 @@ checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" [[package]] name = "dtoa-short" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" dependencies = [ "dtoa", ] [[package]] name = "either" -version = "1.9.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "encoding_rs" -version = "0.8.32" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] [[package]] name = "from_variant" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ec5dc38ee19078d84a692b1c41181ff9f94331c76cee66ff0208c770b5e54f" +checksum = "32016f1242eb82af5474752d00fd8ebcd9004bd69b462b1c91de833972d08ed4" dependencies = [ - "pmutil", "proc-macro2", "swc_macros_common", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] @@ -289,9 +253,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = 
"0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,49 +271,41 @@ dependencies = [ "ahash", ] -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "home" -version = "0.5.5" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" dependencies = [ "windows-sys", ] [[package]] name = "html5ever" -version = "0.26.0" -source = "git+https://github.com/untitaker/html5ever?branch=html5lib-tests-update#2a5a62647c079e39a3d9b1b5029a069429db049b" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" dependencies = [ "log", "mac", "markup5ever", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.85", ] [[package]] name = "html5gum" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dad48b66db55322add2819ae1d7bda0c32f3415269a08330679dbc8b0afeb30" +version = "0.6.0" dependencies = [ "jetscii", ] [[package]] name = "html5gum" -version = "0.5.7" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4c69595eca3d5d246be4c0be51577b0c39c5f8c75d27a06c8a69c7b594cd37" dependencies = [ "jetscii", ] @@ -362,8 +318,8 @@ dependencies = [ "bytes", "encoding_rs", "html5ever", - "html5gum 0.4.0", - "html5gum 0.5.7", + "html5gum 0.6.0", + "html5gum 0.6.0 
(registry+https://github.com/rust-lang/crates.io-index)", "libfuzzer-sys", "lol_html", "pretty_assertions", @@ -374,9 +330,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -384,15 +340,14 @@ dependencies = [ [[package]] name = "is-macro" -version = "0.3.0" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4467ed1321b310c2625c5aa6c1b1ffc5de4d9e42668cf697a08fb033ee8265e" +checksum = "2069faacbe981460232f880d26bf3c7634e322d49053aa48c27e3ae642f728f1" dependencies = [ "Inflector", - "pmutil", "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] @@ -403,9 +358,9 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jetscii" @@ -415,18 +370,18 @@ checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = 
"bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lazycell" @@ -436,15 +391,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libfuzzer-sys" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "beb09950ae85a0a94b27676cccf37da5ff13f27076aa1adbc6545dd0d0e1bd4e" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" dependencies = [ "arbitrary", "cc", @@ -453,9 +408,9 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -463,9 +418,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "lol_html" @@ -473,7 +428,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1610d7994d67a05bb35861cd733b069b1171de8693bc8452849c59361a1bb87b" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.6.0", "cfg-if", "cssparser", "encoding_rs", @@ -495,8 +450,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.11.0" -source = 
"git+https://github.com/untitaker/html5ever?branch=html5lib-tests-update#2a5a62647c079e39a3d9b1b5029a069429db049b" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" dependencies = [ "log", "phf 0.11.2", @@ -514,9 +470,9 @@ checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "mime" @@ -526,9 +482,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "new_debug_unreachable" -version = "1.0.4" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] name = "nodrop" @@ -538,45 +494,43 @@ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-traits" -version = 
"0.2.16" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -584,9 +538,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", @@ -597,9 +551,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -714,27 +668,19 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.10" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c40d25201921e5ff0c862a505c6557ea88568a4e3ace775ab55e93f2f4f9d57" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" 
[[package]] -name = "pmutil" -version = "0.6.1" +name = "ppv-lite86" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a40bc70c2c58040d2d8b167ba9a5ff59fc9dab7ad44771cfde3dcfde7a09c6" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.26", + "zerocopy", ] -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - [[package]] name = "precomputed-hash" version = "0.1.1" @@ -743,9 +689,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "pretty_assertions" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d" dependencies = [ "diff", "yansi", @@ -759,18 +705,18 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.65" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92de25114670a878b1261c79c9f8f729fb97e95bac93f6312f583c60dd6a1dfe" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.30" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5907a1b7c277254a8b15170f6e7c97cfa60ee7872a3217663bb81151e48184bb" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -835,7 +781,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.10", + "getrandom 0.2.15", ] [[package]] @@ -858,18 +804,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", ] [[package]] name = "regex" -version = "1.9.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -879,9 +825,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.3" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -890,9 +836,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustc-hash" @@ -902,18 +848,18 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = 
"cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "safemem" @@ -929,9 +875,9 @@ checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "selectors" @@ -955,37 +901,38 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.18" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.171" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e27d1e4fd7659406c492fd6cfaf2066ba8773de45ca75e855590f856dc34a9" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.171" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "serde_json" -version = "1.0.103" +version = "1.0.132" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d03b412469450d4404fe8499a268edd7f8b79fecb074b0d812ad64ca21f4031b" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ - "itoa 1.0.9", + "itoa 1.0.11", + "memchr", "ryu", "serde", ] @@ -1000,17 +947,23 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "siphasher" -version = "0.3.10" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "stable_deref_trait" @@ -1046,28 +999,21 @@ dependencies = [ [[package]] name = "string_enum" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fa4d4f81d7c05b9161f8de839975d3326328b8ba2831164b465524cc2f55252" +checksum = "05e383308aebc257e7d7920224fa055c632478d92744eca77f99be8fa1545b90" dependencies = [ - "pmutil", "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.26", + "syn 2.0.85", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "swc_atoms" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b8066e17abb484602da673e2d35138ab32ce53f26368d9c92113510e1659220b" +checksum = "9f54563d7dcba626d4acfe14ed12def7ecc28e004debe3ecd2c3ee07cc47e449" dependencies = [ "once_cell", "rustc-hash", @@ -1079,11 +1025,10 @@ dependencies = [ [[package]] name = "swc_common" -version = "0.31.18" +version = "0.31.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e30cd01afa791b15263fcfe8f77ecbbd020ddef659f0f58d3c7b794ad65c1738" +checksum = "88d00f960c667c59c133f30492f4d07f26242fcf988a066d3871e6d3d838d528" dependencies = [ - "ahash", "ast_node", "better_scoped_tls", "cfg-if", @@ -1106,21 +1051,20 @@ dependencies = [ [[package]] name = "swc_eq_ignore_macros" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a95d367e228d52484c53336991fdcf47b6b553ef835d9159db4ba40efb0ee8" +checksum = "63db0adcff29d220c3d151c5b25c0eabe7e32dd936212b84cdaa1392e3130497" dependencies = [ - "pmutil", "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "swc_html_ast" -version = "0.31.18" +version = "0.31.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0edeedae12497a0f6e5bb97d77c4bc459495fc644a1a71f34d24b8cb988a14bc" +checksum = "bb2c7b319ef5ff810dffb876f772ebb7d6aac934454998c00cd41fd525e3c8ed" dependencies = [ "is-macro", "string_enum", @@ -1130,9 +1074,9 @@ dependencies = [ [[package]] name = "swc_html_parser" -version = "0.37.21" +version = "0.37.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d449b38d99056aad88f6a7743ce65ccb0c4ab8863089b2a781262d72acb1e3dd" +checksum = "89b0e2371bc52709de74dd73f2aad2a49f9f5fbf1030351d7d5815e849a91caf" dependencies = [ "swc_atoms", "swc_common", @@ -1142,9 +1086,9 @@ dependencies = [ [[package]] name = "swc_html_utils" -version = "0.16.18" +version = "0.16.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4b703fedf29d2347ab77fe98bbf506e4a7f5a34462c7de8a966fe6ff00c79c78" +checksum = "1ac1904e208373342021df5c0ac501fdf825a93ca06cce2854f5921f1279e430" dependencies = [ "once_cell", "serde", @@ -1155,21 +1099,20 @@ dependencies = [ [[package]] name = "swc_macros_common" -version = "0.3.8" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a273205ccb09b51fabe88c49f3b34c5a4631c4c00a16ae20e03111d6a42e832" +checksum = "27e18fbfe83811ffae2bb23727e45829a0d19c6870bced7c0f545cc99ad248dd" dependencies = [ - "pmutil", "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "swc_visit" -version = "0.5.7" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c337fbb2d191bf371173dea6a957f01899adb8f189c6c31b122a6cfc98fc3" +checksum = "043d11fe683dcb934583ead49405c0896a5af5face522e4682c16971ef7871b9" dependencies = [ "either", "swc_visit_macros", @@ -1177,16 +1120,15 @@ dependencies = [ [[package]] name = "swc_visit_macros" -version = "0.5.8" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f322730fb82f3930a450ac24de8c98523af7d34ab8cb2f46bcb405839891a99" +checksum = "92807d840959f39c60ce8a774a3f83e8193c658068e6d270dbe0a05e40e90b41" dependencies = [ "Inflector", - "pmutil", "proc-macro2", "quote", "swc_macros_common", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] @@ -1202,9 +1144,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.26" +version = "2.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970" +checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" dependencies = [ "proc-macro2", "quote", @@ -1222,15 +1164,6 @@ dependencies = [ "utf-8", ] -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - [[package]] name = "thin-slice" version = "0.1.1" @@ -1239,29 +1172,29 @@ checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" [[package]] name = "thiserror" -version = "1.0.43" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a35fc5b8971143ca348fa6df4f024d4d55264f3468c71ad1c2f365b0a4d58c42" +checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.43" +version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f" +checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" dependencies = [ "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -1274,11 +1207,10 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -1286,29 +1218,29 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.26", + "syn 2.0.85", ] [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", ] [[package]] name = "triomphe" -version = "0.1.9" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee8098afad3fb0c54a9007aab6804558410503ad676d4633f9c2559a00ac0f" +checksum = "ef8f7726da4807b58ea5c96fdc122f80702030edc33b35aff9190a51148ccc85" dependencies = [ "serde", "stable_deref_trait", @@ -1316,36 +1248,36 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.1.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "url" -version = "2.4.0" +version = "2.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" dependencies = [ "form_urlencoded", "idna", @@ -1358,17 +1290,11 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" @@ -1382,46 +1308,25 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = 
"winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", + "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", @@ -1430,57 +1335,81 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "xdg" -version = "2.5.0" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213b7324336b53d2414b2db8537e56544d981803139155afa84f76eeebb7a546" + +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + +[[package]] +name = "zerocopy" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"688597db5a750e9cad4511cb94729a078e274308099a0382b5b8203bbc767fee" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ - "home", + "byteorder", + "zerocopy-derive", ] [[package]] -name = "yansi" -version = "0.5.1" +name = "zerocopy-derive" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.85", +] diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 3deb140..a32f1a4 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -8,21 +8,21 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" +afl = { version = "0.15.0", optional = true } html5gum = { path = "../" } -html5gum_old = { version = "0.4.0", package = "html5gum" } -afl = { version = "0.11.0", optional = true } -# https://github.com/servo/html5ever/pull/460 -html5ever = { version = "*", git = "https://github.com/untitaker/html5ever", branch = "html5lib-tests-update" } pretty_assertions = "1.0.0" +# thirdparty crates to fuzz against +html5gum_old = { version = "0.6.0", package = "html5gum" } +html5ever = "0.27.0" +swc_common = "0.31.18" +swc_html_parser = "0.37.21" +swc_html_ast = "0.31.18" # lol-html and its dependencies lol_html = { version = "0.4", features = ["integration_test"] } encoding_rs = "0.8" bytes = "1" -swc_common = "0.31.18" -swc_html_parser = "0.37.21" -swc_html_ast = "0.31.18" # Prevent this from interfering with workspaces diff --git a/fuzz/README.md b/fuzz/README.md index acc2108..206dc88 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -22,6 +22,7 @@ target. * `FUZZ_IGNORE_PARSE_ERRORS=order` will sort errors from both parsers such that order can be ignored. * `FUZZ_IGNORE_PARSE_ERRORS=1` will delete all errors so that parsing errors are not compared at all. 
* `FUZZ_IGNORE_PARSE_ERRORS=if-reference-contains:duplicate-attribute` will delete all errors _if_ any of them _in the old version of html5gum_ contains the string `duplicate-attribute`. + * `FUZZ_IGNORE_PARSE_ERRORS=if-testing-contains:duplicate-attribute` will delete all errors _if_ any of them _in the new version of html5gum_ contains the string `duplicate-attribute`. This envvar is a comma-separated list of instructions. For example, `FUZZ_IGNORE_PARSE_ERRORS=order,if-reference-contains:foo` means "ignore diff --git a/fuzz/src/testcase/html5ever.rs b/fuzz/src/testcase/html5ever.rs index 0c45b22..88a9bc6 100644 --- a/fuzz/src/testcase/html5ever.rs +++ b/fuzz/src/testcase/html5ever.rs @@ -64,22 +64,22 @@ impl> html5ever::tokenizer::TokenSink for T match (token, reference_token) { (Some(Token::StartTag(tag)), Token2::TagToken(tag2)) => { assert_eq!(tag2.kind, TagKind::StartTag); - assert_eq!(tag.name, tag2.name.as_ref().as_bytes().to_owned().into()); + assert_eq!(tag.name, tag2.name.as_ref().as_bytes().to_owned()); } (Some(Token::EndTag(tag)), Token2::TagToken(tag2)) => { assert_eq!(tag2.kind, TagKind::EndTag); - assert_eq!(tag.name, tag2.name.as_ref().as_bytes().to_owned().into()); + assert_eq!(tag.name, tag2.name.as_ref().as_bytes().to_owned()); } (None, Token2::EOFToken) => {} (Some(Token::Comment(comment)), Token2::CommentToken(comment2)) => { - assert_eq!(comment, comment2.as_ref().as_bytes().to_owned().into()); + assert_eq!(comment, comment2.as_ref().as_bytes().to_owned()); } (Some(Token::Doctype(doctype)), Token2::DoctypeToken(doctype2)) => { assert_eq!( doctype.name, doctype2 .name - .map(|x| x.as_ref().to_owned().into_bytes().into()) + .map(|x| x.as_ref().to_owned().into_bytes()) .unwrap_or_default() ); assert_eq!( diff --git a/fuzz/src/testcase/old_html5gum.rs b/fuzz/src/testcase/old_html5gum.rs index 69681bf..3f50be4 100644 --- a/fuzz/src/testcase/old_html5gum.rs +++ b/fuzz/src/testcase/old_html5gum.rs @@ -43,6 +43,14 @@ pub fn run_old_html5gum(s: &str) 
{ testing_tokens.retain(isnt_error); } } + x if x.starts_with("if-testing-contains:") => { + if testing_tokens.contains(&html5gum::Token::Error( + x["if-testing-contains:".len()..].parse().unwrap(), + )) { + reference_tokens.retain(isnt_old_error); + testing_tokens.retain(isnt_error); + } + } x => panic!("unknown FUZZ_IGNORE_PARSE_ERRORS instruction: {}", x), } } @@ -50,26 +58,26 @@ pub fn run_old_html5gum(s: &str) { let reference_tokens: Vec<_> = reference_tokens .into_iter() .map(|x| match x { - html5gum_old::Token::String(x) => Token::String(x.into()), - html5gum_old::Token::Comment(x) => Token::Comment(x.into()), + html5gum_old::Token::String(x) => Token::String(Vec::from(x).into()), + html5gum_old::Token::Comment(x) => Token::Comment(Vec::from(x).into()), html5gum_old::Token::StartTag(x) => Token::StartTag(StartTag { - name: x.name.into(), + name: Vec::from(x.name).into(), attributes: x .attributes .into_iter() - .map(|(k, v)| (k.into(), v.into())) + .map(|(k, v)| (Vec::from(k).into(), Vec::from(v).into())) .collect(), self_closing: x.self_closing, }), html5gum_old::Token::EndTag(x) => Token::EndTag(EndTag { - name: x.name.into(), + name: Vec::from(x.name).into(), }), html5gum_old::Token::Error(x) => Token::Error(x.to_string().parse().unwrap()), html5gum_old::Token::Doctype(x) => Token::Doctype(Doctype { - name: x.name.into(), + name: Vec::from(x.name).into(), force_quirks: x.force_quirks, - public_identifier: x.public_identifier.map(From::from), - system_identifier: x.system_identifier.map(From::from), + public_identifier: x.public_identifier.map(|x| Vec::from(x).into()), + system_identifier: x.system_identifier.map(|x| Vec::from(x).into()) }), }) .collect(); diff --git a/src/callbacks.rs b/src/callbacks.rs new file mode 100644 index 0000000..5b3f33c --- /dev/null +++ b/src/callbacks.rs @@ -0,0 +1,497 @@ +//! Consume the parsed HTML as a series of events through a callback. +//! +//! 
While using the [DefaultEmitter] provides an easy-to-use API with low performance, and +//! implementing your own [Emitter] brings maximal performance and maximal pain, this is a middle +//! ground. All strings are borrowed from some intermediate buffer instead of individually +//! allocated. +//! +//! ``` +//! // Extract all text between span tags, in a naive (but fast) way. Does not handle tags inside of the span. See `examples/` as well. +//! use html5gum::Tokenizer; +//! use html5gum::callbacks::{CallbackEvent, CallbackEmitter}; +//! +//! let mut is_in_span = false; +//! let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option> { +//! match event { +//! CallbackEvent::OpenStartTag { name } => { +//! is_in_span = name == b"span"; +//! }, +//! CallbackEvent::String { value } if is_in_span => { +//! return Some(value.to_vec()); +//! } +//! CallbackEvent::EndTag { .. } => { +//! is_in_span = false; +//! } +//! _ => {} +//! } +//! +//! None +//! }); +//! +//! let input = r#"

Hello world!

"#; +//! let text_fragments = Tokenizer::new_with_emitter(input, emitter) +//! .infallible() +//! .collect::>(); +//! +//! assert_eq!(text_fragments, vec![b"Hello".to_vec()]); +//! ``` + +use std::collections::VecDeque; +use std::convert::Infallible; +use std::mem::swap; + +use crate::utils::trace_log; +use crate::{naive_next_state, Emitter, Error, State}; + +/// Events used by [CallbackEmitter]. +/// +/// This operates at a slightly lower level than [Token], as start tags are split up into multiple +/// events. +#[derive(Debug)] +pub enum CallbackEvent<'a> { + /// Visit the `""`. Signifies the beginning of a new start + /// tag. + /// + /// Attributes have not yet been read. + OpenStartTag { + /// The name of the start tag. + name: &'a [u8], + }, + + /// Visit an attribute name, for example `"mykey"` in `""`. + /// + /// The attribute value has not yet been read. + AttributeName { + /// The name of the attribute. + name: &'a [u8], + }, + + /// Visit an attribute value, for example `"myvalue"` in `""`. + /// + /// Things like whitespace, quote handling is taken care of. + /// + /// After this event, the start tag may be closed using [CloseStartTag], or another + /// [AttributeName] may follow. + AttributeValue { + /// The value of the attribute. + value: &'a [u8], + }, + + /// Visit the end of the start tag, for example `">"` in `""`. + /// + CloseStartTag { + /// Whether the tag ended with `"/>"`. + /// + /// Note that in HTML5 this difference is largely ignored, and tags are considered + /// self-closing based on a hardcoded list of names, not based on syntax. + self_closing: bool, + }, + + /// Visit `"". + /// + /// Note: Because of strangeness in the HTML spec, attributes may be observed outside of start + /// tags, before this event. It's best to ignore them as they are not valid HTML, but can still + /// be observed through most HTML parsers. + EndTag { + /// The name of the end tag. 
+ name: &'a [u8], + }, + + /// Visit a string, as in, the actual text between tags. The content. Remember actual content + /// in HTML, before SPAs took over? I remember. + /// + /// It's guaranteed that all consecutive "character tokens" (as the spec calls them) are folded + /// into one string event. + String { + /// A series of character tokens. + value: &'a [u8], + }, + + /// Visit a comment, like `` + Comment { + /// The contents of the comment. + value: &'a [u8], + }, + + /// Visit ``. + Doctype { + /// Name of the docstring. + name: &'a [u8], + /// Public identifier (see spec) + public_identifier: Option<&'a [u8]>, + /// System identifier (see spec) + system_identifier: Option<&'a [u8]>, + /// Enable quirksmode + force_quirks: bool, + }, + + /// Visit a parsing error. + Error(Error), +} + +#[derive(Debug, Clone, Copy)] +enum CurrentTag { + Start, + End, +} + +#[derive(Debug)] +struct CallbackState { + callback: F, + emitted_tokens: VecDeque, +} + +/// This trait is implemented for all functions that have the same signature as +/// [Callback::handle_event]. The trait only exists in case you want to implement it on a nameable +/// type. +pub trait Callback { + /// Perform some action on a parsing event, and, optionally, return a value that can be yielded + /// from the [Tokenizer] iterator. 
+ fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option; +} + +impl Callback for F +where + F: FnMut(CallbackEvent<'_>) -> Option, +{ + fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option { + self(event) + } +} + +impl CallbackState +where + F: Callback, +{ + fn emit_event(&mut self, event: CallbackEvent<'_>) { + let res = self.callback.handle_event(event); + if let Some(token) = res { + self.emitted_tokens.push_front(token); + } + } +} + +impl Default for CallbackState +where + F: Default, +{ + fn default() -> Self { + CallbackState { + callback: F::default(), + emitted_tokens: VecDeque::default(), + } + } +} + +#[derive(Debug, Default)] +struct EmitterState { + naively_switch_states: bool, + + current_characters: Vec, + current_comment: Vec, + + last_start_tag: Vec, + current_tag_had_attributes: bool, + current_tag_type: Option, + current_tag_self_closing: bool, + current_tag_name: Vec, + current_attribute_name: Vec, + current_attribute_value: Vec, + + // strings related to doctype + doctype_name: Vec, + doctype_has_public_identifier: bool, + doctype_has_system_identifier: bool, + doctype_public_identifier: Vec, + doctype_system_identifier: Vec, + doctype_force_quirks: bool, +} + +/// The emitter class to pass to [Tokenizer::new_with_emitter] +#[derive(Debug)] +pub struct CallbackEmitter { + // this struct is only split out so [CallbackState::emit_event] can borrow things concurrently + // with other attributes. + callback_state: CallbackState, + emitter_state: EmitterState, +} + +impl Default for CallbackEmitter +where + F: Default, +{ + fn default() -> Self { + CallbackEmitter { + callback_state: CallbackState::default(), + emitter_state: EmitterState::default(), + } + } +} + +impl CallbackEmitter +where + F: Callback, +{ + /// Create a new emitter. See type-level docs to understand basic usage. + /// + /// The given callback may return optional tokens that then become available through the + /// [Tokenizer]'s iterator. 
If that's not used, return [Option]. + pub fn new(callback: F) -> Self { + CallbackEmitter { + callback_state: CallbackState { + callback, + emitted_tokens: VecDeque::new(), + }, + emitter_state: EmitterState::default(), + } + } + + /// Get mutable access to the inner callback. + pub fn callback_mut(&mut self) -> &mut F { + &mut self.callback_state.callback + } + + /// Whether to use [`naive_next_state`] to switch states automatically. + /// + /// The default is off. + pub fn naively_switch_states(&mut self, yes: bool) { + self.emitter_state.naively_switch_states = yes; + } + + fn flush_attribute_name(&mut self) { + if !self.emitter_state.current_attribute_name.is_empty() { + self.callback_state + .emit_event(CallbackEvent::AttributeName { + name: &self.emitter_state.current_attribute_name, + }); + self.emitter_state.current_attribute_name.clear(); + } + } + + fn flush_attribute(&mut self) { + self.flush_attribute_name(); + + if !self.emitter_state.current_attribute_value.is_empty() { + self.callback_state + .emit_event(CallbackEvent::AttributeValue { + value: &self.emitter_state.current_attribute_value, + }); + self.emitter_state.current_attribute_value.clear(); + } + } + + fn flush_open_start_tag(&mut self) { + if matches!(self.emitter_state.current_tag_type, Some(CurrentTag::Start)) + && !self.emitter_state.current_tag_name.is_empty() + { + self.callback_state.emit_event(CallbackEvent::OpenStartTag { + name: &self.emitter_state.current_tag_name, + }); + + self.emitter_state.last_start_tag.clear(); + swap( + &mut self.emitter_state.last_start_tag, + &mut self.emitter_state.current_tag_name, + ); + } + } + + fn flush_current_characters(&mut self) { + if self.emitter_state.current_characters.is_empty() { + return; + } + + self.callback_state.emit_event(CallbackEvent::String { + value: &self.emitter_state.current_characters, + }); + self.emitter_state.current_characters.clear(); + } +} +impl Emitter for CallbackEmitter +where + F: Callback, +{ + type Token = T; + + 
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { + self.emitter_state.last_start_tag.clear(); + self.emitter_state + .last_start_tag + .extend(last_start_tag.unwrap_or_default()); + } + + fn emit_eof(&mut self) { + self.flush_current_characters(); + } + + fn emit_error(&mut self, error: Error) { + self.callback_state.emit_event(CallbackEvent::Error(error)); + } + + fn pop_token(&mut self) -> Option { + self.callback_state.emitted_tokens.pop_back() + } + + fn emit_string(&mut self, s: &[u8]) { + crate::utils::trace_log!("callbacks: emit_string, len={}", s.len()); + self.emitter_state.current_characters.extend(s); + } + + fn init_start_tag(&mut self) { + self.emitter_state.current_tag_name.clear(); + self.emitter_state.current_tag_type = Some(CurrentTag::Start); + self.emitter_state.current_tag_self_closing = false; + } + + fn init_end_tag(&mut self) { + self.emitter_state.current_tag_name.clear(); + self.emitter_state.current_tag_type = Some(CurrentTag::End); + self.emitter_state.current_tag_had_attributes = false; + } + + fn init_comment(&mut self) { + self.flush_current_characters(); + self.emitter_state.current_comment.clear(); + } + + fn emit_current_tag(&mut self) -> Option { + self.flush_attribute(); + self.flush_current_characters(); + match self.emitter_state.current_tag_type { + Some(CurrentTag::Start) => { + self.flush_open_start_tag(); + self.callback_state + .emit_event(CallbackEvent::CloseStartTag { + self_closing: self.emitter_state.current_tag_self_closing, + }); + } + Some(CurrentTag::End) => { + if self.emitter_state.current_tag_had_attributes { + self.emit_error(Error::EndTagWithAttributes); + } + self.emitter_state.last_start_tag.clear(); + self.callback_state.emit_event(CallbackEvent::EndTag { + name: &self.emitter_state.current_tag_name, + }); + } + _ => {} + } + + if self.emitter_state.naively_switch_states { + naive_next_state(&self.emitter_state.last_start_tag) + } else { + None + } + } + fn emit_current_comment(&mut self) { 
+ self.callback_state.emit_event(CallbackEvent::Comment { + value: &self.emitter_state.current_comment, + }); + self.emitter_state.current_comment.clear(); + } + + fn emit_current_doctype(&mut self) { + self.callback_state.emit_event(CallbackEvent::Doctype { + name: &self.emitter_state.doctype_name, + public_identifier: if self.emitter_state.doctype_has_public_identifier { + Some(&self.emitter_state.doctype_public_identifier) + } else { + None + }, + system_identifier: if self.emitter_state.doctype_has_system_identifier { + Some(&self.emitter_state.doctype_system_identifier) + } else { + None + }, + force_quirks: self.emitter_state.doctype_force_quirks, + }); + } + + fn set_self_closing(&mut self) { + trace_log!("set_self_closing"); + if matches!(self.emitter_state.current_tag_type, Some(CurrentTag::End)) { + self.callback_state + .emit_event(CallbackEvent::Error(Error::EndTagWithTrailingSolidus)); + } else { + self.emitter_state.current_tag_self_closing = true; + } + } + + fn set_force_quirks(&mut self) { + self.emitter_state.doctype_force_quirks = true; + } + + fn push_tag_name(&mut self, s: &[u8]) { + self.emitter_state.current_tag_name.extend(s); + } + + fn push_comment(&mut self, s: &[u8]) { + self.emitter_state.current_comment.extend(s); + } + + fn push_doctype_name(&mut self, s: &[u8]) { + self.emitter_state.doctype_name.extend(s); + } + + fn init_doctype(&mut self) { + self.flush_current_characters(); + self.emitter_state.doctype_name.clear(); + self.emitter_state.doctype_has_public_identifier = false; + self.emitter_state.doctype_has_system_identifier = false; + self.emitter_state.doctype_public_identifier.clear(); + self.emitter_state.doctype_system_identifier.clear(); + self.emitter_state.doctype_force_quirks = false; + } + + fn init_attribute(&mut self) { + self.flush_open_start_tag(); + self.flush_attribute(); + self.emitter_state.current_tag_had_attributes = true; + } + + fn push_attribute_name(&mut self, s: &[u8]) { + 
self.emitter_state.current_attribute_name.extend(s); + } + + fn push_attribute_value(&mut self, s: &[u8]) { + self.flush_attribute_name(); + self.emitter_state.current_attribute_value.extend(s); + } + + fn set_doctype_public_identifier(&mut self, value: &[u8]) { + self.emitter_state.doctype_has_public_identifier = true; + self.emitter_state.doctype_public_identifier.clear(); + self.emitter_state.doctype_public_identifier.extend(value); + } + fn set_doctype_system_identifier(&mut self, value: &[u8]) { + self.emitter_state.doctype_has_system_identifier = true; + self.emitter_state.doctype_system_identifier.clear(); + self.emitter_state.doctype_system_identifier.extend(value); + } + fn push_doctype_public_identifier(&mut self, value: &[u8]) { + self.emitter_state.doctype_public_identifier.extend(value); + } + fn push_doctype_system_identifier(&mut self, value: &[u8]) { + self.emitter_state.doctype_system_identifier.extend(value); + } + + fn current_is_appropriate_end_tag_token(&mut self) -> bool { + if self.emitter_state.last_start_tag.is_empty() { + crate::utils::trace_log!( + "current_is_appropriate_end_tag_token: no, because last_start_tag is empty" + ); + return false; + } + + if !matches!(self.emitter_state.current_tag_type, Some(CurrentTag::End)) { + crate::utils::trace_log!( + "current_is_appropriate_end_tag_token: no, because current_tag_type is not end" + ); + return false; + } + + crate::utils::trace_log!("last_start_tag = {:?}", self.emitter_state.last_start_tag); + crate::utils::trace_log!("current_tag = {:?}", self.emitter_state.current_tag_name); + self.emitter_state.last_start_tag == self.emitter_state.current_tag_name + } +} diff --git a/src/default_emitter.rs b/src/default_emitter.rs new file mode 100644 index 0000000..b245795 --- /dev/null +++ b/src/default_emitter.rs @@ -0,0 +1,266 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::mem::take; + +use crate::{Emitter, Error, HtmlString, State}; + +use 
crate::callbacks::{Callback, CallbackEmitter, CallbackEvent}; + +#[derive(Debug, Default)] +struct OurCallback { + tag_name: Vec, + attribute_name: HtmlString, + attribute_map: BTreeMap, +} + +impl Callback for OurCallback { + fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option { + crate::utils::trace_log!("event: {:?}", event); + match event { + CallbackEvent::OpenStartTag { name } => { + self.tag_name.clear(); + self.tag_name.extend(name); + None + } + CallbackEvent::AttributeName { name } => { + self.attribute_name.clear(); + match self.attribute_map.entry(name.to_owned().into()) { + Entry::Occupied(_) => Some(Token::Error(Error::DuplicateAttribute)), + Entry::Vacant(vacant) => { + self.attribute_name.extend(name); + vacant.insert(Default::default()); + None + } + } + } + CallbackEvent::AttributeValue { value } => { + if !self.attribute_name.is_empty() { + self.attribute_map + .get_mut(&self.attribute_name) + .unwrap() + .extend(value); + } + None + } + CallbackEvent::CloseStartTag { self_closing } => Some(Token::StartTag(StartTag { + self_closing, + name: take(&mut self.tag_name).into(), + attributes: take(&mut self.attribute_map), + })), + CallbackEvent::EndTag { name } => { + self.attribute_map.clear(); + Some(Token::EndTag(EndTag { + name: name.to_owned().into(), + })) + } + CallbackEvent::String { value } => Some(Token::String(value.to_owned().into())), + CallbackEvent::Comment { value } => Some(Token::Comment(value.to_owned().into())), + CallbackEvent::Doctype { + name, + public_identifier, + system_identifier, + force_quirks, + } => Some(Token::Doctype(Doctype { + force_quirks, + name: name.to_owned().into(), + public_identifier: public_identifier.map(|x| x.to_owned().into()), + system_identifier: system_identifier.map(|x| x.to_owned().into()), + })), + CallbackEvent::Error(error) => Some(Token::Error(error)), + } + } +} + +/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. 
+#[derive(Default, Debug)] +pub struct DefaultEmitter { + inner: CallbackEmitter, +} + +impl DefaultEmitter { + /// Whether to use [`naive_next_state`] to switch states automatically. + /// + /// The default is off. + pub fn naively_switch_states(&mut self, yes: bool) { + self.inner.naively_switch_states(yes) + } +} + +// opaque type around inner emitter +impl Emitter for DefaultEmitter { + type Token = Token; + + fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { + self.inner.set_last_start_tag(last_start_tag) + } + + fn emit_eof(&mut self) { + self.inner.emit_eof() + } + + fn emit_error(&mut self, error: Error) { + self.inner.emit_error(error) + } + + fn should_emit_errors(&mut self) -> bool { + self.inner.should_emit_errors() + } + + fn pop_token(&mut self) -> Option { + self.inner.pop_token() + } + fn emit_string(&mut self, c: &[u8]) { + self.inner.emit_string(c) + } + + fn init_start_tag(&mut self) { + self.inner.init_start_tag() + } + + fn init_end_tag(&mut self) { + self.inner.init_end_tag() + } + + fn init_comment(&mut self) { + self.inner.init_comment() + } + + fn emit_current_tag(&mut self) -> Option { + self.inner.emit_current_tag() + } + + fn emit_current_comment(&mut self) { + self.inner.emit_current_comment() + } + + fn emit_current_doctype(&mut self) { + self.inner.emit_current_doctype() + } + + fn set_self_closing(&mut self) { + self.inner.set_self_closing() + } + + fn set_force_quirks(&mut self) { + self.inner.set_force_quirks() + } + + fn push_tag_name(&mut self, s: &[u8]) { + self.inner.push_tag_name(s) + } + + fn push_comment(&mut self, s: &[u8]) { + self.inner.push_comment(s) + } + + fn push_doctype_name(&mut self, s: &[u8]) { + self.inner.push_doctype_name(s) + } + + fn init_doctype(&mut self) { + self.inner.init_doctype() + } + + fn init_attribute(&mut self) { + self.inner.init_attribute() + } + + fn push_attribute_name(&mut self, s: &[u8]) { + self.inner.push_attribute_name(s) + } + + fn push_attribute_value(&mut self, s: 
&[u8]) { + self.inner.push_attribute_value(s) + } + + fn set_doctype_public_identifier(&mut self, value: &[u8]) { + self.inner.set_doctype_public_identifier(value) + } + + fn set_doctype_system_identifier(&mut self, value: &[u8]) { + self.inner.set_doctype_system_identifier(value) + } + + fn push_doctype_public_identifier(&mut self, s: &[u8]) { + self.inner.push_doctype_public_identifier(s) + } + + fn push_doctype_system_identifier(&mut self, s: &[u8]) { + self.inner.push_doctype_system_identifier(s) + } + + fn current_is_appropriate_end_tag_token(&mut self) -> bool { + self.inner.current_is_appropriate_end_tag_token() + } + + fn adjusted_current_node_present_but_not_in_html_namespace(&mut self) -> bool { + self.inner + .adjusted_current_node_present_but_not_in_html_namespace() + } +} + +/// A HTML end/close tag, such as `

` or ``. +#[derive(Debug, Default, Eq, PartialEq, Clone)] +pub struct StartTag { + /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be + /// expected. + pub self_closing: bool, + + /// The start tag's name, such as `"p"` or `"a"`. + pub name: HtmlString, + + /// A mapping for any HTML attributes this start tag may have. + /// + /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own + /// [`Emitter`] to tweak this behavior. + pub attributes: BTreeMap, +} + +/// A HTML end/close tag, such as `

` or ``. +#[derive(Debug, Default, Eq, PartialEq, Clone)] +pub struct EndTag { + /// The ending tag's name, such as `"p"` or `"a"`. + pub name: HtmlString, +} + +/// A doctype. Some examples: +/// +/// * `` +/// * `` +/// * `` +/// * `` +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct Doctype { + /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag. + pub force_quirks: bool, + + /// The doctype's name. For HTML documents this is "html". + pub name: HtmlString, + + /// The doctype's public identifier. + pub public_identifier: Option, + + /// The doctype's system identifier. + pub system_identifier: Option, +} + +/// The token type used by default. You can define your own token type by implementing the +/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`]. +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum Token { + /// A HTML start tag. + StartTag(StartTag), + /// A HTML end tag. + EndTag(EndTag), + /// A literal string. + String(HtmlString), + /// A HTML comment. + Comment(HtmlString), + /// A HTML doctype declaration. + Doctype(Doctype), + /// A HTML parsing error. + /// + /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with + /// more tokens afterward. + Error(Error), +} diff --git a/src/emitter.rs b/src/emitter.rs index 0040fce..9e911e1 100644 --- a/src/emitter.rs +++ b/src/emitter.rs @@ -1,127 +1,5 @@ -use std::borrow::{Borrow, BorrowMut}; -use std::collections::BTreeMap; -use std::collections::BTreeSet; -use std::collections::VecDeque; -use std::fmt::{Debug, Formatter}; -use std::mem; -use std::ops::{Deref, DerefMut}; - use crate::{Error, State}; -/// A wrapper around a bytestring. 
-/// -/// This newtype only exists to provide a nicer `Debug` impl -#[derive(Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct HtmlString(pub Vec); - -impl Deref for HtmlString { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for HtmlString { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl Debug for HtmlString { - fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { - write!(f, "b\"")?; - for &byte in &self.0 { - for ch in std::ascii::escape_default(byte) { - write!(f, "{}", ch as char)?; - } - } - - write!(f, "\"") - } -} - -impl Borrow<[u8]> for HtmlString { - fn borrow(&self) -> &[u8] { - &self.0 - } -} - -impl BorrowMut<[u8]> for HtmlString { - fn borrow_mut(&mut self) -> &mut [u8] { - &mut self.0 - } -} - -impl AsRef<[u8]> for HtmlString { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl PartialEq<&[u8; N]> for HtmlString { - fn eq(&self, other: &&[u8; N]) -> bool { - self.0 == *other - } -} - -impl PartialEq for &[u8; N] { - fn eq(&self, other: &HtmlString) -> bool { - other.0 == *self - } -} - -impl PartialEq<&[u8]> for HtmlString { - fn eq(&self, other: &&[u8]) -> bool { - self.0 == *other - } -} - -impl PartialEq for &[u8] { - fn eq(&self, other: &HtmlString) -> bool { - *self == other.0 - } -} - -#[test] -fn test_eq_html_str_and_byte_literal() { - assert!(HtmlString(b"hello world".to_vec()) == b"hello world"); -} - -#[test] -fn test_eq_byte_literal_and_html_str() { - assert!(b"hello world" == HtmlString(b"hello world".to_vec())); -} - -#[test] -fn test_eq_html_str_and_byte_slice() { - assert!(HtmlString(b"hello world".to_vec()) == b"hello world".as_slice()); -} - -#[test] -fn test_eq_byte_slice_and_html_str() { - assert!(b"hello world".as_slice() == HtmlString(b"hello world".to_vec())); -} - -#[test] -fn test_borrowing() { - // demonstrate a usecase for Borrow/BorrowMut - let tag = StartTag::default(); - 
assert!(!tag.attributes.contains_key(b"href".as_slice())); -} - -impl From> for HtmlString { - fn from(vec: Vec) -> HtmlString { - HtmlString(vec) - } -} - -impl From for Vec { - fn from(other: HtmlString) -> Vec { - other.0 - } -} - /// An emitter is an object providing methods to the tokenizer to produce tokens. /// /// Domain-specific applications of the HTML tokenizer can manually implement this trait to @@ -349,327 +227,3 @@ pub fn naive_next_state(tag_name: &[u8]) -> Option { _ => None, } } - -/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. -#[derive(Debug, Default)] -pub struct DefaultEmitter { - current_characters: HtmlString, - current_token: Option, - last_start_tag: HtmlString, - current_attribute: Option<(HtmlString, HtmlString)>, - seen_attributes: BTreeSet, - emitted_tokens: VecDeque, - naively_switch_states: bool, -} - -impl DefaultEmitter { - /// Whether to use [`naive_next_state`] to switch states automatically. - /// - /// The default is off. 
- pub fn naively_switch_states(&mut self, yes: bool) { - self.naively_switch_states = yes; - } - - fn emit_token(&mut self, token: Token) { - self.flush_current_characters(); - self.emitted_tokens.push_front(token); - } - - fn flush_current_attribute(&mut self) { - if let Some((k, v)) = self.current_attribute.take() { - match self.current_token { - Some(Token::StartTag(ref mut tag)) => { - let mut error = None; - tag.attributes - .entry(k) - .and_modify(|_| { - error = Some(Error::DuplicateAttribute); - }) - .or_insert(v); - - if let Some(e) = error { - self.emit_error(e); - } - } - Some(Token::EndTag(_)) => { - if !self.seen_attributes.insert(k) { - self.emit_error(Error::DuplicateAttribute); - } - } - _ => { - debug_assert!(false); - } - } - } - } - - fn flush_current_characters(&mut self) { - if self.current_characters.is_empty() { - return; - } - - let s = mem::take(&mut self.current_characters); - self.emit_token(Token::String(s)); - } -} - -impl Emitter for DefaultEmitter { - type Token = Token; - - fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { - self.last_start_tag.clear(); - self.last_start_tag - .extend(last_start_tag.unwrap_or_default()); - } - - fn emit_eof(&mut self) { - self.flush_current_characters(); - } - - fn emit_error(&mut self, error: Error) { - // bypass character flushing in self.emit_token: we don't need the error location to be - // that exact - self.emitted_tokens.push_front(Token::Error(error)); - } - - fn pop_token(&mut self) -> Option { - self.emitted_tokens.pop_back() - } - - fn emit_string(&mut self, s: &[u8]) { - self.current_characters.extend(s); - } - - fn init_start_tag(&mut self) { - self.current_token = Some(Token::StartTag(StartTag::default())); - } - fn init_end_tag(&mut self) { - self.current_token = Some(Token::EndTag(EndTag::default())); - self.seen_attributes.clear(); - } - - fn init_comment(&mut self) { - self.current_token = Some(Token::Comment(HtmlString::default())); - } - fn emit_current_tag(&mut 
self) -> Option { - self.flush_current_attribute(); - let mut token = self.current_token.take().unwrap(); - match token { - Token::EndTag(_) => { - if !self.seen_attributes.is_empty() { - self.emit_error(Error::EndTagWithAttributes); - } - self.seen_attributes.clear(); - self.set_last_start_tag(None); - } - Token::StartTag(ref mut tag) => { - self.set_last_start_tag(Some(&tag.name)); - } - _ => debug_assert!(false), - } - self.emit_token(token); - if self.naively_switch_states { - naive_next_state(&self.last_start_tag) - } else { - None - } - } - fn emit_current_comment(&mut self) { - let comment = self.current_token.take().unwrap(); - debug_assert!(matches!(comment, Token::Comment(_))); - self.emit_token(comment); - } - - fn emit_current_doctype(&mut self) { - let doctype = self.current_token.take().unwrap(); - debug_assert!(matches!(doctype, Token::Doctype(_))); - self.emit_token(doctype); - } - - fn set_self_closing(&mut self) { - let tag = self.current_token.as_mut().unwrap(); - match tag { - Token::StartTag(StartTag { - ref mut self_closing, - .. - }) => { - *self_closing = true; - } - Token::EndTag(_) => { - self.emit_error(Error::EndTagWithTrailingSolidus); - } - _ => { - debug_assert!(false); - } - } - } - fn set_force_quirks(&mut self) { - match self.current_token { - Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true, - _ => debug_assert!(false), - } - } - fn push_tag_name(&mut self, s: &[u8]) { - match self.current_token { - Some( - Token::StartTag(StartTag { ref mut name, .. }) - | Token::EndTag(EndTag { ref mut name, .. 
}), - ) => { - name.extend(s); - } - _ => debug_assert!(false), - } - } - - fn push_comment(&mut self, s: &[u8]) { - match self.current_token { - Some(Token::Comment(ref mut data)) => data.extend(s), - _ => debug_assert!(false), - } - } - - fn push_doctype_name(&mut self, s: &[u8]) { - match self.current_token { - Some(Token::Doctype(ref mut doctype)) => doctype.name.extend(s), - _ => debug_assert!(false), - } - } - fn init_doctype(&mut self) { - self.current_token = Some(Token::Doctype(Doctype { - name: HtmlString::default(), - force_quirks: false, - public_identifier: None, - system_identifier: None, - })); - } - - fn init_attribute(&mut self) { - self.flush_current_attribute(); - self.current_attribute = Some(Default::default()); - } - fn push_attribute_name(&mut self, s: &[u8]) { - self.current_attribute.as_mut().unwrap().0.extend(s); - } - fn push_attribute_value(&mut self, s: &[u8]) { - self.current_attribute.as_mut().unwrap().1.extend(s); - } - fn set_doctype_public_identifier(&mut self, value: &[u8]) { - if let Some(Token::Doctype(Doctype { - ref mut public_identifier, - .. - })) = self.current_token - { - *public_identifier = Some(value.to_vec().into()); - } else { - debug_assert!(false); - } - } - fn set_doctype_system_identifier(&mut self, value: &[u8]) { - if let Some(Token::Doctype(Doctype { - ref mut system_identifier, - .. - })) = self.current_token - { - *system_identifier = Some(value.to_vec().into()); - } else { - debug_assert!(false); - } - } - fn push_doctype_public_identifier(&mut self, s: &[u8]) { - if let Some(Token::Doctype(Doctype { - public_identifier: Some(ref mut id), - .. - })) = self.current_token - { - id.extend(s); - } else { - debug_assert!(false); - } - } - fn push_doctype_system_identifier(&mut self, s: &[u8]) { - if let Some(Token::Doctype(Doctype { - system_identifier: Some(ref mut id), - .. 
- })) = self.current_token - { - id.extend(s); - } else { - debug_assert!(false); - } - } - - fn current_is_appropriate_end_tag_token(&mut self) -> bool { - match self.current_token { - Some(Token::EndTag(ref tag)) => { - !self.last_start_tag.is_empty() && self.last_start_tag == tag.name - } - _ => false, - } - } -} - -/// A HTML end/close tag, such as `

` or ``. -#[derive(Debug, Default, Eq, PartialEq, Clone)] -pub struct StartTag { - /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be - /// expected. - pub self_closing: bool, - - /// The start tag's name, such as `"p"` or `"a"`. - pub name: HtmlString, - - /// A mapping for any HTML attributes this start tag may have. - /// - /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own - /// [`Emitter`] to tweak this behavior. - pub attributes: BTreeMap, -} - -/// A HTML end/close tag, such as `

` or ``. -#[derive(Debug, Default, Eq, PartialEq, Clone)] -pub struct EndTag { - /// The ending tag's name, such as `"p"` or `"a"`. - pub name: HtmlString, -} - -/// A doctype. Some examples: -/// -/// * `` -/// * `` -/// * `` -/// * `` -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct Doctype { - /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag. - pub force_quirks: bool, - - /// The doctype's name. For HTML documents this is "html". - pub name: HtmlString, - - /// The doctype's public identifier. - pub public_identifier: Option, - - /// The doctype's system identifier. - pub system_identifier: Option, -} - -/// The token type used by default. You can define your own token type by implementing the -/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`]. -#[derive(Debug, Eq, PartialEq, Clone)] -pub enum Token { - /// A HTML start tag. - StartTag(StartTag), - /// A HTML end tag. - EndTag(EndTag), - /// A literal string. - String(HtmlString), - /// A HTML comment. - Comment(HtmlString), - /// A HTML doctype declaration. - Doctype(Doctype), - /// A HTML parsing error. - /// - /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with - /// more tokens afterward. 
- Error(Error), -} diff --git a/src/html5ever_emitter.rs b/src/html5ever_emitter.rs index 4f792ae..4398427 100644 --- a/src/html5ever_emitter.rs +++ b/src/html5ever_emitter.rs @@ -1,78 +1,161 @@ use std::convert::Infallible; -use crate::{DefaultEmitter, Emitter, Error, State, Token}; -use html5ever::tokenizer::states::RawKind; +use crate::callbacks::{Callback, CallbackEmitter, CallbackEvent}; +use crate::utils::trace_log; +use crate::{Emitter, Error, State}; + use html5ever::tokenizer::{ - Doctype, Tag, TagKind, Token as Html5everToken, TokenSink, TokenSinkResult, + states::RawKind, Doctype, Tag, TagKind, Token as Html5everToken, TokenSink, TokenSinkResult, }; use html5ever::{Attribute, QualName}; const BOGUS_LINENO: u64 = 1; +#[derive(Debug)] +struct OurCallback<'a, S> { + sink: &'a mut S, + current_start_tag: Option, + next_state: Option, +} + +impl<'a, S: TokenSink> OurCallback<'a, S> { + fn handle_sink_result(&mut self, result: TokenSinkResult) { + match result { + TokenSinkResult::Continue => {} + TokenSinkResult::Script(_) => { + self.next_state = Some(State::Data); + // TODO: suspend tokenizer for script + } + TokenSinkResult::Plaintext => { + self.next_state = Some(State::PlainText); + } + TokenSinkResult::RawData(RawKind::Rcdata) => { + self.next_state = Some(State::RcData); + } + TokenSinkResult::RawData(RawKind::Rawtext) => { + self.next_state = Some(State::RawText); + } + TokenSinkResult::RawData(RawKind::ScriptData) => { + self.next_state = Some(State::ScriptData); + } + TokenSinkResult::RawData(RawKind::ScriptDataEscaped(_)) => { + todo!() + } + } + } + + fn sink_token(&mut self, token: Html5everToken) { + trace_log!("sink_token: {:?}", token); + let result = self.sink.process_token(token, BOGUS_LINENO); + self.handle_sink_result(result); + } +} + +impl<'a, S: TokenSink> Callback for OurCallback<'a, S> { + fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option { + trace_log!("Html5everEmitter::handle_event: {:?}", event); + match event { + 
CallbackEvent::OpenStartTag { name } => { + self.current_start_tag = Some(Tag { + kind: TagKind::StartTag, + name: String::from_utf8_lossy(name).into_owned().into(), + self_closing: false, + attrs: Default::default(), + }); + } + CallbackEvent::AttributeName { name } => { + if let Some(ref mut tag) = self.current_start_tag { + tag.attrs.push(Attribute { + name: QualName::new( + None, + Default::default(), + String::from_utf8_lossy(name).into_owned().into(), + ), + value: Default::default(), + }); + } + } + CallbackEvent::AttributeValue { value } => { + if let Some(ref mut tag) = self.current_start_tag { + if let Some(attr) = tag.attrs.last_mut() { + attr.value.push_slice(&String::from_utf8_lossy(value)); + } + } + } + CallbackEvent::CloseStartTag { self_closing } => { + if let Some(mut tag) = self.current_start_tag.take() { + tag.self_closing = self_closing; + self.sink_token(Html5everToken::TagToken(tag)); + } + } + CallbackEvent::EndTag { name } => { + self.sink_token(Html5everToken::TagToken(Tag { + kind: TagKind::EndTag, + name: String::from_utf8_lossy(name).into_owned().into(), + self_closing: false, + attrs: Default::default(), + })); + } + CallbackEvent::String { value } => { + let mut first = true; + for part in String::from_utf8_lossy(value).split('\0') { + if !first { + self.sink_token(Html5everToken::NullCharacterToken); + } + + first = false; + self.sink_token(Html5everToken::CharacterTokens(part.to_owned().into())); + } + } + CallbackEvent::Comment { value } => { + self.sink_token(Html5everToken::CommentToken( + String::from_utf8_lossy(value).into_owned().into(), + )); + } + CallbackEvent::Doctype { + name, + public_identifier, + system_identifier, + force_quirks, + } => { + self.sink_token(Html5everToken::DoctypeToken(Doctype { + name: Some(name) + .filter(|x| !x.is_empty()) + .map(|x| String::from_utf8_lossy(x).into_owned().into()), + public_id: public_identifier + .map(|x| String::from_utf8_lossy(x).into_owned().into()), + system_id: 
system_identifier + .map(|x| String::from_utf8_lossy(x).into_owned().into()), + force_quirks, + })); + } + CallbackEvent::Error(error) => { + self.sink_token(Html5everToken::ParseError(error.as_str().into())); + } + } + + None + } +} + /// A compatibility layer that allows you to plug the TreeBuilder from html5ever into the tokenizer /// from html5gum. /// /// See [`examples/scraper.rs`] for usage. #[derive(Debug)] pub struct Html5everEmitter<'a, S: TokenSink> { - next_state: Option, - sink: &'a mut S, - // TODO: get rid of default emitter, construct html5ever tokens directly - emitter_inner: DefaultEmitter, + emitter_inner: CallbackEmitter>, } impl<'a, S: TokenSink> Html5everEmitter<'a, S> { /// Construct the compatibility layer. pub fn new(sink: &'a mut S) -> Self { Html5everEmitter { - next_state: None, - sink, - emitter_inner: DefaultEmitter::default(), - } - } - - fn pop_token_inner(&mut self) { - while let Some(token) = self.emitter_inner.pop_token() { - token_to_html5ever(token, |token| { - crate::utils::trace_log!("tree builder token: {:?}", token); - match self.sink.process_token(token, BOGUS_LINENO) { - TokenSinkResult::Continue => {} - TokenSinkResult::Script(_) => { - if self.next_state.is_some() { - crate::utils::trace_log!("dropping state: {:?}", self.next_state); - } - self.next_state = Some(State::Data); - // TODO: suspend tokenizer for script - } - TokenSinkResult::Plaintext => { - if self.next_state.is_some() { - crate::utils::trace_log!("dropping state: {:?}", self.next_state); - } - self.next_state = Some(State::PlainText); - } - TokenSinkResult::RawData(RawKind::Rcdata) => { - if self.next_state.is_some() { - crate::utils::trace_log!("dropping state: {:?}", self.next_state); - } - self.next_state = Some(State::RcData); - } - TokenSinkResult::RawData(RawKind::Rawtext) => { - if self.next_state.is_some() { - crate::utils::trace_log!("dropping state: {:?}", self.next_state); - } - self.next_state = Some(State::RawText); - } - 
TokenSinkResult::RawData(RawKind::ScriptData) => { - if self.next_state.is_some() { - crate::utils::trace_log!("dropping state: {:?}", self.next_state); - } - self.next_state = Some(State::ScriptData); - } - TokenSinkResult::RawData(RawKind::ScriptDataEscaped(_)) => { - todo!() - } - } - }); + emitter_inner: CallbackEmitter::new(OurCallback { + sink, + current_start_tag: None, + next_state: None, + }), } } } @@ -86,16 +169,13 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { fn emit_eof(&mut self) { self.emitter_inner.emit_eof(); - self.pop_token_inner(); - let _ignored = self - .sink - .process_token(Html5everToken::EOFToken, BOGUS_LINENO); - self.sink.end(); + let sink = &mut self.emitter_inner.callback_mut().sink; + let _ignored = sink.process_token(Html5everToken::EOFToken, BOGUS_LINENO); + sink.end(); } fn emit_error(&mut self, error: Error) { self.emitter_inner.emit_error(error); - self.pop_token_inner(); } fn pop_token(&mut self) -> Option { @@ -104,38 +184,31 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { fn emit_string(&mut self, c: &[u8]) { self.emitter_inner.emit_string(c); - self.pop_token_inner(); } fn init_start_tag(&mut self) { self.emitter_inner.init_start_tag(); - self.pop_token_inner(); } fn init_end_tag(&mut self) { self.emitter_inner.init_end_tag(); - self.pop_token_inner(); } fn init_comment(&mut self) { self.emitter_inner.init_comment(); - self.pop_token_inner(); } fn emit_current_tag(&mut self) -> Option { assert!(self.emitter_inner.emit_current_tag().is_none()); - self.pop_token_inner(); - self.next_state.take() + self.emitter_inner.callback_mut().next_state.take() } fn emit_current_comment(&mut self) { self.emitter_inner.emit_current_comment(); - self.pop_token_inner(); } fn emit_current_doctype(&mut self) { self.emitter_inner.emit_current_doctype(); - self.pop_token_inner(); } fn set_self_closing(&mut self) { @@ -160,7 +233,6 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { fn init_doctype(&mut 
self) { self.emitter_inner.init_doctype(); - self.pop_token_inner(); } fn init_attribute(&mut self) { @@ -196,63 +268,9 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { } fn adjusted_current_node_present_but_not_in_html_namespace(&mut self) -> bool { - self.sink + self.emitter_inner + .callback_mut() + .sink .adjusted_current_node_present_but_not_in_html_namespace() } } - -fn token_to_html5ever(token: Token, mut foreach_fn: impl FnMut(Html5everToken)) { - match token { - Token::StartTag(tag) => foreach_fn(Html5everToken::TagToken(Tag { - kind: TagKind::StartTag, - name: String::from_utf8_lossy(&tag.name).into_owned().into(), - self_closing: tag.self_closing, - attrs: tag - .attributes - .into_iter() - .map(|(key, value)| Attribute { - name: QualName::new( - None, - Default::default(), - String::from_utf8_lossy(&key).into_owned().into(), - ), - value: String::from_utf8_lossy(&value).into_owned().into(), - }) - .collect(), - })), - Token::EndTag(tag) => foreach_fn(Html5everToken::TagToken(Tag { - kind: TagKind::EndTag, - name: String::from_utf8_lossy(&tag.name).into_owned().into(), - self_closing: false, - attrs: Vec::new(), - })), - Token::String(s) => { - let s = String::from_utf8_lossy(&s); - let mut first = true; - for part in s.split('\0') { - if !first { - foreach_fn(Html5everToken::NullCharacterToken); - } - - first = false; - foreach_fn(Html5everToken::CharacterTokens(part.to_owned().into())); - } - } - Token::Comment(c) => foreach_fn(Html5everToken::CommentToken( - String::from_utf8_lossy(&c).into_owned().into(), - )), - Token::Doctype(doctype) => foreach_fn(Html5everToken::DoctypeToken(Doctype { - name: Some(&*doctype.name) - .filter(|x| !x.is_empty()) - .map(|x| String::from_utf8_lossy(x).into_owned().into()), - public_id: doctype - .public_identifier - .map(|x| String::from_utf8_lossy(&x).into_owned().into()), - system_id: doctype - .system_identifier - .map(|x| String::from_utf8_lossy(&x).into_owned().into()), - force_quirks: 
doctype.force_quirks, - })), - Token::Error(err) => foreach_fn(Html5everToken::ParseError(err.as_str().into())), - } -} diff --git a/src/htmlstring.rs b/src/htmlstring.rs new file mode 100644 index 0000000..c789343 --- /dev/null +++ b/src/htmlstring.rs @@ -0,0 +1,130 @@ +use std::borrow::{Borrow, BorrowMut}; +use std::fmt::{Debug, Formatter}; +use std::ops::{Deref, DerefMut}; + +/// A wrapper around a bytestring. +/// +/// This newtype only exists to provide a nicer `Debug` impl +#[derive(Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct HtmlString(pub Vec); + +impl Deref for HtmlString { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for HtmlString { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Debug for HtmlString { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "b\"")?; + for &byte in &self.0 { + for ch in std::ascii::escape_default(byte) { + write!(f, "{}", ch as char)?; + } + } + + write!(f, "\"") + } +} + +impl Borrow<[u8]> for HtmlString { + fn borrow(&self) -> &[u8] { + &self.0 + } +} + +impl BorrowMut<[u8]> for HtmlString { + fn borrow_mut(&mut self) -> &mut [u8] { + &mut self.0 + } +} + +impl AsRef<[u8]> for HtmlString { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl PartialEq<&[u8; N]> for HtmlString { + fn eq(&self, other: &&[u8; N]) -> bool { + self.0 == *other + } +} + +impl PartialEq for &[u8; N] { + fn eq(&self, other: &HtmlString) -> bool { + other.0 == *self + } +} + +impl PartialEq<&[u8]> for HtmlString { + fn eq(&self, other: &&[u8]) -> bool { + self.0 == *other + } +} + +impl PartialEq for &[u8] { + fn eq(&self, other: &HtmlString) -> bool { + *self == other.0 + } +} + +impl PartialEq> for HtmlString { + fn eq(&self, other: &Vec) -> bool { + self.0 == *other + } +} + +impl PartialEq for Vec { + fn eq(&self, other: &HtmlString) -> bool { + *self == other.0 + } +} + +#[test] +fn 
test_eq_html_str_and_byte_literal() { + assert!(HtmlString(b"hello world".to_vec()) == b"hello world"); +} + +#[test] +fn test_eq_byte_literal_and_html_str() { + assert!(b"hello world" == HtmlString(b"hello world".to_vec())); +} + +#[test] +fn test_eq_html_str_and_byte_slice() { + assert!(HtmlString(b"hello world".to_vec()) == b"hello world".as_slice()); +} + +#[test] +fn test_eq_byte_slice_and_html_str() { + assert!(b"hello world".as_slice() == HtmlString(b"hello world".to_vec())); +} + +#[test] +fn test_borrowing() { + use crate::StartTag; + // demonstrate a usecase for Borrow/BorrowMut + let tag = StartTag::default(); + assert!(!tag.attributes.contains_key(b"href".as_slice())); +} + +impl From> for HtmlString { + fn from(vec: Vec) -> HtmlString { + HtmlString(vec) + } +} + +impl From for Vec { + fn from(other: HtmlString) -> Vec { + other.0 + } +} diff --git a/src/lib.rs b/src/lib.rs index 340fcbf..d1a1421 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,12 +36,15 @@ macro_rules! blob_url_prefix { use blob_url_prefix; mod arrayvec; +pub mod callbacks; mod char_validator; +mod default_emitter; mod emitter; mod entities; mod error; #[cfg(feature = "html5ever")] mod html5ever_emitter; +mod htmlstring; mod machine; mod machine_helper; mod read_helper; @@ -54,10 +57,10 @@ mod utils; #[doc(hidden)] pub mod testutils; -pub use emitter::{ - naive_next_state, DefaultEmitter, Doctype, Emitter, EndTag, HtmlString, StartTag, Token, -}; +pub use default_emitter::{DefaultEmitter, Doctype, EndTag, StartTag, Token}; +pub use emitter::{naive_next_state, Emitter}; pub use error::Error; +pub use htmlstring::HtmlString; pub use reader::{IoReader, Readable, Reader, StringReader}; pub use state::State; pub use tokenizer::{InfallibleTokenizer, Tokenizer}; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 95d60f5..0d1908a 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -62,13 +62,32 @@ impl> Tokenizer { /// Some emitters don't ever produce any tokens and instead have other 
side effects. In those /// cases, you will find yourself writing code like this to handle errors: /// - /// ```norun - /// for _ in tokenizer { - /// result.unwrap(); - /// } /// ``` + /// use std::convert::Infallible; /// - /// This is a bit silly, so instead you can use `tokenizer.finish()`. + /// use html5gum::Tokenizer; + /// use html5gum::callbacks::{CallbackEvent, CallbackEmitter}; + /// + /// let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option<Infallible> { + /// if let CallbackEvent::String { value } = event { + /// println!("{}", String::from_utf8_lossy(value)); + /// } + /// + /// // We may choose to return any Option<T> (such as errors, or our own tokens), but since + /// // we do all the real work in the callback itself, we choose to use Option<Infallible>. + /// None + /// }); + /// + /// let tokenizer = Tokenizer::new_with_emitter("hello
world!", emitter); + /// + /// // this is a bit silly + /// // for _ in tokenizer { + /// // result.unwrap(); + /// // } + /// + /// // much better: + /// tokenizer.finish(); + /// ``` pub fn finish(self) -> Result<(), R::Error> { for result in self { result?; diff --git a/tests/html5lib_tree_builder.rs b/tests/html5lib_tree_builder.rs index eadc3ae..e5d1c3d 100644 --- a/tests/html5lib_tree_builder.rs +++ b/tests/html5lib_tree_builder.rs @@ -175,9 +175,7 @@ fn build_test(testcase: Testcase, fname: &str, i: usize, scripting: bool) -> Tri tokenizer.set_state(state); } - for result in tokenizer { - result.unwrap(); - } + tokenizer.finish().unwrap(); let rcdom = tree_builder.sink; let mut actual = String::new(); @@ -195,7 +193,7 @@ fn build_test(testcase: Testcase, fname: &str, i: usize, scripting: bool) -> Tri } let expected = testcase.document.unwrap(); - assert_eq!(expected, actual); + assert_eq!(actual, expected); }) }) }