Commit

add callbacks emitter and update readme (#91)
untitaker authored Oct 29, 2024
1 parent 884e961 commit 0959c9c
Showing 19 changed files with 1,428 additions and 948 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
@@ -10,6 +10,12 @@ updates:
    schedule:
      interval: daily
    open-pull-requests-limit: 10

  - package-ecosystem: cargo
    directory: "/fuzz"
    schedule:
      interval: daily
    open-pull-requests-limit: 10

  - package-ecosystem: gitsubmodule
    directory: "/"
6 changes: 6 additions & 0 deletions Cargo.toml
@@ -59,6 +59,12 @@ harness = false
name = "build_tree"
required-features = ["tree-builder"]

[[example]]
name = "custom_emitter"

[[example]]
name = "callback_emitter"

[[example]]
name = "scraper"
required-features = ["tree-builder"]
32 changes: 12 additions & 20 deletions README.md
@@ -30,6 +30,13 @@ for token in Tokenizer::new(html).infallible() {
assert_eq!(new_html, "<title>hello world</title>");
```

`html5gum` provides multiple kinds of APIs:

* Iterating over tokens as shown above.
* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example](examples/custom_emitter.rs).
* A callbacks-based API for a middleground between convenience and performance, see [the `callback_emitter.rs` example](examples/callback_emitter.rs).
* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example](examples/scraper.rs).

## What a tokenizer does and what it does not do

`html5gum` fully implements [13.2.5 of the WHATWG HTML
@@ -42,9 +49,6 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S
gracefully from invalid UTF-8.
* `html5gum` **does not** [correct mis-nested
tags.](https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser)
* `html5gum` **does not** recognize implicitly self-closing elements like
`<img>`, as a tokenizer it will simply emit a start token. It does however
emit a self-closing tag for `<img .. />`.
* `html5gum` doesn't implement the DOM, and unfortunately in the HTML spec,
constructing the DOM ("tree construction") influences how tokenization is
done. For an example of which problems this causes see [this example
@@ -54,23 +58,9 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S
21](https://github.com/untitaker/html5gum/issues/21).

With those caveats in mind, `html5gum` can pretty much ~~parse~~ _tokenize_
anything that browsers can.

## The `Emitter` trait

A distinguishing feature of `html5gum` is that you can bring your own token
datastructure and hook into token creation by implementing the `Emitter` trait.
This allows you to:

* Rewrite all per-HTML-tag allocations to use a custom allocator or datastructure.

* Efficiently filter out uninteresting categories of data without ever
  allocating for it. For example, if any plaintext between tokens is not of
  interest to you, you can implement the respective trait methods as no-ops and
  therefore avoid any overhead of creating plaintext tokens.

See [the `custom_emitter` example][examples/custom_emitter.rs] for what this
looks like in practice.
anything that browsers can. However, using the experimental `tree-builder`
feature, html5gum can be integrated with `html5ever` and `scraper`. See [the
`scraper.rs` example](examples/scraper.rs).

## Other features

@@ -116,3 +106,5 @@ Licensed under the MIT license, see [`./LICENSE`][LICENSE].
[LICENSE]: ./LICENSE
[examples/tokenize_with_state_switches.rs]: ./examples/tokenize_with_state_switches.rs
[examples/custom_emitter.rs]: ./examples/custom_emitter.rs
[examples/callback_emitter.rs]: ./examples/callback_emitter.rs
[examples/scraper.rs]: ./examples/scraper.rs
4 changes: 1 addition & 3 deletions examples/build_tree.rs
@@ -2,16 +2,14 @@
/// building logic and DOM implementation. The result is a technically complete HTML5 parser.
///
/// You may want to refer to `examples/scraper.rs` for better ergonomics.
use std::iter::repeat;

use html5ever::tree_builder::TreeBuilder;
use html5gum::{Html5everEmitter, IoReader, Tokenizer};
use markup5ever_rcdom::{Handle, NodeData, RcDom};

fn walk(indent: usize, handle: &Handle) {
    let node = handle;
    // FIXME: don't allocate
    print!("{}", repeat(" ").take(indent).collect::<String>());
    print!("{}", " ".repeat(indent));
    match node.data {
        NodeData::Document => println!("#Document"),

53 changes: 53 additions & 0 deletions examples/callback_emitter.rs
@@ -0,0 +1,53 @@
//! A slightly simpler, but less performant version of the link extractor that can be found in
//! `examples/custom_emitter.rs`.
//!
//! ```text
//! printf '<h1>Hello world!</h1><a href="foo">bar</a>' | cargo run --example=callback_emitter
//! ```
//!
//! Output:
//!
//! ```text
//! link: foo
//! ```
use html5gum::callbacks::{CallbackEmitter, CallbackEvent};
use html5gum::{Emitter, IoReader, Tokenizer};

fn get_emitter() -> impl Emitter<Token = String> {
    let mut is_anchor_tag = false;
    let mut is_href_attr = false;

    CallbackEmitter::new(move |event: CallbackEvent<'_>| match event {
        CallbackEvent::OpenStartTag { name } => {
            is_anchor_tag = name == b"a";
            is_href_attr = false;
            None
        }
        CallbackEvent::AttributeName { name } => {
            is_href_attr = name == b"href";
            None
        }
        CallbackEvent::AttributeValue { value } if is_anchor_tag && is_href_attr => {
            Some(String::from_utf8_lossy(value).into_owned())
        }
        _ => None,
    })
}

fn main() {
    for token in
        Tokenizer::new_with_emitter(IoReader::new(std::io::stdin().lock()), get_emitter()).flatten()
    {
        println!("link: {}", token);
    }
}

#[test]
fn basic() {
    let tokens: Vec<_> =
        Tokenizer::new_with_emitter("<h1>Hello world</h1><a href=foo>bar</a>", get_emitter())
            .flatten()
            .collect();

    assert_eq!(tokens, vec!["foo".to_owned()]);
}
4 changes: 3 additions & 1 deletion examples/scraper.rs
@@ -6,7 +6,9 @@
/// echo '<h1><span class=hello>Hello</span></h1>' | cargo run --all-features --example scraper
/// ```
///
/// Essentially, your HTML parsing will be powered by a combination of html5gum and html5ever.
/// Essentially, your HTML parsing will be powered by a combination of html5gum and html5ever. This
/// has no immediate benefit over using scraper normally and is mostly done as a transitional step
/// until html5gum has its own implementation of tree building and the DOM.
///
/// Requires the tree-builder feature.
use std::io::{stdin, Read};