diff --git a/README.md b/README.md index 9c7562e..57830b0 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,9 @@ assert_eq!(new_html, "hello world"); `html5gum` provides multiple kinds of APIs: * Iterating over tokens as shown above. -* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example](examples/custom_emitter.rs). -* A callbacks-based API for a middleground between convenience and performance, see [the `callback_emitter.rs` example](examples/callback_emitter.rs). -* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example](examples/scraper.rs). +* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example][examples/custom_emitter.rs]. +* A callbacks-based API for a middleground between convenience and performance, see [the `callback_emitter.rs` example][examples/callback_emitter.rs]. +* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example][examples/scraper.rs]. ## What a tokenizer does and what it does not do @@ -60,7 +60,7 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S With those caveats in mind, `html5gum` can pretty much ~parse~ _tokenize_ anything that browsers can. However, using the experimental `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the -`scraper.rs` example](examples/scraper.rs). +`scraper.rs` example][examples/scraper.rs]. ## Other features diff --git a/examples/build_tree.rs b/examples/build_tree.rs index 97ca5b7..8314804 100644 --- a/examples/build_tree.rs +++ b/examples/build_tree.rs @@ -3,7 +3,8 @@ /// /// You may want to refer to `examples/scraper.rs` for better ergonomics. 
use html5ever::tree_builder::TreeBuilder; -use html5gum::{Html5everEmitter, IoReader, Tokenizer}; +use html5gum::emitters::html5ever::Html5everEmitter; +use html5gum::{IoReader, Tokenizer}; use markup5ever_rcdom::{Handle, NodeData, RcDom}; fn walk(indent: usize, handle: &Handle) { diff --git a/examples/callback_emitter.rs b/examples/callback_emitter.rs index de392d6..a9bbe93 100644 --- a/examples/callback_emitter.rs +++ b/examples/callback_emitter.rs @@ -10,7 +10,7 @@ //! ```text //! link: foo //! ``` -use html5gum::callbacks::{CallbackEmitter, CallbackEvent}; +use html5gum::emitters::callback::{CallbackEmitter, CallbackEvent}; use html5gum::{Emitter, IoReader, Tokenizer}; fn get_emitter() -> impl Emitter { diff --git a/examples/scraper.rs b/examples/scraper.rs index fd3a04d..1c85fbe 100644 --- a/examples/scraper.rs +++ b/examples/scraper.rs @@ -14,7 +14,8 @@ use std::io::{stdin, Read}; use html5ever::tree_builder::TreeBuilder; -use html5gum::{Html5everEmitter, IoReader, Tokenizer}; +use html5gum::emitters::html5ever::Html5everEmitter; +use html5gum::{IoReader, Tokenizer}; use scraper::{Html, Selector}; use argh::FromArgs; diff --git a/src/callbacks.rs b/src/emitters/callback.rs similarity index 94% rename from src/callbacks.rs rename to src/emitters/callback.rs index 5b3f33c..834cc2f 100644 --- a/src/callbacks.rs +++ b/src/emitters/callback.rs @@ -1,14 +1,14 @@ //! Consume the parsed HTML as a series of events through a callback. //! -//! While using the [DefaultEmitter] provides an easy-to-use API with low performance, and -//! implementing your own [Emitter] brings maximal performance and maximal pain, this is a middle +//! While using the [crate::DefaultEmitter] provides an easy-to-use API with low performance, and +//! implementing your own [crate::Emitter] brings maximal performance and maximal pain, this is a middle //! ground. All strings are borrowed from some intermediate buffer instead of individually //! allocated. //! //! ``` //! 
// Extract all text between span tags, in a naive (but fast) way. Does not handle tags inside of the span. See `examples/` as well. //! use html5gum::Tokenizer; -//! use html5gum::callbacks::{CallbackEvent, CallbackEmitter}; +//! use html5gum::emitters::callback::{CallbackEvent, CallbackEmitter}; //! //! let mut is_in_span = false; //! let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option> { @@ -45,7 +45,7 @@ use crate::{naive_next_state, Emitter, Error, State}; /// Events used by [CallbackEmitter]. /// -/// This operates at a slightly lower level than [Token], as start tags are split up into multiple +/// This operates at a slightly lower level than [crate::Token], as start tags are split up into multiple /// events. #[derive(Debug)] pub enum CallbackEvent<'a> { @@ -70,8 +70,8 @@ pub enum CallbackEvent<'a> { /// /// Things like whitespace, quote handling is taken care of. /// - /// After this event, the start tag may be closed using [CloseStartTag], or another - /// [AttributeName] may follow. + /// After this event, the start tag may be closed using `CloseStartTag`, or another + /// `AttributeName` may follow. AttributeValue { /// The value of the attribute. value: &'a [u8], @@ -87,7 +87,7 @@ pub enum CallbackEvent<'a> { self_closing: bool, }, - /// Visit `"". + /// Visit `""`. /// /// Note: Because of strangeness in the HTML spec, attributes may be observed outside of start /// tags, before this event. It's best to ignore them as they are not valid HTML, but can still @@ -146,7 +146,7 @@ struct CallbackState { /// type. pub trait Callback { /// Perform some action on a parsing event, and, optionally, return a value that can be yielded - /// from the [Tokenizer] iterator. + /// from the [crate::Tokenizer] iterator. 
fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option; } @@ -207,7 +207,8 @@ struct EmitterState { doctype_force_quirks: bool, } -/// The emitter class to pass to [Tokenizer::new_with_emitter] +/// The emitter class to pass to [crate::Tokenizer::new_with_emitter]. Please refer to the +/// module-level documentation on [crate::emitters::callback] for usage. #[derive(Debug)] pub struct CallbackEmitter { // this struct is only split out so [CallbackState::emit_event] can borrow things concurrently @@ -232,10 +233,10 @@ impl CallbackEmitter where F: Callback, { - /// Create a new emitter. See type-level docs to understand basic usage. + /// Create a new emitter. /// /// The given callback may return optional tokens that then become available through the - /// [Tokenizer]'s iterator. If that's not used, return [Option]. + /// [crate::Tokenizer]'s iterator. If that's not used, return `Option`. pub fn new(callback: F) -> Self { CallbackEmitter { callback_state: CallbackState { diff --git a/src/default_emitter.rs b/src/emitters/default.rs similarity index 93% rename from src/default_emitter.rs rename to src/emitters/default.rs index b245795..4cdf0bb 100644 --- a/src/default_emitter.rs +++ b/src/emitters/default.rs @@ -1,10 +1,11 @@ +//! The default emitter is what powers the simple SAX-like API that you see in the README. use std::collections::btree_map::Entry; use std::collections::BTreeMap; use std::mem::take; use crate::{Emitter, Error, HtmlString, State}; -use crate::callbacks::{Callback, CallbackEmitter, CallbackEvent}; +use crate::emitters::callback::{Callback, CallbackEmitter, CallbackEvent}; #[derive(Debug, Default)] struct OurCallback { @@ -71,14 +72,15 @@ impl Callback for OurCallback { } } -/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens. +/// This is the emitter you implicitly use with [crate::Tokenizer::new]. Refer to the [crate +/// docs](crate) for what usage looks like. 
#[derive(Default, Debug)] pub struct DefaultEmitter { inner: CallbackEmitter, } impl DefaultEmitter { - /// Whether to use [`naive_next_state`] to switch states automatically. + /// Whether to use [crate::naive_next_state] to switch states automatically. /// /// The default is off. pub fn naively_switch_states(&mut self, yes: bool) { @@ -86,10 +88,11 @@ impl DefaultEmitter { } } -// opaque type around inner emitter impl Emitter for DefaultEmitter { type Token = Token; + // opaque type around inner emitter + fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { self.inner.set_last_start_tag(last_start_tag) } @@ -202,7 +205,7 @@ impl Emitter for DefaultEmitter { /// A HTML end/close tag, such as `

` or ``. #[derive(Debug, Default, Eq, PartialEq, Clone)] pub struct StartTag { - /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be + /// Whether this tag is self-closing. If it is self-closing, no following [EndTag] should be /// expected. pub self_closing: bool, @@ -212,7 +215,7 @@ pub struct StartTag { /// A mapping for any HTML attributes this start tag may have. /// /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own - /// [`Emitter`] to tweak this behavior. + /// [crate::Emitter] to tweak this behavior. pub attributes: BTreeMap, } diff --git a/src/emitter.rs b/src/emitters/emitter.rs similarity index 100% rename from src/emitter.rs rename to src/emitters/emitter.rs diff --git a/src/html5ever_emitter.rs b/src/emitters/html5ever.rs similarity index 84% rename from src/html5ever_emitter.rs rename to src/emitters/html5ever.rs index 4398427..75ed9bf 100644 --- a/src/html5ever_emitter.rs +++ b/src/emitters/html5ever.rs @@ -1,6 +1,7 @@ +//! See [`examples/scraper.rs`] for usage. 
use std::convert::Infallible; -use crate::callbacks::{Callback, CallbackEmitter, CallbackEvent}; +use crate::emitters::callback::{Callback, CallbackEmitter, CallbackEvent}; use crate::utils::trace_log; use crate::{Emitter, Error, State}; @@ -175,27 +176,30 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { } fn emit_error(&mut self, error: Error) { - self.emitter_inner.emit_error(error); + self.emitter_inner.emit_error(error) } - fn pop_token(&mut self) -> Option { - None + fn should_emit_errors(&mut self) -> bool { + self.emitter_inner.should_emit_errors() } + fn pop_token(&mut self) -> Option { + self.emitter_inner.pop_token() + } fn emit_string(&mut self, c: &[u8]) { - self.emitter_inner.emit_string(c); + self.emitter_inner.emit_string(c) } fn init_start_tag(&mut self) { - self.emitter_inner.init_start_tag(); + self.emitter_inner.init_start_tag() } fn init_end_tag(&mut self) { - self.emitter_inner.init_end_tag(); + self.emitter_inner.init_end_tag() } fn init_comment(&mut self) { - self.emitter_inner.init_comment(); + self.emitter_inner.init_comment() } fn emit_current_tag(&mut self) -> Option { @@ -204,63 +208,63 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> { } fn emit_current_comment(&mut self) { - self.emitter_inner.emit_current_comment(); + self.emitter_inner.emit_current_comment() } fn emit_current_doctype(&mut self) { - self.emitter_inner.emit_current_doctype(); + self.emitter_inner.emit_current_doctype() } fn set_self_closing(&mut self) { - self.emitter_inner.set_self_closing(); + self.emitter_inner.set_self_closing() } fn set_force_quirks(&mut self) { - self.emitter_inner.set_force_quirks(); + self.emitter_inner.set_force_quirks() } fn push_tag_name(&mut self, s: &[u8]) { - self.emitter_inner.push_tag_name(s); + self.emitter_inner.push_tag_name(s) } fn push_comment(&mut self, s: &[u8]) { - self.emitter_inner.push_comment(s); + self.emitter_inner.push_comment(s) } fn push_doctype_name(&mut self, s: &[u8]) { - 
self.emitter_inner.push_doctype_name(s); + self.emitter_inner.push_doctype_name(s) } fn init_doctype(&mut self) { - self.emitter_inner.init_doctype(); + self.emitter_inner.init_doctype() } fn init_attribute(&mut self) { - self.emitter_inner.init_attribute(); + self.emitter_inner.init_attribute() } fn push_attribute_name(&mut self, s: &[u8]) { - self.emitter_inner.push_attribute_name(s); + self.emitter_inner.push_attribute_name(s) } fn push_attribute_value(&mut self, s: &[u8]) { - self.emitter_inner.push_attribute_value(s); + self.emitter_inner.push_attribute_value(s) } fn set_doctype_public_identifier(&mut self, value: &[u8]) { - self.emitter_inner.set_doctype_public_identifier(value); + self.emitter_inner.set_doctype_public_identifier(value) } fn set_doctype_system_identifier(&mut self, value: &[u8]) { - self.emitter_inner.set_doctype_system_identifier(value); + self.emitter_inner.set_doctype_system_identifier(value) } - fn push_doctype_public_identifier(&mut self, value: &[u8]) { - self.emitter_inner.push_doctype_public_identifier(value); + fn push_doctype_public_identifier(&mut self, s: &[u8]) { + self.emitter_inner.push_doctype_public_identifier(s) } - fn push_doctype_system_identifier(&mut self, value: &[u8]) { - self.emitter_inner.push_doctype_system_identifier(value); + fn push_doctype_system_identifier(&mut self, s: &[u8]) { + self.emitter_inner.push_doctype_system_identifier(s) } fn current_is_appropriate_end_tag_token(&mut self) -> bool { diff --git a/src/emitters/mod.rs b/src/emitters/mod.rs new file mode 100644 index 0000000..2775fc8 --- /dev/null +++ b/src/emitters/mod.rs @@ -0,0 +1,25 @@ +//! [Emitter] is a "visitor" on the underlying token stream. +//! +//! When html5gum parses HTML, it (more specifically, the [crate::Tokenizer]) calls into emitters to keep +//! track of state and to produce output. +//! +//! Emitters can yield control to the _caller_ of the tokenizer by emitting tokens in +//! [Emitter::pop_token]. 
This is what powers the basic API where users just iterate over +//! [crate::Tokenizer] which is an iterator over [default::Token]. +//! +//! Most performant implementations don't implement `pop_token` and instead hold internal mutable +//! state, or directly produce side effects. +//! +//! Emitters are "a way to consume parsing results." The following ways are available: +//! +//! * [default::DefaultEmitter], if you don't care about speed and only want convenience. +//! * [callback::CallbackEmitter], if you can deal with some lifetime problems in exchange for way fewer allocations. +//! * Implementing your own [Emitter] for maximum performance and maximum pain. +pub mod callback; +pub mod default; +#[cfg(feature = "html5ever")] +pub mod html5ever; + +mod emitter; + +pub use emitter::{naive_next_state, Emitter}; diff --git a/src/lib.rs b/src/lib.rs index d1a1421..51af1e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,8 @@ #![doc = concat!("[LICENSE]: ", blob_url_prefix!(), "LICENSE")] #![doc = concat!("[examples/tokenize_with_state_switches.rs]: ", blob_url_prefix!(), "examples/tokenize_with_state_switches.rs")] #![doc = concat!("[examples/custom_emitter.rs]: ", blob_url_prefix!(), "examples/custom_emitter.rs")] +#![doc = concat!("[examples/callback_emitter.rs]: ", blob_url_prefix!(), "examples/callback_emitter.rs")] +#![doc = concat!("[examples/scraper.rs]: ", blob_url_prefix!(), "examples/scraper.rs")] #![doc = include_str!("../README.md")] // #![warn(clippy::all)] @@ -36,14 +38,10 @@ macro_rules! 
blob_url_prefix { use blob_url_prefix; mod arrayvec; -pub mod callbacks; mod char_validator; -mod default_emitter; -mod emitter; +pub mod emitters; mod entities; mod error; -#[cfg(feature = "html5ever")] -mod html5ever_emitter; mod htmlstring; mod machine; mod machine_helper; @@ -57,13 +55,10 @@ mod utils; #[doc(hidden)] pub mod testutils; -pub use default_emitter::{DefaultEmitter, Doctype, EndTag, StartTag, Token}; -pub use emitter::{naive_next_state, Emitter}; +pub use emitters::default::{DefaultEmitter, Doctype, EndTag, StartTag, Token}; +pub use emitters::{naive_next_state, Emitter}; pub use error::Error; pub use htmlstring::HtmlString; pub use reader::{IoReader, Readable, Reader, StringReader}; pub use state::State; pub use tokenizer::{InfallibleTokenizer, Tokenizer}; - -#[cfg(feature = "html5ever")] -pub use html5ever_emitter::Html5everEmitter; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 0d1908a..b355c1b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -66,7 +66,7 @@ impl> Tokenizer { /// use std::convert::Infallible; /// /// use html5gum::Tokenizer; - /// use html5gum::callbacks::{CallbackEvent, CallbackEmitter}; + /// use html5gum::emitters::callback::{CallbackEvent, CallbackEmitter}; /// /// let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option { /// if let CallbackEvent::String { value } = event { diff --git a/tests/html5lib_tree_builder.rs b/tests/html5lib_tree_builder.rs index e5d1c3d..302a01d 100644 --- a/tests/html5lib_tree_builder.rs +++ b/tests/html5lib_tree_builder.rs @@ -22,7 +22,8 @@ use html5ever::tokenizer::states::{RawKind, State}; use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts}; use html5ever::{namespace_url, ns}; use html5ever::{LocalName, QualName}; -use html5gum::{testutils::trace_log, Html5everEmitter, Tokenizer}; +use html5gum::emitters::html5ever::Html5everEmitter; +use html5gum::{testutils::trace_log, Tokenizer}; use markup5ever_rcdom::{Handle, NodeData, RcDom}; use 
pretty_assertions::assert_eq;