From d6a0a3ee1f2e155560839ab5c7bd489edfc9c209 Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Mon, 31 Jul 2023 03:13:38 +0200 Subject: [PATCH] States as functions (#70) --- src/machine.rs | 1234 ++++++++++++++++++++++++----------------- src/machine_helper.rs | 158 ++++-- src/state.rs | 97 ---- src/tokenizer.rs | 11 +- src/utils.rs | 5 - 5 files changed, 857 insertions(+), 648 deletions(-) diff --git a/src/machine.rs b/src/machine.rs index b85b445..5f46dd8 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -1,27 +1,39 @@ use crate::entities::try_read_character_reference; use crate::machine_helper::{ cont, emit_current_tag_and_switch_to, enter_state, eof, error, error_immediate, exit_state, - mutate_character_reference, read_byte, reconsume_in, switch_to, + mutate_character_reference, read_byte, reconsume_in, reconsume_in_return_state, switch_to, + ControlToken, }; use crate::read_helper::{fast_read_char, slow_read_byte}; -use crate::state::MachineState as State; -use crate::utils::{ctostr, noncharacter_pat, surrogate_pat, with_lowercase_str, ControlToken}; +use crate::utils::{ctostr, noncharacter_pat, surrogate_pat, with_lowercase_str}; use crate::{Emitter, Error, Reader, Tokenizer}; -// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that -// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance -pub(crate) fn consume( - slf: &mut Tokenizer, -) -> Result { - match slf.machine_helper.state() { - State::Data => fast_read_char!( +macro_rules! define_state { + ($state:ident, $slf:ident, $($body:tt)*) => { + #[allow(non_snake_case)] + pub(crate) mod $state { + use super::*; + + #[inline(always)] + pub(crate) fn run($slf: &mut Tokenizer) -> Result, R::Error> { + $($body)* + } + } + }; +} + +pub(crate) mod states { + use super::*; + + define_state!(Data, slf, { + fast_read_char!( slf, match xs { Some(b"&") => { - enter_state!(slf, State::CharacterReference) + enter_state!(slf, CharacterReference, false) } Some(b"<") => { - switch_to!(slf, State::TagOpen) + switch_to!(slf, TagOpen)?.inline_next_state(slf) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -36,16 +48,18 @@ pub(crate) fn consume( eof!() } } - ), + ) + }); - State::RcData => fast_read_char!( + define_state!(RcData, slf, { + fast_read_char!( slf, match xs { Some(b"&") => { - enter_state!(slf, State::CharacterReference) + enter_state!(slf, CharacterReference, false) } Some(b"<") => { - switch_to!(slf, State::RcDataLessThanSign) + switch_to!(slf, RcDataLessThanSign) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -60,12 +74,15 @@ pub(crate) fn consume( eof!() } } - ), - State::RawText => fast_read_char!( + ) + }); + + define_state!(RawText, slf, { + fast_read_char!( slf, match xs { Some(b"<") => { - switch_to!(slf, State::RawTextLessThanSign) + switch_to!(slf, RawTextLessThanSign) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -80,12 +97,15 @@ pub(crate) fn consume( eof!() } } - ), - State::ScriptData => fast_read_char!( + ) + }); + + define_state!(ScriptData, slf, { + fast_read_char!( slf, match xs { Some(b"<") => { - switch_to!(slf, State::ScriptDataLessThanSign) + switch_to!(slf, ScriptDataLessThanSign) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -100,8 +120,11 @@ pub(crate) fn consume( eof!() } } - ), - State::PlainText => fast_read_char!( + ) + }); + + define_state!(PlainText, slf, { + fast_read_char!( slf, match xs { Some(b"\0") => { @@ -117,24 +140,27 @@ pub(crate) fn consume( eof!() } } - ), - State::TagOpen => slow_read_byte!( + ) + }); + + define_state!(TagOpen, slf, { + slow_read_byte!( slf, match c { Some(b'!') => { - switch_to!(slf, State::MarkupDeclarationOpen) + switch_to!(slf, MarkupDeclarationOpen) } Some(b'/') => { - switch_to!(slf, State::EndTagOpen) + switch_to!(slf, EndTagOpen)?.inline_next_state(slf) } Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_start_tag(); - reconsume_in!(slf, Some(x), State::TagName) + reconsume_in!(slf, Some(x), TagName)?.inline_next_state(slf) } c @ Some(b'?') => { error!(slf, Error::UnexpectedQuestionMarkInsteadOfTagName); slf.emitter.init_comment(); - reconsume_in!(slf, c, State::BogusComment) + reconsume_in!(slf, c, BogusComment) } None => { error!(slf, Error::EofBeforeTagName); @@ -144,20 +170,23 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::InvalidFirstCharacterOfTagName); slf.emitter.emit_string(b"<"); - reconsume_in!(slf, c, State::Data) + reconsume_in!(slf, c, Data) } } - ), - State::EndTagOpen => slow_read_byte!( + ) + }); + + define_state!(EndTagOpen, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_end_tag(); - reconsume_in!(slf, Some(x), State::TagName) + reconsume_in!(slf, Some(x), TagName) } Some(b'>') => { error!(slf, Error::MissingEndTagName); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofBeforeTagName); @@ -167,21 +196,25 @@ pub(crate) fn consume( Some(x) => { error!(slf, Error::InvalidFirstCharacterOfTagName); slf.emitter.init_comment(); - reconsume_in!(slf, Some(x), State::BogusComment) + reconsume_in!(slf, Some(x), BogusComment) } } - ), - State::TagName => fast_read_char!( + ) + }); + + define_state!(TagName, slf, { + fast_read_char!( slf, match xs { Some(b"\t" | b"\x0A" | b"\x0C" | b" ") => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b"/") => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b">") => { - emit_current_tag_and_switch_to!(slf, State::Data) + // candidate for inline_next_state except it'd be cyclic + emit_current_tag_and_switch_to!(slf, Data) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -201,46 +234,55 @@ pub(crate) fn consume( eof!() } } - ), - State::RcDataLessThanSign => slow_read_byte!( + ) + }); + + define_state!(RcDataLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'/') => { slf.machine_helper.temporary_buffer.clear(); - switch_to!(slf, State::RcDataEndTagOpen) + switch_to!(slf, RcDataEndTagOpen) } c => { slf.emitter.emit_string(b"<"); - reconsume_in!(slf, c, State::RcData) + reconsume_in!(slf, c, RcData) } } - ), - State::RcDataEndTagOpen => slow_read_byte!( + ) + }); + + define_state!(RcDataEndTagOpen, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_end_tag(); - reconsume_in!(slf, Some(x), State::RcDataEndTagName) + reconsume_in!(slf, Some(x), RcDataEndTagName) } c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(RcDataEndTagName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b'/') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b'>') if slf.emitter.current_is_appropriate_end_tag_token() => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } Some(x) if x.is_ascii_alphabetic() => { slf.emitter.push_tag_name(&[x.to_ascii_lowercase()]); @@ -250,49 +292,58 @@ pub(crate) fn consume( c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(RawTextLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'/') => { slf.machine_helper.temporary_buffer.clear(); - switch_to!(slf, State::RawTextEndTagOpen) + switch_to!(slf, RawTextEndTagOpen) } c => { slf.emitter.emit_string(b"<"); - reconsume_in!(slf, c, State::RawText) + reconsume_in!(slf, c, RawText) } } - ), - State::RawTextEndTagOpen => slow_read_byte!( + ) + }); + + define_state!(RawTextEndTagOpen, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_end_tag(); - reconsume_in!(slf, Some(x), State::RawTextEndTagName) + reconsume_in!(slf, Some(x), RawTextEndTagName) } c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(RawTextEndTagName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b'/') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b'>') if slf.emitter.current_is_appropriate_end_tag_token() => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } Some(x) if x.is_ascii_alphabetic() => { slf.emitter.push_tag_name(&[x.to_ascii_lowercase()]); @@ -302,53 +353,62 @@ pub(crate) fn consume( c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(ScriptDataLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'/') => { slf.machine_helper.temporary_buffer.clear(); - switch_to!(slf, State::ScriptDataEndTagOpen) + switch_to!(slf, ScriptDataEndTagOpen) } Some(b'!') => { slf.emitter.emit_string(b" { slf.emitter.emit_string(b"<"); - reconsume_in!(slf, c, State::ScriptData) + reconsume_in!(slf, c, ScriptData) } } - ), - State::ScriptDataEndTagOpen => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEndTagOpen, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_end_tag(); - reconsume_in!(slf, Some(x), State::ScriptDataEndTagName) + reconsume_in!(slf, Some(x), ScriptDataEndTagName) } c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(ScriptDataEndTagName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b'/') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b'>') if slf.emitter.current_is_appropriate_end_tag_token() => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } Some(x) if x.is_ascii_alphabetic() => { slf.emitter.push_tag_name(&[x.to_ascii_lowercase()]); @@ -360,43 +420,52 @@ pub(crate) fn consume( c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapeStart, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataEscapeStartDash) + switch_to!(slf, ScriptDataEscapeStartDash) } c => { - reconsume_in!(slf, c, State::ScriptData) + reconsume_in!(slf, c, ScriptData) } } - ), - State::ScriptDataEscapeStartDash => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapeStartDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataEscapedDashDash) + switch_to!(slf, ScriptDataEscapedDashDash) } c => { - reconsume_in!(slf, c, State::ScriptData) + reconsume_in!(slf, c, ScriptData) } } - ), - State::ScriptDataEscaped => fast_read_char!( + ) + }); + + define_state!(ScriptDataEscaped, slf, { + fast_read_char!( slf, match xs { Some(b"-") => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataEscapedDash) + switch_to!(slf, ScriptDataEscapedDash) } Some(b"<") => { - switch_to!(slf, State::ScriptDataEscapedLessThanSign) + switch_to!(slf, ScriptDataEscapedLessThanSign) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -412,33 +481,39 @@ pub(crate) fn consume( eof!() } } - ), - State::ScriptDataEscapedDash => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapedDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataEscapedDashDash) + switch_to!(slf, ScriptDataEscapedDashDash) } Some(b'<') => { - switch_to!(slf, State::ScriptDataEscapedLessThanSign) + switch_to!(slf, ScriptDataEscapedLessThanSign) } Some(b'\0') => { error!(slf, Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}".as_bytes()); - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } Some(x) => { slf.emitter.emit_string(&[x]); - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } None => { error!(slf, Error::EofInScriptHtmlCommentLikeText); eof!() } } - ), - State::ScriptDataEscapedDashDash => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapedDashDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { @@ -446,71 +521,80 @@ pub(crate) fn consume( cont!() } Some(b'<') => { - switch_to!(slf, State::ScriptDataEscapedLessThanSign) + switch_to!(slf, ScriptDataEscapedLessThanSign) } Some(b'>') => { slf.emitter.emit_string(b">"); - switch_to!(slf, State::ScriptData) + switch_to!(slf, ScriptData) } Some(b'\0') => { error!(slf, Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}".as_bytes()); - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } Some(x) => { slf.emitter.emit_string(&[x]); - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } None => { error!(slf, Error::EofInScriptHtmlCommentLikeText); eof!() } } - ), - State::ScriptDataEscapedLessThanSign => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapedLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'/') => { slf.machine_helper.temporary_buffer.clear(); - switch_to!(slf, State::ScriptDataEscapedEndTagOpen) + switch_to!(slf, ScriptDataEscapedEndTagOpen) } Some(x) if x.is_ascii_alphabetic() => { slf.machine_helper.temporary_buffer.clear(); slf.emitter.emit_string(b"<"); - reconsume_in!(slf, Some(x), State::ScriptDataDoubleEscapeStart) + reconsume_in!(slf, Some(x), ScriptDataDoubleEscapeStart) } c => { slf.emitter.emit_string(b"<"); - reconsume_in!(slf, c, State::ScriptDataEscaped) + reconsume_in!(slf, c, ScriptDataEscaped) } } - ), - State::ScriptDataEscapedEndTagOpen => slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapedEndTagOpen, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphabetic() => { slf.emitter.init_end_tag(); - reconsume_in!(slf, Some(x), State::ScriptDataEscapedEndTagName) + reconsume_in!(slf, Some(x), ScriptDataEscapedEndTagName) } c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(ScriptDataEscapedEndTagName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b'/') if slf.emitter.current_is_appropriate_end_tag_token() => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b'>') if slf.emitter.current_is_appropriate_end_tag_token() => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } Some(x) if x.is_ascii_alphabetic() => { slf.emitter.push_tag_name(&[x.to_ascii_lowercase()]); @@ -520,19 +604,22 @@ pub(crate) fn consume( c => { slf.emitter.emit_string(b" slow_read_byte!( + ) + }); + + define_state!(ScriptDataDoubleEscapeStart, slf, { + slow_read_byte!( slf, match c { Some(x @ (b'\t' | b'\x0A' | b'\x0C' | b' ' | b'/' | b'>')) => { slf.emitter.emit_string(&[x]); if slf.machine_helper.temporary_buffer == b"script" { - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } else { - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } } Some(x) if x.is_ascii_alphabetic() => { @@ -543,20 +630,23 @@ pub(crate) fn consume( cont!() } c => { - reconsume_in!(slf, c, State::ScriptDataEscaped) + reconsume_in!(slf, c, ScriptDataEscaped) } } - ), - State::ScriptDataDoubleEscaped => fast_read_char!( + ) + }); + + define_state!(ScriptDataDoubleEscaped, slf, { + fast_read_char!( slf, match xs { Some(b"-") => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataDoubleEscapedDash) + switch_to!(slf, ScriptDataDoubleEscapedDash) } Some(b"<") => { slf.emitter.emit_string(b"<"); - switch_to!(slf, State::ScriptDataDoubleEscapedLessThanSign) + switch_to!(slf, ScriptDataDoubleEscapedLessThanSign) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -572,34 +662,40 @@ pub(crate) fn consume( eof!() } } - ), - State::ScriptDataDoubleEscapedDash => slow_read_byte!( + ) + }); + + define_state!(ScriptDataDoubleEscapedDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { slf.emitter.emit_string(b"-"); - switch_to!(slf, State::ScriptDataDoubleEscapedDashDash) + switch_to!(slf, ScriptDataDoubleEscapedDashDash) } Some(b'<') => { slf.emitter.emit_string(b"<"); - switch_to!(slf, State::ScriptDataDoubleEscapedLessThanSign) + switch_to!(slf, ScriptDataDoubleEscapedLessThanSign) } Some(b'\0') => { error!(slf, Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}".as_bytes()); - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } Some(x) => { slf.emitter.emit_string(&[x]); - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } None => { error!(slf, Error::EofInScriptHtmlCommentLikeText); eof!() } } - ), - State::ScriptDataDoubleEscapedDashDash => slow_read_byte!( + ) + }); + + define_state!(ScriptDataDoubleEscapedDashDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { @@ -608,50 +704,56 @@ pub(crate) fn consume( } Some(b'<') => { slf.emitter.emit_string(b"<"); - switch_to!(slf, State::ScriptDataDoubleEscapedLessThanSign) + switch_to!(slf, ScriptDataDoubleEscapedLessThanSign) } Some(b'>') => { slf.emitter.emit_string(b">"); - switch_to!(slf, State::ScriptData) + switch_to!(slf, ScriptData) } Some(b'\0') => { error!(slf, Error::UnexpectedNullCharacter); slf.emitter.emit_string("\u{fffd}".as_bytes()); - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } Some(x) => { slf.emitter.emit_string(&[x]); - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } None => { error!(slf, Error::EofInScriptHtmlCommentLikeText); eof!() } } - ), - State::ScriptDataDoubleEscapedLessThanSign => slow_read_byte!( + ) + }); + + define_state!(ScriptDataDoubleEscapedLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'/') => { slf.machine_helper.temporary_buffer.clear(); slf.emitter.emit_string(b"/"); - switch_to!(slf, State::ScriptDataDoubleEscapeEnd) + switch_to!(slf, ScriptDataDoubleEscapeEnd) } c => { - reconsume_in!(slf, c, State::ScriptDataDoubleEscaped) + reconsume_in!(slf, c, ScriptDataDoubleEscaped) } } - ), - State::ScriptDataDoubleEscapeEnd => slow_read_byte!( + ) + }); + + define_state!(ScriptDataDoubleEscapeEnd, slf, { + slow_read_byte!( slf, match c { Some(x @ (b'\t' | b'\x0A' | b'\x0C' | b' ' | b'/' | b'>')) => { slf.emitter.emit_string(&[x]); if slf.machine_helper.temporary_buffer == b"script" { - switch_to!(slf, State::ScriptDataEscaped) + switch_to!(slf, ScriptDataEscaped) } else { - switch_to!(slf, State::ScriptDataDoubleEscaped) + switch_to!(slf, ScriptDataDoubleEscaped) } } Some(x) if x.is_ascii_alphabetic() => { @@ -662,37 +764,43 @@ pub(crate) fn consume( cont!() } c => { - reconsume_in!(slf, c, State::ScriptDataDoubleEscaped) + reconsume_in!(slf, c, ScriptDataDoubleEscaped) } } - ), - State::BeforeAttributeName => slow_read_byte!( + ) + }); + + define_state!(BeforeAttributeName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), c @ (Some(b'/' | b'>') | None) => { - reconsume_in!(slf, c, State::AfterAttributeName) + reconsume_in!(slf, c, AfterAttributeName)?.inline_next_state(slf) } Some(b'=') => { error!(slf, Error::UnexpectedEqualsSignBeforeAttributeName); slf.emitter.init_attribute(); slf.emitter.push_attribute_name("=".as_bytes()); - switch_to!(slf, State::AttributeName) + switch_to!(slf, AttributeName) } Some(x) => { slf.emitter.init_attribute(); - reconsume_in!(slf, Some(x), State::AttributeName) + reconsume_in!(slf, Some(x), AttributeName)?.inline_next_state(slf) } } - ), - State::AttributeName => fast_read_char!( + ) + }); + + define_state!(AttributeName, slf, { + fast_read_char!( slf, match xs { Some(b"\t" | b"\x0A" | b"\x0C" | b" " | b"/" | b">") => { - reconsume_in!(slf, Some(xs.unwrap()[0]), State::AfterAttributeName) + reconsume_in!(slf, Some(xs.unwrap()[0]), AfterAttributeName) } Some(b"=") => { - switch_to!(slf, State::BeforeAttributeValue) + switch_to!(slf, BeforeAttributeValue)?.inline_next_state(slf) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -712,22 +820,25 @@ pub(crate) fn consume( cont!() } None => { - reconsume_in!(slf, None, State::AfterAttributeName) + reconsume_in!(slf, None, AfterAttributeName) } } - ), - State::AfterAttributeName => slow_read_byte!( + ) + }); + + define_state!(AfterAttributeName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'/') => { - switch_to!(slf, State::SelfClosingStartTag) + switch_to!(slf, SelfClosingStartTag) } Some(b'=') => { - switch_to!(slf, State::BeforeAttributeValue) + switch_to!(slf, BeforeAttributeValue) } Some(b'>') => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } None => { error!(slf, Error::EofInTag); @@ -735,37 +846,43 @@ pub(crate) fn consume( } Some(x) => { slf.emitter.init_attribute(); - reconsume_in!(slf, Some(x), State::AttributeName) + reconsume_in!(slf, Some(x), AttributeName) } } - ), - State::BeforeAttributeValue => slow_read_byte!( + ) + }); + + define_state!(BeforeAttributeValue, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'"') => { - switch_to!(slf, State::AttributeValueDoubleQuoted) + switch_to!(slf, AttributeValueDoubleQuoted)?.inline_next_state(slf) } Some(b'\'') => { - switch_to!(slf, State::AttributeValueSingleQuoted) + switch_to!(slf, AttributeValueSingleQuoted) } Some(b'>') => { error!(slf, Error::MissingAttributeValue); - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } c => { - reconsume_in!(slf, c, State::AttributeValueUnquoted) + reconsume_in!(slf, c, AttributeValueUnquoted) } } - ), - State::AttributeValueDoubleQuoted => fast_read_char!( + ) + }); + + define_state!(AttributeValueDoubleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"\"") => { - switch_to!(slf, State::AfterAttributeValueQuoted) + switch_to!(slf, AfterAttributeValueQuoted)?.inline_next_state(slf) } Some(b"&") => { - enter_state!(slf, State::CharacterReference) + enter_state!(slf, CharacterReference, true) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -781,15 +898,18 @@ pub(crate) fn consume( eof!() } } - ), - State::AttributeValueSingleQuoted => fast_read_char!( + ) + }); + + define_state!(AttributeValueSingleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"'") => { - switch_to!(slf, State::AfterAttributeValueQuoted) + switch_to!(slf, AfterAttributeValueQuoted) } Some(b"&") => { - enter_state!(slf, State::CharacterReference) + enter_state!(slf, CharacterReference, true) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -805,18 +925,21 @@ pub(crate) fn consume( eof!() } } - ), - State::AttributeValueUnquoted => fast_read_char!( + ) + }); + + define_state!(AttributeValueUnquoted, slf, { + fast_read_char!( slf, match xs { Some(b"\t" | b"\x0A" | b"\x0C" | b" ") => { - switch_to!(slf, State::BeforeAttributeName) + switch_to!(slf, BeforeAttributeName) } Some(b"&") => { - enter_state!(slf, State::CharacterReference) + enter_state!(slf, CharacterReference, true) } Some(b">") => { - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -837,25 +960,31 @@ pub(crate) fn consume( eof!() } } - ), - State::AfterAttributeValueQuoted => slow_read_byte!( + ) + }); + + define_state!(AfterAttributeValueQuoted, slf, { + slow_read_byte!( slf, match c { c @ (Some(b'\t' | b'\x0A' | b'\x0C' | b' ' | b'/' | b'>') | None) => { - reconsume_in!(slf, c, State::BeforeAttributeName) + reconsume_in!(slf, c, BeforeAttributeName)?.inline_next_state(slf) } c => { error!(slf, Error::MissingWhitespaceBetweenAttributes); - reconsume_in!(slf, c, State::BeforeAttributeName) + reconsume_in!(slf, c, BeforeAttributeName) } } - ), - State::SelfClosingStartTag => slow_read_byte!( + ) + }); + + define_state!(SelfClosingStartTag, slf, { + slow_read_byte!( slf, match c { Some(b'>') => { slf.emitter.set_self_closing(); - emit_current_tag_and_switch_to!(slf, State::Data) + emit_current_tag_and_switch_to!(slf, Data) } None => { error!(slf, Error::EofInTag); @@ -863,16 +992,19 @@ pub(crate) fn consume( } Some(x) => { error_immediate!(slf, Error::UnexpectedSolidusInTag); - reconsume_in!(slf, Some(x), State::BeforeAttributeName) + reconsume_in!(slf, Some(x), BeforeAttributeName) } } - ), - State::BogusComment => fast_read_char!( + ) + }); + + define_state!(BogusComment, slf, { + fast_read_char!( slf, match xs { Some(b">") => { slf.emitter.emit_current_comment(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -888,20 +1020,23 @@ pub(crate) fn consume( eof!() } } - ), - State::MarkupDeclarationOpen => slow_read_byte!( + ) + }); + + define_state!(MarkupDeclarationOpen, slf, { + slow_read_byte!( slf, match c { Some(b'-') if slf.reader.try_read_string(&mut slf.validator, "-", true)? => { slf.emitter.init_comment(); - switch_to!(slf, State::CommentStart) + switch_to!(slf, CommentStart) } Some(b'd' | b'D') if slf .reader .try_read_string(&mut slf.validator, "octype", false)? => { - switch_to!(slf, State::Doctype) + switch_to!(slf, Doctype) } Some(b'[') if slf @@ -912,48 +1047,54 @@ pub(crate) fn consume( .emitter .adjusted_current_node_present_but_not_in_html_namespace() { - switch_to!(slf, State::CdataSection) + switch_to!(slf, CdataSection) } else { error!(slf, Error::CdataInHtmlContent); slf.emitter.init_comment(); slf.emitter.push_comment(b"[CDATA["); - switch_to!(slf, State::BogusComment) + switch_to!(slf, BogusComment) } } c => { error!(slf, Error::IncorrectlyOpenedComment); slf.emitter.init_comment(); - reconsume_in!(slf, c, State::BogusComment) + reconsume_in!(slf, c, BogusComment) } } - ), - State::CommentStart => slow_read_byte!( + ) + }); + + define_state!(CommentStart, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { - switch_to!(slf, State::CommentStartDash) + switch_to!(slf, CommentStartDash) } Some(b'>') => { error!(slf, Error::AbruptClosingOfEmptyComment); slf.emitter.emit_current_comment(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } c => { - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::CommentStartDash => slow_read_byte!( + ) + }); + + define_state!(CommentStartDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { - switch_to!(slf, State::CommentEnd) + switch_to!(slf, CommentEnd) } Some(b'>') => { error!(slf, Error::AbruptClosingOfEmptyComment); slf.emitter.emit_current_comment(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInComment); @@ -962,19 +1103,22 @@ pub(crate) fn consume( } c @ Some(_) => { slf.emitter.push_comment(b"-"); - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::Comment => fast_read_char!( + ) + }); + + define_state!(Comment, slf, { + fast_read_char!( slf, match xs { Some(b"<") => { slf.emitter.push_comment(b"<"); - switch_to!(slf, State::CommentLessThanSign) + switch_to!(slf, CommentLessThanSign) } Some(b"-") => { - switch_to!(slf, State::CommentEndDash) + switch_to!(slf, CommentEndDash) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -991,62 +1135,77 @@ pub(crate) fn consume( eof!() } } - ), - State::CommentLessThanSign => slow_read_byte!( + ) + }); + + define_state!(CommentLessThanSign, slf, { + slow_read_byte!( slf, match c { Some(b'!') => { slf.emitter.push_comment(b"!"); - switch_to!(slf, State::CommentLessThanSignBang) + switch_to!(slf, CommentLessThanSignBang) } Some(b'<') => { slf.emitter.push_comment(b"<"); cont!() } c => { - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::CommentLessThanSignBang => slow_read_byte!( + ) + }); + + define_state!(CommentLessThanSignBang, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { - switch_to!(slf, State::CommentLessThanSignBangDash) + switch_to!(slf, CommentLessThanSignBangDash) } c => { - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::CommentLessThanSignBangDash => slow_read_byte!( + ) + }); + + define_state!(CommentLessThanSignBangDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { - switch_to!(slf, State::CommentLessThanSignBangDashDash) + switch_to!(slf, CommentLessThanSignBangDashDash) } c => { - reconsume_in!(slf, c, State::CommentEndDash) + reconsume_in!(slf, c, CommentEndDash) } } - ), - State::CommentLessThanSignBangDashDash => slow_read_byte!( + ) + }); + + define_state!(CommentLessThanSignBangDashDash, slf, { + slow_read_byte!( slf, match c { c @ (Some(b'>') | None) => { - reconsume_in!(slf, c, State::CommentEnd) + reconsume_in!(slf, c, CommentEnd) } c => { error!(slf, Error::NestedComment); - reconsume_in!(slf, c, State::CommentEnd) + reconsume_in!(slf, c, CommentEnd) } } - ), - State::CommentEndDash => slow_read_byte!( + ) + }); + + define_state!(CommentEndDash, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { - switch_to!(slf, State::CommentEnd) + switch_to!(slf, CommentEnd) } None => { error!(slf, Error::EofInComment); @@ -1055,19 +1214,22 @@ pub(crate) fn consume( } c => { slf.emitter.push_comment(b"-"); - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::CommentEnd => slow_read_byte!( + ) + }); + + define_state!(CommentEnd, slf, { + slow_read_byte!( slf, match c { Some(b'>') => { slf.emitter.emit_current_comment(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b'!') => { - switch_to!(slf, State::CommentEndBang) + switch_to!(slf, CommentEndBang) } Some(b'-') => { slf.emitter.push_comment(b"-"); @@ -1080,21 +1242,24 @@ pub(crate) fn consume( } c @ Some(_) => { slf.emitter.push_comment(b"--"); - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::CommentEndBang => slow_read_byte!( + ) + }); + + define_state!(CommentEndBang, slf, { + slow_read_byte!( slf, match c { Some(b'-') => { slf.emitter.push_comment(b"--!"); - switch_to!(slf, State::CommentEndDash) + switch_to!(slf, CommentEndDash) } Some(b'>') => { error!(slf, Error::IncorrectlyClosedComment); slf.emitter.emit_current_comment(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInComment); @@ -1103,18 +1268,21 @@ pub(crate) fn consume( } c @ Some(_) => { slf.emitter.push_comment(b"--!"); - reconsume_in!(slf, c, State::Comment) + reconsume_in!(slf, c, Comment) } } - ), - State::Doctype => slow_read_byte!( + ) + }); + + define_state!(Doctype, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => { - switch_to!(slf, State::BeforeDoctypeName) + switch_to!(slf, BeforeDoctypeName) } c @ Some(b'>') => { - reconsume_in!(slf, c, State::BeforeDoctypeName) + reconsume_in!(slf, c, BeforeDoctypeName) } None => { error!(slf, Error::EofInDoctype); @@ -1125,11 +1293,14 @@ pub(crate) fn consume( } c @ Some(_) => { error!(slf, Error::MissingWhitespaceBeforeDoctypeName); - reconsume_in!(slf, c, State::BeforeDoctypeName) + reconsume_in!(slf, c, BeforeDoctypeName) } } - ), - State::BeforeDoctypeName => slow_read_byte!( + ) + }); + + define_state!(BeforeDoctypeName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), @@ -1137,14 +1308,14 @@ pub(crate) fn consume( error!(slf, Error::UnexpectedNullCharacter); slf.emitter.init_doctype(); slf.emitter.push_doctype_name("\u{fffd}".as_bytes()); - switch_to!(slf, State::DoctypeName) + switch_to!(slf, DoctypeName) } Some(b'>') => { error!(slf, Error::MissingDoctypeName); slf.emitter.init_doctype(); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1156,19 +1327,22 @@ pub(crate) fn consume( Some(x) => { slf.emitter.init_doctype(); slf.emitter.push_doctype_name(&[x.to_ascii_lowercase()]); - switch_to!(slf, State::DoctypeName) + switch_to!(slf, DoctypeName) } } - ), - State::DoctypeName => fast_read_char!( + ) + }); + + define_state!(DoctypeName, slf, { + fast_read_char!( slf, match xs { Some(b"\t" | b"\x0A" | b"\x0C" | b" ") => { - switch_to!(slf, State::AfterDoctypeName) + switch_to!(slf, AfterDoctypeName) } Some(b">") => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1189,14 +1363,17 @@ pub(crate) fn consume( eof!() } } - ), - State::AfterDoctypeName => slow_read_byte!( + ) + }); + + define_state!(AfterDoctypeName, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'>') => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1209,43 +1386,46 @@ pub(crate) fn consume( .reader .try_read_string(&mut slf.validator, "ublic", false)? => { - switch_to!(slf, State::AfterDoctypePublicKeyword) + switch_to!(slf, AfterDoctypePublicKeyword) } Some(b's' | b'S') if slf .reader .try_read_string(&mut slf.validator, "ystem", false)? => { - switch_to!(slf, State::AfterDoctypeSystemKeyword) + switch_to!(slf, AfterDoctypeSystemKeyword) } c @ Some(_) => { error!(slf, Error::InvalidCharacterSequenceAfterDoctypeName); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::AfterDoctypePublicKeyword => slow_read_byte!( + ) + }); + + define_state!(AfterDoctypePublicKeyword, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => { - switch_to!(slf, State::BeforeDoctypePublicIdentifier) + switch_to!(slf, BeforeDoctypePublicIdentifier) } Some(b'"') => { error!(slf, Error::MissingWhitespaceAfterDoctypePublicKeyword); slf.emitter.set_doctype_public_identifier(b""); - switch_to!(slf, State::DoctypePublicIdentifierDoubleQuoted) + switch_to!(slf, DoctypePublicIdentifierDoubleQuoted) } Some(b'\'') => { error!(slf, Error::MissingWhitespaceAfterDoctypePublicKeyword); slf.emitter.set_doctype_public_identifier(b""); - switch_to!(slf, State::DoctypePublicIdentifierSingleQuoted) + switch_to!(slf, DoctypePublicIdentifierSingleQuoted) } Some(b'>') => { error!(slf, Error::MissingDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1256,27 +1436,30 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypePublicIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::BeforeDoctypePublicIdentifier => slow_read_byte!( + ) + }); + + define_state!(BeforeDoctypePublicIdentifier, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'"') => { slf.emitter.set_doctype_public_identifier(b""); - switch_to!(slf, State::DoctypePublicIdentifierDoubleQuoted) + switch_to!(slf, DoctypePublicIdentifierDoubleQuoted) } Some(b'\'') => { slf.emitter.set_doctype_public_identifier(b""); - switch_to!(slf, State::DoctypePublicIdentifierSingleQuoted) + switch_to!(slf, DoctypePublicIdentifierSingleQuoted) } Some(b'>') => { error!(slf, Error::MissingDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1287,15 +1470,18 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypePublicIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::DoctypePublicIdentifierDoubleQuoted => fast_read_char!( + ) + }); + + define_state!(DoctypePublicIdentifierDoubleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"\"") => { - switch_to!(slf, State::AfterDoctypePublicIdentifier) + switch_to!(slf, AfterDoctypePublicIdentifier) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1307,7 +1493,7 @@ pub(crate) fn consume( error!(slf, Error::AbruptDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(xs) => { slf.emitter.push_doctype_public_identifier(xs); @@ -1320,12 +1506,15 @@ pub(crate) fn consume( eof!() } } - ), - State::DoctypePublicIdentifierSingleQuoted => fast_read_char!( + ) + }); + + define_state!(DoctypePublicIdentifierSingleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"'") => { - switch_to!(slf, State::AfterDoctypePublicIdentifier) + switch_to!(slf, AfterDoctypePublicIdentifier) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1337,7 +1526,7 @@ pub(crate) fn consume( error!(slf, Error::AbruptDoctypePublicIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(xs) => { slf.emitter.push_doctype_public_identifier(xs); @@ -1350,16 +1539,19 @@ pub(crate) fn consume( eof!() } } - ), - State::AfterDoctypePublicIdentifier => slow_read_byte!( + ) + }); + + define_state!(AfterDoctypePublicIdentifier, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => { - switch_to!(slf, State::BetweenDoctypePublicAndSystemIdentifiers) + switch_to!(slf, BetweenDoctypePublicAndSystemIdentifiers) } Some(b'>') => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b'"') => { error!( @@ -1367,7 +1559,7 @@ pub(crate) fn consume( Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers ); slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierDoubleQuoted) + switch_to!(slf, DoctypeSystemIdentifierDoubleQuoted) } Some(b'\'') => { error!( @@ -1375,7 +1567,7 @@ pub(crate) fn consume( Error::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers ); slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierSingleQuoted) + switch_to!(slf, DoctypeSystemIdentifierSingleQuoted) } None => { error!(slf, Error::EofInDoctype); @@ -1386,25 +1578,28 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::BetweenDoctypePublicAndSystemIdentifiers => slow_read_byte!( + ) + }); + + define_state!(BetweenDoctypePublicAndSystemIdentifiers, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'>') => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b'"') => { slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierDoubleQuoted) + switch_to!(slf, DoctypeSystemIdentifierDoubleQuoted) } Some(b'\'') => { slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierSingleQuoted) + switch_to!(slf, DoctypeSystemIdentifierSingleQuoted) } None => { error!(slf, Error::EofInDoctype); @@ -1415,31 +1610,34 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::AfterDoctypeSystemKeyword => slow_read_byte!( + ) + }); + + define_state!(AfterDoctypeSystemKeyword, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => { - switch_to!(slf, State::BeforeDoctypeSystemIdentifier) + switch_to!(slf, BeforeDoctypeSystemIdentifier) } Some(b'"') => { error!(slf, Error::MissingWhitespaceAfterDoctypeSystemKeyword); slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierDoubleQuoted) + switch_to!(slf, DoctypeSystemIdentifierDoubleQuoted) } Some(b'\'') => { error!(slf, Error::MissingWhitespaceAfterDoctypeSystemKeyword); slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierSingleQuoted) + switch_to!(slf, DoctypeSystemIdentifierSingleQuoted) } Some(b'>') => { error!(slf, Error::MissingDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1450,27 +1648,30 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::BeforeDoctypeSystemIdentifier => slow_read_byte!( + ) + }); + + define_state!(BeforeDoctypeSystemIdentifier, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'"') => { slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierDoubleQuoted) + switch_to!(slf, DoctypeSystemIdentifierDoubleQuoted) } Some(b'\'') => { slf.emitter.set_doctype_system_identifier(b""); - switch_to!(slf, State::DoctypeSystemIdentifierSingleQuoted) + switch_to!(slf, DoctypeSystemIdentifierSingleQuoted) } Some(b'>') => { error!(slf, Error::MissingDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1481,15 +1682,18 @@ pub(crate) fn consume( c @ Some(_) => { error!(slf, Error::MissingQuoteBeforeDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::DoctypeSystemIdentifierDoubleQuoted => fast_read_char!( + ) + }); + + define_state!(DoctypeSystemIdentifierDoubleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"\"") => { - switch_to!(slf, State::AfterDoctypeSystemIdentifier) + switch_to!(slf, AfterDoctypeSystemIdentifier) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1501,7 +1705,7 @@ pub(crate) fn consume( error!(slf, Error::AbruptDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(xs) => { slf.emitter.push_doctype_system_identifier(xs); @@ -1514,12 +1718,15 @@ pub(crate) fn consume( eof!() } } - ), - State::DoctypeSystemIdentifierSingleQuoted => fast_read_char!( + ) + }); + + define_state!(DoctypeSystemIdentifierSingleQuoted, slf, { + fast_read_char!( slf, match xs { Some(b"\'") => { - switch_to!(slf, State::AfterDoctypeSystemIdentifier) + switch_to!(slf, AfterDoctypeSystemIdentifier) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1531,7 +1738,7 @@ pub(crate) fn consume( error!(slf, Error::AbruptDoctypeSystemIdentifier); slf.emitter.set_force_quirks(); slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(xs) => { slf.emitter.push_doctype_system_identifier(xs); @@ -1544,14 +1751,17 @@ pub(crate) fn consume( eof!() } } - ), - State::AfterDoctypeSystemIdentifier => slow_read_byte!( + ) + }); + + define_state!(AfterDoctypeSystemIdentifier, slf, { + slow_read_byte!( slf, match c { Some(b'\t' | b'\x0A' | b'\x0C' | b' ') => cont!(), Some(b'>') => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } None => { error!(slf, Error::EofInDoctype); @@ -1561,16 +1771,19 @@ pub(crate) fn consume( } c @ Some(_) => { error!(slf, Error::UnexpectedCharacterAfterDoctypeSystemIdentifier); - reconsume_in!(slf, c, State::BogusDoctype) + reconsume_in!(slf, c, BogusDoctype) } } - ), - State::BogusDoctype => fast_read_char!( + ) + }); + + define_state!(BogusDoctype, slf, { + fast_read_char!( slf, match xs { Some(b">") => { slf.emitter.emit_current_doctype(); - switch_to!(slf, State::Data) + switch_to!(slf, Data) } Some(b"\0") => { error!(slf, Error::UnexpectedNullCharacter); @@ -1584,12 +1797,15 @@ pub(crate) fn consume( eof!() } } - ), - State::CdataSection => fast_read_char!( + ) + }); + + define_state!(CdataSection, slf, { + fast_read_char!( slf, match xs { Some(b"]") => { - switch_to!(slf, State::CdataSectionBracket) + switch_to!(slf, CdataSectionBracket) } Some(xs) => { slf.emitter.emit_string(xs); @@ -1600,20 +1816,26 @@ pub(crate) fn consume( eof!() } } - ), - State::CdataSectionBracket => slow_read_byte!( + ) + }); + + define_state!(CdataSectionBracket, slf, { + slow_read_byte!( slf, match c { Some(b']') => { - switch_to!(slf, State::CdataSectionEnd) + switch_to!(slf, CdataSectionEnd) } c => { slf.emitter.emit_string(b"]"); - reconsume_in!(slf, c, State::CdataSection) + reconsume_in!(slf, c, CdataSection) } } - ), - State::CdataSectionEnd => slow_read_byte!( + ) + }); + + define_state!(CdataSectionEnd, slf, { + slow_read_byte!( slf, match c { Some(b']') => { @@ -1621,81 +1843,86 @@ pub(crate) fn consume( cont!() } Some(b'>') => { - switch_to!(slf, State::Data) + switch_to!(slf, Data) } c => { slf.emitter.emit_string(b"]]"); - reconsume_in!(slf, c, State::CdataSection) + reconsume_in!(slf, c, CdataSection) } } - ), - State::CharacterReference => { - slf.machine_helper.temporary_buffer.clear(); - slf.machine_helper.temporary_buffer.push(b'&'); + ) + }); - slow_read_byte!( - slf, - match c { - Some(x) if x.is_ascii_alphanumeric() => { - reconsume_in!(slf, Some(x), State::NamedCharacterReference) - } - Some(b'#') => { - slf.machine_helper.temporary_buffer.push(b'#'); - switch_to!(slf, State::NumericCharacterReference) - } - c => { - slf.machine_helper - .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - reconsume_in!(slf, c, slf.machine_helper.pop_return_state()) - } + define_state!(CharacterReference, slf, { + slf.machine_helper.temporary_buffer.clear(); + slf.machine_helper.temporary_buffer.push(b'&'); + + slow_read_byte!( + slf, + match c { + Some(x) if x.is_ascii_alphanumeric() => { + reconsume_in!(slf, Some(x), NamedCharacterReference) } - ) - } - State::NamedCharacterReference => { - let c = read_byte!(slf)?; + Some(b'#') => { + slf.machine_helper.temporary_buffer.push(b'#'); + switch_to!(slf, NumericCharacterReference) + } + c => { + slf.machine_helper + .flush_code_points_consumed_as_character_reference(&mut slf.emitter); + reconsume_in_return_state!(slf, c) + } + } + ) + }); - let char_ref = match c { - Some(x) => try_read_character_reference(x as char, |x| { - slf.reader.try_read_string(&mut slf.validator, x, true) - })? - .map(|char_ref| (x, char_ref)), + define_state!(NamedCharacterReference, slf, { + let c = read_byte!(slf)?; - None => None, - }; + let char_ref = match c { + Some(x) => try_read_character_reference(x as char, |x| { + slf.reader.try_read_string(&mut slf.validator, x, true) + })? + .map(|char_ref| (x, char_ref)), - if let Some((x, char_ref)) = char_ref { - let char_ref_name_last_character = char_ref.name.chars().last(); - let next_character = read_byte!(slf)?; + None => None, + }; - if !slf.machine_helper.is_consumed_as_part_of_an_attribute() - || char_ref_name_last_character == Some(';') - || !matches!(next_character, Some(x) if x == b'=' || x.is_ascii_alphanumeric()) - { - if char_ref_name_last_character != Some(';') { - error!(slf, Error::MissingSemicolonAfterCharacterReference); - } + if let Some((x, char_ref)) = char_ref { + let char_ref_name_last_character = char_ref.name.chars().last(); + let next_character = read_byte!(slf)?; - slf.machine_helper.temporary_buffer.clear(); - slf.machine_helper - .temporary_buffer - .extend(char_ref.characters.as_bytes()); - } else { - slf.machine_helper.temporary_buffer.extend(&[x]); - slf.machine_helper - .temporary_buffer - .extend(char_ref.name.as_bytes()); + if !slf.machine_helper.is_consumed_as_part_of_an_attribute() + || char_ref_name_last_character == Some(';') + || !matches!(next_character, Some(x) if x == b'=' || x.is_ascii_alphanumeric()) + { + if char_ref_name_last_character != Some(';') { + error!(slf, Error::MissingSemicolonAfterCharacterReference); } + slf.machine_helper.temporary_buffer.clear(); slf.machine_helper - .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - reconsume_in!(slf, next_character, slf.machine_helper.pop_return_state()) + .temporary_buffer + .extend(char_ref.characters.as_bytes()); } else { + slf.machine_helper.temporary_buffer.extend(&[x]); slf.machine_helper - .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - reconsume_in!(slf, c, State::AmbiguousAmpersand) + .temporary_buffer + .extend(char_ref.name.as_bytes()); } + + slf.machine_helper + .flush_code_points_consumed_as_character_reference(&mut slf.emitter); + reconsume_in_return_state!(slf, next_character) + } else { + slf.machine_helper + .flush_code_points_consumed_as_character_reference(&mut slf.emitter); + reconsume_in!(slf, c, AmbiguousAmpersand) } - State::AmbiguousAmpersand => slow_read_byte!( + }); + + define_state!(AmbiguousAmpersand, slf, { + slow_read_byte!( slf, match c { Some(x) if x.is_ascii_alphanumeric() => { @@ -1709,50 +1936,57 @@ pub(crate) fn consume( } c @ Some(b';') => { error!(slf, Error::UnknownNamedCharacterReference); - reconsume_in!(slf, c, slf.machine_helper.pop_return_state()) + reconsume_in_return_state!(slf, c) } c => { - reconsume_in!(slf, c, slf.machine_helper.pop_return_state()) + reconsume_in_return_state!(slf, c) } } - ), - State::NumericCharacterReference => { - slf.machine_helper.character_reference_code = 0; + ) + }); - slow_read_byte!( - slf, - match c { - Some(x @ (b'x' | b'X')) => { - slf.machine_helper.temporary_buffer.push(x); - switch_to!(slf, State::HexadecimalCharacterReferenceStart) - } - Some(x @ b'0'..=b'9') => { - reconsume_in!(slf, Some(x), State::DecimalCharacterReference) - } - c => { - error!(slf, Error::AbsenceOfDigitsInNumericCharacterReference); - slf.machine_helper - .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - reconsume_in!(slf, c, slf.machine_helper.pop_return_state()) - } + define_state!(NumericCharacterReference, slf, { + slf.machine_helper.character_reference_code = 0; + + slow_read_byte!( + slf, + match c { + Some(x @ (b'x' | b'X')) => { + slf.machine_helper.temporary_buffer.push(x); + switch_to!(slf, HexadecimalCharacterReferenceStart) } - ) - } - State::HexadecimalCharacterReferenceStart => slow_read_byte!( + Some(x @ b'0'..=b'9') => { + reconsume_in!(slf, Some(x), DecimalCharacterReference) + } + c => { + error!(slf, Error::AbsenceOfDigitsInNumericCharacterReference); + slf.machine_helper + .flush_code_points_consumed_as_character_reference(&mut slf.emitter); + reconsume_in_return_state!(slf, c) + } + } + ) + }); + + define_state!(HexadecimalCharacterReferenceStart, slf, { + slow_read_byte!( slf, match c { c @ Some(b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f') => { - reconsume_in!(slf, c, State::HexadecimalCharacterReference) + reconsume_in!(slf, c, HexadecimalCharacterReference) } c => { error!(slf, Error::AbsenceOfDigitsInNumericCharacterReference); slf.machine_helper .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - reconsume_in!(slf, c, slf.machine_helper.pop_return_state()) + reconsume_in_return_state!(slf, c) } } - ), - State::HexadecimalCharacterReference => slow_read_byte!( + ) + }); + + define_state!(HexadecimalCharacterReference, slf, { + slow_read_byte!( slf, match c { Some(x @ b'0'..=b'9') => { @@ -1768,15 +2002,18 @@ pub(crate) fn consume( cont!() } Some(b';') => { - switch_to!(slf, State::NumericCharacterReferenceEnd) + switch_to!(slf, NumericCharacterReferenceEnd) } c => { error!(slf, Error::MissingSemicolonAfterCharacterReference); - reconsume_in!(slf, c, State::NumericCharacterReferenceEnd) + reconsume_in!(slf, c, NumericCharacterReferenceEnd) } } - ), - State::DecimalCharacterReference => slow_read_byte!( + ) + }); + + define_state!(DecimalCharacterReference, slf, { + slow_read_byte!( slf, match c { Some(x @ b'0'..=b'9') => { @@ -1784,79 +2021,80 @@ pub(crate) fn consume( cont!() } Some(b';') => { - switch_to!(slf, State::NumericCharacterReferenceEnd) + switch_to!(slf, NumericCharacterReferenceEnd) } c => { error!(slf, Error::MissingSemicolonAfterCharacterReference); - reconsume_in!(slf, c, State::NumericCharacterReferenceEnd) + reconsume_in!(slf, c, NumericCharacterReferenceEnd) } } - ), - State::NumericCharacterReferenceEnd => { - match slf.machine_helper.character_reference_code { - 0x00 => { - error!(slf, Error::NullCharacterReference); - slf.machine_helper.character_reference_code = 0xfffd; - } - 0x0011_0000.. => { - error!(slf, Error::CharacterReferenceOutsideUnicodeRange); - slf.machine_helper.character_reference_code = 0xfffd; - } - surrogate_pat!() => { - error!(slf, Error::SurrogateCharacterReference); - slf.machine_helper.character_reference_code = 0xfffd; - } - // noncharacter - noncharacter_pat!() => { - error!(slf, Error::NoncharacterCharacterReference); - } - // 0x000d, or a control that is not whitespace - x @ (0x000d | 0x0d | 0x0000..=0x001f | 0x007f..=0x009f) - if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => - { - error!(slf, Error::ControlCharacterReference); - slf.machine_helper.character_reference_code = match x { - 0x80 => 0x20AC, // EURO SIGN (€) - 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) - 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) - 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) - 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) - 0x86 => 0x2020, // DAGGER (†) - 0x87 => 0x2021, // DOUBLE DAGGER (‡) - 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) - 0x89 => 0x2030, // PER MILLE SIGN (‰) - 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) - 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) - 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) - 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) - 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) - 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) - 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“) - 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) - 0x95 => 0x2022, // BULLET (•) - 0x96 => 0x2013, // EN DASH (–) - 0x97 => 0x2014, // EM DASH (—) - 0x98 => 0x02DC, // SMALL TILDE (˜) - 0x99 => 0x2122, // TRADE MARK SIGN (™) - 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) - 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) - 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) - 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) - 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) - _ => slf.machine_helper.character_reference_code, - }; - } - _ => (), - } - - slf.machine_helper.temporary_buffer.clear(); - slf.machine_helper.temporary_buffer.extend( - ctostr!(std::char::from_u32(slf.machine_helper.character_reference_code).unwrap()) - .as_bytes(), - ); - slf.machine_helper - .flush_code_points_consumed_as_character_reference(&mut slf.emitter); - exit_state!(slf) + ) + }); + + define_state!(NumericCharacterReferenceEnd, slf, { + match slf.machine_helper.character_reference_code { + 0x00 => { + error!(slf, Error::NullCharacterReference); + slf.machine_helper.character_reference_code = 0xfffd; + } + 0x0011_0000.. => { + error!(slf, Error::CharacterReferenceOutsideUnicodeRange); + slf.machine_helper.character_reference_code = 0xfffd; + } + surrogate_pat!() => { + error!(slf, Error::SurrogateCharacterReference); + slf.machine_helper.character_reference_code = 0xfffd; + } + // noncharacter + noncharacter_pat!() => { + error!(slf, Error::NoncharacterCharacterReference); + } + // 0x000d, or a control that is not whitespace + x @ (0x000d | 0x0d | 0x0000..=0x001f | 0x007f..=0x009f) + if !matches!(x, 0x0009 | 0x000a | 0x000c | 0x0020) => + { + error!(slf, Error::ControlCharacterReference); + slf.machine_helper.character_reference_code = match x { + 0x80 => 0x20AC, // EURO SIGN (€) + 0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚) + 0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ) + 0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („) + 0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…) + 0x86 => 0x2020, // DAGGER (†) + 0x87 => 0x2021, // DOUBLE DAGGER (‡) + 0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) + 0x89 => 0x2030, // PER MILLE SIGN (‰) + 0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š) + 0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) + 0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ) + 0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž) + 0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘) + 0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’) + 0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“) + 0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”) + 0x95 => 0x2022, // BULLET (•) + 0x96 => 0x2013, // EN DASH (–) + 0x97 => 0x2014, // EM DASH (—) + 0x98 => 0x02DC, // SMALL TILDE (˜) + 0x99 => 0x2122, // TRADE MARK SIGN (™) + 0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š) + 0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) + 0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ) + 0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž) + 0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) + _ => slf.machine_helper.character_reference_code, + }; + } + _ => (), } - } + + slf.machine_helper.temporary_buffer.clear(); + slf.machine_helper.temporary_buffer.extend( + ctostr!(std::char::from_u32(slf.machine_helper.character_reference_code).unwrap()) + .as_bytes(), + ); + slf.machine_helper + .flush_code_points_consumed_as_character_reference(&mut slf.emitter); + exit_state!(slf) + }); } diff --git a/src/machine_helper.rs b/src/machine_helper.rs index a10aa40..fa52cc4 100644 --- a/src/machine_helper.rs +++ b/src/machine_helper.rs @@ -1,42 +1,88 @@ -use crate::state::MachineState as State; use crate::utils::trace_log; -use crate::Emitter; +use crate::{Emitter, Reader, State, Tokenizer}; #[derive(Debug)] -pub(crate) struct MachineHelper { +pub(crate) struct MachineState { + pub function: fn(&mut Tokenizer) -> Result, R::Error>, + #[cfg(debug_assertions)] + pub debug_name: &'static str, +} + +impl Copy for MachineState {} +impl Clone for MachineState { + fn clone(&self) -> Self { + *self + } +} + +pub(crate) enum ControlToken { + Eof, + Continue, + SwitchTo(MachineState), +} + +impl ControlToken { + #[inline(always)] + pub(crate) fn inline_next_state(self, slf: &mut Tokenizer) -> Result { + match self { + ControlToken::SwitchTo(state) => { + slf.machine_helper.switch_to(state); + (state.function)(slf) + } + _ => { + #[cfg(debug_assertions)] + panic!("use of inline_next_state is invalid in this context as no state switch is happening"); + + #[cfg(not(debug_assertions))] + Ok(self) + } + } + } +} + +impl Into> for State { + fn into(self) -> MachineState { + // TODO: instead of this conversion, can we rig the enums to be of same layout? + match self { + State::Data => state_ref!(Data), + State::PlainText => state_ref!(PlainText), + State::RcData => state_ref!(RcData), + State::RawText => state_ref!(RawText), + State::ScriptData => state_ref!(ScriptData), + State::CdataSection => state_ref!(CdataSection), + } + } +} + +#[derive(Debug)] +pub(crate) struct MachineHelper { // XXX: allocation that cannot be controlled/reused by the user pub(crate) temporary_buffer: Vec, pub(crate) character_reference_code: u32, - pub(crate) state: State, - return_state: Option, + pub(crate) state: MachineState, + return_state: Option<(MachineState, bool)>, } -impl Default for MachineHelper { +impl Default for MachineHelper { fn default() -> Self { MachineHelper { temporary_buffer: Vec::new(), character_reference_code: 0, - state: State::Data, + state: state_ref!(Data), return_state: None, } } } -impl MachineHelper { +impl MachineHelper { pub(crate) fn is_consumed_as_part_of_an_attribute(&self) -> bool { - matches!( - self.return_state, - Some( - State::AttributeValueDoubleQuoted - | State::AttributeValueSingleQuoted - | State::AttributeValueUnquoted - ) - ) + match self.return_state { + Some((_state, is_attribute)) => is_attribute, + None => false, + } } - pub(crate) fn flush_code_points_consumed_as_character_reference( - &mut self, - emitter: &mut E, - ) { + + pub(crate) fn flush_code_points_consumed_as_character_reference(&mut self, emitter: &mut E) { if self.is_consumed_as_part_of_an_attribute() { emitter.push_attribute_value(&self.temporary_buffer); self.temporary_buffer.clear(); @@ -44,19 +90,20 @@ impl MachineHelper { self.flush_buffer_characters(emitter); } } - pub(crate) fn flush_buffer_characters(&mut self, emitter: &mut E) { + + pub(crate) fn flush_buffer_characters(&mut self, emitter: &mut E) { emitter.emit_string(&self.temporary_buffer); self.temporary_buffer.clear(); } - pub(crate) fn enter_state(&mut self, state: State) { + pub(crate) fn enter_state(&mut self, state: MachineState, is_attribute: bool) { debug_assert!(self.return_state.is_none()); - self.return_state = Some(self.state); + self.return_state = Some((self.state, is_attribute)); self.switch_to(state); } - pub(crate) fn pop_return_state(&mut self) -> State { - self.return_state.take().unwrap() + pub(crate) fn pop_return_state(&mut self) -> MachineState { + self.return_state.take().unwrap().0 } pub(crate) fn exit_state(&mut self) { @@ -64,16 +111,28 @@ impl MachineHelper { self.switch_to(state); } - pub(crate) fn state(&self) -> State { - self.state - } - - pub(crate) fn switch_to(&mut self, state: State) { - trace_log!("switch_to: {:?} -> {:?}", self.state, state); + pub(crate) fn switch_to(&mut self, state: MachineState) { + trace_log!( + "switch_to: {} -> {}", + self.state.debug_name, + state.debug_name + ); self.state = state; } } +macro_rules! state_ref { + ($state:ident) => {{ + crate::machine_helper::MachineState { + function: crate::machine::states::$state::run, + #[cfg(debug_assertions)] + debug_name: stringify!($state), + } + }}; +} + +pub(crate) use state_ref; + macro_rules! mutate_character_reference { ($slf:expr, * $mul:literal + $x:ident - $sub:literal) => { match $slf @@ -94,26 +153,29 @@ macro_rules! mutate_character_reference { pub(crate) use mutate_character_reference; macro_rules! emit_current_tag_and_switch_to { - ($slf:expr, $state:expr) => {{ - let state = $slf.emitter.emit_current_tag().map(From::from); - switch_to!($slf, state.unwrap_or($state)) + ($slf:expr, $state:ident) => {{ + let state = $slf.emitter.emit_current_tag().map(Into::into); + $slf.machine_helper + .switch_to(state.unwrap_or($crate::machine_helper::state_ref!($state))); + Ok(ControlToken::Continue) }}; } pub(crate) use emit_current_tag_and_switch_to; macro_rules! switch_to { - ($slf:expr, $state:expr) => {{ - $slf.machine_helper.switch_to($state); - Ok(ControlToken::Continue) + ($slf:expr, $state:ident) => {{ + let new_state = $crate::machine_helper::state_ref!($state); + Ok(ControlToken::SwitchTo(new_state)) }}; } pub(crate) use switch_to; macro_rules! enter_state { - ($slf:expr, $state:expr) => {{ - $slf.machine_helper.enter_state($state); + ($slf:expr, $state:ident, $is_attribute:expr) => {{ + $slf.machine_helper + .enter_state($crate::machine_helper::state_ref!($state), $is_attribute); Ok(ControlToken::Continue) }}; } @@ -130,17 +192,27 @@ macro_rules! exit_state { pub(crate) use exit_state; macro_rules! reconsume_in { - ($slf:expr, $c:expr, $state:expr) => {{ - let new_state = $state; + ($slf:expr, $c:expr, $state:ident) => {{ + let new_state = $crate::machine_helper::state_ref!($state); let c = $c; $slf.reader.unread_byte(c); - $slf.machine_helper.switch_to(new_state); - Ok(ControlToken::Continue) + Ok(ControlToken::SwitchTo(new_state)) }}; } pub(crate) use reconsume_in; +macro_rules! reconsume_in_return_state { + ($slf:expr, $c:expr) => {{ + let new_state = $slf.machine_helper.pop_return_state(); + let c = $c; + $slf.reader.unread_byte(c); + Ok(ControlToken::SwitchTo(new_state)) + }}; +} + +pub(crate) use reconsume_in_return_state; + macro_rules! cont { () => {{ continue; diff --git a/src/state.rs b/src/state.rs index 644529c..77da87a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -14,100 +14,3 @@ pub enum State { /// The cdata section state. CdataSection, } - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum MachineState { - Data, - RcData, - RawText, - ScriptData, - PlainText, - TagOpen, - EndTagOpen, - TagName, - RcDataLessThanSign, - RcDataEndTagOpen, - RcDataEndTagName, - RawTextLessThanSign, - RawTextEndTagOpen, - RawTextEndTagName, - ScriptDataLessThanSign, - ScriptDataEndTagOpen, - ScriptDataEndTagName, - ScriptDataEscapeStart, - ScriptDataEscapeStartDash, - ScriptDataEscaped, - ScriptDataEscapedDash, - ScriptDataEscapedDashDash, - ScriptDataEscapedLessThanSign, - ScriptDataEscapedEndTagOpen, - ScriptDataEscapedEndTagName, - ScriptDataDoubleEscapeStart, - ScriptDataDoubleEscaped, - ScriptDataDoubleEscapedDash, - ScriptDataDoubleEscapedDashDash, - ScriptDataDoubleEscapedLessThanSign, - ScriptDataDoubleEscapeEnd, - BeforeAttributeName, - AttributeName, - AfterAttributeName, - BeforeAttributeValue, - AttributeValueDoubleQuoted, - AttributeValueSingleQuoted, - AttributeValueUnquoted, - AfterAttributeValueQuoted, - SelfClosingStartTag, - BogusComment, - MarkupDeclarationOpen, - CommentStart, - CommentStartDash, - Comment, - CommentLessThanSign, - CommentLessThanSignBang, - CommentLessThanSignBangDash, - CommentLessThanSignBangDashDash, - CommentEndDash, - CommentEnd, - CommentEndBang, - Doctype, - BeforeDoctypeName, - DoctypeName, - AfterDoctypeName, - AfterDoctypePublicKeyword, - BeforeDoctypePublicIdentifier, - DoctypePublicIdentifierDoubleQuoted, - DoctypePublicIdentifierSingleQuoted, - AfterDoctypePublicIdentifier, - BetweenDoctypePublicAndSystemIdentifiers, - AfterDoctypeSystemKeyword, - BeforeDoctypeSystemIdentifier, - DoctypeSystemIdentifierDoubleQuoted, - DoctypeSystemIdentifierSingleQuoted, - AfterDoctypeSystemIdentifier, - BogusDoctype, - CdataSection, - CdataSectionBracket, - CdataSectionEnd, - CharacterReference, - NamedCharacterReference, - AmbiguousAmpersand, - NumericCharacterReference, - HexadecimalCharacterReferenceStart, - HexadecimalCharacterReference, - DecimalCharacterReference, - NumericCharacterReferenceEnd, -} - -impl From for MachineState { - fn from(s: State) -> MachineState { - // TODO: instead of this conversion, can we rig the enums to be of same layout? - match s { - State::Data => MachineState::Data, - State::PlainText => MachineState::PlainText, - State::RcData => MachineState::RcData, - State::RawText => MachineState::RawText, - State::ScriptData => MachineState::ScriptData, - State::CdataSection => MachineState::CdataSection, - } - } -} diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 989dfd4..14f9883 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,10 +1,8 @@ use std::convert::Infallible; use crate::char_validator::CharValidator; -use crate::machine; -use crate::machine_helper::MachineHelper; +use crate::machine_helper::{ControlToken, MachineHelper}; use crate::read_helper::ReadHelper; -use crate::utils::ControlToken; use crate::{DefaultEmitter, Emitter, Readable, Reader}; #[cfg(debug_assertions)] @@ -17,7 +15,7 @@ pub struct Tokenizer { pub(crate) validator: CharValidator, pub(crate) emitter: E, pub(crate) reader: ReadHelper, - pub(crate) machine_helper: MachineHelper, + pub(crate) machine_helper: MachineHelper, } impl Tokenizer { @@ -72,8 +70,11 @@ impl Iterator for Tokenizer { if let Some(token) = self.emitter.pop_token() { break Some(Ok(token)); } else if !self.eof { - match machine::consume(self) { + match (self.machine_helper.state.function)(self) { Ok(ControlToken::Continue) => (), + Ok(ControlToken::SwitchTo(next_state)) => { + self.machine_helper.switch_to(next_state); + } Ok(ControlToken::Eof) => { self.validator.flush_character_error(&mut self.emitter); self.eof = true; diff --git a/src/utils.rs b/src/utils.rs index 665e6fe..08d442b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -49,11 +49,6 @@ macro_rules! noncharacter_pat { pub(crate) use noncharacter_pat; -pub(crate) enum ControlToken { - Eof, - Continue, -} - macro_rules! ctostr { ($c:expr) => { &*$c.encode_utf8(&mut [0; 4])