From 2f538fa0aba5bf3520e55c6a9d1dcc9183d9104d Mon Sep 17 00:00:00 2001 From: lkirkwood Date: Tue, 30 Apr 2024 23:04:07 +1000 Subject: [PATCH] Rework TOML token parsing (#100) * Add bare_key_chars macro to toml * Added toml test for unquoted keys * Match toml chars as u32 + remove alnum match logic Now casting the current toml character to u32 for easier matching against the values provided in the abnf for toml. Removed the existing logic for parsing most characters to start again. * Restore original toml number parsing logic * Add allow for overlapping toml char pattern lint * Add toml parse_bare_key method * Change language from bare_key to ident in toml * Updated toml key test case * Implemented toml parse_ident * Add toml nan/inf todo and removed unused token * Impl into string for toml token * Reordered toml number parsing conditionals * Cleaned up some toml match statements * Formatted toml token comments and rename macro * Moved toml number parsing into separate function * Added guard for ident starting with num toml * Return err if num contains illegal chars toml * Added toml ident_term_chars macro * Fix not pushing chars during inf/nan toml parse * Start using ident_term_chars to detect inf/nan * Start using ident_term_chars to detect toml idents * Add close block char to toml ident_term_chars * Updated toml key test * Replaced tomltok into string with from impl * Removed unnecessary return * Fixed toml no_std test * Always match toml chars as char not u32 --- src/toml.rs | 344 +++++++++++++++++++++++++++++++------------------- tests/toml.rs | 29 +++++ 2 files changed, 241 insertions(+), 132 deletions(-) diff --git a/src/toml.rs b/src/toml.rs index 29bab86..bfe4843 100644 --- a/src/toml.rs +++ b/src/toml.rs @@ -10,6 +10,42 @@ use hashbrown::HashMap; #[cfg(not(feature = "no_std"))] use std::collections::HashMap; +/// Pattern matching any valid unquoted key character as u32. +/// ABNF line: https://github.com/toml-lang/toml/blob/2431aa308a7bc97eeb50673748606e23a6e0f201/toml.abnf#L55 +macro_rules! ident_chars { + () => { + '\u{41}'..='\u{5A}' + | '\u{61}'..='\u{7A}' + | '\u{30}'..='\u{39}' + | '\u{2D}' + | '\u{5F}' + | '\u{B2}' + | '\u{B3}' + | '\u{B9}' + | '\u{BC}'..='\u{BE}' + | '\u{C0}'..='\u{D6}' + | '\u{D8}'..='\u{F6}' + | '\u{F8}'..='\u{37D}' + | '\u{37F}'..='\u{1FFF}' + | '\u{200C}'..='\u{200D}' + | '\u{203F}'..='\u{2040}' + | '\u{2070}'..='\u{218F}' + | '\u{2460}'..='\u{24FF}' + | '\u{2C00}'..='\u{2FEF}' + | '\u{3001}'..='\u{D7FF}' + | '\u{F900}'..='\u{FDCF}' + | '\u{FDF0}'..='\u{FFFD}' + | '\u{10000}'..='\u{EFFFF}' + } +} + +/// Pattern matching a character that can terminate a valid ident. +macro_rules! ident_term_chars { + () => { + ' ' | '\t' | '\n' | '\0' | '=' | ']' + }; +} + /// A parser for TOML string values. /// /// ```rust @@ -34,6 +70,7 @@ pub enum TomlTok { I64(i64), F64(f64), Bool(bool), + // TODO add option to enforce + sign for conversion to ident Nan(bool), Inf(bool), Date(String), @@ -41,10 +78,42 @@ pub enum TomlTok { BlockOpen, BlockClose, Comma, - Bof, Eof, } +impl From for String { + fn from(value: TomlTok) -> Self { + match value { + TomlTok::Ident(string) => string, + TomlTok::Str(string) => string, + TomlTok::U64(number) => number.to_string(), + TomlTok::I64(number) => number.to_string(), + TomlTok::F64(number) => number.to_string(), + TomlTok::Bool(boolean) => boolean.to_string(), + TomlTok::Nan(negative) => { + if negative { + "-nan".to_string() + } else { + "nan".to_string() + } + } + TomlTok::Inf(negative) => { + if negative { + "-inf".to_string() + } else { + "inf".to_string() + } + } + TomlTok::Date(string) => string, + TomlTok::Equals => '='.to_string(), + TomlTok::BlockOpen => '['.to_string(), + TomlTok::BlockClose => ']'.to_string(), + TomlTok::Comma => ','.to_string(), + TomlTok::Eof => '\0'.to_string(), + } + } +} + /// A TOML value. #[derive(Debug, PartialEq)] pub enum Toml { @@ -236,6 +305,13 @@ impl TomlParser { let tok = self.next_tok(i)?; let key = match tok { TomlTok::Ident(key) => key, + TomlTok::U64(_) + | TomlTok::I64(_) + | TomlTok::F64(_) + | TomlTok::Bool(_) + | TomlTok::Nan(_) + | TomlTok::Inf(_) + | TomlTok::Date(_) => tok.into(), _ => return Err(self.err_token(tok)), }; let tok = self.next_tok(i)?; @@ -251,9 +327,15 @@ impl TomlParser { _ => return Err(self.err_token(tok)), } } - TomlTok::Str(key) | TomlTok::Ident(key) => { - self.parse_key_value(local_scope, key, i, out.out())? - } + TomlTok::Str(_) + | TomlTok::Ident(_) + | TomlTok::U64(_) + | TomlTok::I64(_) + | TomlTok::F64(_) + | TomlTok::Bool(_) + | TomlTok::Nan(_) + | TomlTok::Inf(_) + | TomlTok::Date(_) => self.parse_key_value(local_scope, tok.into(), i, out.out())?, _ => return Err(self.err_token(tok)), } Ok(true) @@ -350,6 +432,8 @@ impl TomlParser { if self.cur == '\0' { return Ok(TomlTok::Eof); } + + #[allow(unreachable_patterns)] match self.cur { ',' => { self.next(i); @@ -367,132 +451,6 @@ impl TomlParser { self.next(i); return Ok(TomlTok::Equals); } - '+' | '-' | '0'..='9' => { - let mut num = String::new(); - let is_neg = if self.cur == '-' { - num.push(self.cur); - self.next(i); - true - } else { - if self.cur == '+' { - self.next(i); - } - false - }; - if self.cur == 'n' { - self.next(i); - if self.cur == 'a' { - self.next(i); - if self.cur == 'n' { - self.next(i); - return Ok(TomlTok::Nan(is_neg)); - } else { - return Err(self.err_parse("nan")); - } - } else { - return Err(self.err_parse("nan")); - } - } - if self.cur == 'i' { - self.next(i); - if self.cur == 'n' { - self.next(i); - if self.cur == 'f' { - self.next(i); - return Ok(TomlTok::Inf(is_neg)); - } else { - return Err(self.err_parse("inf")); - } - } else { - return Err(self.err_parse("nan")); - } - } - while self.cur >= '0' && self.cur <= '9' || self.cur == '_' { - if self.cur != '_' { - num.push(self.cur); - } - self.next(i); - } - if self.cur == '.' { - num.push(self.cur); - self.next(i); - while self.cur >= '0' && self.cur <= '9' || self.cur == '_' { - if self.cur != '_' { - num.push(self.cur); - } - self.next(i); - } - if let Ok(num) = num.parse() { - return Ok(TomlTok::F64(num)); - } else { - return Err(self.err_parse("number")); - } - } else if self.cur == '-' { - // lets assume its a date. whatever. i don't feel like more parsing today - num.push(self.cur); - self.next(i); - while self.cur >= '0' && self.cur <= '9' - || self.cur == ':' - || self.cur == '-' - || self.cur == 'T' - { - num.push(self.cur); - self.next(i); - } - return Ok(TomlTok::Date(num)); - } else { - if is_neg { - if let Ok(num) = num.parse() { - return Ok(TomlTok::I64(num)); - } else { - return Err(self.err_parse("number")); - } - } - if let Ok(num) = num.parse() { - return Ok(TomlTok::U64(num)); - } else { - return Err(self.err_parse("number")); - } - } - } - 'a'..='z' | 'A'..='Z' | '_' => { - let mut ident = String::new(); - while self.cur >= 'a' && self.cur <= 'z' - || self.cur >= 'A' && self.cur <= 'Z' - || self.cur == '_' - || self.cur == '-' - { - ident.push(self.cur); - self.next(i); - } - if self.cur == '.' { - while self.cur == '.' { - self.next(i); - while self.cur >= 'a' && self.cur <= 'z' - || self.cur >= 'A' && self.cur <= 'Z' - || self.cur == '_' - || self.cur == '-' - { - ident.push(self.cur); - self.next(i); - } - } - return Ok(TomlTok::Ident(ident)); - } - if ident == "true" { - return Ok(TomlTok::Bool(true)); - } - if ident == "false" { - return Ok(TomlTok::Bool(false)); - } - if ident == "inf" { - return Ok(TomlTok::Inf(false)); - } - if ident == "nan" { - return Ok(TomlTok::Nan(false)); - } - return Ok(TomlTok::Ident(ident)); - } '#' => { while self.cur != '\n' && self.cur != '\0' { self.next(i); @@ -506,6 +464,7 @@ impl TomlParser { self.next(i); } } + '+' | '-' | '0'..='9' => return self.parse_num(i), '"' => { let mut val = String::new(); self.next(i); @@ -544,10 +503,131 @@ impl TomlParser { self.next(i); return Ok(TomlTok::Str(val)); } - _ => { - return Err(self.err_parse("tokenizer")); + ident_chars!() => return self.parse_ident(i, String::new()), + _ => return Err(self.err_parse("tokenizer")), + } + } + } + + /// Parse an ident or similar, starting with the current character. + fn parse_ident(&mut self, i: &mut Chars, mut start: String) -> Result { + while matches!(self.cur, ident_chars!()) { + start.push(self.cur); + self.next(i); + } + + if self.cur == '.' { + start.push(self.cur); + self.next(i); + return self.parse_ident(i, start); // recursion here could be a problem + } + + if matches!(self.cur, ident_term_chars!()) { + return Ok(match start.as_ref() { + "true" => TomlTok::Bool(true), + "false" => TomlTok::Bool(false), + "inf" => TomlTok::Inf(false), + "nan" => TomlTok::Nan(false), + _ => TomlTok::Ident(start), + }); + } + + Err(self.err_parse("tokenizer")) + } + + /// Parses a number (or an ident that starts with numbers), starting with the current character. + fn parse_num(&mut self, i: &mut Chars) -> Result { + let mut num = String::new(); + + let mut negative = false; + if self.cur == '+' { + self.next(i) + } else if self.cur == '-' { + num.push(self.cur); + negative = true; + self.next(i); + } + + if self.cur == 'n' { + num.push(self.cur); + self.next(i); + if self.cur == 'a' { + num.push(self.cur); + self.next(i); + if self.cur == 'n' { + num.push(self.cur); + self.next(i); + if matches!(self.cur, ident_term_chars!()) { + return Ok(TomlTok::Nan(negative)); + } } } + } else if self.cur == 'i' { + num.push(self.cur); + self.next(i); + if self.cur == 'n' { + num.push(self.cur); + self.next(i); + if self.cur == 'f' { + num.push(self.cur); + self.next(i); + if matches!(self.cur, ident_term_chars!()) { + return Ok(TomlTok::Inf(negative)); + } + } + } + } + + while matches!(self.cur, '0'..='9' | '_') { + if self.cur != '_' { + num.push(self.cur); + } + self.next(i); + } + + if self.cur == '.' { + num.push(self.cur); + self.next(i); + while matches!(self.cur, '0'..='9' | '_') { + if self.cur != '_' { + num.push(self.cur); + } + self.next(i); + } + if let Ok(num) = num.parse() { + return Ok(TomlTok::F64(num)); + } else { + return Err(self.err_parse("number")); + } + } else if self.cur == '-' { + // lets assume its a date. whatever. i don't feel like more parsing today + num.push(self.cur); + self.next(i); + while matches!(self.cur, '0'..='9' | ':' | '-' | 'T') { + num.push(self.cur); + self.next(i); + } + return Ok(TomlTok::Date(num)); + // TODO rework this } + + if matches!(self.cur, ident_chars!()) { + return self.parse_ident(i, num); + } + + match negative { + true => { + if let Ok(num) = num.parse() { + return Ok(TomlTok::I64(num)); + } + } + false => { + if let Ok(num) = num.parse() { + return Ok(TomlTok::U64(num)); + } + } + } + + Err(self.err_parse("tokenizer")) } } diff --git a/tests/toml.rs b/tests/toml.rs index c89d919..4b1c821 100644 --- a/tests/toml.rs +++ b/tests/toml.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "no_std")] +use hashbrown::HashMap; +#[cfg(not(feature = "no_std"))] +use std::collections::HashMap; + use nanoserde::Toml; use nanoserde::TomlParser; @@ -84,3 +89,27 @@ fn assert_specific_toml_types() { ] ); } + +#[test] +fn toml_key_chars() { + let toml_str = r#" + [foo.bar.baz] + 123abc456def = "myval" + -inf = 0 + 2024-04-30 = 100 + ½ = 0.5 + "#; + + assert_eq!( + TomlParser::parse(toml_str).unwrap(), + HashMap::from([ + ( + "foo.bar.baz.123abc456def".to_string(), + Toml::Str("myval".to_string()) + ), + ("foo.bar.baz.-inf".to_string(), Toml::Num(0.0)), + ("foo.bar.baz.2024-04-30".to_string(), Toml::Num(100.0)), + ("foo.bar.baz.½".to_string(), Toml::Num(0.5)) + ]) + ); +}