From 6ae75a7f582573e2b7801032e39486a854481941 Mon Sep 17 00:00:00 2001
From: Josh Pschorr
Date: Fri, 11 Oct 2024 09:51:00 -0700
Subject: [PATCH 1/3] Change Lexing/Parsing of embedded docs to not eagerly
 validate

---
 .../src/ast_to_dot.rs                    |   2 +-
 partiql-ast/src/ast.rs                   |   2 +-
 partiql-ast/src/pretty.rs                |   2 +-
 partiql-logical-planner/src/lower.rs     |   3 +-
 partiql-parser/Cargo.toml                |   1 +
 partiql-parser/src/error.rs              |   4 +-
 partiql-parser/src/lexer/embedded_doc.rs | 112 +++++++++++++++
 partiql-parser/src/lexer/embedded_ion.rs | 135 ------------------
 partiql-parser/src/lexer/mod.rs          | 106 +++++++-------
 partiql-parser/src/lexer/partiql.rs      |  49 ++++---
 partiql-parser/src/parse/mod.rs          |   6 +-
 partiql-parser/src/parse/partiql.lalrpop |  10 +-
 partiql/Cargo.toml                       |   2 +-
 13 files changed, 211 insertions(+), 223 deletions(-)
 create mode 100644 partiql-parser/src/lexer/embedded_doc.rs
 delete mode 100644 partiql-parser/src/lexer/embedded_ion.rs

diff --git a/extension/partiql-extension-visualize/src/ast_to_dot.rs b/extension/partiql-extension-visualize/src/ast_to_dot.rs
index 3c641eb9..74dd1283 100644
--- a/extension/partiql-extension-visualize/src/ast_to_dot.rs
+++ b/extension/partiql-extension-visualize/src/ast_to_dot.rs
@@ -185,7 +185,7 @@ fn lit_to_str(ast: &ast::Lit) -> String {
         Lit::FloatLit(l) => l.to_string(),
         Lit::DoubleLit(l) => l.to_string(),
         Lit::BoolLit(l) => (if *l { "TRUE" } else { "FALSE" }).to_string(),
-        Lit::IonStringLit(l) => format!("`{}`", l),
+        Lit::EmbeddedDocLit(l) => format!("`{}`", l),
         Lit::CharStringLit(l) => format!("'{}'", l),
         Lit::NationalCharStringLit(l) => format!("'{}'", l),
         Lit::BitStringLit(l) => format!("b'{}'", l),
diff --git a/partiql-ast/src/ast.rs b/partiql-ast/src/ast.rs
index 782ddeb6..b4db2608 100644
--- a/partiql-ast/src/ast.rs
+++ b/partiql-ast/src/ast.rs
@@ -444,7 +444,7 @@ pub enum Lit {
     #[visit(skip)]
     BoolLit(bool),
     #[visit(skip)]
-    IonStringLit(String),
+    EmbeddedDocLit(String),
     #[visit(skip)]
     CharStringLit(String),
     #[visit(skip)]
diff --git a/partiql-ast/src/pretty.rs b/partiql-ast/src/pretty.rs
index e7d6ba29..3f86d589 100644
--- a/partiql-ast/src/pretty.rs
+++ b/partiql-ast/src/pretty.rs
@@ -394,7 +394,7 @@ impl PrettyDoc for Lit {
             Lit::FloatLit(inner) => arena.text(inner.to_string()),
             Lit::DoubleLit(inner) => arena.text(inner.to_string()),
             Lit::BoolLit(inner) => arena.text(inner.to_string()),
-            Lit::IonStringLit(inner) => inner.pretty_doc(arena),
+            Lit::EmbeddedDocLit(inner) => inner.pretty_doc(arena), // TODO better pretty for embedded doc
             Lit::CharStringLit(inner) => inner.pretty_doc(arena),
             Lit::NationalCharStringLit(inner) => inner.pretty_doc(arena),
             Lit::BitStringLit(inner) => inner.pretty_doc(arena),
diff --git a/partiql-logical-planner/src/lower.rs b/partiql-logical-planner/src/lower.rs
index 912f1199..23a6a0bf 100644
--- a/partiql-logical-planner/src/lower.rs
+++ b/partiql-logical-planner/src/lower.rs
@@ -1933,7 +1933,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
         Lit::FloatLit(f) => Value::Real(OrderedFloat::from(f64::from(*f))),
         Lit::DoubleLit(f) => Value::Real(OrderedFloat::from(*f)),
         Lit::BoolLit(b) => Value::Boolean(*b),
-        Lit::IonStringLit(s) => parse_embedded_ion_str(s)?,
+        Lit::EmbeddedDocLit(s) => parse_embedded_ion_str(s)?,
         Lit::CharStringLit(s) => Value::String(Box::new(s.clone())),
         Lit::NationalCharStringLit(s) => Value::String(Box::new(s.clone())),
         Lit::BitStringLit(_) => {
@@ -1978,6 +1978,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
     Ok(val)
 }
 
+// TODO
 fn parse_embedded_ion_str(contents: &str) -> Result<Value, AstTransformError> {
     fn lit_err(literal: &str, err: impl std::error::Error) -> AstTransformError {
         AstTransformError::Literal {
diff --git a/partiql-parser/Cargo.toml b/partiql-parser/Cargo.toml
index 012379ad..66e68354 100644
--- a/partiql-parser/Cargo.toml
+++ b/partiql-parser/Cargo.toml
@@ -47,6 +47,7 @@ serde = { version = "1", features = ["derive"], optional = true }
 
 [dev-dependencies]
 criterion = "0.5"
+assert_matches = "1"
 
 [features]
 default = []
diff --git a/partiql-parser/src/error.rs b/partiql-parser/src/error.rs
index 5c5fad16..7760001c 100644
--- a/partiql-parser/src/error.rs
+++ b/partiql-parser/src/error.rs
@@ -23,8 +23,8 @@ pub enum LexError<'input> {
     #[error("Lexing error: invalid input `{}`", .0)]
     InvalidInput(Cow<'input, str>),
     /// Embedded Ion value is not properly terminated.
-    #[error("Lexing error: unterminated ion literal")]
-    UnterminatedIonLiteral,
+    #[error("Lexing error: unterminated embedded document literal")]
+    UnterminatedDocLiteral,
     /// Comment is not properly terminated.
     #[error("Lexing error: unterminated comment")]
     UnterminatedComment,
diff --git a/partiql-parser/src/lexer/embedded_doc.rs b/partiql-parser/src/lexer/embedded_doc.rs
new file mode 100644
index 00000000..e1337047
--- /dev/null
+++ b/partiql-parser/src/lexer/embedded_doc.rs
@@ -0,0 +1,112 @@
+use crate::error::LexError;
+use crate::lexer::SpannedResult;
+use logos::{Logos, Span};
+use partiql_common::syntax::line_offset_tracker::LineOffsetTracker;
+use partiql_common::syntax::location::ByteOffset;
+
+/// An embedded Doc string (e.g. `[{a: 1}, {b: 2}]`) with [`ByteOffset`] span
+/// relative to lexed source.
+///
+/// Note:
+/// - The lexer parses the embedded Doc value enclosed in backticks.
+/// - The returned string *does not* include the backticks
+/// - The returned `ByteOffset` span *does* include the backticks
+type EmbeddedDocStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;
+
+/// Tokens used to parse Doc literals embedded in backticks (\`)
+#[derive(Logos, Debug, Clone, PartialEq)]
+#[logos(skip r#"[^/*'"`\r\n\u0085\u2028\u2029]+"#)] // skip things that aren't newlines or backticks
+enum EmbeddedDocToken {
+    // Skip newlines, but record their position.
+    // For line break recommendations,
+    //   see https://www.unicode.org/standard/reports/tr13/tr13-5.html
+    #[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
+    Newline,
+
+    // An embed open/close tag is a (greedily-captured) odd-number of backticks
+    #[regex(r"`(``)*")]
+    Embed,
+}
+
+/// A Lexer for Doc literals embedded in backticks (\`) that returns the parsed [`EmbeddedDocString`]
+///
+/// Parses just enough Doc to make sure not to include a backtick that is inside a string or comment.
+pub struct EmbeddedDocLexer<'input, 'tracker> {
+    /// Wrap a logos-generated lexer
+    lexer: logos::Lexer<'input, EmbeddedDocToken>,
+    tracker: &'tracker mut LineOffsetTracker,
+}
+
+impl<'input, 'tracker> EmbeddedDocLexer<'input, 'tracker> {
+    /// Creates a new embedded Doc lexer over `input` text.
+    #[inline]
+    pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
+        EmbeddedDocLexer {
+            lexer: EmbeddedDocToken::lexer(input),
+            tracker,
+        }
+    }
+
+    /// Parses a single embedded Doc value, quoted between backticks (`), and returns it
+    fn next_internal(&mut self) -> Option<EmbeddedDocStringResult<'input>> {
+        let next_token = self.lexer.next();
+        match next_token {
+            Some(Ok(EmbeddedDocToken::Embed)) => {
+                let Span {
+                    start: b_start,
+                    end: b_end,
+                } = self.lexer.span();
+                let start_quote_len = b_end - b_start;
+                loop {
+                    let next_tok = self.lexer.next();
+                    match next_tok {
+                        Some(Ok(EmbeddedDocToken::Newline)) => {
+                            // track the newline, and keep accumulating
+                            self.tracker.record(self.lexer.span().end.into());
+                        }
+                        Some(Ok(EmbeddedDocToken::Embed)) => {
+                            let Span {
+                                start: e_start,
+                                end: e_end,
+                            } = self.lexer.span();
+                            let end_quote_len = e_end - e_start;
+                            if end_quote_len >= start_quote_len {
+                                let backup = end_quote_len - start_quote_len;
+                                let (str_start, str_end) =
+                                    (b_start + start_quote_len, e_end - end_quote_len);
+                                let doc_value = &self.lexer.source()[str_start..str_end];
+
+                                return Some(Ok((
+                                    b_start.into(),
+                                    doc_value,
+                                    (e_end - backup).into(),
+                                )));
+                            }
+                        }
+                        Some(_) => {
+                            // just consume all other tokens
+                        }
+                        None => {
+                            let Span { end, .. } = self.lexer.span();
+                            return Some(Err((
+                                b_start.into(),
+                                LexError::UnterminatedDocLiteral,
+                                end.into(),
+                            )));
+                        }
+                    }
+                }
+            }
+            _ => None,
+        }
+    }
+}
+
+impl<'input, 'tracker> Iterator for EmbeddedDocLexer<'input, 'tracker> {
+    type Item = EmbeddedDocStringResult<'input>;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_internal()
+    }
+}
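The pairing rule above is purely lexical: an opening tag is any odd-length run of backticks, and the lexer then consumes everything (quotes and comment markers included, since it no longer interprets them) until it reaches a backtick run at least as long as the opener. The matched string excludes the delimiters while the reported span includes them. A minimal sketch of that behavior, in the style of the lexer tests later in this patch (crate-internal API; the input here is illustrative):

    let mut tracker = LineOffsetTracker::default();
    // Three backticks open the doc, so the shorter runs inside do not close it.
    let mut lexer = EmbeddedDocLexer::new("```can contain ` and `` unescaped``` ", &mut tracker);

    let tok = lexer.next().unwrap().unwrap();
    // The payload excludes the delimiters; the 0..36 span covers them.
    assert_matches!(tok, (ByteOffset(0), doc, ByteOffset(36)) if doc == "can contain ` and `` unescaped");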
diff --git a/partiql-parser/src/lexer/embedded_ion.rs b/partiql-parser/src/lexer/embedded_ion.rs
deleted file mode 100644
index 66370052..00000000
--- a/partiql-parser/src/lexer/embedded_ion.rs
+++ /dev/null
@@ -1,135 +0,0 @@
-use crate::error::LexError;
-use crate::lexer::{CommentLexer, SpannedResult};
-use logos::{Logos, Span};
-use partiql_common::syntax::line_offset_tracker::LineOffsetTracker;
-use partiql_common::syntax::location::ByteOffset;
-
-/// An embedded Ion string (e.g. `[{a: 1}, {b: 2}]`) with [`ByteOffset`] span
-/// relative to lexed source.
-///
-/// Note:
-/// - The lexer parses the embedded ion value enclosed in backticks.
-/// - The returned string *does not* include the backticks
-/// - The returned `ByteOffset` span *does* include the backticks
-type EmbeddedIonStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;
-
-/// Tokens used to parse Ion literals embedded in backticks (\`)
-#[derive(Logos, Debug, Clone, PartialEq)]
-#[logos(skip r#"[^/*'"`\r\n\u0085\u2028\u2029]+"#)]
-enum EmbeddedIonToken {
-    // Skip newlines, but record their position.
-    // For line break recommendations,
-    //   see https://www.unicode.org/standard/reports/tr13/tr13-5.html
-    #[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
-    Newline,
-
-    #[token("`")]
-    Embed,
-
-    #[regex(r"//[^\n]*")]
-    CommentLine,
-    #[token("/*")]
-    CommentBlock,
-
-    #[regex(r#""([^"\\]|\\t|\\u|\\")*""#)]
-    String,
-    #[regex(r#"'([^'\\]|\\t|\\u|\\')*'"#)]
-    Symbol,
-    #[token("'''")]
-    LongString,
-}
-
-/// A Lexer for Ion literals embedded in backticks (\`) that returns the parsed [`EmbeddedIonString`]
-///
-/// Parses just enough Ion to make sure not to include a backtick that is inside a string or comment.
-pub struct EmbeddedIonLexer<'input, 'tracker> {
-    /// Wrap a logos-generated lexer
-    lexer: logos::Lexer<'input, EmbeddedIonToken>,
-    tracker: &'tracker mut LineOffsetTracker,
-}
-
-impl<'input, 'tracker> EmbeddedIonLexer<'input, 'tracker> {
-    /// Creates a new embedded ion lexer over `input` text.
-    #[inline]
-    pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
-        EmbeddedIonLexer {
-            lexer: EmbeddedIonToken::lexer(input),
-            tracker,
-        }
-    }
-
-    /// Parses a single embedded ion value, quoted between backticks (`), and returns it
-    fn next_internal(&mut self) -> Option<EmbeddedIonStringResult<'input>> {
-        let next_token = self.lexer.next();
-        match next_token {
-            Some(Ok(EmbeddedIonToken::Embed)) => {
-                let Span { start, .. } = self.lexer.span();
-                'ion_value: loop {
-                    let next_tok = self.lexer.next();
-                    match next_tok {
-                        Some(Ok(EmbeddedIonToken::Newline)) => {
-                            self.tracker.record(self.lexer.span().end.into());
-                        }
-                        Some(Ok(EmbeddedIonToken::Embed)) => {
-                            break 'ion_value;
-                        }
-                        Some(Ok(EmbeddedIonToken::CommentBlock)) => {
-                            let embed = self.lexer.span();
-                            let remaining = &self.lexer.source()[embed.start..];
-                            let mut comment_tracker = LineOffsetTracker::default();
-                            let mut comment_lexer =
-                                CommentLexer::new(remaining, &mut comment_tracker);
-                            match comment_lexer.next() {
-                                Some(Ok((s, _c, e))) => {
-                                    self.tracker.append(&comment_tracker, embed.start.into());
-                                    self.lexer.bump((e - s).to_usize() - embed.len());
-                                }
-                                Some(Err((s, err, e))) => {
-                                    let offset: ByteOffset = embed.start.into();
-                                    return Some(Err((s + offset, err, e + offset)));
-                                }
-                                None => unreachable!(),
-                            }
-                        }
-                        Some(Ok(EmbeddedIonToken::LongString)) => {
-                            'triple_quote: loop {
-                                let next_tok = self.lexer.next();
-                                match next_tok {
-                                    Some(Ok(EmbeddedIonToken::LongString)) => break 'triple_quote,
-                                    Some(_) => (), // just consume all other tokens
-                                    None => continue 'ion_value,
-                                }
-                            }
-                        }
-                        Some(_) => {
-                            // just consume all other tokens
-                        }
-                        None => {
-                            let Span { end, .. } = self.lexer.span();
-                            return Some(Err((
-                                start.into(),
-                                LexError::UnterminatedIonLiteral,
-                                end.into(),
-                            )));
-                        }
-                    }
-                }
-                let Span { end, .. } = self.lexer.span();
-                let (str_start, str_end) = (start + 1, end - 1);
-                let ion_value = &self.lexer.source()[str_start..str_end];
-
-                Some(Ok((start.into(), ion_value, end.into())))
-            }
-            _ => None,
-        }
-    }
-}
-
-impl<'input, 'tracker> Iterator for EmbeddedIonLexer<'input, 'tracker> {
-    type Item = EmbeddedIonStringResult<'input>;
-
-    #[inline(always)]
-    fn next(&mut self) -> Option<Self::Item> {
-        self.next_internal()
-    }
-}
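The deleted lexer above understood just enough Ion (strings, symbols, long strings, comments) for a backtick to appear safely inside them; its replacement only counts backtick runs, so a backtick in the payload now closes a single-backtick doc. The escape hatch is the odd-length delimiter rule: wrap the payload in more backticks than its longest internal run, which is why the parse test later in this series switches from ` to ``` around 'a`b'. A hypothetical helper (not part of this patch) sketching that choice:

    /// Wrap `payload` in an embedded-doc delimiter no internal backtick run can close.
    fn embed_doc(payload: &str) -> String {
        // Longest run of consecutive backticks inside the payload.
        let longest_run = payload.split(|c| c != '`').map(str::len).max().unwrap_or(0);
        // Open/close tags lex as odd-length runs, and a close must be at least as long as the open.
        let mut delim_len = longest_run + 1;
        if delim_len % 2 == 0 {
            delim_len += 1;
        }
        let delim = "`".repeat(delim_len);
        format!("{delim}{payload}{delim}")
    }

    // embed_doc("{'a': 1}") == "`{'a': 1}`"
    // embed_doc("'a`b'")    == "```'a`b'```"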
diff --git a/partiql-parser/src/lexer/mod.rs b/partiql-parser/src/lexer/mod.rs
index 7a81fefb..bc56c8d8 100644
--- a/partiql-parser/src/lexer/mod.rs
+++ b/partiql-parser/src/lexer/mod.rs
@@ -3,14 +3,14 @@ use partiql_common::syntax::location::{ByteOffset, BytePosition, ToLocated};
 use crate::error::{LexError, ParseError};
 
 mod comment;
-mod embedded_ion;
+mod embedded_doc;
 mod partiql;
 
 pub use comment::*;
-pub use embedded_ion::*;
+pub use embedded_doc::*;
 pub use partiql::*;
 
-/// A 3-tuple of (start, `Tok`, end) denoting a token and it start and end offsets.
+/// A 3-tuple of (start, `Tok`, end) denoting a token and it's start and end offsets.
 pub type Spanned<Tok, Loc> = (Loc, Tok, Loc);
 /// A [`Result`] of a [`Spanned`] token.
 pub(crate) type SpannedResult<Tok, Loc, Broke> = Result<Spanned<Tok, Loc>, Spanned<Broke, Loc>>;
@@ -72,6 +72,7 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+    use assert_matches::assert_matches;
     use partiql_common::syntax::line_offset_tracker::{LineOffsetError, LineOffsetTracker};
     use partiql_common::syntax::location::{
         CharOffset, LineAndCharPosition, LineAndColumn, LineOffset, Located, Location,
@@ -126,7 +127,7 @@ mod tests {
        let ion_value = r"    `{'input':1, 'b':1}`--comment ";
         let mut offset_tracker = LineOffsetTracker::default();
-        let ion_lexer = EmbeddedIonLexer::new(ion_value.trim(), &mut offset_tracker);
+        let ion_lexer = EmbeddedDocLexer::new(ion_value.trim(), &mut offset_tracker);
         assert_eq!(ion_lexer.into_iter().count(), 1);
         assert_eq!(offset_tracker.num_lines(), 1);
 
@@ -134,9 +135,7 @@ mod tests {
         let mut lexer = PartiqlLexer::new(ion_value, &mut offset_tracker);
 
         let tok = lexer.next().unwrap().unwrap();
-        assert!(
-            matches!(tok, (ByteOffset(5), Token::Ion(ion_str), ByteOffset(24)) if ion_str == "{'input':1, 'b':1}")
-        );
+        assert_matches!(tok, (ByteOffset(4), Token::EmbeddedDoc(ion_str), ByteOffset(25)) if ion_str == "{'input':1, 'b':1}");
 
         let tok = lexer.next().unwrap().unwrap();
         assert!(
             matches!(tok, (ByteOffset(25), Token::CommentLine(cmt_str), ByteOffset(35)) if cmt_str == "--comment ")
         );
     }
@@ -145,27 +144,47 @@ mod tests {
 
     #[test]
     fn ion() {
-        let ion_value = r#" `{'input' // comment ' "
+        let embedded_ion_doc = r#" `{'input' // comment ' "
                        :1, /* comment */
                        'b':1}` "#;
-        let mut offset_tracker = LineOffsetTracker::default();
-        let ion_lexer = EmbeddedIonLexer::new(ion_value.trim(), &mut offset_tracker);
-        assert_eq!(ion_lexer.into_iter().count(), 1);
-        assert_eq!(offset_tracker.num_lines(), 5);
-
         let mut offset_tracker = LineOffsetTracker::default();
-        let mut lexer = PartiqlLexer::new(ion_value, &mut offset_tracker);
+        let mut lexer = PartiqlLexer::new(embedded_ion_doc, &mut offset_tracker);
 
-        let tok = lexer.next().unwrap().unwrap();
-        assert!(
-            matches!(tok, (ByteOffset(2), Token::Ion(ion_str), ByteOffset(158)) if ion_str == ion_value.trim().trim_matches('`'))
-        );
+        let next_tok = lexer.next();
+        let tok = next_tok.unwrap().unwrap();
+        assert_matches!(tok, (ByteOffset(1), Token::EmbeddedDoc(ion_str), ByteOffset(159)) if ion_str == embedded_ion_doc.trim().trim_matches('`'));
         assert_eq!(offset_tracker.num_lines(), 5);
     }
 
+    #[test]
+    fn ion_5_backticks() {
+        let embedded_ion_doc = r#" `````{'input' // comment ' "
+                       :1, /*
+                       comment
+                      */
+                       'b':1}````` "#;
+        let mut offset_tracker = LineOffsetTracker::default();
+        let mut lexer = PartiqlLexer::new(embedded_ion_doc, &mut offset_tracker);
+
+        let next_tok = lexer.next();
+        let tok = next_tok.unwrap().unwrap();
+        assert_matches!(tok, (ByteOffset(1), Token::EmbeddedDoc(ion_str), ByteOffset(165)) if ion_str == embedded_ion_doc.trim().trim_matches('`'));
+        assert_eq!(offset_tracker.num_lines(), 5);
+    }
+
+    #[test]
+    fn empty_doc() {
+        let embedded_empty_doc = r#" `````` "#;
+        let mut offset_tracker = LineOffsetTracker::default();
+        let mut lexer = PartiqlLexer::new(embedded_empty_doc, &mut offset_tracker);
+
+        let next_tok = lexer.next();
+        let tok = next_tok.unwrap().unwrap();
+        assert_matches!(tok, (ByteOffset(1), Token::EmbeddedDoc(empty_str), ByteOffset(7)) if empty_str.is_empty());
+    }
+
     #[test]
     fn nested_comments() {
         let comments = r#"/*
@@ -188,14 +207,14 @@ mod tests {
         let toks: Result<Vec<_>, Spanned<LexError, ByteOffset>> = nonnested_lex.collect();
         assert!(toks.is_err());
         let error = toks.unwrap_err();
-        assert!(matches!(
+        assert_matches!(
             error,
             (
                 ByteOffset(187),
                 LexError::UnterminatedComment,
                 ByteOffset(189)
             )
-        ));
+        );
         assert_eq!(error.1.to_string(), "Lexing error: unterminated comment");
     }
comment"); } @@ -320,16 +339,16 @@ mod tests { lexer.count(); let last = offset_tracker.at(query, ByteOffset(query.len() as u32).into()); - assert!(matches!( + assert_matches!( last, Ok(LineAndCharPosition { line: LineOffset(4), char: CharOffset(10) }) - )); + ); let overflow = offset_tracker.at(query, ByteOffset(1 + query.len() as u32).into()); - assert!(matches!(overflow, Err(LineOffsetError::EndOfInput))); + assert_matches!(overflow, Err(LineOffsetError::EndOfInput)); } #[test] @@ -433,11 +452,11 @@ mod tests { error.to_string(), r"Lexing error: invalid input `#` at `(b7..b8)`" ); - assert!(matches!(error, + assert_matches!(error, ParseError::LexicalError(Located { inner: LexError::InvalidInput(s), location: Location{start: BytePosition(ByteOffset(7)), end: BytePosition(ByteOffset(8))} - }) if s == "#")); + }) if s == "#"); assert_eq!(offset_tracker.num_lines(), 1); assert_eq!( LineAndColumn::from(offset_tracker.at(query, 7.into()).unwrap()), @@ -450,27 +469,8 @@ mod tests { let query = r#" ` "fooo` "#; let mut offset_tracker = LineOffsetTracker::default(); let toks: Result, _> = PartiqlLexer::new(query, &mut offset_tracker).collect(); - assert!(toks.is_err()); - let error = toks.unwrap_err(); - - assert!(matches!( - error, - ParseError::LexicalError(Located { - inner: LexError::UnterminatedIonLiteral, - location: Location { - start: BytePosition(ByteOffset(1)), - end: BytePosition(ByteOffset(10)) - } - }) - )); - assert_eq!( - error.to_string(), - "Lexing error: unterminated ion literal at `(b1..b10)`" - ); - assert_eq!( - LineAndColumn::from(offset_tracker.at(query, BytePosition::from(1)).unwrap()), - LineAndColumn::new(1, 2).unwrap() - ); + // ion is not eagerly parsed, so unterminated ion does not cause a lex/parse error + assert!(toks.is_ok()); } #[test] @@ -480,7 +480,7 @@ mod tests { let toks: Result, _> = PartiqlLexer::new(query, &mut offset_tracker).collect(); assert!(toks.is_err()); let error = toks.unwrap_err(); - assert!(matches!( + assert_matches!( error, ParseError::LexicalError(Located { inner: LexError::UnterminatedComment, @@ -489,7 +489,7 @@ mod tests { end: BytePosition(ByteOffset(11)) } }) - )); + ); assert_eq!( error.to_string(), "Lexing error: unterminated comment at `(b1..b11)`" @@ -504,18 +504,8 @@ mod tests { fn err_unterminated_ion_comment() { let query = r" `/*12345678`"; let mut offset_tracker = LineOffsetTracker::default(); - let ion_lexer = EmbeddedIonLexer::new(query, &mut offset_tracker); + let ion_lexer = EmbeddedDocLexer::new(query, &mut offset_tracker); let toks: Result, Spanned, ByteOffset>> = ion_lexer.collect(); - assert!(toks.is_err()); - let error = toks.unwrap_err(); - assert!(matches!( - error, - (ByteOffset(2), LexError::UnterminatedComment, ByteOffset(13)) - )); - assert_eq!(error.1.to_string(), "Lexing error: unterminated comment"); - assert_eq!( - LineAndColumn::from(offset_tracker.at(query, BytePosition::from(2)).unwrap()), - LineAndColumn::new(1, 3).unwrap() - ); + assert!(toks.is_ok()); } } diff --git a/partiql-parser/src/lexer/partiql.rs b/partiql-parser/src/lexer/partiql.rs index da153e4a..bfc1f32b 100644 --- a/partiql-parser/src/lexer/partiql.rs +++ b/partiql-parser/src/lexer/partiql.rs @@ -1,5 +1,5 @@ use crate::error::LexError; -use crate::lexer::{CommentLexer, EmbeddedIonLexer, InternalLexResult, LexResult}; +use crate::lexer::{CommentLexer, EmbeddedDocLexer, InternalLexResult, LexResult}; use logos::{Logos, Span}; use partiql_common::syntax::line_offset_tracker::LineOffsetTracker; use partiql_common::syntax::location::ByteOffset; 
diff --git a/partiql-parser/src/lexer/partiql.rs b/partiql-parser/src/lexer/partiql.rs
index da153e4a..bfc1f32b 100644
--- a/partiql-parser/src/lexer/partiql.rs
+++ b/partiql-parser/src/lexer/partiql.rs
@@ -1,5 +1,5 @@
 use crate::error::LexError;
-use crate::lexer::{CommentLexer, EmbeddedIonLexer, InternalLexResult, LexResult};
+use crate::lexer::{CommentLexer, EmbeddedDocLexer, InternalLexResult, LexResult};
 use logos::{Logos, Span};
 use partiql_common::syntax::line_offset_tracker::LineOffsetTracker;
 use partiql_common::syntax::location::ByteOffset;
@@ -35,6 +35,7 @@ impl<'input, 'tracker> PartiqlLexer<'input, 'tracker> {
         Err((start.into(), err_ctor(region.into()), end.into()))
     }
 
+    #[inline(always)]
     pub fn slice(&self) -> &'input str {
         self.lexer.slice()
     }
@@ -59,7 +60,8 @@ impl<'input, 'tracker> PartiqlLexer<'input, 'tracker> {
                     continue 'next_tok;
                 }
 
-                Token::EmbeddedIonQuote => self.parse_embedded_ion(),
+                Token::EmbeddedDocQuote => self.parse_embedded_doc(),
+                Token::EmptyEmbeddedDocQuote => self.parse_empty_embedded_doc(),
 
                 Token::CommentBlockStart => self.parse_block_comment(),
 
@@ -92,20 +94,20 @@ impl<'input, 'tracker> PartiqlLexer<'input, 'tracker> {
         })
     }
 
-    /// Uses [`EmbeddedIonLexer`] to parse an embedded ion value
-    fn parse_embedded_ion(&mut self) -> Option<InternalLexResult<'input>> {
+    /// Uses [`EmbeddedDocLexer`] to parse an embedded doc value
+    fn parse_embedded_doc(&mut self) -> Option<InternalLexResult<'input>> {
         let embed = self.lexer.span();
         let remaining = &self.lexer.source()[embed.start..];
-        let mut ion_tracker = LineOffsetTracker::default();
-        let mut ion_lexer = EmbeddedIonLexer::new(remaining, &mut ion_tracker);
-        ion_lexer.next().map(|res| match res {
-            Ok((s, ion, e)) => {
+        let mut doc_tracker = LineOffsetTracker::default();
+        let mut doc_lexer = EmbeddedDocLexer::new(remaining, &mut doc_tracker);
+        doc_lexer.next().map(|res| match res {
+            Ok((s, doc, e)) => {
                 let val_len = e - s;
-                let val_start = embed.end.into(); // embed end is 1 past the starting '`'
-                let val_end = val_start + val_len - 2; // sub 2 to remove surrounding '`'
-                self.tracker.append(&ion_tracker, embed.start.into());
+                let val_start = embed.start.into(); // the doc's span includes its backtick delimiters
+                let val_end = val_start + val_len;
+                self.tracker.append(&doc_tracker, embed.start.into());
                 self.lexer.bump(val_len.to_usize() - embed.len());
-                Ok((val_start, Token::Ion(ion), val_end))
+                Ok((val_start, Token::EmbeddedDoc(doc), val_end))
             }
             Err((s, err, e)) => {
                 let offset: ByteOffset = embed.start.into();
@@ -113,6 +115,14 @@ impl<'input, 'tracker> PartiqlLexer<'input, 'tracker> {
             }
         })
     }
+
+    #[inline]
+    fn parse_empty_embedded_doc(&mut self) -> Option<InternalLexResult<'input>> {
+        let embed = self.lexer.span();
+        let mid = embed.start + ((embed.end - embed.start) / 2);
+        let doc = &self.lexer.source()[mid..mid];
+        Some(self.wrap(Token::EmbeddedDoc(doc)))
+    }
 }
 
 impl<'input, 'tracker> Iterator for PartiqlLexer<'input, 'tracker> {
@@ -241,9 +251,13 @@ pub enum Token<'input> {
                 |lex| lex.slice().trim_matches('\''))]
     String(&'input str),
 
-    #[token("`")]
-    EmbeddedIonQuote,
-    Ion(&'input str),
+    // An embed open/close tag is a (greedily-captured) odd-number of backticks
+    #[regex(r"`(``)*")]
+    EmbeddedDocQuote,
+    // An empty embedded doc is a (greedily-captured) even-number of backticks
+    #[regex(r"(``)+")]
+    EmptyEmbeddedDocQuote,
+    EmbeddedDoc(&'input str),
 
     // Keywords
     #[regex("(?i:All)")]
@@ -492,8 +506,9 @@ impl<'input> fmt::Display for Token<'input> {
             Token::ExpReal(txt) => write!(f, "<{txt}:REAL>"),
             Token::Real(txt) => write!(f, "<{txt}:REAL>"),
             Token::String(txt) => write!(f, "<{txt}:STRING>"),
-            Token::EmbeddedIonQuote => write!(f, ""),
-            Token::Ion(txt) => write!(f, "<{txt}:ION>"),
+            Token::EmbeddedDocQuote => write!(f, ""),
+            Token::EmbeddedDoc(txt) => write!(f, "<```{txt}```:DOC>"),
+            Token::EmptyEmbeddedDocQuote => write!(f, "<``:DOC>"),
             Token::All
             | Token::Asc
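With the token split above, a backtick run is disambiguated purely by its length: an odd run becomes `EmbeddedDocQuote` and hands off to `parse_embedded_doc`, while an even run is already a complete, empty doc, for which `parse_empty_embedded_doc` takes the zero-length slice at the run's midpoint. In the style of this patch's lexer tests (crate-internal API):

    let mut tracker = LineOffsetTracker::default();
    let mut lexer = PartiqlLexer::new("``", &mut tracker);

    // An even-length backtick run lexes to a single empty embedded doc;
    // the span still covers both delimiters.
    let tok = lexer.next().unwrap().unwrap();
    assert_matches!(tok, (ByteOffset(0), Token::EmbeddedDoc(doc), ByteOffset(2)) if doc.is_empty());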
diff --git a/partiql-parser/src/parse/mod.rs b/partiql-parser/src/parse/mod.rs
index 31396661..f8407a80 100644
--- a/partiql-parser/src/parse/mod.rs
+++ b/partiql-parser/src/parse/mod.rs
@@ -211,7 +211,9 @@ mod tests {
     #[test]
     fn ion() {
         parse!(r#" `[{'a':1, 'b':1}, {'a':2}, "foo"]` "#);
-        parse!(r#" `[{'a':1, 'b':1}, {'a':2}, "foo", 'a`b', "a`b", '''`s''', {{"a`b"}}]` "#);
+        parse!(
+            r#" ```[{'a':1, 'b':1}, {'a':2}, "foo", 'a`b', "a`b", '''`s''', {{"a`b"}}]``` "#
+        );
         parse!(
             r#" `{'a':1, // comment ' "
                       'b':1} ` "#
         );
@@ -798,7 +800,7 @@ mod tests {
         assert_eq!(
             err_data.errors[1],
             ParseError::LexicalError(Located {
-                inner: LexError::UnterminatedIonLiteral,
+                inner: LexError::UnterminatedDocLiteral,
                 location: Location {
                     start: BytePosition::from(1),
                     end: BytePosition::from(4),
diff --git a/partiql-parser/src/parse/partiql.lalrpop b/partiql-parser/src/parse/partiql.lalrpop
index d65ca1b0..7a9708c0 100644
--- a/partiql-parser/src/parse/partiql.lalrpop
+++ b/partiql-parser/src/parse/partiql.lalrpop
@@ -1197,7 +1197,7 @@ ExcludePathStep: ast::ExcludePathStep = {
 Literal: ast::Lit = {
     <LiteralNull>,
     <LiteralMissing>,
-    <LiteralIon>,
+    <LiteralEmbeddedDoc>,
     <LiteralString>,
     <LiteralNumber>,
 }
@@ -1250,11 +1250,13 @@ LiteralNumber: ast::Lit = {
         })
     },
 }
+
 #[inline]
-LiteralIon: ast::Lit = {
-    <ion:"Ion"> => ast::Lit::IonStringLit(ion.to_owned()),
+LiteralEmbeddedDoc: ast::Lit = {
+    <ion:"EmbeddedDoc"> => ast::Lit::EmbeddedDocLit(ion.to_owned()),
 }
+
 #[inline]
 TypeKeywordStr: &'static str = {
     "DATE" => "DATE",
@@ -1425,7 +1427,7 @@ extern {
         "Real" => lexer::Token::Real(<&'input str>),
         "ExpReal" => lexer::Token::ExpReal(<&'input str>),
         "String" => lexer::Token::String(<&'input str>),
-        "Ion" => lexer::Token::Ion(<&'input str>),
+        "EmbeddedDoc" => lexer::Token::EmbeddedDoc(<&'input str>),
 
         // Keywords
         "ALL" => lexer::Token::All,
diff --git a/partiql/Cargo.toml b/partiql/Cargo.toml
index 5cea77e3..e4ff2850 100644
--- a/partiql/Cargo.toml
+++ b/partiql/Cargo.toml
@@ -46,7 +46,7 @@ time = { version = "0.3", features = ["macros"] }
 criterion = "0.5"
 rand = "0.8"
 
-assert_matches = "1.5"
+assert_matches = "1"
 
 [[bench]]
 name = "bench_eval_multi_like"

From 52b360c476b48509cf5d48ed89259fff6fdf23d4 Mon Sep 17 00:00:00 2001
From: Josh Pschorr
Date: Fri, 18 Oct 2024 13:52:54 -0700
Subject: [PATCH 2/3] Update partiql-ast/src/pretty.rs

Co-authored-by: Arash Maymandi <27716912+am357@users.noreply.github.com>
---
 partiql-ast/src/pretty.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/partiql-ast/src/pretty.rs b/partiql-ast/src/pretty.rs
index 3f86d589..e4913149 100644
--- a/partiql-ast/src/pretty.rs
+++ b/partiql-ast/src/pretty.rs
@@ -394,7 +394,7 @@ impl PrettyDoc for Lit {
             Lit::FloatLit(inner) => arena.text(inner.to_string()),
             Lit::DoubleLit(inner) => arena.text(inner.to_string()),
             Lit::BoolLit(inner) => arena.text(inner.to_string()),
-            Lit::EmbeddedDocLit(inner) => inner.pretty_doc(arena), // TODO better pretty for embedded doc
+            Lit::EmbeddedDocLit(inner) => inner.pretty_doc(arena), // TODO better pretty for embedded doc: https://github.com/partiql/partiql-lang-rust/issues/508
             Lit::CharStringLit(inner) => inner.pretty_doc(arena),
             Lit::NationalCharStringLit(inner) => inner.pretty_doc(arena),
             Lit::BitStringLit(inner) => inner.pretty_doc(arena),
From 70d5375a8ac6ddba667e31f7d34cf1ede817ba26 Mon Sep 17 00:00:00 2001
From: Josh Pschorr
Date: Sat, 19 Oct 2024 11:55:58 -0700
Subject: [PATCH 3/3] cleanup

---
 partiql-parser/src/lexer/mod.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/partiql-parser/src/lexer/mod.rs b/partiql-parser/src/lexer/mod.rs
index bc56c8d8..f48c953d 100644
--- a/partiql-parser/src/lexer/mod.rs
+++ b/partiql-parser/src/lexer/mod.rs
@@ -10,7 +10,7 @@ pub use comment::*;
 pub use embedded_doc::*;
 pub use partiql::*;
 
-/// A 3-tuple of (start, `Tok`, end) denoting a token and it's start and end offsets.
+/// A 3-tuple of (start, `Tok`, end) denoting a token and its start and end offsets.
 pub type Spanned<Tok, Loc> = (Loc, Tok, Loc);
 /// A [`Result`] of a [`Spanned`] token.
 pub(crate) type SpannedResult<Tok, Loc, Broke> = Result<Spanned<Tok, Loc>, Spanned<Broke, Loc>>;
@@ -465,7 +465,7 @@ mod tests {
     }
 
     #[test]
-    fn err_unterminated_ion() {
+    fn unterminated_ion() {
         let query = r#" ` "fooo` "#;
         let mut offset_tracker = LineOffsetTracker::default();
         let toks: Result<Vec<_>, _> = PartiqlLexer::new(query, &mut offset_tracker).collect();
         // ion is not eagerly parsed, so unterminated ion does not cause a lex/parse error
         assert!(toks.is_ok());
@@ -501,11 +501,12 @@ mod tests {
     }
 
     #[test]
-    fn err_unterminated_ion_comment() {
+    fn unterminated_ion_comment() {
         let query = r" `/*12345678`";
         let mut offset_tracker = LineOffsetTracker::default();
         let ion_lexer = EmbeddedDocLexer::new(query, &mut offset_tracker);
         let toks: Result<Vec<_>, Spanned<LexError, ByteOffset>> = ion_lexer.collect();
+        // ion is not eagerly parsed, so unterminated ion does not cause a lex/parse error
         assert!(toks.is_ok());
     }
 }
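Taken together, the series makes backtick-delimited documents opaque at lex/parse time: the lexer only finds the matching delimiter, the grammar stores the raw text in `Lit::EmbeddedDocLit`, and content validation is deferred to consumers such as `parse_embedded_ion_str` in the logical planner (the `// TODO` in patch 1). A sketch of the observable change through the public API, assuming the `Parser` type this crate exports:

    use partiql_parser::Parser;

    fn main() {
        let parser = Parser::default();

        // Under the old lexer the embedded literal was eagerly scanned as Ion;
        // now it is captured verbatim, and only later lowering/evaluation will
        // reject content that turns out not to be a valid document.
        let res = parser.parse(r#" `[{'a':1, 'b':1}, {'a':2}]` "#);
        assert!(res.is_ok());
    }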