Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change Lexing/Parsing of embedded docs to not eagerly validate #507

Merged
merged 3 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion extension/partiql-extension-visualize/src/ast_to_dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ fn lit_to_str(ast: &ast::Lit) -> String {
Lit::FloatLit(l) => l.to_string(),
Lit::DoubleLit(l) => l.to_string(),
Lit::BoolLit(l) => (if *l { "TRUE" } else { "FALSE" }).to_string(),
Lit::IonStringLit(l) => format!("`{}`", l),
Lit::EmbeddedDocLit(l) => format!("`{}`", l),
Lit::CharStringLit(l) => format!("'{}'", l),
Lit::NationalCharStringLit(l) => format!("'{}'", l),
Lit::BitStringLit(l) => format!("b'{}'", l),
Expand Down
2 changes: 1 addition & 1 deletion partiql-ast/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ pub enum Lit {
#[visit(skip)]
BoolLit(bool),
#[visit(skip)]
IonStringLit(String),
EmbeddedDocLit(String),
#[visit(skip)]
CharStringLit(String),
#[visit(skip)]
Expand Down
2 changes: 1 addition & 1 deletion partiql-ast/src/pretty.rs
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ impl PrettyDoc for Lit {
Lit::FloatLit(inner) => arena.text(inner.to_string()),
Lit::DoubleLit(inner) => arena.text(inner.to_string()),
Lit::BoolLit(inner) => arena.text(inner.to_string()),
Lit::IonStringLit(inner) => inner.pretty_doc(arena),
Lit::EmbeddedDocLit(inner) => inner.pretty_doc(arena), // TODO better pretty for embedded doc: https://github.com/partiql/partiql-lang-rust/issues/508
Lit::CharStringLit(inner) => inner.pretty_doc(arena),
Lit::NationalCharStringLit(inner) => inner.pretty_doc(arena),
Lit::BitStringLit(inner) => inner.pretty_doc(arena),
Expand Down
3 changes: 2 additions & 1 deletion partiql-logical-planner/src/lower.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1933,7 +1933,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
Lit::FloatLit(f) => Value::Real(OrderedFloat::from(f64::from(*f))),
Lit::DoubleLit(f) => Value::Real(OrderedFloat::from(*f)),
Lit::BoolLit(b) => Value::Boolean(*b),
Lit::IonStringLit(s) => parse_embedded_ion_str(s)?,
Lit::EmbeddedDocLit(s) => parse_embedded_ion_str(s)?,
Lit::CharStringLit(s) => Value::String(Box::new(s.clone())),
Lit::NationalCharStringLit(s) => Value::String(Box::new(s.clone())),
Lit::BitStringLit(_) => {
Expand Down Expand Up @@ -1978,6 +1978,7 @@ fn lit_to_value(lit: &Lit) -> Result<Value, AstTransformError> {
Ok(val)
}

// TODO
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be moved/removed in a future PR as part of this dev-ion-doc feature branch.

fn parse_embedded_ion_str(contents: &str) -> Result<Value, AstTransformError> {
fn lit_err(literal: &str, err: impl std::error::Error) -> AstTransformError {
AstTransformError::Literal {
Expand Down
1 change: 1 addition & 0 deletions partiql-parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ serde = { version = "1", features = ["derive"], optional = true }

[dev-dependencies]
criterion = "0.5"
assert_matches = "1"

[features]
default = []
Expand Down
4 changes: 2 additions & 2 deletions partiql-parser/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ pub enum LexError<'input> {
#[error("Lexing error: invalid input `{}`", .0)]
InvalidInput(Cow<'input, str>),
/// Embedded Ion value is not properly terminated.
#[error("Lexing error: unterminated ion literal")]
UnterminatedIonLiteral,
#[error("Lexing error: unterminated embedded document literal")]
UnterminatedDocLiteral,
/// Comment is not properly terminated.
#[error("Lexing error: unterminated comment")]
UnterminatedComment,
Expand Down
112 changes: 112 additions & 0 deletions partiql-parser/src/lexer/embedded_doc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
use crate::error::LexError;
use crate::lexer::SpannedResult;
use logos::{Logos, Span};
use partiql_common::syntax::line_offset_tracker::LineOffsetTracker;
use partiql_common::syntax::location::ByteOffset;

/// The result of lexing one embedded Doc string (e.g. `[{a: 1}, {b: 2}]`): either the
/// string together with its [`ByteOffset`] span relative to the lexed source, or a
/// spanned [`LexError`].
///
/// Note:
/// - The lexer only locates the embedded Doc value enclosed in backticks; it does not
///   validate the contents.
/// - The returned string *does not* include the backticks.
/// - The returned `ByteOffset` span *does* include the backticks.
type EmbeddedDocStringResult<'input> = SpannedResult<&'input str, ByteOffset, LexError<'input>>;

/// Tokens used to locate Doc literals embedded in backticks (\`).
///
/// Only newlines and backtick runs are tokenized; the contents of the literal are
/// not interpreted.
#[derive(Logos, Debug, Clone, PartialEq)]
// NOTE(review): besides newlines and backticks, the skip class below also excludes
// `/`, `*`, `'` and `"`, and no token pattern matches those characters. Presumably they
// surface as lex errors, which `EmbeddedDocLexer` discards while scanning — confirm
// whether excluding them is still intended.
#[logos(skip r#"[^/*'"`\r\n\u0085\u2028\u2029]+"#)] // skip runs of ordinary characters
enum EmbeddedDocToken {
// Newlines are emitted as tokens so that the lexer can record their position.
// For line break recommendations,
// see https://www.unicode.org/standard/reports/tr13/tr13-5.html
#[regex(r"(([\r])?[\n])|\u0085|\u2028|\u2029")]
Newline,

// An embed open/close tag is a (greedily-captured) odd number of backticks:
// one backtick followed by zero or more pairs of backticks.
#[regex(r"`(``)*")]
Embed,
}

/// A Lexer for Doc literals embedded in backticks (\`) that yields an
/// `EmbeddedDocStringResult` for the literal it finds.
///
/// The contents of the literal are not parsed or validated: the lexer only scans for the
/// closing backtick run, recording newline offsets in the [`LineOffsetTracker`] as it goes.
pub struct EmbeddedDocLexer<'input, 'tracker> {
/// The wrapped logos-generated lexer over the input text
lexer: logos::Lexer<'input, EmbeddedDocToken>,
/// Receives the offset just past each newline encountered inside a literal
tracker: &'tracker mut LineOffsetTracker,
}

impl<'input, 'tracker> EmbeddedDocLexer<'input, 'tracker> {
    /// Builds an embedded Doc lexer that scans `input` and reports newline offsets to `tracker`.
    #[inline]
    pub fn new(input: &'input str, tracker: &'tracker mut LineOffsetTracker) -> Self {
        let lexer = EmbeddedDocToken::lexer(input);
        Self { lexer, tracker }
    }

    /// Scans a single embedded Doc value quoted between backtick runs (\`) and returns it.
    ///
    /// Returns `None` when the next token is not an opening backtick run, and a spanned
    /// [`LexError::UnterminatedDocLiteral`] when the input ends before a closing run at
    /// least as long as the opening one has been seen.
    fn next_internal(&mut self) -> Option<EmbeddedDocStringResult<'input>> {
        // Anything other than an opening backtick run means there is no literal here.
        let open = match self.lexer.next() {
            Some(Ok(EmbeddedDocToken::Embed)) => self.lexer.span(),
            _ => return None,
        };
        let open_len = open.end - open.start;

        while let Some(token) = self.lexer.next() {
            match token {
                Ok(EmbeddedDocToken::Newline) => {
                    // Remember where the next line starts, then keep scanning.
                    let line_start = self.lexer.span().end;
                    self.tracker.record(line_start.into());
                }
                Ok(EmbeddedDocToken::Embed) => {
                    let close = self.lexer.span();
                    let close_len = close.end - close.start;
                    // A shorter backtick run belongs to the document's contents.
                    if close_len < open_len {
                        continue;
                    }
                    // Only `open_len` backticks of the closing run count towards the
                    // reported span; the surplus is left out of the end offset.
                    let surplus = close_len - open_len;
                    let doc_value = &self.lexer.source()[open.end..close.start];
                    let span_end = close.end - surplus;

                    return Some(Ok((open.start.into(), doc_value, span_end.into())));
                }
                Err(_) => {
                    // Everything else is part of the document; ignore it.
                }
            }
        }

        // Ran out of input without finding a matching closing run.
        let input_end = self.lexer.span().end;
        Some(Err((
            open.start.into(),
            LexError::UnterminatedDocLiteral,
            input_end.into(),
        )))
    }
}

impl<'input, 'tracker> Iterator for EmbeddedDocLexer<'input, 'tracker> {
type Item = EmbeddedDocStringResult<'input>;

#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
self.next_internal()
}
}
135 changes: 0 additions & 135 deletions partiql-parser/src/lexer/embedded_ion.rs

This file was deleted.

Loading
Loading