diff --git a/prqlc/prqlc-parser/src/lexer.rs b/prqlc/prqlc-parser/src/lexer.rs
index 48149ea821de..b73b005651ed 100644
--- a/prqlc/prqlc-parser/src/lexer.rs
+++ b/prqlc/prqlc-parser/src/lexer.rs
@@ -40,7 +40,22 @@ pub enum Token {
     // Pow, // **
     Annotate, // @
 
+    // Aesthetics only
     Comment(String),
+    /// Vec contains comments between the newline and the line wrap
+    // Currently we include the comments with the LineWrap token. This isn't
+    // ideal, but I'm not sure of an easy way of having them be separate.
+    // - The line wrap span technically includes the comments: on a newline,
+    //   we need to look ahead to _after_ the comments to see if there's a
+    //   line wrap, and exclude the newline if there is.
+    // - We can only pass one token back.
+    //
+    // Alternatives:
+    // - Post-process the stream, removing the newline prior to a line wrap.
+    //   But that requires a whole extra pass.
+    // - Change the functionality. But it's very nice to be able to comment
+    //   something out and have line-wraps still work.
+    LineWrap(Vec<Token>),
 }
 
 /// Lex chars to tokens until the end of the input
@@ -98,15 +113,9 @@ pub fn lex_token() -> impl Parser<char, TokenSpan, Error = Cheap<char>> {
         .then(quoted_string(true))
         .map(|(c, s)| Token::Interpolation(c, s));
 
-    // I think declaring this and then cloning will be more performant than
-    // calling the function on each invocation.
-    // https://github.com/zesterer/chumsky/issues/501 would allow us to avoid
-    // this, and let us split up this giant function without sacrificing
-    // performance.
-    let newline = newline();
-
     let token = choice((
-        newline.to(Token::NewLine),
+        line_wrap(),
+        newline().to(Token::NewLine),
         control_multi,
         interpolation,
         param,
@@ -133,7 +142,7 @@ pub fn lex_token() -> impl Parser<char, TokenSpan, Error = Cheap<char>> {
 }
 
 fn ignored() -> impl Parser<char, (), Error = Cheap<char>> {
-    choice((whitespace(), line_wrap())).repeated().ignored()
+    whitespace().repeated().ignored()
 }
 
 fn whitespace() -> impl Parser<char, (), Error = Cheap<char>> {
@@ -143,28 +152,24 @@ fn whitespace() -> impl Parser<char, (), Error = Cheap<char>> {
         .ignored()
 }
 
-fn line_wrap() -> impl Parser<char, (), Error = Cheap<char>> {
+fn line_wrap() -> impl Parser<char, Token, Error = Cheap<char>> {
     newline()
-        .then(
-            // We can optionally have an empty line, or a line with a comment,
-            // between the initial line and the continued line
+        .ignore_then(
             whitespace()
-                .or_not()
-                .then(comment().or_not())
-                .then(newline())
+                .repeated()
+                .ignore_then(comment())
+                .then_ignore(newline())
                 .repeated(),
         )
-        .then(whitespace().repeated())
-        .then(just('\\'))
-        .ignored()
+        .then_ignore(whitespace().repeated())
+        .then_ignore(just('\\'))
+        .map(Token::LineWrap)
 }
 
 fn comment() -> impl Parser<char, Token, Error = Cheap<char>> {
-    let comment_line = just('#').ignore_then(newline().not().repeated().collect::<String>());
-
-    comment_line
-        .chain(newline().ignore_then(comment_line).repeated())
-        .map(|lines: Vec<String>| Token::Comment(lines.join("\n")))
+    just('#')
+        .ignore_then(newline().not().repeated().collect::<String>())
+        .map(Token::Comment)
 }
 
 pub fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
@@ -522,8 +527,12 @@ impl std::fmt::Display for Token {
                 write!(f, "{c}\"{}\"", s)
             }
             Token::Comment(s) => {
-                for line in s.lines() {
-                    writeln!(f, "#{}", line)?
+                writeln!(f, "#{}", s)
+            }
+            Token::LineWrap(comments) => {
+                write!(f, "\n\\ ")?;
+                for comment in comments {
+                    write!(f, "{}", comment)?;
                 }
                 Ok(())
             }
@@ -564,25 +573,39 @@ mod test {
         assert_debug_snapshot!(TokenVec(lexer().parse(r"5 +
     \ 3 "
         ).unwrap()), @r###"
-        TokenVec (
-            0..1: Literal(Integer(5)),
-            2..3: Control('+'),
-            10..11: Literal(Integer(3)),
-        )
-        "###);
+        TokenVec (
+            0..1: Literal(Integer(5)),
+            2..3: Control('+'),
+            3..9: LineWrap([]),
+            10..11: Literal(Integer(3)),
+        )
+        "###);
 
-        // Comments get skipped over
+        // Comments are included; no newline after the comments
         assert_debug_snapshot!(TokenVec(lexer().parse(r"5 +
 # comment
    # comment with whitespace
   \ 3 "
         ).unwrap()), @r###"
-        TokenVec (
-            0..1: Literal(Integer(5)),
-            2..3: Control('+'),
-            47..48: Literal(Integer(3)),
-        )
-        "###);
+        TokenVec (
+            0..1: Literal(Integer(5)),
+            2..3: Control('+'),
+            3..46: LineWrap([Comment(" comment"), Comment(" comment with whitespace")]),
+            47..48: Literal(Integer(3)),
+        )
+        "###);
+
+        // Check display, for the test coverage (use `assert_eq` because the
+        // line-break doesn't work well with snapshots)
+        assert_eq!(
+            format!(
+                "{}",
+                Token::LineWrap(vec![Token::Comment(" a comment".to_string())])
+            ),
+            r#"
+\ # a comment
+"#
+        );
     }
 
     #[test]
@@ -620,15 +643,18 @@ mod test {
     }
 
     #[test]
-    fn test_single_line_comment() {
+    fn comment() {
+        assert_debug_snapshot!(TokenVec(lexer().parse("# comment\n# second line").unwrap()), @r###"
+        TokenVec (
+            0..9: Comment(" comment"),
+            9..10: NewLine,
+            10..23: Comment(" second line"),
+        )
+        "###);
+
         assert_display_snapshot!(Token::Comment(" This is a single-line comment".to_string()), @r###"
         # This is a single-line comment
         "###);
-        assert_display_snapshot!(Token::Comment(" This is a\n multi-line\n comment".to_string()), @r###"
-        # This is a
-        # multi-line
-        # comment
-        "###);
     }
 
     #[test]
diff --git a/prqlc/prqlc-parser/src/lib.rs b/prqlc/prqlc-parser/src/lib.rs
index 98b10b55b2f4..d783dd0f4752 100644
--- a/prqlc/prqlc-parser/src/lib.rs
+++ b/prqlc/prqlc-parser/src/lib.rs
@@ -33,14 +33,14 @@ pub fn parse_source(source: &str, source_id: u16) -> Result<Vec<Stmt>, Vec<Error>> {
-    let without_comments: Option<_> = tokens.map(|tokens| {
+    let semantic_only: Option<_> = tokens.map(|tokens| {
         tokens
             .into_iter()
-            .filter(|TokenSpan(t, _)| !matches!(t, Token::Comment(_)))
+            .filter(|TokenSpan(t, _)| !matches!(t, Token::Comment(_) | Token::LineWrap(_)))
     });
 
-    let ast = if let Some(without_comments) = without_comments {
-        let stream = prepare_stream(without_comments, source, source_id);
+    let ast = if let Some(semantic_only) = semantic_only {
+        let stream = prepare_stream(semantic_only, source, source_id);
         let (ast, parse_errors) =
             ::chumsky::Parser::parse_recovery(&stmt::source(), stream);
@@ -122,6 +122,8 @@ mod common {
     }
 }
 
+/// Convert the output of the lexer into the input of the parser. Requires
+/// supplying the original source code.
 fn prepare_stream(
     tokens: impl Iterator<Item = TokenSpan>,
     source: &str,
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__aggregation.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__aggregation.snap
index 9d462aa9e577..f2418a549028 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__aggregation.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__aggregation.snap
@@ -4,7 +4,13 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/aggregation.prql
 ---
 TokenVec (
-    0..101: Comment(" mssql:skip\n mysql:skip\n clickhouse:skip\n glaredb:skip (the string_agg function is not supported)"),
+    0..12: Comment(" mssql:skip"),
+    12..13: NewLine,
+    13..25: Comment(" mysql:skip"),
+    25..26: NewLine,
+    26..43: Comment(" clickhouse:skip"),
+    43..44: NewLine,
+    44..101: Comment(" glaredb:skip (the string_agg function is not supported)"),
     101..102: NewLine,
     102..106: Ident("from"),
     107..113: Ident("tracks"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__date_to_text.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__date_to_text.snap
index dca48d510c37..31178d57118d 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__date_to_text.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__date_to_text.snap
@@ -4,7 +4,13 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/date_to_text.prql
 ---
 TokenVec (
-    0..56: Comment(" generic:skip\n glaredb:skip\n sqlite:skip\n mssql:test"),
+    0..14: Comment(" generic:skip"),
+    14..15: NewLine,
+    15..29: Comment(" glaredb:skip"),
+    29..30: NewLine,
+    30..43: Comment(" sqlite:skip"),
+    43..44: NewLine,
+    44..56: Comment(" mssql:test"),
     56..57: NewLine,
     57..61: Ident("from"),
     62..70: Ident("invoices"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__genre_counts.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__genre_counts.snap
index cc9633f2bd63..1ef6e90b147c 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__genre_counts.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__genre_counts.snap
@@ -4,7 +4,9 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/genre_counts.prql
 ---
 TokenVec (
-    0..116: Comment(" clickhouse:skip (ClickHouse prefers aliases to column names https://github.com/PRQL/prql/issues/2827)\n mssql:test"),
+    0..103: Comment(" clickhouse:skip (ClickHouse prefers aliases to column names https://github.com/PRQL/prql/issues/2827)"),
+    103..104: NewLine,
+    104..116: Comment(" mssql:test"),
     116..117: NewLine,
     117..120: Keyword("let"),
     121..132: Ident("genre_count"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__group_sort_limit_take.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__group_sort_limit_take.snap
index 75d4d0acd188..eab866c765fe 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__group_sort_limit_take.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__group_sort_limit_take.snap
@@ -4,7 +4,9 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/group_sort_limit_take.prql
 ---
 TokenVec (
-    0..75: Comment(" Compute the 3 longest songs for each genre and sort by genre\n mssql:test"),
+    0..62: Comment(" Compute the 3 longest songs for each genre and sort by genre"),
+    62..63: NewLine,
+    63..75: Comment(" mssql:test"),
     75..76: NewLine,
     76..80: Ident("from"),
     81..87: Ident("tracks"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__loop_01.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__loop_01.snap
index d6f42f1a523f..99ba45a3fea7 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__loop_01.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__loop_01.snap
@@ -4,7 +4,9 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/loop_01.prql
 ---
 TokenVec (
-    0..161: Comment(" clickhouse:skip (DB::Exception: Syntax error)\n glaredb:skip (DataFusion does not support recursive CTEs https://github.com/apache/arrow-datafusion/issues/462)"),
+    0..47: Comment(" clickhouse:skip (DB::Exception: Syntax error)"),
+    47..48: NewLine,
+    48..161: Comment(" glaredb:skip (DataFusion does not support recursive CTEs https://github.com/apache/arrow-datafusion/issues/462)"),
     161..162: NewLine,
     162..166: Ident("from"),
     167..168: Control('['),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__math_module.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__math_module.snap
index b531d0b2d535..524cecd86b61 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__math_module.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__math_module.snap
@@ -4,7 +4,9 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/math_module.prql
 ---
 TokenVec (
-    0..81: Comment(" mssql:test\n sqlite:skip (see https://github.com/rusqlite/rusqlite/issues/1211)"),
+    0..12: Comment(" mssql:test"),
+    12..13: NewLine,
+    13..81: Comment(" sqlite:skip (see https://github.com/rusqlite/rusqlite/issues/1211)"),
     81..82: NewLine,
     82..86: Ident("from"),
     87..95: Ident("invoices"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__pipelines.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__pipelines.snap
index 29aa1ec752a4..33b2a6ac8a2c 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__pipelines.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__pipelines.snap
@@ -4,7 +4,11 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/pipelines.prql
 ---
 TokenVec (
-    0..164: Comment(" sqlite:skip (Only works on Sqlite implementations which have the extension\n installed\n https://stackoverflow.com/questions/24037982/how-to-use-regexp-in-sqlite)"),
+    0..76: Comment(" sqlite:skip (Only works on Sqlite implementations which have the extension"),
+    76..77: NewLine,
+    77..88: Comment(" installed"),
+    88..89: NewLine,
+    89..164: Comment(" https://stackoverflow.com/questions/24037982/how-to-use-regexp-in-sqlite)"),
     164..165: NewLine,
     165..166: NewLine,
     166..170: Ident("from"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__read_csv.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__read_csv.snap
index 61c9041a4ac8..06d1d753e4cb 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__read_csv.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__read_csv.snap
@@ -4,7 +4,11 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/read_csv.prql
 ---
 TokenVec (
-    0..42: Comment(" sqlite:skip\n postgres:skip\n mysql:skip"),
+    0..13: Comment(" sqlite:skip"),
+    13..14: NewLine,
+    14..29: Comment(" postgres:skip"),
+    29..30: NewLine,
+    30..42: Comment(" mysql:skip"),
     42..43: NewLine,
     43..47: Ident("from"),
     48..49: Control('('),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__switch.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__switch.snap
index 6140491b67bc..1e0d88328391 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__switch.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__switch.snap
@@ -4,7 +4,9 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/switch.prql
 ---
 TokenVec (
-    0..88: Comment(" glaredb:skip (May be a bag of String type conversion for Postgres Client)\n mssql:test"),
+    0..75: Comment(" glaredb:skip (May be a bag of String type conversion for Postgres Client)"),
+    75..76: NewLine,
+    76..88: Comment(" mssql:test"),
     88..89: NewLine,
     89..93: Ident("from"),
     94..100: Ident("tracks"),
diff --git a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__window.snap b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__window.snap
index 073b54040036..0b66793a09df 100644
--- a/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__window.snap
+++ b/prqlc/prqlc/tests/integration/snapshots/integration__queries__lex__window.snap
@@ -4,7 +4,13 @@ expression: tokens
 input_file: prqlc/prqlc/tests/integration/queries/window.prql
 ---
 TokenVec (
-    0..337: Comment(" mssql:skip Conversion(\"cannot interpret I64(Some(1)) as an i32 value\")', connection.rs:200:34\n duckdb:skip problems with DISTINCT ON (duckdb internal error: [with INPUT_TYPE = int; RESULT_TYPE = unsigned char]: Assertion `min_val <= input' failed.)\n clickhouse:skip problems with DISTINCT ON\n postgres:skip problems with DISTINCT ON"),
+    0..95: Comment(" mssql:skip Conversion(\"cannot interpret I64(Some(1)) as an i32 value\")', connection.rs:200:34"),
+    95..96: NewLine,
+    96..251: Comment(" duckdb:skip problems with DISTINCT ON (duckdb internal error: [with INPUT_TYPE = int; RESULT_TYPE = unsigned char]: Assertion `min_val <= input' failed.)"),
+    251..252: NewLine,
+    252..295: Comment(" clickhouse:skip problems with DISTINCT ON"),
+    295..296: NewLine,
+    296..337: Comment(" postgres:skip problems with DISTINCT ON"),
     337..338: NewLine,
     338..342: Ident("from"),
     343..349: Ident("tracks"),
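---
Reviewer note, not part of the patch: a minimal sketch of how downstream code is expected to treat the new token, mirroring the `semantic_only` filter in `parse_source` above. The `Token` enum here is a pared-down stand-in for the real one in lexer.rs, and `semantic_only`/`main` below are illustrative, not repo code.

```rust
// Standalone sketch: `Token` is reduced to the variants this patch touches.
#[derive(Debug, Clone, PartialEq)]
enum Token {
    NewLine,
    Ident(String),
    Comment(String),
    LineWrap(Vec<Token>),
}

// Comments and line wraps are aesthetics-only, so both are dropped before
// the token stream reaches the parser, as the patched `parse_source` does.
fn semantic_only(tokens: Vec<Token>) -> impl Iterator<Item = Token> {
    tokens
        .into_iter()
        .filter(|t| !matches!(t, Token::Comment(_) | Token::LineWrap(_)))
}

fn main() {
    let tokens = vec![
        Token::Ident("from".to_string()),
        Token::LineWrap(vec![Token::Comment(" a wrapped comment".to_string())]),
        Token::Ident("tracks".to_string()),
        Token::NewLine,
    ];
    // The line wrap is removed along with its embedded comment, so the
    // parser sees only `from`, `tracks`, and the trailing newline.
    assert_eq!(semantic_only(tokens).count(), 3);
}
```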