feat(parser): calculate leading and trailing position for comments (#5785)
oxc-project · Sep 16, 2024 · 8e7556f · 8e7556f
1 parent 31e9db4
commit 8e7556f
Show file tree

Hide file tree

Showing 6 changed files with 211 additions and 16 deletions.
diff --git a/crates/oxc_ast/src/lib.rs b/crates/oxc_ast/src/lib.rs
@@ -61,7 +61,7 @@ pub use crate::{
     ast_builder::AstBuilder,
     ast_builder_impl::NONE,
     ast_kind::{AstKind, AstType},
-    trivia::{Comment, CommentKind, SortedComments, Trivias},
+    trivia::{Comment, CommentKind, CommentPosition, SortedComments, Trivias},
     visit::{Visit, VisitMut},
 };
 

diff --git a/crates/oxc_ast/src/trivia.rs b/crates/oxc_ast/src/trivia.rs
@@ -14,20 +14,63 @@ pub enum CommentKind {
     Block,
 }
 
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+pub enum CommentPosition {
+    /// Comments prior to a token until another token or trailing comment.
+    ///
+    /// e.g.
+    ///
+    /// ```
+    /// /* leading */ token;
+    /// /* leading */
+    /// // leading
+    /// token;
+    /// ```
+    Leading,
+
+    /// Comments trailing a token until a newline.
+    /// e.g. `token /* trailing */ // trailing`
+    Trailing,
+}
+
 /// Single or multiline comment
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
 pub struct Comment {
     /// The span of the comment text (without leading/trailing delimiters).
     pub span: Span,
 
+    /// Line or block comment
     pub kind: CommentKind,
+
+    /// Leading or trailing comment
+    pub position: CommentPosition,
+
+    /// Start of token this leading comment is attached to.
+    /// `/* Leading */ token`
+    ///                ^ This start
+    /// NOTE: Trailing comment attachment is not computed yet.
+    pub attached_to: u32,
+
+    /// Whether this comment has a preceding newline.
+    /// Used to avoid becoming a trailing comment in codegen.
+    pub preceded_by_newline: bool,
+
+    /// Whether this comment has a trailing newline.
+    pub followed_by_newline: bool,
 }
 
 impl Comment {
     #[inline]
     pub fn new(start: u32, end: u32, kind: CommentKind) -> Self {
         let span = Span::new(start, end);
-        Self { span, kind }
+        Self {
+            span,
+            kind,
+            position: CommentPosition::Trailing,
+            attached_to: 0,
+            preceded_by_newline: false,
+            followed_by_newline: false,
+        }
     }
 
     pub fn is_line(self) -> bool {
@@ -38,6 +81,14 @@ impl Comment {
         self.kind == CommentKind::Block
     }
 
+    pub fn is_leading(self) -> bool {
+        self.position == CommentPosition::Leading
+    }
+
+    pub fn is_trailing(self) -> bool {
+        self.position == CommentPosition::Trailing
+    }
+
     pub fn real_span(&self) -> Span {
         Span::new(self.real_span_start(), self.real_span_end())
     }
@@ -55,8 +106,6 @@ impl Comment {
     }
 }
 
-impl CommentKind {}
-
 /// Sorted set of unique trivia comments, in ascending order by starting position.
 pub type SortedComments = Box<[Comment]>;
 

diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs
@@ -35,7 +35,7 @@ impl<'a> Lexer<'a> {
                 if next_byte != LS_OR_PS_FIRST {
                     // `\r` or `\n`
                     self.trivia_builder
-                        .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                        .add_line_comment(self.token.start, self.source.offset_of(pos));
                     // SAFETY: Safe to consume `\r` or `\n` as both are ASCII
                     pos = unsafe { pos.add(1) };
                     // We've found the end. Do not continue searching.
@@ -50,7 +50,7 @@ impl<'a> Lexer<'a> {
                         if matches!(next2, LS_BYTES_2_AND_3 | PS_BYTES_2_AND_3) {
                             // Irregular line break
                             self.trivia_builder
-                                .add_single_line_comment(self.token.start, self.source.offset_of(pos));
+                                .add_line_comment(self.token.start, self.source.offset_of(pos));
                             // Advance `pos` to after this char.
                             // SAFETY: `0xE2` is always 1st byte of a 3-byte UTF-8 char,
                             // so consuming 3 bytes will place `pos` on next UTF-8 char boundary.
@@ -69,7 +69,7 @@ impl<'a> Lexer<'a> {
                 }
             },
             handle_eof: {
-                self.trivia_builder.add_single_line_comment(self.token.start, self.offset());
+                self.trivia_builder.add_line_comment(self.token.start, self.offset());
                 return Kind::Skip;
             },
         };
@@ -145,7 +145,7 @@ impl<'a> Lexer<'a> {
             },
         };
 
-        self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+        self.trivia_builder.add_block_comment(self.token.start, self.offset());
         Kind::Skip
     }
 
@@ -165,7 +165,7 @@ impl<'a> Lexer<'a> {
         if let Some(index) = finder.find(remaining) {
             // SAFETY: `pos + index + 2` is end of `*/`, so a valid `SourcePosition`
             self.source.set_position(unsafe { pos.add(index + 2) });
-            self.trivia_builder.add_multi_line_comment(self.token.start, self.offset());
+            self.trivia_builder.add_block_comment(self.token.start, self.offset());
             Kind::Skip
         } else {
             self.source.advance_to_end();

diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
@@ -218,6 +218,7 @@ impl<'a> Lexer<'a> {
         self.token.end = self.offset();
         debug_assert!(self.token.start <= self.token.end);
         let token = self.token;
+        self.trivia_builder.handle_token(token.start);
         self.token = Token::default();
         token
     }

diff --git a/crates/oxc_parser/src/lexer/trivia_builder.rs b/crates/oxc_parser/src/lexer/trivia_builder.rs
@@ -1,30 +1,66 @@
-use oxc_ast::{Comment, CommentKind, Trivias};
+use oxc_ast::{Comment, CommentKind, CommentPosition, Trivias};
 use oxc_span::Span;
 
 #[derive(Debug, Default)]
 pub struct TriviaBuilder {
-    // NOTE(lucab): This is a set of unique comments. Duplicated
+    // This is a set of unique comments. Duplicated
     // comments could be generated in case of rewind; they are
     // filtered out at insertion time.
     pub(crate) comments: Vec<Comment>,
+
     irregular_whitespaces: Vec<Span>,
+
+    // states
+    /// index of processed comments
+    processed: usize,
+
+    /// Saw a newline before this position
+    saw_newline: bool,
 }
 
 impl TriviaBuilder {
     pub fn build(self) -> Trivias {
         Trivias::new(self.comments.into_boxed_slice(), self.irregular_whitespaces)
     }
 
-    pub fn add_single_line_comment(&mut self, start: u32, end: u32) {
+    pub fn add_irregular_whitespace(&mut self, start: u32, end: u32) {
+        self.irregular_whitespaces.push(Span::new(start, end));
+    }
+
+    pub fn add_line_comment(&mut self, start: u32, end: u32) {
         // skip leading `//`
         self.add_comment(Comment::new(start + 2, end, CommentKind::Line));
     }
 
-    pub fn add_multi_line_comment(&mut self, start: u32, end: u32) {
+    pub fn add_block_comment(&mut self, start: u32, end: u32) {
         // skip leading `/*` and trailing `*/`
         self.add_comment(Comment::new(start + 2, end - 2, CommentKind::Block));
     }
 
+    // For block comments only. This function is not called after line comments because the lexer skips
+    // newline after line comments.
+    pub fn handle_newline(&mut self) {
+        // The last unprocessed comment is followed by a newline.
+        let len = self.comments.len();
+        if self.processed < len {
+            self.comments[len - 1].followed_by_newline = true;
+        }
+        self.saw_newline = true;
+    }
+
+    pub fn handle_token(&mut self, token_start: u32) {
+        let len = self.comments.len();
+        if self.processed < len {
+            // All unprocessed preceding comments are leading comments attached to this token start.
+            for comment in &mut self.comments[self.processed..] {
+                comment.position = CommentPosition::Leading;
+                comment.attached_to = token_start;
+            }
+            self.processed = len;
+        }
+        self.saw_newline = false;
+    }
+
     fn add_comment(&mut self, comment: Comment) {
         // The comments array is an ordered vec, only add the comment if its not added before,
         // to avoid situations where the parser needs to rewind and tries to reinsert the comment.
@@ -33,10 +69,118 @@ impl TriviaBuilder {
                 return;
             }
         }
+
+        let mut comment = comment;
+        // This newly added comment may be preceded by a newline.
+        comment.preceded_by_newline = self.saw_newline;
+        if comment.is_line() {
+            // A line comment is always followed by a newline. This is never set in `handle_newline`.
+            comment.followed_by_newline = true;
+            // A line comment is trailing when it is not preceded by a newline.
+            if !self.saw_newline {
+                self.processed = self.comments.len() + 1; // +1 to include this comment.
+            }
+            self.saw_newline = true;
+        }
+
         self.comments.push(comment);
     }
+}
 
-    pub fn add_irregular_whitespace(&mut self, start: u32, end: u32) {
-        self.irregular_whitespaces.push(Span::new(start, end));
+#[cfg(test)]
+mod test {
+    use crate::Parser;
+    use oxc_allocator::Allocator;
+    use oxc_ast::{Comment, CommentKind, CommentPosition};
+    use oxc_span::{SourceType, Span};
+
+    #[test]
+    fn comment_attachments() {
+        let allocator = Allocator::default();
+        let source_type = SourceType::default();
+        let source_text = "
+        /* Leading 1 */
+        // Leading 2
+        /* Leading 3 */ token /* Trailing 1 */ // Trailing 2
+        // Leading of EOF token
+        ";
+        let ret = Parser::new(&allocator, source_text, source_type).parse();
+        let comments = ret.trivias.comments().copied().collect::<Vec<_>>();
+        let expected = [
+            Comment {
+                span: Span::new(11, 22),
+                kind: CommentKind::Block,
+                position: CommentPosition::Leading,
+                attached_to: 70,
+                preceded_by_newline: true,
+                followed_by_newline: true,
+            },
+            Comment {
+                span: Span::new(35, 45),
+                kind: CommentKind::Line,
+                position: CommentPosition::Leading,
+                attached_to: 70,
+                preceded_by_newline: true,
+                followed_by_newline: true,
+            },
+            Comment {
+                span: Span::new(56, 67),
+                kind: CommentKind::Block,
+                position: CommentPosition::Leading,
+                attached_to: 70,
+                preceded_by_newline: true,
+                followed_by_newline: false,
+            },
+            Comment {
+                span: Span::new(78, 90),
+                kind: CommentKind::Block,
+                position: CommentPosition::Trailing,
+                attached_to: 0,
+                preceded_by_newline: false,
+                followed_by_newline: false,
+            },
+            Comment {
+                span: Span::new(95, 106),
+                kind: CommentKind::Line,
+                position: CommentPosition::Trailing,
+                attached_to: 0,
+                preceded_by_newline: false,
+                followed_by_newline: true,
+            },
+            Comment {
+                span: Span::new(117, 138),
+                kind: CommentKind::Line,
+                position: CommentPosition::Leading,
+                attached_to: 147,
+                preceded_by_newline: true,
+                followed_by_newline: true,
+            },
+        ];
+
+        assert_eq!(comments.len(), expected.len());
+        for (comment, expected) in comments.iter().copied().zip(expected) {
+            assert_eq!(comment, expected, "{}", comment.real_span().source_text(source_text));
+        }
+    }
+
+    #[test]
+    fn comment_attachments2() {
+        let allocator = Allocator::default();
+        let source_type = SourceType::default();
+        let source_text = "#!/usr/bin/env node
+/* Leading 1 */
+token
+        ";
+        let ret = Parser::new(&allocator, source_text, source_type).parse();
+        let comments = ret.trivias.comments().copied().collect::<Vec<_>>();
+        let expected = vec![Comment {
+            span: Span::new(22, 33),
+            kind: CommentKind::Block,
+            position: CommentPosition::Leading,
+            attached_to: 36,
+            preceded_by_newline: false, // hashbang comment always ends in a newline
+            followed_by_newline: true,
+        }];
+        assert_eq!(comments, expected);
     }
 }
diff --git a/crates/oxc_parser/src/lexer/whitespace.rs b/crates/oxc_parser/src/lexer/whitespace.rs
@@ -9,6 +9,7 @@ static NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE: SafeByteMatchTable =
 impl<'a> Lexer<'a> {
     pub(super) fn line_break_handler(&mut self) -> Kind {
         self.token.is_on_new_line = true;
+        self.trivia_builder.handle_newline();
 
         // Indentation is common after a line break.
         // Consume it, along with any further line breaks.