Skip to content

Commit

Permalink
fix(transformer): JSX source calculate correct column when Unicode ch…
Browse files Browse the repository at this point in the history
…ars (#3615)

Fix column number in JSX source transform, and add tests.

The column number was already correct in all cases except one: when a Unicode character with a code point above `0xFFFF` appears earlier on the same line.

Such characters are:

* 4 bytes in UTF-8.
* 2 characters in UTF-16.
* 1 `char` in Rust.

Babel (which we're trying to match) uses count of UTF-16 characters for column number, whereas we were using count of Rust `char`s.
  • Loading branch information
overlookmotel committed Jun 11, 2024
1 parent 9e8f4d6 commit 8d237c4
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 8 deletions.
3 changes: 3 additions & 0 deletions crates/oxc_transformer/src/react/jsx_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ impl<'a> ReactJsxSource<'a> {
let key = JSXAttributeName::Identifier(
self.ctx.ast.alloc(self.ctx.ast.jsx_identifier(SPAN, SOURCE.into())),
);
// TODO: We shouldn't calculate line + column from scratch each time as it's expensive.
// Build a table of byte indexes of each line's start on first usage, and save it.
// Then calculate line and column from that.
let (line, column) = get_line_column(elem.span.start, self.ctx.source_text);
let object = self.get_source_object(line, column, ctx);
let expr = self.ctx.ast.jsx_expression_container(SPAN, JSXExpression::from(object));
Expand Down
93 changes: 85 additions & 8 deletions crates/oxc_transformer/src/react/utils.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,91 @@
use ropey::Rope;

/// Get line and column from offset and source text
/// Get line and column from offset and source text.
///
/// Line number starts at 1.
/// Column number is in UTF-16 characters, and starts at 1.
///
/// This matches Babel's output.
pub fn get_line_column(offset: u32, source_text: &str) -> (usize, usize) {
let offset = offset as usize;
let rope = Rope::from_str(source_text);
let line = rope.byte_to_line(offset);
let first_char_of_line = rope.line_to_char(line);
// Original offset is byte, but Rope uses char offset
let offset = rope.byte_to_char(offset);
let column = offset - first_char_of_line;
// line and column is zero-indexed, but we want 1-indexed
(line + 1, column + 1)
// Get line number and byte offset of start of line
let line_index = rope.byte_to_line(offset);
let line_offset = rope.line_to_byte(line_index);
// Get column number
let column_index = source_text[line_offset..offset].encode_utf16().count();
// line and column are zero-indexed, but we want 1-indexed
(line_index + 1, column_index + 1)
}

/// Offset 0 of an empty source is reported as line 1, column 1.
#[test]
fn empty_file() {
    let source = "";
    let position = get_line_column(0, source);
    assert_eq!(position, (1, 1));
}

/// Offset 0 of a multi-line source is reported as line 1, column 1.
#[test]
fn first_line_start() {
    let source = "foo\nbar\n";
    let position = get_line_column(0, source);
    assert_eq!(position, (1, 1));
}

/// An offset part-way through the first line stays on line 1,
/// with the column being the offset plus one.
#[test]
fn first_line_middle() {
    let source = "blahblahblah\noops\n";
    let position = get_line_column(5, source);
    assert_eq!(position, (1, 6));
}

/// Byte 8 is the first byte after the second `\n`, i.e. the start of line 3.
#[test]
fn later_line_start() {
    let source = "foo\nbar\nblahblahblah";
    let position = get_line_column(8, source);
    assert_eq!(position, (3, 1));
}

/// Byte 12 is four bytes into line 3 (which starts at byte 8), so column 5.
#[test]
fn later_line_middle() {
    let source = "foo\nbar\nblahblahblah";
    let position = get_line_column(12, source);
    assert_eq!(position, (3, 5));
}

/// A 2-byte UTF-8 character earlier on the line counts as a single column.
#[test]
fn after_2_byte_unicode() {
    // Sanity-check the character's width in UTF-8 bytes and UTF-16 units.
    let pound = "£";
    assert_eq!(pound.len(), 2);
    assert_eq!(utf16_len(pound), 1);

    let position = get_line_column(4, "£abc");
    assert_eq!(position, (1, 4));
}

/// A 3-byte UTF-8 character earlier on the line counts as a single column.
#[test]
fn after_3_byte_unicode() {
    // Sanity-check the character's width in UTF-8 bytes and UTF-16 units.
    let letter = "अ";
    assert_eq!(letter.len(), 3);
    assert_eq!(utf16_len(letter), 1);

    let position = get_line_column(5, "अabc");
    assert_eq!(position, (1, 4));
}

/// A 4-byte UTF-8 character (2 UTF-16 units) earlier on the line counts as two columns.
#[test]
fn after_4_byte_unicode() {
    // Sanity-check the character's width in UTF-8 bytes and UTF-16 units.
    let mushroom = "🍄";
    assert_eq!(mushroom.len(), 4);
    assert_eq!(utf16_len(mushroom), 2);

    let position = get_line_column(6, "🍄abc");
    assert_eq!(position, (1, 5));
}

/// A 2-byte UTF-8 character on the previous line must not affect the column
/// on the following line.
#[test]
fn after_2_byte_unicode_on_previous_line() {
    assert_eq!("£".len(), 2);
    assert_eq!(utf16_len("£"), 1);
    // `£` occupies bytes 0-1, `\n` is byte 2, so byte 4 is the `b` on line 2.
    assert_eq!(get_line_column(4, "£\nabc"), (2, 2));
}

/// A 3-byte UTF-8 character on the previous line must not affect the column
/// on the following line.
#[test]
fn after_3_byte_unicode_on_previous_line() {
    // Sanity-check the character's width in UTF-8 bytes and UTF-16 units.
    let letter = "अ";
    assert_eq!(letter.len(), 3);
    assert_eq!(utf16_len(letter), 1);

    let position = get_line_column(5, "अ\nabc");
    assert_eq!(position, (2, 2));
}

/// A 4-byte UTF-8 character on the previous line must not affect the column
/// on the following line.
#[test]
fn after_4_byte_unicode_on_previous_line() {
    // Sanity-check the character's width in UTF-8 bytes and UTF-16 units.
    let mushroom = "🍄";
    assert_eq!(mushroom.len(), 4);
    assert_eq!(utf16_len(mushroom), 2);

    let position = get_line_column(6, "🍄\nabc");
    assert_eq!(position, (2, 2));
}

/// Test helper: number of UTF-16 code units needed to encode `s`.
#[cfg(test)]
fn utf16_len(s: &str) -> usize {
    // Sum each char's UTF-16 width (1 for BMP characters, 2 for those above `0xFFFF`).
    s.chars().map(char::len_utf16).sum()
}

0 comments on commit 8d237c4

Please sign in to comment.