fix: Handle multi-byte utf8 characters in formatter #6118

Merged (9 commits) on Sep 24, 2024
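For context on the failure this PR addresses: the formatter slices source text by raw byte offsets, and in Rust, slicing a &str with an end index that falls inside a multi-byte UTF-8 character panics. Below is a minimal standalone sketch of the failure mode and of the boundary-clamping approach taken here (illustrative only, not code from this diff):

fn main() {
    // "🙂" is a single char but occupies four bytes in UTF-8.
    let source = "// 🙂 comment";
    assert_eq!(source.len(), 15); // len() counts bytes, not chars

    // `&source[0..4]` would panic: byte 4 falls inside the emoji.
    // Rounding the end index up to the next char boundary avoids the panic,
    // which is what the formatter's new ceil_char_boundary helper does.
    let mut end = 4;
    while end < source.len() && !source.is_char_boundary(end) {
        end += 1;
    }
    assert_eq!(&source[0..end], "// 🙂");
}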
6 changes: 6 additions & 0 deletions compiler/noirc_frontend/src/lexer/errors.rs
@@ -34,6 +34,8 @@ pub enum LexerErrorKind {
InvalidEscape { escaped: char, span: Span },
#[error("Invalid quote delimiter `{delimiter}`, valid delimiters are `{{`, `[`, and `(`")]
InvalidQuoteDelimiter { delimiter: SpannedToken },
#[error("Non-ASCII characters are invalid in comments")]
NonAsciiComment { span: Span },
#[error("Expected `{end_delim}` to close this {start_delim}")]
UnclosedQuote { start_delim: SpannedToken, end_delim: Token },
}
@@ -65,6 +67,7 @@ impl LexerErrorKind {
LexerErrorKind::UnterminatedStringLiteral { span } => *span,
LexerErrorKind::InvalidEscape { span, .. } => *span,
LexerErrorKind::InvalidQuoteDelimiter { delimiter } => delimiter.to_span(),
LexerErrorKind::NonAsciiComment { span, .. } => *span,
LexerErrorKind::UnclosedQuote { start_delim, .. } => start_delim.to_span(),
}
}
@@ -124,6 +127,9 @@ impl LexerErrorKind {
LexerErrorKind::InvalidQuoteDelimiter { delimiter } => {
(format!("Invalid quote delimiter `{delimiter}`"), "Valid delimiters are `{`, `[`, and `(`".to_string(), delimiter.to_span())
},
LexerErrorKind::NonAsciiComment { span } => {
("Non-ASCII character in comment".to_string(), "Invalid comment character: only ASCII is currently supported.".to_string(), *span)
}
LexerErrorKind::UnclosedQuote { start_delim, end_delim } => {
("Unclosed `quote` expression".to_string(), format!("Expected a `{end_delim}` to close this `{start_delim}`"), start_delim.to_span())
}
24 changes: 24 additions & 0 deletions compiler/noirc_frontend/src/lexer/lexer.rs
@@ -18,7 +18,7 @@
position: Position,
done: bool,
skip_comments: bool,
skip_whitespaces: bool,
max_integer: BigInt,
}

@@ -46,8 +46,8 @@
position: 0,
done: false,
skip_comments: true,
skip_whitespaces: true,
max_integer: BigInt::from_biguint(num_bigint::Sign::Plus, FieldElement::modulus())
    - BigInt::one(),
}
}
@@ -57,8 +57,8 @@
self
}

pub fn skip_whitespaces(mut self, flag: bool) -> Self {
self.skip_whitespaces = flag;
self
}

@@ -606,6 +606,11 @@
};
let comment = self.eat_while(None, |ch| ch != '\n');

if !comment.is_ascii() {
let span = Span::from(start..self.position);
return Err(LexerErrorKind::NonAsciiComment { span });
}

if doc_style.is_none() && self.skip_comments {
return self.next_token();
}
@@ -651,6 +656,11 @@
}

if depth == 0 {
if !content.is_ascii() {
let span = Span::from(start..self.position);
return Err(LexerErrorKind::NonAsciiComment { span });
}

if doc_style.is_none() && self.skip_comments {
return self.next_token();
}
@@ -1331,6 +1341,7 @@

Err(LexerErrorKind::InvalidIntegerLiteral { .. })
| Err(LexerErrorKind::UnexpectedCharacter { .. })
| Err(LexerErrorKind::NonAsciiComment { .. })
| Err(LexerErrorKind::UnterminatedBlockComment { .. }) => {
expected_token_found = true;
}
@@ -1389,4 +1400,17 @@
}
}
}

#[test]
fn test_non_ascii_comments() {
let cases = vec!["// 🙂", "// schön", "/* in the middle 🙂 of a comment */"];

for source in cases {
let mut lexer = Lexer::new(source);
assert!(
lexer.any(|token| matches!(token, Err(LexerErrorKind::NonAsciiComment { .. }))),
"Expected NonAsciiComment error"
);
}
}
}
17 changes: 15 additions & 2 deletions tooling/nargo_fmt/src/visitor.rs
@@ -36,7 +36,7 @@ impl<'me> FmtVisitor<'me> {

pub(crate) fn slice(&self, span: impl Into<Span>) -> &'me str {
let span = span.into();
&self.source[span.start() as usize..span.end() as usize]
str_slice(self.source, span.start() as usize, span.end() as usize)
}

pub(crate) fn span_after(&self, span: impl Into<Span>, token: Token) -> Span {
@@ -188,7 +188,7 @@ impl<'me> FmtVisitor<'me> {

match comment.token() {
Token::LineComment(_, _) | Token::BlockComment(_, _) => {
let comment = &slice[span.start() as usize..span.end() as usize];
let comment = str_slice(slice, span.start() as usize, span.end() as usize);
if result.ends_with('\n') {
result.push_str(&indent);
} else if !self.at_start() {
@@ -247,6 +247,19 @@ impl<'me> FmtVisitor<'me> {
}
}

pub(crate) fn str_slice(s: &str, start: usize, end: usize) -> &str {
&s[start..ceil_char_boundary(s, end)]
}

pub(crate) fn ceil_char_boundary(s: &str, byte_index: usize) -> usize {
for i in byte_index..s.len() {
if s.is_char_boundary(i) {
return i;
}
}
s.len()
}

#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct Indent {
block_indent: usize,
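To show how the new visitor.rs helpers behave, here is a small illustrative check of the clamping semantics. It assumes the str_slice and ceil_char_boundary functions from the diff above are in scope; it is a sketch, not a test from this PR:

#[test]
fn str_slice_clamps_end_to_char_boundary() {
    // "ö" occupies two bytes; byte index 20 lands in the middle of it.
    let s = "fn main() {} // schön";
    assert!(!s.is_char_boundary(20));

    // The end index is rounded up to the next boundary (byte 21),
    // so the slice keeps the whole "ö" instead of panicking.
    assert_eq!(ceil_char_boundary(s, 20), 21);
    assert_eq!(str_slice(s, 0, 20), "fn main() {} // schö");

    // An index at or past the end of the string clamps to s.len().
    assert_eq!(ceil_char_boundary(s, 100), s.len());
}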