Skip to content

Commit

Permalink
Fix unicode escape in identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
jevancc committed Jan 27, 2021
1 parent 8937bb2 commit c98ef89
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 39 deletions.
36 changes: 36 additions & 0 deletions boa/src/syntax/ast/keyword.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends
Extends,

/// The `false` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
False,

/// The `finally` keyword.
///
/// More information:
Expand Down Expand Up @@ -301,6 +311,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new
New,

/// The `null` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-NullLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null
Null,

/// The `of` keyword.
///
/// More information:
Expand Down Expand Up @@ -369,6 +389,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/throw
Throw,

/// The `true` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
True,

/// The `try` keyword.
///
/// More information:
Expand Down Expand Up @@ -479,6 +509,7 @@ impl Keyword {
Self::Enum => "enum",
Self::Extends => "extends",
Self::Export => "export",
Self::False => "false",
Self::Finally => "finally",
Self::For => "for",
Self::Function => "function",
Expand All @@ -488,12 +519,14 @@ impl Keyword {
Self::Import => "import",
Self::Let => "let",
Self::New => "new",
Self::Null => "null",
Self::Of => "of",
Self::Return => "return",
Self::Super => "super",
Self::Switch => "switch",
Self::This => "this",
Self::Throw => "throw",
Self::True => "true",
Self::Try => "try",
Self::TypeOf => "typeof",
Self::Var => "var",
Expand Down Expand Up @@ -552,6 +585,7 @@ impl FromStr for Keyword {
"enum" => Ok(Self::Enum),
"extends" => Ok(Self::Extends),
"export" => Ok(Self::Export),
"false" => Ok(Self::False),
"finally" => Ok(Self::Finally),
"for" => Ok(Self::For),
"function" => Ok(Self::Function),
Expand All @@ -561,12 +595,14 @@ impl FromStr for Keyword {
"import" => Ok(Self::Import),
"let" => Ok(Self::Let),
"new" => Ok(Self::New),
"null" => Ok(Self::Null),
"of" => Ok(Self::Of),
"return" => Ok(Self::Return),
"super" => Ok(Self::Super),
"switch" => Ok(Self::Switch),
"this" => Ok(Self::This),
"throw" => Ok(Self::Throw),
"true" => Ok(Self::True),
"try" => Ok(Self::Try),
"typeof" => Ok(Self::TypeOf),
"var" => Ok(Self::Var),
Expand Down
2 changes: 2 additions & 0 deletions boa/src/syntax/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ where
/// predicate on the ascii char
///
/// The buffer is not incremented.
#[allow(dead_code)]
#[inline]
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
Expand Down Expand Up @@ -191,6 +192,7 @@ where
/// It also stops when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
#[allow(dead_code)]
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(u32) -> bool,
Expand Down
129 changes: 93 additions & 36 deletions boa/src/syntax/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Keyword, Position, Span},
lexer::{Token, TokenKind},
lexer::{StringLiteral, Token, TokenKind},
},
};
use boa_unicode::UnicodeProperties;
Expand Down Expand Up @@ -86,43 +86,100 @@ impl<R> Tokenizer<R> for Identifier {
{
let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");

let mut init_buf = [0u8; 4];
let mut buf = Vec::new();
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));

cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;

let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
"true" => TokenKind::BooleanLiteral(true),
"false" => TokenKind::BooleanLiteral(false),
"null" => TokenKind::NullLiteral,
slice => {
if let Ok(keyword) = slice.parse() {
if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}
TokenKind::Keyword(keyword)
} else {
if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) {
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
slice
)
.into(),
start_pos,
));
}
TokenKind::identifier(slice)
}
let (identifier_name, contains_escaped_chars) =
Self::take_identifier_name(cursor, start_pos, self.init)?;

let token_kind = if let Ok(keyword) = identifier_name.parse() {
if contains_escaped_chars {
return Err(Error::Syntax(
"unicode escaped characters are not allowed in keyword".into(),
start_pos,
));
}

if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}

match keyword {
Keyword::True => TokenKind::BooleanLiteral(true),
Keyword::False => TokenKind::BooleanLiteral(false),
Keyword::Null => TokenKind::NullLiteral,
_ => TokenKind::Keyword(keyword),
}
} else {
if cursor.strict_mode()
&& STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str())
{
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
identifier_name
)
.into(),
start_pos,
));
}
TokenKind::identifier(identifier_name.into_boxed_str())
};

Ok(Token::new(tk, Span::new(start_pos, cursor.pos())))
Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
}
}

impl Identifier {
#[inline]
fn take_identifier_name<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
init: char,
) -> Result<(String, bool), Error>
where
R: Read,
{
let mut contains_escaped_chars = false;
let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? {
let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;

if Self::is_identifier_start(ch) {
contains_escaped_chars = true;
String::from(char::try_from(ch).unwrap())
} else {
return Err(Error::Syntax("invalid identifier start".into(), start_pos));
}
} else {
// The caller guarantees that `init` is a valid identifier start
String::from(init)
};

loop {
let ch = match cursor.peek_char()? {
Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => {
let pos = cursor.pos();
let _ = cursor.next_byte();
let _ = cursor.next_byte();
let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?;

if Self::is_identifier_part(ch) {
contains_escaped_chars = true;
ch
} else {
return Err(Error::Syntax("invalid identifier part".into(), pos));
}
}
Some(ch) if Self::is_identifier_part(ch) => {
let _ = cursor.next_char()?;
ch
},
_ => break,
};

identifier_name.push(char::try_from(ch).unwrap());
}

Ok((identifier_name, contains_escaped_chars))
}
}
7 changes: 5 additions & 2 deletions boa/src/syntax/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -246,12 +246,15 @@ impl<R> Lexer<R> {
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
'\\' if self.cursor.peek()? == Some(b'u') => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
Expand Down
5 changes: 4 additions & 1 deletion boa/src/syntax/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fn check_multi_line_comment() {

#[test]
fn check_identifier() {
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}";
let mut lexer = Lexer::new(s.as_bytes());

let expected = [
Expand All @@ -85,6 +85,9 @@ fn check_identifier() {
TokenKind::identifier("Ѐ"),
TokenKind::identifier("ЀЀ"),
TokenKind::identifier("x\u{200C}\u{200D}"),
TokenKind::identifier("x"),
TokenKind::identifier("xx"),
TokenKind::identifier("xxx"),
];

expect_tokens(&mut lexer, &expected);
Expand Down

0 comments on commit c98ef89

Please sign in to comment.