Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle invalid Unicode code point in the string literals #853

Merged
merged 4 commits into from
Oct 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions boa/src/syntax/lexer/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,7 @@ where

/// Retrieves the next UTF-8 checked character.
fn next_char(&mut self) -> io::Result<Option<char>> {
if let Some(v) = self.peeked_char {
let _ = self.peeked_char.take();
if let Some(v) = self.peeked_char.take() {
return Ok(v);
}

Expand Down
145 changes: 62 additions & 83 deletions boa/src/syntax/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ use crate::{
},
};
use std::{
char::{decode_utf16, from_u32},
convert::TryFrom,
io::{self, ErrorKind, Read},
str,
};
Expand Down Expand Up @@ -57,7 +55,7 @@ impl<R> Tokenizer<R> for StringLiteral {
{
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");

let mut buf = String::new();
let mut buf: Vec<u16> = Vec::new();
loop {
let next_chr_start = cursor.pos();
let next_chr = cursor.next_char()?.ok_or_else(|| {
Expand All @@ -84,108 +82,81 @@ impl<R> Tokenizer<R> for StringLiteral {
"unterminated escape sequence in string literal",
))
})?;

if escape != '\n' {
let escaped_ch = match escape {
'n' => '\n',
'r' => '\r',
't' => '\t',
'b' => '\x08',
'f' => '\x0c',
'0' => '\0',
match escape {
'n' => buf.push('\n' as u16),
'r' => buf.push('\r' as u16),
't' => buf.push('\t' as u16),
'b' => buf.push('\x08' as u16),
'f' => buf.push('\x0c' as u16),
'0' => buf.push('\0' as u16),
'x' => {
let mut nums = [0u8; 2];
cursor.fill_bytes(&mut nums)?;
let nums = str::from_utf8(&nums).expect("non-UTF-8 bytes found");

let as_num = match u64::from_str_radix(&nums, 16) {
Ok(v) => v,
Err(_) => 0,
};
match from_u32(as_num as u32) {
Some(v) => v,
None => {
return Err(Error::syntax(
format!(
"{}: {} is not a valid Unicode scalar value",
cursor.pos(),
as_num
),
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
Razican marked this conversation as resolved.
Show resolved Hide resolved
let code_point =
u16::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Hexadecimal escape sequence",
cursor.pos(),
))
}
}
)
})?;

buf.push(code_point);
}
'u' => {
// There are 2 types of codepoints. Surragate codepoints and
// unicode codepoints. UTF-16 could be surrogate codepoints,
// "\uXXXX\uXXXX" which make up a single unicode codepoint. We will
// need to loop to make sure we catch all UTF-16 codepoints

// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is('{')? {
cursor.next_char()?.expect("{ character vanished"); // Consume the '{'.

// The biggest code point is 0x10FFFF
// TODO: use bytes for a bit better performance (using stack)
let mut code_point = String::with_capacity(6);
cursor.take_until('}', &mut code_point)?;
let mut code_point_str = String::with_capacity(6);
cursor.take_until('}', &mut code_point_str)?;

cursor.next_char()?.expect("} character vanished"); // Consume the '}'.

// We know this is a single unicode codepoint, convert to u32
let as_num =
u32::from_str_radix(&code_point, 16).map_err(|_| {
let code_point = u32::from_str_radix(&code_point_str, 16)
.map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
})?;
if as_num > 0x10_FFFF {

// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
}
char::try_from(as_num).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?
} else {
let mut codepoints: Vec<u16> = vec![];
loop {
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point = [0u8; 4];
cursor.fill_bytes(&mut code_point)?;

// Convert to u16
let as_num = match u16::from_str_radix(
str::from_utf8(&code_point)
.expect("the cursor returned invalid UTF-8"),
16,
) {
Ok(v) => v,
Err(_) => 0,
};

codepoints.push(as_num);

// Check for another UTF-16 codepoint
if cursor.next_is('\\')? && cursor.next_is('u')? {
continue;
}
break;
}
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;

// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
HalidOdat marked this conversation as resolved.
Show resolved Hide resolved
let code_point = u16::from_str_radix(code_point_str, 16)
.map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?;

// codepoints length should either be 1 (unicode codepoint) or
// 2 (surrogate codepoint). Rust's decode_utf16 will deal with
// it regardless
// TODO: do not panic with invalid code points.
decode_utf16(codepoints.iter().copied())
.next()
.expect("Could not get next codepoint")
.expect("Could not get next codepoint")
buf.push(code_point);
}
}
'\'' | '"' | '\\' => escape,
'\'' | '"' | '\\' => buf.push(escape as u16),
ch => {
let details = format!(
"invalid escape sequence `{}` at line {}, column {}",
Expand All @@ -196,15 +167,23 @@ impl<R> Tokenizer<R> for StringLiteral {
return Err(Error::syntax(details, cursor.pos()));
}
};
buf.push(escaped_ch);
}
}
next_ch => buf.push(next_ch),
next_ch => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);

buf.extend(code_point_bytes.iter());
}
}
}
}

Ok(Token::new(
TokenKind::string_literal(buf),
TokenKind::string_literal(String::from_utf16_lossy(buf.as_slice())),
Span::new(start_pos, cursor.pos()),
))
}
Expand Down