Skip to content

Commit

Permalink
[parser] support unicode escape codes in identifiers
Browse files Browse the repository at this point in the history
Summary:
An //IdentifierName// can include a //UnicodeEscapeSequence//, written as `\u1234` or
`\u{12}`. Most code uses the //StringValue// of the identifier (`value`, not
`raw`); the exception is keyword checks, which use `raw` because keywords do not support escape codes.

Reviewed By: avikchaudhuri

Differential Revision: D5725308

fbshipit-source-id: 93aee2d737763b938241c6f318e54c8b69329943
  • Loading branch information
mroch authored and facebook-github-bot committed Aug 29, 2017
1 parent e706246 commit 1e4f3e5
Show file tree
Hide file tree
Showing 20 changed files with 158 additions and 188 deletions.
2 changes: 1 addition & 1 deletion src/parser/expression_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@ module Expression
Expect.token env T_PERIOD;
let meta = start_loc, "new" in
match Peek.token env with
| T_IDENTIFIER "target" ->
| T_IDENTIFIER { raw = "target"; _ } ->
let property = Parse.identifier env in
let end_loc = fst property in
Loc.btwn start_loc end_loc, Expression.(MetaProperty MetaProperty.({
Expand Down
79 changes: 71 additions & 8 deletions src/parser/lexer.ml
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,12 @@ let neg = [%sedlex.regexp? '-', Star whitespace]

let line_terminator_sequence = [%sedlex.regexp? '\n' | '\r' | "\r\n"]

(* TODO: allow \u{} *)
let js_id_start = [%sedlex.regexp? '$' | '_' | id_start]
(* Four consecutive hex_digit matches; shared by the \uXXXX escape forms. *)
let hex_quad = [%sedlex.regexp? hex_digit, hex_digit, hex_digit, hex_digit]
(* Fixed-width escape: a backslash, the letter u, then exactly four hex digits. *)
let unicode_escape = [%sedlex.regexp? "\\u", hex_quad]
(* Braced escape: a backslash, the letter u, then one or more hex digits
 * between curly braces. The number of digits is not bounded here. *)
let codepoint_escape = [%sedlex.regexp? "\\u{", Plus hex_digit, '}']
let js_id_start = [%sedlex.regexp? '$' | '_' | id_start | unicode_escape | codepoint_escape]
let js_id_continue = [%sedlex.regexp?
'$' | '_' | 0x200C | 0x200D | id_continue
'$' | '_' | 0x200C | 0x200D | id_continue | unicode_escape | codepoint_escape
]

let loc_of_offsets env start_offset end_offset =
Expand Down Expand Up @@ -291,6 +293,56 @@ let mk_num_singleton number_type raw =
let value = if neg then ~-.value else value in
T_NUMBER_SINGLETON_TYPE { kind = number_type; value; raw }

(* [decode_identifier env raw] re-lexes the raw text of an identifier and
 * returns [(env, value)], where [value] is [raw] with every \uXXXX and
 * \u{H...} escape replaced by the code point it denotes. All other characters
 * are copied through unchanged. Escapes that decode to a code point matching
 * neither js_id_start nor js_id_continue are reported through [lex_error]
 * with Parse_error.IllegalUnicodeEscape. *)
let decode_identifier =
(* Checks a decoded code point by lexing a one-element buffer holding it.
 * Returns [env] untouched when the code point matches js_id_start or
 * js_id_continue, otherwise returns the result of [lex_error].
 * NOTE(review): the check has no notion of position, so an escape in the
 * first position that only matches js_id_continue is accepted - confirm
 * whether that is intended.
 * NOTE(review): lex_error is defined elsewhere; presumably it records the
 * error in the env it returns rather than raising - confirm. *)
let assert_valid_unicode_in_identifier env loc code =
let lexbuf = Sedlexing.from_int_array [|code|] in
match%sedlex lexbuf with
| js_id_start -> env
| js_id_continue -> env
| any
| eof -> lex_error env loc Parse_error.IllegalUnicodeEscape
(* Final catch-all arm; the arms above already handle any input and eof. *)
| _ -> failwith "unreachable"
in
(* Returns the location and text of the lexeme just matched in [lexbuf].
 * [lexbuf] only covers the identifier, so its offsets are shifted by
 * [offset] before being turned into a location with loc_of_offsets. *)
let loc_and_lexeme env offset lexbuf =
let start_offset = offset + Sedlexing.lexeme_start lexbuf in
let end_offset = offset + Sedlexing.lexeme_end lexbuf in
let loc = loc_of_offsets env start_offset end_offset in
loc, lexeme lexbuf
in
(* Consumes [lexbuf] one lexeme at a time, appending decoded output to [buf],
 * and returns [(env, Buffer.contents buf)] at end of input. The escape arms
 * come before [any] so that escapes are matched as a whole. *)
let rec id_char env offset buf lexbuf =
match%sedlex lexbuf with
| unicode_escape ->
let loc, str = loc_and_lexeme env offset lexbuf in
(* Drop the two-character backslash-u prefix, leaving the four hex digits. *)
let hex = String.sub str 2 (String.length str - 2) in
let code = int_of_string ("0x"^hex) in
let env = assert_valid_unicode_in_identifier env loc code in
(* The code point is appended even when the validation above reported an
 * error. Wtf8 is defined elsewhere; presumably this writes the WTF-8
 * encoding of [code] - confirm. *)
Wtf8.add_wtf_8 buf code;
id_char env offset buf lexbuf

| codepoint_escape ->
let loc, str = loc_and_lexeme env offset lexbuf in
(* Drop the three-character prefix (backslash, u, open brace) and the
 * closing brace, leaving only the hex digits. *)
let hex = String.sub str 3 (String.length str - 4) in
(* NOTE(review): codepoint_escape allows any number of hex digits, and
 * int_of_string raises Failure when the value does not fit in an int, so a
 * very long escape may raise here instead of reporting a lex error -
 * confirm and guard if needed. *)
let code = int_of_string ("0x"^hex) in
let env = assert_valid_unicode_in_identifier env loc code in
Wtf8.add_wtf_8 buf code;
id_char env offset buf lexbuf

| eof ->
env, Buffer.contents buf

| any ->
(* Not an escape: copy the matched text through as-is. *)
let x = lexeme lexbuf in
Buffer.add_string buf x;
id_char env offset buf lexbuf

| _ -> failwith "unreachable"
in
fun env raw ->
(* Start of the current lexeme in the main lexbuf (env.lex_lb); used to
 * translate positions inside [raw] into locations. *)
let offset = Sedlexing.lexeme_start env.lex_lb in
let lexbuf = Sedlexing.Utf8.from_string raw in
(* Initial buffer capacity is the byte length of [raw]. *)
let buf = Buffer.create (String.length raw) in
id_char env offset buf lexbuf

let recover env lexbuf ~f =
let env = illegal env (loc_of_lexbuf env lexbuf) in
Sedlexing.rollback lexbuf;
Expand Down Expand Up @@ -404,7 +456,7 @@ let string_escape env lexbuf =
let code = int_of_string ("0o"^str) in (* 0o1 *)
env, str, [|code|], true

| 'u', hex_digit, hex_digit, hex_digit, hex_digit ->
| 'u', hex_quad ->
let str = lexeme lexbuf in
let hex = String.sub str 1 (String.length str - 1) in
let code = int_of_string ("0x"^hex) in
Expand Down Expand Up @@ -747,11 +799,18 @@ let token (env: Lex_env.t) lexbuf : result =
| "yield" -> Token (env, T_YIELD)

(* Identifiers *)
| js_id_start, Star js_id_continue -> Token (env, T_IDENTIFIER (lexeme lexbuf))
| js_id_start, Star js_id_continue ->
let loc = loc_of_lexbuf env lexbuf in
let raw = lexeme lexbuf in
let env, value = decode_identifier env raw in
Token (env, T_IDENTIFIER { loc; value; raw })

(* TODO: Use [Symbol.iterator] instead of @@iterator. *)
| "@@iterator" -> Token (env, T_IDENTIFIER "@@iterator")
| "@@asyncIterator" -> Token (env, T_IDENTIFIER "@@asyncIterator")
| "@@iterator"
| "@@asyncIterator" ->
let loc = loc_of_lexbuf env lexbuf in
let raw = lexeme lexbuf in
Token (env, T_IDENTIFIER { loc; value = raw; raw })

(* Syntax *)
| "{" -> Token (env, T_LCURLY)
Expand Down Expand Up @@ -1571,7 +1630,11 @@ let type_token env lexbuf =
| "void" -> Token (env, T_VOID_TYPE)

(* Identifiers *)
| js_id_start, Star js_id_continue -> Token (env, T_IDENTIFIER (lexeme lexbuf))
| js_id_start, Star js_id_continue ->
let loc = loc_of_lexbuf env lexbuf in
let raw = lexeme lexbuf in
let env, value = decode_identifier env raw in
Token (env, T_IDENTIFIER { loc; value; raw })

| "%checks" -> Token (env, T_CHECKS)
(* Syntax *)
Expand Down
27 changes: 15 additions & 12 deletions src/parser/object_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,9 @@ module Object
-> Declaration.async env
in
let generator = Declaration.generator env in
match async, generator, key env with
| false, false, (_, (Property.Identifier (_, "get") as key)) ->
match async, generator, Peek.token env with
| false, false, T_IDENTIFIER { raw = "get"; _ } ->
let _, key = key env in
begin match Peek.token env with
| T_ASSIGN
| T_COLON
Expand All @@ -163,7 +164,8 @@ module Object
| T_RCURLY -> init env start_loc key false false
| _ -> get env start_loc, []
end
| false, false, (_, (Property.Identifier (_, "set") as key)) ->
| false, false, T_IDENTIFIER { raw = "set"; _ } ->
let _, key = key env in
begin match Peek.token env with
| T_ASSIGN
| T_COLON
Expand All @@ -173,7 +175,8 @@ module Object
| T_RCURLY -> init env start_loc key false false
| _ -> set env start_loc, []
end
| async, generator, (_, key) ->
| async, generator, _ ->
let _, key = key env in
init env start_loc key async generator
end

Expand Down Expand Up @@ -563,7 +566,7 @@ module Object
decorators;
})))

in fun env -> Ast.Expression.Object.Property.(
in fun env ->
let start_loc = Peek.loc env in
let decorators = decorator_list env in
let static =
Expand All @@ -580,9 +583,9 @@ module Object
| false, Some _ -> Declaration.generator env
| _ -> generator
in
match (async, generator, key ~class_body:true env) with
| false, false,
(_, (Identifier (_, "get") as key)) ->
match async, generator, Peek.token env with
| false, false, T_IDENTIFIER { raw = "get"; _ } ->
let _, key = key ~class_body:true env in
(match Peek.token env with
| T_LESS_THAN
| T_COLON
Expand All @@ -593,8 +596,8 @@ module Object
| _ ->
error_unsupported_variance env variance;
get env start_loc decorators static)
| false, false,
(_, (Identifier (_, "set") as key)) ->
| false, false, T_IDENTIFIER { raw = "set"; _ } ->
let _, key = key ~class_body:true env in
(match Peek.token env with
| T_LESS_THAN
| T_COLON
Expand All @@ -605,9 +608,9 @@ module Object
| _ ->
error_unsupported_variance env variance;
set env start_loc decorators static)
| _, _, (_, key) ->
| _, _, _ ->
let _, key = key ~class_body:true env in
init env start_loc decorators key async generator static variance
)

let class_declaration env decorators =
(* 10.2.1 says all parts of a class definition are strict *)
Expand Down
2 changes: 2 additions & 0 deletions src/parser/parse_error.ml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type t =
| IllegalContinue
| IllegalBreak
| IllegalReturn
| IllegalUnicodeEscape
| StrictModeWith
| StrictCatchVariable
| StrictVarName
Expand Down Expand Up @@ -154,6 +155,7 @@ module PP =
| IllegalContinue -> "Illegal continue statement"
| IllegalBreak -> "Illegal break statement"
| IllegalReturn -> "Illegal return statement"
| IllegalUnicodeEscape -> "Illegal Unicode escape"
| StrictModeWith -> "Strict mode code may not include a with statement"
| StrictCatchVariable -> "Catch variable may not be eval or arguments in strict mode"
| StrictVarName -> "Variable name may not be eval or arguments in strict mode"
Expand Down
2 changes: 1 addition & 1 deletion src/parser/parser_common.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ let identifier_name env =
let loc = Peek.loc env in
let name = match Peek.token env with
(* obviously, Identifier is a valid IdentifierName *)
| T_IDENTIFIER id -> id
| T_IDENTIFIER { value; _ } -> value
(* keywords are also IdentifierNames *)
| T_AWAIT -> "await"
| T_BREAK -> "break"
Expand Down
13 changes: 10 additions & 3 deletions src/parser/parser_env.ml
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ let is_future_reserved = function
| _ -> false

let token_is_future_reserved = Token.(function
| T_IDENTIFIER name when is_future_reserved name -> true
| T_IDENTIFIER { raw; _ } when is_future_reserved raw -> true
| T_ENUM -> true
| _ -> false
)
Expand All @@ -420,7 +420,7 @@ let is_strict_reserved = function
| _ -> false

let token_is_strict_reserved = Token.(function
| T_IDENTIFIER name when is_strict_reserved name -> true
| T_IDENTIFIER { raw; _ } when is_strict_reserved raw -> true
| T_INTERFACE
| T_IMPLEMENTS
| T_PACKAGE
Expand All @@ -441,7 +441,7 @@ let is_restricted = function
| _ -> false

let token_is_restricted = Token.(function
| T_IDENTIFIER name when is_restricted name -> true
| T_IDENTIFIER { raw; _ } when is_restricted raw -> true
| _ -> false
)

Expand Down Expand Up @@ -629,6 +629,13 @@ module Expect = struct
if Peek.token env <> t then error_unexpected env;
Eat.token env

(* [identifier env name] checks that the next token is a T_IDENTIFIER whose
 * [raw] field equals [name], calling [error_unexpected] if it is not. The
 * next token is eaten in either case. The comparison uses [raw], not the
 * decoded [value]. *)
let identifier env name =
  let next_is_expected =
    match Peek.token env with
    | Token.T_IDENTIFIER { raw; _ } -> raw = name
    | _ -> false
  in
  if not next_is_expected then error_unexpected env;
  Eat.token env

(* If the next token is t, then eat it and return true
* else return false *)
let maybe env t =
Expand Down
1 change: 1 addition & 0 deletions src/parser/parser_env.mli
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ end

(** Helpers that check the upcoming token and then consume it. *)
module Expect : sig
(** [token env t] reports an unexpected-token error when the next token is
    not [t], then eats the next token either way. *)
val token : env -> Token.t -> unit
(** [identifier env name] reports an unexpected-token error unless the next
    token is an identifier whose [raw] text equals [name], then eats the next
    token either way. *)
val identifier : env -> string -> unit
(** [maybe env t] eats the next token and returns [true] if it is [t];
    otherwise it returns [false]. *)
val maybe : env -> Token.t -> bool
end

Expand Down
Loading

0 comments on commit 1e4f3e5

Please sign in to comment.