diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index d80ce9a1fb60..58dfa7578c6a 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -22,6 +22,7 @@ #include "rust-linemap.h" #include "rust-session-manager.h" #include "safe-ctype.h" +#include "cpplib.h" namespace Rust { // TODO: move to separate compilation unit? @@ -103,11 +104,17 @@ check_valid_float_dot_end (char character) return character != '.' && character != '_' && !ISALPHA (character); } -// ISSPACE from safe-ctype but may change in future bool -is_whitespace (char character) +is_whitespace (int character) { - return ISSPACE (character); + // https://doc.rust-lang.org/reference/whitespace.html + return character == '\t' || character == '\n' || character == '\v' + || character == '\f' || character == '\r' || character == ' ' + || character == 0x0085 // next line + || character == 0x200e // left-to-right mark + || character == 0x200f // right-to-left mark + || character == 0x2028 // line separator + || character == 0x2029; // paragraph separator } bool @@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character) return character == 'x' || character == 'o' || character == 'b'; } +bool +is_identifier_start (int codepoint) +{ + return (check_xid_property (codepoint) & XID_START) || codepoint == '_'; +} + +bool +is_identifier_continue (int codepoint) +{ + return check_xid_property (codepoint) & XID_CONTINUE; +} + Lexer::Lexer (const std::string &input) : input (RAIIFile::create_error ()), current_line (1), current_column (1), line_map (nullptr), dump_lex_out (Optional::none ()), @@ -283,22 +302,22 @@ Lexer::build_token () while (true) { Location loc = get_current_location (); - current_char = peek_input (); - skip_input (); // detect UTF8 bom // // Must be the first thing on the first line. // There might be an optional BOM (Byte Order Mark), which for UTF-8 is // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. 
- if (current_line == 1 && current_column == 1 && current_char == 0xef - && peek_input () == 0xbb && peek_input (1) == 0xbf) + if (current_line == 1 && current_column == 1 && peek_input () == 0xef + && peek_input (1) == 0xbb && peek_input (2) == 0xbf) { - skip_input (1); - current_char = peek_input (); - skip_input (); + skip_input (2); } + current_char = peek_input (); + current_char32 = peek_codepoint_input (); + skip_codepoint_input (); + // detect shebang // Must be the first thing on the first line, starting with #! // But since an attribute can also start with an #! we don't count it as a @@ -311,6 +330,7 @@ Lexer::build_token () int n = 1; while (true) { + // TODO use utf-8 codepoint to skip whitespaces int next_char = peek_input (n); if (is_whitespace (next_char)) n++; @@ -1051,7 +1071,8 @@ Lexer::build_token () int peek = peek_input (); int peek1 = peek_input (1); - if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) + // TODO (tamaron) parse Unicode ident + if (peek == '#' && is_identifier_start (peek1)) { TokenPtr raw_ident_ptr = parse_raw_identifier (loc); if (raw_ident_ptr != nullptr) @@ -1068,8 +1089,8 @@ Lexer::build_token () } } - // find identifiers and keywords - if (ISALPHA (current_char) || current_char == '_') + // find identifiers and keywords. + if (is_identifier_start (current_char32.value)) return parse_identifier_or_keyword (loc); // int and float literals @@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue () int additional_length_offset = 1; // string continue + // TODO use utf-8 codepoint to skip whitespaces while (is_whitespace (current_char)) { if (current_char == '\n') @@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape () // wrong bracketm whitespace or single/double quotes are wrong // termination, otherwise it is a wrong character, then skip to the actual // terminator. 
+ // TODO use utf-8 codepoint to skip whitespaces if (current_char == '{' || is_whitespace (current_char) || current_char == '\'' || current_char == '"') { @@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape () rust_error_at (get_current_location (), "invalid character %<%c%> in unicode escape", current_char); + // TODO use utf-8 codepoint to skip whitespaces while (current_char != '}' && current_char != '{' && !is_whitespace (current_char) && current_char != '\'' && current_char != '"') @@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc) int length = 0; current_char = peek_input (); // loop through entire name - while (ISALPHA (current_char) || ISDIGIT (current_char) - || current_char == '_') + while (is_identifier_continue (current_char)) { length++; @@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc) { std::string str; str.reserve (16); // default - str += current_char; + str += current_char32.as_string (); bool first_is_underscore = current_char == '_'; int length = 1; - current_char = peek_input (); + current_char32 = peek_codepoint_input (); + // loop through entire name - while (ISALPHA (current_char) || ISDIGIT (current_char) - || current_char == '_') + while (is_identifier_continue (current_char32.value)) { + auto s = current_char32.as_string (); length++; - str += current_char; - skip_input (); - current_char = peek_input (); + str += s; + skip_codepoint_input (); + current_char32 = peek_codepoint_input (); } current_column += length; @@ -2443,21 +2467,19 @@ Lexer::parse_char_or_lifetime (Location loc) return Token::make_char (loc, current_char32); } - else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value) - || current_char32.value == '_') + else if (is_identifier_start (current_char32.value)) { // parse lifetime name std::string str; str += current_char32; length++; - current_char = peek_input (); - while (ISDIGIT (current_char) || ISALPHA (current_char) - || 
current_char == '_') + current_char32 = peek_codepoint_input (); + while (is_identifier_continue (current_char32.value)) { - str += current_char; - skip_input (); - current_char = peek_input (); + str += current_char32; + skip_codepoint_input (); + current_char32 = peek_codepoint_input (); length++; } @@ -2465,6 +2487,9 @@ Lexer::parse_char_or_lifetime (Location loc) loc += length - 1; + // TODO some keywords cannot be used for a lifetime label + // https://doc.rust-lang.org/reference/tokens.html#lifetimes-and-loop-labels + str.shrink_to_fit (); return Token::make_lifetime (loc, std::move (str)); } @@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input () void Lexer::skip_codepoint_input () { + if (peek_input () == EOF) + return; int toSkip = get_input_codepoint_length (); gcc_assert (toSkip >= 1); diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index 14008397154a..837e95b5282b 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -118,9 +118,9 @@ class Lexer // Advances current input char to n + 1 chars ahead of current position. void skip_input (int n); - // Returns char n chars ahead of current position. - int peek_input (); // Peeks the current char. + int peek_input (); + // Returns char n bytes ahead of current position. int peek_input (int n); // Classifies keyword (i.e. gets id for keyword). @@ -140,6 +140,7 @@ class Lexer int get_input_codepoint_length (); int test_get_input_codepoint_n_length (int n_start_offset); + // Peeks the current utf-8 char Codepoint peek_codepoint_input (); Codepoint test_peek_codepoint_input (int n); void skip_codepoint_input (); @@ -220,6 +221,7 @@ class Lexer int current_column; // Current character. int current_char; + Codepoint current_char32; // Line map. 
Linemap *line_map; diff --git a/libcpp/charset.cc b/libcpp/charset.cc index d7f323b2cd52..6edbf24283a4 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -1256,6 +1256,46 @@ _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name) return result; } +/* Returns a bitmask of XID_START and XID_CONTINUE describing which + identifier properties the codepoint C has, or 0 if it has neither. */ +unsigned int +check_xid_property (cppchar_t c) +{ + // fast path for ASCII + if (c < 0x80) + { + if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')) + return XID_START | XID_CONTINUE; + if (('0' <= c && c <= '9') || c == '_') + return XID_CONTINUE; + // no other ASCII character is an identifier character; skip the table search + return 0; + } + + if (c > UCS_LIMIT) + return 0; + + int mn, mx, md; + mn = 0; + mx = ARRAY_SIZE (ucnranges) - 1; + // binary search for the first range whose end is not below c + while (mx != mn) + { + md = (mn + mx) / 2; + if (c <= ucnranges[md].end) + mx = md; + else + mn = md + 1; + } + + unsigned short flags = ucnranges[mn].flags; + + if (flags & CXX23) + return XID_START | XID_CONTINUE; + if (flags & NXX23) + return XID_CONTINUE; + return 0; +} /* Returns 1 if C is valid in an identifier, 2 if C is valid except at the start of an identifier, and 0 if C is not valid in an diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index a6f0abd894c2..6779bb0d58b2 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1602,4 +1602,11 @@ bool cpp_input_conversion_is_trivial (const char *input_charset); int cpp_check_utf8_bom (const char *data, size_t data_length); bool cpp_valid_utf8_p (const char *data, size_t num_bytes); +enum { + XID_START = 1, + XID_CONTINUE = 2 +}; + +unsigned int check_xid_property (cppchar_t c); + #endif /* ! LIBCPP_CPPLIB_H */