Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenize Unicode identifiers #2284

Merged
merged 2 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 57 additions & 30 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "rust-linemap.h"
#include "rust-session-manager.h"
#include "safe-ctype.h"
#include "cpplib.h"

namespace Rust {
// TODO: move to separate compilation unit?
Expand Down Expand Up @@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
return character != '.' && character != '_' && !ISALPHA (character);
}

// ISSPACE from safe-ctype but may change in future
bool
is_whitespace (char character)
is_whitespace (int character)
{
return ISSPACE (character);
// https://doc.rust-lang.org/reference/whitespace.html
return character == '\t' || character == '\n' || character == '\v'
|| character == '\f' || character == '\r' || character == ' '
|| character == 0x0085 // next line
|| character == 0x200e // left-to-right mark
|| character == 0x200f // right-to-left mark
|| character == 0x2028 // line separator
Comment on lines 107 to +116
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some codepoints of whitespaces.
But non-ascii whitespaces are not actually checked during tokenization because this func is called with argument whose type is char (1 byte)

|| character == 0x2029; // paragraph separator
Comment on lines +113 to +117
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all of those characters accepted by rustc as whitespace?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. All of these values are defined in the Rust ref.
You can find URL to this just before the selected lines.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I missed this! Sorry! Thanks for pointing it out haha

}

bool
Expand All @@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
return character == 'x' || character == 'o' || character == 'b';
}

bool
is_identifier_start (int codepoint)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to unify with the Codepoint alias in rust-codepoint.h ? Also what about specifying an explicit size (eg. std::uint32_t, maybe even wchar_t ?) ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think uint32_t is better. If we unify the types for parameters of such functions, several other functions should also use the same type.
e.g. is_x_digit, is_octal_digit, etc.

bool
is_x_digit (char number)
{
return ISXDIGIT (number);
}

{
return (check_xid_property (codepoint) & XID_START) || codepoint == '_';
}

// Reports whether CODEPOINT carries the XID_CONTINUE flag in the result of
// check_xid_property.
bool
is_identifier_continue (int codepoint)
{
  const unsigned int properties = check_xid_property (codepoint);
  return (properties & XID_CONTINUE) != 0;
}

Lexer::Lexer (const std::string &input)
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
Expand Down Expand Up @@ -283,22 +302,22 @@ Lexer::build_token ()
while (true)
{
Location loc = get_current_location ();
current_char = peek_input ();
skip_input ();

// detect UTF8 bom
//
// Must be the first thing on the first line.
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
if (current_line == 1 && current_column == 1 && current_char == 0xef
&& peek_input () == 0xbb && peek_input (1) == 0xbf)
if (current_line == 1 && current_column == 1 && peek_input () == 0xef
&& peek_input (1) == 0xbb && peek_input (2) == 0xbf)
{
skip_input (1);
current_char = peek_input ();
skip_input ();
skip_input (2);
}

current_char = peek_input ();
current_char32 = peek_codepoint_input ();
skip_codepoint_input ();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we skipping the codepoint input here but not the char?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we skip one byte here, only the first byte of current utf-8 character can be skipped by the lexer, which we do not expect.
For example, if the lexer tokenizes identifier あああ , it should skip the first utf8 character あ, not its first byte.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thank you!


// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
Expand All @@ -311,6 +330,7 @@ Lexer::build_token ()
int n = 1;
while (true)
{
// TODO use utf-8 codepoint to skip whitespaces
int next_char = peek_input (n);
if (is_whitespace (next_char))
n++;
Expand Down Expand Up @@ -1051,7 +1071,8 @@ Lexer::build_token ()
int peek = peek_input ();
int peek1 = peek_input (1);

if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
// TODO (tamaron) parse Unicode ident
if (peek == '#' && is_identifier_start (peek1))
{
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
if (raw_ident_ptr != nullptr)
Expand All @@ -1068,8 +1089,8 @@ Lexer::build_token ()
}
}

// find identifiers and keywords
if (ISALPHA (current_char) || current_char == '_')
// find identifiers and keywords.
if (is_identifier_start (current_char32.value))
return parse_identifier_or_keyword (loc);

// int and float literals
Expand Down Expand Up @@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue ()
int additional_length_offset = 1;

// string continue
// TODO use utf-8 codepoint to skip whitespaces
while (is_whitespace (current_char))
{
if (current_char == '\n')
Expand Down Expand Up @@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape ()
// wrong bracket, whitespace or single/double quotes are wrong
// termination, otherwise it is a wrong character, then skip to the actual
// terminator.
// TODO use utf-8 codepoint to skip whitespaces
if (current_char == '{' || is_whitespace (current_char)
|| current_char == '\'' || current_char == '"')
{
Expand All @@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape ()
rust_error_at (get_current_location (),
"invalid character %<%c%> in unicode escape",
current_char);
// TODO use utf-8 codepoint to skip whitespaces
while (current_char != '}' && current_char != '{'
&& !is_whitespace (current_char) && current_char != '\''
&& current_char != '"')
Expand Down Expand Up @@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc)
int length = 0;
current_char = peek_input ();
// loop through entire name
while (ISALPHA (current_char) || ISDIGIT (current_char)
|| current_char == '_')
while (is_identifier_continue (current_char))
{
length++;

Expand Down Expand Up @@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
{
std::string str;
str.reserve (16); // default
str += current_char;
str += current_char32.as_string ();

bool first_is_underscore = current_char == '_';

int length = 1;
current_char = peek_input ();
current_char32 = peek_codepoint_input ();

// loop through entire name
while (ISALPHA (current_char) || ISDIGIT (current_char)
|| current_char == '_')
while (is_identifier_continue (current_char32.value))
{
auto s = current_char32.as_string ();
length++;

str += current_char;
skip_input ();
current_char = peek_input ();
str += current_char32.as_string ();
skip_codepoint_input ();
current_char32 = peek_codepoint_input ();
}

current_column += length;
Expand Down Expand Up @@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc)

return Token::make_char (loc, current_char32);
}
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
|| current_char32.value == '_')
else if (is_identifier_start (current_char32.value))
{
// parse lifetime name
std::string str;
str += current_char32;
length++;

current_char = peek_input ();
while (ISDIGIT (current_char) || ISALPHA (current_char)
|| current_char == '_')
current_char32 = peek_codepoint_input ();
while (is_identifier_continue (current_char32.value))
{
str += current_char;
skip_input ();
current_char = peek_input ();
str += current_char32;
skip_codepoint_input ();
current_char32 = peek_codepoint_input ();
length++;
}

current_column += length;

loc += length - 1;

// TODO some keywords cannot be used for a lifetime label #2306
// https://doc.rust-lang.org/reference/tokens.html

str.shrink_to_fit ();
return Token::make_lifetime (loc, std::move (str));
}
Expand Down Expand Up @@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input ()
void
Lexer::skip_codepoint_input ()
{
if (peek_input () == EOF)
return;
int toSkip = get_input_codepoint_length ();
gcc_assert (toSkip >= 1);

Expand Down
6 changes: 4 additions & 2 deletions gcc/rust/lex/rust-lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ class Lexer
// Advances current input char to n + 1 chars ahead of current position.
void skip_input (int n);

// Returns char n chars ahead of current position.
int peek_input ();
// Peeks the current char.
int peek_input ();
// Returns char n bytes ahead of current position.
int peek_input (int n);

// Classifies keyword (i.e. gets id for keyword).
Expand All @@ -137,6 +137,7 @@ class Lexer

int get_input_codepoint_length ();
int test_get_input_codepoint_n_length (int n_start_offset);
// Peeks the current utf-8 char
Codepoint peek_codepoint_input ();
Codepoint test_peek_codepoint_input (int n);
void skip_codepoint_input ();
Expand Down Expand Up @@ -220,6 +221,7 @@ class Lexer
int current_column;
// Current character.
int current_char;
Codepoint current_char32;
// Line map.
Linemap *line_map;

Expand Down
36 changes: 36 additions & 0 deletions libcpp/charset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,42 @@ _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
return result;
}

/* Classify codepoint C by its XID properties.  The result is a bitmask
   built from XID_START and XID_CONTINUE; 0 means neither flag applies.  */
unsigned int
check_xid_property (cppchar_t c)
{
  /* ASCII letters, digits and '_' are answered directly, without
     consulting the range table.  Any other ASCII value falls through to
     the table lookup below.  */
  if (c < 0x80)
    {
      const bool is_upper = 'A' <= c && c <= 'Z';
      const bool is_lower = 'a' <= c && c <= 'z';
      if (is_upper || is_lower)
        return XID_START | XID_CONTINUE;

      const bool is_digit = '0' <= c && c <= '9';
      if (is_digit || c == '_')
        return XID_CONTINUE;
    }

  /* Values above UCS_LIMIT are reported as having no XID flags.  */
  if (c > UCS_LIMIT)
    return 0;

  /* Binary search over ucnranges for the first entry whose END is not
     below C; LO and HI converge on that entry.  */
  int lo = 0;
  int hi = ARRAY_SIZE (ucnranges) - 1;
  while (lo != hi)
    {
      const int mid = (lo + hi) / 2;
      if (ucnranges[mid].end < c)
        lo = mid + 1;
      else
        hi = mid;
    }

  /* Map the table's CXX23 / NXX23 bits onto the XID flags: CXX23 yields
     both start and continue, NXX23 alone yields continue only.  */
  const unsigned short flags = ucnranges[lo].flags;
  if (flags & CXX23)
    return XID_START | XID_CONTINUE;
  return (flags & NXX23) ? XID_CONTINUE : 0;
}

/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
Expand Down
7 changes: 7 additions & 0 deletions libcpp/include/cpplib.h
Original file line number Diff line number Diff line change
Expand Up @@ -1602,4 +1602,11 @@ bool cpp_input_conversion_is_trivial (const char *input_charset);
int cpp_check_utf8_bom (const char *data, size_t data_length);
bool cpp_valid_utf8_p (const char *data, size_t num_bytes);

/* Bit flags making up the value returned by check_xid_property.  They are
   distinct bits so that both can be reported in a single result.  */
enum {
/* Returned (always together with XID_CONTINUE) for codepoints that
   check_xid_property classifies as identifier-start characters.  */
XID_START = 1,
/* Returned for codepoints that check_xid_property classifies as
   identifier-continue characters.  */
XID_CONTINUE = 2
};

/* Returns flags representing the XID properties of the codepoint C: a
   combination of XID_START and XID_CONTINUE, or 0 when neither applies.  */
unsigned int check_xid_property (cppchar_t c);

#endif /* ! LIBCPP_CPPLIB_H */