Skip to content

Commit

Permalink
add utils to check XID properties
Browse files Browse the repository at this point in the history
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
  • Loading branch information
tamaroning committed Jun 16, 2023
1 parent c7b7e29 commit 41e4ea7
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 32 deletions.
87 changes: 57 additions & 30 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "rust-linemap.h"
#include "rust-session-manager.h"
#include "safe-ctype.h"
#include "cpplib.h"

namespace Rust {
// TODO: move to separate compilation unit?
Expand Down Expand Up @@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
return character != '.' && character != '_' && !ISALPHA (character);
}

// ISSPACE from safe-ctype but may change in future
bool
is_whitespace (char character)
is_whitespace (int character)
{
return ISSPACE (character);
// https://doc.rust-lang.org/reference/whitespace.html
return character == '\t' || character == '\n' || character == '\v'
|| character == '\f' || character == '\r' || character == ' '
|| character == 0x0085 // next line
|| character == 0x200e // left-to-right mark
|| character == 0x200f // right-to-left mark
|| character == 0x2028 // line separator
|| character == 0x2029; // paragraph separator
}

bool
Expand All @@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
return character == 'x' || character == 'o' || character == 'b';
}

// Returns true if CODEPOINT may begin an identifier: either an underscore
// or a code point for which check_xid_property reports XID_START.
bool
is_identifier_start (int codepoint)
{
  if (codepoint == '_')
    return true;

  const unsigned int properties = check_xid_property (codepoint);
  return (properties & XID_START) != 0;
}

// Returns true if CODEPOINT may appear inside an identifier, i.e.
// check_xid_property reports XID_CONTINUE for it.
bool
is_identifier_continue (int codepoint)
{
  const unsigned int properties = check_xid_property (codepoint);
  return (properties & XID_CONTINUE) != 0;
}

Lexer::Lexer (const std::string &input)
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
Expand Down Expand Up @@ -283,22 +302,22 @@ Lexer::build_token ()
while (true)
{
Location loc = get_current_location ();
current_char = peek_input ();
skip_input ();

// detect UTF8 bom
//
// Must be the first thing on the first line.
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
if (current_line == 1 && current_column == 1 && current_char == 0xef
&& peek_input () == 0xbb && peek_input (1) == 0xbf)
if (current_line == 1 && current_column == 1 && peek_input () == 0xef
&& peek_input (1) == 0xbb && peek_input (2) == 0xbf)
{
skip_input (1);
current_char = peek_input ();
skip_input ();
skip_input (2);
}

current_char = peek_input ();
current_char32 = peek_codepoint_input ();
skip_codepoint_input ();

// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
Expand All @@ -311,6 +330,7 @@ Lexer::build_token ()
int n = 1;
while (true)
{
// TODO use utf-8 codepoint to skip whitespaces
int next_char = peek_input (n);
if (is_whitespace (next_char))
n++;
Expand Down Expand Up @@ -1051,7 +1071,8 @@ Lexer::build_token ()
int peek = peek_input ();
int peek1 = peek_input (1);

if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
// TODO (tamaron) parse Unicode ident
if (peek == '#' && is_identifier_start (peek1))
{
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
if (raw_ident_ptr != nullptr)
Expand All @@ -1068,8 +1089,8 @@ Lexer::build_token ()
}
}

// find identifiers and keywords
if (ISALPHA (current_char) || current_char == '_')
// find identifiers and keywords.
if (is_identifier_start (current_char32.value))
return parse_identifier_or_keyword (loc);

// int and float literals
Expand Down Expand Up @@ -1467,6 +1488,7 @@ Lexer::parse_partial_string_continue ()
int additional_length_offset = 1;

// string continue
// TODO use utf-8 codepoint to skip whitespaces
while (is_whitespace (current_char))
{
if (current_char == '\n')
Expand Down Expand Up @@ -1610,6 +1632,7 @@ Lexer::parse_partial_unicode_escape ()
// wrong bracket, whitespace or single/double quotes are wrong
// termination, otherwise it is a wrong character, then skip to the actual
// terminator.
// TODO use utf-8 codepoint to skip whitespaces
if (current_char == '{' || is_whitespace (current_char)
|| current_char == '\'' || current_char == '"')
{
Expand All @@ -1622,6 +1645,7 @@ Lexer::parse_partial_unicode_escape ()
rust_error_at (get_current_location (),
"invalid character %<%c%> in unicode escape",
current_char);
// TODO use utf-8 codepoint to skip whitespaces
while (current_char != '}' && current_char != '{'
&& !is_whitespace (current_char) && current_char != '\''
&& current_char != '"')
Expand Down Expand Up @@ -1904,8 +1928,7 @@ Lexer::parse_raw_identifier (Location loc)
int length = 0;
current_char = peek_input ();
// loop through entire name
while (ISALPHA (current_char) || ISDIGIT (current_char)
|| current_char == '_')
while (is_identifier_continue (current_char))
{
length++;

Expand Down Expand Up @@ -2041,21 +2064,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
{
std::string str;
str.reserve (16); // default
str += current_char;
str += current_char32.as_string ();

bool first_is_underscore = current_char == '_';

int length = 1;
current_char = peek_input ();
current_char32 = peek_codepoint_input ();

// loop through entire name
while (ISALPHA (current_char) || ISDIGIT (current_char)
|| current_char == '_')
while (is_identifier_continue (current_char32.value))
{
auto s = current_char32.as_string ();
length++;

str += current_char;
skip_input ();
current_char = peek_input ();
str += current_char32.as_string ();
skip_codepoint_input ();
current_char32 = peek_codepoint_input ();
}

current_column += length;
Expand Down Expand Up @@ -2443,28 +2467,29 @@ Lexer::parse_char_or_lifetime (Location loc)

return Token::make_char (loc, current_char32);
}
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
|| current_char32.value == '_')
else if (is_identifier_start (current_char32.value))
{
// parse lifetime name
std::string str;
str += current_char32;
length++;

current_char = peek_input ();
while (ISDIGIT (current_char) || ISALPHA (current_char)
|| current_char == '_')
current_char32 = peek_codepoint_input ();
while (is_identifier_continue (current_char32.value))
{
str += current_char;
skip_input ();
current_char = peek_input ();
str += current_char32;
skip_codepoint_input ();
current_char32 = peek_codepoint_input ();
length++;
}

current_column += length;

loc += length - 1;

// TODO some keywords cannot be used for a lifetime label
// https://doc.rust-lang.org/reference/tokens.html#lifetimes-and-loop-labels

str.shrink_to_fit ();
return Token::make_lifetime (loc, std::move (str));
}
Expand Down Expand Up @@ -2636,6 +2661,8 @@ Lexer::peek_codepoint_input ()
void
Lexer::skip_codepoint_input ()
{
if (peek_input () == EOF)
return;
int toSkip = get_input_codepoint_length ();
gcc_assert (toSkip >= 1);

Expand Down
6 changes: 4 additions & 2 deletions gcc/rust/lex/rust-lex.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ class Lexer
// Advances current input char to n + 1 chars ahead of current position.
void skip_input (int n);

// Returns char n chars ahead of current position.
int peek_input ();
// Peeks the current char.
int peek_input ();
// Returns char n bytes ahead of current position.
int peek_input (int n);

// Classifies keyword (i.e. gets id for keyword).
Expand All @@ -140,6 +140,7 @@ class Lexer

int get_input_codepoint_length ();
int test_get_input_codepoint_n_length (int n_start_offset);
// Peeks the current utf-8 char
Codepoint peek_codepoint_input ();
Codepoint test_peek_codepoint_input (int n);
void skip_codepoint_input ();
Expand Down Expand Up @@ -220,6 +221,7 @@ class Lexer
int current_column;
// Current character.
int current_char;
Codepoint current_char32;
// Line map.
Linemap *line_map;

Expand Down
36 changes: 36 additions & 0 deletions libcpp/charset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,42 @@ _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
return result;
}

/* Return the XID properties of code point C as a bitmask.

   The result is XID_START | XID_CONTINUE when C may begin an identifier,
   XID_CONTINUE alone when C may only follow the first character, and 0
   when C has neither property.  Any C greater than UCS_LIMIT yields 0.  */
unsigned int
check_xid_property (cppchar_t c)
{
  /* ASCII letters, digits and underscore are classified directly,
     without consulting the range table.  Every other ASCII value
     falls through to the table lookup below.  */
  if (c < 0x80)
    {
      const bool is_upper = 'A' <= c && c <= 'Z';
      const bool is_lower = 'a' <= c && c <= 'z';
      if (is_upper || is_lower)
        return XID_START | XID_CONTINUE;

      const bool is_digit = '0' <= c && c <= '9';
      if (is_digit || c == '_')
        return XID_CONTINUE;
    }

  if (c > UCS_LIMIT)
    return 0;

  /* Binary search for the first entry of ucnranges whose END is not
     below C.
     NOTE(review): relies on ucnranges being sorted by END and covering
     every value up to UCS_LIMIT -- confirm against the table.  */
  int lo = 0;
  int hi = ARRAY_SIZE (ucnranges) - 1;
  while (lo != hi)
    {
      const int mid = (lo + hi) / 2;
      if (ucnranges[mid].end < c)
        lo = mid + 1;
      else
        hi = mid;
    }

  /* Translate the table flags: CXX23 maps to start + continue, NXX23
     maps to continue only; CXX23 takes precedence when both are set.  */
  const unsigned short flags = ucnranges[lo].flags;
  unsigned int properties = 0;
  if (flags & CXX23)
    properties = XID_START | XID_CONTINUE;
  else if (flags & NXX23)
    properties = XID_CONTINUE;
  return properties;
}

/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
Expand Down
7 changes: 7 additions & 0 deletions libcpp/include/cpplib.h
Original file line number Diff line number Diff line change
Expand Up @@ -1602,4 +1602,11 @@ bool cpp_input_conversion_is_trivial (const char *input_charset);
int cpp_check_utf8_bom (const char *data, size_t data_length);
bool cpp_valid_utf8_p (const char *data, size_t num_bytes);

/* Bit flags describing the XID properties of a code point, as returned
   by check_xid_property.  The flags may be OR-ed together.  */
enum {
  /* The code point may begin an identifier.  */
  XID_START = 1 << 0,
  /* The code point may appear within an identifier.  */
  XID_CONTINUE = 1 << 1
};

/* Returns flags representing the XID properties of the code point C:
   a bitwise OR of XID_START and XID_CONTINUE, or 0 if C has neither.  */
unsigned int check_xid_property (cppchar_t c);

#endif /* ! LIBCPP_CPPLIB_H */

0 comments on commit 41e4ea7

Please sign in to comment.