From faefec5b1f96a93dfdecccfb241e3bf340743ba7 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 16:09:56 -0700 Subject: [PATCH 1/5] gh-97997: Add col_offset field to tokenizer and use that for AST nodes --- Parser/tokenizer.c | 46 +++++++++++++++++++++++++++++++++++++++------- Parser/tokenizer.h | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index c5d3e580247cc1..0420cb23092f73 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -37,6 +37,8 @@ #define TABSIZE 8 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) +#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ + type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) /* Forward */ static struct tok_state *tok_new(void); @@ -73,6 +75,8 @@ tok_new(void) tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; + tok->starting_col_offset = -1; + tok->col_offset = -1; tok->level = 0; tok->altindstack[0] = 0; tok->decoding_state = STATE_INIT; @@ -872,6 +876,7 @@ tok_underflow_string(struct tok_state *tok) { } tok->line_start = tok->cur; tok->lineno++; + tok->col_offset = 0; tok->inp = end; return 1; } @@ -931,6 +936,7 @@ tok_underflow_interactive(struct tok_state *tok) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; size_t size = strlen(newtok); tok->lineno++; + tok->col_offset = 0; if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; @@ -944,6 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) { } else { tok->lineno++; + tok->col_offset = 0; PyMem_Free(tok->buf); tok->buf = newtok; tok->cur = tok->buf; @@ -999,6 +1006,7 @@ tok_underflow_file(struct tok_state *tok) { } tok->lineno++; + tok->col_offset = 0; if (tok->decoding_state != STATE_NORMAL) { if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL; @@ -1056,6 +1064,7 @@ tok_nextc(struct tok_state *tok) int rc; for (;;) { if (tok->cur != tok->inp) { + tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } if (tok->done != E_OK) { @@ -1104,6 +1113,7 @@ tok_backup(struct tok_state *tok, int c) if ((int)(unsigned char)*tok->cur != c) { Py_FatalError("tok_backup: wrong character"); } + tok->col_offset--; } } @@ -1390,6 +1400,19 @@ tok_continuation_line(struct tok_state *tok) { return c; } +static int +type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset, + int end_col_offset, const char *start, const char *end) +{ + token->level = tok->level; + token->lineno = token->end_lineno = tok->lineno; + token->col_offset = col_offset; + token->end_col_offset = end_col_offset; + token->start = start; + token->end = end; + return type; +} + static int token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) { @@ -1397,14 +1420,13 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st token->level = tok->level; token->lineno = type == STRING ? tok->first_lineno : tok->lineno; token->end_lineno = tok->lineno; - token->col_offset = -1; - token->end_col_offset = -1; + token->col_offset = token->end_col_offset = -1; token->start = start; token->end = end; + if (start != NULL && end != NULL) { - const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start; - token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1; - token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1; + token->col_offset = tok->starting_col_offset; + token->end_col_offset = tok->col_offset; } return type; } @@ -1419,6 +1441,7 @@ tok_get(struct tok_state *tok, struct token *token) const char *p_end = NULL; nextline: tok->start = NULL; + tok->starting_col_offset = -1; blankline = 0; /* Get indentation level */ @@ -1426,6 +1449,7 @@ tok_get(struct tok_state *tok, struct token *token) int col = 0; int altcol = 0; tok->atbol = 0; + tok->starting_col_offset = 0; int cont_line_col = 0; for (;;) { c = tok_nextc(tok); @@ -1518,6 +1542,7 @@ tok_get(struct tok_state *tok, struct token *token) } tok->start = tok->cur; + tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ if (tok->pendin != 0) { @@ -1565,10 +1590,12 @@ tok_get(struct tok_state *tok, struct token *token) /* Set start of current token */ tok->start = tok->cur == NULL ? NULL : tok->cur - 1; + tok->starting_col_offset = tok->col_offset - 1; /* Skip comment, unless it's a type comment */ if (c == '#') { const char *prefix, *p, *type_start; + int current_starting_col_offset; while (c != EOF && c != '\n') { c = tok_nextc(tok); @@ -1576,14 +1603,17 @@ tok_get(struct tok_state *tok, struct token *token) if (tok->type_comments) { p = tok->start; + current_starting_col_offset = tok->starting_col_offset; prefix = type_comment_prefix; while (*prefix && p < tok->cur) { if (*prefix == ' ') { while (*p == ' ' || *p == '\t') { p++; + current_starting_col_offset++; } } else if (*prefix == *p) { p++; + current_starting_col_offset++; } else { break; } @@ -1595,6 +1625,7 @@ tok_get(struct tok_state *tok, struct token *token) if (!*prefix) { int is_type_ignore = 1; const char *ignore_end = p + 6; + const int ignore_end_col_offset = current_starting_col_offset + 6; tok_backup(tok, c); /* don't eat the newline or EOF */ type_start = p; @@ -1615,11 +1646,12 @@ tok_get(struct tok_state *tok, struct token *token) tok_nextc(tok); tok->atbol = 1; } - return MAKE_TOKEN(TYPE_IGNORE); + // +6 below cause we need to skip the ignore part + return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); } else { p_start = type_start; p_end = tok->cur; - return MAKE_TOKEN(TYPE_COMMENT); + return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); } } } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5b8c7f314386ec..2542d30e1da0ed 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -57,6 +57,8 @@ struct tok_state { int lineno; /* Current line number */ int first_lineno; /* First line of a single line or multi line string expression (cf. issue 16806) */ + int starting_col_offset; /* The column offset at the beginning of a token */ + int col_offset; /* Current col offset */ int level; /* () [] {} Parentheses nesting level */ /* Used to allow free continuations inside them */ char parenstack[MAXLEVEL]; From d3f852ca568be8258d0c8ba6bf069876e05e2790 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 6 Oct 2022 23:13:37 +0000 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst new file mode 100644 index 00000000000000..5cb5e2126638be --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst @@ -0,0 +1 @@ +Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic. From 5c47419ee8941e752ddab2a608d8f6f0d5aafee6 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 17:18:59 -0700 Subject: [PATCH 3/5] Remove unnecessary assignment of starting_col_offset --- Parser/tokenizer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0420cb23092f73..68808a62826c70 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1449,7 +1449,6 @@ tok_get(struct tok_state *tok, struct token *token) int col = 0; int altcol = 0; tok->atbol = 0; - tok->starting_col_offset = 0; int cont_line_col = 0; for (;;) { c = tok_nextc(tok); From 4796a400f71a3adb0f285f11f301f3ba33f188d8 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 17:21:16 -0700 Subject: [PATCH 4/5] Add comment --- Parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 68808a62826c70..62af172f52b3cd 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1623,6 +1623,7 @@ tok_get(struct tok_state *tok, struct token *token) /* This is a type comment if we matched all of type_comment_prefix. */ if (!*prefix) { int is_type_ignore = 1; + // +6 in order to skip the word 'ignore' const char *ignore_end = p + 6; const int ignore_end_col_offset = current_starting_col_offset + 6; tok_backup(tok, c); /* don't eat the newline or EOF */ @@ -1645,7 +1646,6 @@ tok_get(struct tok_state *tok, struct token *token) tok_nextc(tok); tok->atbol = 1; } - // +6 below cause we need to skip the ignore part return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); } else { p_start = type_start; From d128a37eae72ce5eafa3536f421668ee1b1bbbbc Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 7 Oct 2022 10:20:47 -0700 Subject: [PATCH 5/5] Address feedback; add macro to advance new line --- Parser/tokenizer.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 62af172f52b3cd..1c356d3d47c945 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -39,6 +39,9 @@ #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) +#define ADVANCE_LINENO() \ + tok->lineno++; \ + tok->col_offset = 0; /* Forward */ static struct tok_state *tok_new(void); @@ -875,8 +878,7 @@ tok_underflow_string(struct tok_state *tok) { tok->buf = tok->cur; } tok->line_start = tok->cur; - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); tok->inp = end; return 1; } @@ -935,8 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) { else if (tok->start != NULL) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; size_t size = strlen(newtok); - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; @@ -949,8 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) { tok->multi_line_start = tok->buf + cur_multi_line_start; } else { - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); PyMem_Free(tok->buf); tok->buf = newtok; tok->cur = tok->buf; @@ -1005,8 +1005,7 @@ tok_underflow_file(struct tok_state *tok) { *tok->inp = '\0'; } - tok->lineno++; - tok->col_offset = 0; + ADVANCE_LINENO(); if (tok->decoding_state != STATE_NORMAL) { if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL;