Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-97997: Add col_offset field to tokenizer and use that for AST nodes #98000

Merged
merged 5 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.
52 changes: 41 additions & 11 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
#define TABSIZE 8

/* Shorthand for token_setup(): forwards token_type together with the names
   `tok`, `token`, `p_start` and `p_end`, which must be in scope wherever the
   macro is expanded. */
#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
/* Shorthand for type_comment_token_setup(): like MAKE_TOKEN, but the start
   and end column offsets are passed explicitly by the caller. Also relies on
   `tok`, `token`, `p_start` and `p_end` from the expansion site. */
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
/* Move the tokenizer to the next line: increment the line counter and reset
   the running column offset to 0. Relies on a `tok` pointer being in scope
   at the expansion site.
   The body is wrapped in do { } while (0) so that `ADVANCE_LINENO();` is a
   single statement; without the wrapper, an unbraced `if (...)` would guard
   only the increment and the column reset would always run. */
#define ADVANCE_LINENO() \
    do { \
        tok->lineno++; \
        tok->col_offset = 0; \
    } while (0)

/* Forward */
static struct tok_state *tok_new(void);
Expand Down Expand Up @@ -73,6 +78,8 @@ tok_new(void)
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
tok->starting_col_offset = -1;
tok->col_offset = -1;
tok->level = 0;
tok->altindstack[0] = 0;
tok->decoding_state = STATE_INIT;
Expand Down Expand Up @@ -871,7 +878,7 @@ tok_underflow_string(struct tok_state *tok) {
tok->buf = tok->cur;
}
tok->line_start = tok->cur;
tok->lineno++;
ADVANCE_LINENO();
tok->inp = end;
return 1;
}
Expand Down Expand Up @@ -930,7 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) {
else if (tok->start != NULL) {
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
size_t size = strlen(newtok);
tok->lineno++;
ADVANCE_LINENO();
if (!tok_reserve_buf(tok, size + 1)) {
PyMem_Free(tok->buf);
tok->buf = NULL;
Expand All @@ -943,7 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
tok->multi_line_start = tok->buf + cur_multi_line_start;
}
else {
tok->lineno++;
ADVANCE_LINENO();
PyMem_Free(tok->buf);
tok->buf = newtok;
tok->cur = tok->buf;
Expand Down Expand Up @@ -998,7 +1005,7 @@ tok_underflow_file(struct tok_state *tok) {
*tok->inp = '\0';
}

tok->lineno++;
ADVANCE_LINENO();
if (tok->decoding_state != STATE_NORMAL) {
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
Expand Down Expand Up @@ -1056,6 +1063,7 @@ tok_nextc(struct tok_state *tok)
int rc;
for (;;) {
if (tok->cur != tok->inp) {
tok->col_offset++;
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK) {
Expand Down Expand Up @@ -1104,6 +1112,7 @@ tok_backup(struct tok_state *tok, int c)
if ((int)(unsigned char)*tok->cur != c) {
Py_FatalError("tok_backup: wrong character");
}
tok->col_offset--;
}
}

Expand Down Expand Up @@ -1390,21 +1399,33 @@ tok_continuation_line(struct tok_state *tok) {
return c;
}

/* Fill in `token` for a type-comment style token and hand back `type`.
 *
 * The column offsets are taken verbatim from the caller instead of being
 * read from the tokenizer state. Both the start and end line numbers are
 * set to the tokenizer's current line, and the nesting level is copied
 * from `tok`. `start` and `end` are stored as given.
 */
static int
type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                         int end_col_offset, const char *start, const char *end)
{
    /* Span of the token text. */
    token->start = start;
    token->end = end;

    /* Caller-supplied column range. */
    token->col_offset = col_offset;
    token->end_col_offset = end_col_offset;

    /* The token begins and ends on the current line. */
    token->end_lineno = tok->lineno;
    token->lineno = token->end_lineno;

    token->level = tok->level;

    return type;
}

/* Populate `token` with level, line and column information for a token of
   kind `type`, record its `start`/`end` pointers, and return `type`
   unchanged. `start` and `end` must be either both NULL or both non-NULL
   (checked by the assert below). */
static int
token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
token->level = tok->level;
/* STRING tokens take their starting line from tok->first_lineno; every
   other token type uses the current line. */
token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
token->end_lineno = tok->lineno;
/* NOTE(review): the next two stores are repeated by the combined assignment
   right after them; this listing appears to show both the old and the new
   side of a diff -- confirm which form is current. */
token->col_offset = -1;
token->end_col_offset = -1;
token->col_offset = token->end_col_offset = -1;
token->start = start;
token->end = end;

/* Column offsets stay at -1 unless the token has a real text span. */
if (start != NULL && end != NULL) {
const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
/* NOTE(review): the pointer-difference results below are overwritten by the
   two assignments that follow, so the values that survive are the
   tokenizer's starting_col_offset and col_offset -- confirm the
   pointer-arithmetic lines are the superseded version. */
token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
token->col_offset = tok->starting_col_offset;
token->end_col_offset = tok->col_offset;
}
return type;
}
Expand All @@ -1419,6 +1440,7 @@ tok_get(struct tok_state *tok, struct token *token)
const char *p_end = NULL;
nextline:
tok->start = NULL;
tok->starting_col_offset = -1;
blankline = 0;

/* Get indentation level */
Expand Down Expand Up @@ -1518,6 +1540,7 @@ tok_get(struct tok_state *tok, struct token *token)
}

tok->start = tok->cur;
tok->starting_col_offset = tok->col_offset;

/* Return pending indents/dedents */
if (tok->pendin != 0) {
Expand Down Expand Up @@ -1565,25 +1588,30 @@ tok_get(struct tok_state *tok, struct token *token)

/* Set start of current token */
tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
tok->starting_col_offset = tok->col_offset - 1;

/* Skip comment, unless it's a type comment */
if (c == '#') {
const char *prefix, *p, *type_start;
int current_starting_col_offset;

while (c != EOF && c != '\n') {
c = tok_nextc(tok);
}

if (tok->type_comments) {
p = tok->start;
current_starting_col_offset = tok->starting_col_offset;
prefix = type_comment_prefix;
while (*prefix && p < tok->cur) {
if (*prefix == ' ') {
while (*p == ' ' || *p == '\t') {
p++;
current_starting_col_offset++;
}
} else if (*prefix == *p) {
p++;
current_starting_col_offset++;
} else {
break;
}
Expand All @@ -1594,7 +1622,9 @@ tok_get(struct tok_state *tok, struct token *token)
/* This is a type comment if we matched all of type_comment_prefix. */
if (!*prefix) {
int is_type_ignore = 1;
// +6 in order to skip the word 'ignore'
const char *ignore_end = p + 6;
const int ignore_end_col_offset = current_starting_col_offset + 6;
tok_backup(tok, c); /* don't eat the newline or EOF */

type_start = p;
Expand All @@ -1615,11 +1645,11 @@ tok_get(struct tok_state *tok, struct token *token)
tok_nextc(tok);
tok->atbol = 1;
}
return MAKE_TOKEN(TYPE_IGNORE);
return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
} else {
p_start = type_start;
p_end = tok->cur;
return MAKE_TOKEN(TYPE_COMMENT);
return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions Parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ struct tok_state {
int lineno; /* Current line number */
int first_lineno; /* First line of a single line or multi line string
expression (cf. issue 16806) */
int starting_col_offset; /* The column offset at the beginning of a token */
int col_offset; /* Current col offset */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
char parenstack[MAXLEVEL];
Expand Down