From faefec5b1f96a93dfdecccfb241e3bf340743ba7 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Thu, 6 Oct 2022 16:09:56 -0700
Subject: [PATCH 1/5] gh-97997: Add col_offset field to tokenizer and use that
 for AST nodes

---
 Parser/tokenizer.c | 46 +++++++++++++++++++++++++++++++++++++++-------
 Parser/tokenizer.h |  2 ++
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c5d3e580247cc1..0420cb23092f73 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -37,6 +37,8 @@
 #define TABSIZE 8
 
 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
+                type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
 
 /* Forward */
 static struct tok_state *tok_new(void);
@@ -73,6 +75,8 @@ tok_new(void)
     tok->pendin = 0;
     tok->prompt = tok->nextprompt = NULL;
     tok->lineno = 0;
+    tok->starting_col_offset = -1;
+    tok->col_offset = -1;
     tok->level = 0;
     tok->altindstack[0] = 0;
     tok->decoding_state = STATE_INIT;
@@ -872,6 +876,7 @@ tok_underflow_string(struct tok_state *tok) {
     }
     tok->line_start = tok->cur;
     tok->lineno++;
+    tok->col_offset = 0;
     tok->inp = end;
     return 1;
 }
@@ -931,6 +936,7 @@ tok_underflow_interactive(struct tok_state *tok) {
         Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
         size_t size = strlen(newtok);
         tok->lineno++;
+        tok->col_offset = 0;
         if (!tok_reserve_buf(tok, size + 1)) {
             PyMem_Free(tok->buf);
             tok->buf = NULL;
@@ -944,6 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
     }
     else {
         tok->lineno++;
+        tok->col_offset = 0;
         PyMem_Free(tok->buf);
         tok->buf = newtok;
         tok->cur = tok->buf;
@@ -999,6 +1006,7 @@ tok_underflow_file(struct tok_state *tok) {
     }
 
     tok->lineno++;
+    tok->col_offset = 0;
     if (tok->decoding_state != STATE_NORMAL) {
         if (tok->lineno > 2) {
             tok->decoding_state = STATE_NORMAL;
@@ -1056,6 +1064,7 @@ tok_nextc(struct tok_state *tok)
     int rc;
     for (;;) {
         if (tok->cur != tok->inp) {
+            tok->col_offset++;
             return Py_CHARMASK(*tok->cur++); /* Fast path */
         }
         if (tok->done != E_OK) {
@@ -1104,6 +1113,7 @@ tok_backup(struct tok_state *tok, int c)
         if ((int)(unsigned char)*tok->cur != c) {
             Py_FatalError("tok_backup: wrong character");
         }
+        tok->col_offset--;
     }
 }
 
@@ -1390,6 +1400,19 @@ tok_continuation_line(struct tok_state *tok) {
     return c;
 }
 
+static int
+type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
+                         int end_col_offset, const char *start, const char *end)
+{   /* Fill *token for a type-comment token using caller-supplied column offsets. */
+    token->level = tok->level;
+    token->lineno = token->end_lineno = tok->lineno;  /* start and end line are the same */
+    token->col_offset = col_offset;          /* taken as given, not derived from tok */
+    token->end_col_offset = end_col_offset;
+    token->start = start;
+    token->end = end;
+    return type;  /* handed back unchanged so callers can `return MAKE_TYPE_COMMENT_TOKEN(...)` */
+}
+
 static int
 token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
 {
@@ -1397,14 +1420,13 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
     token->level = tok->level;
     token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
     token->end_lineno = tok->lineno;
-    token->col_offset = -1;
-    token->end_col_offset = -1;
+    token->col_offset = token->end_col_offset = -1;
     token->start = start;
     token->end = end;
+
     if (start != NULL && end != NULL) {
-        const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
-        token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
-        token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
+        token->col_offset = tok->starting_col_offset;
+        token->end_col_offset = tok->col_offset;
     }
     return type;
 }
@@ -1419,6 +1441,7 @@ tok_get(struct tok_state *tok, struct token *token)
     const char *p_end = NULL;
   nextline:
     tok->start = NULL;
+    tok->starting_col_offset = -1;
     blankline = 0;
 
     /* Get indentation level */
@@ -1426,6 +1449,7 @@ tok_get(struct tok_state *tok, struct token *token)
         int col = 0;
         int altcol = 0;
         tok->atbol = 0;
+        tok->starting_col_offset = 0;
         int cont_line_col = 0;
         for (;;) {
             c = tok_nextc(tok);
@@ -1518,6 +1542,7 @@ tok_get(struct tok_state *tok, struct token *token)
     }
 
     tok->start = tok->cur;
+    tok->starting_col_offset = tok->col_offset;
 
     /* Return pending indents/dedents */
     if (tok->pendin != 0) {
@@ -1565,10 +1590,12 @@ tok_get(struct tok_state *tok, struct token *token)
 
     /* Set start of current token */
     tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
+    tok->starting_col_offset = tok->col_offset - 1;
 
     /* Skip comment, unless it's a type comment */
     if (c == '#') {
         const char *prefix, *p, *type_start;
+        int current_starting_col_offset;
 
         while (c != EOF && c != '\n') {
             c = tok_nextc(tok);
@@ -1576,14 +1603,17 @@ tok_get(struct tok_state *tok, struct token *token)
 
         if (tok->type_comments) {
             p = tok->start;
+            current_starting_col_offset = tok->starting_col_offset;
             prefix = type_comment_prefix;
             while (*prefix && p < tok->cur) {
                 if (*prefix == ' ') {
                     while (*p == ' ' || *p == '\t') {
                         p++;
+                        current_starting_col_offset++;
                     }
                 } else if (*prefix == *p) {
                     p++;
+                    current_starting_col_offset++;
                 } else {
                     break;
                 }
@@ -1595,6 +1625,7 @@ tok_get(struct tok_state *tok, struct token *token)
             if (!*prefix) {
                 int is_type_ignore = 1;
                 const char *ignore_end = p + 6;
+                const int ignore_end_col_offset = current_starting_col_offset + 6;
                 tok_backup(tok, c);  /* don't eat the newline or EOF */
 
                 type_start = p;
@@ -1615,11 +1646,12 @@ tok_get(struct tok_state *tok, struct token *token)
                         tok_nextc(tok);
                         tok->atbol = 1;
                     }
-                    return MAKE_TOKEN(TYPE_IGNORE);
+                    // +6 below cause we need to skip the ignore part
+                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
                 } else {
                     p_start = type_start;
                     p_end = tok->cur;
-                    return MAKE_TOKEN(TYPE_COMMENT);
+                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
                 }
             }
         }
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 5b8c7f314386ec..2542d30e1da0ed 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -57,6 +57,8 @@ struct tok_state {
     int lineno;         /* Current line number */
     int first_lineno;   /* First line of a single line or multi line string
                            expression (cf. issue 16806) */
+    int starting_col_offset; /* The column offset at the beginning of a token */
+    int col_offset;     /* Current col offset */
     int level;          /* () [] {} Parentheses nesting level */
             /* Used to allow free continuations inside them */
     char parenstack[MAXLEVEL];

From d3f852ca568be8258d0c8ba6bf069876e05e2790 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Thu, 6 Oct 2022 23:13:37 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst                | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst
new file mode 100644
index 00000000000000..5cb5e2126638be
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-23-13-34.gh-issue-97997.JQaJKF.rst	
@@ -0,0 +1 @@
+Add running column offset to the tokenizer state to avoid calculating AST column information with pointer arithmetic.

From 5c47419ee8941e752ddab2a608d8f6f0d5aafee6 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Thu, 6 Oct 2022 17:18:59 -0700
Subject: [PATCH 3/5] Remove unnecessary assignment of starting_col_offset

---
 Parser/tokenizer.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 0420cb23092f73..68808a62826c70 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1449,7 +1449,6 @@ tok_get(struct tok_state *tok, struct token *token)
         int col = 0;
         int altcol = 0;
         tok->atbol = 0;
-        tok->starting_col_offset = 0;
         int cont_line_col = 0;
         for (;;) {
             c = tok_nextc(tok);

From 4796a400f71a3adb0f285f11f301f3ba33f188d8 Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Thu, 6 Oct 2022 17:21:16 -0700
Subject: [PATCH 4/5] Add comment

---
 Parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 68808a62826c70..62af172f52b3cd 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1623,6 +1623,7 @@ tok_get(struct tok_state *tok, struct token *token)
             /* This is a type comment if we matched all of type_comment_prefix. */
             if (!*prefix) {
                 int is_type_ignore = 1;
+                // +6 in order to skip the word 'ignore'
                 const char *ignore_end = p + 6;
                 const int ignore_end_col_offset = current_starting_col_offset + 6;
                 tok_backup(tok, c);  /* don't eat the newline or EOF */
@@ -1645,7 +1646,6 @@ tok_get(struct tok_state *tok, struct token *token)
                         tok_nextc(tok);
                         tok->atbol = 1;
                     }
-                    // +6 below cause we need to skip the ignore part
                     return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
                 } else {
                     p_start = type_start;

From d128a37eae72ce5eafa3536f421668ee1b1bbbbc Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou <lisandrosnik@gmail.com>
Date: Fri, 7 Oct 2022 10:20:47 -0700
Subject: [PATCH 5/5] Address feedback; add macro to advance new line

---
 Parser/tokenizer.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 62af172f52b3cd..1c356d3d47c945 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -39,6 +39,9 @@
 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
 #define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
                 type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
+/* Start a new line: bump tok->lineno, zero tok->col_offset. do/while(0) keeps it one statement. */
+#define ADVANCE_LINENO() \
+            do { tok->lineno++; tok->col_offset = 0; } while (0)
 
 /* Forward */
 static struct tok_state *tok_new(void);
@@ -875,8 +878,7 @@ tok_underflow_string(struct tok_state *tok) {
         tok->buf = tok->cur;
     }
     tok->line_start = tok->cur;
-    tok->lineno++;
-    tok->col_offset = 0;
+    ADVANCE_LINENO();
     tok->inp = end;
     return 1;
 }
@@ -935,8 +937,7 @@ tok_underflow_interactive(struct tok_state *tok) {
     else if (tok->start != NULL) {
         Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
         size_t size = strlen(newtok);
-        tok->lineno++;
-        tok->col_offset = 0;
+        ADVANCE_LINENO();
         if (!tok_reserve_buf(tok, size + 1)) {
             PyMem_Free(tok->buf);
             tok->buf = NULL;
@@ -949,8 +950,7 @@ tok_underflow_interactive(struct tok_state *tok) {
         tok->multi_line_start = tok->buf + cur_multi_line_start;
     }
     else {
-        tok->lineno++;
-        tok->col_offset = 0;
+        ADVANCE_LINENO();
         PyMem_Free(tok->buf);
         tok->buf = newtok;
         tok->cur = tok->buf;
@@ -1005,8 +1005,7 @@ tok_underflow_file(struct tok_state *tok) {
         *tok->inp = '\0';
     }
 
-    tok->lineno++;
-    tok->col_offset = 0;
+    ADVANCE_LINENO();
     if (tok->decoding_state != STATE_NORMAL) {
         if (tok->lineno > 2) {
             tok->decoding_state = STATE_NORMAL;