From 0d9e7ecb7dc6f1ca5fedbb87875ec5f071fca9b2 Mon Sep 17 00:00:00 2001
From: Pablo Galindo Salgado
Date: Fri, 26 May 2023 22:02:26 +0100
Subject: [PATCH] gh-104976: Ensure trailing dedent tokens are emitted as the
 previous tokenizer (GH-104980)

(cherry picked from commit 46b52e6e2bda51d0b89a64ee36ce2d305a7409f3)

Co-authored-by: Pablo Galindo Salgado
Signed-off-by: Pablo Galindo
---
 Lib/test/test_tokenize.py                     | 15 ++++++-----
 Lib/tokenize.py                               |  5 ----
 ...-05-26-15-16-11.gh-issue-104976.6dLitD.rst |  3 +++
 Python/Python-tokenize.c                      | 26 ++++++++++++++++---
 4 files changed, 34 insertions(+), 15 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 0b7c25838d6782..abb68859be944c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -82,7 +82,7 @@ def test_basic(self):
     NAME       'False'       (4, 11) (4, 16)
     COMMENT    '# NEWLINE'   (4, 17) (4, 26)
     NEWLINE    '\\n'          (4, 26) (4, 27)
-    DEDENT     ''            (4, 27) (4, 27)
+    DEDENT     ''            (5, 0) (5, 0)
     """)
         indent_error_file = b"""\
 def k(x):
@@ -755,8 +755,8 @@ def test_tabs(self):
     NEWLINE    '\\n'          (2, 5) (2, 6)
     INDENT     ' \\t'          (3, 0) (3, 9)
     NAME       'pass'        (3, 9) (3, 13)
-    DEDENT     ''            (3, 14) (3, 14)
-    DEDENT     ''            (3, 14) (3, 14)
+    DEDENT     ''            (4, 0) (4, 0)
+    DEDENT     ''            (4, 0) (4, 0)
     """)
 
     def test_non_ascii_identifiers(self):
@@ -968,7 +968,7 @@ async def foo():
     NUMBER     '1'           (2, 17) (2, 18)
     OP         ':'           (2, 18) (2, 19)
     NAME       'pass'        (2, 20) (2, 24)
-    DEDENT     ''            (2, 25) (2, 25)
+    DEDENT     ''            (3, 0) (3, 0)
     """)
 
         self.check_tokenize('''async def foo(async): await''', """\
@@ -1016,7 +1016,7 @@ async def bar(): pass
     NAME       'await'       (6, 2) (6, 7)
     OP         '='           (6, 8) (6, 9)
     NUMBER     '2'           (6, 10) (6, 11)
-    DEDENT     ''            (6, 12) (6, 12)
+    DEDENT     ''            (7, 0) (7, 0)
     """)
 
         self.check_tokenize('''\
@@ -1054,7 +1054,7 @@ async def bar(): pass
     NAME       'await'       (6, 2) (6, 7)
     OP         '='           (6, 8) (6, 9)
     NUMBER     '2'           (6, 10) (6, 11)
-    DEDENT     ''            (6, 12) (6, 12)
+    DEDENT     ''            (7, 0) (7, 0)
     """)
 
     def test_newline_after_parenthesized_block_with_comment(self):
@@ -2680,7 +2680,8 @@ def generate_source(indents):
 
         valid = generate_source(MAXINDENT - 1)
         tokens = list(_generate_tokens_from_c_tokenizer(valid))
-        self.assertEqual(tokens[-1].type, DEDENT)
+        self.assertEqual(tokens[-2].type, DEDENT)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 911f0f12f9bb7e..4895e94d1dfda7 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -447,13 +447,8 @@ def tokenize(readline):
 
 def _tokenize(rl_gen, encoding):
     source = b"".join(rl_gen).decode(encoding)
-    token = None
     for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
         yield token
-    if token is not None:
-        last_line, _ = token.start
-        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst
new file mode 100644
index 00000000000000..377e8e76362687
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-15-16-11.gh-issue-104976.6dLitD.rst
@@ -0,0 +1,3 @@
+Ensure that trailing ``DEDENT`` :class:`tokenize.TokenInfo` objects emitted
+by the :mod:`tokenize` module are reported as in Python 3.11. Patch by Pablo
+Galindo
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 88087c12562413..01c2215366a736 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -30,6 +30,7 @@ class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_t
 typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
+    int done;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -63,6 +64,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (extra_tokens) {
         self->tok->tok_extra_tokens = 1;
     }
+    self->done = 0;
     return (PyObject *)self;
 }
 
@@ -179,8 +181,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
         goto exit;
     }
-    if (type == ERRORTOKEN || type == ENDMARKER) {
+    if (it->done || type == ERRORTOKEN) {
         PyErr_SetString(PyExc_StopIteration, "EOF");
+        it->done = 1;
         goto exit;
     }
     PyObject *str = NULL;
@@ -194,9 +197,19 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }
 
+    int is_trailing_token = 0;
+    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+        is_trailing_token = 1;
+    }
+
     const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
-    Py_ssize_t size = it->tok->inp - line_start;
-    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    PyObject* line = NULL;
+    if (it->tok->tok_extra_tokens && is_trailing_token) {
+        line = PyUnicode_FromString("");
+    } else {
+        Py_ssize_t size = it->tok->inp - line_start;
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+    }
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
@@ -214,6 +227,10 @@
     }
 
     if (it->tok->tok_extra_tokens) {
+        if (is_trailing_token) {
+            lineno = end_lineno = lineno + 1;
+            col_offset = end_col_offset = 0;
+        }
         // Necessary adjustments to match the original Python tokenize
         // implementation
         if (type > DEDENT && type < OP) {
@@ -231,6 +248,9 @@
     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
+    if (type == ENDMARKER) {
+        it->done = 1;
+    }
     return result;
 }
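
A quick way to observe the behaviour the updated tests encode, using only the
public tokenize API. This snippet is not part of the patch; the positions
shown in the comments assume an interpreter that includes this fix (matching
Python 3.11):

import io
import tokenize

# An indented block whose closing DEDENT has no physical line of its own,
# so the tokenizer must synthesize a position for it.
source = "if True:\n    pass\n"

tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
for tok in tokens[-3:]:
    print(tokenize.tok_name[tok.type], tok.start, tok.end)

# With this change the trailing DEDENT is reported on the line after the
# last physical line, at column 0, and ENDMARKER (now emitted by the C
# tokenizer itself rather than synthesized in Lib/tokenize.py) follows at
# the same position:
#
#   NEWLINE (2, 8) (2, 9)
#   DEDENT (3, 0) (3, 0)
#   ENDMARKER (3, 0) (3, 0)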