diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e6a761b91f353..7b223a553e114 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,9 +102,17 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then MSG='Check for use of not concatenated strings' ; echo $MSG if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . + $BASE_DIR/scripts/validate_string_concatenation.py --validation-type="strings_to_concatenate" --format="##[error]{source_path}:{line_number}:{msg}" . else - $BASE_DIR/scripts/validate_string_concatenation.py . + $BASE_DIR/scripts/validate_string_concatenation.py --validation-type="strings_to_concatenate" . + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for strings with wrong placed spaces' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_string_concatenation.py --validation-type="strings_with_wrong_placed_whitespace" --format="##[error]{source_path}:{line_number}:{msg}" . + else + $BASE_DIR/scripts/validate_string_concatenation.py --validation-type="strings_with_wrong_placed_whitespace" . fi RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py new file mode 100644 index 0000000000000..6ad5ab44a29d6 --- /dev/null +++ b/scripts/tests/test_validate_unwanted_patterns.py @@ -0,0 +1,422 @@ +import io + +import pytest + +# TODO: change this import to "import validate_unwanted_patterns" +# when renaming "scripts/validate_string_concatenation.py" to +# "scripts/validate_unwanted_patterns.py" +import validate_string_concatenation as validate_unwanted_patterns + + +class TestBarePytestRaises: + @pytest.mark.parametrize( + "data", + [ + ( + """ + with pytest.raises(ValueError, match="foo"): + pass + """ + ), + ( + """ + # with pytest.raises(ValueError, match="foo"): + # pass + """ + ), + ( + """ + # with pytest.raises(ValueError): + # pass + """ + ), + ( + """ + with pytest.raises( + ValueError, + match="foo" + ): + pass + """ + ), + ], + ) + def test_pytest_raises(self, data): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + with pytest.raises(ValueError): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError, match="foo"): + with pytest.raises(ValueError): + pass + pass + """ + ), + [ + ( + 2, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError): + with pytest.raises(ValueError, match="foo"): + pass + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError, + # match = "foo" + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ], + ) + def test_pytest_raises_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == expected + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + 'msg = ("bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + ], + ), + ( + 'msg = ("foo " "bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ], + ), + ], +) +def test_strings_to_concatenate(data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.strings_to_concatenate(fd)) + assert result == expected + + +class TestStringsWithWrongPlacedWhitespace: + @pytest.mark.parametrize( + "data", + [ + ( + """ + msg = ( + "foo\n" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + " bar" + "baz" + ) + """ + ), + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + rf" bar" + ) + """ + ), + ], + ) + def test_strings_with_wrong_placed_whitespace(self, data): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + msg = ( + "foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + " baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + " bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + rf" baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ], + ), + ], + ) + def test_strings_with_wrong_placed_whitespace_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == expected diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py index c5f257c641b25..c4be85ffe7306 100755 --- a/scripts/validate_string_concatenation.py +++ b/scripts/validate_string_concatenation.py @@ -1,24 +1,13 @@ #!/usr/bin/env python3 """ -GH #30454 +Unwanted patterns test cases. -Check where there is a string that needs to be concatenated. +The reason this file exist despite the fact we already have +`ci/code_checks.sh`, +(see https://github.com/pandas-dev/pandas/blob/master/ci/code_checks.sh) -This is necessary after black formatting, -where for example black transforms this: - ->>> foo = ( -... "bar " -... "baz" -... ) - -into this: - ->>> foo = ("bar " "baz") - -Black is not considering this as an -issue (see issue https://github.com/psf/black/issues/1051), -so we are checking it here. +is that some of the test cases are more complex/imposible to validate via regex. +So this file is somewhat an extensions to `ci/code_checks.sh` """ import argparse @@ -26,26 +15,288 @@ import sys import token import tokenize -from typing import Generator, List, Tuple +from typing import IO, Callable, Iterable, List, Tuple + +FILE_EXTENSIONS_TO_CHECK: Tuple[str, ...] = (".py", ".pyx", ".pxi.ini", ".pxd") + + +def _get_literal_string_prefix_len(token_string: str) -> int: + """ + Getting the length of the literal string prefix. + + Parameters + ---------- + token_string : str + String to check. + + Returns + ------- + int + Length of the literal string prefix. + + Examples + -------- + >>> example_string = "'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 0 + >>> example_string = "r'Hello world'" + >>> _get_literal_string_prefix_len(example_string) + 1 + """ + try: + return min( + token_string.find(quote) + for quote in (r"'", r'"') + if token_string.find(quote) >= 0 + ) + except ValueError: + return 0 + + +def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Test Case for bare pytest raises. + + For example, this is wrong: + + >>> with pytest.raise(ValueError): + ... # Some code that raises ValueError + + And this is what we want instead: + + >>> with pytest.raise(ValueError, match="foo"): + ... # Some code that raises ValueError + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + + Notes + ----- + GH #23922 + """ + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for counter, current_token in enumerate(tokens, start=1): + if not (current_token.type == token.NAME and current_token.string == "raises"): + continue + for next_token in tokens[counter:]: + if next_token.type == token.NAME and next_token.string == "match": + break + # token.NEWLINE refers to the end of a logical line + # unlike token.NL or "\n" which represents a newline + if next_token.type == token.NEWLINE: + yield ( + current_token.start[0], + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + break + + +def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + This test case is necessary after 'Black' (https://github.com/psf/black), + is formating strings over multiple lines. + + For example, when this: + + >>> foo = ( + ... "bar " + ... "baz" + ... ) + + Is becoming this: + + >>> foo = ("bar " "baz") + + 'Black' is not considering this as an + issue (see https://github.com/psf/black/issues/1051), + so we are checking it here instead. + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + + Notes + ----- + GH #30454 + """ + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token.type == next_token.type == token.STRING: + yield ( + current_token.start[0], + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + -FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") +def strings_with_wrong_placed_whitespace( + file_obj: IO[str], +) -> Iterable[Tuple[int, str]]: + """ + Test case for leading spaces in concated strings. + + For example: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + Instead of: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explenation of the error. + """ + + def has_wrong_whitespace(first_line: str, second_line: str) -> bool: + """ + Checking if the two lines are mattching the unwanted pattern. + + Parameters + ---------- + first_line : str + First line to check. + second_line : str + Second line to check. + + Returns + ------- + bool + True if the two recived string match, an unwanted pattern. + + Notes + ----- + The unwanted pattern that we are trying to catch is if the spaces in + a string that is concatenated over multiple lines are placed at the + end of each string, unless this string is ending with a + newline character (\n). + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the end of the line," + ... " not at the beginning" + ... ) + + And what we want is: + + >>> rule = ( + ... "We want the space at the end of the line, " + ... "not at the beginning" + ... ) + + And if the string is ending with a new line character (\n) we + do not want any trailing whitespaces after it. + + For example, this is bad: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n " + ... "not at the end, like always" + ... ) + + And what we do want is: + + >>> rule = ( + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n" + ... " not at the end, like always" + ... ) + """ + if first_line.endswith(r"\n"): + return False + elif first_line.startswith(" ") or second_line.startswith(" "): + return False + elif first_line.endswith(" ") or second_line.endswith(" "): + return False + elif (not first_line.endswith(" ")) and second_line.startswith(" "): + return True + return False + + tokens: List = list(tokenize.generate_tokens(file_obj.readline)) + + for first_token, second_token, third_token in zip(tokens, tokens[1:], tokens[2:]): + # Checking if we are in a block of concated string + if ( + first_token.type == third_token.type == token.STRING + and second_token.type == token.NL + ): + # Striping the quotes, with the string litteral prefix + first_string: str = first_token.string[ + _get_literal_string_prefix_len(first_token.string) + 1 : -1 + ] + second_string: str = third_token.string[ + _get_literal_string_prefix_len(third_token.string) + 1 : -1 + ] + + if has_wrong_whitespace(first_string, second_string): + yield ( + third_token.start[0], + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) -def main(source_path: str, output_format: str) -> bool: +def main( + function: Callable[[IO[str]], Iterable[Tuple[int, str]]], + source_path: str, + output_format: str, +) -> bool: """ Main entry point of the script. Parameters ---------- + function : Callable + Function to execute for the specified validation type. source_path : str Source path representing path to a file/directory. output_format : str - Output format of the script. + Output format of the error message. Returns ------- bool - True if found any strings that needs to be concatenated. + True if found any patterns are found related to the given function. Raises ------ @@ -53,66 +304,50 @@ def main(source_path: str, output_format: str) -> bool: If the `source_path` is not pointing to existing file/directory. """ if not os.path.exists(source_path): - raise ValueError( - "Please enter a valid path, pointing to a valid file/directory." - ) + raise ValueError("Please enter a valid path, pointing to a file/directory.") is_failed: bool = False - - msg = "String unnecessarily split in two by black. Please merge them manually." + file_path: str = "" if os.path.isfile(source_path): - for source_path, line_number in strings_to_concatenate(source_path): - is_failed = True - print( - output_format.format( - source_path=source_path, line_number=line_number, msg=msg + file_path = source_path + with open(file_path, "r") as file_obj: + for line_number, msg in function(file_obj): + is_failed = True + print( + output_format.format( + source_path=file_path, line_number=line_number, msg=msg + ) ) - ) for subdir, _, files in os.walk(source_path): for file_name in files: - if any( + if not any( file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK ): - for source_path, line_number in strings_to_concatenate( - os.path.join(subdir, file_name) - ): + continue + + file_path = os.path.join(subdir, file_name) + with open(file_path, "r") as file_obj: + for line_number, msg in function(file_obj): is_failed = True print( output_format.format( - source_path=source_path, line_number=line_number, msg=msg + source_path=file_path, line_number=line_number, msg=msg ) ) - return is_failed - - -def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]: - """ - Yielding the strings that needs to be concatenated in a given file. - - Parameters - ---------- - source_path : str - File path pointing to a single file. - Yields - ------ - source_path : str - Source file path. - line_number : int - Line number of unconcatenated string. - """ - with open(source_path, "r") as file_name: - tokens: List = list(tokenize.generate_tokens(file_name.readline)) - - for current_token, next_token in zip(tokens, tokens[1:]): - if current_token[0] == next_token[0] == token.STRING: - yield source_path, current_token[2][0] + return is_failed if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Validate concatenated strings") + available_validation_types: List[str] = [ + "bare_pytest_raises", + "strings_to_concatenate", + "strings_with_wrong_placed_whitespace", + ] + + parser = argparse.ArgumentParser(description="Unwanted patterns checker.") parser.add_argument( "path", nargs="?", default=".", help="Source path of file/directory to check." @@ -120,10 +355,23 @@ def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, parser.add_argument( "--format", "-f", - default="{source_path}:{line_number}:{msg}", - help="Output format of the unconcatenated strings.", + default="{source_path}:{line_number}:{msg}.", + help="Output format of the error message.", + ) + parser.add_argument( + "--validation-type", + "-vt", + choices=available_validation_types, + required=True, + help="Validation test case to check.", ) args = parser.parse_args() - sys.exit(main(source_path=args.path, output_format=args.format)) + sys.exit( + main( + function=globals().get(args.validation_type), # type: ignore + source_path=args.path, + output_format=args.format, + ) + )