#!/usr/bin/env python
"""
Validate that the titles in the rst files follow the proper capitalization convention.

Print the titles that do not follow the convention.

Usage::

    ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
    ./scripts/validate_rst_title_capitalization.py doc/source/
"""
# ---------------------------------------------------------------------------
# Patch context (kept for reference; these changes live in other files).
#
# This script is added as scripts/validate_rst_title_capitalization.py
# (new file mode 100755, index 0000000000000..17752134e5049).
#
# ci/code_checks.sh (index e2dc543360a62..18f701c140e50, mode 100755),
# hunk @@ -320,6 +320,10 @@ -- added to the "docstrings" check, right after
# the validate_docstrings.py step:
#
#     MSG='Validate correct capitalization among titles in documentation' ; echo $MSG
#     $BASE_DIR/scripts/validate_rst_title_capitalization.py \
#         $BASE_DIR/doc/source/development/contributing.rst
#     RET=$(($RET + $?)) ; echo $MSG "DONE"
#
# doc/source/development/contributing.rst
# (index f904781178656..db9e23035b977, mode 100644) -- headings re-capitalized:
#
#     @@ -146,7 +146,7 @@
#     -Using a Docker Container
#     +Using a Docker container
#     @@ -919,7 +919,7 @@
#     -Validating Type Hints
#     +Validating type hints
#     @@ -1539,7 +1539,7 @@
#     -Tips for a successful Pull Request
#     +Tips for a successful pull request
#
# A further hunk (@@ -754,7 +754,7 @@) is present in the patch text but its
# body is truncated there and cannot be reproduced here.
# ---------------------------------------------------------------------------
import argparse
import glob
import os
import re
import sys
from typing import Generator, List, Tuple


CAPITALIZATION_EXCEPTIONS = {
    "pandas",
    "Python",
    "IPython",
    "PyTables",
    "Excel",
    "JSON",
    "HTML",
    "SAS",
    "SQL",
    "BigQuery",
    "STATA",
    "Interval",
    "PEP8",
    "Period",
    "Series",
    "Index",
    "DataFrame",
    "C",
    "Git",
    "GitHub",
    "NumPy",
    "Apache",
    "Arrow",
    "Parquet",
    "MultiIndex",
    "NumFOCUS",
    "sklearn",
    "Docker",
}

# Lower-cased word -> canonical spelling, for case-insensitive lookups.
CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}

err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize"

# Characters that may form an RST heading underline.
symbols = ("*", "=", "-", "^", "~", "#", '"')

# An explicit link target such as ``<https://example.org/page>``.  The capturing
# group makes ``re.split`` keep the targets (at odd indices) in its result.
_URL_TARGET = re.compile(r"(<https?://[^>]*>)")


def _restore_exception(match) -> str:
    """Return the canonical spelling of a matched word, or the word unchanged."""
    word = match.group()
    return CAP_EXCEPTIONS_DICT.get(word.lower(), word)


def correct_title_capitalization(title: str) -> str:
    """
    Algorithm to create the correct capitalization for a given title.

    The title is sentence-cased (first character upper, the rest lower) and
    every word listed in ``CAPITALIZATION_EXCEPTIONS`` is then restored to its
    canonical spelling.  Text inside a ``<http(s)://...>`` link target is left
    in the lower case produced by the sentence-casing step.

    Parameters
    ----------
    title : str
        Heading string to correct.

    Returns
    -------
    str
        Correctly capitalized heading.
    """
    # Strip all non-word characters from the beginning of the title to the
    # first word character, then sentence-case what is left.
    correct_title: str = re.sub(r"^\W*", "", title).capitalize()

    # Words in a URL must stay lowercase, even if they are a capitalization
    # exception, so only the segments *between* link targets are rewritten.
    segments = _URL_TARGET.split(correct_title)
    for position in range(0, len(segments), 2):
        segments[position] = re.sub(r"\w+", _restore_exception, segments[position])

    return "".join(segments)


def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]:
    """
    Algorithm to identify particular text that should be considered headings in an
    RST file.

    A line counts as a heading when the line after it consists of a single
    repeated character from ``symbols`` and has exactly the same length.

    See https://docutils.sourceforge.io/docs/user/rst/quickref.html#section-structure
    for details on what constitutes a string as a heading in RST.

    Parameters
    ----------
    rst_file : str
        RST file to scan through for headings.

    Yields
    ------
    title : str
        A heading found in the rst file, with any backtick, asterisk and
        underscore characters removed.

    line_number : int
        The corresponding line number of the heading.
    """
    with open(rst_file, "r", encoding="utf-8") as fd:
        previous_line = ""
        for i, line in enumerate(fd):
            # Drop only the line terminator: the last line of a file may not
            # have one, and slicing off a character would corrupt it.
            line = line.rstrip("\n")
            line_chars = set(line)
            if (
                len(line_chars) == 1
                and line_chars.pop() in symbols
                and len(line) == len(previous_line)
            ):
                # ``i`` is the 0-based index of the underline, which equals the
                # 1-based line number of the heading on the line above it.
                yield re.sub(r"[`\*_]", "", previous_line), i
            previous_line = line


def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
    """
    Given the command line arguments of directory paths, this method
    yields the strings of the .rst file directories that these paths contain.

    Parameters
    ----------
    source_paths : list of str
        Files and/or directories to validate, provided through command line
        arguments.

    Yields
    ------
    str
        Path of a .rst file: either an argument that itself ends in ``.rst``
        or a file found by recursively searching an argument.

    Raises
    ------
    ValueError
        If one of the given paths does not exist.
    """
    for directory_address in source_paths:
        if not os.path.exists(directory_address):
            raise ValueError(
                "Please enter a valid path, pointing to a valid file/directory."
            )
        elif directory_address.endswith(".rst"):
            yield directory_address
        else:
            for filename in glob.glob(
                pathname=f"{directory_address}/**/*.rst", recursive=True
            ):
                yield filename


def main(source_paths: List[str], output_format: str) -> int:
    """
    The main method to print all headings with incorrect capitalization.

    Parameters
    ----------
    source_paths : list of str
        Files and/or directories to validate, provided through command line
        arguments.
    output_format : str
        Output format of the script.  Accepted for command line compatibility
        but not applied yet: messages are always printed in one fixed layout.

    Returns
    -------
    int
        Number of incorrect headings found overall.
    """
    number_of_errors: int = 0

    for filename in find_rst_files(source_paths):
        for title, line_number in find_titles(filename):
            correct_title = correct_title_capitalization(title)
            if title != correct_title:
                print(
                    f'{filename}:{line_number}:{err_msg} "{title}" to '
                    f'"{correct_title}" '
                )
                number_of_errors += 1

    return number_of_errors


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Validate heading capitalization")

    # NOTE: with nargs="+" at least one path must be given, so the default is
    # never actually used.
    parser.add_argument(
        "paths", nargs="+", default=".", help="Source paths of file/directory to check."
    )

    parser.add_argument(
        "--format",
        "-f",
        default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}",
        help="Output format of incorrectly capitalized titles",
    )

    args = parser.parse_args()

    # Exit statuses are truncated to 8 bits on POSIX, so an unclamped count of
    # 256 errors would be reported as success.
    sys.exit(min(main(args.paths, args.format), 255))