-
-
Notifications
You must be signed in to change notification settings - Fork 17.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CI: Adding script to validate consistent and correct capitalization among headings in documentation (#26941) #31114
Changes from 42 commits
85e3fe6
e2d5354
f089c0c
2dd5791
2294331
c06c951
2ffeee0
1364f86
bb535ae
6b51df6
d6198a6
21693b6
0810c09
30c4f8c
aabd136
4c83edb
50661c3
f513f29
2d3cfe7
4ceea5e
11556b7
e55776f
9fc312a
635163d
c4ff8bd
927e3ed
83f778c
1907d45
de06ec8
b7c0bfd
3d3a7f4
7ea58df
d71be41
60d8db9
0e344ad
3757712
3d95777
56bfc44
0ec38e2
deddc2d
0311fe0
3256615
df01730
c1e3abb
9a9a57a
bafbf96
dd5c983
5f0f84a
2fc019f
ee45f98
88dfc46
78a49c1
95d3488
1c7de87
f4ffd32
3d2e9ce
687053f
ed3cdc6
c690281
c9775cc
66c651a
ac5c5b7
ceedac5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,272 @@ | ||||||
#!/usr/bin/env python | ||||||
""" | ||||||
Validate that the titles in the rst files follow the proper capitalization convention. | ||||||
|
||||||
Print the titles that do not follow the convention. | ||||||
|
||||||
Usage:: | ||||||
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst | ||||||
./scripts/validate_rst_title_capitalization.py doc/source/ | ||||||
|
||||||
""" | ||||||
import argparse | ||||||
import sys | ||||||
import re | ||||||
import os | ||||||
from typing import Tuple, Generator, List | ||||||
|
||||||
|
||||||
# Words that must keep exactly this spelling wherever they appear in a heading,
# overriding the "only the first word is capitalized" convention.
CAPITALIZATION_EXCEPTIONS = {
    "pandas",
    "Python",
    "IPython",
    "PyTables",
    "Excel",
    "JSON",
    "HTML",
    "SAS",
    "SQL",
    "BigQuery",
    "STATA",
    "Interval",
    "PEP8",
    "Period",
    "Series",
    "Index",
    "DataFrame",
    "C",
    "Git",
    "GitHub",
    "NumPy",
    "Apache",
    "Arrow",
    "Parquet",
    "MultiIndex",
    "NumFOCUS",
    "sklearn",
    "Docker",
}

# Lookup from the lowercase form of each exception to its canonical spelling.
CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}

# Maps an rst file path to the list of (title, line_number) pairs flagged in it.
bad_title_dict = {}

# Prefix of the message printed for every incorrectly capitalized heading.
err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize"
|
||||||
|
||||||
def correct_title_capitalization(title: str) -> str:
    """
    Algorithm to create the correct capitalization for a given title.

    Parameters
    ----------
    title : str
        Heading string to correct.

    Returns
    -------
    correct_title : str
        Correctly capitalized heading.
    """
    # Drop any leading non-word characters, then upper-case the first character
    # and lower-case everything else.
    correct_title: str = re.sub(r"^\W*", "", title).capitalize()

    # Words inside a <http(s)://...> link are left out of the exception lookup
    # below, so the URL is removed before the title is split into words.
    removed_https_title = re.sub(r"<https?:\/\/.*[\r\n]*>", "", correct_title)

    # Split into words on every non-word character.
    word_list = re.split(r"\W", removed_https_title)

    # Restore the canonical spelling of every word listed as an exception.
    for word in word_list:
        if word.lower() in CAP_EXCEPTIONS_DICT:
            correct_title = re.sub(
                rf"\b{word}\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title
            )

    return correct_title
|
||||||
|
||||||
def is_following_capitalization_convention(title: str) -> bool:
    """
    Function to return if a given title is capitalized correctly.

    Parameters
    ----------
    title : str
        Heading string to validate.

    Returns
    -------
    bool
        True if title capitalized correctly, False if not.
    """
    # A title is valid exactly when correcting it leaves it unchanged.
    return title == correct_title_capitalization(title)
|
||||||
|
||||||
def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]:
    """
    Algorithm to identify particular text that should be considered headings in an
    RST file.

    See <https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html> for details
    on what constitutes a string as a heading in RST.

    Parameters
    ----------
    rst_file : str
        RST file to scan through for headings.

    Yields
    ------
    title : str
        A heading found in the rst file, with ``*``, backtick and ``_``
        characters removed.
    line_number : int
        The corresponding line number of the heading (1-based).
    """
    # Read with an explicit encoding so the result does not depend on the
    # locale of the machine running the check.
    with open(rst_file, "r", encoding="utf-8") as file_obj:
        lines = file_obj.read().split("\n")

    # A heading underline is a non-empty line made of one repeated adornment
    # character.
    underline_pattern = re.compile(r'^([*=\-^~#"])\1*$')

    # Markup characters stripped from the heading text before it is yielded.
    table = str.maketrans("", "", "*`_")

    for line_no in range(1, len(lines)):
        underline = lines[line_no]
        heading = lines[line_no - 1]

        if not underline or not heading:
            continue
        if underline_pattern.match(underline) is None:
            continue

        # The underline must be at least as long as the text above it. This
        # also covers headings with a matching overline, so they need no
        # separate handling.
        if len(underline) >= len(heading):
            # ``line_no`` is the 0-based index of the underline, which equals
            # the 1-based line number of the heading text just above it.
            yield heading.translate(table), line_no
|
||||||
|
||||||
def fill_bad_title_dict(rst_file: str) -> None:
    """
    Method that fills up the bad_title_dict with incorrectly capitalized headings.

    Parameters
    ----------
    rst_file : str
        Directory address of a .rst file as a string.
    """
    # A file that already has recorded headings is not scanned again, so its
    # entries are never duplicated.
    if rst_file in bad_title_dict:
        return

    for title, line_number in find_titles(rst_file):
        if not is_following_capitalization_convention(title):
            # The entry is created lazily so files without bad headings never
            # appear in the dictionary.
            bad_title_dict.setdefault(rst_file, []).append((title, line_number))
|
||||||
|
||||||
def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
    """
    Given the command line arguments of directory paths, this method
    yields the strings of the .rst file directories that these paths contain.

    Parameters
    ----------
    source_paths : list of str
        List of directories to validate, provided through command line arguments.

    Yields
    ------
    directory_address : str
        Directory address of a .rst file found in command line argument
        directories.

    Raises
    ------
    ValueError
        If one of the given paths does not exist. Because this is a generator,
        the error is raised when that path is reached during iteration.
    """
    for directory_address in source_paths:
        if not os.path.exists(directory_address):
            raise ValueError(
                "Please enter a valid path, pointing to a valid file/directory."
            )
        elif directory_address.endswith(".rst"):
            # The path itself is an rst file: yield it unchanged.
            yield directory_address
        else:
            # Otherwise walk the path recursively and collect every .rst file.
            for dirpath, _, filenames in os.walk(directory_address):
                for file in filenames:
                    if file.endswith(".rst"):
                        yield os.path.join(dirpath, file)
|
||||||
|
||||||
def main(source_paths: List[str], output_format: str) -> int:
    """
    The main method to print all headings with incorrect capitalization.

    Parameters
    ----------
    source_paths : list of str
        List of directories to validate, provided through command line arguments.
    output_format : str
        Output format of the script.
        NOTE: this argument is accepted but not used; every message is printed
        in the fixed layout shown in the body below.

    Returns
    -------
    number_of_errors : int
        Number of incorrectly capitalized headings that were printed
        (0 if there were none).
    """
    number_of_errors: int = 0

    # Collect the badly capitalized headings of every rst file into the
    # module-level ``bad_title_dict``.
    for filename in find_rst_files(source_paths):
        fill_bad_title_dict(filename)

    # Report every recorded heading together with its suggested correction.
    for filename, bad_titles in bad_title_dict.items():
        for title, line_number in bad_titles:
            corrected = correct_title_capitalization(title)
            print(
                f'{filename}:{line_number}:{err_msg} "{title}" to "{corrected}" '
            )
            number_of_errors += 1

    return number_of_errors
|
||||||
|
||||||
if __name__ == "__main__":
    # Command line entry point: parse the paths to check and exit with a
    # non-zero status when any heading is incorrectly capitalized.
    parser = argparse.ArgumentParser(description="Validate heading capitalization")

    parser.add_argument(
        "paths", nargs="+", default=".", help="Source paths of file/directory to check."
    )

    parser.add_argument(
        "--format",
        "-f",
        default="{source_path}:{line_number}:{msg}:{heading}",
        help="Output format of incorrectly capitalized titles",
    )

    args = parser.parse_args()

    # Exit statuses are truncated by the operating system (modulo 256 on
    # POSIX), so an unclamped error count such as 256 would be reported as
    # success. Clamp to the portable 0-127 range so any failure stays non-zero.
    sys.exit(min(main(args.paths, args.format), 127))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we're using this just once, I'd move it to the function where it's being used.