From 85e3fe602ed85d1b4156dadce1e0f9503461c3fa Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 13 Jan 2020 18:00:20 -0500 Subject: [PATCH 01/44] changed name --- doc/sphinxext/contributors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index d9ba2bb2cfb07..19ebf53a2b285 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -16,7 +16,7 @@ class ContributorsDirective(Directive): required_arguments = 1 - name = "contributors" + name = "contributor" def run(self): range_ = self.arguments[0] From e2d53543f7796487b7e1add1b0f5dd8b9990cb50 Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 13 Jan 2020 21:09:14 -0500 Subject: [PATCH 02/44] adding sphinx extension --- doc/sphinxext/titles.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 doc/sphinxext/titles.py diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py new file mode 100644 index 0000000000000..8e6035a8971d1 --- /dev/null +++ b/doc/sphinxext/titles.py @@ -0,0 +1,30 @@ +"""Sphinx extension for collecting the titles in the rst files and validating +if they follow the capitalization convention. This sends a warning if +it is not followed. + +Usage:: + + .. contents:: + +This will be replaced with nothing (or hello world haha) +""" + +from docutils import nodes +from docutils.parsers.rst import Directive + + +class HelloWorld(Directive): + + def run(self): + paragraph_node = nodes.paragraph(text='Hello World!') + return [paragraph_node] + + +def setup(app): + app.add_directive("helloworld", HelloWorld) + + return { + 'version': '0.1', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } From f089c0c2ba935ced41eb26cd49fdbc0cf4488afe Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 13 Jan 2020 21:58:00 -0500 Subject: [PATCH 03/44] Starting builder --- doc/sphinxext/titles.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 8e6035a8971d1..ea9079e61022d 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -11,17 +11,33 @@ from docutils import nodes from docutils.parsers.rst import Directive +from sphinx.builders import Builder +# Use spelling builder as an example!!!!! -class HelloWorld(Directive): - def run(self): +class HeadingCapitalizationValidator(Builder): + + def init(self): + z = 0 + if z == 0: + self.warning("Warning: capitalization not followed") paragraph_node = nodes.paragraph(text='Hello World!') return [paragraph_node] + def write_doc(self, docname, doctree): + for node in doctree.traverse(nodes.Text): + if (node.tagname == '#title'): + print("Bro") + + def finish(self): + z = 1 + return + def setup(app): - app.add_directive("helloworld", HelloWorld) + app.info("Initializing capitalization validator") + app. return { 'version': '0.1', From 2dd5791980794be59e4493d2455dec29be5b76cf Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 13 Jan 2020 22:27:08 -0500 Subject: [PATCH 04/44] experimenting with builder --- doc/sphinxext/titles.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index ea9079e61022d..0c34bde5492df 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -19,16 +19,18 @@ class HeadingCapitalizationValidator(Builder): def init(self): + self.docnames = [] + self.document_data = [] + return + + def write_doc(self, docname, doctree): z = 0 if z == 0: self.warning("Warning: capitalization not followed") - paragraph_node = nodes.paragraph(text='Hello World!') - return [paragraph_node] - def write_doc(self, docname, doctree): for node in doctree.traverse(nodes.Text): - if (node.tagname == '#title'): - print("Bro") + if (node.tagname == '#subtitle'): + print(node.astext()) def finish(self): z = 1 @@ -37,10 +39,12 @@ def finish(self): def setup(app): app.info("Initializing capitalization validator") - app. + app.add_builder(HeadingCapitalizationValidator) + return + - return { - 'version': '0.1', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } + # return { + # 'version': '0.1', + # 'parallel_read_safe': True, + # 'parallel_write_safe': True, + # } From 22943315d9ab2a50f9cff6533b3ff3b6d8eaf262 Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 13 Jan 2020 22:42:30 -0500 Subject: [PATCH 05/44] before running build --- doc/source/conf.py | 1 + doc/sphinxext/titles.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/conf.py b/doc/source/conf.py index 481c03ab8f388..e4f61c2a8ee55 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -65,6 +65,7 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension + "titles", # Tony Wu custom pandas extension ] exclude_patterns = ["**.ipynb_checkpoints"] diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 0c34bde5492df..9f18c77200cfa 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -18,6 +18,8 @@ class HeadingCapitalizationValidator(Builder): + name = 'capitalizationValidator' + def init(self): self.docnames = [] self.document_data = [] From c06c95179de2277e9cc7fbca2e64dd0f8e377054 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 00:05:42 -0500 Subject: [PATCH 06/44] 126 warnings? --- doc/source/conf.py | 2 +- doc/sphinxext/titles.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index e4f61c2a8ee55..0d32c2c0674cf 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -65,7 +65,7 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension - "titles", # Tony Wu custom pandas extension + "titles", # Tony Wu custom pandas extension, MIGHT NEEDA FIX THIS LOWKEY ] exclude_patterns = ["**.ipynb_checkpoints"] diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 9f18c77200cfa..4cddf1faef7a2 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -19,7 +19,7 @@ class HeadingCapitalizationValidator(Builder): name = 'capitalizationValidator' - + def init(self): self.docnames = [] self.document_data = [] @@ -40,7 +40,6 @@ def finish(self): def setup(app): - app.info("Initializing capitalization validator") app.add_builder(HeadingCapitalizationValidator) return From 2ffeee0c0441bc335164538479d570486b953636 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 00:20:43 -0500 Subject: [PATCH 07/44] contributors --- doc/sphinxext/contributors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 19ebf53a2b285..d9ba2bb2cfb07 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -16,7 +16,7 @@ class ContributorsDirective(Directive): required_arguments = 1 - name = "contributor" + name = "contributors" def run(self): range_ = self.arguments[0] From 1364f86b9abd1ba895352a60ea944f7181a27ce4 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 10:33:52 -0500 Subject: [PATCH 08/44] experimenting --- doc/source/conf.py | 2 +- doc/sphinxext/helloworld.py | 19 +++++++++++++++++++ doc/sphinxext/titles.py | 3 ++- 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 doc/sphinxext/helloworld.py diff --git a/doc/source/conf.py b/doc/source/conf.py index 0d32c2c0674cf..0e2e49d3db113 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -65,7 +65,7 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension - "titles", # Tony Wu custom pandas extension, MIGHT NEEDA FIX THIS LOWKEY + "helloworld", # Tony Wu custom pandas extension, MIGHT NEEDA FIX THIS LOWKEY ] exclude_patterns = ["**.ipynb_checkpoints"] diff --git a/doc/sphinxext/helloworld.py b/doc/sphinxext/helloworld.py new file mode 100644 index 0000000000000..52fb7626d226e --- /dev/null +++ b/doc/sphinxext/helloworld.py @@ -0,0 +1,19 @@ +from docutils import nodes +from docutils.parsers.rst import Directive + + +class HelloWorld(Directive): + + def run(self): + paragraph_node = nodes.paragraph(text='Hello World!') + return [paragraph_node] + + +def setup(app): + app.add_directive("contents", HelloWorld) + + return { + 'version': '0.1', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 4cddf1faef7a2..758dbe4703e00 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -23,12 +23,13 @@ class HeadingCapitalizationValidator(Builder): def init(self): self.docnames = [] self.document_data = [] + print("BRO") return def write_doc(self, docname, doctree): z = 0 if z == 0: - self.warning("Warning: capitalization not followed") + self.error("Warning: capitalization not followed") for node in doctree.traverse(nodes.Text): if (node.tagname == '#subtitle'): From bb535aea5f495f879740016b32be3f890339edd2 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 11:00:13 -0500 Subject: [PATCH 09/44] update --- doc/source/conf.py | 2 +- doc/sphinxext/titles.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 0e2e49d3db113..e4f61c2a8ee55 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -65,7 +65,7 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension - "helloworld", # Tony Wu custom pandas extension, MIGHT NEEDA FIX THIS LOWKEY + "titles", # Tony Wu custom pandas extension ] exclude_patterns = ["**.ipynb_checkpoints"] diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 758dbe4703e00..ad02a9a810075 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -42,7 +42,11 @@ def finish(self): def setup(app): app.add_builder(HeadingCapitalizationValidator) - return + return { + 'version': '0.1', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } # return { From 6b51df6209750164f99f5f1e7098cd4a99eda2f0 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 12:50:30 -0500 Subject: [PATCH 10/44] parser created --- doc/source/conf.py | 2 +- doc/sphinxext/helloworld.py | 3 ++- doc/sphinxext/titles.py | 21 ++++++++++++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index e4f61c2a8ee55..283a5d587dbb7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -65,7 +65,7 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension - "titles", # Tony Wu custom pandas extension + "helloworld", # Tony Wu custom pandas extension ] exclude_patterns = ["**.ipynb_checkpoints"] diff --git a/doc/sphinxext/helloworld.py b/doc/sphinxext/helloworld.py index 52fb7626d226e..2755ba90cb166 100644 --- a/doc/sphinxext/helloworld.py +++ b/doc/sphinxext/helloworld.py @@ -5,12 +5,13 @@ class HelloWorld(Directive): def run(self): + self.error("found an error") paragraph_node = nodes.paragraph(text='Hello World!') return [paragraph_node] def setup(app): - app.add_directive("contents", HelloWorld) + app.add_directive("helloworld", HelloWorld) return { 'version': '0.1', diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index ad02a9a810075..65c0cefaf090d 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -4,7 +4,7 @@ Usage:: - .. contents:: + .. contents:: http://code.nabla.net/doc/docutils/api/docutils/docutils.nodes.html This will be replaced with nothing (or hello world haha) """ @@ -54,3 +54,22 @@ def setup(app): # 'parallel_read_safe': True, # 'parallel_write_safe': True, # } + +# http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html#get_children +import docutils +from docutils import nodes +from docutils.parsers.rst import Parser + +parser = Parser() +f = open("contributing.rst", "r") +input = f.read() +settings = docutils.frontend.OptionParser( + components=(docutils.parsers.rst.Parser,) + ).get_default_values() +document = docutils.utils.new_document('Document', settings) +parser.parse(input, document) + +# node.tagname = #text, parent.tagname = title (ALL OF THEM) +for node in document.traverse(nodes.Text): + if (node.tagname == '#text' and node.parent.tagname == 'title'): + print(node.astext()) From d6198a68567bb016f3255e3b2f1e353385d893f3 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 16:34:59 -0500 Subject: [PATCH 11/44] italics working --- doc/sphinxext/titles.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 65c0cefaf090d..7edc575e5f33a 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -70,6 +70,9 @@ def setup(app): parser.parse(input, document) # node.tagname = #text, parent.tagname = title (ALL OF THEM) +listOfMarkers = ['emphasis', 'strong', 'reference'] for node in document.traverse(nodes.Text): - if (node.tagname == '#text' and node.parent.tagname == 'title'): - print(node.astext()) + if (node.tagname == '#text'): + if (node.parent.tagname == 'title' or (node.parent.parent.tagname == 'title' and + node.parent.tagname in listOfMarkers)): + print(node.astext()) From 21693b6b09db158599afd8864e84640cca238d50 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 17:32:16 -0500 Subject: [PATCH 12/44] found a way to collect all heading strings from doctree --- doc/sphinxext/titles.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py index 7edc575e5f33a..bfc9d04e4a22c 100644 --- a/doc/sphinxext/titles.py +++ b/doc/sphinxext/titles.py @@ -69,10 +69,32 @@ def setup(app): document = docutils.utils.new_document('Document', settings) parser.parse(input, document) +# print list of all the subtitles/headings that we want. # node.tagname = #text, parent.tagname = title (ALL OF THEM) -listOfMarkers = ['emphasis', 'strong', 'reference'] +listOfMarkers = ['emphasis', 'strong', 'reference', 'literal'] +myText = "" +markerGrandparent = "" +beforeMarker = False for node in document.traverse(nodes.Text): - if (node.tagname == '#text'): - if (node.parent.tagname == 'title' or (node.parent.parent.tagname == 'title' and - node.parent.tagname in listOfMarkers)): - print(node.astext()) + if (node.parent.tagname == 'title'): + if (beforeMarker and markerGrandparent == node.parent): + myText = myText + node.astext() + beforeMarker = False + else: + if (myText != ""): + print(myText) + myText = node.astext() + beforeMarker = False + elif (node.parent.parent.tagname == 'title' and + node.parent.tagname in listOfMarkers): + myText = myText + node.astext() + beforeMarker = True + markerGrandparent = node.parent.parent + else: + beforeMarker = False + if (myText != ""): + print(myText) + myText = "" + +if (myText != ""): + print(myText) From 0810c094e51ad6a21ea174f237711744bbd90a7e Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 14 Jan 2020 22:47:04 -0500 Subject: [PATCH 13/44] testing script --- doc/sphinxext/helloworld.py | 43 ++++++++++--- doc/sphinxext/titles.py | 100 ----------------------------- titles.py | 122 ++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 107 deletions(-) delete mode 100644 doc/sphinxext/titles.py create mode 100644 titles.py diff --git a/doc/sphinxext/helloworld.py b/doc/sphinxext/helloworld.py index 2755ba90cb166..ca7f71c1454e1 100644 --- a/doc/sphinxext/helloworld.py +++ b/doc/sphinxext/helloworld.py @@ -2,19 +2,48 @@ from docutils.parsers.rst import Directive -class HelloWorld(Directive): +from docutils import nodes +from docutils.parsers.rst import Directive +from sphinx.builders import Builder - def run(self): - self.error("found an error") - paragraph_node = nodes.paragraph(text='Hello World!') - return [paragraph_node] +# Use spelling builder as an example!!!!! -def setup(app): - app.add_directive("helloworld", HelloWorld) +class HeadingCapitalizationValidator(Builder): + + name = 'capitalizationValidator' + + def init(self): + self.docnames = [] + self.document_data = [] + print("BRO") + return + + def write_doc(self, docname, doctree): + z = 0 + if z == 0: + self.error("Warning: capitalization not followed") + for node in doctree.traverse(nodes.Text): + if (node.tagname == 'title'): + print(node.astext()) + + def finish(self): + z = 1 + return + + +def setup(app): + app.add_builder(HeadingCapitalizationValidator) return { 'version': '0.1', 'parallel_read_safe': True, 'parallel_write_safe': True, } + + + # return { + # 'version': '0.1', + # 'parallel_read_safe': True, + # 'parallel_write_safe': True, + # } diff --git a/doc/sphinxext/titles.py b/doc/sphinxext/titles.py deleted file mode 100644 index bfc9d04e4a22c..0000000000000 --- a/doc/sphinxext/titles.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Sphinx extension for collecting the titles in the rst files and validating -if they follow the capitalization convention. This sends a warning if -it is not followed. - -Usage:: - - .. contents:: http://code.nabla.net/doc/docutils/api/docutils/docutils.nodes.html - -This will be replaced with nothing (or hello world haha) -""" - -from docutils import nodes -from docutils.parsers.rst import Directive -from sphinx.builders import Builder - -# Use spelling builder as an example!!!!! - - -class HeadingCapitalizationValidator(Builder): - - name = 'capitalizationValidator' - - def init(self): - self.docnames = [] - self.document_data = [] - print("BRO") - return - - def write_doc(self, docname, doctree): - z = 0 - if z == 0: - self.error("Warning: capitalization not followed") - - for node in doctree.traverse(nodes.Text): - if (node.tagname == '#subtitle'): - print(node.astext()) - - def finish(self): - z = 1 - return - - -def setup(app): - app.add_builder(HeadingCapitalizationValidator) - return { - 'version': '0.1', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } - - - # return { - # 'version': '0.1', - # 'parallel_read_safe': True, - # 'parallel_write_safe': True, - # } - -# http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html#get_children -import docutils -from docutils import nodes -from docutils.parsers.rst import Parser - -parser = Parser() -f = open("contributing.rst", "r") -input = f.read() -settings = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,) - ).get_default_values() -document = docutils.utils.new_document('Document', settings) -parser.parse(input, document) - -# print list of all the subtitles/headings that we want. -# node.tagname = #text, parent.tagname = title (ALL OF THEM) -listOfMarkers = ['emphasis', 'strong', 'reference', 'literal'] -myText = "" -markerGrandparent = "" -beforeMarker = False -for node in document.traverse(nodes.Text): - if (node.parent.tagname == 'title'): - if (beforeMarker and markerGrandparent == node.parent): - myText = myText + node.astext() - beforeMarker = False - else: - if (myText != ""): - print(myText) - myText = node.astext() - beforeMarker = False - elif (node.parent.parent.tagname == 'title' and - node.parent.tagname in listOfMarkers): - myText = myText + node.astext() - beforeMarker = True - markerGrandparent = node.parent.parent - else: - beforeMarker = False - if (myText != ""): - print(myText) - myText = "" - -if (myText != ""): - print(myText) diff --git a/titles.py b/titles.py new file mode 100644 index 0000000000000..d36370c8a8777 --- /dev/null +++ b/titles.py @@ -0,0 +1,122 @@ +"""Sphinx extension for collecting the titles in the rst files and validating +if they follow the capitalization convention. This sends a warning if +it is not followed. + +Reference: +http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html#get_children + +""" + +from docutils.parsers.rst import Parser +import docutils +from docutils import nodes +import re +import os +from os import walk + + + +def followCapitalizationConvention(title): + # Keynames that would not follow capitalization convention + keyNames = {'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', + 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', + 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub'} + + # Lowercase representation of keynames + keyNamesLower = {'pandas'} + for k in keyNames: + keyNamesLower.add(k.lower()) + + # split with delimiters comma, semicolon and space, parentheses, colon + wordList = re.split(r'[;,():\s]\s*', title) # followed by any amount of extra whitespace. + + + # Edge Case: First word is an empty string + if (len(wordList[0]) == 0): + return False + + # Dealing with the first word of the title + if wordList[0] not in keyNames: + # word is not in keyNames but has different capitalization + if wordList[0] in keyNamesLower: + return False + # First letter of first word must be uppercase + if (not wordList[0][0].isupper()): + return False + # Remaining letters of first word must not be uppercase + for j in range(1, len(wordList[0])): + if wordList[0][j].isupper(): + return False + + # Remaining letters must not be uppercase letters + for i in range(1, len(wordList)): + if wordList[i] not in keyNames: + # word is not in keyNames but has different capitalization + if wordList[i] in keyNamesLower: + return False + # Remaining letters must not be uppercase + for j in range(len(wordList[i])): + if wordList[i][j].isupper(): + return False + + return True + + +def printBadTitles(rstFile): + badTitles = [] + parser = docutils.parsers.rst.Parser() + # f = open("doc/source/development/contributing.rst", "r") + f = open(rstFile, "r") + input = f.read() + settings = docutils.frontend.OptionParser( + components=(docutils.parsers.rst.Parser,) + ).get_default_values() + document = docutils.utils.new_document('Document', settings) + parser.parse(input, document) + + # print list of all the subtitles/headings that we want. + # Note: allParentTagsOfText = {'problematic', 'title', 'emphasis', 'inline', 'strong', 'literal', 'literal_block', 'title_reference', 'reference', 'paragraph'} + listOfMarkers = {'emphasis', 'strong', 'reference', 'literal'} + myText = "" + markerGrandparent = "" + beforeMarker = False + titleList = [] + for node in document.traverse(nodes.Text): + if (node.parent.tagname == 'title'): + if (beforeMarker and markerGrandparent == node.parent): + myText = myText + node.astext() + beforeMarker = False + else: + if (myText != ""): + titleList.append(myText) + myText = node.astext() + beforeMarker = False + elif (node.parent.parent.tagname == 'title' and + node.parent.tagname in listOfMarkers): + myText = myText + node.astext() + beforeMarker = True + markerGrandparent = node.parent.parent + else: + beforeMarker = False + if (myText != ""): + titleList.append(myText) + myText = "" + + if (myText != ""): + titleList.append(myText) + + for text in titleList: + if not followCapitalizationConvention(text): + badTitles.append(text) + + print(badTitles) + +f = [] +for (dirpath, dirnames, filenames) in walk('doc/source'): + for file in filenames: + if file.endswith(".rst"): + f.append(os.path.join(dirpath, file)) + +for filename in f: + print(filename) + printBadTitles(filename) From 30c4f8c363873201fe2238d62fdb9d317fd2f7c9 Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 15 Jan 2020 11:08:54 -0500 Subject: [PATCH 14/44] modified validation script --- .../validate_rst_title_capitalization.py | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) rename titles.py => scripts/validate_rst_title_capitalization.py (79%) diff --git a/titles.py b/scripts/validate_rst_title_capitalization.py similarity index 79% rename from titles.py rename to scripts/validate_rst_title_capitalization.py index d36370c8a8777..f5c6c22d497ec 100644 --- a/titles.py +++ b/scripts/validate_rst_title_capitalization.py @@ -7,6 +7,7 @@ """ +import sys from docutils.parsers.rst import Parser import docutils from docutils import nodes @@ -14,17 +15,20 @@ import os from os import walk +# Keynames that would not follow capitalization convention +CAPITALIZATION_EXCEPTIONS = { + 'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', + 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', + 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub' +} + def followCapitalizationConvention(title): - # Keynames that would not follow capitalization convention - keyNames = {'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', - 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', - 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub'} # Lowercase representation of keynames keyNamesLower = {'pandas'} - for k in keyNames: + for k in CAPITALIZATION_EXCEPTIONS: keyNamesLower.add(k.lower()) # split with delimiters comma, semicolon and space, parentheses, colon @@ -36,9 +40,9 @@ def followCapitalizationConvention(title): return False # Dealing with the first word of the title - if wordList[0] not in keyNames: + if wordList[0] not in CAPITALIZATION_EXCEPTIONS: # word is not in keyNames but has different capitalization - if wordList[0] in keyNamesLower: + if wordList[0].lower() in keyNamesLower: return False # First letter of first word must be uppercase if (not wordList[0][0].isupper()): @@ -50,9 +54,9 @@ def followCapitalizationConvention(title): # Remaining letters must not be uppercase letters for i in range(1, len(wordList)): - if wordList[i] not in keyNames: + if wordList[i] not in CAPITALIZATION_EXCEPTIONS: # word is not in keyNames but has different capitalization - if wordList[i] in keyNamesLower: + if wordList[i].lower() in keyNamesLower: return False # Remaining letters must not be uppercase for j in range(len(wordList[i])): @@ -107,16 +111,22 @@ def printBadTitles(rstFile): for text in titleList: if not followCapitalizationConvention(text): - badTitles.append(text) + print(text) + # badTitles.append(text) + + # print(badTitles) - print(badTitles) +def findBadTitles(directoryAddress): + f = [] + for (dirpath, dirnames, filenames) in walk(directoryAddress): + for file in filenames: + if file.endswith(".rst"): + f.append(os.path.join(dirpath, file)) -f = [] -for (dirpath, dirnames, filenames) in walk('doc/source'): - for file in filenames: - if file.endswith(".rst"): - f.append(os.path.join(dirpath, file)) + for filename in f: + print(filename) + printBadTitles(filename) -for filename in f: - print(filename) - printBadTitles(filename) +if __name__ == "__main__": + for i in range(1, len(sys.argv)): + findBadTitles(sys.argv[i]) From aabd136c787fef1d411d9bde0dc9789f6a3f6dce Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 15 Jan 2020 11:23:54 -0500 Subject: [PATCH 15/44] command line arguments possible for validation script --- scripts/validate_rst_title_capitalization.py | 27 ++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index f5c6c22d497ec..a8e275a4d596c 100644 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,8 +1,16 @@ -"""Sphinx extension for collecting the titles in the rst files and validating -if they follow the capitalization convention. This sends a warning if -it is not followed. +"""Python script for collecting the titles in the rst files and validating +if they follow the capitalization convention. Prints the titles that do not +follow the convention. Particularly used for .rst files in the doc/source folder -Reference: +NOTE: Run from the root directory of pandas repository + +Example: +python ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst + +Folders that have been validated: +doc/source/development + +Reference: doctree elements http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html#get_children """ @@ -118,10 +126,13 @@ def printBadTitles(rstFile): def findBadTitles(directoryAddress): f = [] - for (dirpath, dirnames, filenames) in walk(directoryAddress): - for file in filenames: - if file.endswith(".rst"): - f.append(os.path.join(dirpath, file)) + if (directoryAddress.endswith(".rst")): + f.append(directoryAddress) + else: + for (dirpath, dirnames, filenames) in walk(directoryAddress): + for file in filenames: + if file.endswith(".rst"): + f.append(os.path.join(dirpath, file)) for filename in f: print(filename) From 4c83edb930ddd9f527307ff303306ebfdb881fc9 Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 15 Jan 2020 19:04:15 -0500 Subject: [PATCH 16/44] validation script needs better commenting --- scripts/validate_rst_title_capitalization.py | 47 +++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index a8e275a4d596c..7f4e646e55ac5 100644 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -7,11 +7,12 @@ Example: python ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst -Folders that have been validated: -doc/source/development +Files that cannot be validated: (code crashes when validating for some reason) +doc/source/user_guide/io.rst +doc/source/whatsnew/v0.17.1.rst Reference: doctree elements -http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html#get_children +http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html """ @@ -27,11 +28,17 @@ CAPITALIZATION_EXCEPTIONS = { 'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', - 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub' + 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub', 'NumPy', + 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS' } +# Dictionary of bad titles that will be printed later +badTitleDictionary = {} +# List of files that, when validated, causes the program to crash +cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] +# Method returns true or false depending on whether title follows convention def followCapitalizationConvention(title): # Lowercase representation of keynames @@ -73,11 +80,19 @@ def followCapitalizationConvention(title): return True - +# Method prints all of the bad titles def printBadTitles(rstFile): - badTitles = [] + # Ensure file isn't one that causes the code to crash + if rstFile in cannotValidate: + return + # Initialize this file's badtitleDictionary slot + if rstFile in badTitleDictionary: + return + else: + badTitleDictionary[rstFile] = [] + + # Parse through rstFile parser = docutils.parsers.rst.Parser() - # f = open("doc/source/development/contributing.rst", "r") f = open(rstFile, "r") input = f.read() settings = docutils.frontend.OptionParser( @@ -119,11 +134,9 @@ def printBadTitles(rstFile): for text in titleList: if not followCapitalizationConvention(text): - print(text) - # badTitles.append(text) - - # print(badTitles) + badTitleDictionary[rstFile].append(text) +# Method finds all the bad titles, runs printBadTitles def findBadTitles(directoryAddress): f = [] if (directoryAddress.endswith(".rst")): @@ -135,9 +148,19 @@ def findBadTitles(directoryAddress): f.append(os.path.join(dirpath, file)) for filename in f: - print(filename) printBadTitles(filename) +# Main Method if __name__ == "__main__": for i in range(1, len(sys.argv)): findBadTitles(sys.argv[i]) + + print("\n \nBAD TITLES \n \n") + + # Print badTitleDictionary Results + for key in badTitleDictionary: + if (len(badTitleDictionary[key]) != 0): + print(key) + for titles in badTitleDictionary[key]: + print(titles) + print() From 50661c3af10f80cb9416222fe39025cb9b6126ea Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 08:36:07 -0500 Subject: [PATCH 17/44] added line number to validation script --- scripts/validate_rst_title_capitalization.py | 120 ++++++++++++++++--- 1 file changed, 104 insertions(+), 16 deletions(-) mode change 100644 => 100755 scripts/validate_rst_title_capitalization.py diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py old mode 100644 new mode 100755 index 7f4e646e55ac5..a2d20a4036e59 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """Python script for collecting the titles in the rst files and validating if they follow the capitalization convention. Prints the titles that do not follow the convention. Particularly used for .rst files in the doc/source folder @@ -5,7 +7,7 @@ NOTE: Run from the root directory of pandas repository Example: -python ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst +./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst Files that cannot be validated: (code crashes when validating for some reason) doc/source/user_guide/io.rst @@ -24,6 +26,43 @@ import os from os import walk +class suppress_stdout_stderr(object): + ''' + Code source: + https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions + + A context manager for doing a "deep suppression" of stdout and stderr in + Python, i.e. will suppress all print, even if the print originates in a + compiled C/Fortran sub-function. + This will not suppress raised exceptions, since exceptions are printed + to stderr just before a script exits, and after the context manager has + exited (at least, I think that is why it lets exceptions through). + + This code is needed to suppress output from the parser method + because the parser method prints to stdout when encountering Sphinx + references, as it cannot parse those at this moment. + + ''' + def __init__(self): + # Open a pair of null files + self.null_fds = [os.open(os.devnull,os.O_RDWR) for x in range(2)] + # Save the actual stdout (1) and stderr (2) file descriptors. + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self): + # Assign the null pointers to stdout and stderr. + os.dup2(self.null_fds[0],1) + os.dup2(self.null_fds[1],2) + + def __exit__(self, *_): + # Re-assign the real stdout/stderr back to (1) and (2) + os.dup2(self.save_fds[0],1) + os.dup2(self.save_fds[1],2) + # Close all file descriptors + for fd in self.null_fds + self.save_fds: + os.close(fd) + + # Keynames that would not follow capitalization convention CAPITALIZATION_EXCEPTIONS = { 'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', @@ -35,11 +74,22 @@ # Dictionary of bad titles that will be printed later badTitleDictionary = {} +# List of problematic tags that are exceptions to parent rule +listOfMarkers = {'emphasis', 'strong', 'reference', 'literal'} + # List of files that, when validated, causes the program to crash cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] -# Method returns true or false depending on whether title follows convention -def followCapitalizationConvention(title): +# Error Message: +errMessage = "Title capitalization formatted incorrectly. Manually format correctly" + + +def followCapitalizationConvention(title: str) -> bool: + ''' + Method returns true or false depending on whether a title follows + the capitalization convention + + ''' # Lowercase representation of keynames keyNamesLower = {'pandas'} @@ -80,8 +130,24 @@ def followCapitalizationConvention(title): return True -# Method prints all of the bad titles -def printBadTitles(rstFile): +def findLineNumber(node: docutils.nodes) -> int: + ''' + Method that finds the line number in a document for a particular node + + ''' + if (node.tagname == 'document'): + return 1 + elif (node.line == None): + return findLineNumber(node.parent) + else: + return node.line - 1 + +def fillBadTitleDictionary(rstFile: str) -> None: + ''' + Method that prints all of the bad titles + Message: [directory of rstFile, line number of bad title, error message] + + ''' # Ensure file isn't one that causes the code to crash if rstFile in cannotValidate: return @@ -99,15 +165,17 @@ def printBadTitles(rstFile): components=(docutils.parsers.rst.Parser,) ).get_default_values() document = docutils.utils.new_document('Document', settings) - parser.parse(input, document) - # print list of all the subtitles/headings that we want. - # Note: allParentTagsOfText = {'problematic', 'title', 'emphasis', 'inline', 'strong', 'literal', 'literal_block', 'title_reference', 'reference', 'paragraph'} - listOfMarkers = {'emphasis', 'strong', 'reference', 'literal'} + with suppress_stdout_stderr(): + parser.parse(input, document) + + + # Fill up the titleList with lines that follow the title pattern myText = "" markerGrandparent = "" beforeMarker = False titleList = [] + lineNumberList = [] for node in document.traverse(nodes.Text): if (node.parent.tagname == 'title'): if (beforeMarker and markerGrandparent == node.parent): @@ -116,10 +184,13 @@ def printBadTitles(rstFile): else: if (myText != ""): titleList.append(myText) + lineNumberList.append(lineno) + lineno = findLineNumber(node) myText = node.astext() beforeMarker = False elif (node.parent.parent.tagname == 'title' and node.parent.tagname in listOfMarkers): + lineno = findLineNumber(node) myText = myText + node.astext() beforeMarker = True markerGrandparent = node.parent.parent @@ -127,17 +198,28 @@ def printBadTitles(rstFile): beforeMarker = False if (myText != ""): titleList.append(myText) + lineNumberList.append(lineno) myText = "" + lineno = 0 if (myText != ""): titleList.append(myText) + lineNumberList.append(lineno) + - for text in titleList: - if not followCapitalizationConvention(text): - badTitleDictionary[rstFile].append(text) + # For each line in the titleList, append the badTitleDictionary if + # the capitalization convention is not followed + for i in range(len(titleList)): + if not followCapitalizationConvention(titleList[i]): + badTitleDictionary[rstFile].append((titleList[i], lineNumberList[i])) -# Method finds all the bad titles, runs printBadTitles -def findBadTitles(directoryAddress): + +def findBadTitles(directoryAddress: str) -> None: + + ''' + Method finds all the bad titles, runs fillBadTitleDictionary + + ''' f = [] if (directoryAddress.endswith(".rst")): f.append(directoryAddress) @@ -148,19 +230,25 @@ def findBadTitles(directoryAddress): f.append(os.path.join(dirpath, file)) for filename in f: - printBadTitles(filename) + fillBadTitleDictionary(filename) # Main Method if __name__ == "__main__": for i in range(1, len(sys.argv)): findBadTitles(sys.argv[i]) - print("\n \nBAD TITLES \n \n") + print("BAD TITLES \n \n") # Print badTitleDictionary Results + printed = False for key in badTitleDictionary: if (len(badTitleDictionary[key]) != 0): + printed = True print(key) for titles in badTitleDictionary[key]: print(titles) print() + + # Exit code of 1 if there were bad titles + if (printed): + sys.exit(1) From f513f29cd9216892842119d6c7957dcd28149495 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 08:50:29 -0500 Subject: [PATCH 18/44] edited code_checks.sh --- ci/code_checks.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 83ceb11dfcbf4..8cccee87cc66d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -340,5 +340,10 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" fi +### VALIDATE TITLE CAPITALIZATION CONVENTION (Do I need an IF statement?)### +MSG='Validate correct capitalization among titles in documentation' ; echo $MSG +$BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development/contributing.rst +$BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/index.rst $BASE_DIR/doc/source/ecosystem.rst +RET=$(($RET + $?)) ; echo $MSG "DONE" exit $RET From 2d3cfe77cba488591aac0cebbc68887afbb4d585 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 12:13:59 -0500 Subject: [PATCH 19/44] argument parser correctly implemented --- scripts/validate_rst_title_capitalization.py | 165 ++++++++++++------- 1 file changed, 107 insertions(+), 58 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index a2d20a4036e59..1bb4a3369b8d0 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,6 +1,9 @@ #!/usr/bin/env python -"""Python script for collecting the titles in the rst files and validating +""" +Author: tonywu1999, Date Edited: 01/17/2020 + +Python script for collecting the titles in the rst files and validating if they follow the capitalization convention. Prints the titles that do not follow the convention. Particularly used for .rst files in the doc/source folder @@ -18,6 +21,7 @@ """ +import argparse import sys from docutils.parsers.rst import Parser import docutils @@ -25,6 +29,7 @@ import re import os from os import walk +from typing import Generator, List, Tuple class suppress_stdout_stderr(object): ''' @@ -71,7 +76,11 @@ def __exit__(self, *_): 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS' } -# Dictionary of bad titles that will be printed later +# Lowercase representation of CAPITALIZATION_EXCEPTIONS +CAPITALIZATION_EXCEPTIONS_LOWER = {word.lower() for word in CAPITALIZATION_EXCEPTIONS} + +# Dictionary of bad titles that will be printed later along with line numbers +# Key: Document Directory, Value: Pair(Bad Title, Line Number) badTitleDictionary = {} # List of problematic tags that are exceptions to parent rule @@ -83,23 +92,18 @@ def __exit__(self, *_): # Error Message: errMessage = "Title capitalization formatted incorrectly. Manually format correctly" - def followCapitalizationConvention(title: str) -> bool: ''' - Method returns true or false depending on whether a title follows - the capitalization convention + tonywu1999's algorithm to determine if a heading follows the capitalization convention - ''' + This method returns true if the title follows the convention + and false if it does not - # Lowercase representation of keynames - keyNamesLower = {'pandas'} - for k in CAPITALIZATION_EXCEPTIONS: - keyNamesLower.add(k.lower()) + ''' # split with delimiters comma, semicolon and space, parentheses, colon wordList = re.split(r'[;,():\s]\s*', title) # followed by any amount of extra whitespace. - # Edge Case: First word is an empty string if (len(wordList[0]) == 0): return False @@ -107,7 +111,7 @@ def followCapitalizationConvention(title: str) -> bool: # Dealing with the first word of the title if wordList[0] not in CAPITALIZATION_EXCEPTIONS: # word is not in keyNames but has different capitalization - if wordList[0].lower() in keyNamesLower: + if wordList[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # First letter of first word must be uppercase if (not wordList[0][0].isupper()): @@ -121,7 +125,7 @@ def followCapitalizationConvention(title: str) -> bool: for i in range(1, len(wordList)): if wordList[i] not in CAPITALIZATION_EXCEPTIONS: # word is not in keyNames but has different capitalization - if wordList[i].lower() in keyNamesLower: + if wordList[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # Remaining letters must not be uppercase for j in range(len(wordList[i])): @@ -132,7 +136,8 @@ def followCapitalizationConvention(title: str) -> bool: def findLineNumber(node: docutils.nodes) -> int: ''' - Method that finds the line number in a document for a particular node + Recursive method that finds the line number in a document for a particular node + in the doctree ''' if (node.tagname == 'document'): @@ -142,21 +147,11 @@ def findLineNumber(node: docutils.nodes) -> int: else: return node.line - 1 -def fillBadTitleDictionary(rstFile: str) -> None: +def parseRST(rstFile: str) -> docutils.nodes.document: ''' - Method that prints all of the bad titles - Message: [directory of rstFile, line number of bad title, error message] + Method to parse through an rstFile and return a document tree ''' - # Ensure file isn't one that causes the code to crash - if rstFile in cannotValidate: - return - # Initialize this file's badtitleDictionary slot - if rstFile in badTitleDictionary: - return - else: - badTitleDictionary[rstFile] = [] - # Parse through rstFile parser = docutils.parsers.rst.Parser() f = open(rstFile, "r") @@ -169,8 +164,14 @@ def fillBadTitleDictionary(rstFile: str) -> None: with suppress_stdout_stderr(): parser.parse(input, document) + return document + +def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[str], List[int], None]: + ''' + tonywu1999's algorithm to identify particular text nodes as headings + along with the text node's line number - # Fill up the titleList with lines that follow the title pattern + ''' myText = "" markerGrandparent = "" beforeMarker = False @@ -206,49 +207,97 @@ def fillBadTitleDictionary(rstFile: str) -> None: titleList.append(myText) lineNumberList.append(lineno) + return titleList, lineNumberList + +def fillBadTitleDictionary(rstFile: str) -> None: + ''' + Method that prints all of the bad titles + Message: [directory of rstFile, line number of bad title, error message] + + ''' + + # Ensure file isn't one that causes the code to crash + if rstFile in cannotValidate: + return + + # Ensure this file doesn't already have a badtitleDictionary slot + if rstFile in badTitleDictionary: + return + + # Parse rstFile with an RST parser + document = parseRST(rstFile) - # For each line in the titleList, append the badTitleDictionary if - # the capitalization convention is not followed + # Produce a list of headings along with their line numbers from the root document node + titleList, lineNumberList = findBadTitlesInDoctree(document) + + # Append the badTitleDictionary if the capitalization convention for a heading is not followed for i in range(len(titleList)): if not followCapitalizationConvention(titleList[i]): - badTitleDictionary[rstFile].append((titleList[i], lineNumberList[i])) - + if rstFile not in badTitleDictionary: + badTitleDictionary[rstFile] = [(titleList[i], lineNumberList[i])] + else: + badTitleDictionary[rstFile].append((titleList[i], lineNumberList[i])) -def findBadTitles(directoryAddress: str) -> None: +def createRSTDirectoryList(source_paths: List[str]) -> List[str]: ''' - Method finds all the bad titles, runs fillBadTitleDictionary + Given the command line arguments of directory paths, this method + creates a list of all of the .rst file directories that these paths contain ''' f = [] - if (directoryAddress.endswith(".rst")): - f.append(directoryAddress) - else: - for (dirpath, dirnames, filenames) in walk(directoryAddress): - for file in filenames: - if file.endswith(".rst"): - f.append(os.path.join(dirpath, file)) + for directoryAddress in source_paths: + if (directoryAddress.endswith(".rst")): + f.append(directoryAddress) + else: + for (dirpath, dirnames, filenames) in walk(directoryAddress): + for file in filenames: + if file.endswith(".rst"): + f.append(os.path.join(dirpath, file)) - for filename in f: - fillBadTitleDictionary(filename) + return f -# Main Method -if __name__ == "__main__": - for i in range(1, len(sys.argv)): - findBadTitles(sys.argv[i]) +def main(source_paths: List[str], output_format: str) -> bool: + ''' + The main method to execute all commands - print("BAD TITLES \n \n") + ''' + + # Create a list of all RST files from command line directory list + directoryList = createRSTDirectoryList(source_paths) + + # Fill the badTitleDictionary, which contains all incorrectly capitalized headings + for filename in directoryList: + fillBadTitleDictionary(filename) + + # Return an exit status of 0 if there are no bad titles in the dictionary + if (len(badTitleDictionary) == 0): + return False # Print badTitleDictionary Results - printed = False for key in badTitleDictionary: - if (len(badTitleDictionary[key]) != 0): - printed = True - print(key) - for titles in badTitleDictionary[key]: - print(titles) - print() - - # Exit code of 1 if there were bad titles - if (printed): - sys.exit(1) + print() + print(key) + for titles in badTitleDictionary[key]: + print(titles) + + # Exit status of 1 + return True + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = 'Validate capitalization for document headings') + + parser.add_argument( + "paths", nargs="+", default=".", help="Source paths of file/directory to check." + ) + + parser.add_argument( + "--format", + default="{source_path}:{line_number}:{heading}:{msg}", + help="Output format of incorrectly capitalized titles", + ) + + args = parser.parse_args() + + sys.exit(main(args.paths, args.format)) From 4ceea5e4002f0c7c942313e843d687ac0273ffff Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 16:42:02 -0500 Subject: [PATCH 20/44] Added comments --- scripts/validate_rst_title_capitalization.py | 117 ++++++++++++++++--- 1 file changed, 98 insertions(+), 19 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 1bb4a3369b8d0..e21116c91fd4a 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -9,8 +9,10 @@ NOTE: Run from the root directory of pandas repository -Example: +Examples: ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst +./scripts/validate_rst_title_capitalization.py doc/source/index.rst doc/source/ecosystem.rst +./scripts/validate_rst_title_capitalization.py doc/source/ Files that cannot be validated: (code crashes when validating for some reason) doc/source/user_guide/io.rst @@ -90,19 +92,19 @@ def __exit__(self, *_): cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] # Error Message: -errMessage = "Title capitalization formatted incorrectly. Manually format correctly" +errMessage = "Heading capitalization formatted incorrectly. Please correctly capitalize" def followCapitalizationConvention(title: str) -> bool: ''' - tonywu1999's algorithm to determine if a heading follows the capitalization convention + Algorithm to determine if a heading follows the capitalization convention This method returns true if the title follows the convention and false if it does not ''' - # split with delimiters comma, semicolon and space, parentheses, colon - wordList = re.split(r'[;,():\s]\s*', title) # followed by any amount of extra whitespace. + # split with delimiters comma, semicolon and space, parentheses, colon, slashes + wordList = re.split(r'[;,/():\s]\s*', title) # followed by any amount of extra whitespace. # Edge Case: First word is an empty string if (len(wordList[0]) == 0): @@ -110,7 +112,7 @@ def followCapitalizationConvention(title: str) -> bool: # Dealing with the first word of the title if wordList[0] not in CAPITALIZATION_EXCEPTIONS: - # word is not in keyNames but has different capitalization + # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization if wordList[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # First letter of first word must be uppercase @@ -124,7 +126,7 @@ def followCapitalizationConvention(title: str) -> bool: # Remaining letters must not be uppercase letters for i in range(1, len(wordList)): if wordList[i] not in CAPITALIZATION_EXCEPTIONS: - # word is not in keyNames but has different capitalization + # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization if wordList[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # Remaining letters must not be uppercase @@ -132,6 +134,7 @@ def followCapitalizationConvention(title: str) -> bool: if wordList[i][j].isupper(): return False + # Returning True if the heading follows the capitalization convention return True def findLineNumber(node: docutils.nodes) -> int: @@ -139,6 +142,10 @@ def findLineNumber(node: docutils.nodes) -> int: Recursive method that finds the line number in a document for a particular node in the doctree + Text nodes usually don't have any value for its "line" instance variable, + so instead, we recursively look through the parent nodes to eventually find the + correct line number, which I determined would be node.line - 1 + ''' if (node.tagname == 'document'): return 1 @@ -152,32 +159,92 @@ def parseRST(rstFile: str) -> docutils.nodes.document: Method to parse through an rstFile and return a document tree ''' - # Parse through rstFile + # Create rst Parser object parser = docutils.parsers.rst.Parser() + + # Open and read the .rst file and store the string of data into input f = open(rstFile, "r") input = f.read() + + # Set up default settings for the document tree settings = docutils.frontend.OptionParser( components=(docutils.parsers.rst.Parser,) ).get_default_values() + + # Initialize an empty document tree with the default settings from above document = docutils.utils.new_document('Document', settings) + # Parse the input string into an RST document tree, suppressing any stdout from the parse method with suppress_stdout_stderr(): parser.parse(input, document) + # Return the root node of the document tree return document def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[str], List[int], None]: ''' - tonywu1999's algorithm to identify particular text nodes as headings - along with the text node's line number + Algorithm to identify particular text nodes as headings + along with the text node's line number. + + The idea is that when we traverse through the text nodes, nodes whose + parents have a tagname of 'title' are definitely considered to be part + of headings. + + However, the problem occurs when we encounter text that has been either + italicized, bolded, referenced, etc. In these situations, the tagname of + the parent node could be one of the following: 'emphasis', 'strong', 'reference', 'literal', + stored in the 'listOfMarkers' set variable. In this situation, the node's + grandparent would have the 'title' tagname instead. + + Let's see an example that can cause a problem. The heading provided will be + 'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example, + the stars around pandas in the first string italicizes the word. + However, the doctree would be representing both doctrees as follows: + + 'Looking at *pandas* docs' 'Looking at pandas docs' + title title + / | \ | + #text emphasis #text VS #text + | | | | + 'Looking at' #text 'docs' 'Looking at pandas docs' + | + 'pandas' + + When iterating through the nodes, we first encounter the node: 'Looking at'. + However, this isn't the full line of the heading (Looking at pandas docs). + We're still missing 'pandas docs'. Hence, we must store this first word into + a variable (myText in my function) and append this string variable with more + words in case we encounter text that has a parent with tagname in listOfMarkers. + In this example, we have to go through two more nodes to get the full heading. + + Meanwhile, when nothing has a parent with tagname in listOfMarkers, we only need to + access one node to find the 'Looking at the pandas docs' text. + + My algorithm adjusts for this pattern, iterating through nodes and + identifying when headings are complete. ''' - myText = "" - markerGrandparent = "" - beforeMarker = False - titleList = [] - lineNumberList = [] + + # Initialize an empty string. myText will be used to construct headings and append into titleList + myText: str = "" + + # A docutils.nodes object that stores a listOfMarkers text's grandparent node, + # which should have a tagname of title + markerGrandparent: docutils.nodes.Title + + # True if the most recent node encountered had a parent with a listOfMarkers tagname + # and a grandparent with a tagname of title + beforeMarker: bool = False + + # titleList is the list of headings that is encountered in the doctree + titleList: List[str] = [] + + # A list of line numbers that the corresponding headings in titleList can be found at + lineNumberList: List[int] = [] + + # Traverse through the nodes.Text in the document tree to construct headings for node in document.traverse(nodes.Text): + # Case 1: Encounter a node with a parent tagname of title if (node.parent.tagname == 'title'): if (beforeMarker and markerGrandparent == node.parent): myText = myText + node.astext() @@ -189,12 +256,14 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ lineno = findLineNumber(node) myText = node.astext() beforeMarker = False + # Case 2: Encounter a node with parent tagname in listOfMarkers elif (node.parent.parent.tagname == 'title' and node.parent.tagname in listOfMarkers): lineno = findLineNumber(node) myText = myText + node.astext() beforeMarker = True markerGrandparent = node.parent.parent + # Case 3: Encounter a node with parent tagname from none of the above (Ex. 'paragraph' tagname) else: beforeMarker = False if (myText != ""): @@ -203,10 +272,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ myText = "" lineno = 0 + # Sometimes, there is leftover string that hasn't been appended yet due to how the for loop works if (myText != ""): titleList.append(myText) lineNumberList.append(lineno) + # Return a list of the headings and a list of their corresponding line numbers return titleList, lineNumberList def fillBadTitleDictionary(rstFile: str) -> None: @@ -245,9 +316,16 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]: creates a list of all of the .rst file directories that these paths contain ''' + # List of .rst file paths f = [] + + # Loop through source_paths. If address is a folder, recursively look through the folder for .rst files for directoryAddress in source_paths: - if (directoryAddress.endswith(".rst")): + if not os.path.exists(directoryAddress): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + elif (directoryAddress.endswith(".rst")): f.append(directoryAddress) else: for (dirpath, dirnames, filenames) in walk(directoryAddress): @@ -255,6 +333,7 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]: if file.endswith(".rst"): f.append(os.path.join(dirpath, file)) + # Return the filled up list of .rst file paths return f def main(source_paths: List[str], output_format: str) -> bool: @@ -275,11 +354,10 @@ def main(source_paths: List[str], output_format: str) -> bool: return False # Print badTitleDictionary Results + print() for key in badTitleDictionary: - print() - print(key) for titles in badTitleDictionary[key]: - print(titles) + print(key + ":" + str(titles[1]) + ": " + errMessage + " \"" + titles[0] + "\"") # Exit status of 1 return True @@ -294,6 +372,7 @@ def main(source_paths: List[str], output_format: str) -> bool: parser.add_argument( "--format", + "-f", default="{source_path}:{line_number}:{heading}:{msg}", help="Output format of incorrectly capitalized titles", ) From 11556b72c62350de20fa91de76e4c58568f84829 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 16:46:21 -0500 Subject: [PATCH 21/44] Validate consistency of title capitalization in documentation script added (#26941) --- scripts/validate_rst_title_capitalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index e21116c91fd4a..d2ec5fd5a5994 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -75,7 +75,7 @@ def __exit__(self, *_): 'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub', 'NumPy', - 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS' + 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS', 'sklearn-pandas' } # Lowercase representation of CAPITALIZATION_EXCEPTIONS From 9fc312aa52554da10f1617a93f4965494d5cb2e9 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 17:00:39 -0500 Subject: [PATCH 22/44] Adding script to validate consistency of title capitalization (#26941) --- doc/sphinxext/helloworld.py | 49 ------------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 doc/sphinxext/helloworld.py diff --git a/doc/sphinxext/helloworld.py b/doc/sphinxext/helloworld.py deleted file mode 100644 index ca7f71c1454e1..0000000000000 --- a/doc/sphinxext/helloworld.py +++ /dev/null @@ -1,49 +0,0 @@ -from docutils import nodes -from docutils.parsers.rst import Directive - - -from docutils import nodes -from docutils.parsers.rst import Directive -from sphinx.builders import Builder - -# Use spelling builder as an example!!!!! - - -class HeadingCapitalizationValidator(Builder): - - name = 'capitalizationValidator' - - def init(self): - self.docnames = [] - self.document_data = [] - print("BRO") - return - - def write_doc(self, docname, doctree): - z = 0 - if z == 0: - self.error("Warning: capitalization not followed") - - for node in doctree.traverse(nodes.Text): - if (node.tagname == 'title'): - print(node.astext()) - - def finish(self): - z = 1 - return - - -def setup(app): - app.add_builder(HeadingCapitalizationValidator) - return { - 'version': '0.1', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } - - - # return { - # 'version': '0.1', - # 'parallel_read_safe': True, - # 'parallel_write_safe': True, - # } From 635163d783e7c2200389697f967e01172ce4dda8 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 17:04:03 -0500 Subject: [PATCH 23/44] Adding validate_rst_title_capitalization.py (#26941) --- doc/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index be47b78dfb810..7f24d02a496e1 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -66,7 +66,6 @@ "sphinx.ext.linkcode", "nbsphinx", "contributors", # custom pandas extension - "helloworld", # Tony Wu custom pandas extension ] exclude_patterns = ["**.ipynb_checkpoints"] From c4ff8bd606a21ab82748541376d1eb965cebb2e5 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 22:13:11 -0500 Subject: [PATCH 24/44] Testing validate_rst_capitalization.py script (#26941) --- ci/code_checks.sh | 11 ++-- scripts/validate_rst_title_capitalization.py | 65 +++++++++++--------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a27847df10c10..29cdb7ae295fd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -318,6 +318,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" + ### VALIDATE TITLE CAPITALIZATION CONVENTION ### + MSG='Validate correct capitalization among titles in documentation' ; echo $MSG + $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development/contributing.rst + RET=$(($RET + $?)) ; echo $MSG "DONE" + fi ### DEPENDENCIES ### @@ -340,10 +345,4 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" fi -### VALIDATE TITLE CAPITALIZATION CONVENTION (Do I need an IF statement?)### -MSG='Validate correct capitalization among titles in documentation' ; echo $MSG -$BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development/contributing.rst -$BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/index.rst $BASE_DIR/doc/source/ecosystem.rst -RET=$(($RET + $?)) ; echo $MSG "DONE" - exit $RET diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index d2ec5fd5a5994..9636555618220 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -11,7 +11,6 @@ Examples: ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst -./scripts/validate_rst_title_capitalization.py doc/source/index.rst doc/source/ecosystem.rst ./scripts/validate_rst_title_capitalization.py doc/source/ Files that cannot be validated: (code crashes when validating for some reason) @@ -33,7 +32,8 @@ from os import walk from typing import Generator, List, Tuple -class suppress_stdout_stderr(object): + +class suppress_stdout_stderr: ''' Code source: https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions @@ -52,19 +52,19 @@ class suppress_stdout_stderr(object): ''' def __init__(self): # Open a pair of null files - self.null_fds = [os.open(os.devnull,os.O_RDWR) for x in range(2)] + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] # Save the actual stdout (1) and stderr (2) file descriptors. self.save_fds = [os.dup(1), os.dup(2)] def __enter__(self): # Assign the null pointers to stdout and stderr. - os.dup2(self.null_fds[0],1) - os.dup2(self.null_fds[1],2) + os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) def __exit__(self, *_): # Re-assign the real stdout/stderr back to (1) and (2) - os.dup2(self.save_fds[0],1) - os.dup2(self.save_fds[1],2) + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) # Close all file descriptors for fd in self.null_fds + self.save_fds: os.close(fd) @@ -72,7 +72,7 @@ def __exit__(self, *_): # Keynames that would not follow capitalization convention CAPITALIZATION_EXCEPTIONS = { - 'pandas', 'Python', 'IPython','PyTables', 'Excel', 'JSON', + 'pandas', 'Python', 'IPython', 'PyTables', 'Excel', 'JSON', 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub', 'NumPy', 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS', 'sklearn-pandas' @@ -92,7 +92,8 @@ def __exit__(self, *_): cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] # Error Message: -errMessage = "Heading capitalization formatted incorrectly. Please correctly capitalize" +errMessage = 'Heading capitalization formatted incorrectly. Please correctly capitalize' + def followCapitalizationConvention(title: str) -> bool: ''' @@ -104,7 +105,7 @@ def followCapitalizationConvention(title: str) -> bool: ''' # split with delimiters comma, semicolon and space, parentheses, colon, slashes - wordList = re.split(r'[;,/():\s]\s*', title) # followed by any amount of extra whitespace. + wordList = re.split(r'[;,/():\s]\s*', title) # Edge Case: First word is an empty string if (len(wordList[0]) == 0): @@ -137,6 +138,7 @@ def followCapitalizationConvention(title: str) -> bool: # Returning True if the heading follows the capitalization convention return True + def findLineNumber(node: docutils.nodes) -> int: ''' Recursive method that finds the line number in a document for a particular node @@ -149,11 +151,12 @@ def findLineNumber(node: docutils.nodes) -> int: ''' if (node.tagname == 'document'): return 1 - elif (node.line == None): + elif (node.line is None): return findLineNumber(node.parent) else: return node.line - 1 + def parseRST(rstFile: str) -> docutils.nodes.document: ''' Method to parse through an rstFile and return a document tree @@ -169,19 +172,21 @@ def parseRST(rstFile: str) -> docutils.nodes.document: # Set up default settings for the document tree settings = docutils.frontend.OptionParser( components=(docutils.parsers.rst.Parser,) - ).get_default_values() + ).get_default_values() # Initialize an empty document tree with the default settings from above document = docutils.utils.new_document('Document', settings) - # Parse the input string into an RST document tree, suppressing any stdout from the parse method + # Parse input into an RST doctree, suppressing any stdout from parse method with suppress_stdout_stderr(): parser.parse(input, document) # Return the root node of the document tree return document -def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[str], List[int], None]: + +def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[ + List[str], List[int], None]: ''' Algorithm to identify particular text nodes as headings along with the text node's line number. @@ -192,9 +197,9 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ However, the problem occurs when we encounter text that has been either italicized, bolded, referenced, etc. In these situations, the tagname of - the parent node could be one of the following: 'emphasis', 'strong', 'reference', 'literal', - stored in the 'listOfMarkers' set variable. In this situation, the node's - grandparent would have the 'title' tagname instead. + the parent node could be one of the following: 'emphasis', 'strong', + 'reference', and 'literal', stored in the 'listOfMarkers' set variable. In + this situation, the node's grandparent would have the 'title' tagname instead. Let's see an example that can cause a problem. The heading provided will be 'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example, @@ -203,7 +208,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ 'Looking at *pandas* docs' 'Looking at pandas docs' title title - / | \ | + / | | | #text emphasis #text VS #text | | | | 'Looking at' #text 'docs' 'Looking at pandas docs' @@ -225,7 +230,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ ''' - # Initialize an empty string. myText will be used to construct headings and append into titleList + # myText will be used to construct headings and append into titleList myText: str = "" # A docutils.nodes object that stores a listOfMarkers text's grandparent node, @@ -239,7 +244,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ # titleList is the list of headings that is encountered in the doctree titleList: List[str] = [] - # A list of line numbers that the corresponding headings in titleList can be found at + # List of line numbers that corresponding headings in titleList can be found at lineNumberList: List[int] = [] # Traverse through the nodes.Text in the document tree to construct headings @@ -258,12 +263,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ beforeMarker = False # Case 2: Encounter a node with parent tagname in listOfMarkers elif (node.parent.parent.tagname == 'title' and - node.parent.tagname in listOfMarkers): + node.parent.tagname in listOfMarkers): lineno = findLineNumber(node) myText = myText + node.astext() beforeMarker = True markerGrandparent = node.parent.parent - # Case 3: Encounter a node with parent tagname from none of the above (Ex. 'paragraph' tagname) + # Case 3: Encounter parent tagname of none of the above (Ex. 'paragraph') else: beforeMarker = False if (myText != ""): @@ -272,7 +277,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ myText = "" lineno = 0 - # Sometimes, there is leftover string that hasn't been appended yet due to how the for loop works + # Leftover string that hasn't been appended yet due to how the for loop works if (myText != ""): titleList.append(myText) lineNumberList.append(lineno) @@ -280,6 +285,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[List[ # Return a list of the headings and a list of their corresponding line numbers return titleList, lineNumberList + def fillBadTitleDictionary(rstFile: str) -> None: ''' Method that prints all of the bad titles @@ -298,10 +304,10 @@ def fillBadTitleDictionary(rstFile: str) -> None: # Parse rstFile with an RST parser document = parseRST(rstFile) - # Produce a list of headings along with their line numbers from the root document node + # Make a list of headings along with their line numbers from document tree titleList, lineNumberList = findBadTitlesInDoctree(document) - # Append the badTitleDictionary if the capitalization convention for a heading is not followed + # Append the badTitleDictionary if the capitalization convention not followed for i in range(len(titleList)): if not followCapitalizationConvention(titleList[i]): if rstFile not in badTitleDictionary: @@ -319,7 +325,7 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]: # List of .rst file paths f = [] - # Loop through source_paths. If address is a folder, recursively look through the folder for .rst files + # Loop through source_paths, recursively looking for .rst files for directoryAddress in source_paths: if not os.path.exists(directoryAddress): raise ValueError( @@ -336,6 +342,7 @@ def createRSTDirectoryList(source_paths: List[str]) -> List[str]: # Return the filled up list of .rst file paths return f + def main(source_paths: List[str], output_format: str) -> bool: ''' The main method to execute all commands @@ -357,14 +364,16 @@ def main(source_paths: List[str], output_format: str) -> bool: print() for key in badTitleDictionary: for titles in badTitleDictionary[key]: - print(key + ":" + str(titles[1]) + ": " + errMessage + " \"" + titles[0] + "\"") + print(key + ":" + str(titles[1]) + ": " + errMessage + + " \"" + titles[0] + "\"" + ) # Exit status of 1 return True if __name__ == "__main__": - parser = argparse.ArgumentParser(description = 'Validate capitalization for document headings') + parser = argparse.ArgumentParser(description='Validate heading capitalization') parser.add_argument( "paths", nargs="+", default=".", help="Source paths of file/directory to check." From 83f778c7d1908de0e7f13a4bc150842b53c7b2d0 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 17 Jan 2020 22:56:17 -0500 Subject: [PATCH 25/44] Edited validate script (#26941) --- scripts/validate_rst_title_capitalization.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 9636555618220..1ea36defa589b 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -30,7 +30,7 @@ import re import os from os import walk -from typing import Generator, List, Tuple +from typing import Generator, List class suppress_stdout_stderr: @@ -163,7 +163,7 @@ def parseRST(rstFile: str) -> docutils.nodes.document: ''' # Create rst Parser object - parser = docutils.parsers.rst.Parser() + parser = Parser() # Open and read the .rst file and store the string of data into input f = open(rstFile, "r") @@ -233,9 +233,12 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[ # myText will be used to construct headings and append into titleList myText: str = "" + # lineno will be used to retrieve line numbers of certain headings + lineno: int = 0 + # A docutils.nodes object that stores a listOfMarkers text's grandparent node, # which should have a tagname of title - markerGrandparent: docutils.nodes.Title + markerGrandparent: docutils.nodes.Title = None # True if the most recent node encountered had a parent with a listOfMarkers tagname # and a grandparent with a tagname of title @@ -363,9 +366,9 @@ def main(source_paths: List[str], output_format: str) -> bool: # Print badTitleDictionary Results print() for key in badTitleDictionary: - for titles in badTitleDictionary[key]: - print(key + ":" + str(titles[1]) + ": " + errMessage - + " \"" + titles[0] + "\"" + for line in badTitleDictionary[key]: + print( + key + ":" + str(line[1]) + ": " + errMessage + " \"" + line[0] + "\"" ) # Exit status of 1 From 1907d45ef7ccd2be48c0ee2ee67955ccf6e926df Mon Sep 17 00:00:00 2001 From: awu42 Date: Sat, 18 Jan 2020 16:44:59 -0500 Subject: [PATCH 26/44] Added parameter and return value information in docstrings --- ci/code_checks.sh | 1 - scripts/validate_rst_title_capitalization.py | 303 +++++++++++-------- 2 files changed, 185 insertions(+), 119 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 29cdb7ae295fd..4bf47f2901ca6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -318,7 +318,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" - ### VALIDATE TITLE CAPITALIZATION CONVENTION ### MSG='Validate correct capitalization among titles in documentation' ; echo $MSG $BASE_DIR/scripts/validate_rst_title_capitalization.py $BASE_DIR/doc/source/development/contributing.rst RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 1ea36defa589b..0234f71680b12 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,15 +1,14 @@ #!/usr/bin/env python """ -Author: tonywu1999, Date Edited: 01/17/2020 +GH #29641 -Python script for collecting the titles in the rst files and validating -if they follow the capitalization convention. Prints the titles that do not -follow the convention. Particularly used for .rst files in the doc/source folder +Collect the titles in the rst files and validate if they follow the proper +capitalization convention. -NOTE: Run from the root directory of pandas repository +Prints the titles that do not follow the convention. -Examples: +Usage:: ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst ./scripts/validate_rst_title_capitalization.py doc/source/ @@ -51,21 +50,25 @@ class suppress_stdout_stderr: ''' def __init__(self): - # Open a pair of null files self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] - # Save the actual stdout (1) and stderr (2) file descriptors. self.save_fds = [os.dup(1), os.dup(2)] def __enter__(self): - # Assign the null pointers to stdout and stderr. + ''' + Assign the null pointers to stdout and stderr. + + ''' os.dup2(self.null_fds[0], 1) os.dup2(self.null_fds[1], 2) def __exit__(self, *_): - # Re-assign the real stdout/stderr back to (1) and (2) + ''' + Re-assign the real stdout/stderr back to (1) and (2) and close all + file descriptors + + ''' os.dup2(self.save_fds[0], 1) os.dup2(self.save_fds[1], 2) - # Close all file descriptors for fd in self.null_fds + self.save_fds: os.close(fd) @@ -75,7 +78,7 @@ def __exit__(self, *_): 'pandas', 'Python', 'IPython', 'PyTables', 'Excel', 'JSON', 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub', 'NumPy', - 'Apache', 'Arrow', 'Parquet', 'Triage', 'MultiIndex', 'NumFOCUS', 'sklearn-pandas' + 'Apache', 'Arrow', 'Parquet', 'MultiIndex', 'NumFOCUS', 'sklearn-pandas' } # Lowercase representation of CAPITALIZATION_EXCEPTIONS @@ -83,63 +86,73 @@ def __exit__(self, *_): # Dictionary of bad titles that will be printed later along with line numbers # Key: Document Directory, Value: Pair(Bad Title, Line Number) -badTitleDictionary = {} +bad_title_dict = {} # List of problematic tags that are exceptions to parent rule -listOfMarkers = {'emphasis', 'strong', 'reference', 'literal'} +list_of_markers = {'emphasis', 'strong', 'reference', 'literal'} # List of files that, when validated, causes the program to crash -cannotValidate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] +cannot_validate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] # Error Message: -errMessage = 'Heading capitalization formatted incorrectly. Please correctly capitalize' +err_msg = 'Heading capitalization formatted incorrectly. Please correctly capitalize' -def followCapitalizationConvention(title: str) -> bool: +def follow_capitalization_convention(title: str) -> bool: ''' Algorithm to determine if a heading follows the capitalization convention This method returns true if the title follows the convention and false if it does not + Parameters + ---------- + title : str + Heading string to validate + + Returns + ------- + bool + True if capitalization is correct, False if not + ''' # split with delimiters comma, semicolon and space, parentheses, colon, slashes - wordList = re.split(r'[;,/():\s]\s*', title) + word_list = re.split(r'[;,/():\s]\s*', title) # Edge Case: First word is an empty string - if (len(wordList[0]) == 0): + if (len(word_list[0]) == 0): return False # Dealing with the first word of the title - if wordList[0] not in CAPITALIZATION_EXCEPTIONS: + if word_list[0] not in CAPITALIZATION_EXCEPTIONS: # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization - if wordList[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: + if word_list[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # First letter of first word must be uppercase - if (not wordList[0][0].isupper()): + if (not word_list[0][0].isupper()): return False # Remaining letters of first word must not be uppercase - for j in range(1, len(wordList[0])): - if wordList[0][j].isupper(): + for j in range(1, len(word_list[0])): + if word_list[0][j].isupper(): return False # Remaining letters must not be uppercase letters - for i in range(1, len(wordList)): - if wordList[i] not in CAPITALIZATION_EXCEPTIONS: + for i in range(1, len(word_list)): + if word_list[i] not in CAPITALIZATION_EXCEPTIONS: # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization - if wordList[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: + if word_list[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # Remaining letters must not be uppercase - for j in range(len(wordList[i])): - if wordList[i][j].isupper(): + for j in range(len(word_list[i])): + if word_list[i][j].isupper(): return False # Returning True if the heading follows the capitalization convention return True -def findLineNumber(node: docutils.nodes) -> int: +def find_line_number(node: docutils.nodes) -> int: ''' Recursive method that finds the line number in a document for a particular node in the doctree @@ -148,25 +161,45 @@ def findLineNumber(node: docutils.nodes) -> int: so instead, we recursively look through the parent nodes to eventually find the correct line number, which I determined would be node.line - 1 + Parameters + ---------- + node : docutils.node + Name of the object of the docstring to validate. + + Returns + ------- + int + The line number of the node + ''' if (node.tagname == 'document'): return 1 elif (node.line is None): - return findLineNumber(node.parent) + return find_line_number(node.parent) else: return node.line - 1 -def parseRST(rstFile: str) -> docutils.nodes.document: +def parse_RST(rst_file: str) -> docutils.nodes.document: ''' - Method to parse through an rstFile and return a document tree + Method to parse through an rst_file and return a document tree + + Parameters + ---------- + rst_file : str + Directory address of a .rst file as a string + + Returns + ------- + document : docutils.nodes.document + Root node of the .rst file's document tree ''' - # Create rst Parser object + # Initialize rst Parser object parser = Parser() # Open and read the .rst file and store the string of data into input - f = open(rstFile, "r") + f = open(rst_file, "r") input = f.read() # Set up default settings for the document tree @@ -185,7 +218,7 @@ def parseRST(rstFile: str) -> docutils.nodes.document: return document -def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[ +def find_titles_in_doctree(document: docutils.nodes.document) -> Generator[ List[str], List[int], None]: ''' Algorithm to identify particular text nodes as headings @@ -198,7 +231,7 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[ However, the problem occurs when we encounter text that has been either italicized, bolded, referenced, etc. In these situations, the tagname of the parent node could be one of the following: 'emphasis', 'strong', - 'reference', and 'literal', stored in the 'listOfMarkers' set variable. In + 'reference', and 'literal', stored in the 'list_of_markers' set variable. In this situation, the node's grandparent would have the 'title' tagname instead. Let's see an example that can cause a problem. The heading provided will be @@ -218,157 +251,191 @@ def findBadTitlesInDoctree(document: docutils.nodes.document) -> Generator[ When iterating through the nodes, we first encounter the node: 'Looking at'. However, this isn't the full line of the heading (Looking at pandas docs). We're still missing 'pandas docs'. Hence, we must store this first word into - a variable (myText in my function) and append this string variable with more - words in case we encounter text that has a parent with tagname in listOfMarkers. + a variable (my_text in my function) and append this string variable with more + words in case we encounter text that has a parent with tagname in list_of_markers. In this example, we have to go through two more nodes to get the full heading. - Meanwhile, when nothing has a parent with tagname in listOfMarkers, we only need to - access one node to find the 'Looking at the pandas docs' text. + Meanwhile, when nothing has a parent with tagname in list_of_markers, we only + need to access one node to find the 'Looking at the pandas docs' text. My algorithm adjusts for this pattern, iterating through nodes and identifying when headings are complete. + Parameters + ---------- + document : docutils.nodes.document + Root node of a .rst file's document tree + + Returns + ------- + title_list : List[str] + A list of heading strings found in the document tree + + line_number_list : List[int] + The corresponding line numbers of the headings in title_list + ''' - # myText will be used to construct headings and append into titleList - myText: str = "" + # my_text will be used to construct headings and append into title_list + my_text: str = "" - # lineno will be used to retrieve line numbers of certain headings - lineno: int = 0 + # line_no will be used to retrieve line numbers of certain headings + line_no: int = 0 - # A docutils.nodes object that stores a listOfMarkers text's grandparent node, + # A docutils.nodes object that stores a list_of_markers text's grandparent node, # which should have a tagname of title - markerGrandparent: docutils.nodes.Title = None + marker_grandparent: docutils.nodes.Title = None - # True if the most recent node encountered had a parent with a listOfMarkers tagname - # and a grandparent with a tagname of title - beforeMarker: bool = False + # True if the most recent node encountered had a parent with a list_of_markers + # tagname and a grandparent with a tagname of title + before_marker: bool = False - # titleList is the list of headings that is encountered in the doctree - titleList: List[str] = [] + # title_list is the list of headings that is encountered in the doctree + title_list: List[str] = [] - # List of line numbers that corresponding headings in titleList can be found at - lineNumberList: List[int] = [] + # List of line numbers that corresponding headings in title_list can be found at + line_number_list: List[int] = [] # Traverse through the nodes.Text in the document tree to construct headings for node in document.traverse(nodes.Text): # Case 1: Encounter a node with a parent tagname of title if (node.parent.tagname == 'title'): - if (beforeMarker and markerGrandparent == node.parent): - myText = myText + node.astext() - beforeMarker = False + if (before_marker and marker_grandparent == node.parent): + my_text = my_text + node.astext() + before_marker = False else: - if (myText != ""): - titleList.append(myText) - lineNumberList.append(lineno) - lineno = findLineNumber(node) - myText = node.astext() - beforeMarker = False - # Case 2: Encounter a node with parent tagname in listOfMarkers + if (my_text != ""): + title_list.append(my_text) + line_number_list.append(line_no) + line_no = find_line_number(node) + my_text = node.astext() + before_marker = False + # Case 2: Encounter a node with parent tagname in list_of_markers elif (node.parent.parent.tagname == 'title' and - node.parent.tagname in listOfMarkers): - lineno = findLineNumber(node) - myText = myText + node.astext() - beforeMarker = True - markerGrandparent = node.parent.parent + node.parent.tagname in list_of_markers): + line_no = find_line_number(node) + my_text = my_text + node.astext() + before_marker = True + marker_grandparent = node.parent.parent # Case 3: Encounter parent tagname of none of the above (Ex. 'paragraph') else: - beforeMarker = False - if (myText != ""): - titleList.append(myText) - lineNumberList.append(lineno) - myText = "" - lineno = 0 + before_marker = False + if (my_text != ""): + title_list.append(my_text) + line_number_list.append(line_no) + my_text = "" + line_no = 0 # Leftover string that hasn't been appended yet due to how the for loop works - if (myText != ""): - titleList.append(myText) - lineNumberList.append(lineno) + if (my_text != ""): + title_list.append(my_text) + line_number_list.append(line_no) # Return a list of the headings and a list of their corresponding line numbers - return titleList, lineNumberList + return title_list, line_number_list -def fillBadTitleDictionary(rstFile: str) -> None: +def fill_bad_title_dict(rst_file: str) -> None: ''' - Method that prints all of the bad titles - Message: [directory of rstFile, line number of bad title, error message] + Method that fills up the bad_title_dict with incorrectly capitalized headings + + Parameters + ---------- + rst_file : str + Directory address of a .rst file as a string ''' # Ensure file isn't one that causes the code to crash - if rstFile in cannotValidate: + if rst_file in cannot_validate: return - # Ensure this file doesn't already have a badtitleDictionary slot - if rstFile in badTitleDictionary: + # Ensure this file doesn't already have a bad_title_dict slot + if rst_file in bad_title_dict: return - # Parse rstFile with an RST parser - document = parseRST(rstFile) + # Parse rst_file with an RST parser + document = parse_RST(rst_file) # Make a list of headings along with their line numbers from document tree - titleList, lineNumberList = findBadTitlesInDoctree(document) + title_list, line_number_list = find_titles_in_doctree(document) - # Append the badTitleDictionary if the capitalization convention not followed - for i in range(len(titleList)): - if not followCapitalizationConvention(titleList[i]): - if rstFile not in badTitleDictionary: - badTitleDictionary[rstFile] = [(titleList[i], lineNumberList[i])] + # Append the bad_title_dict if the capitalization convention not followed + for i in range(len(title_list)): + if not follow_capitalization_convention(title_list[i]): + if rst_file not in bad_title_dict: + bad_title_dict[rst_file] = [(title_list[i], line_number_list[i])] else: - badTitleDictionary[rstFile].append((titleList[i], lineNumberList[i])) + bad_title_dict[rst_file].append((title_list[i], line_number_list[i])) -def createRSTDirectoryList(source_paths: List[str]) -> List[str]: +def find_rst_files(source_paths: List[str]) -> List[str]: ''' Given the command line arguments of directory paths, this method - creates a list of all of the .rst file directories that these paths contain + yields the strings of the .rst file directories that these paths contain + + Parameters + ---------- + source_paths : str + List of directories to validate, provided through command line arguments + + Yields + ------- + directory_address : str + Directory address of a .rst files found in command line argument directories ''' - # List of .rst file paths - f = [] # Loop through source_paths, recursively looking for .rst files - for directoryAddress in source_paths: - if not os.path.exists(directoryAddress): + for directory_address in source_paths: + if not os.path.exists(directory_address): raise ValueError( "Please enter a valid path, pointing to a valid file/directory." ) - elif (directoryAddress.endswith(".rst")): - f.append(directoryAddress) + elif (directory_address.endswith(".rst")): + yield directory_address else: - for (dirpath, dirnames, filenames) in walk(directoryAddress): + for (dirpath, dirnames, filenames) in walk(directory_address): for file in filenames: if file.endswith(".rst"): - f.append(os.path.join(dirpath, file)) - - # Return the filled up list of .rst file paths - return f + yield os.path.join(dirpath, file) def main(source_paths: List[str], output_format: str) -> bool: ''' - The main method to execute all commands + The main method to print all headings with incorrect capitalization + + Parameters + ---------- + source_paths : str + List of directories to validate, provided through command line arguments + output_format : str + Output format of the script. + + Returns + ------- + bool + True if there are headings that are printed, False if not ''' - # Create a list of all RST files from command line directory list - directoryList = createRSTDirectoryList(source_paths) + # Make a list of all RST files from command line directory list + directory_list = find_rst_files(source_paths) - # Fill the badTitleDictionary, which contains all incorrectly capitalized headings - for filename in directoryList: - fillBadTitleDictionary(filename) + # Fill the bad_title_dict, which contains all incorrectly capitalized headings + for filename in directory_list: + fill_bad_title_dict(filename) # Return an exit status of 0 if there are no bad titles in the dictionary - if (len(badTitleDictionary) == 0): + if (len(bad_title_dict) == 0): return False - # Print badTitleDictionary Results + # Print bad_title_dict Results print() - for key in badTitleDictionary: - for line in badTitleDictionary[key]: + for key in bad_title_dict: + for line in bad_title_dict[key]: print( - key + ":" + str(line[1]) + ": " + errMessage + " \"" + line[0] + "\"" + key + ":" + str(line[1]) + ": " + err_msg + " \"" + line[0] + "\"" ) # Exit status of 1 From de06ec83b4726d4e4793adbbc2ac94ecd5bc54db Mon Sep 17 00:00:00 2001 From: awu42 Date: Sat, 18 Jan 2020 18:18:31 -0500 Subject: [PATCH 27/44] Edited validate_rst_title_capitalization.py for review (#29641) --- scripts/validate_rst_title_capitalization.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 0234f71680b12..ab847b0f49b0d 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -35,7 +35,7 @@ class suppress_stdout_stderr: ''' Code source: - https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions + https://stackoverflow.com/questions/11130156/ A context manager for doing a "deep suppression" of stdout and stderr in Python, i.e. will suppress all print, even if the print originates in a @@ -50,7 +50,7 @@ class suppress_stdout_stderr: ''' def __init__(self): - self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + self.null_fds = [os.open(os.devnull, os.O_WRONLY) for x in range(2)] self.save_fds = [os.dup(1), os.dup(2)] def __enter__(self): @@ -210,9 +210,8 @@ def parse_RST(rst_file: str) -> docutils.nodes.document: # Initialize an empty document tree with the default settings from above document = docutils.utils.new_document('Document', settings) - # Parse input into an RST doctree, suppressing any stdout from parse method - with suppress_stdout_stderr(): - parser.parse(input, document) + # Parse input into an RST doctree, suppressing any stderr from parse method + parser.parse(input, document) # Return the root node of the document tree return document @@ -423,8 +422,9 @@ def main(source_paths: List[str], output_format: str) -> bool: directory_list = find_rst_files(source_paths) # Fill the bad_title_dict, which contains all incorrectly capitalized headings - for filename in directory_list: - fill_bad_title_dict(filename) + with suppress_stdout_stderr(): + for filename in directory_list: + fill_bad_title_dict(filename) # Return an exit status of 0 if there are no bad titles in the dictionary if (len(bad_title_dict) == 0): From 7ea58df456cd3acabd65a689175d8a8a0f7cd831 Mon Sep 17 00:00:00 2001 From: awu42 Date: Sat, 18 Jan 2020 23:06:16 -0500 Subject: [PATCH 28/44] Checking if stderr output will be suppressed (#26941) --- scripts/validate_rst_title_capitalization.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index ab847b0f49b0d..4013fcc5379b1 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -413,11 +413,13 @@ def main(source_paths: List[str], output_format: str) -> bool: Returns ------- - bool + is_failed : bool True if there are headings that are printed, False if not ''' + is_failed : bool = False + # Make a list of all RST files from command line directory list directory_list = find_rst_files(source_paths) @@ -428,7 +430,7 @@ def main(source_paths: List[str], output_format: str) -> bool: # Return an exit status of 0 if there are no bad titles in the dictionary if (len(bad_title_dict) == 0): - return False + return is_failed # Print bad_title_dict Results print() @@ -438,8 +440,8 @@ def main(source_paths: List[str], output_format: str) -> bool: key + ":" + str(line[1]) + ": " + err_msg + " \"" + line[0] + "\"" ) - # Exit status of 1 - return True + # Exit status of 0 + return is_failed if __name__ == "__main__": From 60d8db943ffcc56359e42213f250f37732b59538 Mon Sep 17 00:00:00 2001 From: awu42 Date: Sun, 19 Jan 2020 12:27:15 -0500 Subject: [PATCH 29/44] Simplified validate_rst_title_capitalization.py to print correctly (#26941) --- scripts/validate_rst_title_capitalization.py | 266 +++---------------- 1 file changed, 41 insertions(+), 225 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 4013fcc5379b1..ec2182d7897e0 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -12,67 +12,16 @@ ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst ./scripts/validate_rst_title_capitalization.py doc/source/ -Files that cannot be validated: (code crashes when validating for some reason) -doc/source/user_guide/io.rst -doc/source/whatsnew/v0.17.1.rst - -Reference: doctree elements -http://epydoc.sourceforge.net/docutils/public/docutils.nodes.Element-class.html - """ import argparse import sys -from docutils.parsers.rst import Parser -import docutils -from docutils import nodes import re import os from os import walk from typing import Generator, List -class suppress_stdout_stderr: - ''' - Code source: - https://stackoverflow.com/questions/11130156/ - - A context manager for doing a "deep suppression" of stdout and stderr in - Python, i.e. will suppress all print, even if the print originates in a - compiled C/Fortran sub-function. - This will not suppress raised exceptions, since exceptions are printed - to stderr just before a script exits, and after the context manager has - exited (at least, I think that is why it lets exceptions through). - - This code is needed to suppress output from the parser method - because the parser method prints to stdout when encountering Sphinx - references, as it cannot parse those at this moment. - - ''' - def __init__(self): - self.null_fds = [os.open(os.devnull, os.O_WRONLY) for x in range(2)] - self.save_fds = [os.dup(1), os.dup(2)] - - def __enter__(self): - ''' - Assign the null pointers to stdout and stderr. - - ''' - os.dup2(self.null_fds[0], 1) - os.dup2(self.null_fds[1], 2) - - def __exit__(self, *_): - ''' - Re-assign the real stdout/stderr back to (1) and (2) and close all - file descriptors - - ''' - os.dup2(self.save_fds[0], 1) - os.dup2(self.save_fds[1], 2) - for fd in self.null_fds + self.save_fds: - os.close(fd) - - # Keynames that would not follow capitalization convention CAPITALIZATION_EXCEPTIONS = { 'pandas', 'Python', 'IPython', 'PyTables', 'Excel', 'JSON', @@ -88,12 +37,6 @@ def __exit__(self, *_): # Key: Document Directory, Value: Pair(Bad Title, Line Number) bad_title_dict = {} -# List of problematic tags that are exceptions to parent rule -list_of_markers = {'emphasis', 'strong', 'reference', 'literal'} - -# List of files that, when validated, causes the program to crash -cannot_validate = ['doc/source/user_guide/io.rst', 'doc/source/whatsnew/v0.17.1.rst'] - # Error Message: err_msg = 'Heading capitalization formatted incorrectly. Please correctly capitalize' @@ -152,118 +95,18 @@ def follow_capitalization_convention(title: str) -> bool: return True -def find_line_number(node: docutils.nodes) -> int: +def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: ''' - Recursive method that finds the line number in a document for a particular node - in the doctree + Algorithm to identify particular text that should be considered headings in an + RST file - Text nodes usually don't have any value for its "line" instance variable, - so instead, we recursively look through the parent nodes to eventually find the - correct line number, which I determined would be node.line - 1 - - Parameters - ---------- - node : docutils.node - Name of the object of the docstring to validate. - - Returns - ------- - int - The line number of the node - - ''' - if (node.tagname == 'document'): - return 1 - elif (node.line is None): - return find_line_number(node.parent) - else: - return node.line - 1 - - -def parse_RST(rst_file: str) -> docutils.nodes.document: - ''' - Method to parse through an rst_file and return a document tree + See for details + on what constitutes a string as a heading in RST Parameters ---------- rst_file : str - Directory address of a .rst file as a string - - Returns - ------- - document : docutils.nodes.document - Root node of the .rst file's document tree - - ''' - # Initialize rst Parser object - parser = Parser() - - # Open and read the .rst file and store the string of data into input - f = open(rst_file, "r") - input = f.read() - - # Set up default settings for the document tree - settings = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,) - ).get_default_values() - - # Initialize an empty document tree with the default settings from above - document = docutils.utils.new_document('Document', settings) - - # Parse input into an RST doctree, suppressing any stderr from parse method - parser.parse(input, document) - - # Return the root node of the document tree - return document - - -def find_titles_in_doctree(document: docutils.nodes.document) -> Generator[ - List[str], List[int], None]: - ''' - Algorithm to identify particular text nodes as headings - along with the text node's line number. - - The idea is that when we traverse through the text nodes, nodes whose - parents have a tagname of 'title' are definitely considered to be part - of headings. - - However, the problem occurs when we encounter text that has been either - italicized, bolded, referenced, etc. In these situations, the tagname of - the parent node could be one of the following: 'emphasis', 'strong', - 'reference', and 'literal', stored in the 'list_of_markers' set variable. In - this situation, the node's grandparent would have the 'title' tagname instead. - - Let's see an example that can cause a problem. The heading provided will be - 'Looking at *pandas* docs' versus 'Looking at pandas docs'. In this example, - the stars around pandas in the first string italicizes the word. - However, the doctree would be representing both doctrees as follows: - - 'Looking at *pandas* docs' 'Looking at pandas docs' - title title - / | | | - #text emphasis #text VS #text - | | | | - 'Looking at' #text 'docs' 'Looking at pandas docs' - | - 'pandas' - - When iterating through the nodes, we first encounter the node: 'Looking at'. - However, this isn't the full line of the heading (Looking at pandas docs). - We're still missing 'pandas docs'. Hence, we must store this first word into - a variable (my_text in my function) and append this string variable with more - words in case we encounter text that has a parent with tagname in list_of_markers. - In this example, we have to go through two more nodes to get the full heading. - - Meanwhile, when nothing has a parent with tagname in list_of_markers, we only - need to access one node to find the 'Looking at the pandas docs' text. - - My algorithm adjusts for this pattern, iterating through nodes and - identifying when headings are complete. - - Parameters - ---------- - document : docutils.nodes.document - Root node of a .rst file's document tree + RST file to scan through for headings Returns ------- @@ -275,62 +118,42 @@ def find_titles_in_doctree(document: docutils.nodes.document) -> Generator[ ''' - # my_text will be used to construct headings and append into title_list - my_text: str = "" - - # line_no will be used to retrieve line numbers of certain headings - line_no: int = 0 - - # A docutils.nodes object that stores a list_of_markers text's grandparent node, - # which should have a tagname of title - marker_grandparent: docutils.nodes.Title = None - - # True if the most recent node encountered had a parent with a list_of_markers - # tagname and a grandparent with a tagname of title - before_marker: bool = False - # title_list is the list of headings that is encountered in the doctree title_list: List[str] = [] # List of line numbers that corresponding headings in title_list can be found at line_number_list: List[int] = [] - # Traverse through the nodes.Text in the document tree to construct headings - for node in document.traverse(nodes.Text): - # Case 1: Encounter a node with a parent tagname of title - if (node.parent.tagname == 'title'): - if (before_marker and marker_grandparent == node.parent): - my_text = my_text + node.astext() - before_marker = False - else: - if (my_text != ""): - title_list.append(my_text) - line_number_list.append(line_no) - line_no = find_line_number(node) - my_text = node.astext() - before_marker = False - # Case 2: Encounter a node with parent tagname in list_of_markers - elif (node.parent.parent.tagname == 'title' and - node.parent.tagname in list_of_markers): - line_no = find_line_number(node) - my_text = my_text + node.astext() - before_marker = True - marker_grandparent = node.parent.parent - # Case 3: Encounter parent tagname of none of the above (Ex. 'paragraph') - else: - before_marker = False - if (my_text != ""): - title_list.append(my_text) - line_number_list.append(line_no) - my_text = "" - line_no = 0 - - # Leftover string that hasn't been appended yet due to how the for loop works - if (my_text != ""): - title_list.append(my_text) - line_number_list.append(line_no) - - # Return a list of the headings and a list of their corresponding line numbers + # Open and read the .rst file and store the string of data into input + f = open(rst_file, "r") + input = f.read().split('\n') + + # Regular expressions that denote a title beforehand + regex = { + '*': r'^(?:\*{1})*$', '=': r'^(?:={1})*$', '-': r'^(?:-{1})*$', + '^': r'^(?:\^{1})*$', '~': r'^(?:~{1})*$', '#': r'^(?:#{1})*$', + '"': r'^(?:"{1})*$' + } + + # '*`_' markers are removed from original string text. + table = str.maketrans("", "", '*`_') + + # Loop through input lines, appending if they are considered headings + for lineno in range(1, len(input)): + if (len(input[lineno]) != 0 and len(input[lineno - 1]) != 0): + for key in regex: + match = re.search(regex[key], input[lineno]) + if (match is not None): + if (lineno >= 2): + if (input[lineno] == input[lineno - 2]): + if (len(input[lineno]) == len(input[lineno - 1])): + title_list.append(input[lineno - 1].translate(table)) + line_number_list.append(lineno) + break + if (len(input[lineno]) >= len(input[lineno - 1])): + title_list.append(input[lineno - 1].translate(table)) + line_number_list.append(lineno) + return title_list, line_number_list @@ -345,19 +168,12 @@ def fill_bad_title_dict(rst_file: str) -> None: ''' - # Ensure file isn't one that causes the code to crash - if rst_file in cannot_validate: - return - # Ensure this file doesn't already have a bad_title_dict slot if rst_file in bad_title_dict: return - # Parse rst_file with an RST parser - document = parse_RST(rst_file) - - # Make a list of headings along with their line numbers from document tree - title_list, line_number_list = find_titles_in_doctree(document) + # Make a list of headings along with their line numbers + title_list, line_number_list = findTitles(rst_file) # Append the bad_title_dict if the capitalization convention not followed for i in range(len(title_list)): @@ -424,9 +240,9 @@ def main(source_paths: List[str], output_format: str) -> bool: directory_list = find_rst_files(source_paths) # Fill the bad_title_dict, which contains all incorrectly capitalized headings - with suppress_stdout_stderr(): - for filename in directory_list: - fill_bad_title_dict(filename) + # with suppress_stdout_stderr(): + for filename in directory_list: + fill_bad_title_dict(filename) # Return an exit status of 0 if there are no bad titles in the dictionary if (len(bad_title_dict) == 0): From 0e344adfb8cbd362db18572565ed1952b6e51781 Mon Sep 17 00:00:00 2001 From: awu42 Date: Sun, 19 Jan 2020 15:37:22 -0500 Subject: [PATCH 30/44] Testing script on doc/source/development/contributing.rst (#26941) --- scripts/validate_rst_title_capitalization.py | 101 ++++++++++++------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index ec2182d7897e0..ead76493b9ed3 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -24,10 +24,34 @@ # Keynames that would not follow capitalization convention CAPITALIZATION_EXCEPTIONS = { - 'pandas', 'Python', 'IPython', 'PyTables', 'Excel', 'JSON', - 'HTML', 'SAS', 'SQL', 'BigQuery', 'STATA', 'Interval', 'PEP8', - 'Period', 'Series', 'Index', 'DataFrame', 'C', 'Git', 'GitHub', 'NumPy', - 'Apache', 'Arrow', 'Parquet', 'MultiIndex', 'NumFOCUS', 'sklearn-pandas' + "pandas", + "Python", + "IPython", + "PyTables", + "Excel", + "JSON", + "HTML", + "SAS", + "SQL", + "BigQuery", + "STATA", + "Interval", + "PEP8", + "Period", + "Series", + "Index", + "DataFrame", + "C", + "Git", + "GitHub", + "NumPy", + "Apache", + "Arrow", + "Parquet", + "MultiIndex", + "NumFOCUS", + "sklearn", + "Docker", } # Lowercase representation of CAPITALIZATION_EXCEPTIONS @@ -38,11 +62,11 @@ bad_title_dict = {} # Error Message: -err_msg = 'Heading capitalization formatted incorrectly. Please correctly capitalize' +err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" def follow_capitalization_convention(title: str) -> bool: - ''' + """ Algorithm to determine if a heading follows the capitalization convention This method returns true if the title follows the convention @@ -58,13 +82,13 @@ def follow_capitalization_convention(title: str) -> bool: bool True if capitalization is correct, False if not - ''' + """ # split with delimiters comma, semicolon and space, parentheses, colon, slashes - word_list = re.split(r'[;,/():\s]\s*', title) + word_list = re.split(r"[;,-/():\s]\s*", title) # Edge Case: First word is an empty string - if (len(word_list[0]) == 0): + if len(word_list[0]) == 0: return False # Dealing with the first word of the title @@ -73,7 +97,7 @@ def follow_capitalization_convention(title: str) -> bool: if word_list[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: return False # First letter of first word must be uppercase - if (not word_list[0][0].isupper()): + if not word_list[0][0].isupper(): return False # Remaining letters of first word must not be uppercase for j in range(1, len(word_list[0])): @@ -96,7 +120,7 @@ def follow_capitalization_convention(title: str) -> bool: def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: - ''' + """ Algorithm to identify particular text that should be considered headings in an RST file @@ -116,7 +140,7 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: line_number_list : List[int] The corresponding line numbers of the headings in title_list - ''' + """ # title_list is the list of headings that is encountered in the doctree title_list: List[str] = [] @@ -126,31 +150,36 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: # Open and read the .rst file and store the string of data into input f = open(rst_file, "r") - input = f.read().split('\n') + input = f.read().split("\n") + f.close() # Regular expressions that denote a title beforehand regex = { - '*': r'^(?:\*{1})*$', '=': r'^(?:={1})*$', '-': r'^(?:-{1})*$', - '^': r'^(?:\^{1})*$', '~': r'^(?:~{1})*$', '#': r'^(?:#{1})*$', - '"': r'^(?:"{1})*$' + "*": r"^(?:\*{1})*$", + "=": r"^(?:={1})*$", + "-": r"^(?:-{1})*$", + "^": r"^(?:\^{1})*$", + "~": r"^(?:~{1})*$", + "#": r"^(?:#{1})*$", + '"': r'^(?:"{1})*$', } # '*`_' markers are removed from original string text. - table = str.maketrans("", "", '*`_') + table = str.maketrans("", "", "*`_") # Loop through input lines, appending if they are considered headings for lineno in range(1, len(input)): - if (len(input[lineno]) != 0 and len(input[lineno - 1]) != 0): + if len(input[lineno]) != 0 and len(input[lineno - 1]) != 0: for key in regex: match = re.search(regex[key], input[lineno]) - if (match is not None): - if (lineno >= 2): - if (input[lineno] == input[lineno - 2]): - if (len(input[lineno]) == len(input[lineno - 1])): + if match is not None: + if lineno >= 2: + if input[lineno] == input[lineno - 2]: + if len(input[lineno]) == len(input[lineno - 1]): title_list.append(input[lineno - 1].translate(table)) line_number_list.append(lineno) break - if (len(input[lineno]) >= len(input[lineno - 1])): + if len(input[lineno]) >= len(input[lineno - 1]): title_list.append(input[lineno - 1].translate(table)) line_number_list.append(lineno) @@ -158,7 +187,7 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: def fill_bad_title_dict(rst_file: str) -> None: - ''' + """ Method that fills up the bad_title_dict with incorrectly capitalized headings Parameters @@ -166,7 +195,7 @@ def fill_bad_title_dict(rst_file: str) -> None: rst_file : str Directory address of a .rst file as a string - ''' + """ # Ensure this file doesn't already have a bad_title_dict slot if rst_file in bad_title_dict: @@ -185,7 +214,7 @@ def fill_bad_title_dict(rst_file: str) -> None: def find_rst_files(source_paths: List[str]) -> List[str]: - ''' + """ Given the command line arguments of directory paths, this method yields the strings of the .rst file directories that these paths contain @@ -199,7 +228,7 @@ def find_rst_files(source_paths: List[str]) -> List[str]: directory_address : str Directory address of a .rst files found in command line argument directories - ''' + """ # Loop through source_paths, recursively looking for .rst files for directory_address in source_paths: @@ -207,7 +236,7 @@ def find_rst_files(source_paths: List[str]) -> List[str]: raise ValueError( "Please enter a valid path, pointing to a valid file/directory." ) - elif (directory_address.endswith(".rst")): + elif directory_address.endswith(".rst"): yield directory_address else: for (dirpath, dirnames, filenames) in walk(directory_address): @@ -217,7 +246,7 @@ def find_rst_files(source_paths: List[str]) -> List[str]: def main(source_paths: List[str], output_format: str) -> bool: - ''' + """ The main method to print all headings with incorrect capitalization Parameters @@ -232,9 +261,9 @@ def main(source_paths: List[str], output_format: str) -> bool: is_failed : bool True if there are headings that are printed, False if not - ''' + """ - is_failed : bool = False + is_failed: bool = False # Make a list of all RST files from command line directory list directory_list = find_rst_files(source_paths) @@ -245,23 +274,21 @@ def main(source_paths: List[str], output_format: str) -> bool: fill_bad_title_dict(filename) # Return an exit status of 0 if there are no bad titles in the dictionary - if (len(bad_title_dict) == 0): + if len(bad_title_dict) == 0: return is_failed # Print bad_title_dict Results - print() + is_failed = True for key in bad_title_dict: for line in bad_title_dict[key]: - print( - key + ":" + str(line[1]) + ": " + err_msg + " \"" + line[0] + "\"" - ) + print(key + ":" + str(line[1]) + ": " + err_msg + ' "' + line[0] + '"') # Exit status of 0 return is_failed if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Validate heading capitalization') + parser = argparse.ArgumentParser(description="Validate heading capitalization") parser.add_argument( "paths", nargs="+", default=".", help="Source paths of file/directory to check." From 3757712f78bb7eeed8c5311ad94b6ed4d669748e Mon Sep 17 00:00:00 2001 From: awu42 Date: Sun, 19 Jan 2020 16:59:03 -0500 Subject: [PATCH 31/44] validate_rst_title_capitalization.py MomIsBestFriend edits (#26941) --- scripts/validate_rst_title_capitalization.py | 37 ++++++++++---------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index ead76493b9ed3..88f6e8aca84b2 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -65,7 +65,7 @@ err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" -def follow_capitalization_convention(title: str) -> bool: +def is_following_capitalization_convention(title: str) -> bool: """ Algorithm to determine if a heading follows the capitalization convention @@ -84,7 +84,10 @@ def follow_capitalization_convention(title: str) -> bool: """ - # split with delimiters comma, semicolon and space, parentheses, colon, slashes + # Remove https link if present in heading + title = re.sub(r"", "", title) + + # Split with delimiters comma, semicolon and space, parentheses, colon, slashes word_list = re.split(r"[;,-/():\s]\s*", title) # Edge Case: First word is an empty string @@ -148,10 +151,9 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: # List of line numbers that corresponding headings in title_list can be found at line_number_list: List[int] = [] - # Open and read the .rst file and store the string of data into input - f = open(rst_file, "r") - input = f.read().split("\n") - f.close() + # Open and read the .rst file and store the string of data into lines + with open(rst_file, "r") as file_obj: + lines = file_obj.read().split("\n") # Regular expressions that denote a title beforehand regex = { @@ -167,20 +169,20 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: # '*`_' markers are removed from original string text. table = str.maketrans("", "", "*`_") - # Loop through input lines, appending if they are considered headings - for lineno in range(1, len(input)): - if len(input[lineno]) != 0 and len(input[lineno - 1]) != 0: + # Loop through lines lines, appending if they are considered headings + for lineno in range(1, len(lines)): + if len(lines[lineno]) != 0 and len(lines[lineno - 1]) != 0: for key in regex: - match = re.search(regex[key], input[lineno]) + match = re.search(regex[key], lines[lineno]) if match is not None: if lineno >= 2: - if input[lineno] == input[lineno - 2]: - if len(input[lineno]) == len(input[lineno - 1]): - title_list.append(input[lineno - 1].translate(table)) + if lines[lineno] == lines[lineno - 2]: + if len(lines[lineno]) == len(lines[lineno - 1]): + title_list.append(lines[lineno - 1].translate(table)) line_number_list.append(lineno) break - if len(input[lineno]) >= len(input[lineno - 1]): - title_list.append(input[lineno - 1].translate(table)) + if len(lines[lineno]) >= len(lines[lineno - 1]): + title_list.append(lines[lineno - 1].translate(table)) line_number_list.append(lineno) return title_list, line_number_list @@ -206,14 +208,14 @@ def fill_bad_title_dict(rst_file: str) -> None: # Append the bad_title_dict if the capitalization convention not followed for i in range(len(title_list)): - if not follow_capitalization_convention(title_list[i]): + if not is_following_capitalization_convention(title_list[i]): if rst_file not in bad_title_dict: bad_title_dict[rst_file] = [(title_list[i], line_number_list[i])] else: bad_title_dict[rst_file].append((title_list[i], line_number_list[i])) -def find_rst_files(source_paths: List[str]) -> List[str]: +def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: """ Given the command line arguments of directory paths, this method yields the strings of the .rst file directories that these paths contain @@ -269,7 +271,6 @@ def main(source_paths: List[str], output_format: str) -> bool: directory_list = find_rst_files(source_paths) # Fill the bad_title_dict, which contains all incorrectly capitalized headings - # with suppress_stdout_stderr(): for filename in directory_list: fill_bad_title_dict(filename) From 56bfc44a27bda8376e8fbb99c1080aecc2ab4dc2 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 21 Jan 2020 18:25:01 -0500 Subject: [PATCH 32/44] Created method to correct title capitalization (#26941) --- ci/code_checks.sh | 1 + scripts/validate_rst_title_capitalization.py | 180 ++++++++----------- 2 files changed, 73 insertions(+), 108 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4bf47f2901ca6..317909ce73b70 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -344,4 +344,5 @@ if [[ -z "$CHECK" || "$CHECK" == "typing" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" fi + exit $RET diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 88f6e8aca84b2..19a459f650516 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,28 +1,21 @@ #!/usr/bin/env python - """ -GH #29641 - -Collect the titles in the rst files and validate if they follow the proper -capitalization convention. +Validate that the titles in the rst files follow the proper capitalization convention. -Prints the titles that do not follow the convention. +Print the titles that do not follow the convention. Usage:: ./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst ./scripts/validate_rst_title_capitalization.py doc/source/ """ - import argparse import sys import re import os -from os import walk -from typing import Generator, List +from typing import Tuple, Generator, List -# Keynames that would not follow capitalization convention CAPITALIZATION_EXCEPTIONS = { "pandas", "Python", @@ -54,23 +47,48 @@ "Docker", } -# Lowercase representation of CAPITALIZATION_EXCEPTIONS -CAPITALIZATION_EXCEPTIONS_LOWER = {word.lower() for word in CAPITALIZATION_EXCEPTIONS} +CAP_EXCEPTIONS_DICT = { + word.lower(): word for word in CAPITALIZATION_EXCEPTIONS +} -# Dictionary of bad titles that will be printed later along with line numbers -# Key: Document Directory, Value: Pair(Bad Title, Line Number) bad_title_dict = {} -# Error Message: err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" +def correct_title_capitalization(title: str) -> str: + """ + Algorithm to create the correct capitalization for a given title + + Parameters + ---------- + title : str + Heading string to correct + + Returns + ------- + correct_title : str + Correctly capitalized title -def is_following_capitalization_convention(title: str) -> bool: """ - Algorithm to determine if a heading follows the capitalization convention - This method returns true if the title follows the convention - and false if it does not + correct_title : str = title.capitalize() + + removed_https_title = re.sub(r"", "", correct_title) + + word_list = re.split(r"\W", removed_https_title) + + for word in word_list: + if word.lower() in CAP_EXCEPTIONS_DICT: + correct_title = re.sub( + r'\b' + word + r'\b', CAP_EXCEPTIONS_DICT[word.lower()], correct_title + ) + + return correct_title + + +def is_following_capitalization_convention(title: str) -> bool: + """ + Algorithm to determine if a title is capitalized correctly Parameters ---------- @@ -80,49 +98,19 @@ def is_following_capitalization_convention(title: str) -> bool: Returns ------- bool - True if capitalization is correct, False if not + True if title capitalized correctly, False if not """ - # Remove https link if present in heading - title = re.sub(r"", "", title) + correct_title = correct_title_capitalization(title) - # Split with delimiters comma, semicolon and space, parentheses, colon, slashes - word_list = re.split(r"[;,-/():\s]\s*", title) - - # Edge Case: First word is an empty string - if len(word_list[0]) == 0: + if (title != correct_title): return False + else: + return True + - # Dealing with the first word of the title - if word_list[0] not in CAPITALIZATION_EXCEPTIONS: - # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization - if word_list[0].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: - return False - # First letter of first word must be uppercase - if not word_list[0][0].isupper(): - return False - # Remaining letters of first word must not be uppercase - for j in range(1, len(word_list[0])): - if word_list[0][j].isupper(): - return False - - # Remaining letters must not be uppercase letters - for i in range(1, len(word_list)): - if word_list[i] not in CAPITALIZATION_EXCEPTIONS: - # word is not in CAPITALIZATION_EXCEPTIONS but has different capitalization - if word_list[i].lower() in CAPITALIZATION_EXCEPTIONS_LOWER: - return False - # Remaining letters must not be uppercase - for j in range(len(word_list[i])): - if word_list[i][j].isupper(): - return False - - # Returning True if the heading follows the capitalization convention - return True - - -def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: +def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: """ Algorithm to identify particular text that should be considered headings in an RST file @@ -135,27 +123,19 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: rst_file : str RST file to scan through for headings - Returns + Yields ------- - title_list : List[str] - A list of heading strings found in the document tree + title : str + A heading found in the rst file - line_number_list : List[int] - The corresponding line numbers of the headings in title_list + line_number : int + The corresponding line number of the heading """ - # title_list is the list of headings that is encountered in the doctree - title_list: List[str] = [] - - # List of line numbers that corresponding headings in title_list can be found at - line_number_list: List[int] = [] - - # Open and read the .rst file and store the string of data into lines with open(rst_file, "r") as file_obj: lines = file_obj.read().split("\n") - # Regular expressions that denote a title beforehand regex = { "*": r"^(?:\*{1})*$", "=": r"^(?:={1})*$", @@ -166,26 +146,20 @@ def findTitles(rst_file: str) -> Generator[List[str], List[int], None]: '"': r'^(?:"{1})*$', } - # '*`_' markers are removed from original string text. table = str.maketrans("", "", "*`_") - # Loop through lines lines, appending if they are considered headings - for lineno in range(1, len(lines)): - if len(lines[lineno]) != 0 and len(lines[lineno - 1]) != 0: + for line_no in range(1, len(lines)): + if len(lines[line_no]) != 0 and len(lines[line_no - 1]) != 0: for key in regex: - match = re.search(regex[key], lines[lineno]) + match = re.search(regex[key], lines[line_no]) if match is not None: - if lineno >= 2: - if lines[lineno] == lines[lineno - 2]: - if len(lines[lineno]) == len(lines[lineno - 1]): - title_list.append(lines[lineno - 1].translate(table)) - line_number_list.append(lineno) + if line_no >= 2: + if lines[line_no] == lines[line_no - 2]: + if len(lines[line_no]) == len(lines[line_no - 1]): + yield lines[line_no - 1].translate(table), line_no break - if len(lines[lineno]) >= len(lines[lineno - 1]): - title_list.append(lines[lineno - 1].translate(table)) - line_number_list.append(lineno) - - return title_list, line_number_list + if len(lines[line_no]) >= len(lines[line_no - 1]): + yield lines[line_no - 1].translate(table), line_no def fill_bad_title_dict(rst_file: str) -> None: @@ -199,20 +173,15 @@ def fill_bad_title_dict(rst_file: str) -> None: """ - # Ensure this file doesn't already have a bad_title_dict slot if rst_file in bad_title_dict: return - # Make a list of headings along with their line numbers - title_list, line_number_list = findTitles(rst_file) - - # Append the bad_title_dict if the capitalization convention not followed - for i in range(len(title_list)): - if not is_following_capitalization_convention(title_list[i]): + for title, line_number in find_titles(rst_file): + if not is_following_capitalization_convention(title): if rst_file not in bad_title_dict: - bad_title_dict[rst_file] = [(title_list[i], line_number_list[i])] + bad_title_dict[rst_file] = [(title, line_number)] else: - bad_title_dict[rst_file].append((title_list[i], line_number_list[i])) + bad_title_dict[rst_file].append((title, line_number)) def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: @@ -232,7 +201,6 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: """ - # Loop through source_paths, recursively looking for .rst files for directory_address in source_paths: if not os.path.exists(directory_address): raise ValueError( @@ -241,7 +209,7 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: elif directory_address.endswith(".rst"): yield directory_address else: - for (dirpath, dirnames, filenames) in walk(directory_address): + for (dirpath, _, filenames) in os.walk(directory_address): for file in filenames: if file.endswith(".rst"): yield os.path.join(dirpath, file) @@ -260,32 +228,28 @@ def main(source_paths: List[str], output_format: str) -> bool: Returns ------- - is_failed : bool + number_of_errors : int True if there are headings that are printed, False if not """ - is_failed: bool = False + number_of_errors: int = 0 - # Make a list of all RST files from command line directory list directory_list = find_rst_files(source_paths) - # Fill the bad_title_dict, which contains all incorrectly capitalized headings for filename in directory_list: fill_bad_title_dict(filename) - # Return an exit status of 0 if there are no bad titles in the dictionary - if len(bad_title_dict) == 0: - return is_failed + if (len(bad_title_dict) == 0): + return number_of_errors - # Print bad_title_dict Results - is_failed = True for key in bad_title_dict: for line in bad_title_dict[key]: - print(key + ":" + str(line[1]) + ": " + err_msg + ' "' + line[0] + '"') + correct_title = correct_title_capitalization(line[0]) + print(f'{key}:{line[1]}:{err_msg} "{line[0]}" to "{correct_title}"') + number_of_errors += 1 - # Exit status of 0 - return is_failed + return number_of_errors if __name__ == "__main__": @@ -298,7 +262,7 @@ def main(source_paths: List[str], output_format: str) -> bool: parser.add_argument( "--format", "-f", - default="{source_path}:{line_number}:{heading}:{msg}", + default="{source_path}:{line_number}:{msg}:{heading}", help="Output format of incorrectly capitalized titles", ) From deddc2ddce819eabdcd4e7dd603a3a26461424e6 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 21 Jan 2020 18:31:26 -0500 Subject: [PATCH 33/44] Ran black on validate_rst_title_capitalization (#26941) --- scripts/validate_rst_title_capitalization.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 19a459f650516..994308c746df9 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -47,14 +47,13 @@ "Docker", } -CAP_EXCEPTIONS_DICT = { - word.lower(): word for word in CAPITALIZATION_EXCEPTIONS -} +CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} bad_title_dict = {} err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" + def correct_title_capitalization(title: str) -> str: """ Algorithm to create the correct capitalization for a given title @@ -71,7 +70,7 @@ def correct_title_capitalization(title: str) -> str: """ - correct_title : str = title.capitalize() + correct_title: str = title.capitalize() removed_https_title = re.sub(r"", "", correct_title) @@ -80,7 +79,7 @@ def correct_title_capitalization(title: str) -> str: for word in word_list: if word.lower() in CAP_EXCEPTIONS_DICT: correct_title = re.sub( - r'\b' + word + r'\b', CAP_EXCEPTIONS_DICT[word.lower()], correct_title + r"\b" + word + r"\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title ) return correct_title @@ -104,7 +103,7 @@ def is_following_capitalization_convention(title: str) -> bool: correct_title = correct_title_capitalization(title) - if (title != correct_title): + if title != correct_title: return False else: return True @@ -240,7 +239,7 @@ def main(source_paths: List[str], output_format: str) -> bool: for filename in directory_list: fill_bad_title_dict(filename) - if (len(bad_title_dict) == 0): + if len(bad_title_dict) == 0: return number_of_errors for key in bad_title_dict: From 0311fe0a6fea8288d8fe97405b042c1b760216bc Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 21 Jan 2020 20:04:43 -0500 Subject: [PATCH 34/44] Edit: titles with non-word character as first character are not valid --- scripts/validate_rst_title_capitalization.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 994308c746df9..929d33f855c57 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -66,11 +66,11 @@ def correct_title_capitalization(title: str) -> str: Returns ------- correct_title : str - Correctly capitalized title + Correctly capitalized heading """ - correct_title: str = title.capitalize() + correct_title: str = re.sub(r"^\W*", "", title).capitalize() removed_https_title = re.sub(r"", "", correct_title) @@ -87,7 +87,7 @@ def correct_title_capitalization(title: str) -> str: def is_following_capitalization_convention(title: str) -> bool: """ - Algorithm to determine if a title is capitalized correctly + Function to return if a given title is capitalized correctly Parameters ---------- @@ -244,8 +244,10 @@ def main(source_paths: List[str], output_format: str) -> bool: for key in bad_title_dict: for line in bad_title_dict[key]: - correct_title = correct_title_capitalization(line[0]) - print(f'{key}:{line[1]}:{err_msg} "{line[0]}" to "{correct_title}"') + print( + f"""{key}:{line[1]}:{err_msg} "{line[0]}" to "{ + correct_title_capitalization(line[0])}" """ + ) number_of_errors += 1 return number_of_errors From df0173082c42f6cae03a457e86864cdc074ec1b0 Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 22 Jan 2020 09:02:41 -0500 Subject: [PATCH 35/44] Simplified validate_rst_title_capitalization main method (#26941) --- scripts/validate_rst_title_capitalization.py | 73 +++----------------- 1 file changed, 9 insertions(+), 64 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 929d33f855c57..ff93e278ee8e0 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -49,8 +49,6 @@ CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} -bad_title_dict = {} - err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" @@ -79,36 +77,12 @@ def correct_title_capitalization(title: str) -> str: for word in word_list: if word.lower() in CAP_EXCEPTIONS_DICT: correct_title = re.sub( - r"\b" + word + r"\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title + rf"\b{word}\b", CAP_EXCEPTIONS_DICT[word.lower()], correct_title ) return correct_title -def is_following_capitalization_convention(title: str) -> bool: - """ - Function to return if a given title is capitalized correctly - - Parameters - ---------- - title : str - Heading string to validate - - Returns - ------- - bool - True if title capitalized correctly, False if not - - """ - - correct_title = correct_title_capitalization(title) - - if title != correct_title: - return False - else: - return True - - def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: """ Algorithm to identify particular text that should be considered headings in an @@ -161,28 +135,6 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: yield lines[line_no - 1].translate(table), line_no -def fill_bad_title_dict(rst_file: str) -> None: - """ - Method that fills up the bad_title_dict with incorrectly capitalized headings - - Parameters - ---------- - rst_file : str - Directory address of a .rst file as a string - - """ - - if rst_file in bad_title_dict: - return - - for title, line_number in find_titles(rst_file): - if not is_following_capitalization_convention(title): - if rst_file not in bad_title_dict: - bad_title_dict[rst_file] = [(title, line_number)] - else: - bad_title_dict[rst_file].append((title, line_number)) - - def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: """ Given the command line arguments of directory paths, this method @@ -234,21 +186,14 @@ def main(source_paths: List[str], output_format: str) -> bool: number_of_errors: int = 0 - directory_list = find_rst_files(source_paths) - - for filename in directory_list: - fill_bad_title_dict(filename) - - if len(bad_title_dict) == 0: - return number_of_errors - - for key in bad_title_dict: - for line in bad_title_dict[key]: - print( - f"""{key}:{line[1]}:{err_msg} "{line[0]}" to "{ - correct_title_capitalization(line[0])}" """ - ) - number_of_errors += 1 + for filename in find_rst_files(source_paths): + for title, line_number in find_titles(filename): + if title != correct_title_capitalization(title): + print( + f"""{filename}:{line_number}:{err_msg} "{title}" to "{ + correct_title_capitalization(title)}" """ + ) + number_of_errors += 1 return number_of_errors From 9a9a57ae0c51317582b6d1b3e48e9ef8f909fdd4 Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 22 Jan 2020 09:26:10 -0500 Subject: [PATCH 36/44] Edited parameter and return value description of main function --- scripts/validate_rst_title_capitalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index ff93e278ee8e0..7d762c1f24354 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -180,7 +180,7 @@ def main(source_paths: List[str], output_format: str) -> bool: Returns ------- number_of_errors : int - True if there are headings that are printed, False if not + Number of incorrect headings found overall """ @@ -208,7 +208,7 @@ def main(source_paths: List[str], output_format: str) -> bool: parser.add_argument( "--format", "-f", - default="{source_path}:{line_number}:{msg}:{heading}", + default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}", help="Output format of incorrectly capitalized titles", ) From 5f0f84ab22b67e574faa2016aaa6d535b34a5452 Mon Sep 17 00:00:00 2001 From: awu42 Date: Mon, 27 Jan 2020 16:09:52 -0500 Subject: [PATCH 37/44] Added glob module to script 01-27-2020 --- scripts/validate_rst_title_capitalization.py | 75 ++++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 7d762c1f24354..1a620425a402f 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -14,6 +14,7 @@ import re import os from typing import Tuple, Generator, List +import glob CAPITALIZATION_EXCEPTIONS = { @@ -54,24 +55,29 @@ def correct_title_capitalization(title: str) -> str: """ - Algorithm to create the correct capitalization for a given title + Algorithm to create the correct capitalization for a given title. Parameters ---------- title : str - Heading string to correct + Heading string to correct. Returns ------- correct_title : str - Correctly capitalized heading + Correctly capitalized heading. """ + # Function to strip all non-word characters from the beginning of the title to the + # first word character. correct_title: str = re.sub(r"^\W*", "", title).capitalize() + # Function to remove a URL from the title. We do this because words in a URL must + # stay lowercase, even if they are a capitalization exception. removed_https_title = re.sub(r"", "", correct_title) + # Function to split a title into a list using non-word character delimiters. word_list = re.split(r"\W", removed_https_title) for word in word_list: @@ -86,69 +92,60 @@ def correct_title_capitalization(title: str) -> str: def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: """ Algorithm to identify particular text that should be considered headings in an - RST file + RST file. See for details - on what constitutes a string as a heading in RST + on what constitutes a string as a heading in RST. Parameters ---------- rst_file : str - RST file to scan through for headings + RST file to scan through for headings. Yields ------- title : str - A heading found in the rst file + A heading found in the rst file. line_number : int - The corresponding line number of the heading + The corresponding line number of the heading. """ with open(rst_file, "r") as file_obj: lines = file_obj.read().split("\n") - regex = { - "*": r"^(?:\*{1})*$", - "=": r"^(?:={1})*$", - "-": r"^(?:-{1})*$", - "^": r"^(?:\^{1})*$", - "~": r"^(?:~{1})*$", - "#": r"^(?:#{1})*$", - '"': r'^(?:"{1})*$', - } + symbols = ("*", "=", "-", "^", "~", "#", '"') table = str.maketrans("", "", "*`_") - for line_no in range(1, len(lines)): - if len(lines[line_no]) != 0 and len(lines[line_no - 1]) != 0: - for key in regex: - match = re.search(regex[key], lines[line_no]) - if match is not None: - if line_no >= 2: - if lines[line_no] == lines[line_no - 2]: - if len(lines[line_no]) == len(lines[line_no - 1]): - yield lines[line_no - 1].translate(table), line_no - break - if len(lines[line_no]) >= len(lines[line_no - 1]): - yield lines[line_no - 1].translate(table), line_no + for i, line in enumerate(lines): + if len(line) != 0 and len(lines[i - 1]) != 0: + line_chars = set(line) + if len(line_chars) == 1 and line_chars.pop() in symbols: + if i >= 2: + if line == lines[i - 2]: + if len(line) == len(lines[i - 1]): + yield lines[i - 1].translate(table), i + continue + if len(line) >= len(lines[i - 1]): + yield lines[i - 1].translate(table), i def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: """ Given the command line arguments of directory paths, this method - yields the strings of the .rst file directories that these paths contain + yields the strings of the .rst file directories that these paths contain. Parameters ---------- source_paths : str - List of directories to validate, provided through command line arguments + List of directories to validate, provided through command line arguments. Yields ------- directory_address : str - Directory address of a .rst files found in command line argument directories + Directory address of a .rst files found in command line argument directories. """ @@ -160,27 +157,27 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: elif directory_address.endswith(".rst"): yield directory_address else: - for (dirpath, _, filenames) in os.walk(directory_address): - for file in filenames: - if file.endswith(".rst"): - yield os.path.join(dirpath, file) + for filename in glob.glob( + pathname=f"{directory_address}/**/*.rst", recursive=True + ): + yield filename def main(source_paths: List[str], output_format: str) -> bool: """ - The main method to print all headings with incorrect capitalization + The main method to print all headings with incorrect capitalization. Parameters ---------- source_paths : str - List of directories to validate, provided through command line arguments + List of directories to validate, provided through command line arguments. output_format : str Output format of the script. Returns ------- number_of_errors : int - Number of incorrect headings found overall + Number of incorrect headings found overall. """ From ee45f984c552f126ccc5406f75b212f6bd836543 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 4 Feb 2020 09:15:46 -0500 Subject: [PATCH 38/44] Edited len(line) != 0 correction --- scripts/validate_rst_title_capitalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 1a620425a402f..7b10aa8f7069e 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -120,7 +120,7 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: table = str.maketrans("", "", "*`_") for i, line in enumerate(lines): - if len(line) != 0 and len(lines[i - 1]) != 0: + if line and lines[i - 1]: line_chars = set(line) if len(line_chars) == 1 and line_chars.pop() in symbols: if i >= 2: From 78a49c1e5fa0926e2e79133f1b8a68685c894b79 Mon Sep 17 00:00:00 2001 From: awu42 Date: Tue, 4 Feb 2020 10:50:21 -0500 Subject: [PATCH 39/44] Edited find_titles method --- scripts/validate_rst_title_capitalization.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 7b10aa8f7069e..a0334fc8255a6 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -120,16 +120,13 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: table = str.maketrans("", "", "*`_") for i, line in enumerate(lines): - if line and lines[i - 1]: - line_chars = set(line) - if len(line_chars) == 1 and line_chars.pop() in symbols: - if i >= 2: - if line == lines[i - 2]: - if len(line) == len(lines[i - 1]): - yield lines[i - 1].translate(table), i - continue - if len(line) >= len(lines[i - 1]): - yield lines[i - 1].translate(table), i + line_chars = set(line) + if ( + len(line_chars) == 1 + and line_chars.pop() in symbols + and len(line) == len(lines[i - 1]) + ): + yield lines[i - 1].translate(table), i def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: From f4ffd32cf2aee0ea981c67a0c86bf314fe65088b Mon Sep 17 00:00:00 2001 From: awu42 Date: Wed, 26 Feb 2020 16:18:01 -0500 Subject: [PATCH 40/44] edited contributing.rst to have no errors --- doc/source/development/contributing.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index f904781178656..db9e23035b977 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -146,7 +146,7 @@ requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. -Using a Docker Container +Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ Instead of manually setting up a development environment, you can use Docker to @@ -754,7 +754,7 @@ You can then verify the changes look ok, then git :ref:`commit `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. @@ -919,7 +919,7 @@ For example, quite a few functions in *pandas* accept a ``dtype`` argument. This This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. -Validating Type Hints +Validating type hints ~~~~~~~~~~~~~~~~~~~~~ *pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running @@ -1539,7 +1539,7 @@ The branch will still exist on GitHub, so to delete it there do:: .. _Gitter: https://gitter.im/pydata/pandas -Tips for a successful Pull Request +Tips for a successful pull request ================================== If you have made it to the `Review your code`_ phase, one of the core contributors may From 687053fe86bb6a87faa5feec66e6031d63507a98 Mon Sep 17 00:00:00 2001 From: awu42 Date: Thu, 5 Mar 2020 22:35:19 -0500 Subject: [PATCH 41/44] modified validation script --- scripts/validate_rst_title_capitalization.py | 40 +++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index a0334fc8255a6..5cfbaccf9f585 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -52,6 +52,7 @@ err_msg = "Heading capitalization formatted incorrectly. Please correctly capitalize" +symbols = ("*", "=", "-", "^", "~", "#", '"') def correct_title_capitalization(title: str) -> str: """ @@ -64,12 +65,11 @@ def correct_title_capitalization(title: str) -> str: Returns ------- - correct_title : str + str Correctly capitalized heading. - """ - # Function to strip all non-word characters from the beginning of the title to the + # Strip all non-word characters from the beginning of the title to the # first word character. correct_title: str = re.sub(r"^\W*", "", title).capitalize() @@ -109,24 +109,20 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: line_number : int The corresponding line number of the heading. - """ - with open(rst_file, "r") as file_obj: - lines = file_obj.read().split("\n") - - symbols = ("*", "=", "-", "^", "~", "#", '"') - - table = str.maketrans("", "", "*`_") - - for i, line in enumerate(lines): - line_chars = set(line) - if ( - len(line_chars) == 1 - and line_chars.pop() in symbols - and len(line) == len(lines[i - 1]) - ): - yield lines[i - 1].translate(table), i + with open(rst_file, "r") as fd: + previous_line = '' + for i, line in enumerate(fd): + line = line[:-1] + line_chars = set(line) + if ( + len(line_chars) == 1 + and line_chars.pop() in symbols + and len(line) == len(previous_line) + ): + yield re.sub('[`\*_]', '', previous_line), i + previous_line = line def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: @@ -141,9 +137,8 @@ def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]: Yields ------- - directory_address : str + str Directory address of a .rst files found in command line argument directories. - """ for directory_address in source_paths: @@ -173,9 +168,8 @@ def main(source_paths: List[str], output_format: str) -> bool: Returns ------- - number_of_errors : int + int Number of incorrect headings found overall. - """ number_of_errors: int = 0 From c690281c489349d7032938e49047783eb2c5f34f Mon Sep 17 00:00:00 2001 From: awu42 Date: Thu, 5 Mar 2020 23:05:06 -0500 Subject: [PATCH 42/44] fix linting errors --- scripts/validate_rst_title_capitalization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 5cfbaccf9f585..893b6057cbb01 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -54,6 +54,7 @@ symbols = ("*", "=", "-", "^", "~", "#", '"') + def correct_title_capitalization(title: str) -> str: """ Algorithm to create the correct capitalization for a given title. @@ -121,7 +122,7 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: and line_chars.pop() in symbols and len(line) == len(previous_line) ): - yield re.sub('[`\*_]', '', previous_line), i + yield re.sub(r'[`\*_]', '', previous_line), i previous_line = line From c9775cc7e7b1d3fb1d831274f598690a553ddef8 Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 6 Mar 2020 00:29:44 -0500 Subject: [PATCH 43/44] black pandas-dev change --- scripts/validate_rst_title_capitalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 893b6057cbb01..837c0f73b0318 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -113,7 +113,7 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: """ with open(rst_file, "r") as fd: - previous_line = '' + previous_line = "" for i, line in enumerate(fd): line = line[:-1] line_chars = set(line) @@ -122,7 +122,7 @@ def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]: and line_chars.pop() in symbols and len(line) == len(previous_line) ): - yield re.sub(r'[`\*_]', '', previous_line), i + yield re.sub(r"[`\*_]", "", previous_line), i previous_line = line From 66c651ae74308eb1b86dea6a10068a13dd60787d Mon Sep 17 00:00:00 2001 From: awu42 Date: Fri, 6 Mar 2020 14:16:45 -0500 Subject: [PATCH 44/44] modified changes to comments --- scripts/validate_rst_title_capitalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 837c0f73b0318..17752134e5049 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -74,11 +74,11 @@ def correct_title_capitalization(title: str) -> str: # first word character. correct_title: str = re.sub(r"^\W*", "", title).capitalize() - # Function to remove a URL from the title. We do this because words in a URL must + # Remove a URL from the title. We do this because words in a URL must # stay lowercase, even if they are a capitalization exception. removed_https_title = re.sub(r"", "", correct_title) - # Function to split a title into a list using non-word character delimiters. + # Split a title into a list using non-word character delimiters. word_list = re.split(r"\W", removed_https_title) for word in word_list: