From cce169a5f4865aa356ae19464c51ede917e91ef4 Mon Sep 17 00:00:00 2001
From: Aidan Feldman <aidan.feldman@gmail.com>
Date: Sun, 27 Dec 2020 17:43:04 -0500
Subject: [PATCH 1/2] DOC: create shared includes for content shared by
 comparison docs

This will help ensure consistency between the examples.
---
 .../comparison/comparison_with_sas.rst        | 65 ++-----------------
 .../comparison_with_spreadsheets.rst          |  2 +-
 .../comparison/comparison_with_sql.rst        | 21 +-----
 .../comparison/comparison_with_stata.rst      | 65 ++-----------------
 .../includes/construct_dataframe.rst          | 11 ++++
 .../comparison/includes/filtering.rst         | 18 +++++
 .../comparison/includes/if_then.rst           | 14 ++++
 .../introduction.rst}                         |  2 +
 .../comparison/includes/sorting.rst           |  9 +++
 .../comparison/includes/time_date.rst         | 24 +++++++
 10 files changed, 94 insertions(+), 137 deletions(-)
 create mode 100644 doc/source/getting_started/comparison/includes/construct_dataframe.rst
 create mode 100644 doc/source/getting_started/comparison/includes/filtering.rst
 create mode 100644 doc/source/getting_started/comparison/includes/if_then.rst
 rename doc/source/getting_started/comparison/{comparison_boilerplate.rst => includes/introduction.rst} (96%)
 create mode 100644 doc/source/getting_started/comparison/includes/sorting.rst
 create mode 100644 doc/source/getting_started/comparison/includes/time_date.rst
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst
index c6f508aae0e21..f23191b6d383b 100644
--- a/doc/source/getting_started/comparison/comparison_with_sas.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sas.rst
@@ -8,7 +8,7 @@ For potential users coming from `SAS <https://en.wikipedia.org/wiki/SAS_(softwar
 this page is meant to demonstrate how different SAS operations would be
 performed in pandas.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 .. note::
 
@@ -93,16 +93,7 @@ specifying the column names.
        ;
    run;
 
-A pandas ``DataFrame`` can be constructed in many different ways,
-but for a small number of values, it is often convenient to specify it as
-a Python dictionary, where the keys are the column names
-and the values are the data.
-
-.. ipython:: python
-
-   df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
-   df
-
+.. include:: includes/construct_dataframe.rst
 
 Reading external data
 ~~~~~~~~~~~~~~~~~~~~~
@@ -217,12 +208,7 @@ or more columns.
           DATA step begins and can also be used in PROC statements */
    run;
 
-DataFrames can be filtered in multiple ways; the most intuitive of which is using
-:ref:`boolean indexing <indexing.boolean>`
-
-.. ipython:: python
-
-   tips[tips["total_bill"] > 10].head()
+.. include:: includes/filtering.rst
 
 If/then logic
 ~~~~~~~~~~~~~
@@ -239,18 +225,7 @@ In SAS, if/then logic can be used to create new columns.
        else bucket = 'high';
    run;
 
-The same operation in pandas can be accomplished using
-the ``where`` method from ``numpy``.
-
-.. ipython:: python
-
-   tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
-   tips.head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop("bucket", axis=1)
+.. include:: includes/if_then.rst
 
 Date functionality
 ~~~~~~~~~~~~~~~~~~
@@ -278,28 +253,7 @@ functions pandas supports other Time Series features
 not available in Base SAS (such as resampling and custom offsets) -
 see the :ref:`timeseries documentation<timeseries>` for more details.
 
-.. ipython:: python
-
-   tips["date1"] = pd.Timestamp("2013-01-15")
-   tips["date2"] = pd.Timestamp("2015-02-15")
-   tips["date1_year"] = tips["date1"].dt.year
-   tips["date2_month"] = tips["date2"].dt.month
-   tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
-   tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
-       "date1"
-   ].dt.to_period("M")
-
-   tips[
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
-   ].head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop(
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"],
-       axis=1,
-   )
+.. include:: includes/time_date.rst
 
 Selection of columns
 ~~~~~~~~~~~~~~~~~~~~
@@ -349,14 +303,7 @@ Sorting in SAS is accomplished via ``PROC SORT``
        by sex total_bill;
    run;
 
-pandas objects have a :meth:`~DataFrame.sort_values` method, which
-takes a list of columns to sort by.
-
-.. ipython:: python
-
-   tips = tips.sort_values(["sex", "total_bill"])
-   tips.head()
-
+.. include:: includes/sorting.rst
 
 String processing
 -----------------
diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
index 73645d429cc66..7b779b02e20f8 100644
--- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
+++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst
@@ -14,7 +14,7 @@ terminology and link to documentation for Excel, but much will be the same/simil
 `Apple Numbers <https://www.apple.com/mac/numbers/compatibility/functions.html>`_, and other
 Excel-compatible spreadsheet software.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 Data structures
 ---------------
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index 4fe7b7e96cf50..52799442d6118 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -8,7 +8,7 @@ Since many potential pandas users have some familiarity with
 `SQL <https://en.wikipedia.org/wiki/SQL>`_, this page is meant to provide some examples of how
 various SQL operations would be performed using pandas.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 Most of the examples will utilize the ``tips`` dataset found within pandas tests.  We'll read
 the data into a DataFrame called ``tips`` and assume we have a database table of the same name and
@@ -65,24 +65,9 @@ Filtering in SQL is done via a WHERE clause.
 
     SELECT *
     FROM tips
-    WHERE time = 'Dinner'
-    LIMIT 5;
-
-DataFrames can be filtered in multiple ways; the most intuitive of which is using
-:ref:`boolean indexing <indexing.boolean>`
-
-.. ipython:: python
-
-    tips[tips["time"] == "Dinner"].head(5)
-
-The above statement is simply passing a ``Series`` of True/False objects to the DataFrame,
-returning all rows with True.
-
-.. ipython:: python
+    WHERE time = 'Dinner';
 
-    is_dinner = tips["time"] == "Dinner"
-    is_dinner.value_counts()
-    tips[is_dinner].head(5)
+.. include:: includes/filtering.rst
 
 Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and &
 (AND).
diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst
index b3ed9b1ba630f..3125dd814f0e8 100644
--- a/doc/source/getting_started/comparison/comparison_with_stata.rst
+++ b/doc/source/getting_started/comparison/comparison_with_stata.rst
@@ -8,7 +8,7 @@ For potential users coming from `Stata <https://en.wikipedia.org/wiki/Stata>`__
 this page is meant to demonstrate how different Stata operations would be
 performed in pandas.
 
-.. include:: comparison_boilerplate.rst
+.. include:: includes/introduction.rst
 
 .. note::
 
@@ -89,16 +89,7 @@ specifying the column names.
    5 6
    end
 
-A pandas ``DataFrame`` can be constructed in many different ways,
-but for a small number of values, it is often convenient to specify it as
-a Python dictionary, where the keys are the column names
-and the values are the data.
-
-.. ipython:: python
-
-   df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
-   df
-
+.. include:: includes/construct_dataframe.rst
 
 Reading external data
 ~~~~~~~~~~~~~~~~~~~~~
@@ -210,12 +201,7 @@ Filtering in Stata is done with an ``if`` clause on one or more columns.
 
    list if total_bill > 10
 
-DataFrames can be filtered in multiple ways; the most intuitive of which is using
-:ref:`boolean indexing <indexing.boolean>`.
-
-.. ipython:: python
-
-   tips[tips["total_bill"] > 10].head()
+.. include:: includes/filtering.rst
 
 If/then logic
 ~~~~~~~~~~~~~
@@ -227,18 +213,7 @@ In Stata, an ``if`` clause can also be used to create new columns.
    generate bucket = "low" if total_bill < 10
    replace bucket = "high" if total_bill >= 10
 
-The same operation in pandas can be accomplished using
-the ``where`` method from ``numpy``.
-
-.. ipython:: python
-
-   tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
-   tips.head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop("bucket", axis=1)
+.. include:: includes/if_then.rst
 
 Date functionality
 ~~~~~~~~~~~~~~~~~~
@@ -266,28 +241,7 @@ functions, pandas supports other Time Series features
 not available in Stata (such as time zone handling and custom offsets) --
 see the :ref:`timeseries documentation<timeseries>` for more details.
 
-.. ipython:: python
-
-   tips["date1"] = pd.Timestamp("2013-01-15")
-   tips["date2"] = pd.Timestamp("2015-02-15")
-   tips["date1_year"] = tips["date1"].dt.year
-   tips["date2_month"] = tips["date2"].dt.month
-   tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
-   tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
-       "date1"
-   ].dt.to_period("M")
-
-   tips[
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
-   ].head()
-
-.. ipython:: python
-   :suppress:
-
-   tips = tips.drop(
-       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"],
-       axis=1,
-   )
+.. include:: includes/time_date.rst
 
 Selection of columns
 ~~~~~~~~~~~~~~~~~~~~
@@ -327,14 +281,7 @@ Sorting in Stata is accomplished via ``sort``
 
    sort sex total_bill
 
-pandas objects have a :meth:`DataFrame.sort_values` method, which
-takes a list of columns to sort by.
-
-.. ipython:: python
-
-   tips = tips.sort_values(["sex", "total_bill"])
-   tips.head()
-
+.. include:: includes/sorting.rst
 
 String processing
 -----------------
diff --git a/doc/source/getting_started/comparison/includes/construct_dataframe.rst b/doc/source/getting_started/comparison/includes/construct_dataframe.rst
new file mode 100644
index 0000000000000..9abd619346b96
--- /dev/null
+++ b/doc/source/getting_started/comparison/includes/construct_dataframe.rst
@@ -0,0 +1,11 @@
+:orphan:
+
+A pandas ``DataFrame`` can be constructed in many different ways,
+but for a small number of values, it is often convenient to specify it as
+a Python dictionary, where the keys are the column names
+and the values are the data.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]})
+   df
diff --git a/doc/source/getting_started/comparison/includes/filtering.rst b/doc/source/getting_started/comparison/includes/filtering.rst
new file mode 100644
index 0000000000000..bf30bee415f75
--- /dev/null
+++ b/doc/source/getting_started/comparison/includes/filtering.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+DataFrames can be filtered in multiple ways; the most intuitive of which is using
+:ref:`boolean indexing <indexing.boolean>`
+
+.. ipython:: python
+
+   tips[tips["total_bill"] > 10]
+
+The above statement passes a ``Series`` of ``True``/``False`` values to the DataFrame,
+which returns only the rows where the value is ``True``.
+
+.. ipython:: python
+
+    is_dinner = tips["time"] == "Dinner"
+    is_dinner
+    is_dinner.value_counts()
+    tips[is_dinner]
diff --git a/doc/source/getting_started/comparison/includes/if_then.rst b/doc/source/getting_started/comparison/includes/if_then.rst
new file mode 100644
index 0000000000000..02d11df7a9372
--- /dev/null
+++ b/doc/source/getting_started/comparison/includes/if_then.rst
@@ -0,0 +1,14 @@
+:orphan:
+
+The same operation in pandas can be accomplished using
+the ``where`` method from ``numpy``.
+
+.. ipython:: python
+
+   tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
+   tips.head()
+
+.. ipython:: python
+   :suppress:
+
+   tips = tips.drop("bucket", axis=1)
diff --git a/doc/source/getting_started/comparison/comparison_boilerplate.rst b/doc/source/getting_started/comparison/includes/introduction.rst
similarity index 96%
rename from doc/source/getting_started/comparison/comparison_boilerplate.rst
rename to doc/source/getting_started/comparison/includes/introduction.rst
index aedf2875dc452..0f052e1f698dd 100644
--- a/doc/source/getting_started/comparison/comparison_boilerplate.rst
+++ b/doc/source/getting_started/comparison/includes/introduction.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>`
 to familiarize yourself with the library.
 
diff --git a/doc/source/getting_started/comparison/includes/sorting.rst b/doc/source/getting_started/comparison/includes/sorting.rst
new file mode 100644
index 0000000000000..30f82af30428a
--- /dev/null
+++ b/doc/source/getting_started/comparison/includes/sorting.rst
@@ -0,0 +1,9 @@
+:orphan:
+
+pandas objects have a :meth:`DataFrame.sort_values` method, which
+takes a list of columns to sort by.
+
+.. ipython:: python
+
+   tips = tips.sort_values(["sex", "total_bill"])
+   tips.head()
diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst
new file mode 100644
index 0000000000000..cf4e0ac433fd5
--- /dev/null
+++ b/doc/source/getting_started/comparison/includes/time_date.rst
@@ -0,0 +1,24 @@
+:orphan:
+
+.. ipython:: python
+
+   tips["date1"] = pd.Timestamp("2013-01-15")
+   tips["date2"] = pd.Timestamp("2015-02-15")
+   tips["date1_year"] = tips["date1"].dt.year
+   tips["date2_month"] = tips["date2"].dt.month
+   tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin()
+   tips["months_between"] = tips["date2"].dt.to_period("M") - tips[
+       "date1"
+   ].dt.to_period("M")
+
+   tips[
+       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"]
+   ].head()
+
+.. ipython:: python
+   :suppress:
+
+   tips = tips.drop(
+       ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"],
+       axis=1,
+   )

From 6dcd18ff833154e156fa7be80766b171d2285144 Mon Sep 17 00:00:00 2001
From: Aidan Feldman <aidan.feldman@gmail.com>
Date: Wed, 30 Dec 2020 05:56:20 +0000
Subject: [PATCH 2/2] DOC: define tips variable to avoid flake8-rst error

---
 .../getting_started/comparison/includes/filtering.rst      | 7 +++++++
 doc/source/getting_started/comparison/includes/if_then.rst | 7 +++++++
 doc/source/getting_started/comparison/includes/sorting.rst | 7 +++++++
 .../getting_started/comparison/includes/time_date.rst      | 7 +++++++
 4 files changed, 28 insertions(+)

diff --git a/doc/source/getting_started/comparison/includes/filtering.rst b/doc/source/getting_started/comparison/includes/filtering.rst
index bf30bee415f75..19a0a5118f8f6 100644
--- a/doc/source/getting_started/comparison/includes/filtering.rst
+++ b/doc/source/getting_started/comparison/includes/filtering.rst
@@ -3,6 +3,13 @@
 DataFrames can be filtered in multiple ways; the most intuitive of which is using
 :ref:`boolean indexing <indexing.boolean>`
 
+.. ipython:: python
+   :suppress:
+
+   # ensure tips is defined when scanning with flake8-rst
+   if 'tips' not in vars():
+       tips = {}
+
 .. ipython:: python
 
    tips[tips["total_bill"] > 10]
diff --git a/doc/source/getting_started/comparison/includes/if_then.rst b/doc/source/getting_started/comparison/includes/if_then.rst
index 02d11df7a9372..bbe9b3e6c99e8 100644
--- a/doc/source/getting_started/comparison/includes/if_then.rst
+++ b/doc/source/getting_started/comparison/includes/if_then.rst
@@ -3,6 +3,13 @@
 The same operation in pandas can be accomplished using
 the ``where`` method from ``numpy``.
 
+.. ipython:: python
+   :suppress:
+
+   # ensure tips is defined when scanning with flake8-rst
+   if 'tips' not in vars():
+       tips = {}
+
 .. ipython:: python
 
    tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high")
diff --git a/doc/source/getting_started/comparison/includes/sorting.rst b/doc/source/getting_started/comparison/includes/sorting.rst
index 30f82af30428a..179bf866f9f6d 100644
--- a/doc/source/getting_started/comparison/includes/sorting.rst
+++ b/doc/source/getting_started/comparison/includes/sorting.rst
@@ -3,6 +3,13 @@
 pandas objects have a :meth:`DataFrame.sort_values` method, which
 takes a list of columns to sort by.
 
+.. ipython:: python
+   :suppress:
+
+   # ensure tips is defined when scanning with flake8-rst
+   if 'tips' not in vars():
+       tips = {}
+
 .. ipython:: python
 
    tips = tips.sort_values(["sex", "total_bill"])
diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst
index cf4e0ac433fd5..9779eada49dfb 100644
--- a/doc/source/getting_started/comparison/includes/time_date.rst
+++ b/doc/source/getting_started/comparison/includes/time_date.rst
@@ -1,5 +1,12 @@
 :orphan:
 
+.. ipython:: python
+   :suppress:
+
+   # ensure tips is defined when scanning with flake8-rst
+   if 'tips' not in vars():
+       tips = {}
+
 .. ipython:: python
 
    tips["date1"] = pd.Timestamp("2013-01-15")