From cce169a5f4865aa356ae19464c51ede917e91ef4 Mon Sep 17 00:00:00 2001 From: Aidan Feldman Date: Sun, 27 Dec 2020 17:43:04 -0500 Subject: [PATCH 1/2] DOC: create shared includes for content shared by comparison docs This will help ensure consistency between the examples. --- .../comparison/comparison_with_sas.rst | 65 ++----------------- .../comparison_with_spreadsheets.rst | 2 +- .../comparison/comparison_with_sql.rst | 21 +----- .../comparison/comparison_with_stata.rst | 65 ++----------------- .../includes/construct_dataframe.rst | 11 ++++ .../comparison/includes/filtering.rst | 18 +++++ .../comparison/includes/if_then.rst | 14 ++++ .../introduction.rst} | 2 + .../comparison/includes/sorting.rst | 9 +++ .../comparison/includes/time_date.rst | 24 +++++++ 10 files changed, 94 insertions(+), 137 deletions(-) create mode 100644 doc/source/getting_started/comparison/includes/construct_dataframe.rst create mode 100644 doc/source/getting_started/comparison/includes/filtering.rst create mode 100644 doc/source/getting_started/comparison/includes/if_then.rst rename doc/source/getting_started/comparison/{comparison_boilerplate.rst => includes/introduction.rst} (96%) create mode 100644 doc/source/getting_started/comparison/includes/sorting.rst create mode 100644 doc/source/getting_started/comparison/includes/time_date.rst diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index c6f508aae0e21..f23191b6d383b 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -8,7 +8,7 @@ For potential users coming from `SAS ` - -.. ipython:: python - - tips[tips["total_bill"] > 10].head() +.. include:: includes/filtering.rst If/then logic ~~~~~~~~~~~~~ @@ -239,18 +225,7 @@ In SAS, if/then logic can be used to create new columns. 
else bucket = 'high'; run; -The same operation in pandas can be accomplished using -the ``where`` method from ``numpy``. - -.. ipython:: python - - tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") - tips.head() - -.. ipython:: python - :suppress: - - tips = tips.drop("bucket", axis=1) +.. include:: includes/if_then.rst Date functionality ~~~~~~~~~~~~~~~~~~ @@ -278,28 +253,7 @@ functions pandas supports other Time Series features not available in Base SAS (such as resampling and custom offsets) - see the :ref:`timeseries documentation` for more details. -.. ipython:: python - - tips["date1"] = pd.Timestamp("2013-01-15") - tips["date2"] = pd.Timestamp("2015-02-15") - tips["date1_year"] = tips["date1"].dt.year - tips["date2_month"] = tips["date2"].dt.month - tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() - tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ - "date1" - ].dt.to_period("M") - - tips[ - ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] - ].head() - -.. ipython:: python - :suppress: - - tips = tips.drop( - ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], - axis=1, - ) +.. include:: includes/time_date.rst Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -349,14 +303,7 @@ Sorting in SAS is accomplished via ``PROC SORT`` by sex total_bill; run; -pandas objects have a :meth:`~DataFrame.sort_values` method, which -takes a list of columns to sort by. - -.. ipython:: python - - tips = tips.sort_values(["sex", "total_bill"]) - tips.head() - +.. 
include:: includes/sorting.rst String processing ----------------- diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index 73645d429cc66..7b779b02e20f8 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -14,7 +14,7 @@ terminology and link to documentation for Excel, but much will be the same/simil `Apple Numbers `_, and other Excel-compatible spreadsheet software. -.. include:: comparison_boilerplate.rst +.. include:: includes/introduction.rst Data structures --------------- diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 4fe7b7e96cf50..52799442d6118 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -8,7 +8,7 @@ Since many potential pandas users have some familiarity with `SQL `_, this page is meant to provide some examples of how various SQL operations would be performed using pandas. -.. include:: comparison_boilerplate.rst +.. include:: includes/introduction.rst Most of the examples will utilize the ``tips`` dataset found within pandas tests. We'll read the data into a DataFrame called ``tips`` and assume we have a database table of the same name and @@ -65,24 +65,9 @@ Filtering in SQL is done via a WHERE clause. SELECT * FROM tips - WHERE time = 'Dinner' - LIMIT 5; - -DataFrames can be filtered in multiple ways; the most intuitive of which is using -:ref:`boolean indexing ` - -.. ipython:: python - - tips[tips["time"] == "Dinner"].head(5) - -The above statement is simply passing a ``Series`` of True/False objects to the DataFrame, -returning all rows with True. - -.. 
ipython:: python + WHERE time = 'Dinner'; - is_dinner = tips["time"] == "Dinner" - is_dinner.value_counts() - tips[is_dinner].head(5) +.. include:: includes/filtering.rst Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and & (AND). diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index b3ed9b1ba630f..3125dd814f0e8 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -8,7 +8,7 @@ For potential users coming from `Stata `__ this page is meant to demonstrate how different Stata operations would be performed in pandas. -.. include:: comparison_boilerplate.rst +.. include:: includes/introduction.rst .. note:: @@ -89,16 +89,7 @@ specifying the column names. 5 6 end -A pandas ``DataFrame`` can be constructed in many different ways, -but for a small number of values, it is often convenient to specify it as -a Python dictionary, where the keys are the column names -and the values are the data. - -.. ipython:: python - - df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) - df - +.. include:: includes/construct_dataframe.rst Reading external data ~~~~~~~~~~~~~~~~~~~~~ @@ -210,12 +201,7 @@ Filtering in Stata is done with an ``if`` clause on one or more columns. list if total_bill > 10 -DataFrames can be filtered in multiple ways; the most intuitive of which is using -:ref:`boolean indexing `. - -.. ipython:: python - - tips[tips["total_bill"] > 10].head() +.. include:: includes/filtering.rst If/then logic ~~~~~~~~~~~~~ @@ -227,18 +213,7 @@ In Stata, an ``if`` clause can also be used to create new columns. generate bucket = "low" if total_bill < 10 replace bucket = "high" if total_bill >= 10 -The same operation in pandas can be accomplished using -the ``where`` method from ``numpy``. - -.. 
ipython:: python - - tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") - tips.head() - -.. ipython:: python - :suppress: - - tips = tips.drop("bucket", axis=1) +.. include:: includes/if_then.rst Date functionality ~~~~~~~~~~~~~~~~~~ @@ -266,28 +241,7 @@ functions, pandas supports other Time Series features not available in Stata (such as time zone handling and custom offsets) -- see the :ref:`timeseries documentation` for more details. -.. ipython:: python - - tips["date1"] = pd.Timestamp("2013-01-15") - tips["date2"] = pd.Timestamp("2015-02-15") - tips["date1_year"] = tips["date1"].dt.year - tips["date2_month"] = tips["date2"].dt.month - tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() - tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ - "date1" - ].dt.to_period("M") - - tips[ - ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] - ].head() - -.. ipython:: python - :suppress: - - tips = tips.drop( - ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], - axis=1, - ) +.. include:: includes/time_date.rst Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -327,14 +281,7 @@ Sorting in Stata is accomplished via ``sort`` sort sex total_bill -pandas objects have a :meth:`DataFrame.sort_values` method, which -takes a list of columns to sort by. - -.. ipython:: python - - tips = tips.sort_values(["sex", "total_bill"]) - tips.head() - +.. 
include:: includes/sorting.rst String processing ----------------- diff --git a/doc/source/getting_started/comparison/includes/construct_dataframe.rst b/doc/source/getting_started/comparison/includes/construct_dataframe.rst new file mode 100644 index 0000000000000..9abd619346b96 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/construct_dataframe.rst @@ -0,0 +1,11 @@ +:orphan: + +A pandas ``DataFrame`` can be constructed in many different ways, +but for a small number of values, it is often convenient to specify it as +a Python dictionary, where the keys are the column names +and the values are the data. + +.. ipython:: python + + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) + df diff --git a/doc/source/getting_started/comparison/includes/filtering.rst b/doc/source/getting_started/comparison/includes/filtering.rst new file mode 100644 index 0000000000000..bf30bee415f75 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/filtering.rst @@ -0,0 +1,18 @@ +:orphan: + +DataFrames can be filtered in multiple ways; the most intuitive of which is using +:ref:`boolean indexing <indexing.boolean>` + +.. ipython:: python + + tips[tips["total_bill"] > 10] + +The above statement is simply passing a ``Series`` of ``True``/``False`` objects to the DataFrame, +returning all rows with ``True``. + +.. ipython:: python + + is_dinner = tips["time"] == "Dinner" + is_dinner + is_dinner.value_counts() + tips[is_dinner] diff --git a/doc/source/getting_started/comparison/includes/if_then.rst b/doc/source/getting_started/comparison/includes/if_then.rst new file mode 100644 index 0000000000000..02d11df7a9372 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/if_then.rst @@ -0,0 +1,14 @@ +:orphan: + +The same operation in pandas can be accomplished using +the ``where`` method from ``numpy``. + +.. ipython:: python + + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") + tips.head() + +.. 
ipython:: python + :suppress: + + tips = tips.drop("bucket", axis=1) diff --git a/doc/source/getting_started/comparison/comparison_boilerplate.rst b/doc/source/getting_started/comparison/includes/introduction.rst similarity index 96% rename from doc/source/getting_started/comparison/comparison_boilerplate.rst rename to doc/source/getting_started/comparison/includes/introduction.rst index aedf2875dc452..0f052e1f698dd 100644 --- a/doc/source/getting_started/comparison/comparison_boilerplate.rst +++ b/doc/source/getting_started/comparison/includes/introduction.rst @@ -1,3 +1,5 @@ +:orphan: + If you're new to pandas, you might want to first read through :ref:`10 Minutes to pandas<10min>` to familiarize yourself with the library. diff --git a/doc/source/getting_started/comparison/includes/sorting.rst b/doc/source/getting_started/comparison/includes/sorting.rst new file mode 100644 index 0000000000000..30f82af30428a --- /dev/null +++ b/doc/source/getting_started/comparison/includes/sorting.rst @@ -0,0 +1,9 @@ +:orphan: + +pandas objects have a :meth:`DataFrame.sort_values` method, which +takes a list of columns to sort by. + +.. ipython:: python + + tips = tips.sort_values(["sex", "total_bill"]) + tips.head() diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst new file mode 100644 index 0000000000000..cf4e0ac433fd5 --- /dev/null +++ b/doc/source/getting_started/comparison/includes/time_date.rst @@ -0,0 +1,24 @@ +:orphan: + +.. 
ipython:: python + + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") + + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ].head() + +.. ipython:: python + :suppress: + + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) From 6dcd18ff833154e156fa7be80766b171d2285144 Mon Sep 17 00:00:00 2001 From: Aidan Feldman Date: Wed, 30 Dec 2020 05:56:20 +0000 Subject: [PATCH 2/2] DOC: define tips variable to avoid flake8-rst error --- .../getting_started/comparison/includes/filtering.rst | 7 +++++++ doc/source/getting_started/comparison/includes/if_then.rst | 7 +++++++ doc/source/getting_started/comparison/includes/sorting.rst | 7 +++++++ .../getting_started/comparison/includes/time_date.rst | 7 +++++++ 4 files changed, 28 insertions(+) diff --git a/doc/source/getting_started/comparison/includes/filtering.rst b/doc/source/getting_started/comparison/includes/filtering.rst index bf30bee415f75..19a0a5118f8f6 100644 --- a/doc/source/getting_started/comparison/includes/filtering.rst +++ b/doc/source/getting_started/comparison/includes/filtering.rst @@ -3,6 +3,13 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is using :ref:`boolean indexing <indexing.boolean>` +.. ipython:: python + :suppress: + + # ensure tips is defined when scanning with flake8-rst + if 'tips' not in vars(): + tips = {} + .. 
ipython:: python tips[tips["total_bill"] > 10] diff --git a/doc/source/getting_started/comparison/includes/if_then.rst b/doc/source/getting_started/comparison/includes/if_then.rst index 02d11df7a9372..bbe9b3e6c99e8 100644 --- a/doc/source/getting_started/comparison/includes/if_then.rst +++ b/doc/source/getting_started/comparison/includes/if_then.rst @@ -3,6 +3,13 @@ The same operation in pandas can be accomplished using the ``where`` method from ``numpy``. +.. ipython:: python + :suppress: + + # ensure tips is defined when scanning with flake8-rst + if 'tips' not in vars(): + tips = {} + .. ipython:: python tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") diff --git a/doc/source/getting_started/comparison/includes/sorting.rst b/doc/source/getting_started/comparison/includes/sorting.rst index 30f82af30428a..179bf866f9f6d 100644 --- a/doc/source/getting_started/comparison/includes/sorting.rst +++ b/doc/source/getting_started/comparison/includes/sorting.rst @@ -3,6 +3,13 @@ pandas objects have a :meth:`DataFrame.sort_values` method, which takes a list of columns to sort by. +.. ipython:: python + :suppress: + + # ensure tips is defined when scanning with flake8-rst + if 'tips' not in vars(): + tips = {} + .. ipython:: python tips = tips.sort_values(["sex", "total_bill"]) diff --git a/doc/source/getting_started/comparison/includes/time_date.rst b/doc/source/getting_started/comparison/includes/time_date.rst index cf4e0ac433fd5..9779eada49dfb 100644 --- a/doc/source/getting_started/comparison/includes/time_date.rst +++ b/doc/source/getting_started/comparison/includes/time_date.rst @@ -1,5 +1,12 @@ :orphan: +.. ipython:: python + :suppress: + + # ensure tips is defined when scanning with flake8-rst + if 'tips' not in vars(): + tips = {} + .. ipython:: python tips["date1"] = pd.Timestamp("2013-01-15")