From dcb8b6a779874663d5cfa8b61d3a2c6896f29a0f Mon Sep 17 00:00:00 2001 From: Vincent La Date: Sun, 11 Nov 2018 16:21:58 -0800 Subject: [PATCH] DOC: Enhancing pivot / reshape docs (#21038) --- doc/source/reshaping.rst | 110 ++++++++++++++++++++++++++++++++++++--- pandas/core/frame.py | 72 ++++++++++++++++--------- 2 files changed, 151 insertions(+), 31 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 6163b6f2ae89a..ff867a2ddfe6d 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -17,6 +17,8 @@ Reshaping and Pivot Tables Reshaping by pivoting DataFrame objects --------------------------------------- +.. image:: _static/reshaping_pivot.png + .. ipython:: :suppress: @@ -33,8 +35,7 @@ Reshaping by pivoting DataFrame objects In [3]: df = unpivot(tm.makeTimeDataFrame()) -Data is often stored in CSV files or databases in so-called "stacked" or -"record" format: +Data is often stored in so-called "stacked" or "record" format: .. ipython:: python @@ -66,8 +67,6 @@ To select out everything for variable ``A`` we could do: df[df['variable'] == 'A'] -.. image:: _static/reshaping_pivot.png - But suppose we wish to do time series operations with the variables. A better representation would be where the ``columns`` are the unique variables and an ``index`` of dates identifies individual observations. To reshape the data into @@ -87,7 +86,7 @@ column: .. ipython:: python df['value2'] = df['value'] * 2 - pivoted = df.pivot('date', 'variable') + pivoted = df.pivot(index='date', columns='variable') pivoted You can then select subsets from the pivoted ``DataFrame``: @@ -99,6 +98,12 @@ You can then select subsets from the pivoted ``DataFrame``: Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. +.. note:: + :func:`~pandas.pivot` will error with a ``ValueError: Index contains duplicate + entries, cannot reshape`` if the index/column pair is not unique. 
In this + case, consider using :func:`~pandas.pivot_table` which is a generalization + of pivot that can handle duplicate values for one index/column pair. + .. _reshaping.stacking: Reshaping by stacking and unstacking @@ -704,10 +709,103 @@ handling of NaN: In [3]: np.unique(x, return_inverse=True)[::-1] Out[3]: (array([3, 3, 0, 4, 1, 2]), array([nan, 3.14, inf, 'A', 'B'], dtype=object)) - .. note:: If you just want to handle one column as a categorical variable (like R's factor), you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or ``df["cat_col"] = df["col"].astype("category")``. For full docs on :class:`~pandas.Categorical`, see the :ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`. + +Examples +-------- + +In this section, we will review frequently asked questions and examples. The +column names and relevant column values are named to correspond with how this +DataFrame will be pivoted in the answers below. + +.. ipython:: python + + np.random.seed([3, 1415]) + n = 20 + + cols = np.array(['key', 'row', 'item', 'col']) + df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str)) + df.columns = cols + df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val')) + + df + +Pivoting with Single Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose we wanted to pivot ``df`` such that the ``col`` values are columns, +``row`` values are the index, and the mean of ``val0`` are the values. In +particular, the resulting DataFrame should look like: + +.. code-block:: ipython + + col col0 col1 col2 col3 col4 + row + row0 0.77 0.605 NaN 0.860 0.65 + row2 0.13 NaN 0.395 0.500 0.25 + row3 NaN 0.310 NaN 0.545 NaN + row4 NaN 0.100 0.395 0.760 0.24 + +This solution uses :func:`~pandas.pivot_table`. Also note that +``aggfunc='mean'`` is the default. It is included here to be explicit. + +.. 
ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean') + +Note that we can also replace the missing values by using the ``fill_value`` +parameter. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='mean', fill_value=0) + +Also note that we can pass in other aggregation functions. For example, +we can pass in ``sum``. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) + +Another aggregation we can do is calculate the frequency with which the columns +and rows occur together, a.k.a. "cross tabulation". To do this, we can pass +``size`` to the ``aggfunc`` parameter. + +.. ipython:: python + + df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') + +Pivoting with Multiple Aggregations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We can also perform multiple aggregations. For example, to perform both a +``sum`` and ``mean``, we can pass in a list to the ``aggfunc`` argument. + +.. ipython:: python + + df.pivot_table( + values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) + +Note that to aggregate over multiple value columns, we can pass in a list to +the ``values`` parameter. + +.. ipython:: python + + df.pivot_table( + values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) + +Note that to subdivide over multiple columns, we can pass in a list to the +``columns`` parameter. + +.. ipython:: python + + df.pivot_table( + values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7153f5c2e7007..f8d153327f135 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5518,50 +5518,72 @@ def pivot(self, index=None, columns=None, values=None): ... "C": ["small", "large", "large", "small", ... "small", "large", "small", "small", ... "large"], - ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) + ... 
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D E + 0 foo one small 1 2 + 1 foo one large 2 4 + 2 foo one large 2 5 + 3 foo two small 3 5 + 4 foo two small 3 6 + 5 bar one large 4 6 + 6 bar one small 5 8 + 7 bar two small 6 9 + 8 bar two large 7 9 + + This first example aggregates values by taking the sum. >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two NaN 6 + + We can also fill missing values using the `fill_value` parameter. >>> table = pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc=np.sum) + ... columns=['C'], aggfunc=np.sum, fill_value=0) >>> table C large small A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 + bar one 4 5 + two 7 6 + foo one 4 1 + two 0 6 + + The next example aggregates by taking the mean across multiple columns. + + >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], + ... aggfunc={'D': np.mean, + ... 'E': np.mean}) + >>> table + D E + mean mean + A C + bar large 5.500000 7.500000 + small 5.500000 8.500000 + foo large 2.000000 4.500000 + small 2.333333 4.333333 + + We can also calculate multiple types of aggregations for any given + value column. >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'], ... aggfunc={'D': np.mean, ... 
'E': [min, max, np.mean]}) >>> table D E - mean max median min + mean max mean min A C - bar large 5.500000 16 14.5 13 - small 5.500000 15 14.5 14 - foo large 2.000000 10 9.5 9 - small 2.333333 12 11.0 8 + bar large 5.500000 9 7.500000 6 + small 5.500000 9 8.500000 8 + foo large 2.000000 5 4.500000 4 + small 2.333333 6 4.333333 2 Returns -------