
Commit

Merge branch 'main' into depr-fastpath
jbrockmendel committed Oct 1, 2023
2 parents cae2dd9 + 6f0cd8d commit 9518a42
Showing 193 changed files with 4,131 additions and 3,037 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/package-checks.yml
@@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output_formatting", "clipboard", "compression", "consortium-standard", "all"]
extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"]
fail-fast: false
name: Install Extras - ${{ matrix.extra }}
concurrency:
4 changes: 3 additions & 1 deletion .github/workflows/unit-tests.yml
@@ -230,11 +230,13 @@ jobs:
git -c user.email="you@example.com" merge --no-commit my_ref_name
fi
- name: Build environment and Run Tests
# https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
run: |
/opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
python -m pip install --no-cache-dir --no-build-isolation -e .
python -m pip list --no-cache-dir
export PANDAS_CI=1
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
@@ -138,7 +138,7 @@ jobs:
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

- name: Build wheels
uses: pypa/cibuildwheel@v2.15.0
uses: pypa/cibuildwheel@v2.16.0
with:
package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
env:
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -20,11 +20,11 @@ ci:
repos:
- repo: https://github.com/hauntsaninja/black-pre-commit-mirror
# black compiled with mypyc
rev: 23.7.0
rev: 23.9.1
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.0.291
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -107,7 +107,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
rev: v3.10.1
rev: v3.13.0
hooks:
- id: pyupgrade
args: [--py39-plus]
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/groupby.py
@@ -841,6 +841,23 @@ def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()


class SumTimeDelta:
# GH 20660
def setup(self):
N = 10**4
self.df = DataFrame(
np.random.randint(1000, 100000, (N, 100)),
index=np.random.randint(200, size=(N,)),
).astype("timedelta64[ns]")
self.df_int = self.df.copy().astype("int64")

def time_groupby_sum_timedelta(self):
self.df.groupby(lambda x: x).sum()

def time_groupby_sum_int(self):
self.df_int.groupby(lambda x: x).sum()


class Transform:
def setup(self):
n1 = 400
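Outside the diff, a minimal sketch of what the new ``SumTimeDelta`` benchmark exercises — ``groupby(...).sum()`` on a ``timedelta64[ns]`` frame versus the same data held as ``int64`` (sizes mirror the benchmark; variable names are illustrative):

import numpy as np
import pandas as pd

N = 10_000
values = np.random.randint(1_000, 100_000, (N, 100))
keys = np.random.randint(200, size=N)

df_td = pd.DataFrame(values, index=keys).astype("timedelta64[ns]")
df_int = pd.DataFrame(values, index=keys)

# Group rows by index value, as the benchmark does with ``lambda x: x``;
# GH 20660 (referenced above) compares the timedelta path against the int path.
df_td.groupby(level=0).sum()
df_int.groupby(level=0).sum()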
2 changes: 1 addition & 1 deletion ci/deps/actions-310.yaml
@@ -34,7 -34,7 @@ dependencies:
- gcsfs>=2022.05.0
- jinja2>=3.1.2
- lxml>=4.8.0
- matplotlib>=3.6.1
- matplotlib>=3.6.1, <3.8
- numba>=0.55.2
- numexpr>=2.8.0
- odfpy>=1.4.1
2 changes: 1 addition & 1 deletion ci/deps/actions-311-downstream_compat.yaml
@@ -35,7 +35,7 @@ dependencies:
- gcsfs>=2022.05.0
- jinja2>=3.1.2
- lxml>=4.8.0
- matplotlib>=3.6.1
- matplotlib>=3.6.1, <3.8
- numba>=0.55.2
- numexpr>=2.8.0
- odfpy>=1.4.1
2 changes: 1 addition & 1 deletion ci/deps/actions-311.yaml
@@ -34,7 +34,7 @@ dependencies:
- gcsfs>=2022.05.0
- jinja2>=3.1.2
- lxml>=4.8.0
- matplotlib>=3.6.1
- matplotlib>=3.6.1, <3.8
- numba>=0.55.2
- numexpr>=2.8.0
- odfpy>=1.4.1
2 changes: 1 addition & 1 deletion ci/deps/actions-39.yaml
@@ -34,7 +34,7 @@ dependencies:
- gcsfs>=2022.05.0
- jinja2>=3.1.2
- lxml>=4.8.0
- matplotlib>=3.6.1
- matplotlib>=3.6.1, <3.8
- numba>=0.55.2
- numexpr>=2.8.0
- odfpy>=1.4.1
2 changes: 1 addition & 1 deletion ci/deps/circle-310-arm64.yaml
@@ -34,7 +34,7 @@ dependencies:
- gcsfs>=2022.05.0
- jinja2>=3.1.2
- lxml>=4.8.0
- matplotlib>=3.6.1
- matplotlib>=3.6.1, <3.8
# test_numba_vs_cython segfaults with numba 0.57
- numba>=0.55.2, <0.57.0
- numexpr>=2.8.0
12 changes: 9 additions & 3 deletions doc/make.py
@@ -123,14 +123,14 @@ def _sphinx_build(self, kind: str):
Parameters
----------
kind : {'html', 'latex'}
kind : {'html', 'latex', 'linkcheck'}
Examples
--------
>>> DocBuilder(num_jobs=4)._sphinx_build('html')
"""
if kind not in ("html", "latex"):
raise ValueError(f"kind must be html or latex, not {kind}")
if kind not in ("html", "latex", "linkcheck"):
raise ValueError(f"kind must be html, latex or linkcheck, not {kind}")

cmd = ["sphinx-build", "-b", kind]
if self.num_jobs:
@@ -288,6 +288,12 @@ def zip_html(self):
os.chdir(dirname)
self._run_os("zip", zip_fname, "-r", "-q", *fnames)

def linkcheck(self):
"""
Check for broken links in the documentation.
"""
return self._sphinx_build("linkcheck")


def main():
cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
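Outside the diff, a usage sketch: since ``main()`` exposes every public ``DocBuilder`` method as a CLI command (see the ``cmds`` line above), the new ``linkcheck`` builder should be reachable as ``python make.py linkcheck`` from the ``doc`` directory. The programmatic call below mirrors the ``DocBuilder(num_jobs=4)`` docstring example and is an assumption, not part of the change.

# Run from the repository's doc/ directory.
from make import DocBuilder  # make.py lives alongside the docs

DocBuilder(num_jobs=4).linkcheck()  # wraps `sphinx-build -b linkcheck`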
2 changes: 1 addition & 1 deletion doc/source/conf.py
@@ -162,7 +162,7 @@
# General information about the project.
project = "pandas"
# We have our custom "pandas_footer.html" template, using copyright for the current year
copyright = f"{datetime.now().year}"
copyright = f"{datetime.now().year},"

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
6 changes: 3 additions & 3 deletions doc/source/getting_started/install.rst
@@ -247,14 +247,14 @@ Dependency Minimum Version pip ext
Visualization
^^^^^^^^^^^^^

Installable with ``pip install "pandas[plot, output_formatting]"``.
Installable with ``pip install "pandas[plot, output-formatting]"``.

========================= ================== ================== =============================================================
Dependency Minimum Version pip extra Notes
========================= ================== ================== =============================================================
matplotlib 3.6.1 plot Plotting library
Jinja2 3.1.2 output_formatting Conditional formatting with DataFrame.style
tabulate 0.8.10 output_formatting Printing in Markdown-friendly format (see `tabulate`_)
Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style
tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_)
========================= ================== ================== =============================================================

Computation
@@ -295,7 +295,7 @@ Aggregate the current hourly time series values to the monthly maximum value in

.. ipython:: python
monthly_max = no_2.resample("M").max()
monthly_max = no_2.resample("ME").max()
monthly_max
A very powerful method on time series data with a datetime index, is the
13 changes: 0 additions & 13 deletions doc/source/reference/arrays.rst
@@ -134,11 +134,6 @@ is the missing value for datetime data.

Timestamp

.. autosummary::
:toctree: api/

NaT

Properties
~~~~~~~~~~
.. autosummary::
@@ -257,11 +252,6 @@ is the missing value for timedelta data.

Timedelta

.. autosummary::
:toctree: api/

NaT

Properties
~~~~~~~~~~
.. autosummary::
@@ -465,7 +455,6 @@ pandas provides this through :class:`arrays.IntegerArray`.
UInt16Dtype
UInt32Dtype
UInt64Dtype
NA

.. _api.arrays.float_na:

@@ -484,7 +473,6 @@ Nullable float

Float32Dtype
Float64Dtype
NA

.. _api.arrays.categorical:

@@ -621,7 +609,6 @@ with a bool :class:`numpy.ndarray`.
:template: autosummary/class_without_autosummary.rst

BooleanDtype
NA


.. Dtype attributes which are manually listed in their docstrings: including
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
@@ -34,6 +34,7 @@ objects.
api.extensions.ExtensionArray._accumulate
api.extensions.ExtensionArray._concat_same_type
api.extensions.ExtensionArray._explode
api.extensions.ExtensionArray._formatter
api.extensions.ExtensionArray._from_factorized
api.extensions.ExtensionArray._from_sequence
1 change: 1 addition & 0 deletions doc/source/reference/index.rst
@@ -53,6 +53,7 @@ are mentioned in the documentation.
options
extensions
testing
missing_value

.. This is to prevent warnings in the doc build. We don't want to encourage
.. these methods.
24 changes: 24 additions & 0 deletions doc/source/reference/missing_value.rst
@@ -0,0 +1,24 @@
{{ header }}

.. _api.missing_value:

==============
Missing values
==============
.. currentmodule:: pandas

NA is the way to represent missing values for nullable dtypes (see below):

.. autosummary::
:toctree: api/
:template: autosummary/class_without_autosummary.rst

NA

NaT is the missing value for timedelta and datetime data (see below):

.. autosummary::
:toctree: api/
:template: autosummary/class_without_autosummary.rst

NaT
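Outside the diff, a minimal sketch of the two sentinels this new page documents (the series contents are illustrative):

import pandas as pd

s = pd.Series([1, None], dtype="Int64")              # nullable integer dtype
s.iloc[1] is pd.NA                                   # True: NA marks the missing value

t = pd.to_datetime(pd.Series(["2023-01-01", None]))  # datetime64[ns]
t.iloc[1]                                            # NaT: missing value for datetime data
pd.isna(t.iloc[1])                                   # True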
12 changes: 6 additions & 6 deletions doc/source/user_guide/cookbook.rst
@@ -771,7 +771,7 @@ To create year and month cross tabulation:
df = pd.DataFrame(
{"value": np.random.randn(36)},
index=pd.date_range("2011-01-01", freq="M", periods=36),
index=pd.date_range("2011-01-01", freq="ME", periods=36),
)
pd.pivot_table(
@@ -794,12 +794,12 @@ Apply
index=["I", "II", "III"],
)
def make_df(ser):
new_vals = [pd.Series(value, name=name) for name, value in ser.items()]
return pd.DataFrame(new_vals)
df_orgz = pd.concat({ind: row.pipe(make_df) for ind, row in df.iterrows()})
def SeriesFromSubList(aList):
return pd.Series(aList)
df_orgz = pd.concat(
{ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}
)
df_orgz
`Rolling apply with a DataFrame returning a Series
19 changes: 16 additions & 3 deletions doc/source/user_guide/groupby.rst
@@ -1213,6 +1213,19 @@ The dimension of the returned result can also change:
grouped.apply(f)
``apply`` on a Series can operate on a returned value from the applied function
that is itself a series, and possibly upcast the result to a DataFrame:

.. ipython:: python
def f(x):
return pd.Series([x, x ** 2], index=["x", "x^2"])
s = pd.Series(np.random.rand(5))
s
s.apply(f)
Similar to :ref:`groupby.aggregate.agg`, the resulting dtype will reflect that of the
apply function. If the results from different groups have different dtypes, then
a common dtype will be determined in the same way as ``DataFrame`` construction.
@@ -1403,7 +1416,7 @@ Groupby a specific column with the desired frequency. This is like resampling.

.. ipython:: python
df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum()
When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an
instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification
@@ -1413,9 +1426,9 @@ in that you have a named index and a column that could be potential groupers.
df = df.set_index("Date")
df["Date"] = df.index + pd.offsets.MonthEnd(2)
df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum()
df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum()
Taking the first rows of each group
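Outside the diff, a minimal sketch of the frequency-alias change these hunks reflect, assuming pandas 2.2 or later where ``ME`` (month end) replaces the older ``M`` spelling (the sample data is illustrative):

import pandas as pd

df = pd.DataFrame(
    {
        "Date": pd.date_range("2013-10-01", periods=6, freq="10D"),
        "Buyer": ["Carl", "Mark", "Carl", "Joe", "Joe", "Carl"],
        "Quantity": [18, 3, 5, 1, 9, 3],
    }
)
# "1ME" groups on calendar month end, matching the updated examples above.
df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum()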
4 changes: 2 additions & 2 deletions doc/source/user_guide/indexing.rst
@@ -1837,7 +1837,7 @@ This however is operating on a copy and will not work.
:okwarning:
:okexcept:
with option_context('mode.chained_assignment','warn'):
with pd.option_context('mode.chained_assignment','warn'):
dfb[dfb['a'].str.startswith('o')]['c'] = 42
A chained assignment can also crop up in setting in a mixed dtype frame.
@@ -1879,7 +1879,7 @@ Last, the subsequent example will **not** work at all, and so should be avoided:
:okwarning:
:okexcept:
with option_context('mode.chained_assignment','raise'):
with pd.option_context('mode.chained_assignment','raise'):
dfd.loc[0]['a'] = 1111
.. warning::
2 changes: 1 addition & 1 deletion doc/source/user_guide/reshaping.rst
@@ -136,7 +136,7 @@ Also, you can use :class:`Grouper` for ``index`` and ``columns`` keywords. For d

.. ipython:: python
pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C")
pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C")
.. _reshaping.pivot.margins:

(Diffs for the remaining changed files are not shown.)