
Commit

Merge branch 'pandas-dev:main' into methods_test_drop_duplicates_fixture_docs
ivonastojanovic authored Jul 17, 2024
2 parents 4bc6e84 + 288af5f commit 27f6cfb
Showing 43 changed files with 398 additions and 244 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/unit-tests.yml
@@ -388,9 +388,8 @@ jobs:
- name: Run Tests
uses: ./.github/actions/run-tests
env:
PYTHON_GIL: 0

# NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
emscripten:
# Note: the Python version and Emscripten toolchain version are determined
# by the Pyodide version. The appropriate versions can be found in the
8 changes: 8 additions & 0 deletions .github/workflows/wheels.yml
@@ -100,6 +100,13 @@ jobs:
- [windows-2022, win_amd64]
# TODO: support PyPy?
python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]

# Build Pyodide wheels and upload them to Anaconda.org
# NOTE: this job is similar to the one in unit-tests.yml except for the fact
# that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup.
include:
- buildplat: [ubuntu-22.04, pyodide_wasm32]
python: ["cp312", "3.12"]
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -146,6 +153,7 @@ jobs:
env:
CIBW_PRERELEASE_PYTHONS: True
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }}

- name: Set up Python
uses: mamba-org/setup-micromamba@v1
12 changes: 1 addition & 11 deletions ci/code_checks.sh
@@ -70,15 +70,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
--format=actions \
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-i "pandas.DataFrame.max RT03" \
-i "pandas.DataFrame.mean RT03" \
-i "pandas.DataFrame.median RT03" \
-i "pandas.DataFrame.min RT03" \
-i "pandas.DataFrame.plot PR02" \
-i "pandas.Grouper PR02" \
-i "pandas.MultiIndex.append PR07,SA01" \
-i "pandas.MultiIndex.copy PR07,RT03,SA01" \
-i "pandas.MultiIndex.drop PR07,RT03,SA01" \
-i "pandas.MultiIndex.get_level_values SA01" \
-i "pandas.MultiIndex.get_loc PR07" \
-i "pandas.MultiIndex.get_loc_level PR07" \
@@ -160,13 +153,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.lt SA01" \
-i "pandas.Series.ne SA01" \
-i "pandas.Series.pad PR01,SA01" \
-i "pandas.Series.plot PR02" \
-i "pandas.Series.pop SA01" \
-i "pandas.Series.prod RT03" \
-i "pandas.Series.product RT03" \
-i "pandas.Series.reorder_levels RT03,SA01" \
-i "pandas.Series.sem PR01,RT03,SA01" \
-i "pandas.Series.skew RT03,SA01" \
-i "pandas.Series.skew SA01" \
-i "pandas.Series.sparse PR01,SA01" \
-i "pandas.Series.sparse.density SA01" \
-i "pandas.Series.sparse.fill_value SA01" \
@@ -314,11 +306,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.api.types.is_period_dtype SA01" \
-i "pandas.api.types.is_re PR07,SA01" \
-i "pandas.api.types.is_re_compilable PR07,SA01" \
-i "pandas.api.types.is_signed_integer_dtype SA01" \
-i "pandas.api.types.is_sparse SA01" \
-i "pandas.api.types.is_string_dtype SA01" \
-i "pandas.api.types.is_timedelta64_ns_dtype SA01" \
-i "pandas.api.types.is_unsigned_integer_dtype SA01" \
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
-i "pandas.api.types.union_categoricals RT03,SA01" \
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
6 changes: 4 additions & 2 deletions doc/source/getting_started/intro_tutorials/04_plotting.rst
@@ -32,8 +32,10 @@ How do I create plots in pandas?
air_quality.head()
.. note::
The usage of the ``index_col`` and ``parse_dates`` parameters of the ``read_csv`` function to define the first (0th) column as
index of the resulting ``DataFrame`` and convert the dates in the column to :class:`Timestamp` objects, respectively.
The ``index_col=0`` and ``parse_dates=True`` parameters passed to the ``read_csv`` function define
the first (0th) column as index of the resulting ``DataFrame`` and convert the dates in the column
to :class:`Timestamp` objects, respectively.


.. raw:: html

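A minimal sketch of the pattern the revised note describes, assuming the tutorial's example CSV path:

    import pandas as pd

    # index_col=0 promotes the first (0th) column to the index;
    # parse_dates=True converts those index values to Timestamp objects.
    air_quality = pd.read_csv(
        "data/air_quality_no2.csv", index_col=0, parse_dates=True
    )
    print(air_quality.index.dtype)  # datetime64[ns]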
9 changes: 7 additions & 2 deletions doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limit of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
- :meth:`Styler.set_tooltips` provides an alternative method for storing tooltips, using the ``title`` attribute of ``<td>`` elements (:issue:`56981`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via the ``pat`` parameter (:issue:`51748`)
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
@@ -502,11 +503,13 @@ Datetimelike
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
- Bug in :meth:`DataFrame.agg` with a DataFrame containing missing values resulting in an ``IndexError`` (:issue:`58810`)
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
- Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`)
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow-backed :class:`Series` (:issue:`59154`)
- Bug in :meth:`to_datetime` not respecting ``dayfirst`` when an uncommon date string was passed (:issue:`58859`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

Timedelta
@@ -556,6 +559,7 @@ MultiIndex
- :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
- :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
- Bug in :func:`MultiIndex.get_level_values` not carrying the frequency attribute along when accessing a :class:`DatetimeIndex` level (:issue:`58327`, :issue:`57949`)
-

I/O
@@ -581,9 +585,9 @@ Period

Plotting
^^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one (:issue:`57587`)
-
- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)

Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
@@ -615,6 +619,7 @@ ExtensionArray
^^^^^^^^^^^^^^
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning an incorrect dtype when the result was null (:issue:`59234`)

Styler
^^^^^^
5 changes: 5 additions & 0 deletions meson.build
@@ -44,6 +44,11 @@ else
meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
endif

cy = meson.get_compiler('cython')
if cy.version().version_compare('>=3.1.0')
add_project_arguments('-Xfreethreading_compatible=true', language : 'cython')
endif

# Needed by pandas.test() when it looks for the pytest ini options
py.install_sources(
'pyproject.toml',
4 changes: 4 additions & 0 deletions pandas/_libs/src/vendored/ujson/python/ujson.c
@@ -384,6 +384,10 @@ PyMODINIT_FUNC PyInit_json(void) {
return NULL;
}

#ifdef Py_GIL_DISABLED
PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
#endif

#ifndef PYPY_VERSION
PyObject *mod_decimal = PyImport_ImportModule("decimal");
if (mod_decimal) {
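As a rough check of the intent here (an assumption on my part, not something this commit adds): on a free-threaded CPython 3.13 build, an extension module that declares Py_MOD_GIL_NOT_USED no longer forces the GIL back on when imported:

    import sys

    import pandas  # loads pandas._libs.json, built from ujson.c

    # sys._is_gil_enabled() is only meaningful on 3.13+ builds.
    if hasattr(sys, "_is_gil_enabled"):
        print(sys._is_gil_enabled())  # expected: False on 3.13t, unless PYTHON_GIL=1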
65 changes: 35 additions & 30 deletions pandas/_libs/tslibs/conversion.pyx
@@ -606,37 +606,42 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz,
# equiv: datetime.today().replace(tzinfo=tz)
return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us)
else:
string_to_dts_failed = string_to_dts(
ts, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
)
if not string_to_dts_failed:
reso = get_supported_reso(out_bestunit)
check_dts_bounds(&dts, reso)
obj = _TSObject()
obj.dts = dts
obj.creso = reso
ival = npy_datetimestruct_to_datetime(reso, &dts)

if out_local == 1:
obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
obj.value = tz_localize_to_utc_single(
ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso
)
if tz is None:
check_overflows(obj, reso)
return obj
_adjust_tsobject_tz_using_offset(obj, tz)
return obj
else:
if tz is not None:
# shift for _localize_tso
ival = tz_localize_to_utc_single(
ival, tz, ambiguous="raise", nonexistent=None, creso=reso
if not dayfirst: # GH 58859
string_to_dts_failed = string_to_dts(
ts, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
)
if not string_to_dts_failed:
reso = get_supported_reso(out_bestunit)
check_dts_bounds(&dts, reso)
obj = _TSObject()
obj.dts = dts
obj.creso = reso
ival = npy_datetimestruct_to_datetime(reso, &dts)

if out_local == 1:
obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
obj.value = tz_localize_to_utc_single(
ival,
obj.tzinfo,
ambiguous="raise",
nonexistent=None,
creso=reso,
)
obj.value = ival
maybe_localize_tso(obj, tz, obj.creso)
return obj
if tz is None:
check_overflows(obj, reso)
return obj
_adjust_tsobject_tz_using_offset(obj, tz)
return obj
else:
if tz is not None:
# shift for _localize_tso
ival = tz_localize_to_utc_single(
ival, tz, ambiguous="raise", nonexistent=None, creso=reso
)
obj.value = ival
maybe_localize_tso(obj, tz, obj.creso)
return obj

dt = parse_datetime_string(
ts,
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/offsets.pyx
@@ -4676,6 +4676,7 @@ prefix_mapping = {
Hour, # 'h'
Day, # 'D'
WeekOfMonth, # 'WOM'
LastWeekOfMonth, # 'LWOM'
FY5253,
FY5253Quarter,
]
@@ -4894,7 +4895,7 @@ cpdef to_offset(freq, bint is_period=False):
f"\'{name}\' is deprecated and will be removed "
f"in a future version, please use "
f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' "
f" instead.",
f"instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
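With LastWeekOfMonth registered in prefix_mapping, "LWOM"-prefixed frequency strings now parse (:issue:`59218`). A small sketch:

    from pandas.tseries.frequencies import to_offset
    from pandas.tseries.offsets import LastWeekOfMonth

    offset = to_offset("LWOM-SAT")  # previously raised ValueError
    assert isinstance(offset, LastWeekOfMonth)
    print(offset.freqstr)  # LWOM-SAT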
49 changes: 25 additions & 24 deletions pandas/_libs/tslibs/parsing.pyx
@@ -377,32 +377,33 @@ def parse_datetime_string_with_reso(
raise ValueError(f'Given date string "{date_string}" not likely a datetime')

# Try iso8601 first, as it handles nanoseconds
string_to_dts_failed = string_to_dts(
date_string, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
)
if not string_to_dts_failed:
# Match Timestamp and drop picoseconds, femtoseconds, attoseconds
# The new resolution will just be nano
# GH#50417
if out_bestunit in _timestamp_units:
out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns

if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
# TODO: avoid circular import
from pandas import Timestamp
parsed = Timestamp(date_string)
else:
if out_local:
tz = timezone(timedelta(minutes=out_tzoffset))
if not dayfirst: # GH 58859
string_to_dts_failed = string_to_dts(
date_string, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
)
if not string_to_dts_failed:
# Match Timestamp and drop picoseconds, femtoseconds, attoseconds
# The new resolution will just be nano
# GH#50417
if out_bestunit in _timestamp_units:
out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns

if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
# TODO: avoid circular import
from pandas import Timestamp
parsed = Timestamp(date_string)
else:
tz = None
parsed = datetime_new(
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
)
if out_local:
tz = timezone(timedelta(minutes=out_tzoffset))
else:
tz = None
parsed = datetime_new(
dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
)

reso = npy_unit_to_attrname[out_bestunit]
return parsed, reso
reso = npy_unit_to_attrname[out_bestunit]
return parsed, reso

parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
if parsed is not None:
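Both this hunk and the matching one in conversion.pyx skip the ISO8601 fast path whenever dayfirst is set, so the day-first interpretation is honored for ambiguous strings (:issue:`58859`). A hedged sketch of the user-facing effect:

    import pandas as pd

    # With dayfirst=True, "01-02-2000" parses as 1 February; before the fix
    # the ISO fast path could ignore dayfirst for some string formats.
    ts = pd.to_datetime("01-02-2000", dayfirst=True)
    print(ts)  # Timestamp('2000-02-01 00:00:00')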
3 changes: 3 additions & 0 deletions pandas/conftest.py
@@ -951,6 +951,9 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
]
)
def ea_scalar_and_dtype(request):
"""
Fixture that tests each scalar and datetime type.
"""
return request.param


9 changes: 6 additions & 3 deletions pandas/core/apply.py
@@ -90,16 +90,19 @@ def frame_apply(
kwargs=None,
) -> FrameApply:
"""construct and return a row or column based frame apply object"""
_, func, columns, _ = reconstruct_func(func, **kwargs)

axis = obj._get_axis_number(axis)
klass: type[FrameApply]
if axis == 0:
klass = FrameRowApply
elif axis == 1:
if columns:
raise NotImplementedError(
f"Named aggregation is not supported when {axis=}."
)
klass = FrameColumnApply

_, func, _, _ = reconstruct_func(func, **kwargs)
assert func is not None

return klass(
obj,
func,
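Calling reconstruct_func before dispatching on axis lets frame_apply detect relabeling kwargs and reject them for axis=1 (:issue:`58807`). A quick sketch of the resulting behavior:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    df.agg(total=("a", "sum"))  # named aggregation along axis=0 still works
    try:
        df.agg(total=("a", "sum"), axis=1)
    except NotImplementedError as err:
        print(err)  # Named aggregation is not supported when axis=1.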
2 changes: 0 additions & 2 deletions pandas/core/arrays/arrow/array.py
@@ -1706,8 +1706,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
if name == "median":
# GH 52679: Use quantile instead of approximate_median; returns array
result = result[0]
if pc.is_null(result).as_py():
return result

if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
result = result.cast(pa_type)
5 changes: 4 additions & 1 deletion pandas/core/arrays/base.py
@@ -1986,7 +1986,10 @@ def _reduce(
)
result = meth(skipna=skipna, **kwargs)
if keepdims:
result = np.array([result])
if name in ["min", "max"]:
result = self._from_sequence([result], dtype=self.dtype)
else:
result = np.array([result])

return result

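Together with the arrow/array.py change above, min/max results are now wrapped back into the extension type when keepdims=True, so all-null reductions keep their dtype (:issue:`59234`). A hedged sketch, assuming pyarrow is installed:

    import pandas as pd

    df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="timestamp[ns][pyarrow]")})

    # Before the fix an all-null reduction could come back null-typed;
    # the temporal dtype should now survive the reduction.
    print(df.min().iloc[0])  # NaT
    print(df.min().dtype)    # timestamp[ns][pyarrow] (expected with the fix)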
22 changes: 22 additions & 0 deletions pandas/core/arrays/datetimelike.py
@@ -65,6 +65,7 @@
ScalarIndexer,
Self,
SequenceIndexer,
TakeIndexer,
TimeAmbiguous,
TimeNonexistent,
npt,
@@ -2340,6 +2341,27 @@ def interpolate(
return self
return type(self)._simple_new(out_data, dtype=self.dtype)

def take(
self,
indices: TakeIndexer,
*,
allow_fill: bool = False,
fill_value: Any = None,
axis: AxisInt = 0,
) -> Self:
result = super().take(
indices=indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis
)

indices = np.asarray(indices, dtype=np.intp)
maybe_slice = lib.maybe_indices_to_slice(indices, len(self))

if isinstance(maybe_slice, slice):
freq = self._get_getitem_freq(maybe_slice)
result._freq = freq

return result

# --------------------------------------------------------------
# Unsorted

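The new take override re-derives freq when the indices are slice-like, which is what lets MultiIndex.get_level_values keep the frequency of a datetime level (:issue:`58327`, :issue:`57949`). A small sketch:

    import pandas as pd

    dti = pd.date_range("2024-01-01", periods=3, freq="D")
    midx = pd.MultiIndex.from_arrays([dti, [1, 2, 3]])

    level = midx.get_level_values(0)
    print(level.freq)  # <Day> (previously None)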