diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 982877ee7f3654..a9585c17454fba 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -388,9 +388,8 @@ jobs:
       - name: Run Tests
         uses: ./.github/actions/run-tests
-        env:
-          PYTHON_GIL: 0
 
+  # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
   emscripten:
     # Note: the Python version, Emscripten toolchain version are determined
     # by the Pyodide version. The appropriate versions can be found in the
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index f61ef550f74df8..02100648b636a5 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -100,6 +100,13 @@ jobs:
         - [windows-2022, win_amd64]
         # TODO: support PyPy?
         python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]]
+
+        # Build Pyodide wheels and upload them to Anaconda.org
+        # NOTE: this job is similar to the one in unit-tests.yml except that it
+        # uses cibuildwheel instead of a standard Pyodide xbuildenv setup.
+        include:
+          - buildplat: [ubuntu-22.04, pyodide_wasm32]
+            python: ["cp312", "3.12"]
     env:
       IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
       IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -146,6 +153,7 @@ jobs:
         env:
           CIBW_PRERELEASE_PYTHONS: True
           CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
+          CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }}
 
       - name: Set up Python
         uses: mamba-org/setup-micromamba@v1
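Note: GitHub Actions expressions have no ternary operator, so the `CIBW_PLATFORM` line above uses the `&& ... || ...` idiom to pick a platform per matrix entry. A rough Python equivalent of that selection logic (the `buildplat` name mirrors `matrix.buildplat` and is illustrative only):

    # Sketch of: matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto'
    buildplat = ["ubuntu-22.04", "pyodide_wasm32"]
    cibw_platform = "pyodide" if buildplat[1] == "pyodide_wasm32" else "auto"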
"pandas.api.types.is_unsigned_integer_dtype SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.api.types.union_categoricals RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index 49cf7d32e0ef54..e9f83c602d086a 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -32,8 +32,10 @@ How do I create plots in pandas? air_quality.head() .. note:: - The usage of the ``index_col`` and ``parse_dates`` parameters of the ``read_csv`` function to define the first (0th) column as - index of the resulting ``DataFrame`` and convert the dates in the column to :class:`Timestamp` objects, respectively. + The ``index_col=0`` and ``parse_dates=True`` parameters passed to the ``read_csv`` function define + the first (0th) column as index of the resulting ``DataFrame`` and convert the dates in the column + to :class:`Timestamp` objects, respectively. + .. raw:: html diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd917924880f13..ba6636cb42b6cd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -32,6 +32,7 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) +- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) @@ -502,11 +503,13 @@ Datetimelike - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) +- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) - Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) +- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index cd917924880f13..ba6636cb42b6cd 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -32,6 +32,7 @@ Other enhancements
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
 - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
+- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`)
 - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
 - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
@@ -502,11 +503,13 @@ Datetimelike
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
+- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`)
 - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
+- Bug in :meth:`to_datetime` not respecting ``dayfirst`` if an uncommon date string was passed (:issue:`58859`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
 
 Timedelta
@@ -556,6 +559,7 @@ MultiIndex
 - :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
 - :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
 - :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
+- :meth:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` would not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
 -
 
 I/O
@@ -581,9 +585,9 @@ Period
 
 Plotting
 ^^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
+- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
 - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
--
+- Bug in :meth:`Series.plot` with ``kind="pie"`` and :class:`ArrowDtype` (:issue:`59192`)
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -615,6 +619,7 @@ ExtensionArray
 ^^^^^^^^^^^^^^
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
+- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
 
 Styler
 ^^^^^^
diff --git a/meson.build b/meson.build
index 06623a305ab544..efe543b7a267c2 100644
--- a/meson.build
+++ b/meson.build
@@ -44,6 +44,11 @@ else
     meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
 endif
 
+cy = meson.get_compiler('cython')
+if cy.version().version_compare('>=3.1.0')
+    add_project_arguments('-Xfreethreading_compatible=true', language : 'cython')
+endif
+
 # Needed by pandas.test() when it looks for the pytest ini options
 py.install_sources(
     'pyproject.toml',
diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c
index 075411a23b0757..f369d122a3dbe3 100644
--- a/pandas/_libs/src/vendored/ujson/python/ujson.c
+++ b/pandas/_libs/src/vendored/ujson/python/ujson.c
@@ -384,6 +384,10 @@ PyMODINIT_FUNC PyInit_json(void) {
     return NULL;
   }
 
+#ifdef Py_GIL_DISABLED
+  PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED);
+#endif
+
 #ifndef PYPY_VERSION
   PyObject *mod_decimal = PyImport_ImportModule("decimal");
   if (mod_decimal) {
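Note: the meson.build and ujson.c hunks are both free-threading related: Cython >= 3.1 can mark the modules it generates as free-threading compatible, and `PyUnstable_Module_SetGIL(module, Py_MOD_GIL_NOT_USED)` declares the same for the hand-written C module. A way to check the effect on a free-threaded CPython 3.13 build (sketch; `sys._is_gil_enabled()` is a 3.13 addition, and the expected output is my assumption, not a verified result):

    import sys

    import pandas  # noqa: F401  # an extension module without the marker re-enables the GIL

    # If every loaded extension module declares GIL-free support, the
    # interpreter should keep the GIL disabled after the import.
    print(sys._is_gil_enabled())  # expected: False on a python3.13t build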
&dts) - - if out_local == 1: - obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) - obj.value = tz_localize_to_utc_single( - ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso - ) - if tz is None: - check_overflows(obj, reso) - return obj - _adjust_tsobject_tz_using_offset(obj, tz) - return obj - else: - if tz is not None: - # shift for _localize_tso - ival = tz_localize_to_utc_single( - ival, tz, ambiguous="raise", nonexistent=None, creso=reso + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + ts, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + reso = get_supported_reso(out_bestunit) + check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + + if out_local == 1: + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, + obj.tzinfo, + ambiguous="raise", + nonexistent=None, + creso=reso, ) - obj.value = ival - maybe_localize_tso(obj, tz, obj.creso) - return obj + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj + else: + if tz is not None: + # shift for _localize_tso + ival = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=reso + ) + obj.value = ival + maybe_localize_tso(obj, tz, obj.creso) + return obj dt = parse_datetime_string( ts, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0afeb002a81516..db35cc0c93237f 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4676,6 +4676,7 @@ prefix_mapping = { Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' + LastWeekOfMonth, # 'LWOM' FY5253, FY5253Quarter, ] @@ -4894,7 +4895,7 @@ cpdef to_offset(freq, bint is_period=False): f"\'{name}\' is deprecated and will be removed " f"in a future version, please use " f"\'{c_PERIOD_AND_OFFSET_DEPR_FREQSTR.get(name)}\' " - f" instead.", + f"instead.", FutureWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 35d2433a707a0d..308183402198d0 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,32 +377,33 @@ def parse_datetime_string_with_reso( raise ValueError(f'Given date string "{date_string}" not likely a datetime') # Try iso8601 first, as it handles nanoseconds - string_to_dts_failed = string_to_dts( - date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - # Match Timestamp and drop picoseconds, femtoseconds, attoseconds - # The new resolution will just be nano - # GH#50417 - if out_bestunit in _timestamp_units: - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns - - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # TODO: avoid circular import - from pandas import Timestamp - parsed = Timestamp(date_string) - else: - if out_local: - tz = timezone(timedelta(minutes=out_tzoffset)) + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + date_string, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + # Match Timestamp and drop picoseconds, femtoseconds, attoseconds + # The new resolution will just be nano + # GH#50417 + if out_bestunit in _timestamp_units: + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns + + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # TODO: avoid circular import + from pandas import Timestamp + parsed = 
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 35d2433a707a0d..308183402198d0 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -377,32 +377,33 @@ def parse_datetime_string_with_reso(
         raise ValueError(f'Given date string "{date_string}" not likely a datetime')
 
     # Try iso8601 first, as it handles nanoseconds
-    string_to_dts_failed = string_to_dts(
-        date_string, &dts, &out_bestunit, &out_local,
-        &out_tzoffset, False
-    )
-    if not string_to_dts_failed:
-        # Match Timestamp and drop picoseconds, femtoseconds, attoseconds
-        # The new resolution will just be nano
-        # GH#50417
-        if out_bestunit in _timestamp_units:
-            out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
-
-        if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
-            # TODO: avoid circular import
-            from pandas import Timestamp
-            parsed = Timestamp(date_string)
-        else:
-            if out_local:
-                tz = timezone(timedelta(minutes=out_tzoffset))
-            else:
-                tz = None
-            parsed = datetime_new(
-                dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
-            )
-
-        reso = npy_unit_to_attrname[out_bestunit]
-        return parsed, reso
+    if not dayfirst:  # GH 58859
+        string_to_dts_failed = string_to_dts(
+            date_string, &dts, &out_bestunit, &out_local,
+            &out_tzoffset, False
+        )
+        if not string_to_dts_failed:
+            # Match Timestamp and drop picoseconds, femtoseconds, attoseconds
+            # The new resolution will just be nano
+            # GH#50417
+            if out_bestunit in _timestamp_units:
+                out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
+
+            if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
+                # TODO: avoid circular import
+                from pandas import Timestamp
+                parsed = Timestamp(date_string)
+            else:
+                if out_local:
+                    tz = timezone(timedelta(minutes=out_tzoffset))
+                else:
+                    tz = None
+                parsed = datetime_new(
+                    dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
+                )
+
+            reso = npy_unit_to_attrname[out_bestunit]
+            return parsed, reso
 
     parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
     if parsed is not None:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 70e729dfb98a4c..5e0dfd7ee644d1 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -951,6 +951,9 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
     ]
 )
 def ea_scalar_and_dtype(request):
+    """
+    Fixture returning each extension-array scalar paired with its dtype.
+    """
     return request.param
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 607a65598783f5..d024afa570a1ef 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -90,16 +90,19 @@ def frame_apply(
     kwargs=None,
 ) -> FrameApply:
     """construct and return a row or column based frame apply object"""
+    _, func, columns, _ = reconstruct_func(func, **kwargs)
+
     axis = obj._get_axis_number(axis)
     klass: type[FrameApply]
     if axis == 0:
         klass = FrameRowApply
     elif axis == 1:
+        if columns:
+            raise NotImplementedError(
+                f"Named aggregation is not supported when {axis=}."
+            )
         klass = FrameColumnApply
 
-    _, func, _, _ = reconstruct_func(func, **kwargs)
-    assert func is not None
-
     return klass(
         obj,
         func,
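Note: the `frame_apply` change rejects named aggregation on the column axis up front. A minimal sketch of what now raises, mirroring the new test further down in this diff:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df.agg(x=("a", "sum"))        # still fine: relabeling along axis=0
    df.agg(x=(0, "sum"), axis=1)  # now raises NotImplementedError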
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 943656ba484325..5da479760047f6 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -1706,8 +1706,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         if name == "median":
             # GH 52679: Use quantile instead of approximate_median; returns array
             result = result[0]
-        if pc.is_null(result).as_py():
-            return result
 
         if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
             result = result.cast(pa_type)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 1e8fec7fde3de6..b429b7c1b1fc47 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -1986,7 +1986,10 @@ def _reduce(
             )
         result = meth(skipna=skipna, **kwargs)
         if keepdims:
-            result = np.array([result])
+            if name in ["min", "max"]:
+                result = self._from_sequence([result], dtype=self.dtype)
+            else:
+                result = np.array([result])
 
         return result
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index c90ff410b4b93f..ad0bde3abbdd4b 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -65,6 +65,7 @@
     ScalarIndexer,
     Self,
     SequenceIndexer,
+    TakeIndexer,
     TimeAmbiguous,
     TimeNonexistent,
     npt,
@@ -2340,6 +2341,27 @@ def interpolate(
             return self
         return type(self)._simple_new(out_data, dtype=self.dtype)
 
+    def take(
+        self,
+        indices: TakeIndexer,
+        *,
+        allow_fill: bool = False,
+        fill_value: Any = None,
+        axis: AxisInt = 0,
+    ) -> Self:
+        result = super().take(
+            indices=indices, allow_fill=allow_fill, fill_value=fill_value, axis=axis
+        )
+
+        indices = np.asarray(indices, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+
+        if isinstance(maybe_slice, slice):
+            freq = self._get_getitem_freq(maybe_slice)
+            result._freq = freq
+
+        return result
+
     # --------------------------------------------------------------
     # Unsorted
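Note: the new `take` override is what lets `MultiIndex.get_level_values` keep `freq` (GH 58327 / GH 57949): when the take indices collapse to a slice, the existing getitem-freq logic is reused. The user-visible effect, per the new test later in this diff:

    import pandas as pd

    dti = pd.date_range("2018-01-01", periods=4, freq="YS")
    mi = pd.MultiIndex.from_product([dti, ["A"]])
    assert mi.get_level_values(0).freq == dti.freq  # was None before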
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 34d25f04b69e15..dddfc440109d37 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -2275,6 +2275,19 @@ def to_julian_date(self) -> npt.NDArray[np.float64]:
     # -----------------------------------------------------------------
     # Reductions
 
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if keepdims and isinstance(result, np.ndarray):
+            if name == "std":
+                from pandas.core.arrays import TimedeltaArray
+
+                return TimedeltaArray._from_sequence(result)
+            else:
+                return self._from_sequence(result, dtype=self.dtype)
+        return result
+
     def std(
         self,
         axis=None,
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 93471788e72ab1..92ed690e527c77 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1198,7 +1198,7 @@ def _wrap_na_result(self, *, name, axis, mask_size):
         mask = np.ones(mask_size, dtype=bool)
 
         float_dtyp = "float32" if self.dtype == "Float32" else "float64"
-        if name in ["mean", "median", "var", "std", "skew", "kurt"]:
+        if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]:
             np_dtype = float_dtyp
         elif name in ["min", "max"] or self.dtype.itemsize == 8:
             np_dtype = self.dtype.numpy_dtype.name
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index e762c3e5478199..b3513dd083e41f 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -956,6 +956,17 @@ def _check_timedeltalike_freq_compat(self, other):
             delta = delta.view("i8")
             return lib.item_from_zerodim(delta)
 
+    # ------------------------------------------------------------------
+    # Reductions
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+        if keepdims and isinstance(result, np.ndarray):
+            return self._from_sequence(result, dtype=self.dtype)
+        return result
+
 
 def raise_on_incompatible(left, right) -> IncompatibleFrequency:
     """
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 291cc2e62be628..13c26f0c979349 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -522,10 +522,19 @@ def astype(self, dtype, copy: bool = True):
         return super().astype(dtype, copy)
 
     def _reduce(
-        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
+        self,
+        name: str,
+        *,
+        skipna: bool = True,
+        keepdims: bool = False,
+        axis: AxisInt | None = 0,
+        **kwargs,
     ):
         if name in ["min", "max"]:
-            return getattr(self, name)(skipna=skipna, axis=axis)
+            result = getattr(self, name)(skipna=skipna, axis=axis)
+            if keepdims:
+                return self._from_sequence([result], dtype=self.dtype)
+            return result
 
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
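Note: taken together, the `_reduce` changes in base.py, datetimes.py, period.py and string_.py make `keepdims=True` reductions return a length-1 array of the original dtype rather than a plain object ndarray, so block-wise DataFrame reductions can keep extension dtypes. An illustrative sketch (the preserved dtype is the point; exact reprs unverified):

    import pandas as pd

    df = pd.DataFrame({"s": pd.array(["a", "b"], dtype="string")})
    # min/max now round-trip through _from_sequence, so the result Series
    # should carry the string dtype instead of object.
    print(df.min().dtype)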
This can be useful for refining the structure of a + MultiIndex to fit specific requirements. + Parameters ---------- codes : array-like Must be a list of tuples when ``level`` is not specified. level : int or level name, default None + Level from which the labels will be dropped. errors : str, default 'raise' + If 'ignore', suppress error and existing labels are dropped. Returns ------- MultiIndex + A new MultiIndex with the specified labels removed. + + See Also + -------- + MultiIndex.remove_unused_levels : Create new MultiIndex from current that + removes unused levels. + MultiIndex.reorder_levels : Rearrange levels using input order. + MultiIndex.rename : Rename levels in a MultiIndex. Examples -------- diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 38a443b56ee3d4..5725b96f66cd42 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -49,6 +49,8 @@ for more details. A passed user-defined-function will be passed a Series for evaluation. + +If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. {examples}""" _shared_docs["compare"] = """ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index e8faea76897c60..719afe160614f2 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -274,46 +274,34 @@ def _make_index( self, data, alldata, columns, indexnamerow: list[Scalar] | None = None ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]: index: Index | None - if not is_index_col(self.index_col) or not self.index_col: - index = None + if isinstance(self.index_col, list) and len(self.index_col): + to_remove = [] + indexes = [] + for idx in self.index_col: + if isinstance(idx, str): + raise ValueError(f"Index {idx} invalid") + to_remove.append(idx) + indexes.append(alldata[idx]) + # remove index items from content and columns, don't pop in + # loop + for i in sorted(to_remove, reverse=True): + alldata.pop(i) + if not self._implicit_index: + columns.pop(i) + index = self._agg_index(indexes) + + # add names for the index + if indexnamerow: + coffset = len(indexnamerow) - len(columns) + index = index.set_names(indexnamerow[:coffset]) else: - simple_index = self._get_simple_index(alldata, columns) - index = self._agg_index(simple_index) - - # add names for the index - if indexnamerow: - coffset = len(indexnamerow) - len(columns) - assert index is not None - index = index.set_names(indexnamerow[:coffset]) + index = None # maybe create a mi on the columns columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns - @final - def _get_simple_index(self, data, columns): - def ix(col): - if not isinstance(col, str): - return col - raise ValueError(f"Index {col} invalid") - - to_remove = [] - index = [] - for idx in self.index_col: - i = ix(idx) - to_remove.append(i) - index.append(data[i]) - - # remove index items from content and columns, don't pop in - # loop - for i in sorted(to_remove, reverse=True): - data.pop(i) - if not self._implicit_index: - columns.pop(i) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -333,12 +321,13 @@ def _clean_mapping(self, mapping): return clean @final - def _agg_index(self, index, try_parse_dates: bool = True) -> Index: + def _agg_index(self, index) -> Index: arrays = [] converters = self._clean_mapping(self.converters) + clean_dtypes = self._clean_mapping(self.dtype) for i, arr in enumerate(index): - if try_parse_dates and 
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 38a443b56ee3d4..5725b96f66cd42 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -49,6 +49,8 @@
 for more details.
 
 A passed user-defined-function will be passed a Series for evaluation.
+
+If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``.
 {examples}"""
 
 _shared_docs["compare"] = """
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index e8faea76897c60..719afe160614f2 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -274,46 +274,34 @@ def _make_index(
         self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
     ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
         index: Index | None
-        if not is_index_col(self.index_col) or not self.index_col:
-            index = None
+        if isinstance(self.index_col, list) and len(self.index_col):
+            to_remove = []
+            indexes = []
+            for idx in self.index_col:
+                if isinstance(idx, str):
+                    raise ValueError(f"Index {idx} invalid")
+                to_remove.append(idx)
+                indexes.append(alldata[idx])
+            # remove index items from content and columns, don't pop in
+            # loop
+            for i in sorted(to_remove, reverse=True):
+                alldata.pop(i)
+                if not self._implicit_index:
+                    columns.pop(i)
+            index = self._agg_index(indexes)
+
+            # add names for the index
+            if indexnamerow:
+                coffset = len(indexnamerow) - len(columns)
+                index = index.set_names(indexnamerow[:coffset])
         else:
-            simple_index = self._get_simple_index(alldata, columns)
-            index = self._agg_index(simple_index)
-
-            # add names for the index
-            if indexnamerow:
-                coffset = len(indexnamerow) - len(columns)
-                assert index is not None
-                index = index.set_names(indexnamerow[:coffset])
+            index = None
 
         # maybe create a mi on the columns
         columns = self._maybe_make_multi_index_columns(columns, self.col_names)
 
         return index, columns
 
-    @final
-    def _get_simple_index(self, data, columns):
-        def ix(col):
-            if not isinstance(col, str):
-                return col
-            raise ValueError(f"Index {col} invalid")
-
-        to_remove = []
-        index = []
-        for idx in self.index_col:
-            i = ix(idx)
-            to_remove.append(i)
-            index.append(data[i])
-
-        # remove index items from content and columns, don't pop in
-        # loop
-        for i in sorted(to_remove, reverse=True):
-            data.pop(i)
-            if not self._implicit_index:
-                columns.pop(i)
-
-        return index
-
     @final
     def _clean_mapping(self, mapping):
         """converts col numbers to names"""
@@ -333,12 +321,13 @@ def _clean_mapping(self, mapping):
         return clean
 
     @final
-    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
+    def _agg_index(self, index) -> Index:
         arrays = []
         converters = self._clean_mapping(self.converters)
+        clean_dtypes = self._clean_mapping(self.dtype)
 
         for i, arr in enumerate(index):
-            if try_parse_dates and self._should_parse_dates(i):
+            if self._should_parse_dates(i):
                 arr = date_converter(
                     arr,
                     col=self.index_names[i] if self.index_names is not None else None,
@@ -364,8 +353,6 @@ def _agg_index(self, index) -> Index:
             else:
                 col_na_values, col_na_fvalues = set(), set()
 
-            clean_dtypes = self._clean_mapping(self.dtype)
-
             cast_type = None
             index_converter = False
             if self.index_names is not None:
@@ -632,35 +619,6 @@ def _check_data_length(
                 stacklevel=find_stack_level(),
             )
 
-    @overload
-    def _evaluate_usecols(
-        self,
-        usecols: Callable[[Hashable], object],
-        names: Iterable[Hashable],
-    ) -> set[int]: ...
-
-    @overload
-    def _evaluate_usecols(
-        self, usecols: SequenceT, names: Iterable[Hashable]
-    ) -> SequenceT: ...
-
-    @final
-    def _evaluate_usecols(
-        self,
-        usecols: Callable[[Hashable], object] | SequenceT,
-        names: Iterable[Hashable],
-    ) -> SequenceT | set[int]:
-        """
-        Check whether or not the 'usecols' parameter
-        is a callable. If so, enumerates the 'names'
-        parameter and returns a set of indices for
-        each entry in 'names' that evaluates to True.
-        If not a callable, returns 'usecols'.
-        """
-        if callable(usecols):
-            return {i for i, name in enumerate(names) if usecols(name)}
-        return usecols
-
     @final
     def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
         """
@@ -988,3 +946,32 @@ def _validate_usecols_arg(usecols):
         return usecols, usecols_dtype
 
     return usecols, None
+
+
+@overload
+def evaluate_callable_usecols(
+    usecols: Callable[[Hashable], object],
+    names: Iterable[Hashable],
+) -> set[int]: ...
+
+
+@overload
+def evaluate_callable_usecols(
+    usecols: SequenceT, names: Iterable[Hashable]
+) -> SequenceT: ...
+
+
+def evaluate_callable_usecols(
+    usecols: Callable[[Hashable], object] | SequenceT,
+    names: Iterable[Hashable],
+) -> SequenceT | set[int]:
+    """
+    Check whether or not the 'usecols' parameter
+    is a callable. If so, enumerates the 'names'
+    parameter and returns a set of indices for
+    each entry in 'names' that evaluates to True.
+    If not a callable, returns 'usecols'.
+    """
+    if callable(usecols):
+        return {i for i, name in enumerate(names) if usecols(name)}
+    return usecols
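Note: `evaluate_callable_usecols` is now a module-level helper shared by both parser back ends. Its callable branch backs calls like (sketch):

    from io import StringIO

    import pandas as pd

    csv = StringIO("a,b,c\n1,2,3\n")
    # The callable is evaluated against each header name; only columns for
    # which it returns True are kept.
    df = pd.read_csv(csv, usecols=lambda name: name in {"a", "c"})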
+ """ + if callable(usecols): + return {i for i, name in enumerate(names) if usecols(name)} + return usecols diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index b59a778624c49e..f4198ac2a14438 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -31,6 +31,7 @@ ParserBase, ParserError, date_converter, + evaluate_callable_usecols, is_index_col, validate_parse_dates_presence, ) @@ -133,7 +134,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.orig_names = self.names[:] # type: ignore[has-type] if self.usecols: - usecols = self._evaluate_usecols(self.usecols, self.orig_names) + usecols = evaluate_callable_usecols(self.usecols, self.orig_names) # GH 14671 # assert for mypy, orig_names is List or None, None would error in issubset @@ -256,8 +257,7 @@ def read( columns, self.col_names ) - if self.usecols is not None: - columns = self._filter_usecols(columns) + columns = _filter_usecols(self.usecols, columns) col_dict = {k: v for k, v in col_dict.items() if k in columns} @@ -290,13 +290,21 @@ def read( else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, try_parse_dates=True) + if self._should_parse_dates(i): + values = date_converter( + values, + col=self.index_names[i] + if self.index_names is not None + else None, + dayfirst=self.dayfirst, + cache_dates=self.cache_dates, + date_format=self.date_format, + ) arrays.append(values) index = ensure_index_from_sequences(arrays) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) @@ -320,8 +328,7 @@ def read( names = list(self.orig_names) names = dedup_names(names, is_potential_multi_index(names, self.index_col)) - if self.usecols is not None: - names = self._filter_usecols(names) + names = _filter_usecols(self.usecols, names) # columns as list alldata = [x[1] for x in data_tups] @@ -335,25 +342,13 @@ def read( return index, column_names, date_data - def _filter_usecols(self, names: SequenceT) -> SequenceT | list[Hashable]: - # hackish - usecols = self._evaluate_usecols(self.usecols, names) - if usecols is not None and len(names) != len(usecols): - return [ - name for i, name in enumerate(names) if i in usecols or name in usecols - ] - return names - - def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True): - if try_parse_dates and self._should_parse_dates(index): - values = date_converter( - values, - col=self.index_names[index] if self.index_names is not None else None, - dayfirst=self.dayfirst, - cache_dates=self.cache_dates, - date_format=self.date_format, - ) - return values + +def _filter_usecols(usecols, names: SequenceT) -> SequenceT | list[Hashable]: + # hackish + usecols = evaluate_callable_usecols(usecols, names) + if usecols is not None and len(names) != len(usecols): + return [name for i, name in enumerate(names) if i in usecols or name in usecols] + return names def _concatenate_chunks( diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 05fe963e9b2b7e..c445529a6db484 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -59,6 +59,7 @@ ) from pandas.io.parsers.base_parser import ( ParserBase, + evaluate_callable_usecols, get_na_values, parser_defaults, validate_parse_dates_presence, @@ -127,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 05fe963e9b2b7e..c445529a6db484 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -59,6 +59,7 @@
 )
 from pandas.io.parsers.base_parser import (
     ParserBase,
+    evaluate_callable_usecols,
     get_na_values,
     parser_defaults,
     validate_parse_dates_presence,
 )
@@ -127,9 +128,8 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
         self.quoting = kwds["quoting"]
         self.skip_blank_lines = kwds["skip_blank_lines"]
 
-        self.has_index_names = False
-        if "has_index_names" in kwds:
-            self.has_index_names = kwds["has_index_names"]
+        # Passed from read_excel
+        self.has_index_names = kwds.get("has_index_names", False)
 
         self.thousands = kwds["thousands"]
         self.decimal = kwds["decimal"]
@@ -299,9 +299,10 @@ def read(
             return index, conv_columns, col_dict
 
         # handle new style for names in index
-        count_empty_content_vals = count_empty_vals(content[0])
         indexnamerow = None
-        if self.has_index_names and count_empty_content_vals == len(columns):
+        if self.has_index_names and sum(
+            int(v == "" or v is None) for v in content[0]
+        ) == len(columns):
             indexnamerow = content[0]
             content = content[1:]
@@ -605,7 +606,7 @@ def _infer_columns(
                     # serve as the 'line' for parsing
                     if have_mi_columns and hr > 0:
                         if clear_buffer:
-                            self._clear_buffer()
+                            self.buf.clear()
                         columns.append([None] * len(columns[-1]))
                         return columns, num_original_columns, unnamed_cols
@@ -687,7 +688,7 @@ def _infer_columns(
             num_original_columns = len(this_columns)
 
         if clear_buffer:
-            self._clear_buffer()
+            self.buf.clear()
 
         first_line: list[Scalar] | None
         if names is not None:
@@ -774,7 +775,7 @@ def _handle_usecols(
         col_indices: set[int] | list[int]
 
         if self.usecols is not None:
             if callable(self.usecols):
-                col_indices = self._evaluate_usecols(self.usecols, usecols_key)
+                col_indices = evaluate_callable_usecols(self.usecols, usecols_key)
             elif any(isinstance(u, str) for u in self.usecols):
                 if len(columns) > 1:
                     raise ValueError(
@@ -1094,9 +1095,6 @@ def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
             lines=lines, search=self.decimal, replace="."
         )
 
-    def _clear_buffer(self) -> None:
-        self.buf = []
-
     def _get_index_name(
         self,
     ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
@@ -1526,10 +1524,6 @@ def _remove_empty_lines(self, lines: list[list[T]]) -> list[list[T]]:
         ]
 
 
-def count_empty_vals(vals) -> int:
-    return sum(1 for v in vals if v == "" or v is None)
-
-
 def _validate_skipfooter_arg(skipfooter: int) -> int:
     """
     Validate the 'skipfooter' parameter.
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 61c44e58b643a7..17df98f0266567 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -652,6 +652,9 @@ class PlotAccessor(PandasObject):
     ----------
     data : Series or DataFrame
         The object for which the method is called.
+
+    Attributes
+    ----------
     x : label or position, default None
         Only used if data is a DataFrame.
     y : label, position or list of label, positions, default None
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 8b108346160d67..fb7d785a94bc43 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -55,7 +55,6 @@
 from pandas.core.dtypes.missing import isna
 
 import pandas.core.common as com
-from pandas.core.frame import DataFrame
 from pandas.util.version import Version
 
 from pandas.io.formats.printing import pprint_thing
@@ -94,6 +93,7 @@
     )
 
     from pandas import (
+        DataFrame,
         Index,
         Series,
     )
@@ -183,7 +183,7 @@ def __init__(
         # Assign the rest of columns into self.columns if by is explicitly defined
         # while column is not, only need `columns` in hist/box plot when it's DF
         # TODO: Might deprecate `column` argument in future PR (#28373)
-        if isinstance(data, DataFrame):
+        if isinstance(data, ABCDataFrame):
             if column:
                 self.columns = com.maybe_make_list(column)
             elif self.by is None:
@@ -2035,9 +2035,12 @@ def _kind(self) -> Literal["pie"]:
 
     _layout_type = "horizontal"
 
-    def __init__(self, data, kind=None, **kwargs) -> None:
+    def __init__(self, data: Series | DataFrame, kind=None, **kwargs) -> None:
         data = data.fillna(value=0)
-        if (data < 0).any().any():
+        lt_zero = data < 0
+        if isinstance(data, ABCDataFrame) and lt_zero.any().any():
+            raise ValueError(f"{self._kind} plot doesn't allow negative values")
+        elif isinstance(data, ABCSeries) and lt_zero.any():
             raise ValueError(f"{self._kind} plot doesn't allow negative values")
         MPLPlot.__init__(self, data, kind=kind, **kwargs)
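Note: my reading of the `PiePlot.__init__` change is that splitting the Series and DataFrame cases avoids chaining `.any().any()` on a Series, which broke for pyarrow-backed data (GH 59192). The new test later in this diff reduces to roughly (requires pyarrow and matplotlib):

    import pandas as pd

    ser = pd.Series([1, 2, 3, 4], dtype="int32[pyarrow]")
    ax = ser.plot.pie()  # previously raised; now draws the pie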
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 939997f44c1a95..78c52d3ddfbdfe 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -1330,6 +1330,14 @@ def test_agg_reduce(axis, float_frame):
     tm.assert_frame_equal(result, expected)
 
 
+def test_named_agg_reduce_axis1_raises(float_frame):
+    name1, name2 = float_frame.axes[0].unique()[:2].sort_values()
+    msg = "Named aggregation is not supported when axis=1."
+    for axis in [1, "columns"]:
+        with pytest.raises(NotImplementedError, match=msg):
+            float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis)
+
+
 def test_nuiscance_columns():
     # GH 15015
     df = DataFrame(
diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py
index c3a6daee2dd545..4b3431d938f96a 100644
--- a/pandas/tests/extension/base/reduce.py
+++ b/pandas/tests/extension/base/reduce.py
@@ -4,7 +4,6 @@
 
 import pandas as pd
 import pandas._testing as tm
-from pandas.api.types import is_numeric_dtype
 
 
 class BaseReduceTests:
@@ -57,7 +56,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
         arr = ser.array
         df = pd.DataFrame({"a": arr})
 
-        kwargs = {"ddof": 1} if op_name in ["var", "std"] else {}
+        kwargs = {"ddof": 1} if op_name in ["var", "std", "sem"] else {}
 
         cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna)
 
@@ -119,10 +118,8 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
     def test_reduce_frame(self, data, all_numeric_reductions, skipna):
         op_name = all_numeric_reductions
         ser = pd.Series(data)
-        if not is_numeric_dtype(ser.dtype):
-            pytest.skip(f"{ser.dtype} is not numeric dtype")
 
-        if op_name in ["count", "kurt", "sem"]:
+        if op_name == "count":
             pytest.skip(f"{op_name} not an array method")
 
         if not self._supports_reduction(ser, op_name):
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 6f18761f771380..070feb1fec4b91 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -72,6 +72,8 @@ def _get_expected_exception(
         return None
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if op_name in ["kurt", "sem"]:
+            return False
         return True
 
     def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
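Note: adding `"sem"` to the float-result bucket in masked.py, plus these test updates, lines up with behavior like (sketch; result unverified):

    import pandas as pd

    df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64")})
    # sem joins mean/std/var in the float bucket, so the all-NA wrapping
    # path also produces Float64 rather than the input dtype.
    print(df.sem().dtype)  # expected: Float64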
             return False
 
+        if pa.types.is_boolean(pa_dtype) and op_name in [
+            "median",
+            "std",
+            "var",
+            "skew",
+            "kurt",
+            "sem",
+        ]:
+            return False
+
         return True
 
     def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
@@ -528,32 +536,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
             expected = getattr(alt, op_name)(skipna=skipna)
             tm.assert_almost_equal(result, expected)
 
-    @pytest.mark.parametrize("skipna", [True, False])
-    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
-        dtype = data.dtype
-        pa_dtype = dtype.pyarrow_dtype
-
-        xfail_mark = pytest.mark.xfail(
-            raises=TypeError,
-            reason=(
-                f"{all_numeric_reductions} is not implemented in "
-                f"pyarrow={pa.__version__} for {pa_dtype}"
-            ),
-        )
-        if all_numeric_reductions in {"skew", "kurt"} and (
-            dtype._is_numeric or dtype.kind == "b"
-        ):
-            request.applymarker(xfail_mark)
-
-        elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in {
-            "sem",
-            "std",
-            "var",
-            "median",
-        }:
-            request.applymarker(xfail_mark)
-        super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
-
     @pytest.mark.parametrize("skipna", [True, False])
     def test_reduce_series_boolean(
         self, data, all_boolean_reductions, skipna, na_value, request
@@ -574,15 +556,32 @@ def test_reduce_series_boolean(
         return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
 
     def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        pa_type = arr._pa_array.type
+
         if op_name in ["max", "min"]:
             cmp_dtype = arr.dtype
+        elif pa.types.is_temporal(pa_type):
+            if op_name in ["std", "sem"]:
+                if pa.types.is_duration(pa_type):
+                    cmp_dtype = arr.dtype
+                elif pa.types.is_date(pa_type):
+                    cmp_dtype = ArrowDtype(pa.duration("s"))
+                elif pa.types.is_time(pa_type):
+                    unit = get_unit_from_pa_dtype(pa_type)
+                    cmp_dtype = ArrowDtype(pa.duration(unit))
+                else:
+                    cmp_dtype = ArrowDtype(pa.duration(pa_type.unit))
+            else:
+                cmp_dtype = arr.dtype
         elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
-            if op_name not in ["median", "var", "std"]:
+            if op_name not in ["median", "var", "std", "sem"]:
                 cmp_dtype = arr.dtype
             else:
                 cmp_dtype = "float64[pyarrow]"
-        elif op_name in ["median", "var", "std", "mean", "skew"]:
+        elif op_name in ["median", "var", "std", "mean", "skew", "sem"]:
             cmp_dtype = "float64[pyarrow]"
+        elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type):
+            cmp_dtype = "uint64[pyarrow]"
         else:
             cmp_dtype = {
                 "i": "int64[pyarrow]",
@@ -598,6 +597,14 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
         if data.dtype._is_numeric:
             mark = pytest.mark.xfail(reason="skew not implemented")
             request.applymarker(mark)
+        elif (
+            op_name in ["std", "sem"]
+            and pa.types.is_date64(data._pa_array.type)
+            and skipna
+        ):
+            # overflow
+            mark = pytest.mark.xfail(reason="Cannot cast")
+            request.applymarker(mark)
         return super().test_reduce_frame(data, all_numeric_reductions, skipna)
 
     @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
index a42fa6088d9c84..356d5352f41f4d 100644
--- a/pandas/tests/extension/test_datetime.py
+++ b/pandas/tests/extension/test_datetime.py
@@ -95,6 +95,11 @@ def _get_expected_exception(self, op_name, obj, other):
             return None
         return super()._get_expected_exception(op_name, obj, other)
 
+    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        if op_name == "std":
+            return "timedelta64[ns]"
+        return arr.dtype
+
     def _supports_accumulation(self, ser, op_name: str) -> bool:
         return op_name in ["cummin", "cummax"]
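Note: the new temporal branch in `_get_expected_reduction_dtype` encodes the result dtypes intended after GH 59234: `std`/`sem` of a pyarrow timestamp column should come back as a duration of the same unit. Sketch (assumes pyarrow installed; the output dtype is taken from the test's expectation, not verified by running it):

    import pyarrow as pa

    import pandas as pd

    ser = pd.Series(
        pd.to_datetime(["2024-01-01", "2024-01-03"]),
        dtype=pd.ArrowDtype(pa.timestamp("us")),
    )
    print(pd.DataFrame({"t": ser}).std().dtype)  # expected: duration[us][pyarrow]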
diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
index 69ce42203d5101..3b9079d06e2317 100644
--- a/pandas/tests/extension/test_masked.py
+++ b/pandas/tests/extension/test_masked.py
@@ -301,7 +301,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
     def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
         if is_float_dtype(arr.dtype):
             cmp_dtype = arr.dtype.name
-        elif op_name in ["mean", "median", "var", "std", "skew"]:
+        elif op_name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]:
             cmp_dtype = "Float64"
         elif op_name in ["max", "min"]:
             cmp_dtype = arr.dtype.name
@@ -323,9 +323,7 @@ def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
                 else "UInt64"
             )
         elif arr.dtype.kind == "b":
-            if op_name in ["mean", "median", "var", "std", "skew"]:
-                cmp_dtype = "Float64"
-            elif op_name in ["min", "max"]:
+            if op_name in ["min", "max"]:
                 cmp_dtype = "boolean"
             elif op_name in ["sum", "prod"]:
                 cmp_dtype = (
diff --git a/pandas/tests/indexes/multi/test_get_level_values.py b/pandas/tests/indexes/multi/test_get_level_values.py
index 28c77e78924cbc..4db74a716c514f 100644
--- a/pandas/tests/indexes/multi/test_get_level_values.py
+++ b/pandas/tests/indexes/multi/test_get_level_values.py
@@ -122,3 +122,12 @@ def test_values_loses_freq_of_underlying_index():
     midx.values
     assert idx.freq is not None
     tm.assert_index_equal(idx, expected)
+
+
+def test_get_level_values_gets_frequency_correctly():
+    # GH#57949 GH#58327
+    datetime_index = date_range(start=pd.to_datetime("1/1/2018"), periods=4, freq="YS")
+    other_index = ["A"]
+    multi_index = MultiIndex.from_product([datetime_index, other_index])
+
+    assert multi_index.get_level_values(0).freq == datetime_index.freq
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 279d9a18d8df74..2ca9dbf92e6171 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -377,6 +377,12 @@ def test_pie_series(self):
         _check_text_labels(ax.texts, series.index)
         assert ax.get_ylabel() == ""
 
+    def test_pie_arrow_type(self):
+        # GH 59192
+        pytest.importorskip("pyarrow")
+        ser = Series([1, 2, 3, 4], dtype="int32[pyarrow]")
+        _check_plot_works(ser.plot.pie)
+
     def test_pie_series_no_label(self):
         series = Series(
             np.random.default_rng(2).integers(1, 5),
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 7b93416600f8f0..ca97af0d3eb32a 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import WASM
+
 import pandas as pd
 from pandas import Series
 import pandas._testing as tm
@@ -233,6 +235,7 @@ def test_temp_setattr(with_exception):
     assert ser.name == "first"
 
 
+@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM")
 @pytest.mark.single_cpu
 def test_str_size():
     # GH#21758
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index c1d6baaf17c92d..3a47d87286711a 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -2988,6 +2988,8 @@ def test_parsers_nat(self):
         ("20/12/21", True, False, datetime(2021, 12, 20)),
         ("20/12/21", False, True, datetime(2020, 12, 21)),
         ("20/12/21", True, True, datetime(2020, 12, 21)),
+        # GH 58859
+        ("20201012", True, False, datetime(2020, 12, 10)),
     ],
 )
 def test_parsers_dayfirst_yearfirst(
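Note: the new parametrized case above is the user-visible fix for GH 58859 in one line:

    import pandas as pd

    # Before the fix the ISO fast path parsed this as 2020-10-12, silently
    # ignoring dayfirst; now dayfirst wins for this compact date string.
    print(pd.to_datetime("20201012", dayfirst=True))  # 2020-12-10 00:00:00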
diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py
index 9e32a336505915..67521c7e2a3acd 100644
--- a/pandas/tests/tslibs/test_to_offset.py
+++ b/pandas/tests/tslibs/test_to_offset.py
@@ -31,6 +31,7 @@
         ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)),
         ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)),
         ("2SMS-15", offsets.SemiMonthBegin(2)),
+        ("LWOM-MON", offsets.LastWeekOfMonth()),
     ],
 )
 def test_to_offset(freq_input, expected):
diff --git a/pyproject.toml b/pyproject.toml
index 9156c73efbb359..47fd540d67ab23 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -177,6 +177,16 @@ test-command = ""
 select = "*-macosx*"
 environment = {CFLAGS="-g0"}
 
+[[tool.cibuildwheel.overrides]]
+select = "*pyodide*"
+test-requires = "pytest>=7.3.2 hypothesis>=6.46.1"
+# Pyodide repairs wheels on its own, using auditwheel-emscripten
+repair-wheel-command = ""
+test-command = """
+  PANDAS_CI='1' python -c 'import pandas as pd; \
+  pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
+  """
+
 [tool.ruff]
 line-length = 88
 target-version = "py310"
diff --git a/web/pandas_web.py b/web/pandas_web.py
index aac07433f2712b..b3872b829c73ac 100755
--- a/web/pandas_web.py
+++ b/web/pandas_web.py
@@ -280,6 +280,7 @@ def roadmap_pdeps(context):
     PDEP's in different status from the directory tree and GitHub.
     """
     KNOWN_STATUS = {
+        "Draft",
         "Under discussion",
         "Accepted",
         "Implemented",
     }
@@ -319,7 +320,7 @@ def roadmap_pdeps(context):
     github_repo_url = context["main"]["github_repo_url"]
     resp = requests.get(
         "https://api.github.com/search/issues?"
-        f"q=is:pr is:open label:PDEP repo:{github_repo_url}",
+        f"q=is:pr is:open label:PDEP draft:false repo:{github_repo_url}",
         headers=GITHUB_API_HEADERS,
         timeout=5,
     )