diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 3bd68c07dcbc3..4260c0836bbea 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index deaf2be0a0423..af452363666b5 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -67,7 +67,7 @@ jobs: run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 139cfcde95b2c..96010a4a0227d 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . 
python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] "cython<3.0.3" numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata "cython<3.0.3" hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 python -m pip install -ve . 
--no-build-isolation --no-index python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 4c7a7b329777b..6647bc03df17f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -138,7 +138,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.0 + uses: pypa/cibuildwheel@v2.16.2 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: diff --git a/.gitignore b/.gitignore index cd22c2bb8cb5b..051a3ec11b794 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,7 @@ .mesonpy-native-file.ini MANIFEST compile_commands.json -debug +.debug # Python files # ################ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0b511e1048c6..c911edfa03670 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -84,7 +84,7 @@ repos: '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a7 + rev: v3.0.0b0 hooks: - id: pylint stages: [manual] diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 2584e1f13853a..192f19c36b47d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,6 +1,7 @@ from importlib import import_module import numpy as np +import pyarrow as pa import pandas as pd @@ -72,7 +73,16 @@ class Duplicated: params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] @@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, 
tz="Asia/Tokyo" ), + "timestamp[ms][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) + ), + "duration[s][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) + ), }[dtype] if not unique: data = data.repeat(5) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 288369145576e..f52f7a4bef37a 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -28,9 +28,6 @@ def time_constructor_dict(self): def time_constructor_no_data(self): Series(data=None, index=self.idx) - def time_constructor_fastpath(self): - Series(self.array, index=self.idx2, name="name", fastpath=True) - class ToFrame: params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index a92fbbe8d4dbe..67f3b7736018d 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -72,7 +72,7 @@ def time_now(self, freq): self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq("A") + self.per.asfreq("Y") def time_str(self, freq): str(self.per) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index aba42f3733a3f..6caa39ae42926 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -63,16 +63,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then MSG='Partially validate docstrings (EX03)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ - pandas.Series.loc \ - pandas.Series.iloc \ - pandas.Series.pop \ - pandas.Series.describe \ - pandas.Series.skew \ - pandas.Series.var \ - pandas.Series.last \ - pandas.Series.tz_convert \ - pandas.Series.tz_localize \ - pandas.Series.dt.month_name \ pandas.Series.dt.day_name \ pandas.Series.str.len \ pandas.Series.cat.set_categories \ diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 
2522605cf5c38..ebd1556b8a5f5 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index dc67122c6e72e..4d0406814c873 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 7fd3a65ec91f8..9795a1fb39c6f 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -8,7 +8,7 @@ dependencies: - versioneer[toml] - meson[ninja]=1.2.1 - meson-python=0.13.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 # test dependencies - pytest>=7.3.2 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 893341350f4ef..c259286a5359c 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.2.1 - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index c9faaa2146235..f6df5a6e894a7 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b7cc6e9e891ce..586768765325e 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ 
b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index f51a8e04fbc7e..3751651a2a2f2 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 4923c94ab08f3..db0723cd3b8fa 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index d1a9e336eaeac..65f72278a0291 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=0.29.33 + - cython>=0.29.33, <3.0.3 - meson[ninja]=1.2.1 - meson-python=0.13.1 diff --git a/doc/redirects.csv b/doc/redirects.csv index 97cd20b295e65..bd60cc6a732bd 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -127,7 +127,6 @@ generated/pandas.api.types.is_number,../reference/api/pandas.api.types.is_number generated/pandas.api.types.is_numeric_dtype,../reference/api/pandas.api.types.is_numeric_dtype generated/pandas.api.types.is_object_dtype,../reference/api/pandas.api.types.is_object_dtype generated/pandas.api.types.is_period_dtype,../reference/api/pandas.api.types.is_period_dtype -generated/pandas.api.types.is_period,../reference/api/pandas.api.types.is_period generated/pandas.api.types.is_re_compilable,../reference/api/pandas.api.types.is_re_compilable generated/pandas.api.types.is_re,../reference/api/pandas.api.types.is_re 
generated/pandas.api.types.is_scalar,../reference/api/pandas.api.types.is_scalar diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 41f4b4d5783ea..e0aa8be066914 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -528,7 +528,7 @@ If a test is known to fail but the manner in which it fails is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. +will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable, please use it only as a last resort. Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` over usage within a test so that the test is appropriately marked during the diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst index 27f812b65e261..63154369dfd88 100644 --- a/doc/source/development/debugging_extensions.rst +++ b/doc/source/development/debugging_extensions.rst @@ -14,8 +14,8 @@ For Python developers with limited or no C/C++ experience this can seem a daunti 2. `Fundamental Python Debugging Part 2 - Python Extensions `_ 3. `Fundamental Python Debugging Part 3 - Cython Extensions `_ -Generating debug builds ------------------------ +Debugging locally +----------------- By default building pandas from source will generate a release build. To generate a development build you can type:: @@ -27,6 +27,32 @@ By default building pandas from source will generate a release build. To generat By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. 
This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. +Using Docker +------------ + +To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally. + +You can then mount your pandas repository into this image via: + +.. code-block:: sh + + docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug + +Inside the image, you can use meson to build/install pandas and place the build artifacts into a ``debug`` folder using a command as follows: + +.. code-block:: sh + + python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + +If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first ``cd`` to the build folder, then start that application. + +.. code-block:: sh + + cd debug + cygdb + +Within the debugger you can use `cygdb commands `_ to navigate cython extensions. + Editor support -------------- diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index b49c9644e1b2a..29cc256f35a4e 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -44,6 +44,9 @@ reading. Issue triage ------------ +Triage is an important first step in addressing issues reported by the community, and even +partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage" +tag once all of the steps below have been completed. Here's a typical workflow for triaging a newly opened issue. @@ -67,9 +70,9 @@ Here's a typical workflow for triaging a newly opened issue. 3. 
**Is this a duplicate issue?** We have many open issues. If a new issue is clearly a duplicate, label the - new issue as "Duplicate" assign the milestone "No Action", and close the issue - with a link to the original issue. Make sure to still thank the reporter, and - encourage them to chime in on the original issue, and perhaps try to fix it. + new issue as "Duplicate" and close the issue with a link to the original issue. + Make sure to still thank the reporter, and encourage them to chime in on the + original issue, and perhaps try to fix it. If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to @@ -90,6 +93,10 @@ Here's a typical workflow for triaging a newly opened issue. If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag + until all steps have been completed. Add a comment to the issue once you have + verified it exists on the main branch, so others know it has been confirmed. + 5. **Is this a clearly defined feature request?** Generally, pandas prefers to discuss and design new features in issues, before @@ -97,8 +104,9 @@ Here's a typical workflow for triaging a newly opened issue. for the new feature. Having them write a full docstring is a good way to pin down specifics. - We'll need a discussion from several pandas maintainers before deciding whether - the proposal is in scope for pandas. + Tag new feature requests with "Needs Discussion", as we'll need a discussion + from several pandas maintainers before deciding whether the proposal is in + scope for pandas. 6. **Is this a usage question?** @@ -117,10 +125,6 @@ Here's a typical workflow for triaging a newly opened issue. If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". 
- Typically, new issues will be assigned the "Contributions welcome" milestone, - unless it's know that this issue should be addressed in a specific release (say - because it's a large regression). - Once you have completed the above, make sure to remove the "needs triage" label. .. _maintaining.regressions: diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 83f830bb11198..e412793a328a3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -49,6 +49,7 @@ objects. api.extensions.ExtensionArray.copy api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.duplicated api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 5def84b91705c..2c612e31d33b6 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -451,7 +451,7 @@ Merge Concat ~~~~~~ -pandas provides various facilities for easily combining together :class:`Series`` and +pandas provides various facilities for easily combining together :class:`Series` and :class:`DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 682fa4c9b4fcc..453536098cfbb 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -976,7 +976,7 @@ of :ref:`frequency aliases ` with datetime-like inter pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9h") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. 
Intervals are closed on the right side by default. diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2e299da5e5794..d3c9d83b943ce 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -408,20 +408,6 @@ raise a ValueError: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) -Note that this is different from the NumPy behavior where a comparison can -be broadcast: - -.. ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index bc2f4420da784..c4721f3a6b09c 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``. ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): ...: assert (col_a.dtype == np.float64 - ...: and col_b.dtype == np.float64 and col_N.dtype == np.int_) + ...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int)) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6bd181740c78d..c2fe277c4f4e5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2332,7 +2332,7 @@ A few notes on the generated table schema: .. 
ipython:: python - s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="Y-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index cd567f8442671..5daf204f39bcf 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -392,7 +392,7 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases = 0.9.3. (:issue:`5945`) - ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index dffb4c7b9ff9e..8dafed1efee97 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -332,16 +332,37 @@ Timezone handling improvements - ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time, previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`) - .. ipython:: python + .. 
code-block:: ipython + + In [58]: ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') + + In [59]: ts + Out[59]: Timestamp('2014-08-01 09:00:00-0400', tz='US/Eastern') - ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') - ts - ts.tz_localize(None) + In [60]: ts.tz_localize(None) + Out[60]: Timestamp('2014-08-01 09:00:00') - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=10, tz='US/Eastern') - didx - didx.tz_localize(None) + In [61]: didx = pd.date_range(start='2014-08-01 09:00', freq='H', + ....: periods=10, tz='US/Eastern') + ....: + + In [62]: didx + Out[62]: + DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00', + '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00', + '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00', + '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00', + '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'], + dtype='datetime64[ns, US/Eastern]', freq='H') + + In [63]: didx.tz_localize(None) + Out[63]: + DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00', + '2014-08-01 11:00:00', '2014-08-01 12:00:00', + '2014-08-01 13:00:00', '2014-08-01 14:00:00', + '2014-08-01 15:00:00', '2014-08-01 16:00:00', + '2014-08-01 17:00:00', '2014-08-01 18:00:00'], + dtype='datetime64[ns]', freq=None) - ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT, @@ -1050,16 +1071,35 @@ Other: If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``Timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added. - .. ipython:: python + .. 
code-block:: ipython - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx - idx + pd.offsets.Hour(2) - idx + pd.Timedelta('120m') + In [104]: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx = pd.period_range('2014-07', periods=5, freq='M') - idx - idx + pd.offsets.MonthEnd(3) + In [105]: idx + Out[105]: + PeriodIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', + '2014-07-01 12:00', '2014-07-01 13:00'], + dtype='period[H]') + + In [106]: idx + pd.offsets.Hour(2) + Out[106]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [107]: idx + pd.Timedelta('120m') + Out[107]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [108]: idx = pd.period_range('2014-07', periods=5, freq='M') + + In [109]: idx + Out[109]: PeriodIndex(['2014-07', '2014-08', '2014-09', '2014-10', '2014-11'], dtype='period[M]') + + In [110]: idx + pd.offsets.MonthEnd(3) + Out[110]: PeriodIndex(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02'], dtype='period[M]') - Added experimental compatibility with ``openpyxl`` for versions >= 2.0. The ``DataFrame.to_excel`` method ``engine`` keyword now recognizes ``openpyxl1`` and ``openpyxl2`` diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index 02e47553cd184..8984109da2a43 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1000,10 +1000,16 @@ Other API changes ^^^^^^^^^^^^^^^^^ - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) - .. ipython:: python + .. 
code-block:: ipython + + In [107]: s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s.between_time("7:00am", "9:00am") + In [108]: s.between_time("7:00am", "9:00am") + Out[108]: + 2015-01-01 07:00:00 7 + 2015-01-01 08:00:00 8 + 2015-01-01 09:00:00 9 + Freq: H, Length: 3, dtype: int64 This will now raise. diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index ee6a60144bc35..85e0e63016729 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -256,26 +256,78 @@ Partial string indexing on ``DatetimeIndex`` when part of a ``MultiIndex`` Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiIndex`` (:issue:`10331`) -.. ipython:: python +.. code-block:: ipython - dft2 = pd.DataFrame( - np.random.randn(20, 1), - columns=["A"], - index=pd.MultiIndex.from_product( - [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] - ), - ) - dft2 - dft2.loc["2013-01-05"] + In [20]: dft2 = pd.DataFrame( + ....: np.random.randn(20, 1), + ....: columns=["A"], + ....: index=pd.MultiIndex.from_product( + ....: [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ....: ), + ....: ) + ....: + + In [21]: dft2 + Out[21]: + A + 2013-01-01 00:00:00 a 0.469112 + b -0.282863 + 2013-01-01 12:00:00 a -1.509059 + b -1.135632 + 2013-01-02 00:00:00 a 1.212112 + ... ... + 2013-01-04 12:00:00 b 0.271860 + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 + + [20 rows x 1 columns] + + In [22]: dft2.loc["2013-01-05"] + Out[22]: + A + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 + + [4 rows x 1 columns] On other levels -.. ipython:: python +.. 
code-block:: ipython - idx = pd.IndexSlice - dft2 = dft2.swaplevel(0, 1).sort_index() - dft2 - dft2.loc[idx[:, "2013-01-05"], :] + In [26]: idx = pd.IndexSlice + + In [27]: dft2 = dft2.swaplevel(0, 1).sort_index() + + In [28]: dft2 + Out[28]: + A + a 2013-01-01 00:00:00 0.469112 + 2013-01-01 12:00:00 -1.509059 + 2013-01-02 00:00:00 1.212112 + 2013-01-02 12:00:00 0.119209 + 2013-01-03 00:00:00 -0.861849 + ... ... + b 2013-01-03 12:00:00 1.071804 + 2013-01-04 00:00:00 -0.706771 + 2013-01-04 12:00:00 0.271860 + 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 + + [20 rows x 1 columns] + + In [29]: dft2.loc[idx[:, "2013-01-05"], :] + Out[29]: + A + a 2013-01-05 00:00:00 -0.424972 + 2013-01-05 12:00:00 0.276232 + b 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 + + [4 rows x 1 columns] .. _whatsnew_0181.enhancements.assembling: diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 9005fafaf18f7..ae70eb078f6d9 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -614,11 +614,18 @@ New behavior: ``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32`` -.. ipython:: python +.. code-block:: ipython + + In [64]: s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') + ....: .tz_localize('Asia/Tokyo')) + ....: - s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') - .tz_localize('Asia/Tokyo')) - s + In [65]: s + Out[65]: + 0 2011-01-02 00:00:00+09:00 + 1 2011-01-02 01:00:00+09:00 + 2 2011-01-02 02:00:00+09:00 + Length: 3, dtype: datetime64[ns, Asia/Tokyo] Previous behavior: @@ -633,9 +640,14 @@ Previous behavior: New behavior: -.. ipython:: python +.. code-block:: ipython - s.map(lambda x: x.hour) + In [66]: s.map(lambda x: x.hour) + Out[66]: + 0 0 + 1 1 + 2 2 + Length: 3, dtype: int64 .. _whatsnew_0200.api_breaking.index_dt_field: @@ -659,10 +671,12 @@ Previous behaviour: New behavior: -.. 
ipython:: python +.. code-block:: ipython + + In [67]: idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx.hour + In [68]: idx.hour + Out[68]: Index([0, 10, 20, 6, 16], dtype='int32') This has the advantage that specific ``Index`` methods are still available on the result. On the other hand, this might have backward incompatibilities: e.g. diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index f8eacd28fa795..62296550a472b 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -671,15 +671,35 @@ Previous behavior: New behavior: -.. ipython:: python +.. code-block:: ipython + + In [56]: pi = pd.period_range(start='2000-01-01', freq='D', periods=10) + + In [57]: s = pd.Series(np.arange(10), index=pi) - pi = pd.period_range(start='2000-01-01', freq='D', periods=10) + In [58]: s.resample('H').ohlc() + Out[58]: + open high low close + 2000-01-01 00:00 0.0 0.0 0.0 0.0 + 2000-01-01 01:00 NaN NaN NaN NaN + 2000-01-01 02:00 NaN NaN NaN NaN + 2000-01-01 03:00 NaN NaN NaN NaN + 2000-01-01 04:00 NaN NaN NaN NaN + ... ... ... ... ... + 2000-01-10 19:00 NaN NaN NaN NaN + 2000-01-10 20:00 NaN NaN NaN NaN + 2000-01-10 21:00 NaN NaN NaN NaN + 2000-01-10 22:00 NaN NaN NaN NaN + 2000-01-10 23:00 NaN NaN NaN NaN - s = pd.Series(np.arange(10), index=pi) + [240 rows x 4 columns] - s.resample('H').ohlc() + In [59]: s.resample('M').ohlc() + Out[59]: + open high low close + 2000-01 0 9 0 9 - s.resample('M').ohlc() + [1 rows x 4 columns] .. _whatsnew_0210.api_breaking.pandas_eval: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index c494b4f286662..a33a8f7addeef 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -187,16 +187,27 @@ entirely valid. *pandas 0.22.0* -.. ipython:: python +.. 
code-block:: ipython - idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) - pd.Series([1, 2], index=idx).resample("12H").sum() + In [14]: idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) + In [15]: pd.Series([1, 2], index=idx).resample("12H").sum() + Out[15]: + 2017-01-01 00:00:00 1 + 2017-01-01 12:00:00 0 + 2017-01-02 00:00:00 2 + Freq: 12H, Length: 3, dtype: int64 Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. -.. ipython:: python +.. code-block:: ipython + + In [16]: pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) + Out[16]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, Length: 3, dtype: float64 - pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) Rolling and expanding ^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 44728e7e552ab..8fa1361cc30c1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -112,14 +112,33 @@ to the index in the resample when :meth:`.Resampler.apply` is used. of pandas, not specifying ``group_keys`` will default to the same behavior as ``group_keys=False``. -.. ipython:: python +.. 
code-block:: ipython - df = pd.DataFrame( - {'a': range(6)}, - index=pd.date_range("2021-01-01", periods=6, freq="8H") - ) - df.resample("D", group_keys=True).apply(lambda x: x) - df.resample("D", group_keys=False).apply(lambda x: x) + In [11]: df = pd.DataFrame( + ....: {'a': range(6)}, + ....: index=pd.date_range("2021-01-01", periods=6, freq="8H") + ....: ) + ....: + + In [12]: df.resample("D", group_keys=True).apply(lambda x: x) + Out[12]: + a + 2021-01-01 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + + In [13]: df.resample("D", group_keys=False).apply(lambda x: x) + Out[13]: + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 Previously, the resulting index would depend upon the values returned by ``apply``, as seen in the following example. @@ -461,19 +480,20 @@ upon serialization. (Related issue :issue:`12997`) *Old Behavior* -.. ipython:: python +.. code-block:: ipython - index = pd.date_range( - start='2020-12-28 00:00:00', - end='2020-12-28 02:00:00', - freq='1H', - ) - a = pd.Series( - data=range(3), - index=index, - ) + In [32]: index = pd.date_range( + ....: start='2020-12-28 00:00:00', + ....: end='2020-12-28 02:00:00', + ....: freq='1H', + ....: ) + ....: -.. code-block:: ipython + In [33]: a = pd.Series( + ....: data=range(3), + ....: index=index, + ....: ) + ....: In [4]: from io import StringIO @@ -485,12 +505,16 @@ upon serialization. (Related issue :issue:`12997`) *New Behavior* -.. ipython:: python +.. 
code-block:: ipython + + In [34]: from io import StringIO + + In [35]: a.to_json(date_format='iso') + Out[35]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}' - from io import StringIO - a.to_json(date_format='iso') # Roundtripping now works - pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + In [36]: pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + Out[36]: array([ True, True, True]) .. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..cc51e22265d7c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -13,17 +13,24 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Fixed bug where PDEP-6 warning about setting an item of an incompatible dtype was being shown when creating a new conditional column (:issue:`55025`) -- +- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`) +- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`) +- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.bug_fixes: Bug fixes ~~~~~~~~~ -- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) -- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) +- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) +- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) +- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) +- Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) +- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`) +- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 03ed3f685f822..7cc23cc616503 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -74,11 +74,12 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: @@ -132,10 +133,36 @@ and ``sort=False``: result -.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2: +.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels: + +:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder +index levels when joining on two indexes with different levels (:issue:`34133`). + +.. ipython:: python + + left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) + right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + result = left.join(right) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + left right + B A C + 1 x 1 1 2 + 2 x 2 1 2 + +*New Behavior* -notable_bug_fix2 -^^^^^^^^^^^^^^^^ +.. ipython:: python + + result .. --------------------------------------------------------------------------- .. 
_whatsnew_220.api_breaking: @@ -205,7 +232,7 @@ For example: Other Deprecations ^^^^^^^^^^^^^^^^^^ -- Changed :meth:`Timedelta.resolution_string` to return ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard`. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf``. (:issue:`54229`) - Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict`. (:issue:`54229`) @@ -225,9 +252,12 @@ Other Deprecations - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. 
To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) - Deprecated including the groups in computations when using :meth:`DataFrameGroupBy.apply` and :meth:`DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated not passing a tuple to :class:`DataFrameGroupBy.get_group` or :class:`SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) -- Deprecated strings ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`52536`) +- Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`) +- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) -- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) +- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) - Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) - Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) - Deprecating downcasting the results of :meth:`DataFrame.fillna`, :meth:`Series.fillna`, :meth:`DataFrame.ffill`, :meth:`Series.ffill`, 
:meth:`DataFrame.bfill`, :meth:`Series.bfill` in object-dtype cases. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`54261`) @@ -242,6 +272,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) @@ -250,10 +281,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) -- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) -- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) -- Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`) Categorical ^^^^^^^^^^^ @@ -267,12 +294,12 @@ Datetimelike Timedelta ^^^^^^^^^ -- +- Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in the nanosecond cases (:issue:`55405`) - Timezones ^^^^^^^^^ -- +- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Numeric @@ -282,11 +309,12 @@ Numeric 
Conversion ^^^^^^^^^^ -- +- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`) - Strings ^^^^^^^ +- Bug in :func:`pandas.api.types.is_string_dtype` while checking object array with no elements is of the string dtype (:issue:`54661`) - - @@ -311,7 +339,7 @@ Missing MultiIndex ^^^^^^^^^^ -- +- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) - I/O @@ -320,6 +348,7 @@ I/O - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`) +- Bug in :meth:`pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) Period ^^^^^^ @@ -333,13 +362,18 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) +- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - Reshaping ^^^^^^^^^ - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Bug in :meth:`pandas.DataFrame.melt` where it would 
not preserve the datetime (:issue:`55254`) +- Sparse ^^^^^^ @@ -359,6 +393,9 @@ Styler Other ^^^^^ - Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) +- .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index fd632790546f6..b4662d6bf8dd2 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -101,12 +101,20 @@ libs_sources = { 'writers': {'sources': ['writers.pyx']} } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif foreach ext_name, ext_dict : libs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index e050ac5a6c7b7..bda4fcf04234b 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -13,6 +13,8 @@ cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cpdef freq_to_period_freqstr(freq_n, freq_name) cdef dict c_OFFSET_TO_PERIOD_FREQSTR +cdef dict c_OFFSET_DEPR_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR cdef dict c_DEPR_ABBREVS cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index 72a8fa8ff0b38..d8680ed2d27b4 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -7,6 +7,7 @@ 
from pandas._libs.tslibs.timedeltas import UnitChoices _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] OFFSET_TO_PERIOD_FREQSTR: dict[str, str] +OFFSET_DEPR_FREQSTR: dict[str, str] DEPR_ABBREVS: dict[str, UnitChoices] def periods_per_day(reso: int) -> int: ... diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index cca379c620aeb..86f620beeec3b 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -101,19 +101,19 @@ cdef class PeriodDtypeBase: _period_code_map = { # Annual freqs with various fiscal year ends. - # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 - "A-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end - "A-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end - "A-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end - "A-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end - "A-APR": PeriodDtypeCode.A_APR, # Annual - April year end - "A-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end - "A-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end - "A-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end - "A-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end - "A-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end - "A-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end - "A-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end + # eg, 2005 for Y-FEB runs Mar 1, 2004 to Feb 28, 2005 + "Y-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end + "Y-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end + "Y-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end + "Y-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end + "Y-APR": PeriodDtypeCode.A_APR, # Annual - April year end + "Y-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end + "Y-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end + "Y-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end + "Y-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end + "Y-SEP": 
PeriodDtypeCode.A_SEP, # Annual - September year end + "Y-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end + "Y-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end # Quarterly frequencies with various fiscal year ends. # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 @@ -142,7 +142,7 @@ _period_code_map = { "B": PeriodDtypeCode.B, # Business days "D": PeriodDtypeCode.D, # Daily - "H": PeriodDtypeCode.H, # Hourly + "h": PeriodDtypeCode.H, # Hourly "min": PeriodDtypeCode.T, # Minutely "s": PeriodDtypeCode.S, # Secondly "ms": PeriodDtypeCode.L, # Millisecondly @@ -156,26 +156,26 @@ _reverse_period_code_map = { # Yearly aliases; careful not to put these in _reverse_period_code_map _period_code_map.update({"Y" + key[1:]: _period_code_map[key] for key in _period_code_map - if key.startswith("A-")}) + if key.startswith("Y-")}) _period_code_map.update({ "Q": 2000, # Quarterly - December year end (default quarterly) - "A": PeriodDtypeCode.A, # Annual + "Y": PeriodDtypeCode.A, # Annual "W": 4000, # Weekly "C": 5000, # Custom Business Day }) cdef set _month_names = { - x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-") + x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("Y-") } # Map attribute-name resolutions to resolution abbreviations _attrname_to_abbrevs = { - "year": "A", + "year": "Y", "quarter": "Q", "month": "M", "day": "D", - "hour": "H", + "hour": "h", "minute": "min", "second": "s", "millisecond": "ms", @@ -192,9 +192,9 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "BQS": "Q", "QS": "Q", "BQ": "Q", - "BA": "A", - "AS": "A", - "BAS": "A", + "BA": "Y", + "AS": "Y", + "BAS": "Y", "MS": "M", "D": "D", "B": "B", @@ -203,17 +203,21 @@ OFFSET_TO_PERIOD_FREQSTR: dict = { "ms": "ms", "us": "us", "ns": "ns", - "H": "H", + "h": "h", "Q": "Q", - "A": "A", + "Y": "Y", "W": "W", "ME": "M", - "Y": "A", - "BY": "A", - "YS": "A", - "BYS": "A", + "BY": "Y", + "YS": "Y", + "BYS": "Y", +} +OFFSET_DEPR_FREQSTR: dict[str, str]= { + "M": 
"ME", } cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_OFFSET_DEPR_FREQSTR = OFFSET_DEPR_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = {v: k for k, v in OFFSET_DEPR_FREQSTR.items()} cpdef freq_to_period_freqstr(freq_n, freq_name): if freq_n == 1: @@ -226,6 +230,23 @@ cpdef freq_to_period_freqstr(freq_n, freq_name): # Map deprecated resolution abbreviations to correct resolution abbreviations DEPR_ABBREVS: dict[str, str]= { + "A": "Y", + "a": "Y", + "A-DEC": "Y-DEC", + "A-JAN": "Y-JAN", + "A-FEB": "Y-FEB", + "A-MAR": "Y-MAR", + "A-APR": "Y-APR", + "A-MAY": "Y-MAY", + "A-JUN": "Y-JUN", + "A-JUL": "Y-JUL", + "A-AUG": "Y-AUG", + "A-SEP": "Y-SEP", + "A-OCT": "Y-OCT", + "A-NOV": "Y-NOV", + "H": "h", + "BH": "bh", + "CBH": "cbh", "T": "min", "t": "min", "S": "s", @@ -321,10 +342,10 @@ class Resolution(Enum): Examples -------- - >>> Resolution.get_reso_from_freqstr('H') + >>> Resolution.get_reso_from_freqstr('h') - >>> Resolution.get_reso_from_freqstr('H') == Resolution.RESO_HR + >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ try: diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build index a1b0c54d1f48c..85410f771233f 100644 --- a/pandas/_libs/tslibs/meson.build +++ b/pandas/_libs/tslibs/meson.build @@ -19,11 +19,20 @@ tslibs_sources = { 'vectorized': {'sources': ['vectorized.pyx']}, } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif + foreach ext_name, ext_dict : tslibs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 
bb497f2e17b93..9f9549b93fefe 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1000,7 +1000,7 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.round(freq='min') # minute @@ -1017,9 +1017,9 @@ timedelta}, default 'raise' >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.round(freq='1H30min') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1032,10 +1032,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1089,7 +1089,7 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') >>> ts.floor(freq='min') # minute @@ -1106,9 +1106,9 @@ timedelta}, default 'raise' >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.floor(freq='1H30min') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1121,10 +1121,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1178,7 +1178,7 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.ceil(freq='min') # minute @@ -1195,9 +1195,9 @@ timedelta}, default 'raise' >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.ceil(freq='1H30min') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -1210,10 +1210,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 74398eb0e2405..042d5dafe3046 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -15,6 +15,7 @@ from cpython.datetime cimport ( time as dt_time, timedelta, ) + import warnings import_datetime() @@ -48,7 +49,6 @@ from pandas._libs.tslibs.ccalendar import ( ) from pandas.util._exceptions import find_stack_level - from pandas._libs.tslibs.ccalendar cimport ( dayofweek, get_days_in_month, @@ -58,6 +58,8 @@ from pandas._libs.tslibs.ccalendar cimport ( from pandas._libs.tslibs.conversion cimport localize_pydatetime from pandas._libs.tslibs.dtypes cimport ( c_DEPR_ABBREVS, + c_OFFSET_DEPR_FREQSTR, + c_REVERSE_OFFSET_DEPR_FREQSTR, periods_per_day, ) from pandas._libs.tslibs.nattype cimport ( @@ -601,10 +603,10 @@ cdef class BaseOffset: Examples -------- >>> pd.offsets.Hour().name - 'H' + 'h' >>> pd.offsets.Hour(5).name - 'H' + 'h' """ return self.rule_code @@ -627,7 +629,7 @@ cdef class BaseOffset: '<5 * DateOffsets>' >>> pd.offsets.BusinessHour(2).freqstr - '2BH' + '2bh' >>> pd.offsets.Nano().freqstr 'ns' @@ -1164,7 +1166,7 @@ cdef class Hour(Tick): Timestamp('2022-12-09 11:00:00') """ _nanos_inc = 3600 * 1_000_000_000 - _prefix = "H" + _prefix = "h" _period_dtype_code = PeriodDtypeCode.H _creso = NPY_DATETIMEUNIT.NPY_FR_h @@ -1628,7 +1630,7 @@ cdef class BusinessMixin(SingleConstructorOffset): # Older 
(<0.22.0) versions have offset attribute instead of _offset self._offset = state.pop("offset") - if self._prefix.startswith("C"): + if self._prefix.startswith(("C", "c")): # i.e. this is a Custom class weekmask = state.pop("weekmask") holidays = state.pop("holidays") @@ -1694,7 +1696,7 @@ cdef class BusinessDay(BusinessMixin): s = td.seconds hrs = int(s / 3600) if hrs != 0: - off_str += str(hrs) + "H" + off_str += str(hrs) + "h" s -= hrs * 3600 mts = int(s / 60) if mts != 0: @@ -1891,10 +1893,10 @@ cdef class BusinessHour(BusinessMixin): '2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='BH') + dtype='datetime64[ns]', freq='bh') """ - _prefix = "BH" + _prefix = "bh" _anchor = 0 _attributes = tuple(["n", "normalize", "start", "end", "offset"]) _adjust_dst = False @@ -2014,7 +2016,7 @@ cdef class BusinessHour(BusinessMixin): nb_offset = 1 else: nb_offset = -1 - if self._prefix.startswith("C"): + if self._prefix.startswith(("c")): # CustomBusinessHour return CustomBusinessDay( n=nb_offset, @@ -2174,7 +2176,7 @@ cdef class BusinessHour(BusinessMixin): # adjust by business days first if bd != 0: - if self._prefix.startswith("C"): + if self._prefix.startswith("c"): # GH#30593 this is a Custom offset skip_bd = CustomBusinessDay( n=bd, @@ -2240,7 +2242,7 @@ cdef class BusinessHour(BusinessMixin): dt = datetime( dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond ) - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time return self._is_on_offset(dt) @@ -2250,7 +2252,7 @@ cdef class BusinessHour(BusinessMixin): """ # if self.normalize and not _is_normalized(dt): # return False - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # 
Distinguish by the time spent from previous opening time if self.n >= 0: op = self._prev_opening_time(dt) @@ -2496,7 +2498,7 @@ cdef class YearEnd(YearOffset): """ _default_month = 12 - _prefix = "A" + _prefix = "Y" _day_opt = "end" cdef readonly: @@ -4275,7 +4277,7 @@ cdef class CustomBusinessHour(BusinessHour): '2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') Business days can be specified by ``weekmask`` parameter. To convert the returned datetime object to its string representation @@ -4304,10 +4306,10 @@ cdef class CustomBusinessHour(BusinessHour): '2022-12-15 11:00:00', '2022-12-15 12:00:00', '2022-12-16 10:00:00', '2022-12-16 11:00:00', '2022-12-16 12:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') """ - _prefix = "CBH" + _prefix = "cbh" _anchor = 0 _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] @@ -4329,28 +4331,6 @@ cdef class CustomBusinessHour(BusinessHour): cdef class _CustomBusinessMonth(BusinessMixin): - """ - DateOffset subclass representing custom business month(s). - - Increments between beginning/end of month dates. - - Parameters - ---------- - n : int, default 1 - The number of months represented. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar``. - holidays : list - List/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar``. - calendar : np.busdaycalendar - Calendar to integrate. - offset : timedelta, default timedelta(0) - Time offset to apply. 
- """ - _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] ) @@ -4426,10 +4406,124 @@ cdef class _CustomBusinessMonth(BusinessMixin): cdef class CustomBusinessMonthEnd(_CustomBusinessMonth): + """ + DateOffset subclass representing custom business month(s). + + Increments between end of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthEnd() + Timestamp('2022-08-31 00:00:00') + + Custom business month end can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. + + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthEnd(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Thu 28 Jul 2022 00:00', 'Wed 31 Aug 2022 00:00', + 'Thu 29 Sep 2022 00:00', 'Thu 27 Oct 2022 00:00', + 'Wed 30 Nov 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. + + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... 
'2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'], + dtype='datetime64[ns]', freq='CBM') + """ + _prefix = "CBM" cdef class CustomBusinessMonthBegin(_CustomBusinessMonth): + """ + DateOffset subclass representing custom business month(s). + + Increments between beginning of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthBegin() + Timestamp('2022-09-01 00:00:00') + + Custom business month start can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. + + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthBegin(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Wed 03 Aug 2022 00:00', 'Thu 01 Sep 2022 00:00', + 'Wed 05 Oct 2022 00:00', 'Wed 02 Nov 2022 00:00', + 'Thu 01 Dec 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. 
+ + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... '2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthBegin(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-08-02', '2022-09-01', '2022-10-03', '2022-11-02'], + dtype='datetime64[ns]', freq='CBMS') + """ + _prefix = "CBMS" @@ -4447,7 +4541,7 @@ prefix_mapping = { offset._prefix: offset for offset in [ YearBegin, # 'AS' - YearEnd, # 'A' + YearEnd, # 'Y' BYearBegin, # 'BAS' BYearEnd, # 'BA' BusinessDay, # 'B' @@ -4455,11 +4549,11 @@ prefix_mapping = { BusinessMonthEnd, # 'BM' BQuarterEnd, # 'BQ' BQuarterBegin, # 'BQS' - BusinessHour, # 'BH' + BusinessHour, # 'bh' CustomBusinessDay, # 'C' CustomBusinessMonthEnd, # 'CBM' CustomBusinessMonthBegin, # 'CBMS' - CustomBusinessHour, # 'CBH' + CustomBusinessHour, # 'cbh' MonthEnd, # 'ME' MonthBegin, # 'MS' Nano, # 'ns' @@ -4472,7 +4566,7 @@ prefix_mapping = { QuarterEnd, # 'Q' QuarterBegin, # 'QS' Milli, # 'ms' - Hour, # 'H' + Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' FY5253, @@ -4489,8 +4583,7 @@ _lite_rule_alias = { "W": "W-SUN", "Q": "Q-DEC", - "A": "A-DEC", # YearEnd(month=12), - "Y": "A-DEC", + "Y": "Y-DEC", # YearEnd(month=12), "AS": "AS-JAN", # YearBegin(month=1), "YS": "AS-JAN", "BA": "BA-DEC", # BYearEnd(month=12), @@ -4505,7 +4598,7 @@ _lite_rule_alias = { "ns": "ns", } -_dont_uppercase = {"MS", "ms", "s", "me"} +_dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s", "me"} INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4574,7 +4667,7 @@ cpdef to_offset(freq, bint is_period=False): >>> to_offset("5min") <5 * Minutes> - >>> to_offset("1D1H") + >>> to_offset("1D1h") <25 * Hours> >>> to_offset("2W") @@ -4615,21 +4708,22 @@ cpdef to_offset(freq, bint is_period=False): tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): - if is_period is False and name == "M": + if is_period is 
False and name in c_OFFSET_DEPR_FREQSTR: warnings.warn( - "\'M\' will be deprecated, please use \'ME\' " - "for \'month end\'", + f"\'{name}\' will be deprecated, please use " + f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.", UserWarning, stacklevel=find_stack_level(), ) - name = "ME" - if is_period is True and name == "ME": + name = c_OFFSET_DEPR_FREQSTR[name] + if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR: raise ValueError( - r"for Period, please use \'M\' " - "instead of \'ME\'" + f"for Period, please use " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name)}\' " + f"instead of \'{name}\'" ) - elif is_period is True and name == "M": - name = "ME" + elif is_period is True and name in c_OFFSET_DEPR_FREQSTR: + name = c_OFFSET_DEPR_FREQSTR.get(name) if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") @@ -4641,16 +4735,18 @@ cpdef to_offset(freq, bint is_period=False): if prefix in c_DEPR_ABBREVS: warnings.warn( - f"\'{prefix}\' is deprecated and will be removed in a " - f"future version. Please use \'{c_DEPR_ABBREVS.get(prefix)}\' " + f"\'{prefix}\' is deprecated and will be removed " + f"in a future version. 
Please use " + f"\'{c_DEPR_ABBREVS.get(prefix)}\' " f"instead of \'{prefix}\'.", FutureWarning, stacklevel=find_stack_level(), ) prefix = c_DEPR_ABBREVS[prefix] - if prefix in {"D", "H", "min", "s", "ms", "us", "ns"}: - # For these prefixes, we have something like "3H" or - # "2.5T", so we can construct a Timedelta with the + + if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + # For these prefixes, we have something like "3h" or + # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick td = Timedelta(1, unit=prefix) off = delta_to_tick(td) @@ -4661,7 +4757,7 @@ cpdef to_offset(freq, bint is_period=False): offset *= stride_sign else: stride = int(stride) - offset = _get_offset(name) + offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 71be1d213437a..b23611124ea7c 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -716,7 +716,7 @@ cdef datetime dateutil_parse( elif res.tzoffset: ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) + # dateutil can return a datetime with a tzoffset outside of (-24h, 24h) # bounds, which is invalid (can be constructed, but raises if we call # str(ret)). Check that and raise here if necessary. 
try: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 5fecc77044b4b..cacfe43b236d8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1942,8 +1942,8 @@ cdef class _Period(PeriodMixin): Examples -------- >>> period = pd.Period('2023-1-1', freq='D') - >>> period.asfreq('H') - Period('2023-01-01 23:00', 'H') + >>> period.asfreq('h') + Period('2023-01-01 23:00', 'h') """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) @@ -2054,7 +2054,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') + >>> p = pd.Period("2018-03-11", freq='h') >>> p.day 11 """ @@ -2155,7 +2155,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.weekofyear 10 @@ -2186,7 +2186,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.week 10 @@ -2226,14 +2226,14 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.day_of_week 6 For periods that span over multiple days, the day at the beginning of the period is returned. - >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.day_of_week 6 >>> per.start_time.day_of_week @@ -2277,14 +2277,14 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.dayofweek 6 For periods that span over multiple days, the day at the beginning of the period is returned. 
- >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.dayofweek 6 >>> per.start_time.dayofweek @@ -2326,7 +2326,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> period = pd.Period("2015-10-23", freq='H') + >>> period = pd.Period("2015-10-23", freq='h') >>> period.day_of_year 296 >>> period = pd.Period("2012-12-31", freq='D') @@ -2447,7 +2447,7 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') + >>> p = pd.Period("2018-03-11", freq='h') >>> p.daysinmonth 31 """ @@ -2482,8 +2482,8 @@ cdef class _Period(PeriodMixin): Examples -------- - >>> pd.Period.now('H') # doctest: +SKIP - Period('2023-06-12 11:00', 'H') + >>> pd.Period.now('h') # doctest: +SKIP + Period('2023-06-12 11:00', 'h') """ return Period(datetime.now(), freq=freq) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 2f6fa35cae070..e5d81bd5928b9 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -898,8 +898,8 @@ cdef int64_t parse_iso_format_string(str ts) except? -1: elif c in ["W", "D", "H", "M"]: if c in ["H", "M"] and len(number) > 2: raise ValueError(err_msg) - if c == "M": - c = "min" + if c in ["M", "H"]: + c = c.replace("M", "min").replace("H", "h") unit.append(c) r = timedelta_from_spec(number, "0", unit) result += timedelta_as_neg(r, neg) @@ -1442,7 +1442,7 @@ cdef class _Timedelta(timedelta): Resolution: Return value * Days: 'D' - * Hours: 'H' + * Hours: 'h' * Minutes: 'min' * Seconds: 's' * Milliseconds: 'ms' @@ -1484,7 +1484,7 @@ cdef class _Timedelta(timedelta): elif self._m: return "min" elif self._h: - return "H" + return "h" else: return "D" @@ -1725,8 +1725,8 @@ class Timedelta(_Timedelta): .. deprecated:: 2.2.0 - Values `T`, `S`, `L`, `U`, and `N` are deprecated in favour of the values - `min`, `s`, `ms`, `us`, and `ns`. 
+ Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour + of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. **kwargs Available kwargs: {days, seconds, microseconds, diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1b4332c2d26cf..edd061fd8cdf1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1249,7 +1249,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> # Year end frequency >>> ts.to_period(freq='Y') - Period('2020', 'A-DEC') + Period('2020', 'Y-DEC') >>> # Month end frequency >>> ts.to_period(freq='M') @@ -1982,7 +1982,7 @@ timedelta}, default 'raise' A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.round(freq='min') # minute @@ -1999,9 +1999,9 @@ timedelta}, default 'raise' >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.round(freq='1H30min') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2014,10 +2014,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round( @@ -2073,7 +2073,7 @@ timedelta}, default 'raise' A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') >>> ts.floor(freq='min') # minute @@ -2090,9 +2090,9 @@ timedelta}, default 'raise' >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.floor(freq='1H30min') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2105,10 +2105,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -2162,7 +2162,7 @@ timedelta}, default 'raise' A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') >>> ts.ceil(freq='min') # minute @@ -2179,9 +2179,9 @@ timedelta}, default 'raise' >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30min' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.ceil(freq='1H30min') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -2194,10 +2194,10 @@ timedelta}, default 'raise' >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) diff --git a/pandas/_typing.py b/pandas/_typing.py index 0e2a0881f0122..de01434c09c39 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -4,6 +4,7 @@ Hashable, Iterator, Mapping, + MutableMapping, Sequence, ) from datetime import ( @@ -103,6 +104,7 @@ TypeGuard: Any = None HashableT = TypeVar("HashableT", bound=Hashable) +MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping) # array-like @@ -507,3 +509,12 @@ def closed(self) -> bool: # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] + +# read_csv: usecols +UsecolsArgType = Union[ + SequenceNotStr[Hashable], + range, + AnyArrayLike, + Callable[[HashableT], bool], + None, +] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index d376fa4c1919e..51c9892b64a08 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -1,4 +1,6 @@ """ support numpy compatibility across versions """ +import warnings + import numpy as np from pandas.util.version import Version @@ -21,6 +23,27 @@ ) +np_long: type +np_ulong: type + +if _nlv >= Version("2.0.0.dev0"): + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*In the future `np\.long` will be defined as.*", + FutureWarning, + ) + np_long = np.long # type: ignore[attr-defined] + np_ulong = np.ulong # type: ignore[attr-defined] + except 
AttributeError: + np_long = np.int_ + np_ulong = np.uint +else: + np_long = np.int_ + np_ulong = np.uint + + __all__ = [ "np", "_np_version", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c952178f4c998..dd45969a13fd7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,7 +55,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( - ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -979,14 +978,16 @@ def value_counts_arraylike( def duplicated( - values: ArrayLike, keep: Literal["first", "last", False] = "first" + values: ArrayLike, + keep: Literal["first", "last", False] = "first", + mask: npt.NDArray[np.bool_] | None = None, ) -> npt.NDArray[np.bool_]: """ Return boolean ndarray denoting duplicate values. Parameters ---------- - values : nd.array, ExtensionArray or Series + values : np.ndarray or ExtensionArray Array over which to check for duplicate values. keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -994,21 +995,15 @@ def duplicated( - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. 
+ mask : ndarray[bool], optional + array indicating which elements to exclude from checking Returns ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - - if isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) - values = _ensure_data(values) - return htable.duplicated(values, keep=keep) + return htable.duplicated(values, keep=keep, mask=mask) def mode( @@ -1641,7 +1636,7 @@ def safe_sort( else: mask = None else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..60c42c01e9f6f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -30,8 +30,12 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.cast import can_hold_element +from pandas.core.dtypes.cast import ( + can_hold_element, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( + CategoricalDtype, is_array_like, is_bool_dtype, is_integer, @@ -42,6 +46,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + algorithms as algos, missing, roperator, ) @@ -627,7 +632,9 @@ def __setstate__(self, state) -> None: def _cmp_method(self, other, op): pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)): + if isinstance( + other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) + ) or isinstance(getattr(other, 
"dtype", None), CategoricalDtype): result = pc_func(self._pa_array, self._box_pa(other)) elif is_scalar(other): try: @@ -1289,6 +1296,30 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the ArrowExtensionArray of unique values. @@ -1599,13 +1630,21 @@ def _reduce( pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) if keepdims: - result = pa.array([pa_result.as_py()], type=pa_result.type) + if isinstance(pa_result, pa.Scalar): + result = pa.array([pa_result.as_py()], type=pa_result.type) + else: + result = pa.array( + [pa_result], + type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), + ) return type(self)(result) if pc.is_null(pa_result).as_py(): return self.dtype.na_value - else: + elif isinstance(pa_result, pa.Scalar): return pa_result.as_py() + else: + return pa_result def _explode(self): """ @@ -1708,7 +1747,7 @@ def __setitem__(self, key, value) -> None: data = pa.chunked_array([data]) self._pa_array = data - def _rank( + def _rank_calc( self, *, axis: AxisInt = 0, @@ -1717,9 +1756,6 @@ def _rank( ascending: bool = True, pct: bool = False, ): - """ - See Series.rank.__doc__. 
- """ if pa_version_under9p0 or axis != 0: ranked = super()._rank( axis=axis, @@ -1734,7 +1770,7 @@ def _rank( else: pa_type = pa.uint64() result = pa.array(ranked, type=pa_type, from_pandas=True) - return type(self)(result) + return result data = self._pa_array.combine_chunks() sort_keys = "ascending" if ascending else "descending" @@ -1773,7 +1809,29 @@ def _rank( divisor = pc.count(result) result = pc.divide(result, divisor) - return type(self)(result) + return result + + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. + """ + return type(self)( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: """ @@ -2467,7 +2525,7 @@ def _round_temporally( if offset is None: raise ValueError(f"Must specify a valid frequency: {freq}") pa_supported_unit = { - "A": "year", + "Y": "year", "AS": "year", "Q": "quarter", "QS": "quarter", @@ -2475,7 +2533,7 @@ def _round_temporally( "MS": "month", "W": "week", "D": "day", - "H": "hour", + "h": "hour", "min": "minute", "s": "second", "ms": "millisecond", diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 933944dbd4632..31c143ee012bb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -61,6 +61,7 @@ roperator, ) from pandas.core.algorithms import ( + duplicated, factorize_array, isin, map_array, @@ -125,6 +126,7 @@ class ExtensionArray: astype copy dropna + duplicated factorize fillna equals @@ -891,7 +893,6 @@ def interpolate( limit, limit_direction, limit_area, - fill_value, copy: bool, **kwargs, ) -> Self: @@ -1116,6 +1117,31 @@ def dropna(self) -> Self: # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] + def duplicated( + self, keep: Literal["first", 
"last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + mask = self.isna().astype(np.bool_, copy=False) + return duplicated(values=self, keep=keep, mask=mask) + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. @@ -1703,6 +1729,17 @@ def transpose(self, *axes: int) -> ExtensionArray: Because ExtensionArrays are always 1D, this is a no-op. It is included for compatibility with np.ndarray. + + Returns + ------- + ExtensionArray + + Examples + -------- + >>> pd.array([1, 2, 3]).transpose() + + [1, 2, 3] + Length: 3, dtype: Int64 """ return self[:] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8d2633c10b428..5059f5d000ccd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1819,7 +1819,7 @@ def _empty( # type: ignore[override] return arr._from_backing_data(backing) - def _internal_get_values(self): + def _internal_get_values(self) -> ArrayLike: """ Return the values. @@ -1827,15 +1827,19 @@ def _internal_get_values(self): Returns ------- - np.ndarray or Index - A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods. + np.ndarray or ExtensionArray + A numpy array or ExtensionArray of the same dtype as + categorical.categories.dtype. 
""" # if we are a datetime and period index, return Index to keep metadata if needs_i8_conversion(self.categories.dtype): - return self.categories.take(self._codes, fill_value=NaT) + return self.categories.take(self._codes, fill_value=NaT)._values elif is_integer_dtype(self.categories.dtype) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, fill_value=np.nan) + return ( + self.categories.astype("object") + .take(self._codes, fill_value=np.nan) + ._values + ) return np.array(self) def check_for_ordered(self, op) -> None: @@ -2147,21 +2151,6 @@ def _formatter(self, boxed: bool = False): # Defer to CategoricalFormatter's formatter. return None - def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: - """ - a short repr displaying only max_vals and an optional (but default - footer) - """ - num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - - result = f"{head[:-1]}, ..., {tail[1:]}" - if footer: - result = f"{result}\n{self._repr_footer()}" - - return str(result) - def _repr_categories(self) -> list[str]: """ return the base repr for the categories @@ -2178,11 +2167,11 @@ def _repr_categories(self) -> list[str]: ) if len(self.categories) > max_categories: num = max_categories // 2 - head = format_array(self.categories[:num]) - tail = format_array(self.categories[-num:]) + head = format_array(self.categories[:num]._values) + tail = format_array(self.categories[-num:]._values) category_strs = head + ["..."] + tail else: - category_strs = format_array(self.categories) + category_strs = format_array(self.categories._values) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] @@ -2217,33 +2206,49 @@ def _repr_categories_info(self) -> str: # replace to simple save space by return f"{levheader}[{levstring.replace(' < ... < ', ' ... 
')}]" - def _repr_footer(self) -> str: - info = self._repr_categories_info() - return f"Length: {len(self)}\n{info}" - - def _get_repr( - self, length: bool = True, na_rep: str = "NaN", footer: bool = True - ) -> str: + def _get_values_repr(self) -> str: from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter( - self, length=length, na_rep=na_rep, footer=footer + assert len(self) > 0 + + vals = self._internal_get_values() + fmt_values = fmt.format_array( + vals, + None, + float_format=None, + na_rep="NaN", + quoting=QUOTE_NONNUMERIC, ) - result = formatter.to_string() - return str(result) + + fmt_values = [i.strip() for i in fmt_values] + joined = ", ".join(fmt_values) + result = "[" + joined + "]" + return result def __repr__(self) -> str: """ String representation. """ - _maxlen = 10 - if len(self._codes) > _maxlen: - result = self._tidy_repr(_maxlen) - elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen) + footer = self._repr_categories_info() + length = len(self) + max_len = 10 + if length > max_len: + # In long cases we do not display all entries, so we add Length + # information to the __repr__. 
+ num = max_len // 2 + head = self[:num]._get_values_repr() + tail = self[-(max_len - num) :]._get_values_repr() + body = f"{head[:-1]}, ..., {tail[1:]}" + length_info = f"Length: {len(self)}" + result = f"{body}\n{length_info}\n{footer}" + elif length > 0: + body = self._get_values_repr() + result = f"{body}\n{footer}" else: - msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = f"[], {msg}" + # In the empty case we use a comma instead of newline to get + # a more compact __repr__ + body = "[]" + result = f"{body}, {footer}" return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 52596f29ffc0c..fd51303ebd55f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -41,6 +41,7 @@ iNaT, ints_to_pydatetime, ints_to_pytimedelta, + periods_per_day, to_offset, ) from pandas._libs.tslibs.fields import ( @@ -1825,14 +1826,14 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: dtype='datetime64[ns]', freq='min') """ -_round_example = """>>> rng.round('H') +_round_example = """>>> rng.round('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.round("H") + >>> pd.Series(rng).dt.round("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1843,23 +1844,23 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_floor_example = """>>> rng.floor('H') +_floor_example = """>>> 
rng.floor('h') DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.floor("H") + >>> pd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1870,23 +1871,23 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_ceil_example = """>>> rng.ceil('H') +_ceil_example = """>>> rng.ceil('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 13:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.ceil("H") + >>> pd.Series(rng).dt.ceil("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 13:00:00 @@ -1897,11 +1898,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.ceil("H", ambiguous=False) + >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.ceil("H", ambiguous=True) + >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ @@ -2312,6 +2313,30 @@ def interpolate( return self return type(self)._simple_new(out_data, dtype=self.dtype) + # -------------------------------------------------------------- + # Unsorted + + @property + def _is_dates_only(self) -> bool: + """ + Check if we are round times at midnight (and no timezone), which will 
+ be given a more compact __repr__ than other cases. For TimedeltaArray + we are checking for multiples of 24H. + """ + if not lib.is_np_dtype(self.dtype): + # i.e. we have a timezone + return False + + values_int = self.asi8 + consider_values = values_int != iNaT + reso = get_unit_from_dtype(self.dtype) + ppd = periods_per_day(reso) + + # TODO: can we reuse is_date_array_normalized? would need a skipna kwd + # (first attempt at this was less performant than this implementation) + even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 + return even_days + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b520f9f4a6deb..a2742aed31e4c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -735,12 +735,12 @@ def astype(self, dtype, copy: bool = True): def _format_native_types( self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: - from pandas.io.formats.format import get_format_datetime64_from_values - - fmt = get_format_datetime64_from_values(self, date_format) + if date_format is None and self._is_dates_only: + # Only dates and no timezone: provide a default format + date_format = "%Y-%m-%d" return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso + self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso ) # ----------------------------------------------------------------- @@ -857,37 +857,37 @@ def tz_convert(self, tz) -> Self: to other time zones: >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='H', periods=3, tz='Europe/Berlin') + ... 
freq='h', periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert('US/Central') DatetimeIndex(['2014-08-01 02:00:00-05:00', '2014-08-01 03:00:00-05:00', '2014-08-01 04:00:00-05:00'], - dtype='datetime64[ns, US/Central]', freq='H') + dtype='datetime64[ns, US/Central]', freq='h') With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H', + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', ... periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert(None) DatetimeIndex(['2014-08-01 07:00:00', '2014-08-01 08:00:00', '2014-08-01 09:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ tz = timezones.maybe_get_tz(tz) @@ -1042,7 +1042,7 @@ def tz_localize( 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] @@ -1132,13 +1132,13 @@ def normalize(self) -> Self: Examples -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H', + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', ... 
periods=3, tz='Asia/Calcutta') >>> idx DatetimeIndex(['2014-08-01 10:00:00+05:30', '2014-08-01 11:00:00+05:30', '2014-08-01 12:00:00+05:30'], - dtype='datetime64[ns, Asia/Calcutta]', freq='H') + dtype='datetime64[ns, Asia/Calcutta]', freq='h') >>> idx.normalize() DatetimeIndex(['2014-08-01 00:00:00+05:30', '2014-08-01 00:00:00+05:30', @@ -1276,7 +1276,7 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]: >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], dtype='datetime64[ns]', freq='ME') - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ values = self._local_timestamps() @@ -2042,7 +2042,7 @@ def isocalendar(self) -> DataFrame: >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y") >>> idx DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='A-DEC') + dtype='datetime64[ns]', freq='Y-DEC') >>> idx.is_leap_year array([ True, False, False]) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 58ade1ee935ec..4bc30d6dbd029 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -15,8 +15,6 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( VALID_CLOSED, @@ -1233,51 +1231,10 @@ def value_counts(self, dropna: bool = True) -> Series: # --------------------------------------------------------------------- # Rendering Methods - def _format_data(self) -> str: - # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = 
formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_str = ", ".join(head) - tail_str = ", ".join(tail) - summary = f"[{head_str} ... {tail_str}]" - else: - tail = [formatter(x) for x in self] - tail_str = ", ".join(tail) - summary = f"[{tail_str}]" - - return summary - - def __repr__(self) -> str: - # the short repr has no trailing newline, while the truncated - # repr does. So we include a newline in our template, and strip - # any trailing newlines from format_object_summary - data = self._format_data() - class_name = f"<{type(self).__name__}>\n" - - template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" - return template - - def _format_space(self) -> str: - space = " " * (len(type(self).__name__) + 1) - return f"\n{space}" + def _formatter(self, boxed: bool = False): + # returning 'str' here causes us to render as e.g. "(0, 1]" instead of + # "Interval(0, 1, closed='right')" + return str # --------------------------------------------------------------------- # Vectorized Interval Properties/Attributes diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9b85fb0477e6f..56d3711c7d13b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -952,6 +952,14 @@ def copy(self) -> Self: mask = self._mask.copy() return self._simple_new(data, mask) + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the BaseMaskedArray of unique values. 
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4532e5bffe7a9..f188b73b4fc64 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -477,7 +477,7 @@ def __arrow_array__(self, type=None): Examples -------- - >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='H') + >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') >>> idx.hour Index([10, 11], dtype='int64') """, @@ -548,7 +548,7 @@ def __arrow_array__(self, type=None): >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") >>> idx - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') >>> idx.dayofyear Index([365, 366, 365], dtype='int64') """, @@ -712,10 +712,10 @@ def asfreq(self, freq=None, how: str = "E") -> Self: Examples -------- - >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y') >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], - dtype='period[A-DEC]') + dtype='period[Y-DEC]') >>> pidx.asfreq('M') PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', @@ -1025,18 +1025,18 @@ def period_array( Examples -------- - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A')]) + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y')]) ['2017', '2018'] - Length: 2, dtype: period[A-DEC] + Length: 2, dtype: period[Y-DEC] - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A'), + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y'), ... 
pd.NaT]) ['2017', '2018', 'NaT'] - Length: 3, dtype: period[A-DEC] + Length: 3, dtype: period[Y-DEC] Integers that look like years are handled diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4d5eef960293f..cf349220e4ba7 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -28,6 +28,7 @@ from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, @@ -830,6 +831,14 @@ def _first_fill_value_loc(self): diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: uniques = algos.unique(self.sp_values) if len(self.sp_values) != len(self): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6262055827428..24b99b5d4852e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -53,6 +53,7 @@ from collections.abc import Sequence from pandas._typing import ( + AxisInt, Dtype, Scalar, npt, @@ -501,6 +502,28 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _convert_int_dtype(self, result): return Int64Dtype().__from_arrow__(result) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + """ + See Series.rank.__doc__. 
+ """ + return self._convert_int_dtype( + self._rank_calc( + axis=axis, + method=method, + na_option=na_option, + ascending=ascending, + pct=pct, + ) + ) + class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" @@ -584,7 +607,10 @@ def _str_map( return lib.map_infer_mask(arr, f, mask.view("uint8")) def _convert_int_dtype(self, result): - result = result.to_numpy() + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result @@ -605,11 +631,18 @@ def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): if name in ["any", "all"]: - arr = pc.and_kleene( - pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") - ) + if not skipna and name == "all": + nas = pc.invert(pc.is_null(self._pa_array)) + arr = pc.and_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") return ArrowExtensionArray(arr)._reduce( name, skipna=skipna, keepdims=keepdims, **kwargs ) else: return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) + + def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: + if item is np.nan: + item = libmissing.NA + return super().insert(loc, item) # type: ignore[return-value] diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b7b81b8271106..ca908c11a97bb 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -136,7 +136,7 @@ class TimedeltaArray(dtl.TimelikeOps): Examples -------- - >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1H', '2H'])) + >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1h', '2h'])) ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -471,7 +471,7 @@ def _format_native_types( from pandas.io.formats.format import get_format_timedelta64 # Relies on TimeDelta._repr_base - 
formatter = get_format_timedelta64(self._ndarray, na_rep) + formatter = get_format_timedelta64(self, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension return np.frompyfunc(formatter, 1, 1)(self._ndarray) diff --git a/pandas/core/base.py b/pandas/core/base.py index 3026189e747bb..d4421560bcea7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"): @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: - return algorithms.duplicated(self._values, keep=keep) + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..b4b9a4176472d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -203,7 +203,7 @@ def array( ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] - >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') + >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]') ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -562,7 +562,12 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 6567ca7155b0d..6b00a5284ec5b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -242,7 +242,7 @@ def construct_from_string(cls, string: str) -> Self: 
This is useful mainly for data types that accept parameters. For example, a period dtype accepts a frequency parameter that - can be set as ``period[H]`` (where H means hourly frequency). + can be set as ``period[h]`` (where h means hourly frequency). By default, in the abstract class, just the name of the type is expected. But subclasses can overwrite this method to accept diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 74e785be06356..3208a742738a3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1133,7 +1133,16 @@ def convert_dtypes( base_dtype = np.dtype(str) else: base_dtype = inferred_dtype - pa_type = to_pyarrow_type(base_dtype) + if ( + base_dtype.kind == "O" # type: ignore[union-attr] + and len(input_array) > 0 + and isna(input_array).all() + ): + import pyarrow as pa + + pa_type = pa.null() + else: + pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 9da4eac6a42c8..3d12e334e7c0f 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -402,7 +402,7 @@ def is_period_dtype(arr_or_dtype) -> bool: False >>> is_period_dtype(pd.Period("2017-01-01")) False - >>> is_period_dtype(pd.PeriodIndex([], freq="A")) + >>> is_period_dtype(pd.PeriodIndex([], freq="Y")) True """ warnings.warn( @@ -1664,9 +1664,12 @@ def is_all_strings(value: ArrayLike) -> bool: dtype = value.dtype if isinstance(dtype, np.dtype): - return dtype == np.dtype("object") and lib.is_string_array( - np.asarray(value), skipna=False - ) + if len(value) == 0: + return dtype == np.dtype("object") + else: + return dtype == np.dtype("object") and lib.is_string_array( + np.asarray(value), skipna=False + ) elif isinstance(dtype, CategoricalDtype): return dtype.categories.inferred_type == "string" return dtype == "string" 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a16597221ac92..09c43822e11e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -230,6 +230,7 @@ Level, MergeHow, MergeValidate, + MutableMappingT, NaAction, NaPosition, NsmallestNlargestKeep, @@ -1322,6 +1323,25 @@ def to_string( line_width=line_width, ) + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> Self: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + return self._constructor_from_mgr(mgr, axes=mgr.axes) + # ---------------------------------------------------------------------- @property @@ -1927,6 +1947,27 @@ def _create_data_for_split_and_tight_to_dict( def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: + ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: + ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, into: type[dict] = ..., index: bool = ..., ) -> dict: @@ -1936,11 +1977,14 @@ def to_dict( def to_dict( self, orient: Literal["records"], + *, into: type[dict] = ..., index: bool = ..., ) -> list[dict]: ... 
+ # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") @deprecate_nonkeyword_arguments( version="3.0", allowed_args=["self", "orient"], name="to_dict" ) @@ -1949,9 +1993,10 @@ def to_dict( orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, - ) -> dict | list[dict]: + ) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. @@ -1979,7 +2024,7 @@ def to_dict( 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -1993,9 +2038,10 @@ def to_dict( Returns ------- - dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + dict, list or collections.abc.MutableMapping + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` + parameter. See Also -------- @@ -2054,7 +2100,7 @@ def to_dict( """ from pandas.core.methods.to_dict import to_dict - return to_dict(self, orient, into, index) + return to_dict(self, orient, into=into, index=index) @deprecate_nonkeyword_arguments( version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" @@ -3239,7 +3285,7 @@ def to_html( ... 
''' >>> assert html_string == df.to_html() """ - if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: + if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") formatter = fmt.DataFrameFormatter( @@ -5122,12 +5168,7 @@ def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: @property def _series(self): - return { - item: Series( - self._mgr.iget(idx), index=self.index, name=item, fastpath=True - ) - for idx, item in enumerate(self.columns) - } + return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)} # ---------------------------------------------------------------------- # Reindexing and alignment @@ -12117,7 +12158,7 @@ def to_period( For the yearly frequency >>> idx.to_period("Y") - PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') + PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') """ new_obj = self.copy(deep=copy and not using_copy_on_write()) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 738f4cbe6bc43..a7183a9d9498a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +from copy import deepcopy import datetime as dt from functools import partial import gc @@ -368,6 +369,13 @@ def attrs(self) -> dict[Hashable, Any]: -------- DataFrame.flags : Global flags applying to this object. + Notes + ----- + Many operations that create new datasets will copy ``attrs``. Copies + are always deep so that changing ``attrs`` will only affect the + present dataset. ``pandas.concat`` copies ``attrs`` only if all input + datasets have the same ``attrs``. + Examples -------- For Series: @@ -6191,8 +6199,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: stable across pandas releases. 
""" if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # We want attrs propagation to have minimal performance + # impact if attrs are not used; i.e. attrs is an empty dict. + # One could make the deepcopy unconditionally, but a deepcopy + # of an empty dict is 50x more expensive than the empty check. + self.attrs = deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. @@ -6201,11 +6213,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs @@ -9025,7 +9039,7 @@ def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self: Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='12H') + >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A @@ -9173,11 +9187,11 @@ def resample( Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'ME', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. 
label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' - for all frequency offsets except for 'ME', 'A', 'Q', 'BM', + for all frequency offsets except for 'ME', 'Y', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or @@ -9207,6 +9221,10 @@ def resample( .. versionadded:: 1.3.0 + .. note:: + + Only takes effect for Tick-frequencies (i.e. fixed frequencies like + days, hours, and minutes, rather than months or quarters). offset : Timedelta or str, default is None An offset timedelta added to the origin. @@ -9348,12 +9366,12 @@ def resample( assigned to the first quarter of the period. >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', - ... freq='A', + ... freq='Y', ... periods=2)) >>> s 2012 1 2013 2 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 >>> s.resample('Q', convention='start').asfreq() 2012Q1 1.0 2012Q2 NaN @@ -9477,12 +9495,12 @@ def resample( 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.resample('17W', origin='2000-01-01').sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.resample('17min', origin='2000-01-01').sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: @@ -9684,7 +9702,7 @@ def last(self, offset) -> Self: Get the rows for the last 3 days: - >>> ts.last('3D') # doctest: +SKIP + >>> ts.last('3D') # doctest: +SKIP A 2018-04-13 3 2018-04-15 4 @@ -11194,7 +11212,7 @@ def tz_convert( Pass None to convert to UTC and get a tz-naive index: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... 
index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_convert(None) 2018-09-14 23:30:00 1 dtype: int64 @@ -11312,7 +11330,7 @@ def tz_localize( Pass None to convert to tz-naive index and preserve local time: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_localize(None) 2018-09-15 01:30:00 1 dtype: int64 @@ -11366,7 +11384,7 @@ def tz_localize( 2015-03-29 01:59:59.999999999+01:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 2015-03-29 03:30:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 @@ -11555,10 +11573,10 @@ def describe( Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']), + >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), ... 'numeric': [1, 2, 3], ... 'object': ['a', 'b', 'c'] - ... }) + ... }) >>> df.describe() numeric count 3.0 @@ -11896,7 +11914,7 @@ def _logical_func( def any( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -12674,9 +12692,9 @@ def last_valid_index(self) -> Hashable | None: Examples -------- >>> df = pd.DataFrame({'person_id': [0, 1, 2, 3], -... 'age': [21, 25, 62, 43], -... 'height': [1.61, 1.87, 1.49, 2.01]} -... ).set_index('person_id') +... 'age': [21, 25, 62, 43], +... 'height': [1.61, 1.87, 1.49, 2.01]} +... ).set_index('person_id') >>> df age height person_id @@ -13502,7 +13520,7 @@ def make_doc(name: str, ndim: int) -> str: With a DataFrame >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]}, - ... index=['tiger', 'zebra', 'cow']) + ... index=['tiger', 'zebra', 'cow']) >>> df a b c tiger 1 2 1 @@ -13526,7 +13544,7 @@ def make_doc(name: str, ndim: int) -> str: getting an error. 
>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']}, - ... index=['tiger', 'zebra', 'cow']) + ... index=['tiger', 'zebra', 'cow']) >>> df.skew(numeric_only=True) a 0.0 dtype: float64""" diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dbabd04a87c36..a2f556eba08a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -158,7 +158,7 @@ def _wrap_agged_manager(self, mgr: Manager) -> Series: def _get_data_to_aggregate( self, *, numeric_only: bool = False, name: str | None = None ) -> SingleManager: - ser = self._selected_obj + ser = self._obj_with_exclusions single = ser._mgr if numeric_only and not is_numeric_dtype(ser.dtype): # GH#41291 match Series behavior @@ -448,7 +448,7 @@ def _aggregate_named(self, func, *args, **kwargs): initialized = False for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, axis=self.axis ): # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations object.__setattr__(group, "name", name) @@ -519,7 +519,7 @@ def _cython_transform( ): assert axis == 0 # handled by caller - obj = self._selected_obj + obj = self._obj_with_exclusions try: result = self.grouper._cython_operation( @@ -546,7 +546,7 @@ def _transform_general( results = [] for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, axis=self.axis ): # this setattr is needed for test_transform_lambda_with_datetimetz object.__setattr__(group, "name", name) @@ -618,7 +618,7 @@ def true_and_notna(x) -> bool: indices = [ self._get_index(name) for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + self._obj_with_exclusions, axis=self.axis ) if true_and_notna(group) ] @@ -1164,7 +1164,7 @@ def nlargest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nlargest, n=n, keep=keep) - data = self._selected_obj + data = 
self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. result = self._python_apply_general(f, data, not_indexed_same=True) @@ -1175,7 +1175,7 @@ def nsmallest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nsmallest, n=n, keep=keep) - data = self._selected_obj + data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. result = self._python_apply_general(f, data, not_indexed_same=True) @@ -1185,15 +1185,13 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) @doc(Series.corr.__doc__) def corr( @@ -2187,22 +2185,9 @@ def idxmax( Beef co2_emissions dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmax") - else: - axis = self.axis - - def func(df): - return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmax" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result def idxmin( self, @@ -2282,22 +2267,9 @@ def 
idxmin( Beef consumption dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmin") - else: - axis = self.axis - - def func(df): - return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmin" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a022bfd1bd9bc..e33c4b3579c69 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2015,10 +2015,14 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with com.temp_setattr(self, "as_index", True): # GH#49834 - result needs groups in the index for # _wrap_transform_fast_result - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) return self._wrap_transform_fast_result(result) @@ -5720,6 +5724,113 @@ def sample( sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ): + """Compute idxmax/idxmin. 
+ + Parameters + ---------- + how: {"idxmin", "idxmax"} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + else: + raise_err = False + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. 
" + "Specify observed=True in groupby instead." + ) + + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) + else: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) from None + raise + + result = result.astype(self.obj.index.dtype) if result.empty else result + + if not skipna: + has_na_value = result.isnull().any(axis=None) + if has_na_value: + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + @doc(GroupBy) def get_groupby( diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c51c17e04796a..06e6755079a22 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -207,12 +207,12 @@ class Grouper: 2000-10-02 00:26:00 24 Freq: 17min, dtype: int64 - >>> ts.groupby(pd.Grouper(freq='17W', origin='2000-01-01')).sum() - 2000-01-02 0 - 2000-04-30 0 - 2000-08-27 0 - 2000-12-24 108 - Freq: 17W-SUN, dtype: int64 + >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() + 2000-10-01 23:24:00 3 + 2000-10-01 23:41:00 15 + 2000-10-01 23:58:00 45 + 2000-10-02 00:15:00 45 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 
af8fa441f8b3f..d90de383adb48 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -544,7 +544,7 @@ class PeriodProperties(Properties): 1 2000-01-01 01:00 2 2000-01-01 02:00 3 2000-01-01 03:00 - dtype: period[H] + dtype: period[h] >>> hours_series.dt.hour 0 0 1 1 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4c86f7fc04877..80d3a6c6cbd94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1284,25 +1284,16 @@ def __repr__(self) -> str_t: klass_name = type(self).__name__ data = self._format_data() attrs = self._format_attrs() - space = self._format_space() attrs_str = [f"{k}={v}" for k, v in attrs] - prepr = f",{space}".join(attrs_str) + prepr = ", ".join(attrs_str) # no data provided, just attributes if data is None: + # i.e. RangeIndex data = "" return f"{klass_name}({data}{prepr})" - def _format_space(self) -> str_t: - # using space here controls if the attributes - # are line separated or not (the default) - - # max_seq_items = get_option('display.max_seq_items') - # if len(self) > max_seq_items: - # space = "\n%s" % (' ' * (len(klass) + 1)) - return " " - @property def _formatter_func(self): """ @@ -1319,7 +1310,7 @@ def _format_data(self, name=None) -> str_t: if self.inferred_type == "string": is_justify = False - elif self.inferred_type == "categorical": + elif isinstance(self.dtype, CategoricalDtype): self = cast("CategoricalIndex", self) if is_object_dtype(self.categories.dtype): is_justify = False @@ -1390,9 +1381,9 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep) + return self._format_with_header(header=header, na_rep=na_rep) - def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]: + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: from pandas.io.formats.format import format_array values = self._values @@ -3501,7 
+3492,7 @@ def _intersection(self, other: Index, sort: bool = False): pass else: # TODO: algos.unique1d should preserve DTA/TDA - if is_numeric_dtype(self): + if is_numeric_dtype(self.dtype): # This is faster, because Index.unique() checks for uniqueness # before calculating the unique values. res = algos.unique1d(res_indexer) @@ -4011,8 +4002,8 @@ def _get_fill_indexer( self, target: Index, method: str_t, limit: int | None = None, tolerance=None ) -> npt.NDArray[np.intp]: if self._is_multi: - # TODO: get_indexer_with_fill docstring says values must be _sorted_ - # but that doesn't appear to be enforced + if not (self.is_monotonic_increasing or self.is_monotonic_decreasing): + raise ValueError("index must be monotonic increasing or decreasing") # error: "IndexEngine" has no attribute "get_indexer_with_fill" engine = self._engine with warnings.catch_warnings(): @@ -4725,6 +4716,13 @@ def _join_multi(self, other: Index, how: JoinHow): multi_join_idx = multi_join_idx.remove_unused_levels() + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + return multi_join_idx, lidx, ridx jl = next(iter(overlap)) @@ -5013,7 +5011,10 @@ def _can_use_libjoin(self) -> bool: ) # Exclude index types where the conversion to numpy converts to object dtype, # which negates the performance benefit of libjoin - # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone + # Subclasses should override to return False if _get_join_target is + # not zero-copy. + # TODO: exclude RangeIndex (which allocates memory)? + # Doing so seems to break test_concat_datetime_timezone return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- @@ -6169,8 +6170,8 @@ def _get_indexer_non_comparable( If doing an inequality check, i.e. 
method is not None. """ if method is not None: - other = _unpack_nested_dtype(target) - raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") + other_dtype = _unpack_nested_dtype(target) + raise TypeError(f"Cannot compare dtypes {self.dtype} and {other_dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) if unique: @@ -6281,8 +6282,7 @@ def _should_compare(self, other: Index) -> bool: # respectively. return False - other = _unpack_nested_dtype(other) - dtype = other.dtype + dtype = _unpack_nested_dtype(other) return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @@ -7596,7 +7596,7 @@ def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]: return names -def _unpack_nested_dtype(other: Index) -> Index: +def _unpack_nested_dtype(other: Index) -> DtypeObj: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. @@ -7607,20 +7607,20 @@ def _unpack_nested_dtype(other: Index) -> Index: Returns ------- - Index + np.dtype or ExtensionDtype """ dtype = other.dtype if isinstance(dtype, CategoricalDtype): # If there is ever a SparseIndex, this could get dispatched # here too. 
- return dtype.categories + return dtype.categories.dtype elif isinstance(dtype, ArrowDtype): # GH 53617 import pyarrow as pa if pa.types.is_dictionary(dtype.pyarrow_dtype): - other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) - return other + other = other[:0].astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) + return other.dtype def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index bc6fb61700aec..9cf7e861584d9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -356,7 +356,7 @@ def _format_attrs(self): extra = super()._format_attrs() return attrs + extra - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: result = [ pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep for x in self._values diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1b08596c99591..94ad556219b35 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -209,10 +209,12 @@ def format( if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header( + header=header, na_rep=na_rep, date_format=date_format + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, *, header: list[str], na_rep: str, date_format: str | None = None ) -> list[str]: # matches base class except for whitespace padding and date_format return header + list( diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ae0feba1f9bcf..f3b2a35f379f4 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -266,6 +266,7 @@ def _engine_type(self) -> 
type[libindex.DatetimeEngine]: return libindex.DatetimeEngine _data: DatetimeArray + _values: DatetimeArray tz: dt.tzinfo | None # -------------------------------------------------------------------- @@ -393,19 +394,12 @@ def _is_dates_only(self) -> bool: ------- bool """ - - from pandas.io.formats.format import is_dates_only - delta = getattr(self.freq, "delta", None) if delta and delta % dt.timedelta(days=1) != dt.timedelta(days=0): return False - # error: Argument 1 to "is_dates_only" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "Union[ndarray, - # DatetimeArray, Index, DatetimeIndex]" - - return self.tz is None and is_dates_only(self._values) # type: ignore[arg-type] + return self._values._is_dates_only def __reduce__(self): d = {"data": self._data, "name": self.name} @@ -428,7 +422,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 - formatter = get_format_datetime64(is_dates_only_=self._is_dates_only) + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: f"'{formatter(x)}'" # -------------------------------------------------------------------- @@ -785,11 +779,11 @@ def indexer_between_time( Examples -------- - >>> idx = pd.date_range("2023-01-01", periods=4, freq="H") + >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") >>> idx DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', '2023-01-01 02:00:00', '2023-01-01 03:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') >>> idx.indexer_between_time("00:00", "2:00", include_end=False) array([0, 1]) """ @@ -854,7 +848,7 @@ def date_range( periods : int, optional Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. See + Frequency strings can have multiples, e.g. '5h'. See :ref:`here ` for a list of frequency aliases. 
tz : str or tzinfo, optional @@ -1046,7 +1040,7 @@ def bdate_range( periods : int, default None Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'B' - Frequency strings can have multiples, e.g. '5H'. The default is + Frequency strings can have multiples, e.g. '5h'. The default is business daily ('B'). tz : str or None Time zone name for returning localized DatetimeIndex, for example diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index eb8d25bcea592..209ac84869e85 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -844,23 +844,11 @@ def length(self) -> Index: # -------------------------------------------------------------------- # Rendering Methods - # __repr__ associated methods are based on MultiIndex - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # matches base class except for whitespace padding return header + list(self._format_native_types(na_rep=na_rep)) - def _format_native_types( - self, *, na_rep: str = "NaN", quoting=None, **kwargs - ) -> npt.NDArray[np.object_]: - # GH 28210: use base method but with different default na_rep - return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - - def _format_data(self, name=None) -> str: - # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical - return f"{self._data._format_data()},{self._format_space()}" - # -------------------------------------------------------------------- # Set Operations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 144045d40a086..c5cab225fa7b1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -73,9 +73,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, ABCSeries, - ABCTimedeltaIndex, ) 
from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import ( @@ -109,7 +107,10 @@ lexsort_indexer, ) -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + get_adjustment, + pprint_thing, +) if TYPE_CHECKING: from pandas import ( @@ -768,8 +769,8 @@ def _values(self) -> np.ndarray: vals = cast("CategoricalIndex", vals) vals = vals._data._internal_get_values() - if isinstance(vals.dtype, ExtensionDtype) or isinstance( - vals, (ABCDatetimeIndex, ABCTimedeltaIndex) + if isinstance(vals.dtype, ExtensionDtype) or lib.is_np_dtype( + vals.dtype, "mM" ): vals = vals.astype(object) @@ -1439,8 +1440,6 @@ def format( ) if adjoin: - from pandas.io.formats.format import get_adjustment - adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: @@ -1948,7 +1947,7 @@ def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIn # indexer to reorder the level codes indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) - level_codes = algos.take_nd(ri, level_codes) + level_codes = algos.take_nd(ri, level_codes, fill_value=-1) new_levels.append(lev) new_codes.append(level_codes) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index aeb7bb1813fef..bb2ccfe930ab8 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -266,7 +266,7 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # Equivalent to Index implementation, but faster if not len(self._range): return header diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index cd6a4883946d2..498fe56a7ae7f 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -278,7 +278,7 @@ 
def timedelta_range( periods : int, default None Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. + Frequency strings can have multiples, e.g. '5h'. name : str, default None Name of the resulting TimedeltaIndex. closed : str, default None @@ -320,10 +320,10 @@ def timedelta_range( Only fixed frequencies can be passed, non-fixed frequencies such as 'M' (month end) will raise. - >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H') + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', '1 days 18:00:00', '2 days 00:00:00'], - dtype='timedelta64[ns]', freq='6H') + dtype='timedelta64[ns]', freq='6h') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). @@ -336,8 +336,7 @@ def timedelta_range( **Specify a unit** >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s") - TimedeltaIndex(['1 days 00:00:00', '100001 days 00:00:00', - '200001 days 00:00:00'], + TimedeltaIndex(['1 days', '100001 days', '200001 days'], dtype='timedelta64[s]', freq='100000D') """ if freq is None and com.any_none(periods, start, end): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 693248b265eb5..9199a45c21180 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -187,7 +187,7 @@ def iloc(self) -> _iLocIndexer: -------- >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] >>> df = pd.DataFrame(mydict) >>> df a b c d @@ -328,7 +328,7 @@ def loc(self) -> _LocIndexer: DataFrame.at : Access a single value for a row/column label pair. DataFrame.iloc : Access group of rows and columns by integer position(s). 
DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. + Series/DataFrame. Series.loc : Access group of values using labels. Examples @@ -336,8 +336,8 @@ def loc(self) -> _LocIndexer: **Getting values** >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) >>> df max_speed shield cobra 1 2 @@ -380,8 +380,8 @@ def loc(self) -> _LocIndexer: Alignable boolean Series: >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] - max_speed shield + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield sidewinder 7 8 Index (same behavior as ``df.reindex``) @@ -407,7 +407,7 @@ def loc(self) -> _LocIndexer: Multiple conditional using ``&`` that returns a boolean Series >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] - max_speed shield + max_speed shield viper 4 5 Multiple conditional using ``|`` that returns a boolean Series @@ -496,7 +496,7 @@ def loc(self) -> _LocIndexer: Another example using integers for the index >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) + ... index=[7, 8, 9], columns=['max_speed', 'shield']) >>> df max_speed shield 7 1 2 @@ -517,13 +517,13 @@ def loc(self) -> _LocIndexer: A number of examples using a DataFrame with a MultiIndex >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') ... ] >>> index = pd.MultiIndex.from_tuples(tuples) >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] + ... 
[1, 4], [7, 1], [16, 36]] >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) >>> df max_speed shield diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b4e3fdb78b77b..99af4f51661b1 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -79,9 +79,9 @@ ensure_block_shape, external_values, extract_pandas_array, + get_values_for_csv, maybe_coerce_values, new_block, - to_native_types, ) from pandas.core.internals.managers import make_na_array @@ -343,8 +343,17 @@ def _convert(arr): return self.apply(_convert) - def to_native_types(self, **kwargs) -> Self: - return self.apply(to_native_types, **kwargs) + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + return self.apply( + get_values_for_csv, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 30f6507d02484..f0c14eec81c3c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -692,9 +692,18 @@ def astype( return newb @final - def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Block: """convert to our native types format""" - result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs) + result = get_values_for_csv( + self.values, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) return self.make_block(result) @final @@ -2593,14 +2602,14 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def to_native_types( +def get_values_for_csv( values: ArrayLike, 
*, + date_format, na_rep: str = "nan", quoting=None, float_format=None, decimal: str = ".", - **kwargs, ) -> npt.NDArray[np.object_]: """convert to our native types format""" if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": @@ -2615,14 +2624,16 @@ def to_native_types( if isinstance(values, (DatetimeArray, TimedeltaArray)): if values.ndim == 1: - result = values._format_native_types(na_rep=na_rep, **kwargs) + result = values._format_native_types(na_rep=na_rep, date_format=date_format) result = result.astype(object, copy=False) return result # GH#21734 Process every column separately, they might have different formats results_converted = [] for i in range(len(values)): - result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs) + result = values[i, :]._format_native_types( + na_rep=na_rep, date_format=date_format + ) results_converted.append(result.astype(object, copy=False)) return np.vstack(results_converted) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6f30bc650aa36..d6aeda3d418ed 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -383,7 +383,7 @@ def ndarray_to_mgr( new_block( dtype.construct_array_type()._from_sequence(data, dtype=dtype), BlockPlacement(slice(i, i + 1)), - ndim=1, + ndim=2, ) for i, data in enumerate(obj_columns) ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b1db2d2e708e8..86cef032ec6e4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -430,12 +430,21 @@ def convert(self, copy: bool | None) -> Self: return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) - def to_native_types(self, **kwargs) -> Self: + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: """ Convert values to native types (strings / python objects) that are used in formatting 
(repr / csv). """ - return self.apply("to_native_types", **kwargs) + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index f4e0dcddcd34a..3295c4741c03d 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Literal, + overload, ) import warnings @@ -16,17 +17,66 @@ from pandas.core import common as com if TYPE_CHECKING: + from pandas._typing import MutableMappingT + from pandas import DataFrame +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> MutableMappingT: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> list[MutableMappingT]: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., +) -> dict: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., +) -> list[dict]: + ... + + +# error: Incompatible default for argument "into" (default has type "type[dict +# [Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") def to_dict( df: DataFrame, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, -) -> dict | list[dict]: +) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. 
@@ -54,7 +104,7 @@ def to_dict( 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -69,8 +119,8 @@ def to_dict( Returns ------- dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` parameter. """ if not df.columns.is_unique: warnings.warn( @@ -103,7 +153,7 @@ def to_dict( are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) if orient == "dict": - return into_c((k, v.to_dict(into)) for k, v in df.items()) + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) elif orient == "list": object_dtype_indices_as_set: set[int] = set(box_native_indices) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e9b2bacd9e1df..59e6a20915c18 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -376,13 +376,13 @@ def transform(self, arg, *args, **kwargs): >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> resampled = s.resample('15min') >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) 2018-01-01 00:00:00 NaN 2018-01-01 01:00:00 NaN - Freq: H, dtype: float64 + Freq: h, dtype: float64 """ return self._selected_obj.groupby(self._timegrouper).transform( arg, *args, **kwargs @@ -612,7 +612,7 @@ def nearest(self, limit: int | None = None): >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('15min').nearest() 2018-01-01 00:00:00 1 @@ 
-681,7 +681,7 @@ def bfill(self, limit: int | None = None): 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('30min').bfill() 2018-01-01 00:00:00 1 @@ -792,7 +792,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 Without filling the missing values you get: @@ -848,7 +848,7 @@ def fillna(self, method, limit: int | None = None): 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN 2018-01-01 02:00:00 3.0 - Freq: H, dtype: float64 + Freq: h, dtype: float64 >>> sm.resample('30min').fillna('backfill') 2018-01-01 00:00:00 1.0 @@ -986,7 +986,7 @@ def interpolate( downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - .. deprecated::2.1.0 + .. deprecated:: 2.1.0 ``**kwargs`` : optional Keyword arguments to pass on to the interpolating function. @@ -2101,7 +2101,7 @@ def __init__( else: freq = to_offset(freq) - end_types = {"ME", "A", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "Y", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2301,7 +2301,7 @@ def _adjust_bin_edges( "BQ", "BA", "Q", - "A", + "Y", "W", ): # If the right end-point is on the last day of the month, roll forwards @@ -2339,7 +2339,7 @@ def _get_time_delta_bins(self, ax: TimedeltaIndex): # GH#51896 raise ValueError( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - f"e.g. '24H' or '3D', not {self.freq}" + f"e.g. 
'24h' or '3D', not {self.freq}" ) if not len(ax): @@ -2528,24 +2528,16 @@ def _get_timestamp_range_edges( """ if isinstance(freq, Tick): index_tz = first.tz - - if isinstance(origin, Timestamp) and origin.tz != index_tz: + if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None): raise ValueError("The origin must have the same timezone as the index.") - - elif isinstance(origin, Timestamp): - if origin <= first: - first = origin - elif origin >= last: - last = origin - if origin == "epoch": # set the epoch based on the timezone to have similar bins results when # resampling on the same kind of indexes on different timezones origin = Timestamp("1970-01-01", tz=index_tz) if isinstance(freq, Day): - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). + # _adjust_dates_anchored assumes 'D' means 24h, but first/last + # might contain a DST transition (23h, 24h, or 25h). # So "pretend" the dates are naive when adjusting the endpoints first = first.tz_localize(None) last = last.tz_localize(None) @@ -2559,9 +2551,6 @@ def _get_timestamp_range_edges( first = first.tz_localize(index_tz) last = last.tz_localize(index_tz) else: - if isinstance(origin, Timestamp): - first = origin - first = first.normalize() last = last.normalize() diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 74e6a6a28ccb0..387d43f47fe9b 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -134,7 +134,9 @@ def melt( mcolumns = id_vars + var_name + [value_name] - if frame.shape[1] > 0: + if frame.shape[1] > 0 and not any( + not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes + ): mdata[value_name] = concat( [frame.iloc[:, i] for i in range(frame.shape[1])] ).values diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4b9fcc80af4bb..ba6579a739f54 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ 
-2443,6 +2443,8 @@ def _factorize_keys( .astype(np.intp, copy=False), len(dc.dictionary), ) + if dc.null_count > 0: + count += 1 if how == "right": return rlab, llab, count return llab, rlab, count diff --git a/pandas/core/series.py b/pandas/core/series.py index fd50a85f3c2e3..c2eea371ddef3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -48,6 +48,7 @@ from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import find_stack_level @@ -167,6 +168,7 @@ IndexKeyFunc, IndexLabel, Level, + MutableMappingT, NaPosition, NumpySorter, NumpyValueArrayLike, @@ -372,8 +374,18 @@ def __init__( dtype: Dtype | None = None, name=None, copy: bool | None = None, - fastpath: bool = False, + fastpath: bool | lib.NoDefault = lib.no_default, ) -> None: + if fastpath is not lib.no_default: + warnings.warn( + "The 'fastpath' keyword in pd.Series is deprecated and will " + "be removed in a future version.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + fastpath = False + if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None @@ -1007,7 +1019,8 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Series: # axis kwarg is retained for compat with NDFrame method # _slice is *always* positional mgr = self._mgr.get_slice(slobj, axis=axis) - out = self._constructor(mgr, fastpath=True) + out = self._constructor_from_mgr(mgr, axes=mgr.axes) + out._name = self._name return out.__finalize__(self) def __getitem__(self, key): @@ -1922,21 +1935,40 @@ def keys(self) -> Index: """ return self.index - def to_dict(self, into: type[dict] = dict) -> dict: + @overload + def to_dict( + self, *, into: type[MutableMappingT] | MutableMappingT + ) -> MutableMappingT: + ... + + @overload + def to_dict(self, *, into: type[dict] = ...) -> dict: + ... 
+ + # error: Incompatible default for argument "into" (default has type "type[ + # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_dict" + ) + def to_dict( + self, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] + ) -> MutableMappingT: """ Convert Series to {label -> value} dict or dict-like object. Parameters ---------- into : class, default dict - The collections.abc.Mapping subclass to use as the return - object. Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. + The collections.abc.MutableMapping subclass to use as the return + object. Can be the actual class or an empty instance of the mapping + type you want. If you want a collections.defaultdict, you must + pass it initialized. Returns ------- - collections.abc.Mapping + collections.abc.MutableMapping Key-value representation of Series. Examples @@ -1945,10 +1977,10 @@ def to_dict(self, into: type[dict] = dict) -> dict: >>> s.to_dict() {0: 1, 1: 2, 2: 3, 3: 4} >>> from collections import OrderedDict, defaultdict - >>> s.to_dict(OrderedDict) + >>> s.to_dict(into=OrderedDict) OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) >>> dd = defaultdict(list) - >>> s.to_dict(dd) + >>> s.to_dict(into=dd) defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) """ # GH16122 @@ -3983,7 +4015,8 @@ def argsort( if mask.any(): # TODO(3.0): once this deprecation is enforced we can call - # self.array.argsort directly, which will close GH#43840 + # self.array.argsort directly, which will close GH#43840 and + # GH#12694 warnings.warn( "The behavior of Series.argsort in the presence of NA values is " "deprecated. 
In a future version, NA values will be ordered " @@ -4983,8 +5016,44 @@ def reindex( # type: ignore[override] tolerance=tolerance, ) + @overload # type: ignore[override] + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[True], + ) -> None: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[False] = ..., + ) -> Self: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: bool = ..., + ) -> Self | None: + ... + @doc(NDFrame.rename_axis) - def rename_axis( # type: ignore[override] + def rename_axis( self, mapper: IndexLabel | lib.NoDefault = lib.no_default, *, @@ -5165,7 +5234,7 @@ def pop(self, item: Hashable) -> Any: Examples -------- - >>> ser = pd.Series([1,2,3]) + >>> ser = pd.Series([1, 2, 3]) >>> ser.pop(0) 1 @@ -5651,7 +5720,7 @@ def to_timestamp( 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 The resulting frequency of the Timestamps is `YearBegin` @@ -5670,7 +5739,7 @@ def to_timestamp( 2023-01-31 1 2024-01-31 2 2025-01-31 3 - Freq: A-JAN, dtype: int64 + Freq: Y-JAN, dtype: int64 """ if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") @@ -5705,12 +5774,12 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 Viewing the index >>> s.index - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') """ if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 
d96fc02e16d0d..1b1d9d7640058 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -88,7 +88,7 @@ def get_indexer_indexer( # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") - target = ensure_key_mapped(target, key, levels=level) # type:ignore[assignment] + target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] target = target._sort_levels_monotonic() if level is not None: diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index a9abc0714baa3..8db77725a1aa3 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -111,7 +111,7 @@ def to_timedelta( * 'W' * 'D' / 'days' / 'day' - * 'hours' / 'hour' / 'hr' / 'h' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' * 'm' / 'minute' / 'min' / 'minutes' / 'T' * 's' / 'seconds' / 'sec' / 'second' / 'S' * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' @@ -121,9 +121,9 @@ def to_timedelta( Must not be specified when `arg` context strings and ``errors="raise"``. .. deprecated:: 2.2.0 - Units 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed - in a future version. Please use 'min', 's', 'ms', 'us', and 'ns' instead of - 'T', 'S', 'L', 'U' and 'N'. + Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed + in a future version. Please use 'h', 'min', 's', 'ms', 'us', and 'ns' + instead of 'H', 'T', 'S', 'L', 'U' and 'N'. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. 
diff --git a/pandas/io/common.py b/pandas/io/common.py index f255ea8197304..d08612f4f09f6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -68,8 +68,8 @@ is_integer, is_list_like, ) +from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -91,6 +91,8 @@ WriteBuffer, ) + from pandas import MultiIndex + @dataclasses.dataclass class IOArgs: @@ -1228,7 +1230,7 @@ def is_potential_multi_index( return bool( len(columns) - and not isinstance(columns, MultiIndex) + and not isinstance(columns, ABCMultiIndex) and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) ) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 073115cab8695..6def0024d9073 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -80,6 +80,7 @@ IntStrT, ReadBuffer, Self, + SequenceNotStr, StorageOptions, WriteExcelBuffer, ) @@ -387,7 +388,7 @@ def read_excel( sheet_name: str | int = ..., *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | Sequence[int] | None = ..., usecols: int | str @@ -426,7 +427,7 @@ def read_excel( sheet_name: list[IntStrT] | None, *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | Sequence[int] | None = ..., usecols: int | str @@ -465,7 +466,7 @@ def read_excel( sheet_name: str | int | list[IntStrT] | None = 0, *, header: int | Sequence[int] | None = 0, - names: list[str] | None = None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols: int | str @@ -730,7 +731,7 @@ def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: 
SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, dtype: DtypeArg | None = None, @@ -1589,7 +1590,7 @@ def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, converters=None, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 569c8aaf6cef1..717dae6eea97c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -314,8 +314,8 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - res = df._mgr.to_native_types(**self._number_format) - data = [res.iget_values(i) for i in range(len(res.items))] + res = df._get_values_for_csv(**self._number_format) + data = list(res._iter_column_arrays()) ix = self.data_index[slicer]._format_native_types(**self._number_format) libwriters.write_csv_rows( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 922d0f37bee3a..cac83e2a48972 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -7,15 +7,11 @@ from collections.abc import ( Generator, Hashable, - Iterable, Mapping, Sequence, ) from contextlib import contextmanager -from csv import ( - QUOTE_NONE, - QUOTE_NONNUMERIC, -) +from csv import QUOTE_NONE from decimal import Decimal from functools import partial from io import StringIO @@ -23,14 +19,12 @@ import re from shutil import get_terminal_size from typing import ( - IO, TYPE_CHECKING, Any, Callable, Final, cast, ) -from unicodedata import east_asian_width import numpy as np @@ -45,9 +39,6 @@ NaT, Timedelta, Timestamp, - get_unit_from_dtype, - iNaT, - periods_per_day, ) from pandas._libs.tslibs.nattype import NaTType @@ -72,6 +63,7 @@ from pandas.core.arrays import ( Categorical, DatetimeArray, + ExtensionArray, TimedeltaArray, ) from 
pandas.core.arrays.string_ import StringDtype @@ -175,7 +167,7 @@ Character recognized as decimal separator, e.g. ',' in Europe. """ -_VALID_JUSTIFY_PARAMETERS = ( +VALID_JUSTIFY_PARAMETERS = ( "left", "right", "center", @@ -198,75 +190,16 @@ """ -class CategoricalFormatter: - def __init__( - self, - categorical: Categorical, - buf: IO[str] | None = None, - length: bool = True, - na_rep: str = "NaN", - footer: bool = True, - ) -> None: - self.categorical = categorical - self.buf = buf if buf is not None else StringIO("") - self.na_rep = na_rep - self.length = length - self.footer = footer - self.quoting = QUOTE_NONNUMERIC - - def _get_footer(self) -> str: - footer = "" - - if self.length: - if footer: - footer += ", " - footer += f"Length: {len(self.categorical)}" - - level_info = self.categorical._repr_categories_info() - - # Levels are added in a newline - if footer: - footer += "\n" - footer += level_info - - return str(footer) - - def _get_formatted_values(self) -> list[str]: - return format_array( - self.categorical._internal_get_values(), - None, - float_format=None, - na_rep=self.na_rep, - quoting=self.quoting, - ) - - def to_string(self) -> str: - categorical = self.categorical - - if len(categorical) == 0: - if self.footer: - return self._get_footer() - else: - return "" - - fmt_values = self._get_formatted_values() - - fmt_values = [i.strip() for i in fmt_values] - values = ", ".join(fmt_values) - result = ["[" + values + "]"] - if self.footer: - footer = self._get_footer() - if footer: - result.append(footer) - - return str("\n".join(result)) - - class SeriesFormatter: + """ + Implement the main logic of Series.to_string, which underlies + Series.__repr__. 
+ """ + def __init__( self, series: Series, - buf: IO[str] | None = None, + *, length: bool | str = True, header: bool = True, index: bool = True, @@ -278,7 +211,7 @@ def __init__( min_rows: int | None = None, ) -> None: self.series = series - self.buf = buf if buf is not None else StringIO() + self.buf = StringIO() self.name = name self.na_rep = na_rep self.header = header @@ -291,7 +224,7 @@ def __init__( float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = get_adjustment() + self.adj = printing.get_adjustment() self._chk_truncate() @@ -363,17 +296,6 @@ def _get_footer(self) -> str: return str(footer) - def _get_formatted_index(self) -> tuple[list[str], bool]: - index = self.tr_series.index - - if isinstance(index, MultiIndex): - have_header = any(name for name in index.names) - fmt_index = index.format(names=True) - else: - have_header = index.name is not None - fmt_index = index.format(name=True) - return fmt_index, have_header - def _get_formatted_values(self) -> list[str]: return format_array( self.tr_series._values, @@ -390,7 +312,8 @@ def to_string(self) -> str: if len(series) == 0: return f"{type(self.series).__name__}([], {footer})" - fmt_index, have_header = self._get_formatted_index() + have_header = _has_names(series.index) + fmt_index = self.tr_series.index.format(name=True) fmt_values = self._get_formatted_values() if self.is_truncated_vertically: @@ -422,69 +345,6 @@ def to_string(self) -> str: return str("".join(result)) -class TextAdjustment: - def __init__(self) -> None: - self.encoding = get_option("display.encoding") - - def len(self, text: str) -> int: - return len(text) - - def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: - return printing.justify(texts, max_len, mode=mode) - - def adjoin(self, space: int, *lists, **kwargs) -> str: - return printing.adjoin( - space, *lists, strlen=self.len, justfunc=self.justify, **kwargs - ) - - -class 
EastAsianTextAdjustment(TextAdjustment): - def __init__(self) -> None: - super().__init__() - if get_option("display.unicode.ambiguous_as_wide"): - self.ambiguous_width = 2 - else: - self.ambiguous_width = 1 - - # Definition of East Asian Width - # https://unicode.org/reports/tr11/ - # Ambiguous width can be changed by option - self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} - - def len(self, text: str) -> int: - """ - Calculate display width considering unicode East Asian Width - """ - if not isinstance(text, str): - return len(text) - - return sum( - self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text - ) - - def justify( - self, texts: Iterable[str], max_len: int, mode: str = "right" - ) -> list[str]: - # re-calculate padding space per str considering East Asian Width - def _get_pad(t): - return max_len - self.len(t) + len(t) - - if mode == "left": - return [x.ljust(_get_pad(x)) for x in texts] - elif mode == "center": - return [x.center(_get_pad(x)) for x in texts] - else: - return [x.rjust(_get_pad(x)) for x in texts] - - -def get_adjustment() -> TextAdjustment: - use_east_asian_width = get_option("display.unicode.east_asian_width") - if use_east_asian_width: - return EastAsianTextAdjustment() - else: - return TextAdjustment() - - def get_dataframe_repr_params() -> dict[str, Any]: """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string. 
@@ -536,16 +396,9 @@ def get_series_repr_params() -> dict[str, Any]: True """ width, height = get_terminal_size() - max_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.max_rows") - ) - min_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.min_rows") - ) + max_rows_opt = get_option("display.max_rows") + max_rows = height if max_rows_opt == 0 else max_rows_opt + min_rows = height if max_rows_opt == 0 else get_option("display.min_rows") return { "name": True, @@ -557,7 +410,11 @@ def get_series_repr_params() -> dict[str, Any]: class DataFrameFormatter: - """Class for processing dataframe formatting options and data.""" + """ + Class for processing dataframe formatting options and data. + + Used by DataFrame.to_string, which backs DataFrame.__repr__. + """ __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -607,7 +464,7 @@ def __init__( self.tr_frame = self.frame self.truncate() - self.adj = get_adjustment() + self.adj = printing.get_adjustment() def get_strcols(self) -> list[list[str]]: """ @@ -1169,16 +1026,16 @@ def save_to_buffer( """ Perform serialization. Write to buf or return as string if buf is None. 
""" - with get_buffer(buf, encoding=encoding) as f: - f.write(string) + with _get_buffer(buf, encoding=encoding) as fd: + fd.write(string) if buf is None: # error: "WriteBuffer[str]" has no attribute "getvalue" - return f.getvalue() # type: ignore[attr-defined] + return fd.getvalue() # type: ignore[attr-defined] return None @contextmanager -def get_buffer( +def _get_buffer( buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None ) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]: """ @@ -1216,7 +1073,7 @@ def get_buffer( def format_array( - values: Any, + values: ArrayLike, formatter: Callable | None, float_format: FloatFormatType | None = None, na_rep: str = "NaN", @@ -1233,7 +1090,7 @@ def format_array( Parameters ---------- - values + values : np.ndarray or ExtensionArray formatter float_format na_rep @@ -1255,21 +1112,24 @@ def format_array( ------- List[str] """ - fmt_klass: type[GenericArrayFormatter] + fmt_klass: type[_GenericArrayFormatter] if lib.is_np_dtype(values.dtype, "M"): - fmt_klass = Datetime64Formatter + fmt_klass = _Datetime64Formatter + values = cast(DatetimeArray, values) elif isinstance(values.dtype, DatetimeTZDtype): - fmt_klass = Datetime64TZFormatter + fmt_klass = _Datetime64TZFormatter + values = cast(DatetimeArray, values) elif lib.is_np_dtype(values.dtype, "m"): - fmt_klass = Timedelta64Formatter + fmt_klass = _Timedelta64Formatter + values = cast(TimedeltaArray, values) elif isinstance(values.dtype, ExtensionDtype): - fmt_klass = ExtensionArrayFormatter + fmt_klass = _ExtensionArrayFormatter elif lib.is_np_dtype(values.dtype, "fc"): fmt_klass = FloatArrayFormatter elif lib.is_np_dtype(values.dtype, "iu"): - fmt_klass = IntArrayFormatter + fmt_klass = _IntArrayFormatter else: - fmt_klass = GenericArrayFormatter + fmt_klass = _GenericArrayFormatter if space is None: space = 12 @@ -1297,10 +1157,10 @@ def format_array( return fmt_obj.get_result() -class GenericArrayFormatter: +class 
_GenericArrayFormatter: def __init__( self, - values: Any, + values: ArrayLike, digits: int = 7, formatter: Callable | None = None, na_rep: str = "NaN", @@ -1379,7 +1239,7 @@ def _format(x): vals = extract_array(self.values, extract_numpy=True) if not isinstance(vals, np.ndarray): raise TypeError( - "ExtensionArray formatting should use ExtensionArrayFormatter" + "ExtensionArray formatting should use _ExtensionArrayFormatter" ) inferred = lib.map_infer(vals, is_float) is_float_type = ( @@ -1409,7 +1269,7 @@ def _format(x): return fmt_values -class FloatArrayFormatter(GenericArrayFormatter): +class FloatArrayFormatter(_GenericArrayFormatter): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -1610,7 +1470,7 @@ def _format_strings(self) -> list[str]: return list(self.get_result_as_array()) -class IntArrayFormatter(GenericArrayFormatter): +class _IntArrayFormatter(_GenericArrayFormatter): def _format_strings(self) -> list[str]: if self.leading_space is False: formatter_str = lambda x: f"{x:d}".format(x=x) @@ -1621,10 +1481,12 @@ def _format_strings(self) -> list[str]: return fmt_values -class Datetime64Formatter(GenericArrayFormatter): +class _Datetime64Formatter(_GenericArrayFormatter): + values: DatetimeArray + def __init__( self, - values: np.ndarray | Series | DatetimeIndex | DatetimeArray, + values: DatetimeArray, nat_rep: str = "NaT", date_format: None = None, **kwargs, @@ -1637,21 +1499,23 @@ def _format_strings(self) -> list[str]: """we by definition have DO NOT have a TZ""" values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) + dti = DatetimeIndex(values) if self.formatter is not None and callable(self.formatter): - return [self.formatter(x) for x in values] + return [self.formatter(x) for x in dti] - fmt_values = values._data._format_native_types( + fmt_values = dti._data._format_native_types( na_rep=self.nat_rep, date_format=self.date_format ) return fmt_values.tolist() -class 
ExtensionArrayFormatter(GenericArrayFormatter): +class _ExtensionArrayFormatter(_GenericArrayFormatter): + values: ExtensionArray + def _format_strings(self) -> list[str]: values = extract_array(self.values, extract_numpy=True) + values = cast(ExtensionArray, values) formatter = self.formatter fallback_formatter = None @@ -1749,31 +1613,6 @@ def format_percentiles( return [i + "%" for i in out] -def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool: - # return a boolean if we are only dates (and don't have a timezone) - if not isinstance(values, Index): - values = values.ravel() - - if not isinstance(values, (DatetimeArray, DatetimeIndex)): - values = DatetimeIndex(values) - - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - # error: Argument 1 to "py_get_unit_from_dtype" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" - reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type] - ppd = periods_per_day(reso) - - # TODO: can we reuse is_date_array_normalized? 
would need a skipna kwd - even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 - if even_days: - return True - return False - - def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: if x is NaT: return nat_rep @@ -1799,12 +1638,12 @@ def _format_datetime64_dateonly( def get_format_datetime64( - is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" - if is_dates_only_: + if is_dates_only: return lambda x: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) @@ -1812,26 +1651,12 @@ def get_format_datetime64( return lambda x: _format_datetime64(x, nat_rep=nat_rep) -def get_format_datetime64_from_values( - values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None -) -> str | None: - """given values and a date_format, return a string format""" - if isinstance(values, np.ndarray) and values.ndim > 1: - # We don't actually care about the order of values, and DatetimeIndex - # only accepts 1D values - values = values.ravel() - - ido = is_dates_only(values) - if ido: - # Only dates and no timezone: provide a default format - return date_format or "%Y-%m-%d" - return date_format - +class _Datetime64TZFormatter(_Datetime64Formatter): + values: DatetimeArray -class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" - ido = is_dates_only(self.values) + ido = self.values._is_dates_only values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format @@ -1841,27 +1666,28 @@ def _format_strings(self) -> list[str]: return fmt_values -class Timedelta64Formatter(GenericArrayFormatter): +class _Timedelta64Formatter(_GenericArrayFormatter): + values: TimedeltaArray + def 
__init__( self, - values: np.ndarray | TimedeltaIndex, + values: TimedeltaArray, nat_rep: str = "NaT", - box: bool = False, **kwargs, ) -> None: + # TODO: nat_rep is never passed, na_rep is. super().__init__(values, **kwargs) self.nat_rep = nat_rep - self.box = box def _format_strings(self) -> list[str]: formatter = self.formatter or get_format_timedelta64( - self.values, nat_rep=self.nat_rep, box=self.box + self.values, nat_rep=self.nat_rep, box=False ) return [formatter(x) for x in self.values] def get_format_timedelta64( - values: np.ndarray | TimedeltaIndex | TimedeltaArray, + values: TimedeltaArray, nat_rep: str | float = "NaT", box: bool = False, ) -> Callable: @@ -1871,20 +1697,7 @@ def get_format_timedelta64( If box, then show the return in quotes """ - values_int = values.view(np.int64) - - consider_values = values_int != iNaT - - one_day_nanos = 86400 * 10**9 - # error: Unsupported operand types for % ("ExtensionArray" and "int") - not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator] - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "Union[Any, ExtensionArray, ndarray]"; expected - # "Union[Union[int, float, complex, str, bytes, generic], - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type] - even_days = both.sum() == 0 + even_days = values._is_dates_only if even_days: format = None @@ -1911,13 +1724,13 @@ def _make_fixed_width( strings: list[str], justify: str = "right", minimum: int | None = None, - adj: TextAdjustment | None = None, + adj: printing._TextAdjustment | None = None, ) -> list[str]: if len(strings) == 0 or justify == "all": return strings if adj is None: - adjustment = get_adjustment() + adjustment = printing.get_adjustment() else: adjustment = adj @@ -1941,7 +1754,7 @@ def just(x: str) -> str: return result -def _trim_zeros_complex(str_complexes: np.ndarray, 
decimal: str = ".") -> list[str]: +def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. @@ -1987,7 +1800,7 @@ def _trim_zeros_single_float(str_float: str) -> str: def _trim_zeros_float( - str_floats: np.ndarray | list[str], decimal: str = "." + str_floats: ArrayLike | list[str], decimal: str = "." ) -> list[str]: """ Trims the maximum number of trailing zeros equally from @@ -2000,7 +1813,7 @@ def _trim_zeros_float( def is_number_with_decimal(x) -> bool: return re.match(number_regex, x) is not None - def should_trim(values: np.ndarray | list[str]) -> bool: + def should_trim(values: ArrayLike | list[str]) -> bool: """ Determine if an array of strings should be trimmed. diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index d20c2a62c61e2..552affbd053f2 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -356,7 +356,7 @@ def _initialize_memory_usage( return memory_usage -class BaseInfo(ABC): +class _BaseInfo(ABC): """ Base class for DataFrameInfo and SeriesInfo. @@ -439,7 +439,7 @@ def render( pass -class DataFrameInfo(BaseInfo): +class DataFrameInfo(_BaseInfo): """ Class storing dataframe-specific info. """ @@ -503,7 +503,7 @@ def render( verbose: bool | None, show_counts: bool | None, ) -> None: - printer = DataFrameInfoPrinter( + printer = _DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -512,7 +512,7 @@ def render( printer.to_buffer(buf) -class SeriesInfo(BaseInfo): +class SeriesInfo(_BaseInfo): """ Class storing series-specific info. 
""" @@ -538,7 +538,7 @@ def render( "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" ) - printer = SeriesInfoPrinter( + printer = _SeriesInfoPrinter( info=self, verbose=verbose, show_counts=show_counts, @@ -572,7 +572,7 @@ def memory_usage_bytes(self) -> int: return self.data.memory_usage(index=True, deep=deep) -class InfoPrinterAbstract: +class _InfoPrinterAbstract: """ Class for printing dataframe or series info. """ @@ -586,11 +586,11 @@ def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None: fmt.buffer_put_lines(buf, lines) @abstractmethod - def _create_table_builder(self) -> TableBuilderAbstract: + def _create_table_builder(self) -> _TableBuilderAbstract: """Create instance of table builder.""" -class DataFrameInfoPrinter(InfoPrinterAbstract): +class _DataFrameInfoPrinter(_InfoPrinterAbstract): """ Class for printing dataframe info. @@ -650,27 +650,27 @@ def _initialize_show_counts(self, show_counts: bool | None) -> bool: else: return show_counts - def _create_table_builder(self) -> DataFrameTableBuilder: + def _create_table_builder(self) -> _DataFrameTableBuilder: """ Create instance of table builder based on verbosity and display settings. """ if self.verbose: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) elif self.verbose is False: # specifically set to False, not necessarily None - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) elif self.exceeds_info_cols: - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) else: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) -class SeriesInfoPrinter(InfoPrinterAbstract): +class _SeriesInfoPrinter(_InfoPrinterAbstract): """Class for printing series info. 
Parameters @@ -694,17 +694,17 @@ def __init__( self.verbose = verbose self.show_counts = self._initialize_show_counts(show_counts) - def _create_table_builder(self) -> SeriesTableBuilder: + def _create_table_builder(self) -> _SeriesTableBuilder: """ Create instance of table builder based on verbosity. """ if self.verbose or self.verbose is None: - return SeriesTableBuilderVerbose( + return _SeriesTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) else: - return SeriesTableBuilderNonVerbose(info=self.info) + return _SeriesTableBuilderNonVerbose(info=self.info) def _initialize_show_counts(self, show_counts: bool | None) -> bool: if show_counts is None: @@ -713,13 +713,13 @@ def _initialize_show_counts(self, show_counts: bool | None) -> bool: return show_counts -class TableBuilderAbstract(ABC): +class _TableBuilderAbstract(ABC): """ Abstract builder for info table. """ _lines: list[str] - info: BaseInfo + info: _BaseInfo @abstractmethod def get_lines(self) -> list[str]: @@ -769,7 +769,7 @@ def add_dtypes_line(self) -> None: self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") -class DataFrameTableBuilder(TableBuilderAbstract): +class _DataFrameTableBuilder(_TableBuilderAbstract): """ Abstract builder for dataframe info table. @@ -820,7 +820,7 @@ def add_memory_usage_line(self) -> None: self._lines.append(f"memory usage: {self.memory_usage_string}") -class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): +class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder): """ Dataframe info table builder for non-verbose output. """ @@ -838,7 +838,7 @@ def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) -class TableBuilderVerboseMixin(TableBuilderAbstract): +class _TableBuilderVerboseMixin(_TableBuilderAbstract): """ Mixin for verbose info output. 
""" @@ -931,7 +931,7 @@ def _gen_dtypes(self) -> Iterator[str]: yield pprint_thing(dtype) -class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): +class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin): """ Dataframe info table builder for verbose output. """ @@ -997,7 +997,7 @@ def _gen_columns(self) -> Iterator[str]: yield pprint_thing(col) -class SeriesTableBuilder(TableBuilderAbstract): +class _SeriesTableBuilder(_TableBuilderAbstract): """ Abstract builder for series info table. @@ -1029,7 +1029,7 @@ def _fill_non_empty_info(self) -> None: """Add lines to the info table, pertaining to non-empty series.""" -class SeriesTableBuilderNonVerbose(SeriesTableBuilder): +class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder): """ Series info table builder for non-verbose output. """ @@ -1043,7 +1043,7 @@ def _fill_non_empty_info(self) -> None: self.add_memory_usage_line() -class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin): +class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin): """ Series info table builder for verbose output. """ diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index b57797b7ec717..2cc9368f8846a 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -15,11 +15,14 @@ TypeVar, Union, ) +from unicodedata import east_asian_width from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +from pandas.io.formats.console import get_console_size + EscapeChars = Union[Mapping[str, str], Iterable[str]] _KT = TypeVar("_KT") _VT = TypeVar("_VT") @@ -42,7 +45,7 @@ def adjoin(space: int, *lists: list[str], **kwargs) -> str: function used to justify str. Needed for unicode handling. 
""" strlen = kwargs.pop("strlen", len) - justfunc = kwargs.pop("justfunc", justify) + justfunc = kwargs.pop("justfunc", _adj_justify) newLists = [] lengths = [max(map(strlen, x)) + space for x in lists[:-1]] @@ -57,7 +60,7 @@ def adjoin(space: int, *lists: list[str], **kwargs) -> str: return "\n".join("".join(lines) for lines in toJoin) -def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: +def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: """ Perform ljust, center, rjust against string or list-like """ @@ -314,9 +317,6 @@ def format_object_summary( ------- summary string """ - from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import get_adjustment - display_width, _ = get_console_size() if display_width is None: display_width = get_option("display.width") or 80 @@ -501,3 +501,72 @@ class PrettyDict(dict[_KT, _VT]): def __repr__(self) -> str: return pprint_thing(self) + + +class _TextAdjustment: + def __init__(self) -> None: + self.encoding = get_option("display.encoding") + + def len(self, text: str) -> int: + return len(text) + + def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + def adjoin(self, space: int, *lists, **kwargs) -> str: + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) + + +class _EastAsianTextAdjustment(_TextAdjustment): + def __init__(self) -> None: + super().__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + # Definition of East Asian Width + # https://unicode.org/reports/tr11/ + # Ambiguous width can be changed by option + self._EAW_MAP = {"Na": 1, 
"N": 1, "W": 2, "F": 2, "H": 1} + + def len(self, text: str) -> int: + """ + Calculate display width considering unicode East Asian Width + """ + if not isinstance(text, str): + return len(text) + + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) + + def justify( + self, texts: Iterable[str], max_len: int, mode: str = "right" + ) -> list[str]: + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == "left": + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == "center": + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] + + +def get_adjustment() -> _TextAdjustment: + use_east_asian_width = get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return _EastAsianTextAdjustment() + else: + return _TextAdjustment() diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 90e9b1f0486db..829ed4a33f6a4 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -2357,7 +2357,7 @@ def color(value, user_arg, command, comm_arg): return latex_styles -def _escape_latex(s): +def _escape_latex(s: str) -> str: r""" Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, ``~``, ``^``, and ``\`` in the string with LaTeX-safe sequences. @@ -2392,7 +2392,7 @@ def _escape_latex(s): ) -def _math_mode_with_dollar(s): +def _math_mode_with_dollar(s: str) -> str: r""" All characters in LaTeX math mode are preserved. @@ -2425,7 +2425,7 @@ def _math_mode_with_dollar(s): return "".join(res).replace(r"rt8§=§7wz", r"\$") -def _math_mode_with_parentheses(s): +def _math_mode_with_parentheses(s: str) -> str: r""" All characters in LaTeX math mode are preserved. 
@@ -2461,7 +2461,7 @@ def _math_mode_with_parentheses(s): return "".join(res) -def _escape_latex_math(s): +def _escape_latex_math(s: str) -> str: r""" All characters in LaTeX math mode are preserved. diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index a6ee8407988ec..f56fca8d7ef44 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -8,11 +8,15 @@ from typing import ( TYPE_CHECKING, Any, + final, ) import warnings from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + cache_readonly, + doc, +) from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.missing import isna @@ -41,7 +45,7 @@ storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", ) -class BaseXMLFormatter: +class _BaseXMLFormatter: """ Subclass for formatting data in XML. @@ -138,14 +142,14 @@ def __init__( self.storage_options = storage_options self.orig_cols = self.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() + self.frame_dicts = self._process_dataframe() - self.validate_columns() - self.validate_encoding() - self.prefix_uri = self.get_prefix_uri() - self.handle_indexes() + self._validate_columns() + self._validate_encoding() + self.prefix_uri = self._get_prefix_uri() + self._handle_indexes() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. @@ -154,7 +158,8 @@ def build_tree(self) -> bytes: """ raise AbstractMethodError(self) - def validate_columns(self) -> None: + @final + def _validate_columns(self) -> None: """ Validate elems_cols and attrs_cols. @@ -175,7 +180,8 @@ def validate_columns(self) -> None: f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" ) - def validate_encoding(self) -> None: + @final + def _validate_encoding(self) -> None: """ Validate encoding. 
@@ -189,7 +195,8 @@ def validate_encoding(self) -> None: codecs.lookup(self.encoding) - def process_dataframe(self) -> dict[int | str, dict[str, Any]]: + @final + def _process_dataframe(self) -> dict[int | str, dict[str, Any]]: """ Adjust Data Frame to fit xml output. @@ -213,7 +220,8 @@ def process_dataframe(self) -> dict[int | str, dict[str, Any]]: return df.to_dict(orient="index") - def handle_indexes(self) -> None: + @final + def _handle_indexes(self) -> None: """ Handle indexes. @@ -234,7 +242,7 @@ def handle_indexes(self) -> None: if self.elem_cols: self.elem_cols = indexes + self.elem_cols - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: """ Get uri of namespace prefix. @@ -248,7 +256,8 @@ def get_prefix_uri(self) -> str: raise AbstractMethodError(self) - def other_namespaces(self) -> dict: + @final + def _other_namespaces(self) -> dict: """ Define other namespaces. @@ -267,7 +276,8 @@ def other_namespaces(self) -> dict: return nmsp_dict - def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: + @final + def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: """ Create attributes of row. @@ -287,6 +297,7 @@ def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: raise KeyError(f"no valid column, {col}") return elem_row + @final def _get_flat_col_name(self, col: str | tuple) -> str: flat_col = col if isinstance(col, tuple): @@ -297,17 +308,20 @@ def _get_flat_col_name(self, col: str | tuple) -> str: ) return f"{self.prefix_uri}{flat_col}" - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): + raise AbstractMethodError(self) + + @final + def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None: """ Create child elements of row. This method adds child elements using elem_cols to row element and works with tuples for multindex or hierarchical columns. 
""" + sub_element_cls = self._sub_element_cls - raise AbstractMethodError(self) - - def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None: if not self.elem_cols: return @@ -319,8 +333,9 @@ def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> Non except KeyError: raise KeyError(f"no valid column, {col}") + @final def write_output(self) -> str | None: - xml_doc = self.build_tree() + xml_doc = self._build_tree() if self.path_or_buffer is not None: with get_handle( @@ -337,13 +352,13 @@ def write_output(self) -> str | None: return xml_doc.decode(self.encoding).rstrip() -class EtreeXMLFormatter(BaseXMLFormatter): +class EtreeXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. """ - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: from xml.etree.ElementTree import ( Element, SubElement, @@ -351,7 +366,7 @@ def build_tree(self) -> bytes: ) self.root = Element( - f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() + f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces() ) for d in self.frame_dicts.values(): @@ -359,11 +374,11 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -373,7 +388,7 @@ def build_tree(self) -> bytes: ) if self.pretty_print: - self.out_xml = self.prettify_tree() + self.out_xml = self._prettify_tree() if self.stylesheet is not None: raise ValueError( @@ -382,7 +397,7 @@ def build_tree(self) -> bytes: return self.out_xml - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: from xml.etree.ElementTree import 
register_namespace uri = "" @@ -402,12 +417,13 @@ def get_prefix_uri(self) -> str: return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from xml.etree.ElementTree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def prettify_tree(self) -> bytes: + def _prettify_tree(self) -> bytes: """ Output tree for pretty print format. @@ -421,7 +437,7 @@ def prettify_tree(self) -> bytes: return dom.toprettyxml(indent=" ", encoding=self.encoding) -class LxmlXMLFormatter(BaseXMLFormatter): +class LxmlXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. @@ -430,9 +446,9 @@ class LxmlXMLFormatter(BaseXMLFormatter): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.convert_empty_str_key() + self._convert_empty_str_key() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. @@ -452,11 +468,11 @@ def build_tree(self) -> bytes: if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -467,11 +483,11 @@ def build_tree(self) -> bytes: ) if self.stylesheet is not None: - self.out_xml = self.transform_doc() + self.out_xml = self._transform_doc() return self.out_xml - def convert_empty_str_key(self) -> None: + def _convert_empty_str_key(self) -> None: """ Replace zero-length string in `namespaces`. 
@@ -482,7 +498,7 @@ def convert_empty_str_key(self) -> None: if self.namespaces and "" in self.namespaces.keys(): self.namespaces[None] = self.namespaces.pop("", "default") - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: uri = "" if self.namespaces: if self.prefix: @@ -497,12 +513,13 @@ def get_prefix_uri(self) -> str: return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from lxml.etree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def transform_doc(self) -> bytes: + def _transform_doc(self) -> bytes: """ Parse stylesheet from file or buffer and run it. diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6ce6ac71b1ddd..df5b08029a08b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -77,11 +77,11 @@ DtypeArg, DtypeBackend, FilePath, - HashableT, IndexLabel, ReadCsvBuffer, Self, StorageOptions, + UsecolsArgType, ) _doc_read_csv_and_table = ( r""" @@ -142,7 +142,7 @@ Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g., when you have a malformed file with delimiters at the end of each line. -usecols : list of Hashable or Callable, optional +usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. If list-like, all elements must either be positional (i.e. 
integer indices into the document columns) or strings @@ -645,10 +645,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -669,7 +666,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -707,10 +704,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -732,7 +726,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -770,10 +764,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: 
Mapping[Hashable, Callable] | None = ..., @@ -795,7 +786,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -833,10 +824,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -858,7 +846,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -907,10 +895,7 @@ def read_csv( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -935,7 +920,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -1005,10 +990,7 @@ def read_table( header: int | Sequence[int] | None | 
Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1027,7 +1009,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -1065,10 +1047,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1087,7 +1066,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1125,10 +1104,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1147,7 +1123,7 @@ def read_table( infer_datetime_format: bool | 
lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -1185,10 +1161,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1207,7 +1180,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1258,10 +1231,7 @@ def read_table( header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] - | tuple[HashableT] - | Callable[[Hashable], bool] - | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1283,7 +1253,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d630a5ff8a41c..d00f1c666d5d6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -977,7 +977,7 @@ def 
__init__(self) -> None: # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int self.DTYPE_MAP = dict( - list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) + [(i, np.dtype(f"S{i}")) for i in range(1, 245)] + [ (251, np.dtype(np.int8)), (252, np.dtype(np.int16)), diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 918fe4d22ea62..bd3b515dbca2f 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -88,7 +88,7 @@ class _XMLFrameParser: Parse only the attributes at the specified ``xpath``. names : list - Column names for :class:`~pandas.DataFrame`of parsed XML data. + Column names for :class:`~pandas.DataFrame` of parsed XML data. dtype : dict Data type for data or columns. E.g. {{'a': np.float64, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index e6408a0eae841..5a7ceabbf554e 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -13,6 +13,7 @@ Any, cast, ) +import warnings import matplotlib.dates as mdates from matplotlib.ticker import ( @@ -239,18 +240,29 @@ def _convert_1d(values, units, axis): if not hasattr(axis, "freq"): raise TypeError("Axis must have `freq` set to convert to Periods") valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) - if isinstance(values, valid_types) or is_integer(values) or is_float(values): - return get_datevalue(values, axis.freq) - elif isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).asi8 - elif isinstance(values, Index): - return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == "period": - # https://github.com/pandas-dev/pandas/issues/24304 - # convert ndarray[period] -> PeriodIndex - return PeriodIndex(values, freq=axis.freq).asi8 - elif isinstance(values, (list, tuple, np.ndarray, Index)): - return [get_datevalue(x, axis.freq) for x in values] + with 
warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + if ( + isinstance(values, valid_types) + or is_integer(values) + or is_float(values) + ): + return get_datevalue(values, axis.freq) + elif isinstance(values, PeriodIndex): + return values.asfreq(axis.freq).asi8 + elif isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + elif lib.infer_dtype(values, skipna=False) == "period": + # https://github.com/pandas-dev/pandas/issues/24304 + # convert ndarray[period] -> PeriodIndex + return PeriodIndex(values, freq=axis.freq).asi8 + elif isinstance(values, (list, tuple, np.ndarray, Index)): + return [get_datevalue(x, axis.freq) for x in values] return values @@ -575,11 +587,18 @@ def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 - dates_ = period_range( - start=Period(ordinal=vmin, freq=freq), - end=Period(ordinal=vmax, freq=freq), - freq=freq, - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Period with BDay freq is deprecated", category=FutureWarning + ) + warnings.filterwarnings( + "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + ) + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, + ) # Initialize the output info = np.zeros( @@ -1072,7 +1091,13 @@ def __call__(self, x, pos: int = 0) -> str: fmt = self.formatdict.pop(x, "") if isinstance(fmt, np.bytes_): fmt = fmt.decode("utf-8") - period = Period(ordinal=int(x), freq=self.freq) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Period with BDay freq is deprecated", + category=FutureWarning, + ) + period = Period(ordinal=int(x), freq=self.freq) assert isinstance(period, Period) return 
period.strftime(fmt) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 227b72573f979..be988594ebf58 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -839,7 +839,7 @@ def test_with_listlike_columns(): { "a": Series(np.random.default_rng(2).standard_normal(4)), "b": ["a", "list", "of", "words"], - "ts": date_range("2016-10-01", periods=4, freq="H"), + "ts": date_range("2016-10-01", periods=4, freq="h"), } ) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index aeb6a01eb587a..643b9220999f7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -188,13 +188,13 @@ def test_apply_box(): def test_apply_datetimetz(by_row): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + values = pd.date_range("2011-01-01", "2011-01-02", freq="h").tz_localize( "Asia/Tokyo" ) s = Series(values, name="XX") result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 7ec77e5b65b7e..f77b81574e1c1 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -21,13 +21,13 @@ def one(request): Examples -------- - dti = pd.date_range('2016-01-01', periods=2, freq='H') + dti = pd.date_range('2016-01-01', periods=2, freq='h') dti DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') dti + one DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ return request.param diff --git 
a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 0c46d22ddcc2e..df6ccda27ab85 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1076,7 +1076,7 @@ def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): # Note: freq here includes both Tick and non-Tick offsets; this is # relevant because historically integer-addition was allowed if we had # a freq. - @pytest.mark.parametrize("freq", ["H", "D", "W", "2ME", "MS", "Q", "B", None]) + @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "Q", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( self, request, dtype, box_with_array, freq, tz_naive_fixture @@ -1144,7 +1144,7 @@ def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): ) assert_invalid_addsub_type(dtarr, other, msg) - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) @pytest.mark.parametrize("dti_freq", [None, "D"]) def test_dt64arr_add_sub_parr( self, dti_freq, pi_freq, box_with_array, box_with_array2 @@ -1282,10 +1282,10 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): offset = dates + pd.offsets.Hour(5) assert dates[0] + pd.offsets.Hour(5) == offset[0] - dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="h") expected = DatetimeIndex( ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], - freq="H", + freq="h", tz=tz, ) @@ -1953,7 +1953,7 @@ def test_operators_datetimelike_with_timezones(self): dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="h")) td2 = td1.copy() td2.iloc[1] = np.nan assert td2._values.freq is None diff --git 
a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ee8391830db4c..5af63258921ed 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -286,14 +286,14 @@ def test_parr_cmp_pi_mismatched_freq(self, freq, box_with_array): msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period" with pytest.raises(TypeError, match=msg): - base <= Period("2011", freq="A") + base <= Period("2011", freq="Y") with pytest.raises(TypeError, match=msg): - Period("2011", freq="A") >= base + Period("2011", freq="Y") >= base # TODO: Could parametrize over boxes for idx? - idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = r"Invalid comparison between dtype=period\[A-DEC\] and PeriodArray" + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="Y") + rev_msg = r"Invalid comparison between dtype=period\[Y-DEC\] and PeriodArray" idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg with pytest.raises(TypeError, match=idx_msg): base <= idx @@ -405,18 +405,18 @@ def test_cmp_series_period_series_mixed_freq(self): # GH#13200 base = Series( [ - Period("2011", freq="A"), + Period("2011", freq="Y"), Period("2011-02", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-04", freq="M"), ] ) ser = Series( [ - Period("2012", freq="A"), + Period("2012", freq="Y"), Period("2011-01", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-05", freq="M"), ] ) @@ -637,11 +637,11 @@ def test_pi_sub_pi_with_nat(self): def test_parr_sub_pi_mismatched_freq(self, box_with_array, box_with_array2): rng = period_range("1/1/2000", freq="D", periods=5) - other = period_range("1/6/2000", freq="H", periods=5) + other = period_range("1/6/2000", freq="h", periods=5) rng = tm.box_expected(rng, box_with_array) other = tm.box_expected(other, box_with_array2) - msg = r"Input has different freq=[HD] from PeriodArray\(freq=[DH]\)" + msg = 
r"Input has different freq=[hD] from PeriodArray\(freq=[Dh]\)" with pytest.raises(IncompatibleFrequency, match=msg): rng - other @@ -696,7 +696,7 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): Timestamp("2016-01-01").to_pydatetime(), Timestamp("2016-01-01").to_datetime64(), # datetime-like arrays - pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, freq="h"), pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), pd.date_range("2016-01-01", periods=3, freq="s")._data, pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, @@ -779,8 +779,8 @@ def test_pi_add_sub_td64_array_tick(self): with pytest.raises(TypeError, match=msg): tdi - rng - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): box = box_with_array xbox = box if box not in [pd.array, tm.to_array] else pd.Index @@ -792,7 +792,7 @@ def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): # TODO: parametrize over box for pi? 
td64obj = tm.box_expected(tdi, box) - if pi_freq == "H": + if pi_freq == "h": result = pi - td64obj expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, xbox) @@ -891,9 +891,9 @@ def test_pi_sub_offset_array(self, box): def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng + one - expected = period_range("2000-01-01 10:00", freq="H", periods=10) + expected = period_range("2000-01-01 10:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng += one tm.assert_index_equal(rng, expected) @@ -903,9 +903,9 @@ def test_pi_sub_isub_int(self, one): PeriodIndex.__sub__ and __isub__ with several representations of the integer 1, e.g. int, np.int64, np.uint8, ... """ - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng - one - expected = period_range("2000-01-01 08:00", freq="H", periods=10) + expected = period_range("2000-01-01 08:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng -= one tm.assert_index_equal(rng, expected) @@ -934,9 +934,9 @@ def test_pi_add_sub_int_array_freqn_gt1(self): def test_pi_sub_isub_offset(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng - pd.offsets.YearEnd(5) - expected = period_range("2009", "2019", freq="A") + expected = period_range("2009", "2019", freq="Y") tm.assert_index_equal(result, expected) rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) @@ -1131,8 +1131,8 @@ def test_parr_add_sub_timedeltalike_freq_mismatch_daily( def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 12:00", "2014-01-05 
12:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="h") result = rng + other tm.assert_index_equal(result, expected) @@ -1144,12 +1144,12 @@ def test_parr_add_timedeltalike_mismatched_freq_hourly( self, not_hourly, box_with_array ): other = not_hourly - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") rng = tm.box_expected(rng, box_with_array) msg = "|".join( [ # non-timedelta-like DateOffset - "Input has different freq(=.+)? from Period.*?\\(freq=H\\)", + "Input has different freq(=.+)? from Period.*?\\(freq=h\\)", # timedelta/td64/Timedelta but not a multiple of 24H "Cannot add/subtract timedelta-like from PeriodArray that is " "not an integer multiple of the PeriodArray's freq.", @@ -1164,8 +1164,8 @@ def test_parr_add_timedeltalike_mismatched_freq_hourly( def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="h") result = rng - other tm.assert_index_equal(result, expected) @@ -1176,17 +1176,17 @@ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): def test_add_iadd_timedeltalike_annual(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng + pd.offsets.YearEnd(5) - expected = period_range("2019", "2029", freq="A") + expected = period_range("2019", "2029", freq="Y") tm.assert_index_equal(result, expected) rng += pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq 
- rng = period_range("2014", "2024", freq="A") - msg = "Input has different freq(=.+)? from Period.*?\\(freq=A-DEC\\)" + rng = period_range("2014", "2024", freq="Y") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 3d237b3ac4a31..205e6472aaecb 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -70,7 +70,7 @@ def test_compare_timedelta64_zerodim(self, box_with_array): box = box_with_array xbox = box_with_array if box_with_array not in [Index, pd.array] else np.ndarray - tdi = timedelta_range("2H", periods=4) + tdi = timedelta_range("2h", periods=4) other = np.array(tdi.to_numpy()[0]) tdi = tm.box_expected(tdi, box) @@ -276,32 +276,32 @@ class TestTimedelta64ArithmeticUnsorted: def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") + idx = TimedeltaIndex(["2h", "4h", "6h", "8h", "10h"], freq="2h", name="x") for result in [idx * 2, np.multiply(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") + exp = TimedeltaIndex(["4h", "8h", "12h", "16h", "20h"], freq="4h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "4H" + assert result.freq == "4h" for result in [idx / 2, np.divide(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") + exp = TimedeltaIndex(["1h", "2h", "3h", "4h", "5h"], freq="h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "H" + assert result.freq == "h" for result in [-idx, np.negative(idx)]: assert isinstance(result, TimedeltaIndex) exp = 
TimedeltaIndex( - ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ["-2h", "-4h", "-6h", "-8h", "-10h"], freq="-2h", name="x" ) tm.assert_index_equal(result, exp) - assert result.freq == "-2H" + assert result.freq == "-2h" - idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") + idx = TimedeltaIndex(["-2h", "-1h", "0h", "1h", "2h"], freq="h", name="x") for result in [abs(idx), np.absolute(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") + exp = TimedeltaIndex(["2h", "1h", "0h", "1h", "2h"], freq=None, name="x") tm.assert_index_equal(result, exp) assert result.freq is None @@ -1073,8 +1073,8 @@ def test_td64arr_add_dt64_array(self, box_with_array): # ------------------------------------------------------------------ # Invalid __add__/__sub__ operations - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_td64arr_sub_periodlike( self, box_with_array, box_with_array2, tdi_freq, pi_freq ): @@ -1133,7 +1133,7 @@ def test_td64arr_addsub_numeric_arr_invalid( def test_td64arr_add_sub_int(self, box_with_array, one): # Variants of `one` for #19012, deprecated GH#22535 - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=10) tdarr = tm.box_expected(rng, box_with_array) msg = "Addition/subtraction of integers" @@ -1152,7 +1152,7 @@ def test_td64arr_add_sub_integer_array(self, box_with_array): box = box_with_array xbox = np.ndarray if box is pd.array else box - rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=3) tdarr = tm.box_expected(rng, box) other = tm.box_expected([4, 3, 2], xbox) @@ -2011,7 +2011,7 @@ def test_td64arr_div_numeric_array( tdser 
= Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) - expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") + expected = Series(["2.95D", "1D 23h 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) xbox = get_upcast_box(tdser, vector) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index dd8c3eda9ed05..71156a4d84ae5 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd @@ -51,7 +53,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int_) + assert isinstance(getattr(s, op)(), np_long) elif op == "count": # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index d2f9f6dffab49..7fba150c9113f 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -32,7 +32,7 @@ def test_astype_nan_to_int(self, cls, values): [ array(["2019", "2020"], dtype="datetime64[ns, UTC]"), array([0, 0], dtype="timedelta64[ns]"), - array([Period("2019"), Period("2020")], dtype="period[A-DEC]"), + array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"), array([Interval(0, 1), Interval(1, 2)], dtype="interval"), array([1, np.nan], dtype="Int64"), ], diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index cdf5d967d9c3d..dca171bf81047 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -148,7 +148,7 @@ def 
test_categorical_repr_ordered(self): assert repr(c) == exp def test_categorical_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = ( @@ -176,7 +176,7 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx) exp = ( "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " @@ -210,7 +210,7 @@ def test_categorical_repr_datetime(self): assert repr(c) == exp def test_categorical_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < @@ -225,7 +225,7 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < @@ -254,17 +254,17 @@ def test_categorical_repr_int_with_nan(self): assert repr(s) == s_exp def test_categorical_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, 
period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -283,17 +283,17 @@ def test_categorical_repr_period(self): assert repr(c) == exp def test_categorical_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -396,7 +396,7 @@ def test_categorical_index_repr_ordered(self): assert repr(i) == 
exp def test_categorical_index_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -405,7 +405,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -415,7 +415,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp def test_categorical_index_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -424,7 +424,7 @@ def test_categorical_index_repr_datetime_ordered(self): assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -445,22 +445,22 @@ def test_categorical_index_repr_datetime_ordered(self): def test_categorical_index_repr_period(self): # test all length - idx = period_range("2011-01-01 09:00", freq="H", periods=1) + idx = period_range("2011-01-01 09:00", freq="h", periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], 
categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=2) + idx = period_range("2011-01-01 09:00", freq="h", periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=3) + idx = period_range("2011-01-01 09:00", freq="h", periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -483,7 +483,7 @@ def test_categorical_index_repr_period(self): assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index 30f47e37fedf5..e513457819eb5 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -34,7 +34,7 @@ def test_freq_validation(self): arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 msg = ( - "Inferred frequency H from passed 
values does not " + "Inferred frequency h from passed values does not " "conform to passed frequency W-SUN" ) with pytest.raises(ValueError, match=msg): @@ -70,7 +70,7 @@ def test_from_pandas_array(self): result = DatetimeArray._from_sequence(arr)._with_freq("infer") - expected = pd.date_range("1970-01-01", periods=5, freq="H")._data + expected = pd.date_range("1970-01-01", periods=5, freq="h")._data tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index 903fc3177aa84..a1e1e8efe6dee 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -33,7 +33,7 @@ def test_arrow_extension_type(): "data, freq", [ (pd.date_range("2017", periods=3), "D"), - (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + (pd.date_range("2017", periods=3, freq="Y"), "Y-DEC"), ], ) def test_arrow_array(data, freq): @@ -110,7 +110,7 @@ def test_arrow_load_from_zero_chunks(): def test_arrow_table_roundtrip_without_metadata(): - arr = PeriodArray([1, 2, 3], dtype="period[H]") + arr = PeriodArray([1, 2, 3], dtype="period[h]") arr[1] = pd.NaT df = pd.DataFrame({"a": arr}) diff --git a/pandas/tests/arrays/period/test_constructors.py b/pandas/tests/arrays/period/test_constructors.py index 0ea26a6ece7eb..d034162f1b46e 100644 --- a/pandas/tests/arrays/period/test_constructors.py +++ b/pandas/tests/arrays/period/test_constructors.py @@ -71,11 +71,11 @@ def test_from_datetime64_freq_2M(freq): "data, freq, msg", [ ( - [pd.Period("2017", "D"), pd.Period("2017", "A")], + [pd.Period("2017", "D"), pd.Period("2017", "Y")], None, "Input has different freq", ), - ([pd.Period("2017", "D")], "A", "Input has different freq"), + ([pd.Period("2017", "D")], "Y", "Input has different freq"), ], ) def test_period_array_raises(data, freq, msg): diff --git a/pandas/tests/arrays/test_array.py 
b/pandas/tests/arrays/test_array.py index 2746cd91963a0..92536a222296e 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -135,14 +135,14 @@ def test_dt64_array(dtype_unit): ), # Timedelta ( - ["1H", "2H"], + ["1h", "2h"], np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( np.array([1, 2], dtype="m8[s]"), @@ -150,9 +150,9 @@ def test_dt64_array(dtype_unit): TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[s]")), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), None, - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( # preserve non-nano, i.e. don't cast to NumpyExtensionArray @@ -298,8 +298,8 @@ def test_array_copy(): ), # timedelta ( - [pd.Timedelta("1H"), pd.Timedelta("2H")], - TimedeltaArray._from_sequence(["1H", "2H"]), + [pd.Timedelta("1h"), pd.Timedelta("2h")], + TimedeltaArray._from_sequence(["1h", "2h"]), ), ( np.array([1, 2], dtype="m8[ns]"), @@ -350,7 +350,7 @@ def test_array_inference(data, expected): "data", [ # mix of frequencies - [pd.Period("2000", "D"), pd.Period("2001", "A")], + [pd.Period("2000", "D"), pd.Period("2001", "Y")], # mix of closed [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")], # Mix of timezones diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 291c687d84125..3f91b9b03e1de 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -858,7 +858,7 @@ def test_concat_same_type_invalid(self, arr1d): def test_concat_same_type_different_freq(self): # we *can* concatenate DTI with different freqs. 
a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) - b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) + b = DatetimeArray(pd.date_range("2000", periods=2, freq="h", tz="US/Central")) result = DatetimeArray._concat_same_type([a, b]) expected = DatetimeArray( pd.to_datetime( diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a105852395b3a..fc46e5a372806 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -747,7 +747,7 @@ def test_iter_zoneinfo_fold(self, tz): assert left.utcoffset() == right2.utcoffset() def test_date_range_frequency_M_deprecated(self): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." expected = pd.date_range("1/1/2000", periods=4, freq="2ME") with tm.assert_produces_warning(UserWarning, match=depr_msg): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index d1e954bc2ebe2..48453ba19e9a1 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -82,9 +82,9 @@ def test_setitem(key, value, expected): def test_setitem_raises_incompatible_freq(): arr = PeriodArray(np.arange(3), dtype="period[D]") with pytest.raises(IncompatibleFrequency, match="freq"): - arr[0] = pd.Period("2000", freq="A") + arr[0] = pd.Period("2000", freq="Y") - other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[A]") + other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[Y]") with pytest.raises(IncompatibleFrequency, match="freq"): arr[[0, 1]] = other @@ -133,8 +133,8 @@ def test_sub_period_overflow(): @pytest.mark.parametrize( "other", [ - pd.Period("2000", freq="H"), - PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[H]"), + pd.Period("2000", freq="h"), + PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[h]"), ], 
) def test_where_different_freq_raises(other): diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 1043c2ee6c9b6..21bc85a4d070e 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -196,7 +196,7 @@ def test_add_timedeltaarraylike(self, tda): class TestTimedeltaArray: @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = TimedeltaArray._from_sequence([Timedelta("1H"), Timedelta("2H")]) + arr = TimedeltaArray._from_sequence([Timedelta("1h"), Timedelta("2h")]) if np.dtype(dtype) != np.int64: with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): @@ -208,8 +208,8 @@ def test_astype_int(self, dtype): tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H")) - a[0] = Timedelta("1H") + a = TimedeltaArray(pd.timedelta_range("1h", periods=2, freq="h")) + a[0] = Timedelta("1h") assert a.freq is None @pytest.mark.parametrize( @@ -222,7 +222,7 @@ def test_setitem_clears_freq(self): ) def test_setitem_objects(self, obj): # make sure we accept timedelta64 and timedelta in addition to Timedelta - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") arr = TimedeltaArray(tdi, freq=tdi.freq) arr[0] = obj @@ -299,7 +299,7 @@ def test_neg(self): tm.assert_timedelta_array_equal(result2, expected) def test_neg_freq(self): - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") arr = TimedeltaArray(tdi, freq=tdi.freq) expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) diff --git a/pandas/tests/arrays/timedeltas/test_reductions.py b/pandas/tests/arrays/timedeltas/test_reductions.py index 72d45f5b9a78c..3718e7e646ea9 100644 --- a/pandas/tests/arrays/timedeltas/test_reductions.py +++ 
b/pandas/tests/arrays/timedeltas/test_reductions.py @@ -35,14 +35,14 @@ def test_sum_empty(self, skipna): assert result == Timedelta(0) def test_min_max(self): - arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) + arr = TimedeltaArray._from_sequence(["3h", "3h", "NaT", "2h", "5h", "4h"]) result = arr.min() - expected = Timedelta("2H") + expected = Timedelta("2h") assert result == expected result = arr.max() - expected = Timedelta("5H") + expected = Timedelta("5h") assert result == expected result = arr.min(skipna=False) @@ -52,7 +52,7 @@ def test_min_max(self): assert result is pd.NaT def test_sum(self): - tdi = pd.TimedeltaIndex(["3H", "3H", "NaT", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "NaT", "2h", "5h", "4h"]) arr = tdi.array result = arr.sum(skipna=True) @@ -86,7 +86,7 @@ def test_sum(self): def test_npsum(self): # GH#25282, GH#25335 np.sum should return a Timedelta, not timedelta64 - tdi = pd.TimedeltaIndex(["3H", "3H", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "2h", "5h", "4h"]) arr = tdi.array result = np.sum(tdi) @@ -133,7 +133,7 @@ def test_sum_2d_skipna_false(self): ], ) def test_std(self, add): - tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) + add + tdi = pd.TimedeltaIndex(["0h", "4h", "NaT", "4h", "0h", "2h"]) + add arr = tdi.array result = arr.std(skipna=True) @@ -162,7 +162,7 @@ def test_std(self, add): assert np.isnat(result) def test_median(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi.array result = arr.median(skipna=True) @@ -181,7 +181,7 @@ def test_median(self): assert result is pd.NaT def test_mean(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi._data # manually verified result diff --git a/pandas/tests/base/test_conversion.py 
b/pandas/tests/base/test_conversion.py index db13e979a3c2d..3e0b0dbeb5624 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -192,9 +192,9 @@ def test_iter_box(self): "datetime64[ns, US/Central]", ), ( - pd.PeriodIndex([2018, 2019], freq="A"), + pd.PeriodIndex([2018, 2019], freq="Y"), PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"), ), (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"), ( @@ -314,7 +314,7 @@ def test_array_multiindex_raises(): ), # Timedelta ( - TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="h"), np.array([0, 3600000000000], dtype="m8[ns]"), ), # GH#26406 tz is preserved in Categorical[dt64tz] diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index af7e759902f9f..31f80d300ccca 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -99,7 +99,9 @@ def test_series_from_series_with_reindex(using_copy_on_write): def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): if idx is None or dtype is not None: fastpath = False - ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) ser_orig = ser.copy() data = getattr(arr, "_data", arr) if using_copy_on_write: @@ -157,7 +159,9 @@ def test_series_from_index_different_dtypes(using_copy_on_write): def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath): ser = Series([1, 2, 3], dtype="int64") ser_orig = ser.copy() - ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): + ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) assert np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 76b974330cbf1..45bfc6a6fcf9b 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1311,7 +1311,7 @@ def test_rename_axis(using_copy_on_write, kwargs): def test_tz_convert_localize(using_copy_on_write, func, tz): # GH 49473 ser = Series( - [1, 2], index=date_range(start="2014-08-01 09:00", freq="H", periods=2, tz=tz) + [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz) ) ser_orig = ser.copy() ser2 = getattr(ser, func)("US/Central") diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index 8ce05337be70b..83ef7382fbe8a 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -122,7 +122,7 @@ def test_period_dtype_match(): [ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), PeriodDtype(freq="2D"), - PeriodDtype(freq="H"), + PeriodDtype(freq="h"), np.dtype("datetime64[ns]"), object, np.int64, diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index ed08df74461ef..50eaa1f4d8713 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -170,7 +170,7 @@ def test_infer_dtype_from_scalar(value, expected): @pytest.mark.parametrize( "arr, expected", [ - ([1], np.int_), + ([1], np.dtype(int)), (np.array([1], dtype=np.int64), np.int64), ([np.nan, 1, ""], np.object_), (np.array([[1.0, 2.0]]), np.float64), diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4507857418e9e..8e99c074a2c55 100644 --- 
a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -274,7 +274,7 @@ def test_is_period_dtype(): assert not com.is_period_dtype(pd.Period("2017-01-01")) assert com.is_period_dtype(PeriodDtype(freq="D")) - assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="Y")) def test_is_interval_dtype(): @@ -301,14 +301,23 @@ def test_is_categorical_dtype(): assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) -def test_is_string_dtype(): - assert not com.is_string_dtype(int) - assert not com.is_string_dtype(pd.Series([1, 2])) +@pytest.mark.parametrize( + "dtype, expected", + [ + (int, False), + (pd.Series([1, 2]), False), + (str, True), + (object, True), + (np.array(["a", "b"]), True), + (pd.StringDtype(), True), + (pd.Index([], dtype="O"), True), + ], +) +def test_is_string_dtype(dtype, expected): + # GH#54661 - assert com.is_string_dtype(str) - assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(["a", "b"])) - assert com.is_string_dtype(pd.StringDtype()) + result = com.is_string_dtype(dtype) + assert result is expected @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 6562074eee634..1f9c371c50ad4 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -432,12 +432,12 @@ def test_construction(self): assert dt.freq == pd.tseries.offsets.Day(3) for s in [ - "period[26H]", - "Period[26H]", - "26H", - "period[1D2H]", - "Period[1D2H]", - "1D2H", + "period[26h]", + "Period[26h]", + "26h", + "period[1D2h]", + "Period[1D2h]", + "1D2h", ]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Hour(26) @@ -533,7 +533,7 @@ def test_basic(self, dtype): with tm.assert_produces_warning(FutureWarning, match=msg): assert is_period_dtype(dtype) - pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H") + pidx = pd.period_range("2013-01-01 09:00", periods=5, 
freq="h") assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 3d1274df0a21b..12006248b1db3 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -248,7 +248,7 @@ def get_reduction_result_dtype(dtype): return NUMPY_INT_TO_DTYPE[np.dtype(int)] else: # i.e. dtype.kind == "u" - return NUMPY_INT_TO_DTYPE[np.dtype(np.uint)] + return NUMPY_INT_TO_DTYPE[np.dtype("uint")] if method in ["sum", "prod"]: # std and var are not dtype-preserving diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4e0bc8d804bab..e10c6ef9a7018 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 339e97e735f85..41312f45838a9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2484,10 +2484,10 @@ def test_dt_roundlike_tz_options_not_supported(method): dtype=ArrowDtype(pa.timestamp("ns")), ) with pytest.raises(NotImplementedError, match="ambiguous is not supported."): - getattr(ser.dt, method)("1H", ambiguous="NaT") + 
getattr(ser.dt, method)("1h", ambiguous="NaT") with pytest.raises(NotImplementedError, match="nonexistent is not supported."): - getattr(ser.dt, method)("1H", nonexistent="NaT") + getattr(ser.dt, method)("1h", nonexistent="NaT") @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) @@ -2506,7 +2506,7 @@ def test_dt_roundlike_unsupported_freq(method): @pytest.mark.xfail( pa_version_under7p0, reason="Methods not supported for pyarrow < 7.0" ) -@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) def test_dt_ceil_year_floor(freq, method): ser = pd.Series( diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 7dffa7bb242d5..55d4a6c3b39fa 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -281,7 +281,7 @@ def test_frame_from_records_utc(self): def test_from_records_to_records(self): # from numpy documentation - arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr = np.zeros((2,), dtype=("i4,f4,S10")) arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] DataFrame.from_records(arr) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..de8df15a9d747 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -108,11 +108,11 @@ def test_setitem_list(self, float_frame): data["A"] = newcolumndata def test_setitem_list2(self): - df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=int) df.loc[1, ["tt1", "tt2"]] = [1, 2] result = df.loc[df.index[1], ["tt1", "tt2"]] - expected = Series([1, 2], df.columns, dtype=np.int_, name=1) + expected = Series([1, 2], df.columns, 
dtype=int, name=1) tm.assert_series_equal(result, expected) df["tt1"] = df["tt2"] = "0" @@ -1905,6 +1905,19 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH#55366 + pytest.importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 87a56c0736287..25eb2ccb18361 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -23,8 +23,8 @@ def test_align_asfreq_method_raises(self): df.align(df.iloc[::-1], method="asfreq") def test_frame_align_aware(self): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") - idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern") df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1) df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2) new1, new2 = df1.align(df2) diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index a2f8f3e278395..b3ab11d07bd7e 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -63,8 +63,8 @@ def test_asfreq2(self, frame_or_series): def test_asfreq_datetimeindex_empty(self, frame_or_series): # GH#14320 index = DatetimeIndex(["2016-09-29 11:00"]) - expected = 
frame_or_series(index=index, dtype=object).asfreq("H") - result = frame_or_series([3], index=index.copy()).asfreq("H") + expected = frame_or_series(index=index, dtype=object).asfreq("h") + result = frame_or_series([3], index=index.copy()).asfreq("h") tm.assert_index_equal(expected.index, result.index) @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) @@ -104,7 +104,7 @@ def test_asfreq_keep_index_name(self, frame_or_series): assert index_name == obj.asfreq("10D").index.name def test_asfreq_ts(self, frame_or_series): - index = period_range(freq="A", start="1/1/2001", end="12/31/2010") + index = period_range(freq="Y", start="1/1/2001", end="12/31/2010") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 3)), index=index ) @@ -194,8 +194,8 @@ def test_asfreq_with_date_object_index(self, frame_or_series): ts2 = ts.copy() ts2.index = [x.date() for x in ts2.index] - result = ts2.asfreq("4H", method="ffill") - expected = ts.asfreq("4H", method="ffill") + result = ts2.asfreq("4h", method="ffill") + expected = ts.asfreq("4h", method="ffill") tm.assert_equal(result, expected) def test_asfreq_with_unsorted_index(self, frame_or_series): @@ -235,7 +235,7 @@ def test_asfreq_2ME(self, freq, freq_half): tm.assert_frame_equal(result, expected) def test_asfreq_frequency_M_deprecated(self): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." 
index = date_range("1/1/2000", periods=4, freq="ME") df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 5683ec60b0d88..4a8adf89b3aef 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -178,7 +178,7 @@ def test_is_copy(self, date_range_frame): def test_asof_periodindex_mismatched_freq(self): N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng) # Mismatched freq diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 67200396f6375..4c1434bd66aff 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -18,7 +18,7 @@ class TestAtTime: def test_localized_at_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = frame_or_series( np.random.default_rng(2).standard_normal(len(rng)), index=rng ) @@ -69,7 +69,7 @@ def test_at_time_nonexistent(self, frame_or_series): ) def test_at_time_errors(self, hour): # GH#24043 - dti = date_range("2018", periods=3, freq="H") + dti = date_range("2018", periods=3, freq="h") df = DataFrame(list(range(len(dti))), index=dti) if getattr(hour, "tzinfo", None) is None: result = df.at_time(hour) @@ -81,7 +81,7 @@ def test_at_time_errors(self, hour): def test_at_time_tz(self): # GH#24043 - dti = date_range("2018", periods=3, freq="H", tz="US/Pacific") + dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) expected = df.iloc[1:2] diff --git a/pandas/tests/frame/methods/test_between_time.py 
b/pandas/tests/frame/methods/test_between_time.py index 4c1e009b04639..74d6291707e19 100644 --- a/pandas/tests/frame/methods/test_between_time.py +++ b/pandas/tests/frame/methods/test_between_time.py @@ -46,7 +46,7 @@ def test_between_time_formats(self, frame_or_series): def test_localized_between_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) if frame_or_series is DataFrame: ts = ts.to_frame() diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index f56a7896c753e..5beb09940acf3 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -204,7 +204,7 @@ def test_describe_datetime_columns(self): def test_describe_timedelta_values(self): # GH#6145 t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + t2 = pd.timedelta_range("1 hours", freq="h", periods=5) df = DataFrame({"t1": t1, "t2": t2}) expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 9a4882f11e961..f72c0594fa1f7 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -57,7 +57,7 @@ def test_drop_with_non_unique_datetime_index_and_invalid_keys(): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=pd.date_range("2012", freq="H", periods=5), + index=pd.date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique datetime index df = df.iloc[[0, 2, 2, 3]].copy() diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 291a79815a81c..67aa07dd83764 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ 
b/pandas/tests/frame/methods/test_interpolate.py @@ -497,3 +497,9 @@ def test_interpolate_empty_df(self): result = df.interpolate(inplace=True) assert result is None tm.assert_frame_equal(df, expected) + + def test_interpolate_ea_raise(self): + # GH#55347 + df = DataFrame({"a": [1, None, 2]}, dtype="Int64") + with pytest.raises(NotImplementedError, match="does not implement"): + df.interpolate() diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 2d4ac1d4a4444..735f6c50ab739 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -22,7 +22,7 @@ def frame_with_period_index(): return DataFrame( data=np.arange(20).reshape(4, 5), columns=list("abcde"), - index=period_range(start="2000", freq="A", periods=4), + index=period_range(start="2000", freq="Y", periods=4), ) @@ -158,9 +158,14 @@ def test_join_invalid_validate(left_no_dup, right_no_dup): left_no_dup.merge(right_no_dup, on="a", validate="invalid") -def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups): +@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"]) +def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype): # GH 46622 # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) left_no_dup.join( right_w_dups, on="a", diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 8b451c84dc5da..b5b5e42691e59 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected): df.rank() result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype, exp_dtype", + [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], 
+ ) + def test_rank_string_dtype(self, dtype, exp_dtype): + # GH#55362 + pytest.importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0105c41bd0eca..fb6e08cd52d97 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -36,7 +36,7 @@ def test_dti_set_index_reindex_datetimeindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") - idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") + idx2 = date_range("2013", periods=6, freq="Y", tz="Asia/Tokyo") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -46,7 +46,7 @@ def test_dti_set_index_reindex_datetimeindex(self): def test_dti_set_index_reindex_freq_with_tz(self): # GH#11314 with tz index = date_range( - datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="h", tz="US/Eastern" ) df = DataFrame( np.random.default_rng(2).standard_normal((24, 1)), @@ -54,7 +54,7 @@ def test_dti_set_index_reindex_freq_with_tz(self): index=index, ) new_index = date_range( - datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="h", tz="US/Eastern" ) result = df.set_index(new_index) @@ -389,9 +389,9 @@ def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values) # GH#38566 obj = frame_or_series( [0, 1, 2, 3], - index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"), + index=date_range("2020-01-01 00:00:00", periods=4, freq="h", tz="UTC"), ) - new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC") + 
new_index = date_range("2020-01-01 00:01:00", periods=4, freq="h", tz="UTC") result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour")) expected = frame_or_series(exp_values, index=new_index) tm.assert_equal(result, expected) @@ -1067,7 +1067,7 @@ def test_reindex_multi_categorical_time(self): midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) df = DataFrame({"a": range(len(midx))}, index=midx) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 67dd5b6217187..a38d2c6fd016a 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -339,9 +339,7 @@ def test_select_dtypes_datetime_with_tz(self): expected = df3.reindex(columns=[]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype", [str, "str", np.bytes_, "S1", "unicode", np.str_, "U1"] - ) + @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg): df = DataFrame( diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5984e591dd6c1..9b87ffb0241ef 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -99,7 +99,7 @@ def test_set_index_cast_datetimeindex(self): assert isinstance(idf.index, DatetimeIndex) def test_set_index_dst(self): - di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") + di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific") df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level @@ -491,16 +491,16 @@ def test_set_index_period(self): df = 
DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011-01-01", periods=3, freq="M") idx1 = idx1.append(idx1) - idx2 = period_range("2013-01-01 09:00", periods=2, freq="H") + idx2 = period_range("2013-01-01 09:00", periods=2, freq="h") idx2 = idx2.append(idx2).append(idx2) - idx3 = period_range("2005", periods=6, freq="A") + idx3 = period_range("2005", periods=6, freq="Y") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) expected1 = period_range("2011-01-01", periods=3, freq="M") - expected2 = period_range("2013-01-01 09:00", periods=2, freq="H") + expected2 = period_range("2013-01-01 09:00", periods=2, freq="h") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -694,7 +694,7 @@ def test_set_index_periodindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011/01/01", periods=6, freq="M") - idx2 = period_range("2013", periods=6, freq="A") + idx2 = period_range("2013", periods=6, freq="Y") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 60c05767a5e1a..201046ebafc35 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.compat.numpy import np_long import pandas.util._test_decorators as td import pandas as pd @@ -36,7 +37,7 @@ def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): # Can't pass both! 
obj = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) msg = ( @@ -44,12 +45,12 @@ def test_shift_deprecate_freq_and_fill_value(self, frame_or_series): "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, fill_value=1, freq="H") + obj.shift(1, fill_value=1, freq="h") if frame_or_series is DataFrame: - obj.columns = date_range("1/1/2000", periods=1, freq="H") + obj.columns = date_range("1/1/2000", periods=1, freq="h") with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, axis=1, fill_value=1, freq="H") + obj.shift(1, axis=1, fill_value=1, freq="h") @pytest.mark.parametrize( "input_data, output_data", @@ -76,7 +77,7 @@ def test_shift_non_writable_array(self, input_data, output_data, frame_or_series def test_shift_mismatched_freq(self, frame_or_series): ts = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) result = ts.shift(1, freq="5min") @@ -84,7 +85,7 @@ def test_shift_mismatched_freq(self, frame_or_series): tm.assert_index_equal(result.index, exp_index) # GH#1063, multiple of same base - result = ts.shift(1, freq="4H") + result = ts.shift(1, freq="4h") exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) @@ -92,7 +93,7 @@ def test_shift_mismatched_freq(self, frame_or_series): "obj", [ Series([np.arange(5)]), - date_range("1/1/2011", periods=24, freq="H"), + date_range("1/1/2011", periods=24, freq="h"), Series(range(5), index=date_range("2017", periods=5)), ], ) @@ -144,20 +145,20 @@ def test_shift_preserve_freqstr(self, periods, frame_or_series): # GH#21275 obj = frame_or_series( range(periods), - index=date_range("2016-1-1 00:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 00:00:00", periods=periods, freq="h"), ) - result 
= obj.shift(1, "2H") + result = obj.shift(1, "2h") expected = frame_or_series( range(periods), - index=date_range("2016-1-1 02:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 02:00:00", periods=periods, freq="h"), ) tm.assert_equal(result, expected) def test_shift_dst(self, frame_or_series): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(0) @@ -179,7 +180,7 @@ def test_shift_dst(self, frame_or_series): @pytest.mark.parametrize("ex", [10, -10, 20, -20]) def test_shift_dst_beyond(self, frame_or_series, ex): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(ex) exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") @@ -366,7 +367,7 @@ def test_shift_categorical_fill_value(self, frame_or_series): def test_shift_fill_value(self, frame_or_series): # GH#24128 - dti = date_range("1/1/2000", periods=5, freq="H") + dti = date_range("1/1/2000", periods=5, freq="h") ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti) exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti) @@ -471,22 +472,22 @@ def test_shift_axis1_multiple_blocks_with_int_fill(self): df1 = DataFrame(rng.integers(1000, size=(5, 3), dtype=int)) df2 = DataFrame(rng.integers(1000, size=(5, 2), dtype=int)) df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) - result = df3.shift(2, axis=1, fill_value=np.int_(0)) + result = df3.shift(2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([-1, -1, 0, 1], axis=1) - expected.iloc[:, :2] = np.int_(0) + expected.iloc[:, :2] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) # Case with periods < 0 df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], 
axis=1) - result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + result = df3.shift(-2, axis=1, fill_value=np_long(0)) assert len(df3._mgr.blocks) == 2 expected = df3.take([2, 3, -1, -1], axis=1) - expected.iloc[:, -2:] = np.int_(0) + expected.iloc[:, -2:] = np_long(0) expected.columns = df3.columns tm.assert_frame_equal(result, expected) @@ -706,7 +707,7 @@ def test_shift_with_iterable_freq_and_fill_value(self): # GH#44424 df = DataFrame( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) tm.assert_frame_equal( @@ -716,8 +717,8 @@ def test_shift_with_iterable_freq_and_fill_value(self): ) tm.assert_frame_equal( - df.shift([1], freq="H").rename(columns=lambda x: int(x[0])), - df.shift(1, freq="H"), + df.shift([1], freq="h").rename(columns=lambda x: int(x[0])), + df.shift(1, freq="h"), ) msg = ( @@ -725,7 +726,7 @@ def test_shift_with_iterable_freq_and_fill_value(self): "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - df.shift([1, 2], fill_value=1, freq="H") + df.shift([1, 2], fill_value=1, freq="h") def test_shift_with_iterable_check_other_arguments(self): # GH#44424 diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 985a9e3602410..f0222e5cec9b5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -955,3 +955,42 @@ def test_sort_index_multiindex_sort_remaining(self, ascending): ) tm.assert_frame_equal(result, expected) + + +def test_sort_index_with_sliced_multiindex(): + # GH 55379 + mi = MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("b", "16"), + ("b", "26"), + ("a", "45"), + ("b", "28"), + ("a", "5"), + ("a", "50"), + ("a", "51"), + ("b", "4"), + ], + names=["group", "str"], + ) + + df = DataFrame({"x": range(len(mi))}, index=mi) + result = df.iloc[0:6].sort_index() + + expected = 
DataFrame( + {"x": [0, 1, 2, 5, 3, 4]}, + index=MultiIndex.from_tuples( + [ + ("a", "10"), + ("a", "18"), + ("a", "25"), + ("a", "45"), + ("b", "16"), + ("b", "26"), + ], + names=["group", "str"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 9f45347c31165..94c98ad477cc1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -420,7 +420,7 @@ def test_to_csv_from_csv_w_some_infs(self, float_frame): # test roundtrip with inf, -inf, nan, as full columns and mix float_frame["G"] = np.nan f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5] - float_frame["H"] = float_frame.index.map(f) + float_frame["h"] = float_frame.index.map(f) with tm.ensure_clean() as path: float_frame.to_csv(path) @@ -1077,7 +1077,7 @@ def test_to_csv_with_dst_transitions(self, td): "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) i = times + td @@ -1095,7 +1095,7 @@ def test_to_csv_with_dst_transitions(self, td): def test_to_csv_with_dst_transitions_with_pickle(self): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index fa8c4e4811ea6..fab90b112fa94 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -253,7 +253,7 @@ def test_to_records_with_categorical(self): ), # Pass in a dtype instance. 
( - {"column_dtypes": np.dtype("unicode")}, + {"column_dtypes": np.dtype(np.str_)}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[ diff --git a/pandas/tests/frame/methods/test_to_timestamp.py b/pandas/tests/frame/methods/test_to_timestamp.py index e72b576fca833..aeb65d98d8ab2 100644 --- a/pandas/tests/frame/methods/test_to_timestamp.py +++ b/pandas/tests/frame/methods/test_to_timestamp.py @@ -16,7 +16,7 @@ import pandas._testing as tm -def _get_with_delta(delta, freq="A-DEC"): +def _get_with_delta(delta, freq="Y-DEC"): return date_range( to_datetime("1/1/2001") + delta, to_datetime("12/31/2009") + delta, @@ -27,7 +27,7 @@ def _get_with_delta(delta, freq="A-DEC"): class TestToTimestamp: def test_to_timestamp(self, frame_or_series): K = 5 - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -36,7 +36,7 @@ def test_to_timestamp(self, frame_or_series): obj["mix"] = "a" obj = tm.get_obj(obj, frame_or_series) - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="Y-DEC") exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") result = obj.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) @@ -71,7 +71,7 @@ def test_to_timestamp(self, frame_or_series): def test_to_timestamp_columns(self): K = 5 - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") df = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -82,7 +82,7 @@ def test_to_timestamp_columns(self): # columns df = df.T - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="Y-DEC") exp_index = exp_index + Timedelta(1, "D") - 
Timedelta(1, "ns") result = df.to_timestamp("D", "end", axis=1) tm.assert_index_equal(result.columns, exp_index) @@ -122,7 +122,7 @@ def test_to_timestamp_columns(self): assert result2.columns.freqstr == "AS-JAN" def test_to_timestamp_invalid_axis(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 5)), index=index ) @@ -132,12 +132,12 @@ def test_to_timestamp_invalid_axis(self): obj.to_timestamp(axis=2) def test_to_timestamp_hourly(self, frame_or_series): - index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + index = period_range(freq="h", start="1/1/2001", end="1/2/2001") obj = Series(1, index=index, name="foo") if frame_or_series is not Series: obj = obj.to_frame() - exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h") result = obj.to_timestamp(how="end") exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 8ff6ea37eae18..50fc6fe6984e7 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -70,7 +70,7 @@ def test_transpose_tzaware_2col_mixed_tz(self): @pytest.mark.parametrize("tz", [None, "America/New_York"]) def test_transpose_preserves_dtindex_equality_with_dst(self, tz): # GH#19970 - idx = date_range("20161101", "20161130", freq="4H", tz=tz) + idx = date_range("20161101", "20161130", freq="4h", tz=tz) df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) result = df.T == df.T expected = DataFrame(True, index=list("ab"), columns=idx) diff --git a/pandas/tests/frame/methods/test_tz_convert.py b/pandas/tests/frame/methods/test_tz_convert.py index 
8a484abaab54c..bcb8e423980fd 100644 --- a/pandas/tests/frame/methods/test_tz_convert.py +++ b/pandas/tests/frame/methods/test_tz_convert.py @@ -120,7 +120,7 @@ def test_tz_convert_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 obj = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="Europe/Berlin"), + index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"), ) orig = obj.copy() result = obj.tz_convert("UTC", copy=copy) diff --git a/pandas/tests/frame/methods/test_tz_localize.py b/pandas/tests/frame/methods/test_tz_localize.py index ed2b0b247e62c..b167afc17f484 100644 --- a/pandas/tests/frame/methods/test_tz_localize.py +++ b/pandas/tests/frame/methods/test_tz_localize.py @@ -16,7 +16,7 @@ class TestTZLocalize: # test_tz_convert_and_localize in test_tz_convert def test_tz_localize(self, frame_or_series): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) @@ -29,7 +29,7 @@ def test_tz_localize(self, frame_or_series): tm.assert_equal(result, expected) def test_tz_localize_axis1(self): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") df = DataFrame({"a": 1}, index=rng) @@ -43,7 +43,7 @@ def test_tz_localize_axis1(self): def test_tz_localize_naive(self, frame_or_series): # Can't localize if already tz-aware - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") ts = Series(1, index=rng) ts = frame_or_series(ts) @@ -54,13 +54,13 @@ def test_tz_localize_naive(self, frame_or_series): def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 obj = frame_or_series( - np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=None) + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None) ) orig 
= obj.copy() result = obj.tz_localize("UTC", copy=copy) expected = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="UTC"), + index=date_range("20131027", periods=5, freq="1h", tz="UTC"), ) tm.assert_equal(result, expected) tm.assert_equal(obj, orig) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..06bf169bf4dbc 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -315,6 +315,15 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + def test_attrs_deepcopy(self): + df = DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["tags"] = {"spam", "ham"} + + result = df.rename(columns=str) + assert result.attrs == df.attrs + assert result.attrs["tags"] is not df.attrs["tags"] + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( self, allows_duplicate_labels, frame_or_series, using_copy_on_write diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bb9a76829c77d..09a5cda4b3458 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1223,7 +1223,7 @@ def test_frame_single_columns_object_sum_axis_1(): class TestFrameArithmeticUnsorted: def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = pd.date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + rng = pd.date_range("1/1/2011", periods=10, freq="h", tz="US/Eastern") df = DataFrame( np.random.default_rng(2).standard_normal(len(rng)), index=rng, columns=["a"] ) @@ -1236,7 +1236,7 @@ def test_frame_add_tz_mismatch_converts_to_utc(self): assert result.index.tz is timezone.utc def test_align_frame(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = DataFrame( np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng ) diff --git 
a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd851ab244cb8..3d8053703e906 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1823,7 +1823,7 @@ def test_constructor_single_value(self): DataFrame("a", [1, 2], ["a", "c"], float) def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name + intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -2471,8 +2471,8 @@ def test_dataframe_constructor_infer_multiindex(self): [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) @@ -2743,6 +2743,13 @@ def test_frame_string_inference_array_string_dtype(self): df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"]) tm.assert_frame_equal(df, expected) + def test_frame_string_inference_block_dim(self): + # GH#55363 + pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index e66557f132c1d..0d5c2e3cd6c13 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,10 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import ( + np_long, + np_ulong, +) import pandas.util._test_decorators as td import pandas as pd @@ -908,7 +912,7 @@ def test_mean_datetimelike(self): "A": np.arange(3), "B": date_range("2016-01-01", periods=3), "C": 
pd.timedelta_range("1D", periods=3), - "D": pd.period_range("2016", periods=3, freq="A"), + "D": pd.period_range("2016", periods=3, freq="Y"), } ) result = df.mean(numeric_only=True) @@ -933,7 +937,7 @@ def test_mean_datetimelike_numeric_only_false(self): tm.assert_series_equal(result, expected) # mean of period is not allowed - df["D"] = pd.period_range("2016", periods=3, freq="A") + df["D"] = pd.period_range("2016", periods=3, freq="Y") with pytest.raises(TypeError, match="mean is not implemented for Period"): df.mean(numeric_only=False) @@ -1056,6 +1060,19 @@ def test_idxmax_numeric_only(self, numeric_only): expected = Series([1, 0, 1], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_idxmax_arrow_types(self): + # GH#55368 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() + expected = Series([1, 0], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([2, 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" @@ -1700,11 +1717,11 @@ class TestEmptyDataFrameReductions: "opname, dtype, exp_value, exp_dtype", [ ("sum", np.int8, 0, np.int64), - ("prod", np.int8, 1, np.int_), + ("prod", np.int8, 1, np_long), ("sum", np.int64, 0, np.int64), ("prod", np.int64, 1, np.int64), ("sum", np.uint8, 0, np.uint64), - ("prod", np.uint8, 1, np.uint), + ("prod", np.uint8, 1, np_ulong), ("sum", np.uint64, 0, np.uint64), ("prod", np.uint64, 1, np.uint64), ("sum", np.float32, 0, np.float32), diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 64d516e484991..0634b8268c04c 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -313,7 +313,7 @@ def test_latex_repr(self): def test_repr_categorical_dates_periods(self): # 
normal DataFrame - dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + dt = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") p = period_range("2011-01", freq="M", periods=5) df = DataFrame({"dt": dt, "p": p}) exp = """ dt p @@ -339,7 +339,7 @@ def test_repr_np_nat_with_object(self, arg, box, expected): assert result == expected def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="Y-DEC")}) # it works! repr(df) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 1522b83a4f5d0..0f7ae998a4b2b 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -281,12 +281,12 @@ ( pd.Series, (1, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.DataFrame, ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.Series, @@ -628,9 +628,9 @@ def test_string_method(method): operator.methodcaller("tz_localize", "CET"), operator.methodcaller("normalize"), operator.methodcaller("strftime", "%Y"), - operator.methodcaller("round", "H"), - operator.methodcaller("floor", "H"), - operator.methodcaller("ceil", "H"), + operator.methodcaller("round", "h"), + operator.methodcaller("floor", "h"), + operator.methodcaller("ceil", "h"), operator.methodcaller("month_name"), operator.methodcaller("day_name"), ], diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 865fda0ab54a2..5c99882cef6d2 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -229,7 +229,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so 
hardcode expected # for these df = DataFrame([11, 12, 13], columns=["a"]) - grps = np.arange(0, 25, 5, dtype=np.int_) + grps = np.arange(0, 25, 5, dtype=int) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( "sum", alt=None, numeric_only=True diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 7ea107f254104..398e9b09693e6 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -106,7 +106,7 @@ def test_agg_dict_parameter_cast_result_dtypes(): df = DataFrame( { "class": ["A", "A", "B", "B", "C", "C", "D", "D"], - "time": date_range("1/1/2011", periods=8, freq="H"), + "time": date_range("1/1/2011", periods=8, freq="h"), } ) df.loc[[0, 1, 2, 5], "time"] = None diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index abcb9f68e0f5c..5331b2e2c5d81 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1306,7 +1306,7 @@ def test_positional_slice_groups_datetimelike(): # GH 21651 expected = DataFrame( { - "date": pd.date_range("2010-01-01", freq="12H", periods=5), + "date": pd.date_range("2010-01-01", freq="12h", periods=5), "vals": range(5), "let": list("abcde"), } diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b11240c841420..11291bb89b604 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ 
-1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1883,14 +1910,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if ( - reduction_func in ("idxmax", "idxmin") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmax/min fail with unused categories" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index: msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.node.add_marker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: @@ -1912,6 +1932,15 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in 
["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], *args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 41bbfcf6840a9..08372541988d0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize( + "func, values, expected_values, warn", + [ + ("idxmin", [0, 1, 2], [0, 2], None), + ("idxmax", [0, 1, 2], [1, 2], None), + ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), + ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): + # GH#54234 + df = DataFrame( + { + "a": [1, 1, 2], + "b": values, + } + ) + gb = df.groupby("a") + index = Index([1, 2], name="a") + expected = DataFrame({"b": expected_values}, index=index) + if test_series: + gb = gb["b"] + expected = expected["b"] + klass = "Series" if test_series else "DataFrame" + msg = f"The behavior of {klass}GroupBy.{func} with 
all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, func)(skipna=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): @@ -824,7 +857,7 @@ def test_nlargest_and_smallest_noop(data, groups, dtype, method): data = list(reversed(data)) ser = Series(data, name="a") result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9132be50d5857..7297d049587e6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby( - columns, keys, values, method, op, request, using_array_manager, dropna -): +def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected(): is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in 
["min", "max"]: - msg = f"Cannot perform {op} with non-ordered Categorical" - with pytest.raises(TypeError, match=msg): + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): + if op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + klass = TypeError + else: + msg = f"Can't get {op} of an empty group due to unobserved categories" + klass = ValueError + with pytest.raises(klass, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max"] and isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() @@ -3004,12 +3001,12 @@ def test_groupby_reduce_period(): res = gb.max() expected = ser[-10:] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) res = gb.min() expected = ser[:10] - expected.index = Index(range(10), dtype=np.int_) + expected.index = Index(range(10), dtype=int) tm.assert_series_equal(res, expected) @@ -3245,3 +3242,12 @@ def test_get_group_axis_1(): } ) tm.assert_frame_equal(result, expected) + + +def test_groupby_ffill_with_duplicated_index(): + # GH#43412 + df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2]) + + result = df.groupby(level=0).ffill() + expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2]) + tm.assert_frame_equal(result, expected, check_dtype=False) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d82278c277d48..8065aa63dff81 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -503,18 +503,7 @@ def test_null_is_null_for_dtype( @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, 
index_kind -): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -544,6 +533,17 @@ def test_categorical_reducers( args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].replace(4, None) @@ -573,9 +573,6 @@ def test_categorical_reducers( if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index bb4b9aa866ac9..f2d40867af03a 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -205,11 +205,11 @@ def test_group_shift_with_multiple_periods_and_freq(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) 
shifted_df = df.groupby("b")[["a"]].shift( [0, 1], - freq="H", + freq="h", ) expected_df = DataFrame( { @@ -223,7 +223,7 @@ def test_group_shift_with_multiple_periods_and_freq(): 5.0, ], }, - index=date_range("1/1/2000", periods=6, freq="H"), + index=date_range("1/1/2000", periods=6, freq="h"), ) tm.assert_frame_equal(shifted_df, expected_df) @@ -244,11 +244,11 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): # GH#44424 df = DataFrame( {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) msg = ( "Passing a 'freq' together with a 'fill_value' silently ignores the " "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") + df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 5adf9ace255ea..88ee8a35e5c94 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -475,7 +475,7 @@ def test_groupby_with_datetime_key(self): df = DataFrame( { "id": ["a", "b"] * 3, - "b": date_range("2000-01-01", "2000-01-03", freq="9H"), + "b": date_range("2000-01-01", "2000-01-03", freq="9h"), } ) grouper = Grouper(key="b", freq="D") diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 37eb52be0b37b..30c7e1df1e691 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -108,7 +108,7 @@ def test_max_inat_not_all_na(): # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_series_equal(result, expected, check_exact=True) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index efe7b171d630d..4e7c09b70feb0 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -421,7 +421,7 @@ def test_timestamp_groupby_quantile(): { "timestamp": pd.date_range( start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC" - ).floor("1H"), + ).floor("1h"), "category": list(range(1, 101)), "value": list(range(101, 201)), } @@ -468,7 +468,7 @@ def test_groupby_quantile_dt64tz_period(): # Check that we match the group-by-group result exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9a2b3d44b117..46bf324fad1d7 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -97,22 +97,24 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if 
how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -233,8 +235,7 @@ def test_groupby_raises_string_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -297,13 +298,11 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -342,8 +341,7 @@ def test_groupby_raises_datetime_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -540,8 +538,7 @@ def test_groupby_raises_category_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, 
how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -572,6 +569,16 @@ def test_groupby_raises_category_on_category( return empty_groups = any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True klass, msg = { "all": (None, ""), @@ -617,10 +624,10 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") + "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") + "idxmin": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), "last": (None, ""), diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 76b26c04f9f3a..c275db9d1788c 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -39,7 +39,7 @@ def test_size_axis_1(df, axis_1, by, sort, dropna): if sort: expected = expected.sort_index() if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(np.int_) + expected.index = expected.index.astype(int) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a9e67df1fb793..31629ba697e33 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -193,7 +193,7 @@ def test_timegrouper_with_reg_groups(self): ).set_index(["Date", "Buyer"]) msg = 
"The default value of numeric_only" - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="Y"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -336,7 +336,7 @@ def test_timegrouper_with_reg_groups(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["D", "ME", "A", "Q-APR"]) + @pytest.mark.parametrize("freq", ["D", "ME", "Y", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( @@ -655,7 +655,7 @@ def test_groupby_groups_periods(self): df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], - "period": [pd.Period(d, freq="H") for d in dates], + "period": [pd.Period(d, freq="h") for d in dates], "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } @@ -670,7 +670,7 @@ def test_groupby_groups_periods(self): "2011-07-19 09:00:00", "2011-07-19 09:00:00", ], - freq="H", + freq="h", name="period", ) exp_idx2 = Index(["a", "b"] * 3, name="label") @@ -685,7 +685,7 @@ def test_groupby_groups_periods(self): tm.assert_frame_equal(result, expected) # by level - didx = pd.PeriodIndex(dates, freq="H") + didx = pd.PeriodIndex(dates, freq="h") df = DataFrame( {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, index=didx, @@ -693,7 +693,7 @@ def test_groupby_groups_periods(self): exp_idx = pd.PeriodIndex( ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - freq="H", + freq="h", ) expected = DataFrame( {"value1": [3, 5, 7], "value2": [2, 4, 6]}, diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 944dda8977882..45a33d3b70f71 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -996,7 +996,7 @@ def test_mixed_groupings(normalize, expected_label, expected_values): result = gp.value_counts(sort=True, 
normalize=normalize) expected = DataFrame( { - "level_0": np.array([4, 4, 5], dtype=np.int_), + "level_0": np.array([4, 4, 5], dtype=int), "A": [1, 1, 2], "level_2": [8, 8, 7], "B": [1, 3, 2], diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cf3f41e04902c..4a493ef3fd52c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1637,3 +1637,19 @@ def test_as_index_no_change(keys, df, groupby_func): result = gb_as_index_true.transform(groupby_func, *args) expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrame.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 5ecb2c753644d..6586f5f9de480 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -54,6 +54,14 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) + def test_insert_none_into_string_numpy(self): + # GH#55365 + 
pytest.importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "pos,expected", [ diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 21d1630af9de2..488f79eea0d11 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -182,7 +182,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), False, ), @@ -190,7 +190,7 @@ def test_symmetric_difference(self): "intersection", np.array( [(1, "A"), (1, "B"), (2, "A"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -198,7 +198,7 @@ def test_symmetric_difference(self): "union", np.array( [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ), None, ), @@ -208,13 +208,13 @@ def test_tuple_union_bug(self, method, expected, sort): index1 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) index2 = Index( np.array( [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], - dtype=[("num", int), ("let", "a1")], + dtype=[("num", int), ("let", "S1")], ) ) diff --git a/pandas/tests/indexes/categorical/test_equals.py b/pandas/tests/indexes/categorical/test_equals.py index 1ed8f3a903439..a8353f301a3c3 100644 --- a/pandas/tests/indexes/categorical/test_equals.py +++ b/pandas/tests/indexes/categorical/test_equals.py @@ -88,3 +88,9 @@ def test_equals_multiindex(self): ci = mi.to_flat_index().astype("category") assert not ci.equals(mi) + + def 
test_equals_string_dtype(self, any_string_dtype): + # GH#55364 + idx = CategoricalIndex(list("abc"), name="B") + other = Index(["a", "b", "c"], name="B", dtype=any_string_dtype) + assert idx.equals(other) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index fe397e2c7c88e..808a1687390ff 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -25,7 +25,7 @@ def sort(request): return request.param -@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "min", "2min", "s", "-3s"]) +@pytest.fixture(params=["D", "3D", "-3D", "h", "2h", "-2h", "min", "2min", "s", "-3s"]) def freq_sample(request): """ Valid values for 'freq' parameter used to create date_range and diff --git a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py index c38e24232f181..61a79c4ceabf9 100644 --- a/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py +++ b/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py @@ -68,7 +68,7 @@ def test_drop_duplicates(self, keep, expected, index, idx): class TestDropDuplicatesPeriodIndex(DropDuplicates): - @pytest.fixture(params=["D", "3D", "H", "2H", "min", "2min", "s", "3s"]) + @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): return request.param diff --git a/pandas/tests/indexes/datetimelike_/test_equals.py b/pandas/tests/indexes/datetimelike_/test_equals.py index d85d7103fe381..7845d99614d34 100644 --- a/pandas/tests/indexes/datetimelike_/test_equals.py +++ b/pandas/tests/indexes/datetimelike_/test_equals.py @@ -65,7 +65,7 @@ def test_equals2(self, freq): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") + idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="h") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) 
@@ -75,7 +75,7 @@ def test_equals2(self, freq): # same internal, different tz idx3 = PeriodIndex._simple_new( - idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("H")) + idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("h")) ) tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) diff --git a/pandas/tests/indexes/datetimelike_/test_is_monotonic.py b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py index 088ccc406cb81..b0e42e660b751 100644 --- a/pandas/tests/indexes/datetimelike_/test_is_monotonic.py +++ b/pandas/tests/indexes/datetimelike_/test_is_monotonic.py @@ -34,7 +34,7 @@ def test_is_monotonic_with_nat(): assert obj.is_unique dti2 = dti.insert(3, NaT) - pi2 = dti2.to_period("H") + pi2 = dti2.to_period("h") tdi2 = Index(dti2.view("timedelta64[ns]")) for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: diff --git a/pandas/tests/indexes/datetimelike_/test_sort_values.py b/pandas/tests/indexes/datetimelike_/test_sort_values.py index ab1c15f003d4d..a2c349c8b0ef6 100644 --- a/pandas/tests/indexes/datetimelike_/test_sort_values.py +++ b/pandas/tests/indexes/datetimelike_/test_sort_values.py @@ -92,7 +92,7 @@ def check_sort_values_with_freq(self, idx): tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0], dtype=np.intp)) check_freq_ascending(ordered, idx, False) - @pytest.mark.parametrize("freq", ["D", "H"]) + @pytest.mark.parametrize("freq", ["D", "h"]) def test_sort_values_with_freq_timedeltaindex(self, freq): # GH#10295 idx = timedelta_range(start=f"1{freq}", periods=3, freq=freq).rename("idx") @@ -107,7 +107,7 @@ def test_sort_values_with_freq_timedeltaindex(self, freq): ), DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", name="tzidx", tz="Asia/Tokyo", ), @@ -127,7 +127,7 @@ def test_sort_values_with_freq_periodindex(self, freq): @pytest.mark.parametrize( "idx", [ - PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A"), + 
PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="Y"), Index([2011, 2012, 2013], name="idx"), # for compatibility check ], ) @@ -275,10 +275,10 @@ def test_sort_values_without_freq_datetimeindex( ), ( PeriodIndex( - ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y" ), PeriodIndex( - ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="Y" ), ), ( @@ -308,7 +308,7 @@ def test_sort_values_without_freq_periodindex_nat(self): def test_order_stability_compat(): # GH#35922. sort_values is stable both for normal and datetime-like Index - pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) diff --git a/pandas/tests/indexes/datetimelike_/test_value_counts.py b/pandas/tests/indexes/datetimelike_/test_value_counts.py index a0f05a1a35d79..069e354a364c9 100644 --- a/pandas/tests/indexes/datetimelike_/test_value_counts.py +++ b/pandas/tests/indexes/datetimelike_/test_value_counts.py @@ -18,15 +18,15 @@ class TestValueCounts: def test_value_counts_unique_datetimeindex(self, tz_naive_fixture): tz = tz_naive_fixture - orig = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) + orig = date_range("2011-01-01 09:00", freq="h", periods=10, tz=tz) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_timedeltaindex(self): - orig = timedelta_range("1 days 09:00:00", freq="H", periods=10) + orig = timedelta_range("1 days 09:00:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_periodindex(self): - orig = period_range("2011-01-01 
09:00", freq="H", periods=10) + orig = period_range("2011-01-01 09:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def _check_value_counts_with_repeats(self, orig): @@ -83,7 +83,7 @@ def test_value_counts_unique_periodindex2(self): "2013-01-01 08:00", NaT, ], - freq="H", + freq="h", ) self._check_value_counts_dropna(idx) diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 08a2473f22556..94390acb35624 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -117,7 +117,7 @@ def test_astype_str_tz_and_name(self): def test_astype_str_freq_and_name(self): # test astype string with freqH and name - dti = date_range("1/1/2011", periods=3, freq="H", name="test_name") + dti = date_range("1/1/2011", periods=3, freq="h", name="test_name") result = dti.astype(str) expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], @@ -129,7 +129,7 @@ def test_astype_str_freq_and_name(self): def test_astype_str_freq_and_tz(self): # test astype string with freqH and timezone dti = date_range( - "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + "3/6/2012 00:00", periods=2, freq="h", tz="Europe/London", name="test_name" ) result = dti.astype(str) expected = Index( diff --git a/pandas/tests/indexes/datetimes/methods/test_factorize.py b/pandas/tests/indexes/datetimes/methods/test_factorize.py index 99a3bc910b9ca..41ecf9ee6b823 100644 --- a/pandas/tests/indexes/datetimes/methods/test_factorize.py +++ b/pandas/tests/indexes/datetimes/methods/test_factorize.py @@ -74,7 +74,7 @@ def test_factorize_preserves_freq(self): def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 - base = date_range("2016-11-05", freq="H", periods=100, tz=tz) + base = date_range("2016-11-05", freq="h", periods=100, tz=tz) idx = base.repeat(5) exp_arr 
= np.arange(100, dtype=np.intp).repeat(5) @@ -89,7 +89,7 @@ def test_factorize_tz(self, tz_naive_fixture, index_or_series): def test_factorize_dst(self, index_or_series): # GH#13750 - idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-11-06", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -98,7 +98,7 @@ def test_factorize_dst(self, index_or_series): if index_or_series is Index: assert res.freq == idx.freq - idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-06-13", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -112,7 +112,7 @@ def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort): # GH#51978 case that does not go through the fastpath based on # non-None freq tz = tz_naive_fixture - idx = date_range("2016-11-06", freq="H", periods=5, tz=tz)[[0, 4, 1, 3, 2]] + idx = date_range("2016-11-06", freq="h", periods=5, tz=tz)[[0, 4, 1, 3, 2]] exp_codes, exp_uniques = idx.factorize(sort=sort) res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort) diff --git a/pandas/tests/indexes/datetimes/methods/test_insert.py b/pandas/tests/indexes/datetimes/methods/test_insert.py index 9ef43ace747e2..4ef162913b622 100644 --- a/pandas/tests/indexes/datetimes/methods/test_insert.py +++ b/pandas/tests/indexes/datetimes/methods/test_insert.py @@ -128,10 +128,10 @@ def test_insert(self): assert result.freq is None for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") + idx = date_range("1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx") # preserve freq expected = date_range( - "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx" ) for d in [ Timestamp("2000-01-01 15:00", tz=tz), diff --git a/pandas/tests/indexes/datetimes/methods/test_shift.py 
b/pandas/tests/indexes/datetimes/methods/test_shift.py index e8661fafc3bb7..064f664a4de10 100644 --- a/pandas/tests/indexes/datetimes/methods/test_shift.py +++ b/pandas/tests/indexes/datetimes/methods/test_shift.py @@ -24,30 +24,30 @@ def test_dti_shift_tzaware(self, tz_naive_fixture): # GH#9903 tz = tz_naive_fixture idx = DatetimeIndex([], name="xxx", tz=tz) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) idx = DatetimeIndex( ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = DatetimeIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = DatetimeIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", tz=tz, - freq="H", + freq="h", ) - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) def test_dti_shift_freqs(self): # test shift for DatetimeIndex and non DatetimeIndex @@ -101,9 +101,9 @@ def test_dti_shift_localized(self, tzstr): def test_dti_shift_across_dst(self): # GH 8616 - idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") + idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="h") s = Series(index=idx[:-1], dtype=object) - result = s.shift(freq="H") + result = s.shift(freq="h") expected = Series(index=idx[1:], dtype=object) tm.assert_series_equal(result, expected) @@ -120,7 +120,7 @@ def test_dti_shift_near_midnight(self, shift, result_time): dt = datetime(2014, 11, 14, 0) dt_est = pytz.timezone("EST").localize(dt) s = Series(data=[1], 
index=[dt_est]) - result = s.shift(shift, freq="H") + result = s.shift(shift, freq="h") expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 5f266ea0b42a6..8900c5cdbca14 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -60,7 +60,7 @@ def test_to_period_quarterlyish(self, off): def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "A-DEC" + assert prng.freq == "Y-DEC" def test_to_period_monthish(self): offsets = ["MS", "BM"] @@ -112,7 +112,7 @@ def test_period_dt64_round_trip(self): tm.assert_index_equal(pi.to_timestamp(), dti) dti = date_range("1/1/2000", "1/7/2002", freq="B") - pi = dti.to_period(freq="H") + pi = dti.to_period(freq="h") tm.assert_index_equal(pi.to_timestamp(), dti) def test_to_period_millisecond(self): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 61c8cc4a50fe2..6da215715482d 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -185,7 +185,7 @@ def test_construction_caching(self): ) def test_construction_with_alt(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} result = DatetimeIndex(i, **kwargs) tm.assert_index_equal(i, result) @@ -196,7 +196,7 @@ def test_construction_with_alt(self, kwargs, tz_aware_fixture): ) def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", 
tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) i = i._with_freq(None) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} @@ -753,7 +753,7 @@ def test_constructor_invalid_dtype_raises(self, dtype): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): - idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") + idx = date_range(start="2000-01-01", periods=1, freq="Y", name="TEST") assert idx.name == "TEST" def test_000constructor_resolution(self): @@ -902,7 +902,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): start = Timestamp("2015-03-29 02:30:00").tz_localize( timezone, nonexistent="shift_forward" ) - result = date_range(start=start, periods=2, freq="H") + result = date_range(start=start, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), @@ -916,7 +916,7 @@ def test_constructor_with_nonexistent_keyword_arg(self, warsaw): end = Timestamp("2015-03-29 02:30:00").tz_localize( timezone, nonexistent="shift_forward" ) - result = date_range(end=end, periods=2, freq="H") + result = date_range(end=end, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), @@ -978,8 +978,8 @@ def test_dti_constructor_years_only(self, tz_naive_fixture): rng2 = date_range("2014", "2015", freq="MS", tz=tz) expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) - rng3 = date_range("2014", "2020", freq="A", tz=tz) - expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) + rng3 = date_range("2014", "2020", freq="Y", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="Y", tz=tz) rng4 = date_range("2014", "2020", freq="AS", tz=tz) expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) @@ -1036,7 +1036,7 @@ def test_constructor_int64_nocopy(self): assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["ME", "Q", "A", "D", "B", "BH", "min", "s", 
"ms", "us", "H", "ns", "C"] + "freq", ["ME", "Q", "Y", "D", "B", "bh", "min", "s", "ms", "us", "h", "ns", "C"] ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index b93aee1d988de..ededf78621699 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -7,6 +7,7 @@ time, timedelta, ) +import re import numpy as np import pytest @@ -123,7 +124,7 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges: - @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "H", "D"]) + @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "h", "D"]) def test_date_range_edges(self, freq): # GH#13672 td = Timedelta(f"1{freq}") @@ -205,11 +206,11 @@ def test_date_range_int64_overflow_non_recoverable(self): # case with start later than 1970-01-01, overflow int64 but not uint64 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(start="1970-02-01", periods=106752 * 24, freq="H") + date_range(start="1970-02-01", periods=106752 * 24, freq="h") # case with end before 1970-01-01, overflow int64 but not uint64 with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(end="1969-11-14", periods=106752 * 24, freq="H") + date_range(end="1969-11-14", periods=106752 * 24, freq="h") @pytest.mark.slow @pytest.mark.parametrize( @@ -223,11 +224,11 @@ def test_date_range_int64_overflow_stride_endpoint_different_signs( start = Timestamp(s_ts) end = Timestamp(e_ts) - expected = date_range(start=start, end=end, freq="-1H") + expected = date_range(start=start, end=end, freq="-1h") assert expected[0] == start assert expected[-1] == end - dti = date_range(end=end, periods=len(expected), freq="-1H") + dti = date_range(end=end, periods=len(expected), freq="-1h") 
tm.assert_index_equal(dti, expected) def test_date_range_out_of_bounds(self): @@ -252,12 +253,11 @@ def test_begin_year_alias(self, freq): ) tm.assert_index_equal(rng, exp) - @pytest.mark.parametrize("freq", ["A", "Y"]) - def test_end_year_alias(self, freq): + def test_end_year_alias(self): # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) + rng = date_range("1/1/2013", "7/1/2017", freq="Y") exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq="Y" ) tm.assert_index_equal(rng, exp) @@ -272,10 +272,10 @@ def test_business_end_year_alias(self, freq): def test_date_range_negative_freq(self): # GH 11018 - rng = date_range("2011-12-31", freq="-2A", periods=3) - exp = DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") + rng = date_range("2011-12-31", freq="-2Y", periods=3) + exp = DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2Y") tm.assert_index_equal(rng, exp) - assert rng.freq == "-2A" + assert rng.freq == "-2Y" rng = date_range("2011-01-31", freq="-2ME", periods=3) exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2ME") @@ -416,13 +416,13 @@ def test_date_range_businesshour(self): "2014-07-04 15:00", "2014-07-04 16:00", ], - freq="BH", + freq="bh", ) - rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="BH") + rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="bh") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") - rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") + idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="bh") + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="bh") tm.assert_index_equal(idx, rng) idx = DatetimeIndex( @@ -452,9 +452,9 @@ def test_date_range_businesshour(self): "2014-07-08 15:00", "2014-07-08 16:00", ], - freq="BH", + freq="bh", ) - rng = 
date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="bh") tm.assert_index_equal(idx, rng) def test_date_range_timedelta(self): @@ -481,13 +481,13 @@ def test_range_misspecified(self): date_range(periods=10) with pytest.raises(ValueError, match=msg): - date_range(start="1/1/2000", freq="H") + date_range(start="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(end="1/1/2000", freq="H") + date_range(end="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq="H") + date_range(periods=10, freq="h") with pytest.raises(ValueError, match=msg): date_range() @@ -524,14 +524,14 @@ def test_construct_over_dst(self): pre_dst, pst_dst, ] - expected = DatetimeIndex(expect_data, freq="H") - result = date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") + expected = DatetimeIndex(expect_data, freq="h") + result = date_range(start="2010-11-7", periods=3, freq="h", tz="US/Pacific") tm.assert_index_equal(result, expected) def test_construct_with_different_start_end_string_format(self): # GH 12064 result = date_range( - "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="h" ) expected = DatetimeIndex( [ @@ -539,7 +539,7 @@ def test_construct_with_different_start_end_string_format(self): Timestamp("2013-01-01 01:00:00+09:00"), Timestamp("2013-01-01 02:00:00+09:00"), ], - freq="H", + freq="h", ) tm.assert_index_equal(result, expected) @@ -638,7 +638,7 @@ def test_range_tz_dateutil(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_closed(self, freq, inclusive_endpoints_fixture): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -653,7 +653,7 @@ def test_range_closed(self, freq, 
inclusive_endpoints_fixture): tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -674,7 +674,7 @@ def test_range_closed_with_tz_aware_start_end( tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "Y"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -839,20 +839,24 @@ def test_freq_dateoffset_with_relateivedelta_nanos(self): @pytest.mark.parametrize( "freq,freq_depr", [ - ("min", "T"), - ("s", "S"), - ("ms", "L"), - ("us", "U"), - ("ns", "N"), + ("2Y", "2A"), + ("200Y-MAY", "200A-MAY"), + ("h", "H"), + ("2min", "2T"), + ("1s", "1S"), + ("2ms", "2L"), + ("1us", "1U"), + ("2ns", "2N"), ], ) - def test_frequencies_T_S_L_U_N_deprecated(self, freq, freq_depr): + def test_frequencies_A_T_S_L_U_N_deprecated(self, freq, freq_depr): # GH#52536 - msg = f"'{freq_depr}' is deprecated and will be removed in a future version." + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." 
- expected = date_range("1/1/2000", periods=4, freq=freq) + expected = date_range("1/1/2000", periods=2, freq=freq) with tm.assert_produces_warning(FutureWarning, match=msg): - result = date_range("1/1/2000", periods=4, freq=freq_depr) + result = date_range("1/1/2000", periods=2, freq=freq_depr) tm.assert_index_equal(result, expected) @@ -905,7 +909,7 @@ def test_date_range_with_tz(self, tzstr): stamp = Timestamp("3/11/2012 05:00", tz=tzstr) assert stamp.hour == 5 - rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) + rng = date_range("3/11/2012 04:00", periods=10, freq="h", tz=tzstr) assert stamp == rng[1] diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index f44cbbf560584..156075e3fafec 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,9 +1,12 @@ +import datetime as dt from datetime import date import dateutil import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DataFrame, @@ -54,7 +57,7 @@ def test_time_overflow_for_32bit_machines(self): # (which has value 1e9) and since the max value for np.int32 is ~2e9, # and since those machines won't promote np.int32 to np.int64, we get # overflow. 
- periods = np.int_(1000) + periods = np_long(1000) idx1 = date_range(start="2000", periods=periods, freq="s") assert len(idx1) == periods @@ -94,7 +97,7 @@ def test_append_nondatetimeindex(self): def test_iteration_preserves_tz(self): # see gh-8890 - index = date_range("2012-01-01", periods=3, freq="H", tz="US/Eastern") + index = date_range("2012-01-01", periods=3, freq="h", tz="US/Eastern") for i, ts in enumerate(index): result = ts @@ -102,7 +105,7 @@ def test_iteration_preserves_tz(self): assert result == expected index = date_range( - "2012-01-01", periods=3, freq="H", tz=dateutil.tz.tzoffset(None, -28800) + "2012-01-01", periods=3, freq="h", tz=dateutil.tz.tzoffset(None, -28800) ) for i, ts in enumerate(index): @@ -199,3 +202,27 @@ def test_asarray_tz_aware(self): result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) + + def test_CBH_deprecated(self): + msg = "'CBH' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" + ) + result = DatetimeIndex( + [ + "2022-12-12 09:00:00", + "2022-12-12 10:00:00", + "2022-12-12 11:00:00", + "2022-12-12 12:00:00", + "2022-12-12 13:00:00", + "2022-12-12 14:00:00", + "2022-12-12 15:00:00", + "2022-12-12 16:00:00", + ], + dtype="datetime64[ns]", + freq="cbh", + ) + + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_delete.py b/pandas/tests/indexes/datetimes/test_delete.py index 3565e516e69d5..90dfdb46cdfa5 100644 --- a/pandas/tests/indexes/datetimes/test_delete.py +++ b/pandas/tests/indexes/datetimes/test_delete.py @@ -42,25 +42,25 @@ def test_delete(self): for tz in [None, "Asia/Tokyo", "US/Pacific"]: idx = date_range( - start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + start="2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz ) expected = date_range( - start="2000-01-01 
10:00", periods=9, freq="H", name="idx", tz=tz + start="2000-01-01 10:00", periods=9, freq="h", name="idx", tz=tz ) result = idx.delete(0) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == "H" + assert result.freqstr == "h" assert result.tz == expected.tz expected = date_range( - start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + start="2000-01-01 09:00", periods=9, freq="h", name="idx", tz=tz ) result = idx.delete(-1) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == "H" + assert result.freqstr == "h" assert result.tz == expected.tz def test_delete_slice(self): @@ -105,13 +105,13 @@ def test_delete_slice(self): ts = Series( 1, index=date_range( - "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz ), ) # preserve freq result = ts.drop(ts.index[:5]).index expected = date_range( - "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz + "2000-01-01 14:00", periods=5, freq="h", name="idx", tz=tz ) tm.assert_index_equal(result, expected) assert result.name == expected.name diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 502cb0407bfcd..9fb5db9e034ee 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -87,8 +87,8 @@ def test_dti_repr_short(self): ), ( ["2012-01-01"], - "24H", - "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24H')", + "24h", + "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24h')", ), ], ) @@ -108,7 +108,7 @@ def test_dti_representation(self, method): idxs.append( DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) ) @@ -135,7 +135,7 @@ def test_dti_representation(self, method): exp.append( "DatetimeIndex(['2011-01-01 09:00:00+09:00', " 
"'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" - ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='h')" ) exp.append( "DatetimeIndex(['2011-01-01 09:00:00-05:00', " @@ -161,7 +161,7 @@ def test_dti_representation_to_series(self): idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( @@ -218,7 +218,7 @@ def test_dti_summary(self): idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( @@ -236,7 +236,7 @@ def test_dti_summary(self): exp5 = ( "DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " "to 2011-01-01 11:00:00+09:00\n" - "Freq: H" + "Freq: h" ) exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" diff --git a/pandas/tests/indexes/datetimes/test_freq_attr.py b/pandas/tests/indexes/datetimes/test_freq_attr.py index f5821a316358d..5cddf56cd1c73 100644 --- a/pandas/tests/indexes/datetimes/test_freq_attr.py +++ b/pandas/tests/indexes/datetimes/test_freq_attr.py @@ -31,7 +31,7 @@ def test_freq_setter_errors(self): idx._data.freq = "foo" @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48h", Hour(48)]) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_freq_setter(self, values, freq, tz): # GH#20678 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c3944a4443d67..37c580c98b139 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -8,6 +8,8 
@@ import numpy as np import pytest +from pandas.compat.numpy import np_long + import pandas as pd from pandas import ( DatetimeIndex, @@ -29,7 +31,7 @@ def test_getitem_slice_keeps_name(self): # GH4226 st = Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") et = Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") - dr = date_range(st, et, freq="H", name="timebucket") + dr = date_range(st, et, freq="h", name="timebucket") assert dr[1:].name == dr.name def test_getitem(self): @@ -91,7 +93,7 @@ def test_dti_business_getitem(self, freq): assert fancy_indexed.freq is None # 32-bit vs. 64-bit platforms - assert rng[4] == rng[np.int_(4)] + assert rng[4] == rng[np_long(4)] @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem_matplotlib_hackaround(self, freq): @@ -299,7 +301,7 @@ def test_take2(self, tz): idx = date_range( start="2010-01-01 09:00", end="2010-02-01 09:00", - freq="H", + freq="h", tz=tz, name="idx", ) @@ -405,7 +407,7 @@ def test_get_loc_key_unit_mismatch_not_castable(self): def test_get_loc_time_obj(self): # time indexing - idx = date_range("2000-01-01", periods=24, freq="H") + idx = date_range("2000-01-01", periods=24, freq="h") result = idx.get_loc(time(12)) expected = np.array([12]) @@ -601,7 +603,7 @@ def test_get_indexer_pad_requires_monotonicity(self): class TestMaybeCastSliceBound: def test_maybe_cast_slice_bounds_empty(self): # GH#14354 - empty_idx = date_range(freq="1H", periods=0, end="2015") + empty_idx = date_range(freq="1h", periods=0, end="2015") right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right") exp = Timestamp("2015-01-02 23:59:59.999999999") diff --git a/pandas/tests/indexes/datetimes/test_join.py b/pandas/tests/indexes/datetimes/test_join.py index ccfdb55fc8119..959fbab0dcec6 100644 --- a/pandas/tests/indexes/datetimes/test_join.py +++ b/pandas/tests/indexes/datetimes/test_join.py @@ -65,7 +65,7 @@ def test_join_object_index(self): assert isinstance(result[0], Timestamp) def 
test_join_utc_convert(self, join_type): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") left = rng.tz_convert("US/Eastern") right = rng.tz_convert("Europe/Berlin") diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 21dc22bea87dc..7eea05c753b8a 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -20,11 +20,11 @@ class TestDatetimeIndexOps: @pytest.mark.parametrize( "freq,expected", [ - ("A", "day"), + ("Y", "day"), ("Q", "day"), ("ME", "day"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), @@ -33,7 +33,7 @@ class TestDatetimeIndexOps: ) def test_resolution(self, request, tz_naive_fixture, freq, expected): tz = tz_naive_fixture - if freq == "A" and not IS64 and isinstance(tz, tzlocal): + if freq == "Y" and not IS64 and isinstance(tz, tzlocal): request.node.add_marker( pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 7ad0dfbaf6cb1..c4e23154b7ffc 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -194,7 +194,7 @@ def test_partial_slice(self): s["2004-12-31"] def test_partial_slice_daily(self): - rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500) + rng = date_range(freq="h", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-31"] diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 49597ef885183..7c7e57b51ccc0 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -29,7 +29,7 @@ 
def test_dti_time(self): assert (result == expected).all() def test_dti_date(self): - rng = date_range("1/1/2000", freq="12H", periods=10) + rng = date_range("1/1/2000", freq="12h", periods=10) result = pd.Index(rng).date expected = [t.date() for t in rng] assert (result == expected).all() @@ -122,8 +122,8 @@ def test_round(self, tz_naive_fixture): ) expected_elt = expected_rng[1] - tm.assert_index_equal(rng.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt + tm.assert_index_equal(rng.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): @@ -198,8 +198,8 @@ def test_no_rounding_occurs(self, tz_naive_fixture): (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), (["2018-01-01 00:15:00"], "ceil", "15min", ["2018-01-01 00:15:00"]), (["2018-01-01 00:15:00"], "floor", "15min", ["2018-01-01 00:15:00"]), - (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), - (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "ceil", "3h", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3h", ["1823-01-01 03:00:00"]), ( ("NaT", "1823-01-01 00:00:01"), "floor", @@ -223,7 +223,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): @pytest.mark.parametrize( "start, index_freq, periods", - [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + [("2018-01-01", "12h", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], ) @pytest.mark.parametrize( "round_freq", @@ -245,7 +245,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): "1s", "2s", "3s", - "12H", + "12h", "1D", ], ) @@ -326,7 +326,7 @@ def test_2000(self): tm.assert_index_equal(r1, r2) def test_hour(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="h") r1 = pd.Index([x.to_julian_date() for x 
in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Index) and r2.dtype == np.float64 diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index ca784948a5d29..6071c7fa8319b 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -98,8 +98,8 @@ def test_union_coverage(self, sort): assert result.freq == ordered.freq def test_union_bug_1730(self, sort): - rng_a = date_range("1/1/2012", periods=4, freq="3H") - rng_b = date_range("1/1/2012", periods=4, freq="4H") + rng_a = date_range("1/1/2012", periods=4, freq="3h") + rng_b = date_range("1/1/2012", periods=4, freq="4h") result = rng_a.union(rng_b, sort=sort) exp = list(rng_a) + list(rng_b[1:]) @@ -308,7 +308,7 @@ def test_intersection_empty(self, tz_aware_fixture, freq): def test_intersection_bug_1708(self): from pandas import DateOffset - index_1 = date_range("1/1/2012", periods=4, freq="12H") + index_1 = date_range("1/1/2012", periods=4, freq="12h") index_2 = index_1 + DateOffset(hours=1) result = index_1.intersection(index_2) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 756a72cf1849a..eb54ea8e4316f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -192,7 +192,7 @@ def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): expected = Index([9, 9, 9], dtype=np.int32) tm.assert_index_equal(ut.hour, expected) - @pytest.mark.parametrize("freq, n", [("H", 1), ("min", 60), ("s", 3600)]) + @pytest.mark.parametrize("freq, n", [("h", 1), ("min", 60), ("s", 3600)]) def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. 
@@ -204,7 +204,7 @@ def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) def test_dti_tz_convert_dst(self): - for freq, n in [("H", 1), ("min", 60), ("s", 3600)]: + for freq, n in [("h", 1), ("min", 60), ("s", 3600)]: # Start DST idx = date_range( "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" @@ -278,8 +278,8 @@ def test_tz_convert_roundtrip(self, tz_aware_fixture): idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") - idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") - exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="h", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="h") idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="min", tz="UTC") exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="min") @@ -314,7 +314,7 @@ def test_dti_tz_convert_tzlocal(self): ], ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tz) # Values are unmodified @@ -324,7 +324,7 @@ def test_dti_tz_convert_utc_to_local_no_modify(self, tz): @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_convert_unsorted(self, tzstr): - dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") + dr = date_range("2012-03-09", freq="h", periods=100, tz="utc") dr = dr.tz_convert(tzstr) result = dr[::-1].hour @@ -504,10 +504,10 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tm.assert_index_equal(reset, expected) def test_dti_tz_localize_naive(self): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") conv = 
rng.tz_localize("US/Pacific") - exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="h", tz="US/Pacific") tm.assert_index_equal(conv, exp._with_freq(None)) @@ -613,11 +613,11 @@ def test_dti_construction_ambiguous_endpoint(self, tz): with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): date_range( - "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" ) times = date_range( - "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" + "2013-10-26 23:00", "2013-10-27 01:00", freq="h", tz=tz, ambiguous="infer" ) assert times[0] == Timestamp("2013-10-26 23:00", tz=tz) assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz) @@ -637,11 +637,11 @@ def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): date_range( - "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" ) times = date_range( - "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option + "2019-03-10 00:00", "2019-03-10 02:00", freq="h", tz=tz, nonexistent=option ) assert times[-1] == Timestamp(expected, tz=tz) @@ -820,7 +820,7 @@ def test_dti_tz_constructors(self, tzstr): arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) + idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) idx2 = idx2._with_freq(None) # the others all have freq=None idx3 = DatetimeIndex(arr, tz=tzstr) idx4 = DatetimeIndex(np.array(arr), tz=tzstr) @@ -877,7 +877,7 @@ def test_dti_drop_dont_lose_tz(self): def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t3 = DatetimeIndex(["2019-01-01 
10:00"], freq="H") + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="h") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="min") assert t4.tz_convert(tz="UTC").freq == t4.freq @@ -927,9 +927,9 @@ def test_drop_dst_boundary(self): tm.assert_index_equal(result, expected) def test_date_range_localize(self): - rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern") rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") - rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + rng3 = date_range("3/11/2012 03:00", periods=15, freq="h") rng3 = rng3.tz_localize("US/Eastern") tm.assert_index_equal(rng._with_freq(None), rng3) @@ -944,9 +944,9 @@ def test_date_range_localize(self): tm.assert_index_equal(rng[:2], rng2) # Right before the DST transition - rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern") rng2 = DatetimeIndex( - ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="H" + ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="h" ) tm.assert_index_equal(rng, rng2) exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") @@ -956,7 +956,7 @@ def test_date_range_localize(self): assert exp.hour == 1 assert rng[1] == exp - rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") + rng = date_range("3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern") assert rng[2].hour == 3 def test_timestamp_equality_different_timezones(self): @@ -974,7 +974,7 @@ def test_timestamp_equality_different_timezones(self): assert (berlin_range == eastern_range).all() def test_dti_intersection(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") left = rng[10:90][::-1] right = rng[20:80][::-1] 
@@ -984,8 +984,8 @@ def test_dti_intersection(self): assert result.tz == left.tz def test_dti_equals_with_tz(self): - left = date_range("1/1/2011", periods=100, freq="H", tz="utc") - right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") + left = date_range("1/1/2011", periods=100, freq="h", tz="utc") + right = date_range("1/1/2011", periods=100, freq="h", tz="US/Eastern") assert not left.equals(right) @@ -1036,7 +1036,7 @@ def test_dti_take_dont_lose_meta(self, tzstr): def test_utc_box_timestamp_and_localize(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) expected = rng[-1].astimezone(tz) @@ -1046,7 +1046,7 @@ def test_utc_box_timestamp_and_localize(self, tzstr): assert stamp.tzinfo == expected.tzinfo # right tzinfo - rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") + rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) # test not valid for dateutil timezones. 
# assert 'EDT' in repr(rng_eastern[0].tzinfo) @@ -1148,9 +1148,9 @@ def test_dti_convert_tz_aware_datetime_datetime(self, tz): def test_dti_setop_aware(self, setop): # non-overlapping # GH#39328 as of 2.0 we cast these to UTC instead of object - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") + rng = date_range("2012-11-15 00:00:00", periods=6, freq="h", tz="US/Central") - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="h", tz="US/Eastern") result = getattr(rng, setop)(rng2) @@ -1195,14 +1195,14 @@ def test_tz_localize_invalidates_freq(): # we only preserve freq in unambiguous cases # if localized to US/Eastern, this crosses a DST transition - dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="H") - assert dti.freq == "H" + dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="h") + assert dti.freq == "h" result = dti.tz_localize(None) # no-op - assert result.freq == "H" + assert result.freq == "h" result = dti.tz_localize("UTC") # unambiguous freq preservation - assert result.freq == "H" + assert result.freq == "h" result = dti.tz_localize("US/Eastern", nonexistent="shift_forward") assert result.freq is None @@ -1211,4 +1211,4 @@ def test_tz_localize_invalidates_freq(): # Case where we _can_ keep freq because we're length==1 dti2 = dti[:1] result = dti2.tz_localize("US/Eastern") - assert result.freq == "H" + assert result.freq == "h" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f003211abd857..acb330c190d6f 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -104,7 +104,7 @@ def test_repr_floats(self): def test_to_native_types(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index._format_native_types() + result = 
index._format_native_types(na_rep="NaN") expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index aff4944e7bd55..fcf297fd1b092 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -87,7 +87,7 @@ def test_properties(self, closed): [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf], pd.to_datetime(["20170101", "20170202", "20170303", "20170404"]), - pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5H", "6D"]), + pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5h", "6D"]), ], ) def test_length(self, closed, breaks): @@ -689,13 +689,13 @@ def test_datetime(self, tz): # test get_indexer start = Timestamp("1999-12-31T12:00", tz=tz) - target = date_range(start=start, periods=7, freq="12H") + target = date_range(start=start, periods=7, freq="12h") actual = index.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") tm.assert_numpy_array_equal(actual, expected) start = Timestamp("2000-01-08T18:00", tz=tz) - target = date_range(start=start, periods=7, freq="6H") + target = date_range(start=start, periods=7, freq="6h") actual = index.get_indexer(target) expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") tm.assert_numpy_array_equal(actual, expected) diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 499220b39279d..6c531fb0428a3 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -58,7 +58,7 @@ def test_constructor_numeric(self, closed, name, freq, periods): @pytest.mark.parametrize("tz", [None, "US/Eastern"]) @pytest.mark.parametrize( - "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("ME", 11)] + "freq, periods", [("D", 364), ("2D", 
182), ("22D18h", 16), ("ME", 11)] ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) @@ -93,7 +93,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + "freq, periods", [("D", 100), ("2D12h", 40), ("5D", 20), ("25D", 4)] ) def test_constructor_timedelta(self, closed, name, freq, periods): start, end = Timedelta("0 days"), Timedelta("100 days") @@ -130,7 +130,7 @@ def test_constructor_timedelta(self, closed, name, freq, periods): (0, 10, 3, 9), (0, 10, 1.5, 9), (0.5, 10, 3, 9.5), - (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + (Timedelta("0D"), Timedelta("10D"), "2D4h", Timedelta("8D16h")), ( Timestamp("2018-01-01"), Timestamp("2018-02-09"), @@ -140,7 +140,7 @@ def test_constructor_timedelta(self, closed, name, freq, periods): ( Timestamp("2018-01-01", tz="US/Eastern"), Timestamp("2018-01-20", tz="US/Eastern"), - "5D12H", + "5D12h", Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), ), ], diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index f91856c3948a0..27a8c6e9b7158 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -87,7 +87,7 @@ def test_inplace_mutation_resets_values(): def test_boxable_categorical_values(): - cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="H")) + cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="h")) result = MultiIndex.from_product([["a", "b", "c"], cat]).values expected = pd.Series( [ diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index cce6c98d71b47..c51dcb395c795 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ 
b/pandas/tests/indexes/multi/test_constructors.py @@ -203,15 +203,15 @@ def test_from_arrays_tuples(idx): [ ( pd.period_range("2011-01-01", freq="D", periods=3), - pd.period_range("2015-01-01", freq="H", periods=3), + pd.period_range("2015-01-01", freq="h", periods=3), ), ( date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern"), - date_range("2015-01-01 10:00", freq="H", periods=3, tz="Asia/Tokyo"), + date_range("2015-01-01 10:00", freq="h", periods=3, tz="Asia/Tokyo"), ), ( pd.timedelta_range("1 days", freq="D", periods=3), - pd.timedelta_range("2 hours", freq="H", periods=3), + pd.timedelta_range("2 hours", freq="h", periods=3), ), ], ) @@ -229,7 +229,7 @@ def test_from_arrays_index_series_period_datetimetz_and_timedelta(idx1, idx2): def test_from_arrays_index_datetimelike_mixed(): idx1 = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") - idx2 = date_range("2015-01-01 10:00", freq="H", periods=3) + idx2 = date_range("2015-01-01 10:00", freq="h", periods=3) idx3 = pd.timedelta_range("1 days", freq="D", periods=3) idx4 = pd.period_range("2011-01-01", freq="D", periods=3) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 78b2c493ec116..a65677bba35e4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -261,7 +261,7 @@ def test_get_indexer_categorical_time(self): midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) result = midx.get_indexer(midx) @@ -342,6 +342,19 @@ def test_get_indexer_methods(self): expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) + @pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill", "nearest"]) + def test_get_indexer_methods_raise_for_non_monotonic(self, method): + # 53452 + mi = 
MultiIndex.from_arrays([[0, 4, 2], [0, 4, 2]]) + if method == "nearest": + err = NotImplementedError + msg = "not implemented yet for MultiIndex" + else: + err = ValueError + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(err, match=msg): + mi.get_indexer([(1, 1)], method=method) + def test_get_indexer_three_or_more_levels(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests get_indexer() on MultiIndexes with 3+ levels @@ -841,7 +854,7 @@ def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 idx = MultiIndex.from_product( [ - date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"), + date_range("2019-01-01T00:15:33", periods=100, freq="h", name="date"), ["x"], [3], ] @@ -853,7 +866,7 @@ def test_timestamp_multiindex_indexer(): date_range( start="2019-01-02T00:15:33", end="2019-01-05T03:15:33", - freq="H", + freq="h", name="date", ), ["x"], diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 66163dad3deae..64cc1fa621b31 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -28,7 +28,7 @@ def df(): # 2016-01-03 00:00:00 a 12 # b 13 # c 14 - dr = date_range("2016-01-01", "2016-01-03", freq="12H") + dr = date_range("2016-01-01", "2016-01-03", freq="12h") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) frame = DataFrame({"c1": range(15)}, index=mi) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 08c1a4092952c..b4dcef71dcf50 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -338,3 +338,12 @@ def test_sort_values_with_na_na_position(dtype, na_position): ] expected = MultiIndex.from_arrays(arrays) tm.assert_index_equal(result, expected) + + +def test_sort_unnecessary_warning(): + # GH#55386 + midx = 
MultiIndex.from_tuples([(1.5, 2), (3.5, 3), (0, 1)]) + midx = midx.set_levels([2.5, np.nan, 1], level=0) + result = midx.sort_values() + expected = MultiIndex.from_tuples([(1, 3), (2.5, 1), (np.nan, 2)]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index 89ea4fb6472d0..ed078a3e8fb8b 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -10,11 +10,11 @@ class TestPeriodIndex: def test_asfreq(self): - pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi1 = period_range(freq="Y", start="1/1/2001", end="1/1/2001") pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") - pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi5 = period_range(freq="h", start="1/1/2001", end="1/1/2001 00:00") pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") @@ -22,50 +22,50 @@ def test_asfreq(self): assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("M", "start") == pi3 assert pi1.asfreq("D", "StarT") == pi4 - assert pi1.asfreq("H", "beGIN") == pi5 + assert pi1.asfreq("h", "beGIN") == pi5 assert pi1.asfreq("Min", "s") == pi6 assert pi1.asfreq("s", "s") == pi7 - assert pi2.asfreq("A", "s") == pi1 + assert pi2.asfreq("Y", "s") == pi1 assert pi2.asfreq("M", "s") == pi3 assert pi2.asfreq("D", "s") == pi4 - assert pi2.asfreq("H", "s") == pi5 + assert pi2.asfreq("h", "s") == pi5 assert pi2.asfreq("Min", "s") == pi6 assert pi2.asfreq("s", "s") == pi7 - assert pi3.asfreq("A", "s") == pi1 + assert pi3.asfreq("Y", "s") == pi1 assert pi3.asfreq("Q", "s") == pi2 assert pi3.asfreq("D", "s") == pi4 - assert pi3.asfreq("H", "s") == pi5 + assert pi3.asfreq("h", "s") 
== pi5 assert pi3.asfreq("Min", "s") == pi6 assert pi3.asfreq("s", "s") == pi7 - assert pi4.asfreq("A", "s") == pi1 + assert pi4.asfreq("Y", "s") == pi1 assert pi4.asfreq("Q", "s") == pi2 assert pi4.asfreq("M", "s") == pi3 - assert pi4.asfreq("H", "s") == pi5 + assert pi4.asfreq("h", "s") == pi5 assert pi4.asfreq("Min", "s") == pi6 assert pi4.asfreq("s", "s") == pi7 - assert pi5.asfreq("A", "s") == pi1 + assert pi5.asfreq("Y", "s") == pi1 assert pi5.asfreq("Q", "s") == pi2 assert pi5.asfreq("M", "s") == pi3 assert pi5.asfreq("D", "s") == pi4 assert pi5.asfreq("Min", "s") == pi6 assert pi5.asfreq("s", "s") == pi7 - assert pi6.asfreq("A", "s") == pi1 + assert pi6.asfreq("Y", "s") == pi1 assert pi6.asfreq("Q", "s") == pi2 assert pi6.asfreq("M", "s") == pi3 assert pi6.asfreq("D", "s") == pi4 - assert pi6.asfreq("H", "s") == pi5 + assert pi6.asfreq("h", "s") == pi5 assert pi6.asfreq("s", "s") == pi7 - assert pi7.asfreq("A", "s") == pi1 + assert pi7.asfreq("Y", "s") == pi1 assert pi7.asfreq("Q", "s") == pi2 assert pi7.asfreq("M", "s") == pi3 assert pi7.asfreq("D", "s") == pi4 - assert pi7.asfreq("H", "s") == pi5 + assert pi7.asfreq("h", "s") == pi5 assert pi7.asfreq("Min", "s") == pi6 msg = "How must be one of S or E" @@ -100,23 +100,23 @@ def test_asfreq_mult_pi(self, freq): assert result.freq == exp.freq def test_asfreq_combined_pi(self): - pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["S", "E"]): result = pi.asfreq(freq, how=how) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - for freq in ["1D1H", "1H1D"]: + for freq in ["1D1h", "1h1D"]: pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 
02:00", "NaT"], freq=freq) - result = pi.asfreq("H") - exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") + result = pi.asfreq("h") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) - result = pi.asfreq("H", how="S") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + result = pi.asfreq("h", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index e54cd73a35f59..07595b6b8c1dd 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_conversion(self): expected = Index([str(x) for x in idx], name="idx") tm.assert_index_equal(result, expected) - idx = period_range("1990", "2009", freq="A", name="idx") + idx = period_range("1990", "2009", freq="Y", name="idx") result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, idx.asi8) diff --git a/pandas/tests/indexes/period/methods/test_fillna.py b/pandas/tests/indexes/period/methods/test_fillna.py index 12a07bac25a59..ed6b4686a06de 100644 --- a/pandas/tests/indexes/period/methods/test_fillna.py +++ b/pandas/tests/indexes/period/methods/test_fillna.py @@ -10,19 +10,19 @@ class TestFillNA: def test_fillna_period(self): # GH#11343 - idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") + idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="h") exp = PeriodIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 
11:00"], freq="h" ) - result = idx.fillna(Period("2011-01-01 10:00", freq="H")) + result = idx.fillna(Period("2011-01-01 10:00", freq="h")) tm.assert_index_equal(result, exp) exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), "x", - Period("2011-01-01 11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) @@ -31,9 +31,9 @@ def test_fillna_period(self): exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), Period("2011-01-01", freq="D"), - Period("2011-01-01 11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) diff --git a/pandas/tests/indexes/period/methods/test_is_full.py b/pandas/tests/indexes/period/methods/test_is_full.py index 490f199a59ed7..b4105bedbe21d 100644 --- a/pandas/tests/indexes/period/methods/test_is_full.py +++ b/pandas/tests/indexes/period/methods/test_is_full.py @@ -4,19 +4,19 @@ def test_is_full(): - index = PeriodIndex([2005, 2007, 2009], freq="A") + index = PeriodIndex([2005, 2007, 2009], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2006, 2007], freq="A") + index = PeriodIndex([2005, 2006, 2007], freq="Y") assert index.is_full - index = PeriodIndex([2005, 2005, 2007], freq="A") + index = PeriodIndex([2005, 2005, 2007], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2005, 2006], freq="A") + index = PeriodIndex([2005, 2005, 2006], freq="Y") assert index.is_full - index = PeriodIndex([2006, 2005, 2005], freq="A") + index = PeriodIndex([2006, 2005, 2005], freq="Y") with pytest.raises(ValueError, match="Index is not monotonic"): index.is_full diff --git a/pandas/tests/indexes/period/methods/test_shift.py b/pandas/tests/indexes/period/methods/test_shift.py index 48dc5f0e64d08..fca3e3a559e1f 100644 --- a/pandas/tests/indexes/period/methods/test_shift.py +++ b/pandas/tests/indexes/period/methods/test_shift.py @@ -29,16 +29,16 @@ def test_pi_shift_ndarray(self): 
tm.assert_index_equal(result, expected) def test_shift(self): - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2002", end="12/1/2010") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2002", end="12/1/2010") tm.assert_index_equal(pi1.shift(0), pi1) assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2000", end="12/1/2008") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2000", end="12/1/2008") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) @@ -64,12 +64,12 @@ def test_shift(self): def test_shift_corner_cases(self): # GH#9903 - idx = PeriodIndex([], name="xxx", freq="H") + idx = PeriodIndex([], name="xxx", freq="h") msg = "`freq` argument is not supported for PeriodIndex.shift" with pytest.raises(TypeError, match=msg): # period shift doesn't accept freq - idx.shift(1, freq="H") + idx.shift(1, freq="h") tm.assert_index_equal(idx.shift(0), idx) tm.assert_index_equal(idx.shift(3), idx) @@ -77,19 +77,19 @@ def test_shift_corner_cases(self): idx = PeriodIndex( ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(0), idx) exp = PeriodIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(3), exp) exp = PeriodIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(-3), exp) @@ -117,6 +117,6 @@ def test_shift_gh8083(self): def test_shift_periods(self): # GH #22458 : argument 'n' was deprecated in favor of 'periods' - idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") + idx = period_range(freq="Y", 
start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index 8bb0c3518c835..2394efb353ab6 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -47,7 +47,7 @@ def test_to_timestamp_non_contiguous(self): tm.assert_datetime_array_equal(result, expected, check_freq=False) def test_to_timestamp_freq(self): - idx = period_range("2017", periods=12, freq="A-DEC") + idx = period_range("2017", periods=12, freq="Y-DEC") result = idx.to_timestamp() expected = date_range("2017", periods=12, freq="AS-JAN") tm.assert_index_equal(result, expected) @@ -72,12 +72,12 @@ def test_to_timestamp_pi_nat(self): tm.assert_index_equal(result3, exp) assert result3.freqstr == "3M" - msg = "Frequency must be positive, because it represents span: -2A" + msg = "Frequency must be positive, because it represents span: -2Y" with pytest.raises(ValueError, match=msg): - result.to_period(freq="-2A") + result.to_period(freq="-2Y") def test_to_timestamp_preserve_name(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009", name="foo") assert index.name == "foo" conv = index.to_timestamp("D") @@ -107,7 +107,7 @@ def test_to_timestamp_pi_mult(self): tm.assert_index_equal(result, expected) def test_to_timestamp_pi_combined(self): - idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") + idx = period_range(start="2011", periods=2, freq="1D1h", name="idx") result = idx.to_timestamp() expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") @@ -120,7 +120,7 @@ def test_to_timestamp_pi_combined(self): expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) - 
result = idx.to_timestamp(how="E", freq="H") + result = idx.to_timestamp(how="E", freq="h") expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index a5bdfa11140d1..f1db5ab28be30 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -166,7 +166,7 @@ def test_constructor_fromarraylike(self): msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): - PeriodIndex(data=Period("2007", freq="A")) + PeriodIndex(data=Period("2007", freq="Y")) result = PeriodIndex(iter(idx)) tm.assert_index_equal(result, idx) @@ -397,9 +397,9 @@ def test_constructor_freq_mult(self): ) tm.assert_index_equal(pidx, expected) - pidx = period_range(end="2014-01-01 17:00", freq="4H", periods=3) + pidx = period_range(end="2014-01-01 17:00", freq="4h", periods=3) expected = PeriodIndex( - ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4H" + ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4h" ) tm.assert_index_equal(pidx, expected) @@ -418,7 +418,7 @@ def test_constructor_freq_mult(self): @pytest.mark.parametrize( "freq_offset, freq_period", [ - ("A", "A"), + ("Y", "Y"), ("ME", "M"), ("D", "D"), ("min", "min"), @@ -444,16 +444,16 @@ def test_constructor_freq_mult_dti_compat_month(self, mult): tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): - for freq in ["1D1H", "1H1D"]: + for freq in ["1D1h", "1h1D"]: pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) - expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25H") - for freq in ["1D1H", "1H1D"]: + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25h") + for freq in ["1D1h", "1h1D"]: pidx = 
period_range(start="2016-01-01", periods=2, freq=freq) - expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25h") tm.assert_index_equal(pidx, expected) def test_constructor(self): - pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi = period_range(freq="Y", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") @@ -470,7 +470,7 @@ def test_constructor(self): pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") assert len(pi) == 261 * 9 - pi = period_range(freq="H", start="1/1/2001", end="12/31/2001 23:00") + pi = period_range(freq="h", start="1/1/2001", end="12/31/2001 23:00") assert len(pi) == 365 * 24 pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") @@ -526,7 +526,7 @@ def test_constructor(self): Period("2006-12-31", ("w", 1)) @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "min", "s", "ms", "us", "ns", "H"] + "freq", ["M", "Q", "Y", "D", "B", "min", "s", "ms", "us", "ns", "h"] ) @pytest.mark.filterwarnings( r"ignore:Period with BDay freq is deprecated:FutureWarning" @@ -539,7 +539,7 @@ def test_recreate_from_data(self, freq): def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] - index = PeriodIndex(raw, freq="A") + index = PeriodIndex(raw, freq="Y") expected = Index([str(num) for num in raw]) res = index.map(str) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 87bbb96377a79..9441f56a75f03 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -55,8 +55,8 @@ def test_representation(self, method): idx2 = PeriodIndex(["2011-01-01"], freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", 
"2013"], freq="A") - idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") idx9 = pd.period_range("2013Q1", periods=3, freq="Q") @@ -73,11 +73,11 @@ def test_representation(self, method): "dtype='period[D]')" ) - exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]')" + exp5 = "PeriodIndex(['2011', '2012', '2013'], dtype='period[Y-DEC]')" exp6 = ( "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='period[H]')" + "dtype='period[h]')" ) exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]')" @@ -101,8 +101,8 @@ def test_representation_to_series(self): idx2 = PeriodIndex(["2011-01-01"], freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") - idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") @@ -125,12 +125,12 @@ def test_representation_to_series(self): exp5 = """0 2011 1 2012 2 2013 -dtype: period[A-DEC]""" +dtype: period[Y-DEC]""" exp6 = """0 2011-01-01 09:00 1 2012-02-01 10:00 2 NaT -dtype: period[H]""" +dtype: period[h]""" exp7 = """0 2013Q1 dtype: period[Q-DEC]""" @@ -157,8 +157,8 @@ def test_summary(self): idx2 = PeriodIndex(["2011-01-01"], freq="D") idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") - idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") - idx6 = 
PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="Y") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="h") idx7 = pd.period_range("2013Q1", periods=1, freq="Q") idx8 = pd.period_range("2013Q1", periods=2, freq="Q") @@ -177,10 +177,10 @@ def test_summary(self): Freq: D""" exp5 = """PeriodIndex: 3 entries, 2011 to 2013 -Freq: A-DEC""" +Freq: Y-DEC""" exp6 = """PeriodIndex: 3 entries, 2011-01-01 09:00 to NaT -Freq: H""" +Freq: h""" exp7 = """PeriodIndex: 1 entries, 2013Q1 to 2013Q1 Freq: Q-DEC""" diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 109a4a41e2841..2683e25eda618 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -183,7 +183,7 @@ def test_getitem_seconds(self): "2014", "2013/02", "2013/01/02", - "2013/02/01 9H", + "2013/02/01 9h", "2013/02/01 09:00", ] for val in values: @@ -195,7 +195,7 @@ def test_getitem_seconds(self): ser = Series(np.random.default_rng(2).random(len(idx)), index=idx) tm.assert_series_equal(ser["2013/01/01 10:00"], ser[3600:3660]) - tm.assert_series_equal(ser["2013/01/01 9H"], ser[:3600]) + tm.assert_series_equal(ser["2013/01/01 9h"], ser[:3600]) for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(ser[d], ser) @@ -215,7 +215,7 @@ def test_getitem_day(self, idx_range): "2014", "2013/02", "2013/01/02", - "2013/02/01 9H", + "2013/02/01 9h", "2013/02/01 09:00", ] for val in values: @@ -230,7 +230,7 @@ def test_getitem_day(self, idx_range): tm.assert_series_equal(ser["2013/02"], ser[31:59]) tm.assert_series_equal(ser["2014"], ser[365:]) - invalid = ["2013/02/01 9H", "2013/02/01 09:00"] + invalid = ["2013/02/01 9h", "2013/02/01 09:00"] for val in invalid: with pytest.raises(KeyError, match=val): ser[val] @@ -238,9 +238,9 @@ def test_getitem_day(self, idx_range): class TestGetLoc: def test_get_loc_msg(self): - 
idx = period_range("2000-1-1", freq="A", periods=10) - bad_period = Period("2012", "A") - with pytest.raises(KeyError, match=r"^Period\('2012', 'A-DEC'\)$"): + idx = period_range("2000-1-1", freq="Y", periods=10) + bad_period = Period("2012", "Y") + with pytest.raises(KeyError, match=r"^Period\('2012', 'Y-DEC'\)$"): idx.get_loc(bad_period) try: @@ -479,13 +479,13 @@ def test_get_indexer_non_unique(self): # TODO: This method came from test_period; de-dup with version above def test_get_indexer2(self): - idx = period_range("2000-01-01", periods=3).asfreq("H", how="start") + idx = period_range("2000-01-01", periods=3).asfreq("h", how="start") tm.assert_numpy_array_equal( idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) ) target = PeriodIndex( - ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="H" + ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="h" ) tm.assert_numpy_array_equal( idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) @@ -501,7 +501,7 @@ def test_get_indexer2(self): np.array([0, -1, 1], dtype=np.intp), ) - msg = "Input has different freq=None from PeriodArray\\(freq=H\\)" + msg = "Input has different freq=None from PeriodArray\\(freq=h\\)" with pytest.raises(ValueError, match=msg): idx.get_indexer(target, "nearest", tolerance="1 minute") @@ -714,7 +714,7 @@ def test_take_fill_value(self): class TestGetValue: - @pytest.mark.parametrize("freq", ["H", "D"]) + @pytest.mark.parametrize("freq", ["h", "D"]) def test_get_value_datetime_hourly(self, freq): # get_loc and get_value should treat datetime objects symmetrically # TODO: this test used to test get_value, which is removed in 2.0. 
@@ -730,7 +730,7 @@ def test_get_value_datetime_hourly(self, freq): assert ser.loc[ts] == 7 ts2 = ts + Timedelta(hours=3) - if freq == "H": + if freq == "h": with pytest.raises(KeyError, match="2016-01-01 03:00"): pi.get_loc(ts2) with pytest.raises(KeyError, match="2016-01-01 03:00"): @@ -795,7 +795,7 @@ class TestAsOfLocs: def test_asof_locs_mismatched_type(self): dti = date_range("2016-01-01", periods=3) pi = dti.to_period("D") - pi2 = dti.to_period("H") + pi2 = dti.to_period("h") mask = np.array([0, 1, 0], dtype=bool) @@ -810,6 +810,6 @@ def test_asof_locs_mismatched_type(self): # TimedeltaIndex pi.asof_locs(dti - dti, mask) - msg = "Input has different freq=H" + msg = "Input has different freq=h" with pytest.raises(libperiod.IncompatibleFrequency, match=msg): pi.asof_locs(pi2, mask) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 3a272f53091b5..5bc76340badaf 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -14,7 +14,7 @@ class TestPeriodIndex: def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) original = ts.copy() @@ -28,7 +28,7 @@ def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): assert (ts[1:3] == 1).all() # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="Y-JUN") ts = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) result = ts["2007"] diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index bd40fa37897d8..22bb63d67f57f 100644 --- a/pandas/tests/indexes/period/test_period.py 
+++ b/pandas/tests/indexes/period/test_period.py @@ -18,7 +18,7 @@ class TestPeriodIndex: def test_make_time_series(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) assert isinstance(series, Series) @@ -67,7 +67,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.asi8, exp) def test_period_index_length(self): - pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi = period_range(freq="Y", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") @@ -157,11 +157,11 @@ def test_period_index_length(self): @pytest.mark.parametrize( "periodindex", [ - period_range(freq="A", start="1/1/2001", end="12/1/2005"), + period_range(freq="Y", start="1/1/2001", end="12/1/2005"), period_range(freq="Q", start="1/1/2001", end="12/1/2002"), period_range(freq="M", start="1/1/2001", end="1/1/2002"), period_range(freq="D", start="12/1/2001", end="6/1/2001"), - period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00"), + period_range(freq="h", start="12/31/2001", end="1/1/2002 23:00"), period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20"), period_range( freq="s", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" @@ -187,7 +187,7 @@ def test_fields(self, periodindex, field): assert getattr(x, field) == val def test_is_(self): - create_index = lambda: period_range(freq="A", start="1/1/2001", end="12/1/2009") + create_index = lambda: period_range(freq="Y", start="1/1/2001", end="12/1/2009") index = create_index() assert index.is_(index) assert not index.is_(create_index()) @@ -199,23 +199,23 @@ def test_is_(self): assert ind2.is_(index) assert not index.is_(index[:]) assert not index.is_(index.asfreq("M")) - assert not index.is_(index.asfreq("A")) + assert not index.is_(index.asfreq("Y")) assert not index.is_(index - 2) assert not index.is_(index - 0) def 
test_index_unique(self): - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") - expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="Y-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="Y-JUN") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 def test_negative_ordinals(self): - Period(ordinal=-1000, freq="A") - Period(ordinal=0, freq="A") + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="A") - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="A") + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") tm.assert_index_equal(idx1, idx2) def test_pindex_fieldaccessor_nat(self): @@ -255,7 +255,7 @@ def test_iteration(self): def test_with_multi_index(self): # #1705 - index = date_range("1/1/2012", periods=4, freq="12H") + index = date_range("1/1/2012", periods=4, freq="12h") index_as_arrays = [index.to_period(freq="D"), index.hour] s = Series([0, 1, 2, 3], index_as_arrays) @@ -267,14 +267,14 @@ def test_with_multi_index(self): def test_map(self): # test_map_dictlike generally tests - index = PeriodIndex([2005, 2007, 2009], freq="A") + index = PeriodIndex([2005, 2007, 2009], freq="Y") result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) def test_format_empty(self): # GH35712 - empty_idx = PeriodIndex([], freq="A") + empty_idx = PeriodIndex([], freq="Y") assert empty_idx.format() == [] assert empty_idx.format(name=True) == [""] @@ -284,6 +284,26 @@ def test_period_index_frequency_ME_error_message(self): with pytest.raises(ValueError, match=msg): PeriodIndex(["2020-01-01", "2020-01-02"], freq="2ME") + def test_H_deprecated_from_time_series(self): + # GH#52536 + msg = "'H' is deprecated and will be removed in a future version." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + index = period_range(freq="2H", start="1/1/2001", end="12/1/2009") + series = Series(1, index=index) + assert isinstance(series, Series) + + @pytest.mark.parametrize("freq", ["2A", "A-DEC", "200A-AUG"]) + def test_a_deprecated_from_time_series(self, freq): + # GH#52536 + freq_msg = freq[freq.index("A") :] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + index = period_range(freq=freq, start="1/1/2001", end="12/1/2009") + series = Series(1, index=index) + assert isinstance(series, Series) + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 63acaba2d4f3e..bee8a1282d08b 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -20,7 +20,7 @@ def test_required_arguments(self): with pytest.raises(ValueError, match=msg): period_range("2011-1-1", "2012-1-1", "B") - @pytest.mark.parametrize("freq", ["D", "W", "Q", "A"]) + @pytest.mark.parametrize("freq", ["D", "W", "Q", "Y"]) def test_construction_from_string(self, freq): # non-empty expected = date_range( diff --git a/pandas/tests/indexes/period/test_pickle.py b/pandas/tests/indexes/period/test_pickle.py index cb981ab10064f..7d359fdabb6f1 100644 --- a/pandas/tests/indexes/period/test_pickle.py +++ b/pandas/tests/indexes/period/test_pickle.py @@ -12,7 +12,7 @@ class TestPickle: - @pytest.mark.parametrize("freq", ["D", "M", "A"]) + @pytest.mark.parametrize("freq", ["D", "M", "Y"]) def test_pickle_round_trip(self, freq): idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq=freq) result = tm.round_trip_pickle(idx) diff --git a/pandas/tests/indexes/period/test_resolution.py b/pandas/tests/indexes/period/test_resolution.py index 6c876b4f9366f..680bdaa2e2a44 
100644 --- a/pandas/tests/indexes/period/test_resolution.py +++ b/pandas/tests/indexes/period/test_resolution.py @@ -7,11 +7,11 @@ class TestResolution: @pytest.mark.parametrize( "freq,expected", [ - ("A", "year"), + ("Y", "year"), ("Q", "quarter"), ("M", "month"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py index b9863d1bb019a..9b02a2f35fd01 100644 --- a/pandas/tests/indexes/period/test_searchsorted.py +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -27,9 +27,9 @@ def test_searchsorted(self, freq): assert pidx.searchsorted(NaT) == 5 - msg = "Input has different freq=H from PeriodArray" + msg = "Input has different freq=h from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): - pidx.searchsorted(Period("2014-01-01", freq="H")) + pidx.searchsorted(Period("2014-01-01", freq="h")) msg = "Input has different freq=5D from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index dd05210e417b0..b9a5940795a5b 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -43,8 +43,8 @@ def test_union(self, sort): other3 = PeriodIndex([], freq="D") expected3 = period_range("1/1/2000", freq="D", periods=5) - rng4 = period_range("2000-01-01 09:00", freq="H", periods=5) - other4 = period_range("2000-01-02 09:00", freq="H", periods=5) + rng4 = period_range("2000-01-01 09:00", freq="h", periods=5) + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) expected4 = PeriodIndex( [ "2000-01-01 09:00", @@ -58,7 +58,7 @@ def test_union(self, sort): "2000-01-02 12:00", "2000-01-02 13:00", ], - freq="H", + freq="h", ) rng5 = PeriodIndex( @@ -81,8 +81,8 @@ def test_union(self, sort): other6 = period_range("2000-04-01", 
freq="M", periods=7) expected6 = period_range("2000-01-01", freq="M", periods=10) - rng7 = period_range("2003-01-01", freq="A", periods=5) - other7 = period_range("1998-01-01", freq="A", periods=8) + rng7 = period_range("2003-01-01", freq="Y", periods=5) + other7 = period_range("1998-01-01", freq="Y", periods=8) expected7 = PeriodIndex( [ "2003", @@ -96,7 +96,7 @@ def test_union(self, sort): "2001", "2002", ], - freq="A", + freq="Y", ) rng8 = PeriodIndex( @@ -269,8 +269,8 @@ def test_difference(self, sort): "2000-01-01 11:00", "2000-01-01 13:00", ] - rng4 = PeriodIndex(period_rng, freq="H") - other4 = period_range("2000-01-02 09:00", freq="H", periods=5) + rng4 = PeriodIndex(period_rng, freq="h") + other4 = period_range("2000-01-02 09:00", freq="h", periods=5) expected4 = rng4 rng5 = PeriodIndex( @@ -293,9 +293,9 @@ def test_difference(self, sort): expected6 = PeriodIndex(["2000-02-01", "2000-01-01", "2000-03-01"], freq="M") period_rng = ["2003", "2007", "2006", "2005", "2004"] - rng7 = PeriodIndex(period_rng, freq="A") - other7 = period_range("1998-01-01", freq="A", periods=8) - expected7 = PeriodIndex(["2007", "2006"], freq="A") + rng7 = PeriodIndex(period_rng, freq="Y") + other7 = period_range("1998-01-01", freq="Y", periods=8) + expected7 = PeriodIndex(["2007", "2006"], freq="Y") for rng, other, expected in [ (rng1, other1, expected1), diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 18668fd357fd8..f507e64d88b06 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -20,14 +20,14 @@ class TestPeriodRepresentation: ("W-THU", "1970-01-01"), ("D", "1970-01-01"), ("B", "1970-01-01"), - ("H", "1970-01-01"), + ("h", "1970-01-01"), ("min", "1970-01-01"), ("s", "1970-01-01"), ("ms", "1970-01-01"), ("us", "1970-01-01"), ("ns", "1970-01-01"), ("M", "1970-01"), - ("A", 1970), + ("Y", 1970), ], ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is 
deprecated:FutureWarning") @@ -43,7 +43,7 @@ def test_freq(self, freq, base_date): class TestPeriodIndexConversion: def test_tolist(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") rs = index.tolist() for x in rs: assert isinstance(x, Period) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index bc04c1c6612f4..6afab569797f2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -457,7 +457,7 @@ def test_fancy(self, simple_index): ["string", "int64", "int32", "uint64", "uint32", "float64", "float32"], indirect=True, ) - @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) + @pytest.mark.parametrize("dtype", [int, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) empty_index = type(index)([], dtype=index.dtype) @@ -980,7 +980,7 @@ def test_str_attribute(self, method): Index(range(5)), tm.makeDateIndex(10), MultiIndex.from_tuples([("foo", "1"), ("bar", "3")]), - period_range(start="2000", end="2010", freq="A"), + period_range(start="2000", end="2010", freq="Y"), ], ) def test_str_attribute_raises(self, index): diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index f69f0fd3d78e2..4f5ece61fc30c 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -69,7 +69,7 @@ def test_astype(self): tm.assert_numpy_array_equal(rng.asi8, result.values) def test_astype_uint(self): - arr = timedelta_range("1H", periods=2) + arr = timedelta_range("1h", periods=2) with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): arr.astype("uint64") @@ -104,10 +104,10 @@ def test_astype_raises(self, dtype): idx.astype(dtype) def test_astype_category(self): - obj = timedelta_range("1H", periods=2, freq="H") + obj = timedelta_range("1h", 
periods=2, freq="h") result = obj.astype("category") - expected = pd.CategoricalIndex([Timedelta("1H"), Timedelta("2H")]) + expected = pd.CategoricalIndex([Timedelta("1h"), Timedelta("2h")]) tm.assert_index_equal(result, expected) result = obj._data.astype("category") @@ -115,7 +115,7 @@ def test_astype_category(self): tm.assert_categorical_equal(result, expected) def test_astype_array_fallback(self): - obj = timedelta_range("1H", periods=2) + obj = timedelta_range("1h", periods=2) result = obj.astype(bool) expected = Index(np.array([True, True])) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_shift.py b/pandas/tests/indexes/timedeltas/methods/test_shift.py index e33b8de3e6594..a0986d1496881 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_shift.py +++ b/pandas/tests/indexes/timedeltas/methods/test_shift.py @@ -14,17 +14,17 @@ class TestTimedeltaIndexShift: def test_tdi_shift_empty(self): # GH#9903 idx = TimedeltaIndex([], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) def test_tdi_shift_hours(self): # GH#9903 idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) def test_tdi_shift_minutes(self): # GH#9903 diff --git a/pandas/tests/indexes/timedeltas/test_formats.py b/pandas/tests/indexes/timedeltas/test_formats.py index 751f9e4cc9eee..ee090bd0aaf0a 100644 --- 
a/pandas/tests/indexes/timedeltas/test_formats.py +++ b/pandas/tests/indexes/timedeltas/test_formats.py @@ -8,6 +8,18 @@ class TestTimedeltaIndexRendering: + def test_repr_round_days_non_nano(self): + # GH#55405 + # we should get "1 days", not "1 days 00:00:00" with non-nano + tdi = TimedeltaIndex(["1 days"], freq="D").as_unit("s") + result = repr(tdi) + expected = "TimedeltaIndex(['1 days'], dtype='timedelta64[s]', freq='D')" + assert result == expected + + result2 = repr(Series(tdi)) + expected2 = "0 1 days\ndtype: timedelta64[s]" + assert result2 == expected2 + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): idx1 = TimedeltaIndex([], freq="D") diff --git a/pandas/tests/indexes/timedeltas/test_freq_attr.py b/pandas/tests/indexes/timedeltas/test_freq_attr.py index 868da4329dccf..1912c49d3000f 100644 --- a/pandas/tests/indexes/timedeltas/test_freq_attr.py +++ b/pandas/tests/indexes/timedeltas/test_freq_attr.py @@ -12,7 +12,7 @@ class TestFreq: @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48h", Hour(48)]) def test_freq_setter(self, values, freq): # GH#20678 idx = TimedeltaIndex(values) diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 31cc8e18f58ce..397f9d9e18331 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -21,7 +21,7 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): # GH#4226 - tdi = timedelta_range("1d", "5d", freq="H", name="timebucket") + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +230,7 @@ def test_take_invalid_kwargs(self): def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = 
timedelta_range(start="1d", end="2d", freq="H", name="idx") + idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index fe3ff1799e763..63db5c1b9c91d 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -63,8 +63,8 @@ def test_tdi_round(self): ) expected_elt = expected_rng[1] - tm.assert_index_equal(td.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt + tm.assert_index_equal(td.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): @@ -121,7 +121,7 @@ def test_round(self): ), ), ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), ]: r1 = t1.round(freq) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 6cdd6944e90ea..727b4eee00566 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -52,8 +52,8 @@ def test_union_coverage(self): assert result.freq == ordered.freq def test_union_bug_1730(self): - rng_a = timedelta_range("1 day", periods=4, freq="3H") - rng_b = timedelta_range("1 day", periods=4, freq="4H") + rng_a = timedelta_range("1 day", periods=4, freq="3h") + rng_b = timedelta_range("1 day", periods=4, freq="4h") result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 
d0593b3230959..f22bdb7a90516 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,6 +3,7 @@ from pandas import ( Timedelta, + TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -45,6 +46,7 @@ def test_timedelta_range(self): @pytest.mark.parametrize( "depr_unit, unit", [ + ("H", "hour"), ("T", "minute"), ("t", "minute"), ("S", "second"), @@ -56,7 +58,8 @@ def test_timedelta_range(self): ("n", "nanosecond"), ], ) - def test_timedelta_units_T_S_L_U_N_deprecated(self, depr_unit, unit): + def test_timedelta_units_H_T_S_L_U_N_deprecated(self, depr_unit, unit): + # GH#52536 depr_msg = ( f"'{depr_unit}' is deprecated and will be removed in a future version." ) @@ -67,7 +70,7 @@ def test_timedelta_units_T_S_L_U_N_deprecated(self, depr_unit, unit): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12min"), (7, "16H"), (9, "12H")] + "periods, freq", [(3, "2D"), (5, "D"), (6, "19h12min"), (7, "16h"), (9, "12h")] ) def test_linspace_behavior(self, periods, freq): # GH 20976 @@ -75,6 +78,16 @@ def test_linspace_behavior(self, periods, freq): expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("msg_freq, freq", [("H", "19H12min"), ("T", "19h12T")]) + def test_timedelta_range_H_T_deprecated(self, freq, msg_freq): + # GH#52536 + msg = f"'{msg_freq}' is deprecated and will be removed in a future version." 
+ + result = timedelta_range(start="0 days", end="4 days", periods=6) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + def test_errors(self): # not enough params msg = ( @@ -95,7 +108,7 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - timedelta_range(start="0 days", end="5 days", periods=10, freq="H") + timedelta_range(start="0 days", end="5 days", periods=10, freq="h") @pytest.mark.parametrize( "start, end, freq, expected_periods", @@ -119,3 +132,42 @@ def test_timedelta_range_infer_freq(self): # https://github.com/pandas-dev/pandas/issues/35897 result = timedelta_range("0s", "1s", periods=31) assert result.freq is None + + @pytest.mark.parametrize( + "freq_depr, start, end, expected_values, expected_freq", + [ + ( + "3.5S", + "05:03:01", + "05:03:10", + ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], + "3500ms", + ), + ( + "2.5T", + "5 hours", + "5 hours 8 minutes", + [ + "0 days 05:00:00", + "0 days 05:02:30", + "0 days 05:05:00", + "0 days 05:07:30", + ], + "150s", + ), + ], + ) + def test_timedelta_range_deprecated_freq( + self, freq_depr, start, end, expected_values, expected_freq + ): + # GH#52536 + msg = ( + f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = timedelta_range(start=start, end=end, freq=freq_depr) + expected = TimedeltaIndex( + expected_values, dtype="timedelta64[ns]", freq=expected_freq + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index de989ad550f2b..081da385ebcc3 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -166,7 +166,7 @@ def test_getitem_intkey_leading_level( mi = ser.index assert isinstance(mi, MultiIndex) if dtype is int: - assert mi.levels[0].dtype == np.int_ + assert mi.levels[0].dtype == np.dtype(int) else: assert mi.levels[0].dtype == np.float64 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a2693c85e507f..70eada188f3c8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -260,19 +260,19 @@ def test_loc_npstr(self): @pytest.mark.parametrize( "msg, key", [ - (r"Period\('2019', 'A-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), - (r"Period\('2019', 'A-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), - (r"Period\('2019', 'A-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), ( - r"Period\('2018', 'A-DEC'\), Period\('2016', 'A-DEC'\), 'bar'", + r"Period\('2018', 'Y-DEC'\), Period\('2016', 'Y-DEC'\), 'bar'", (Period(2018), Period(2016), "bar"), ), - (r"Period\('2018', 'A-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), + (r"Period\('2018', 'Y-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), ( - r"Period\('2017', 'A-DEC'\), 'foo', Period\('2015', 'A-DEC'\)", + r"Period\('2017', 'Y-DEC'\), 'foo', Period\('2015', 
'Y-DEC'\)", (Period(2017), "foo", Period(2015)), ), - (r"Period\('2017', 'A-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), + (r"Period\('2017', 'Y-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), ], ) def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): @@ -442,7 +442,7 @@ def test_loc_to_fail(self): ) msg = ( - rf"\"None of \[Index\(\[1, 2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[1, 2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -460,7 +460,7 @@ def test_loc_to_fail2(self): s.loc[-1] msg = ( - rf"\"None of \[Index\(\[-1, -2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-1, -2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -476,7 +476,7 @@ def test_loc_to_fail2(self): s["a"] = 2 msg = ( - rf"\"None of \[Index\(\[-2\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[-2\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -493,7 +493,7 @@ def test_loc_to_fail3(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - rf"\"None of \[Index\(\[3\], dtype='{np.int_().dtype}'\)\] are " + rf"\"None of \[Index\(\[3\], dtype='{np.dtype(int)}'\)\] are " r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -510,7 +510,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.int_().dtype}')] are in the [index]" + msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] @@ -1209,7 +1209,7 @@ def test_loc_setitem_empty_append_raises(self): df = DataFrame(columns=["x", "y"]) df.index = df.index.astype(np.int64) msg = ( - rf"None of \[Index\(\[0, 1\], dtype='{np.int_().dtype}'\)\] " + rf"None of \[Index\(\[0, 1\], dtype='{np.dtype(int)}'\)\] " r"are in the 
\[index\]" ) with pytest.raises(KeyError, match=msg): @@ -1464,7 +1464,7 @@ def test_loc_setitem_datetime_coercion(self): def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH#11365 tz = tz_naive_fixture - idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + idx = date_range(start="2015-07-12", periods=3, freq="h", tz=tz) expected = DataFrame(1.2, index=idx, columns=["var"]) # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype @@ -2057,7 +2057,7 @@ def test_loc_setitem_with_expansion_and_existing_dst(self): start = Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") end = Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") ts = Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") - idx = date_range(start, end, inclusive="left", freq="H") + idx = date_range(start, end, inclusive="left", freq="h") assert ts not in idx # i.e. result.loc setitem is with-expansion result = DataFrame(index=idx, columns=["value"]) @@ -2166,6 +2166,19 @@ def test_loc_setitem_with_expansion_preserves_nullable_int(self, dtype): result.loc[df.index, "data"] = ser._values tm.assert_frame_equal(result, df) + def test_loc_setitem_ea_not_full_column(self): + # GH#39163 + df = DataFrame({"A": range(5)}) + + val = date_range("2016-01-01", periods=3, tz="US/Pacific") + + df.loc[[0, 1, 2], "B"] = val + + bex = val.append(DatetimeIndex([pd.NaT, pd.NaT], dtype=val.dtype)) + expected = DataFrame({"A": range(5), "B": bex}) + assert expected.dtypes["B"] == val.dtype + tm.assert_frame_equal(df, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): @@ -2317,7 +2330,7 @@ def test_loc_getitem_partial_string_slicing_with_periodindex(self): tm.assert_series_equal(result, expected) def test_loc_getitem_partial_string_slicing_with_timedeltaindex(self): - ix = timedelta_range(start="1 day", end="2 days", freq="1H") + ix = timedelta_range(start="1 day", end="2 days", freq="1h") ser = 
ix.to_series() result = ser.loc[:"1 days"] expected = ser.iloc[:-1] @@ -2419,7 +2432,7 @@ def test_loc_getitem_label_slice_across_dst(self): "index", [ pd.period_range(start="2017-01-01", end="2018-01-01", freq="M"), - timedelta_range(start="1 day", end="2 days", freq="1H"), + timedelta_range(start="1 day", end="2 days", freq="1h"), ], ) def test_loc_getitem_label_slice_period_timedelta(self, index): @@ -2575,7 +2588,7 @@ def test_loc_setitem_mask_and_label_with_datetimeindex(self): df = DataFrame( np.arange(6.0).reshape(3, 2), columns=list("AB"), - index=date_range("1/1/2000", periods=3, freq="1H"), + index=date_range("1/1/2000", periods=3, freq="1h"), ) expected = df.copy() expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] @@ -2874,7 +2887,7 @@ def test_loc_datetimelike_mismatched_dtypes(): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=date_range("2012", freq="H", periods=5), + index=date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique DatetimeIndex df = df.iloc[[0, 2, 2, 3]].copy() diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 8e5cde42ec91b..8f499644f1013 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -402,7 +402,7 @@ def test_series_partial_set(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}'\)\] " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}'\)\] " r"are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): @@ -483,7 +483,7 @@ def test_series_partial_set_with_name(self): # raises as nothing is in the index msg = ( - rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.int_().dtype}', " + rf"\"None of \[Index\(\[3, 3, 3\], dtype='{np.dtype(int)}', " r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/internals/test_internals.py 
b/pandas/tests/internals/test_internals.py index 597bc2975268e..d1c1d72ac4afe 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -468,7 +468,7 @@ def test_set_change_dtype(self, mgr): np.random.default_rng(2).standard_normal(N).astype(int), ) idx = mgr2.items.get_loc("quux") - assert mgr2.iget(idx).dtype == np.int_ + assert mgr2.iget(idx).dtype == np.dtype(int) mgr2.iset( mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N) @@ -1335,13 +1335,13 @@ def test_interval_can_hold_element(self, dtype, element): assert not blk._can_hold_element(elem) def test_period_can_hold_element_emptylist(self): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2) assert blk._can_hold_element([]) def test_period_can_hold_element(self, element): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") elem = element(pi) self.check_series_setitem(elem, pi, True) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 53ee449c2dc0c..57f1f082708ae 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -212,7 +212,7 @@ def test_repr_truncation(self): r = repr(df) r = r[r.find("\n") + 1 :] - adj = fmt.get_adjustment() + adj = printing.get_adjustment() for line, value in zip(r.split("\n"), df["B"]): if adj.len(value) + 1 > max_len: @@ -2166,7 +2166,7 @@ def test_period(self): "B": [ pd.Period("2011-01", freq="M"), pd.Period("2011-02-01", freq="D"), - pd.Period("2011-03-01 09:00", freq="H"), + pd.Period("2011-03-01 09:00", freq="h"), pd.Period("2011-04", freq="M"), ], "C": list("abcd"), @@ -2703,7 +2703,7 @@ def test_period(self): [ pd.Period("2011-01", freq="M"), pd.Period("2011-02-01", freq="D"), - pd.Period("2011-03-01 09:00", freq="H"), + pd.Period("2011-03-01 09:00", freq="h"), ] 
) exp = ( @@ -2933,9 +2933,9 @@ def test_to_string_empty_col(self): class TestGenericArrayFormatter: def test_1d_array(self): - # GenericArrayFormatter is used on types for which there isn't a dedicated + # _GenericArrayFormatter is used on types for which there isn't a dedicated # formatter. np.bool_ is one of those types. - obj = fmt.GenericArrayFormatter(np.array([True, False])) + obj = fmt._GenericArrayFormatter(np.array([True, False])) res = obj.get_result() assert len(res) == 2 # Results should be right-justified. @@ -2943,14 +2943,14 @@ def test_1d_array(self): assert res[1] == " False" def test_2d_array(self): - obj = fmt.GenericArrayFormatter(np.array([[True, False], [False, True]])) + obj = fmt._GenericArrayFormatter(np.array([[True, False], [False, True]])) res = obj.get_result() assert len(res) == 2 assert res[0] == " [True, False]" assert res[1] == " [False, True]" def test_3d_array(self): - obj = fmt.GenericArrayFormatter( + obj = fmt._GenericArrayFormatter( np.array([[[True, True], [False, False]], [[False, True], [True, False]]]) ) res = obj.get_result() @@ -3186,65 +3186,65 @@ def test_all(self): class TestTimedelta64Formatter: def test_days(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'1 days'" + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" - result = fmt.Timedelta64Formatter(x[1:2], box=True).get_result() - assert result[0].strip() == "'1 days'" + result = fmt._Timedelta64Formatter(x[1:2]).get_result() + assert result[0].strip() == "1 days" - result = fmt.Timedelta64Formatter(x, box=False).get_result() + result = fmt._Timedelta64Formatter(x).get_result() assert result[0].strip() == "0 days" assert result[1].strip() == "1 days" - result = 
fmt.Timedelta64Formatter(x[1:2], box=False).get_result() + result = fmt._Timedelta64Formatter(x[1:2]).get_result() assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(-x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'-1 days'" + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(-x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "-1 days" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'0 days 00:00:01'" + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values + result = fmt._Timedelta64Formatter(y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "0 days 00:00:01" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(-y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'-1 days +23:59:59'" + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values + result = fmt._Timedelta64Formatter(-y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "-1 days +23:59:59" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" + x = pd.to_timedelta(list(range(1)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" - x = pd.to_timedelta(list(range(1)), unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" + x = 
pd.to_timedelta(list(range(1)), unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" class TestDatetime64Formatter: def test_mixed(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): - x = Series([Timestamp(200)]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([Timestamp(200)])._values + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "1970-01-01 00:00:00.000000200" def test_dates_display(self): @@ -3252,66 +3252,70 @@ def test_dates_display(self): # make sure that we are consistently display date formatting x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range("20130101 09:00:00", 
periods=5, freq="ms")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" def test_datetime64formatter_yearmonth(self): - x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)])._values def format_func(x): return x.strftime("%Y-%m") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == ["2016-01", "2016-02"] def test_datetime64formatter_hoursecond(self): x = Series( pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") - ) + )._values def format_func(x): return x.strftime("%H:%M") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == ["10:10", "12:12"] def test_datetime64formatter_tz_ms(self): - x = Series( - np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") - ).dt.tz_localize("US/Pacific") - result = 
fmt.Datetime64TZFormatter(x).get_result() + x = ( + Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ) + .dt.tz_localize("US/Pacific") + ._values + ) + result = fmt._Datetime64TZFormatter(x).get_result() assert result[0].strip() == "2999-01-01 00:00:00-08:00" assert result[1].strip() == "2999-01-02 00:00:00-08:00" @@ -3326,7 +3330,7 @@ def test_str(self): class TestPeriodIndexFormat: def test_period_format_and_strftime_default(self): - per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="H") + per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="h") # Default formatting formatted = per.format() @@ -3373,13 +3377,13 @@ def test_period_tz(self): # Converting to a period looses the timezone information # Since tz is currently set as utc, we'll see 2012 with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") + per = dt.to_period(freq="h") assert per.format()[0] == "2012-12-31 23:00" # If tz is currently set as paris before conversion, we'll see 2013 dt = dt.tz_convert("Europe/Paris") with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") + per = dt.to_period(freq="h") assert per.format()[0] == "2013-01-01 00:00" @pytest.mark.parametrize( @@ -3402,7 +3406,7 @@ def test_period_non_ascii_fmt(self, locale_str): # Change locale temporarily for this test. 
with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") + per = pd.Period("2018-03-11 13:00", freq="h") assert per.strftime("%y é") == "18 é" # Index @@ -3434,7 +3438,7 @@ def test_period_custom_locale_directive(self, locale_str): am_local, pm_local = get_local_am_pm() # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") + per = pd.Period("2018-03-11 13:00", freq="h") assert per.strftime("%p") == pm_local # Index diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 2d0dc0d937709..78198bce71460 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -8,7 +8,6 @@ import pandas as pd from pandas.io.formats import printing -import pandas.io.formats.format as fmt def test_adjoin(): @@ -48,7 +47,7 @@ def test_adjoin_unicode(self): adjoined = printing.adjoin(2, *data) assert adjoined == expected - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() expected = """あ dd ggg b ええ hhh @@ -73,7 +72,7 @@ def test_adjoin_unicode(self): assert adj.len(cols[2]) == 26 def test_justify(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() def just(x, *args, **kwargs): # wrapper to test single str @@ -95,7 +94,7 @@ def just(x, *args, **kwargs): assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("abc") == 3 assert adj.len("abc") == 3 @@ -106,11 +105,11 @@ def test_east_asian_len(self): assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 4 with cf.option_context("display.unicode.ambiguous_as_wide", True): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert 
adj.len("¡¡ab") == 6 data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 822bd14610388..613f609320f31 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,7 +285,7 @@ def test_to_csv_different_datetime_formats(self): df = DataFrame( { "date": pd.to_datetime("1970-01-01"), - "datetime": pd.date_range("1970-01-01", periods=2, freq="H"), + "datetime": pd.date_range("1970-01-01", periods=2, freq="h"), } ) expected_rows = [ diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 5811485406b86..38a2bb52930e3 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -68,7 +68,7 @@ def biggie_df_fixture(request): return df -@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS) +@pytest.fixture(params=fmt.VALID_JUSTIFY_PARAMETERS) def justify(request): return request.param diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index fc2edc7559a48..943515acd33b5 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -32,7 +32,7 @@ def df_schema(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="min"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), ) @@ -45,7 +45,7 @@ def df_table(): "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="min"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], @@ -150,7 +150,7 @@ def 
test_as_json_table_type_bool_data(self, bool_type): pd.to_datetime(["2016"], utc=True), pd.Series(pd.to_datetime(["2016"])), pd.Series(pd.to_datetime(["2016"], utc=True)), - pd.period_range("2016", freq="A", periods=3), + pd.period_range("2016", freq="Y", periods=3), ], ) def test_as_json_table_type_date_data(self, date_data): @@ -480,9 +480,9 @@ def test_convert_pandas_type_to_json_field_datetime( assert result == expected def test_convert_pandas_type_to_json_period_range(self): - arr = pd.period_range("2016", freq="A-DEC", periods=4) + arr = pd.period_range("2016", freq="Y-DEC", periods=4) result = convert_pandas_type_to_json_field(arr) - expected = {"name": "values", "type": "datetime", "freq": "A-DEC"} + expected = {"name": "values", "type": "datetime", "freq": "Y-DEC"} assert result == expected @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex]) @@ -695,7 +695,7 @@ def test_read_json_table_orient(self, index_nm, vals, recwarn): @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) @pytest.mark.parametrize( "vals", - [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="min")}], + [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}], ) def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b3c2e67f7c318..2767078674632 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -521,7 +521,7 @@ def test_v12_compat(self, datapath): tm.assert_frame_equal(df_iso, df_unser_iso) def test_blocks_compat_GH9037(self): - index = pd.date_range("20000101", periods=10, freq="H") + index = pd.date_range("20000101", periods=10, freq="h") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) @@ -604,7 +604,7 @@ def test_blocks_compat_GH9037(self): ) # JSON deserialisation always creates unicode strings - 
df_mixed.columns = df_mixed.columns.astype("unicode") + df_mixed.columns = df_mixed.columns.astype(np.str_) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 8671bccbc1bbd..8640c17a1349f 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -275,9 +275,9 @@ def test_categorical_coerces_timestamp(all_parsers): def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))} - data = "b\n1H\n2H\n3H" + data = "b\n1h\n2h\n3h" expected = DataFrame({"b": Categorical(dtype["b"].categories)}) result = parser.read_csv(StringIO(data), dtype=dtype) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c79fdd9145a6a..9f7840588f89e 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -47,7 +47,7 @@ def test_read_csv_with_custom_date_parser(all_parsers): # GH36111 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( @@ -87,7 +87,7 @@ def test_read_csv_with_custom_date_parser_parse_dates_false(all_parsers): # GH44366 def __custom_date_parser(time): time = time.astype(np.float64) - time = time.astype(np.int_) # convert float seconds to int type + time = time.astype(int) # convert float seconds to int type return pd.to_timedelta(time, unit="s") testdata = StringIO( diff --git a/pandas/tests/io/pytables/test_retain_attributes.py b/pandas/tests/io/pytables/test_retain_attributes.py 
index 799c2f22a9dc6..aef6fc0460cd9 100644 --- a/pandas/tests/io/pytables/test_retain_attributes.py +++ b/pandas/tests/io/pytables/test_retain_attributes.py @@ -21,7 +21,7 @@ def test_retain_index_attributes(setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} ) with ensure_clean_store(setup_path) as store: @@ -76,7 +76,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): with tm.assert_produces_warning(errors.AttributeConflictWarning): df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} ) df.to_hdf(path, key="data", mode="w", append=True) df2 = DataFrame( @@ -85,7 +85,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): df2.to_hdf(path, key="data", append=True) - idx = date_range("2000-1-1", periods=3, freq="H") + idx = date_range("2000-1-1", periods=3, freq="h") idx.name = "foo" df = DataFrame({"A": Series(range(3), index=idx)}) df.to_hdf(path, key="data", mode="w", append=True) @@ -93,7 +93,7 @@ def test_retain_index_attributes2(tmp_path, setup_path): assert read_hdf(path, key="data").index.name == "foo" with tm.assert_produces_warning(errors.AttributeConflictWarning): - idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2 = date_range("2001-1-1", periods=3, freq="h") idx2.name = "bar" df2 = DataFrame({"A": Series(range(3), index=idx2)}) df2.to_hdf(path, key="data", append=True) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 0ff84bcf136cd..f031ac46c670c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -303,8 +303,10 @@ def test_store_dropna(tmp_path, setup_path): tm.assert_frame_equal(df_without_missing, reloaded) -def 
test_keyword_deprecation(): +def test_keyword_deprecation(tmp_path, setup_path): # GH 54229 + path = tmp_path / setup_path + msg = ( "Starting with pandas version 3.0 all arguments of to_hdf except for the " "argument 'path_or_buf' will be keyword-only." @@ -312,7 +314,7 @@ def test_keyword_deprecation(): df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_hdf("example", "key") + df.to_hdf(path, "key") def test_to_hdf_with_min_itemsize(tmp_path, setup_path): diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1eb7a34bead56..676b9374514e8 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -131,7 +131,7 @@ def test_append_with_timezones(setup_path, gettz): def test_append_with_timezones_as_index(setup_path, gettz): # GH#4098 example - dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) + dti = date_range("2000-1-1", periods=3, freq="h", tz=gettz("US/Eastern")) dti = dti._with_freq(None) # freq doesn't round-trip df = DataFrame({"A": Series(range(3), index=dti)}) @@ -332,7 +332,7 @@ def test_dst_transitions(setup_path): "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) times = times._with_freq(None) # freq doesn't round-trip diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e1839fc1b0a67..82fb98615100f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2411,7 +2411,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert 
expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 47114cab47619..2ef1f065f603d 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2487,6 +2487,15 @@ def test_secondary_y(self, secondary_y): assert ax.get_ylim() == (0, 100) assert ax.get_yticks()[0] == 99 + @pytest.mark.slow + def test_plot_no_warning(self): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + df = tm.makeTimeDataFrame() + with tm.assert_produces_warning(False): + _ = df.plot() + _ = df.T.plot() + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 220741cf1ec3d..db7c0cec09e6c 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -49,7 +49,7 @@ class TestTSPlot: def test_ts_plot_with_tz(self, tz_aware_fixture): # GH2877, GH17173, GH31205, GH31580 tz = tz_aware_fixture - index = date_range("1/1/2011", periods=2, freq="H", tz=tz) + index = date_range("1/1/2011", periods=2, freq="h", tz=tz) ts = Series([188.5, 328.25], index=index) _check_plot_works(ts.plot) ax = ts.plot() @@ -102,7 +102,7 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="Y", periods=3) df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) fig, ax = mpl.pyplot.subplots() @@ -111,13 +111,13 @@ def test_nonnumeric_exclude(self): mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="Y", periods=3) df = DataFrame({"A": ["x", "y", 
"z"], "B": [1, 2, 3]}, idx) msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): df["A"].plot() - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) @@ -125,7 +125,7 @@ def test_tsplot_period(self, freq): _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -165,8 +165,8 @@ def test_get_datevalue(self): from pandas.plotting._matplotlib.converter import get_datevalue assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "A") == 1987 - assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue(1987, "Y") == 1987 + assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal def test_ts_plot_format_coord(self): @@ -176,7 +176,7 @@ def check_format_of_first_point(ax, expected_string): first_y = first_line.get_ydata()[0] assert expected_string == ax.format_coord(first_x, first_y) - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="Y-DEC")) _, ax = mpl.pyplot.subplots() annual.plot(ax=ax) check_format_of_first_point(ax, "t = 2014 y = 1.000000") @@ -187,14 +187,14 @@ def check_format_of_first_point(ax, expected_string): daily.plot(ax=ax) check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "M", "Q", "A"]) + 
@pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the @@ -204,14 +204,14 @@ def test_line_plot_period_mlt_series(self, frqncy): _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["s", "min", "H", "D", "W", "ME", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "Q", "Y"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -222,7 +222,7 @@ def test_line_plot_period_frame(self, freq): _check_plot_works(df.plot, df.index.freq) @pytest.mark.parametrize( - "frqncy", ["1s", "3s", "5min", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) @@ -240,7 +240,7 @@ def test_line_plot_period_mlt_frame(self, frqncy): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + 
"freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -254,7 +254,7 @@ def test_line_plot_datetime_frame(self, freq): _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["s", "min", "H", "D", "W", "ME", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "Q-DEC", "Y", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -382,7 +382,7 @@ def test_freq_with_no_period_alias(self): def test_nonzero_base(self): # GH2571 - idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) + idx = date_range("2012-12-20", periods=24, freq="h") + timedelta(minutes=30) df = DataFrame(np.arange(24), index=idx) _, ax = mpl.pyplot.subplots() df.plot(ax=ax) @@ -440,7 +440,7 @@ def test_get_finder(self): assert conv.get_finder(to_offset("D")) == conv._daily_finder assert conv.get_finder(to_offset("ME")) == conv._monthly_finder assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder - assert conv.get_finder(to_offset("A")) == conv._annual_finder + assert conv.get_finder(to_offset("Y")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder def test_finder_daily(self): @@ -523,10 +523,10 @@ def test_finder_monthly_long(self): def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - xp = [Period(x, freq="A").ordinal for x in xp] + xp = [Period(x, freq="Y").ordinal for x in xp] rs = [] for nyears in [5, 10, 19, 49, 99, 199, 599, 1001]: - rng = period_range("1987", periods=nyears, freq="A") + rng = period_range("1987", periods=nyears, freq="Y") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) @@ -551,13 +551,13 @@ def test_finder_minutely(self): def test_finder_hourly(self): nhours = 23 - rng = date_range("1/1/1999", freq="H", 
periods=nhours) + rng = date_range("1/1/1999", freq="h", periods=nhours) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period("1/1/1999", freq="H").ordinal + xp = Period("1/1/1999", freq="h").ordinal assert rs == xp @@ -814,7 +814,7 @@ def test_mixed_freq_hf_first(self): assert PeriodIndex(data=line.get_xdata()).freq == "D" def test_mixed_freq_alignment(self): - ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H") + ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="h") ts_data = np.random.default_rng(2).standard_normal(12) ts = Series(ts_data, index=ts_ind) @@ -842,7 +842,7 @@ def test_mixed_freq_lf_first(self): def test_mixed_freq_lf_first_hourly(self): idxh = date_range("1/1/1999", periods=240, freq="min") - idxl = date_range("1/1/1999", periods=4, freq="H") + idxl = date_range("1/1/1999", periods=4, freq="h") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -1517,7 +1517,7 @@ def test_timedelta_short_period(self): def test_hist(self): # https://github.com/matplotlib/matplotlib/issues/8459 - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") x = rng w1 = np.arange(0, 1, 0.1) w2 = np.arange(0, 1, 0.1)[::-1] diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index e77dc0b305171..bf4474d085b11 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -6,7 +6,10 @@ import pytest from pandas.compat import is_platform_linux -from pandas.compat.numpy import np_version_gte1p24 +from pandas.compat.numpy import ( + np_long, + np_version_gte1p24, +) import pandas.util._test_decorators as td import pandas as pd @@ -561,7 +564,7 @@ def 
test_plot_fails_with_dupe_color_and_style(self): [ ["scott", 20], [None, 20], - [None, np.int_(20)], + [None, np_long(20)], [0.5, np.linspace(-100, 100, 20)], ], ) @@ -973,3 +976,10 @@ def test_series_none_color(self): ax = series.plot(color=None) expected = _unpack_cycler(mpl.pyplot.rcParams)[:1] _check_colors(ax.get_lines(), linecolors=expected) + + @pytest.mark.slow + def test_plot_no_warning(self, ts): + # GH 55138 + # TODO(3.0): this can be removed once Period[B] deprecation is enforced + with tm.assert_produces_warning(False): + _ = ts.plot() diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 021252500e814..560b2377ada70 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1087,7 +1087,8 @@ def test_any_all_pyarrow_string(self): ser = Series([None, "a"], dtype="string[pyarrow_numpy]") assert ser.any() - assert not ser.all() + assert ser.all() + assert not ser.all(skipna=False) ser = Series([None, ""], dtype="string[pyarrow_numpy]") assert not ser.any() diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 3cea39fa75ece..74e521ab71f41 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -41,7 +41,7 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean(skipna=False) is pd.NaT @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - @pytest.mark.parametrize("freq", ["s", "H", "D", "W", "B"]) + @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"]) def test_period_mean(self, box, freq): # GH#24757 dti = pd.date_range("2001-01-01", periods=11) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index ae28a010d8435..42e741119b0a1 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -44,7 +44,7 @@ def _create_index(*args, 
**kwargs): return _create_index -@pytest.mark.parametrize("freq", ["2D", "1H"]) +@pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) @@ -65,16 +65,16 @@ def test_asfreq_fill_value(series, create_index): ser = series - result = ser.resample("1H").asfreq() - new_index = create_index(ser.index[0], ser.index[-1], freq="1H") + result = ser.resample("1h").asfreq() + new_index = create_index(ser.index[0], ser.index[-1], freq="1h") expected = ser.reindex(new_index) tm.assert_series_equal(result, expected) # Explicit cast to float to avoid implicit cast when setting None frame = ser.astype("float").to_frame("value") frame.iloc[1] = None - result = frame.resample("1H").asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], frame.index[-1], freq="1H") + result = frame.resample("1h").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1h") expected = frame.reindex(new_index, fill_value=4.0) tm.assert_frame_equal(result, expected) @@ -96,11 +96,11 @@ def test_raises_on_non_datetimelike_index(): "but got an instance of 'RangeIndex'" ) with pytest.raises(TypeError, match=msg): - xp.resample("A") + xp.resample("Y") @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 @@ -108,7 +108,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) @@ -140,7 +140,7 @@ def test_resample_empty_series(freq, empty_series_dti, resample_method): [ pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")), "D", - "H", + "h", ], ) def test_resample_nat_index_series(freq, series, resample_method): @@ -164,7 +164,7 @@ def test_resample_nat_index_series(freq, series, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 @@ -172,7 +172,7 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) @@ -192,7 +192,7 @@ def test_resample_count_empty_series(freq, empty_series_dti, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti @@ -200,7 +200,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): df.resample(freq, group_keys=False) @@ -234,7 +234,7 @@ def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 @@ -243,7 +243,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) @@ -261,7 +261,7 @@ def test_resample_count_empty_dataframe(freq, empty_frame_dti): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 @@ -270,7 +270,7 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) @@ -308,7 +308,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -@pytest.mark.parametrize("freq", ["ME", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti @@ -316,7 +316,7 @@ def test_apply_to_empty_series(empty_series_dti, freq): if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_series_dti.resample(freq) @@ -333,7 +333,7 @@ def test_apply_to_empty_series(empty_series_dti, freq): @all_ts def test_resampler_is_iterable(series): # GH 15314 - freq = "H" + freq = "h" tg = Grouper(freq=freq, convention="start") grouped = series.groupby(tg) resampled = series.resample(freq) @@ -347,7 +347,7 @@ def test_resample_quantile(series): # GH 15023 ser = series q = 0.75 - freq = "H" + freq = "h" result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 113e2d8986ad2..28d02576156a0 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -435,7 +435,7 @@ def test_resample_frame_basic_cy_funcs(f, unit): g._cython_agg_general(f, alt=None, numeric_only=True) -@pytest.mark.parametrize("freq", ["A", "ME"]) +@pytest.mark.parametrize("freq", ["Y", "ME"]) def test_resample_frame_basic_M_A(freq, unit): df = tm.makeTimeDataFrame() df.index = df.index.as_unit(unit) @@ -516,8 +516,8 @@ def test_upsample_with_limit(unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10s"]) -@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30H", "15Min", "30s"]) +@pytest.mark.parametrize("freq", ["5D", "10h", "5Min", "10s"]) +@pytest.mark.parametrize("rule", ["Y", "3ME", "15D", "30h", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit( @@ -604,9 +604,9 @@ def test_resample_ohlc_dataframe(unit): ).reindex(["VOLUME", "PRICE"], axis=1) df.index = df.index.as_unit(unit) df.columns.name = "Cols" - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp = pd.concat( 
- [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()], axis=1, keys=df.columns, ) @@ -614,7 +614,7 @@ def test_resample_ohlc_dataframe(unit): tm.assert_frame_equal(exp, res) df.columns = [["a", "b"], ["c", "d"]] - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp.columns = pd.MultiIndex.from_tuples( [ ("a", "c", "open"), @@ -659,7 +659,7 @@ def test_resample_reresample(unit): ).as_unit(unit) s = Series(np.random.default_rng(2).random(len(dti)), dti) bs = s.resample("B", closed="right", label="right").mean() - result = bs.resample("8H").mean() + result = bs.resample("8h").mean() assert len(result) == 25 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) @@ -668,8 +668,8 @@ def test_resample_reresample(unit): @pytest.mark.parametrize( "freq, expected_kwargs", [ - ["A-DEC", {"start": "1990", "end": "2000", "freq": "a-dec"}], - ["A-JUN", {"start": "1990", "end": "2000", "freq": "a-jun"}], + ["Y-DEC", {"start": "1990", "end": "2000", "freq": "y-dec"}], + ["Y-JUN", {"start": "1990", "end": "2000", "freq": "y-jun"}], ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}], ], ) @@ -796,34 +796,24 @@ def test_resample_offset(unit): @pytest.mark.parametrize( - "kwargs, expected", + "kwargs", [ - ( - {"origin": "1999-12-31 23:57:00"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": Timestamp("1970-01-01 00:02:00")}, - ["1970-01-01 00:02:00", "2000-01-01 01:57:00"], - ), - ( - {"origin": "epoch", "offset": "2m"}, - ["1999-12-31 23:57:00", "2000-01-01 01:57:00"], - ), + {"origin": "1999-12-31 23:57:00"}, + {"origin": Timestamp("1970-01-01 00:02:00")}, + {"origin": "epoch", "offset": "2m"}, # origin of '1999-31-12 12:02:00' should be equivalent for this case - ( - {"origin": "1999-12-31 12:02:00"}, - ["1999-12-31 12:02:00", "2000-01-01 01:57:00"], - ), - ({"offset": "-3m"}, ["1999-12-31 
23:57:00", "2000-01-01 01:57:00"]), + {"origin": "1999-12-31 12:02:00"}, + {"offset": "-3m"}, ], ) -def test_resample_origin(kwargs, unit, expected): +def test_resample_origin(kwargs, unit): # GH 31809 rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - exp_rng = date_range(expected[0], expected[1], freq="5min").as_unit(unit) + exp_rng = date_range( + "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min" + ).as_unit(unit) resampled = ts.resample("5min", **kwargs).mean() tm.assert_index_equal(resampled.index, exp_rng) @@ -853,31 +843,6 @@ def test_resample_bad_offset(offset, unit): ts.resample("5min", offset=offset) -def test_resample_monthstart_origin(): - # GH 53662 - df = DataFrame({"ts": [datetime(1999, 12, 31, 0, 0, 0)], "values": [10.0]}) - result = df.resample("2MS", on="ts", origin="1999-11-01")["values"].sum() - excepted = Series( - [10.0], - index=DatetimeIndex( - ["1999-11-01"], dtype="datetime64[ns]", name="ts", freq="2MS" - ), - ) - tm.assert_index_equal(result.index, excepted.index) - - df = DataFrame({"ts": [datetime(1999, 12, 31, 20)], "values": [10.0]}) - result = df.resample( - "3YS", on="ts", closed="left", label="left", origin=datetime(1995, 1, 1) - )["values"].sum() - expected = Series( - [0, 10.0], - index=DatetimeIndex( - ["1995-01-01", "1998-01-01"], dtype="datetime64[ns]", name="ts", freq="3YS" - ), - ) - tm.assert_index_equal(result.index, expected.index) - - def test_resample_origin_prime_freq(unit): # GH 31809 start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" @@ -909,7 +874,7 @@ def test_resample_origin_prime_freq(unit): tm.assert_index_equal(resampled.index, exp_rng) exp_rng = date_range( - "2000-01-01 00:00:00", "2000-10-02 00:15:00", freq="17min" + "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min" ).as_unit(unit) resampled = ts.resample("17min", origin="2000-01-01").mean() tm.assert_index_equal(resampled.index, 
exp_rng) @@ -928,12 +893,14 @@ def test_resample_origin_with_tz(unit): exp_rng = date_range( "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz ).as_unit(unit) - resampled = ts.resample("5min", origin="epoch", offset="2m").mean() + resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean() tm.assert_index_equal(resampled.index, exp_rng) - resampled = ts.resample( - "5min", origin=Timestamp("1999-12-31 23:57:00", tz=tz) - ).mean() + # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case + resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean() + tm.assert_index_equal(resampled.index, exp_rng) + + resampled = ts.resample("5min", origin="epoch", offset="2m").mean() tm.assert_index_equal(resampled.index, exp_rng) with pytest.raises(ValueError, match=msg): @@ -954,13 +921,13 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(unit): ts_1 = Series(random_values, index=rng) result_1 = ts_1.resample("D", origin="epoch").mean() - result_2 = ts_1.resample("24H", origin="epoch").mean() + result_2 = ts_1.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_2) # check that we have the same behavior with epoch even if we are not timezone aware ts_no_tz = ts_1.tz_localize(None) result_3 = ts_no_tz.resample("D", origin="epoch").mean() - result_4 = ts_no_tz.resample("24H", origin="epoch").mean() + result_4 = ts_no_tz.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_3.tz_localize(rng.tz), check_freq=False) tm.assert_series_equal(result_1, result_4.tz_localize(rng.tz), check_freq=False) @@ -969,7 +936,7 @@ def test_resample_origin_epoch_with_tz_day_vs_24h(unit): rng = date_range(start, end, freq="7min").as_unit(unit) ts_2 = Series(random_values, index=rng) result_5 = ts_2.resample("D", origin="epoch").mean() - result_6 = ts_2.resample("24H", origin="epoch").mean() + result_6 = ts_2.resample("24h", origin="epoch").mean() 
tm.assert_series_equal(result_1.tz_localize(None), result_5.tz_localize(None)) tm.assert_series_equal(result_1.tz_localize(None), result_6.tz_localize(None)) @@ -1005,27 +972,27 @@ def _create_series(values, timestamps, freq="D"): expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] expected = _create_series([23.0, 2.0], expected_ts) - result = ts.resample("D", origin="start", offset="-2H").sum() + result = ts.resample("D", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"] - expected = _create_series([22.0, 3.0], expected_ts, freq="24H") - result = ts.resample("24H", origin="start", offset="-2H").sum() + expected = _create_series([22.0, 3.0], expected_ts, freq="24h") + result = ts.resample("24h", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 02:00-05:00", "2013-11-03 02:00-06:00"] expected = _create_series([3.0, 22.0], expected_ts) - result = ts.resample("D", origin="start", offset="2H").sum() + result = ts.resample("D", origin="start", offset="2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 23:00-05:00", "2013-11-03 23:00-06:00"] expected = _create_series([24.0, 1.0], expected_ts) - result = ts.resample("D", origin="start", offset="-1H").sum() + result = ts.resample("D", origin="start", offset="-1h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 01:00-05:00", "2013-11-03 01:00:00-0500"] expected = _create_series([1.0, 24.0], expected_ts) - result = ts.resample("D", origin="start", offset="1H").sum() + result = ts.resample("D", origin="start", offset="1h").sum() tm.assert_series_equal(result, expected) @@ -1054,7 +1021,7 @@ def test_period_with_agg(): # aggregate a period resampler with a lambda s2 = Series( np.random.default_rng(2).integers(0, 5, 50), - index=period_range("2012-01-01", freq="H", periods=50), + index=period_range("2012-01-01", 
freq="h", periods=50), dtype="float64", ) @@ -1114,12 +1081,12 @@ def test_resample_dtype_coercion(unit): df = {"a": [1, 3, 1, 4]} df = DataFrame(df, index=date_range("2017-01-01", "2017-01-04").as_unit(unit)) - expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") + expected = df.astype("float64").resample("h").mean()["a"].interpolate("cubic") - result = df.resample("H")["a"].mean().interpolate("cubic") + result = df.resample("h")["a"].mean().interpolate("cubic") tm.assert_series_equal(result, expected) - result = df.resample("H").mean()["a"].interpolate("cubic") + result = df.resample("h").mean()["a"].interpolate("cubic") tm.assert_series_equal(result, expected) @@ -1246,7 +1213,7 @@ def test_corner_cases_period(simple_period_range_series): # miscellaneous test coverage len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] # it works - result = len0pts.resample("A-DEC").mean() + result = len0pts.resample("Y-DEC").mean() assert len(result) == 0 @@ -1564,11 +1531,11 @@ def test_resample_across_dst(): pd.to_datetime(df2.ts, unit="s") .dt.tz_localize("UTC") .dt.tz_convert("Europe/Madrid"), - freq="H", + freq="h", ) df = DataFrame([5, 5], index=dti1) - result = df.resample(rule="H").sum() + result = df.resample(rule="h").sum() expected = DataFrame([5, 5], index=dti2) tm.assert_frame_equal(result, expected) @@ -1694,11 +1661,11 @@ def test_downsample_across_dst(unit): # GH 8531 tz = pytz.timezone("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2H").as_unit(unit) - result = Series(5, index=dates).resample("H").mean() + dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="H").as_unit(unit), + index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ 
-1724,7 +1691,7 @@ def test_downsample_across_dst_weekly(unit): def test_downsample_across_dst_weekly_2(unit): # GH 9119, GH 21459 - idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H").as_unit( + idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="h").as_unit( unit ) s = Series(index=idx, dtype=np.float64) @@ -1742,7 +1709,7 @@ def test_downsample_dst_at_midnight(unit): # GH 25758 start = datetime(2018, 11, 3, 12) end = datetime(2018, 11, 5, 12) - index = date_range(start, end, freq="1H").as_unit(unit) + index = date_range(start, end, freq="1h").as_unit(unit) index = index.tz_localize("UTC").tz_convert("America/Havana") data = list(range(len(index))) dataframe = DataFrame(data, index=index) @@ -1844,14 +1811,14 @@ def f(data, add_arg): [ (30, "s", 0.5, "Min"), (60, "s", 1, "Min"), - (3600, "s", 1, "H"), - (60, "Min", 1, "H"), + (3600, "s", 1, "h"), + (60, "Min", 1, "h"), (21600, "s", 0.25, "D"), (86400, "s", 1, "D"), (43200, "s", 0.5, "D"), (1440, "Min", 1, "D"), - (12, "H", 0.5, "D"), - (24, "H", 1, "D"), + (12, "h", 0.5, "D"), + (24, "h", 1, "D"), ], ) def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): @@ -1871,7 +1838,7 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): [ ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), - ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), + ("19910905 06:00", "19920406 06:00", "h", "19910905 06:00", "19920406 07:00"), ("19910906", "19920406", "ME", "19910831", "19920430"), ("19910831", "19920430", "ME", "19910831", "19920531"), ("1991-08", "1992-04", "ME", "19910831", "19920531"), @@ -1922,32 +1889,32 @@ def test_resample_apply_product(duplicates, unit): "2020-03-28", "2020-03-31", "D", - "24H", + "24h", "2020-03-30 01:00", ), # includes transition into DST ( "2020-03-28", "2020-10-27", "D", - "24H", + "24h", "2020-10-27 00:00", ), # includes 
transition into and out of DST ( "2020-10-25", "2020-10-27", "D", - "24H", + "24h", "2020-10-26 23:00", ), # includes transition out of DST ( "2020-03-28", "2020-03-31", - "24H", + "24h", "D", "2020-03-30 00:00", ), # same as above, but from 24H to D - ("2020-03-28", "2020-10-27", "24H", "D", "2020-10-27 00:00"), - ("2020-10-25", "2020-10-27", "24H", "D", "2020-10-26 00:00"), + ("2020-03-28", "2020-10-27", "24h", "D", "2020-10-27 00:00"), + ("2020-10-25", "2020-10-27", "24h", "D", "2020-10-26 00:00"), ], ) def test_resample_calendar_day_with_dst( @@ -1981,7 +1948,7 @@ def test_resample_aggregate_functions_min_count(func, unit): def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit): # gh-43329 df = DataFrame( - index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12H").as_unit( + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12h").as_unit( unit ), columns=["x"], @@ -2021,7 +1988,7 @@ def test_long_rule_non_nano(): "1900-12-31", ] ).astype("datetime64[s]"), - freq="200A-DEC", + freq="200Y-DEC", ) expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx) tm.assert_series_equal(result, expected) @@ -2044,7 +2011,7 @@ def test_resample_empty_series_with_tz(): def test_resample_M_deprecated(): - depr_msg = r"\'M\' will be deprecated, please use \'ME\' for \'month end\'" + depr_msg = "'M' will be deprecated, please use 'ME' instead." 
s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) expected = s.resample("2ME").mean() @@ -2098,7 +2065,7 @@ def test_resample_c_b_closed_right(freq: str): def test_resample_b_55282(): # https://github.com/pandas-dev/pandas/issues/55282 s = Series( - [1, 2, 3, 4, 5, 6], index=date_range("2023-09-26", periods=6, freq="12H") + [1, 2, 3, 4, 5, 6], index=date_range("2023-09-26", periods=6, freq="12h") ) result = s.resample("B", closed="right", label="right").mean() expected = Series( diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index db3804a6600b9..d214e1b4ae4ae 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -41,7 +41,7 @@ def _series_name(): class TestPeriodIndex: - @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 @@ -65,23 +65,23 @@ def test_asfreq_fill_value(self, series): new_index = date_range( s.index[0].to_timestamp(how="start"), (s.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) + result = s.resample("1h", kind="timestamp").asfreq(fill_value=4.0) tm.assert_series_equal(result, expected) frame = s.to_frame("value") new_index = date_range( frame.index[0].to_timestamp(how="start"), (frame.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) + result = frame.resample("1h", kind="timestamp").asfreq(fill_value=3.0) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + 
@pytest.mark.parametrize("freq", ["h", "12h", "2D", "W"]) @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) @pytest.mark.parametrize("kwargs", [{"on": "date"}, {"level": "d"}]) def test_selection(self, index, freq, kind, kwargs): @@ -109,7 +109,7 @@ def test_selection(self, index, freq, kind, kwargs): def test_annual_upsample_cases( self, offset, period, conv, meth, month, simple_period_range_series ): - ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"A-{month}") + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"Y-{month}") warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(warn, match=msg): @@ -120,20 +120,20 @@ def test_annual_upsample_cases( def test_basic_downsample(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("y-dec").mean() expected = ts.groupby(ts.index.year).mean() - expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") + expected.index = period_range("1/1/1990", "6/30/1995", freq="y-dec") tm.assert_series_equal(result, expected) # this is ok - tm.assert_series_equal(ts.resample("a-dec").mean(), result) - tm.assert_series_equal(ts.resample("a").mean(), result) + tm.assert_series_equal(ts.resample("y-dec").mean(), result) + tm.assert_series_equal(ts.resample("y").mean(), result) @pytest.mark.parametrize( "rule,expected_error_msg", [ - ("a-dec", ""), + ("y-dec", ""), ("q-mar", ""), ("M", ""), ("w-thu", ""), @@ -152,7 +152,7 @@ def test_not_subperiod(self, simple_period_range_series, rule, expected_error_ms @pytest.mark.parametrize("freq", ["D", "2D"]) def test_basic_upsample(self, freq, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("y-dec").mean() resampled = result.resample(freq, 
convention="end").ffill() expected = result.to_timestamp(freq, how="end") @@ -160,7 +160,7 @@ def test_basic_upsample(self, freq, simple_period_range_series): tm.assert_series_equal(resampled, expected) def test_upsample_with_limit(self): - rng = period_range("1/1/2000", periods=5, freq="A") + rng = period_range("1/1/2000", periods=5, freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) result = ts.resample("M", convention="end").ffill(limit=2) @@ -168,13 +168,13 @@ def test_upsample_with_limit(self): tm.assert_series_equal(result, expected) def test_annual_upsample(self, simple_period_range_series): - ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="Y-DEC") df = DataFrame({"a": ts}) rdf = df.resample("D").ffill() exp = df["a"].resample("D").ffill() tm.assert_series_equal(rdf["a"], exp) - rng = period_range("2000", "2003", freq="A-DEC") + rng = period_range("2000", "2003", freq="Y-DEC") ts = Series([1, 2, 3, 4], index=rng) result = ts.resample("M").ffill() @@ -271,7 +271,7 @@ def test_with_local_timezone_pytz(self): # 1 day later end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) - index = date_range(start, end, freq="H") + index = date_range(start, end, freq="h") series = Series(1, index=index) series = series.tz_convert(local_timezone) @@ -287,7 +287,7 @@ def test_with_local_timezone_pytz(self): def test_resample_with_pytz(self): # GH 13238 s = Series( - 2, index=date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") + 2, index=date_range("2017-01-01", periods=48, freq="h", tz="US/Eastern") ) result = s.resample("D").mean() expected = Series( @@ -312,7 +312,7 @@ def test_with_local_timezone_dateutil(self): year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() ) - index = date_range(start, end, freq="H", name="idx") + index = date_range(start, end, freq="h", name="idx") series = Series(1, 
index=index) series = series.tz_convert(local_timezone) @@ -337,7 +337,7 @@ def test_resample_nonexistent_time_bin_edge(self): tm.assert_series_equal(result, expected) # GH 23742 - index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = date_range(start="2017-10-10", end="2017-10-20", freq="1h") index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") df = DataFrame(data=list(range(len(index))), index=index) result = df.groupby(pd.Grouper(freq="1D")).count() @@ -391,13 +391,13 @@ def test_weekly_upsample(self, day, target, convention, simple_period_range_seri def test_resample_to_timestamps(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - result = ts.resample("A-DEC", kind="timestamp").mean() - expected = ts.to_timestamp(how="start").resample("A-DEC").mean() + result = ts.resample("Y-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("Y-DEC").mean() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) def test_resample_to_quarterly(self, simple_period_range_series, month): - ts = simple_period_range_series("1990", "1992", freq=f"A-{month}") + ts = simple_period_range_series("1990", "1992", freq=f"Y-{month}") quar_ts = ts.resample(f"Q-{month}").ffill() stamps = ts.to_timestamp("D", how="start") @@ -415,7 +415,7 @@ def test_resample_to_quarterly(self, simple_period_range_series, month): @pytest.mark.parametrize("how", ["start", "end"]) def test_resample_to_quarterly_start_end(self, simple_period_range_series, how): # conforms, but different month - ts = simple_period_range_series("1990", "1992", freq="A-JUN") + ts = simple_period_range_series("1990", "1992", freq="Y-JUN") result = ts.resample("Q-MAR", convention=how).ffill() expected = ts.asfreq("Q-MAR", how=how) expected = expected.reindex(result.index, method="ffill") @@ -426,21 +426,21 @@ def test_resample_to_quarterly_start_end(self, simple_period_range_series, how): 
tm.assert_series_equal(result, expected) def test_resample_fill_missing(self): - rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="Y") s = Series(np.random.default_rng(2).standard_normal(4), index=rng) stamps = s.to_timestamp() - filled = s.resample("A").ffill() - expected = stamps.resample("A").ffill().to_period("A") + filled = s.resample("Y").ffill() + expected = stamps.resample("Y").ffill().to_period("Y") tm.assert_series_equal(filled, expected) def test_cant_fill_missing_dups(self): - rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="Y") s = Series(np.random.default_rng(2).standard_normal(5), index=rng) msg = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=msg): - s.resample("A").ffill() + s.resample("Y").ffill() @pytest.mark.parametrize("freq", ["5min"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) @@ -461,9 +461,9 @@ def test_upsample_daily_business_daily(self, simple_period_range_series): tm.assert_series_equal(result, expected) ts = simple_period_range_series("1/1/2000", "2/1/2000") - result = ts.resample("H", convention="s").asfreq() - exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") - expected = ts.asfreq("H", how="s").reindex(exp_rng) + result = ts.resample("h", convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="h") + expected = ts.asfreq("h", how="s").reindex(exp_rng) tm.assert_series_equal(result, expected) def test_resample_irregular_sparse(self): @@ -530,20 +530,20 @@ def test_resample_tz_localized(self): # GH 6397 # comparing an offset that doesn't propagate tz's - rng = date_range("1/1/2011", periods=20000, freq="H") + rng = date_range("1/1/2011", periods=20000, freq="h") rng = rng.tz_localize("EST") ts = DataFrame(index=rng) ts["first"] = np.random.default_rng(2).standard_normal(len(rng)) ts["second"] = 
np.cumsum(np.random.default_rng(2).standard_normal(len(rng))) expected = DataFrame( { - "first": ts.resample("A").sum()["first"], - "second": ts.resample("A").mean()["second"], + "first": ts.resample("Y").sum()["first"], + "second": ts.resample("Y").mean()["second"], }, columns=["first", "second"], ) result = ( - ts.resample("A") + ts.resample("Y") .agg({"first": "sum", "second": "mean"}) .reindex(columns=["first", "second"]) ) @@ -573,8 +573,8 @@ def test_quarterly_resampling(self): rng = period_range("2000Q1", periods=10, freq="Q-DEC") ts = Series(np.arange(10), index=rng) - result = ts.resample("A").mean() - exp = ts.to_timestamp().resample("A").mean().to_period() + result = ts.resample("Y").mean() + exp = ts.to_timestamp().resample("Y").mean().to_period() tm.assert_series_equal(result, exp) def test_resample_weekly_bug_1726(self): @@ -647,7 +647,7 @@ def test_monthly_convention_span(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "from_freq, to_freq", [("D", "ME"), ("Q", "A"), ("ME", "Q"), ("D", "W")] + "from_freq, to_freq", [("D", "ME"), ("Q", "Y"), ("ME", "Q"), ("D", "W")] ) def test_default_right_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -660,7 +660,7 @@ def test_default_right_closed_label(self, from_freq, to_freq): @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("H", "D"), ("min", "H")], + [("D", "MS"), ("Q", "AS"), ("ME", "QS"), ("h", "D"), ("min", "h")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -676,7 +676,7 @@ def test_all_values_single_bin(self): index = period_range(start="2012-01-01", end="2012-12-31", freq="M") s = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) - result = s.resample("A").mean() + result = s.resample("Y").mean() tm.assert_almost_equal(result.iloc[0], s.mean()) def 
test_evenly_divisible_with_no_extra_bins(self): @@ -752,7 +752,7 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample("7D").sum() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("freq, period_mult", [("h", 24), ("12h", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 @@ -829,19 +829,19 @@ def test_resample_with_only_nat(self): @pytest.mark.parametrize( "start,end,start_freq,end_freq,offset", [ - ("19910905", "19910909 03:00", "H", "24H", "10H"), - ("19910905", "19910909 12:00", "H", "24H", "10H"), - ("19910905", "19910909 23:00", "H", "24H", "10H"), - ("19910905 10:00", "19910909", "H", "24H", "10H"), - ("19910905 10:00", "19910909 10:00", "H", "24H", "10H"), - ("19910905", "19910909 10:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909", "H", "24H", "10H"), - ("19910905 12:00", "19910909 03:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), - ("19910905", "19910913 06:00", "2H", "24H", "10H"), + ("19910905", "19910909 03:00", "h", "24h", "10h"), + ("19910905", "19910909 12:00", "h", "24h", "10h"), + ("19910905", "19910909 23:00", "h", "24h", "10h"), + ("19910905 10:00", "19910909", "h", "24h", "10h"), + ("19910905 10:00", "19910909 10:00", "h", "24h", "10h"), + ("19910905", "19910909 10:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909", "h", "24h", "10h"), + ("19910905 12:00", "19910909 03:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "34h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", "3h"), + ("19910905", 
"19910913 06:00", "2h", "24h", "10h"), ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), ], @@ -858,11 +858,11 @@ def test_resample_with_offset(self, start, end, start_freq, end_freq, offset): def test_resample_with_offset_month(self): # GH 23882 & 31809 - pi = period_range("19910905 12:00", "19910909 1:00", freq="H") + pi = period_range("19910905 12:00", "19910909 1:00", freq="h") ser = Series(np.arange(len(pi)), index=pi) - result = ser.resample("M", offset="3H").mean() + result = ser.resample("M", offset="3h").mean() result = result.to_timestamp("M") - expected = ser.to_timestamp().resample("ME", offset="3H").mean() + expected = ser.to_timestamp().resample("ME", offset="3h").mean() # TODO: is non-tick the relevant characteristic? (GH 33815) expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @@ -875,8 +875,8 @@ def test_resample_with_offset_month(self): ( "19910905 06:00", "19920406 06:00", - "H", - "H", + "h", + "h", "19910905 06:00", "19920406 06:00", ), @@ -912,7 +912,7 @@ def test_sum_min_count(self): tm.assert_series_equal(result, expected) def test_resample_t_l_deprecated(self): - # GH 52536 + # GH#52536 msg_t = "'T' is deprecated and will be removed in a future version." msg_l = "'L' is deprecated and will be removed in a future version." 
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 86a7439410d8b..ff7b129c52f71 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -33,13 +33,13 @@ def test_frame(dti, _test_series): def test_str(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=start_day]" in str(r) ) - r = _test_series.resample("H", origin="2000-01-01") + r = _test_series.resample("h", origin="2000-01-01") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) @@ -47,12 +47,12 @@ def test_str(_test_series): def test_api(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") result = r.mean() assert isinstance(result, Series) assert len(result) == 217 - r = _test_series.to_frame().resample("H") + r = _test_series.to_frame().resample("h") result = r.mean() assert isinstance(result, DataFrame) assert len(result) == 217 @@ -127,36 +127,36 @@ def test_pipe(test_frame, _test_series): # GH17905 # series - r = _test_series.resample("H") + r = _test_series.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_series_equal(result, expected) # dataframe - r = test_frame.resample("H") + r = test_frame.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_frame_equal(result, expected) def test_getitem(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) - r = test_frame.resample("H")["B"] + r = test_frame.resample("h")["B"] assert r._selected_obj.name == test_frame.columns[1] # technically this is allowed - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] 
tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) @pytest.mark.parametrize("key", [["D"], ["A", "D"]]) def test_select_bad_cols(key, test_frame): - g = test_frame.resample("H") + g = test_frame.resample("h") # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! msg = r"^\"Columns not found: 'D'\"$" @@ -165,7 +165,7 @@ def test_select_bad_cols(key, test_frame): def test_attribute_access(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_series_equal(r.A.sum(), r["A"].sum()) @@ -188,7 +188,7 @@ def test_api_compat_before_use(attr): def tests_raises_on_nuisance(test_frame): df = test_frame df["D"] = "foo" - r = df.resample("H") + r = df.resample("h") result = r[["A", "B"]].mean() expected = pd.concat([r.A.mean(), r.B.mean()], axis=1) tm.assert_frame_equal(result, expected) @@ -1041,7 +1041,7 @@ def test_series_axis_param_depr(_test_series): "deprecated and will be removed in a future version." 
) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - _test_series.resample("H", axis=0) + _test_series.resample("h", axis=0) def test_resample_empty(): @@ -1061,5 +1061,5 @@ def test_resample_empty(): ] ) ) - result = df.resample("8H").mean() + result = df.resample("8h").mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index f394e3a25dc0f..b85ccdc70068f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -151,19 +151,6 @@ def test_groupby_with_origin(): start, end = "1/1/2000 00:00:00", "1/31/2000 00:00" middle = "1/15/2000 00:00:00" - # test origin on 1970-01-01 00:00:00 - rng = date_range("1970-01-01 00:00:00", end, freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] - - origin = Timestamp(0) - adjusted_grouper = pd.Grouper(freq=freq, origin=origin) - adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") - adjusted_count_ts = adjusted_count_ts[middle_ts:end] - adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count") - tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2[middle_ts:end]) - rng = date_range(start, end, freq="1231min") # prime number ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ts2 = ts[middle:end] @@ -177,19 +164,26 @@ def test_groupby_with_origin(): with pytest.raises(AssertionError, match="Index are different"): tm.assert_index_equal(count_ts.index, count_ts2.index) - # test origin on 2049-10-18 20:00:00 + # test origin on 1970-01-01 00:00:00 + origin = Timestamp(0) + adjusted_grouper = pd.Grouper(freq=freq, origin=origin) + adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count") + adjusted_count_ts = adjusted_count_ts[middle:end] + adjusted_count_ts2 = 
ts2.groupby(adjusted_grouper).agg("count") + tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2) - rng = date_range(start, "2049-10-18 20:00:00", freq="1231min") # prime number - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - middle_ts = rng[len(rng) // 2] - ts2 = ts[middle_ts:end] + # test origin on 2049-10-18 20:00:00 origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000 adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future) adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count") - adjusted2_count_ts = adjusted2_count_ts[middle_ts:end] + adjusted2_count_ts = adjusted2_count_ts[middle:end] adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count") tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2) + # both grouper use an adjusted timestamp that is a multiple of 1399 min + # they should be equals even if the adjusted_timestamp is in the future + tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2) + def test_nearest(): # GH 17496 @@ -327,10 +321,10 @@ def test_apply_columns_multilevel(): ind = date_range(start="2017-01-01", freq="15Min", periods=8) df = DataFrame(np.array([0] * 16).reshape(8, 2), index=ind, columns=cols) agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} - result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) + result = df.resample("h").apply(lambda x: agg_dict[x.name](x)) expected = DataFrame( 2 * [[0, 0.0]], - index=date_range(start="2017-01-01", freq="1H", periods=2), + index=date_range(start="2017-01-01", freq="1h", periods=2), columns=pd.MultiIndex.from_tuples( [("A", "a", "", "one"), ("B", "b", "i", "two")] ), @@ -418,14 +412,14 @@ def test_apply_to_one_column_of_df(): ) # access "col" via getattr -> make sure we handle AttributeError - result = df.resample("H").apply(lambda group: group.col.sum()) + result = df.resample("h").apply(lambda group: group.col.sum()) expected = Series( - [3, 12, 21, 9], 
index=date_range("2012-01-01", periods=4, freq="H") + [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="h") ) tm.assert_series_equal(result, expected) # access "col" via _getitem__ -> make sure we handle KeyErrpr - result = df.resample("H").apply(lambda group: group["col"].sum()) + result = df.resample("h").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) @@ -608,7 +602,7 @@ def test_groupby_resample_size_all_index_same(): # GH 46826 df = DataFrame( {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, - index=date_range("31/12/2000 18:00", freq="H", periods=12), + index=date_range("31/12/2000 18:00", freq="h", periods=12), ) msg = "DataFrameGroupBy.resample operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3e0922228cb74..e5593302625ec 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -24,7 +24,7 @@ def test_series(): def test_apply(test_series): - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="Y", label="right", closed="right") grouped = test_series.groupby(grouper) @@ -44,18 +44,18 @@ def test_count(test_series): expected = test_series.groupby(lambda x: x.year).count() - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="Y", label="right", closed="right") result = test_series.groupby(grouper).count() expected.index = result.index tm.assert_series_equal(result, expected) - result = test_series.resample("A").count() + result = test_series.resample("Y").count() expected.index = result.index tm.assert_series_equal(result, expected) def test_numpy_reduction(test_series): - result = test_series.resample("A", closed="right").prod() + result = test_series.resample("Y", closed="right").prod() msg = "using SeriesGroupBy.prod" with 
tm.assert_produces_warning(FutureWarning, match=msg): @@ -273,7 +273,7 @@ def test_aggregate_with_nat_size(): def test_repr(): # GH18203 - result = repr(Grouper(key="A", freq="H")) + result = repr(Grouper(key="A", freq="h")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', label='left', how='mean', " @@ -281,7 +281,7 @@ def test_repr(): ) assert result == expected - result = repr(Grouper(key="A", freq="H", origin="2000-01-01")) + result = repr(Grouper(key="A", freq="h", origin="2000-01-01")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', label='left', how='mean', " @@ -304,7 +304,7 @@ def test_repr(): ], ) def test_upsample_sum(method, method_args, expected_values): - s = Series(1, index=date_range("2017", periods=2, freq="H")) + s = Series(1, index=date_range("2017", periods=2, freq="h")) resampled = s.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 79b13673e70c6..606403ba56494 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -128,13 +128,13 @@ def test_resample_timedelta_values(): @pytest.mark.parametrize( "start, end, freq, resample_freq", [ - ("8H", "21h59min50s", "10s", "3H"), # GH 30353 example - ("3H", "22H", "1H", "5H"), + ("8h", "21h59min50s", "10s", "3h"), # GH 30353 example + ("3h", "22h", "1h", "5h"), ("527D", "5006D", "3D", "10D"), ("1D", "10D", "1D", "2D"), # GH 13022 example # tests that worked before GH 33498: - ("8H", "21h59min50s", "10s", "2H"), - ("0H", "21h59min50s", "10s", "3H"), + ("8h", "21h59min50s", "10s", "2h"), + ("0h", "21h59min50s", "10s", "3h"), ("10D", "85D", "D", "2D"), ], ) diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 386f363c81557..81ca227fb7afb 100644 --- 
a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -91,10 +91,10 @@ def test_append_length0_frame(self, sort): tm.assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1 = np.zeros((2,), dtype=("i4,f4,S10")) arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2 = np.zeros((3,), dtype=("i4,f4,S10")) arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5dde863f246d1..74c79d20a3fb3 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -30,8 +30,8 @@ class TestConcatenate: def test_append_concat(self): # GH#1815 - d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") - d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") + d1 = date_range("12/31/1990", "12/31/1999", freq="Y-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="Y-DEC") s1 = Series(np.random.default_rng(2).standard_normal(10), d1) s2 = Series(np.random.default_rng(2).standard_normal(10), d2) @@ -272,7 +272,7 @@ def test_concat_mixed_objs(self): # G2385 # axis 1 - index = date_range("01-Jan-2013", periods=10, freq="H") + index = date_range("01-Jan-2013", periods=10, freq="h") arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 12d28c388d508..51398acd6ec57 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -46,8 +46,8 @@ def test_concat_datetime_datetime64_frame(self): def test_concat_datetime_timezone(self): # GH 18523 - idx1 = date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") - idx2 = date_range(start=idx1[0], 
end=idx1[-1], freq="H") + idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris") + idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h") df1 = DataFrame({"a": [1, 2, 3]}, index=idx1) df2 = DataFrame({"b": [1, 2, 3]}, index=idx2) result = concat([df1, df2], axis=1) @@ -59,7 +59,7 @@ def test_concat_datetime_timezone(self): "2011-01-01 01:00:00+01:00", "2011-01-01 02:00:00+01:00", ], - freq="H", + freq="h", ) .tz_convert("UTC") .tz_convert("Europe/Paris") @@ -71,7 +71,7 @@ def test_concat_datetime_timezone(self): tm.assert_frame_equal(result, expected) - idx3 = date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) result = concat([df1, df3], axis=1) @@ -102,7 +102,7 @@ def test_concat_datetime_timezone(self): tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample - result = concat([df1.resample("H").mean(), df2.resample("H").mean()], sort=True) + result = concat([df1.resample("h").mean(), df2.resample("h").mean()], sort=True) expected = DataFrame( {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, index=idx1.append(idx1), diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 20daa388c2c88..c630ba6a43cb1 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -902,7 +902,7 @@ def test_join_inner_multiindex_deterministic_order(): result = left.join(right, how="inner") expected = DataFrame( {"e": [5], "f": [6]}, - index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), + index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")), ) tm.assert_frame_equal(result, expected) @@ -926,10 +926,16 @@ def test_join_multiindex_one_level(join_type): ) right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",))) result = left.join(right, how=join_type) - 
expected = DataFrame( - {"c": [3], "d": [4]}, - index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), - ) + if join_type == "right": + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), + ) + else: + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d889ae2e4806b..4d779349b5c14 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -365,7 +365,7 @@ def test_merge_join_key_dtype_cast(self): lkey = np.array([1]) rkey = np.array([2]) df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer") - assert df["key_0"].dtype == np.int_ + assert df["key_0"].dtype == np.dtype(int) def test_handle_join_key_pass_array(self): left = DataFrame( @@ -389,7 +389,7 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") - expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0") + expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=int, name="key_0") tm.assert_series_equal(merged["key_0"], expected) left = DataFrame({"value": np.arange(3)}) @@ -901,7 +901,7 @@ def test_merge_on_datetime64tz_empty(self): def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 df1 = DataFrame( - pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + pd.date_range("2017-10-29 01:00", periods=4, freq="h", tz="Europe/Madrid"), columns=["date"], ) df1["value"] = 1 @@ -922,7 +922,7 @@ def test_merge_datetime64tz_with_dst_transition(self): expected = DataFrame( { "date": pd.date_range( - "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + "2017-10-29 01:00", periods=7, freq="h", tz="Europe/Madrid" ), "value_x": [1] * 4 + [np.nan] * 3, "value_y": 
[np.nan] * 4 + [2] * 3, diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index ab010bdb909f1..c029acf0c8938 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -69,11 +69,6 @@ def on_cols_multi(): return ["Origin", "Destination", "Period"] -@pytest.fixture -def idx_cols_multi(): - return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] - - class TestMergeMulti: def test_merge_on_multikey(self, left, right, join_type): on_cols = ["key1", "key2"] @@ -815,9 +810,13 @@ def test_join_multi_levels2(self): class TestJoinMultiMulti: - def test_join_multi_multi( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi - ): + def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi): + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) # Multi-index join tests expected = ( merge( @@ -826,7 +825,7 @@ def test_join_multi_multi( how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) @@ -834,11 +833,18 @@ def test_join_multi_multi( tm.assert_frame_equal(result, expected) def test_join_multi_empty_frames( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + self, left_multi, right_multi, join_type, on_cols_multi ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) + expected = ( merge( left_multi.reset_index(), @@ -846,7 +852,7 @@ def test_join_multi_empty_frames( 
how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 941478066a7d8..ef748e264188c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -459,6 +459,47 @@ def test_melt_ea_columns(self): ) tm.assert_frame_equal(result, expected) + def test_melt_preserves_datetime(self): + df = DataFrame( + data=[ + { + "type": "A0", + "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"), + "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"), + }, + { + "type": "A1", + "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"), + "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"), + }, + ], + index=["aaaa", "bbbb"], + ) + result = df.melt( + id_vars=["type"], + value_vars=["start_date", "end_date"], + var_name="start/end", + value_name="date", + ) + expected = DataFrame( + { + "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"}, + "start/end": { + 0: "start_date", + 1: "start_date", + 2: "end_date", + 3: "end_date", + }, + "date": { + 0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"), + 1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"), + 2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"), + 3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"), + }, + } + ) + tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8435f4a189c56..2d41b6d355ead 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -445,10 +445,10 @@ def test_pivot_no_values(self): tm.assert_frame_equal(res, exp) res = df.pivot_table( - index=Grouper(freq="A"), columns=Grouper(key="dt", freq="ME") + index=Grouper(freq="Y"), columns=Grouper(key="dt", freq="ME") ) exp = DataFrame( - [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), 
columns=exp_columns + [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="Y"), columns=exp_columns ) tm.assert_frame_equal(res, exp) @@ -1273,7 +1273,7 @@ def test_pivot_timegrouper(self, using_array_manager): expected = DataFrame( np.array([10, 18, 3], dtype="int64").reshape(1, 3), - index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A"), + index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="Y"), columns="Carl Joe Mark".split(), ) expected.index.name = "Date" @@ -1281,7 +1281,7 @@ def test_pivot_timegrouper(self, using_array_manager): result = pivot_table( df, - index=Grouper(freq="A"), + index=Grouper(freq="Y"), columns="Buyer", values="Quantity", aggfunc="sum", @@ -1291,7 +1291,7 @@ def test_pivot_timegrouper(self, using_array_manager): result = pivot_table( df, index="Buyer", - columns=Grouper(freq="A"), + columns=Grouper(freq="Y"), values="Quantity", aggfunc="sum", ) diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index a02dbf0a0413f..4841c488a5768 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -80,8 +80,8 @@ def test_hash(self, interval): (-np.inf, np.inf, np.inf), (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), - (Timedelta("1H10min"), Timedelta("5H5min"), Timedelta("3H55min")), - (Timedelta("5s"), Timedelta("1H"), Timedelta("59min55s")), + (Timedelta("1h10min"), Timedelta("5h5min"), Timedelta("3h55min")), + (Timedelta("5s"), Timedelta("1h"), Timedelta("59min55s")), ], ) def test_length(self, left, right, expected): diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 00285148a3c90..597282e10052e 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -18,7 +18,7 @@ class TestFreqConversion: """Test frequency conversion of date objects""" 
@pytest.mark.filterwarnings("ignore:Period with BDay:FutureWarning") - @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) + @pytest.mark.parametrize("freq", ["Y", "Q", "M", "W", "B", "D"]) def test_asfreq_near_zero(self, freq): # GH#19643, GH#19650 per = Period("0001-01-01", freq=freq) @@ -49,7 +49,7 @@ def test_to_timestamp_out_of_bounds(self): per.to_timestamp() def test_asfreq_corner(self): - val = Period(freq="A", year=2007) + val = Period(freq="Y", year=2007) result1 = val.asfreq("5min") result2 = val.asfreq("min") expected = Period("2007-12-31 23:59", freq="min") @@ -61,11 +61,11 @@ def test_asfreq_corner(self): def test_conv_annual(self): # frequency conversion tests: from Annual Frequency - ival_A = Period(freq="A", year=2007) + ival_A = Period(freq="Y", year=2007) - ival_AJAN = Period(freq="A-JAN", year=2007) - ival_AJUN = Period(freq="A-JUN", year=2007) - ival_ANOV = Period(freq="A-NOV", year=2007) + ival_AJAN = Period(freq="Y-JAN", year=2007) + ival_AJUN = Period(freq="Y-JUN", year=2007) + ival_ANOV = Period(freq="Y-NOV", year=2007) ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) @@ -78,8 +78,8 @@ def test_conv_annual(self): ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="h", year=2007, month=12, day=31, hour=23) ival_A_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -111,8 +111,10 @@ def test_conv_annual(self): assert ival_A.asfreq("B", "E") == ival_A_to_B_end assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == 
ival_A_to_D_end - assert ival_A.asfreq("H", "s") == ival_A_to_H_start - assert ival_A.asfreq("H", "E") == ival_A_to_H_end + msg = "'H' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("H", "s") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end msg = "'T' is deprecated and will be removed in a future version." @@ -133,7 +135,7 @@ def test_conv_annual(self): assert ival_ANOV.asfreq("D", "s") == ival_ANOV_to_D_start assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end - assert ival_A.asfreq("A") == ival_A + assert ival_A.asfreq("Y") == ival_A def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency @@ -144,7 +146,7 @@ def test_conv_quarterly(self): ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_A = Period(freq="Y", year=2007) ival_Q_to_M_start = Period(freq="M", year=2007, month=1) ival_Q_to_M_end = Period(freq="M", year=2007, month=3) ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) @@ -154,8 +156,8 @@ def test_conv_quarterly(self): ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="h", year=2007, month=3, day=31, hour=23) ival_Q_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -175,8 +177,8 @@ def test_conv_quarterly(self): ival_QEJUN_to_D_start = Period(freq="D", 
year=2006, month=7, day=1) ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) - assert ival_Q.asfreq("A") == ival_Q_to_A - assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + assert ival_Q.asfreq("Y") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("Y") == ival_Q_to_A assert ival_Q.asfreq("M", "s") == ival_Q_to_M_start assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end @@ -187,8 +189,8 @@ def test_conv_quarterly(self): assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end assert ival_Q.asfreq("D", "s") == ival_Q_to_D_start assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end - assert ival_Q.asfreq("H", "s") == ival_Q_to_H_start - assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("h", "s") == ival_Q_to_H_start + assert ival_Q.asfreq("h", "E") == ival_Q_to_H_end assert ival_Q.asfreq("Min", "s") == ival_Q_to_T_start assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end assert ival_Q.asfreq("s", "s") == ival_Q_to_S_start @@ -207,7 +209,7 @@ def test_conv_monthly(self): ival_M = Period(freq="M", year=2007, month=1) ival_M_end_of_year = Period(freq="M", year=2007, month=12) ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) - ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_A = Period(freq="Y", year=2007) ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) @@ -216,8 +218,8 @@ def test_conv_monthly(self): ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="h", year=2007, month=1, day=31, hour=23) 
ival_M_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -231,8 +233,8 @@ def test_conv_monthly(self): freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) - assert ival_M.asfreq("A") == ival_M_to_A - assert ival_M_end_of_year.asfreq("A") == ival_M_to_A + assert ival_M.asfreq("Y") == ival_M_to_A + assert ival_M_end_of_year.asfreq("Y") == ival_M_to_A assert ival_M.asfreq("Q") == ival_M_to_Q assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q @@ -243,8 +245,8 @@ def test_conv_monthly(self): assert ival_M.asfreq("B", "E") == ival_M_to_B_end assert ival_M.asfreq("D", "s") == ival_M_to_D_start assert ival_M.asfreq("D", "E") == ival_M_to_D_end - assert ival_M.asfreq("H", "s") == ival_M_to_H_start - assert ival_M.asfreq("H", "E") == ival_M_to_H_end + assert ival_M.asfreq("h", "s") == ival_M_to_H_start + assert ival_M.asfreq("h", "E") == ival_M_to_H_end assert ival_M.asfreq("Min", "s") == ival_M_to_T_start assert ival_M.asfreq("Min", "E") == ival_M_to_T_end assert ival_M.asfreq("s", "s") == ival_M_to_S_start @@ -282,14 +284,14 @@ def test_conv_weekly(self): ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) - ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_A = Period(freq="Y", year=2007) ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) ival_W_to_M = Period(freq="M", year=2007, month=1) if Period(freq="D", year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq="A", year=2007) + ival_W_to_A_end_of_year = Period(freq="Y", year=2007) else: - ival_W_to_A_end_of_year = Period(freq="A", year=2008) + ival_W_to_A_end_of_year = Period(freq="Y", year=2008) if Period(freq="D", year=2007, month=3, day=31).weekday == 6: ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) @@ -306,8 +308,8 @@ def test_conv_weekly(self): 
ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="h", year=2007, month=1, day=7, hour=23) ival_W_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -321,8 +323,8 @@ def test_conv_weekly(self): freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) - assert ival_W.asfreq("A") == ival_W_to_A - assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + assert ival_W.asfreq("Y") == ival_W_to_A + assert ival_W_end_of_year.asfreq("Y") == ival_W_to_A_end_of_year assert ival_W.asfreq("Q") == ival_W_to_Q assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter @@ -352,8 +354,8 @@ def test_conv_weekly(self): assert ival_WMON.asfreq("D", "s") == ival_WMON_to_D_start assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end - assert ival_W.asfreq("H", "s") == ival_W_to_H_start - assert ival_W.asfreq("H", "E") == ival_W_to_H_end + assert ival_W.asfreq("h", "s") == ival_W_to_H_start + assert ival_W.asfreq("h", "E") == ival_W_to_H_end assert ival_W.asfreq("Min", "s") == ival_W_to_T_start assert ival_W.asfreq("Min", "E") == ival_W_to_T_end assert ival_W.asfreq("s", "s") == ival_W_to_S_start @@ -394,13 +396,13 @@ def test_conv_business(self): ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) - ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_A = Period(freq="Y", year=2007) ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) ival_B_to_M = Period(freq="M", year=2007, month=1) ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) ival_B_to_D = 
Period(freq="D", year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_B_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -414,8 +416,8 @@ def test_conv_business(self): freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_B.asfreq("A") == ival_B_to_A - assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Y") == ival_B_to_A + assert ival_B_end_of_year.asfreq("Y") == ival_B_to_A assert ival_B.asfreq("Q") == ival_B_to_Q assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q assert ival_B.asfreq("M") == ival_B_to_M @@ -425,8 +427,8 @@ def test_conv_business(self): assert ival_B.asfreq("D") == ival_B_to_D - assert ival_B.asfreq("H", "s") == ival_B_to_H_start - assert ival_B.asfreq("H", "E") == ival_B_to_H_end + assert ival_B.asfreq("h", "s") == ival_B_to_H_start + assert ival_B.asfreq("h", "E") == ival_B_to_H_end assert ival_B.asfreq("Min", "s") == ival_B_to_T_start assert ival_B.asfreq("Min", "E") == ival_B_to_T_end assert ival_B.asfreq("s", "s") == ival_B_to_S_start @@ -452,11 +454,11 @@ def test_conv_daily(self): ival_B_friday = Period(freq="B", year=2007, month=1, day=5) ival_B_monday = Period(freq="B", year=2007, month=1, day=8) - ival_D_to_A = Period(freq="A", year=2007) + ival_D_to_A = Period(freq="Y", year=2007) - ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) - ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) - ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) + ival_Deoq_to_AJAN = Period(freq="Y-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="Y-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="Y-DEC", year=2007) ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) ival_D_to_QEJUN = 
Period(freq="Q-JUN", year=2007, quarter=3) @@ -465,8 +467,8 @@ def test_conv_daily(self): ival_D_to_M = Period(freq="M", year=2007, month=1) ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) - ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_D_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -480,13 +482,13 @@ def test_conv_daily(self): freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_D.asfreq("A") == ival_D_to_A + assert ival_D.asfreq("Y") == ival_D_to_A - assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN - assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN - assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + assert ival_D_end_of_quarter.asfreq("Y-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("Y-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("Y-DEC") == ival_Deoq_to_ADEC - assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_year.asfreq("Y") == ival_D_to_A assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN @@ -503,8 +505,8 @@ def test_conv_daily(self): assert ival_D_sunday.asfreq("B", "s") == ival_B_friday assert ival_D_sunday.asfreq("B", "E") == ival_B_monday - assert ival_D.asfreq("H", "s") == ival_D_to_H_start - assert ival_D.asfreq("H", "E") == ival_D_to_H_end + assert ival_D.asfreq("h", "s") == ival_D_to_H_start + assert ival_D.asfreq("h", "E") == ival_D_to_H_end assert ival_D.asfreq("Min", "s") == ival_D_to_T_start assert ival_D.asfreq("Min", "E") == ival_D_to_T_end assert ival_D.asfreq("s", "s") == ival_D_to_S_start @@ -515,15 
+517,15 @@ def test_conv_daily(self): def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" - ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq="H", year=2007, month=12, day=31, hour=23) - ival_H_end_of_quarter = Period(freq="H", year=2007, month=3, day=31, hour=23) - ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) - ival_H_end_of_week = Period(freq="H", year=2007, month=1, day=7, hour=23) - ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) - ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="h", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = Period(freq="h", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="h", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="h", year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="h", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="h", year=2007, month=1, day=1, hour=23) - ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_A = Period(freq="Y", year=2007) ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) ival_H_to_M = Period(freq="M", year=2007, month=1) ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) @@ -544,8 +546,8 @@ def test_conv_hourly(self): freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) - assert ival_H.asfreq("A") == ival_H_to_A - assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Y") == ival_H_to_A + assert ival_H_end_of_year.asfreq("Y") == ival_H_to_A assert ival_H.asfreq("Q") == ival_H_to_Q assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q assert ival_H.asfreq("M") == ival_H_to_M @@ -563,7 +565,7 @@ def test_conv_hourly(self): assert ival_H.asfreq("s", "s") == ival_H_to_S_start assert ival_H.asfreq("s", "E") 
== ival_H_to_S_end - assert ival_H.asfreq("H") == ival_H + assert ival_H.asfreq("h") == ival_H def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" @@ -591,14 +593,14 @@ def test_conv_minutely(self): freq="Min", year=2007, month=1, day=1, hour=0, minute=59 ) - ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_A = Period(freq="Y", year=2007) ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) ival_T_to_M = Period(freq="M", year=2007, month=1) ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_T_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_T_to_S_start = Period( freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 @@ -607,8 +609,8 @@ def test_conv_minutely(self): freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) - assert ival_T.asfreq("A") == ival_T_to_A - assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Y") == ival_T_to_A + assert ival_T_end_of_year.asfreq("Y") == ival_T_to_A assert ival_T.asfreq("Q") == ival_T_to_Q assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q assert ival_T.asfreq("M") == ival_T_to_M @@ -620,8 +622,8 @@ def test_conv_minutely(self): with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_T.asfreq("B") == ival_T_to_B assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B - assert ival_T.asfreq("H") == ival_T_to_H - assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + assert ival_T.asfreq("h") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("h") == ival_T_to_H assert ival_T.asfreq("s", "s") == ival_T_to_S_start assert ival_T.asfreq("s", "E") == ival_T_to_S_end @@ -657,18 +659,18 @@ def test_conv_secondly(self): freq="s", year=2007, month=1, day=1, 
hour=0, minute=0, second=59 ) - ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_A = Period(freq="Y", year=2007) ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) ival_S_to_M = Period(freq="M", year=2007, month=1) ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - assert ival_S.asfreq("A") == ival_S_to_A - assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Y") == ival_S_to_A + assert ival_S_end_of_year.asfreq("Y") == ival_S_to_A assert ival_S.asfreq("Q") == ival_S_to_Q assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q assert ival_S.asfreq("M") == ival_S_to_M @@ -680,8 +682,8 @@ def test_conv_secondly(self): with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_S.asfreq("B") == ival_S_to_B assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B - assert ival_S.asfreq("H") == ival_S_to_H - assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("h") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("h") == ival_S_to_H assert ival_S.asfreq("Min") == ival_S_to_T assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T @@ -707,44 +709,44 @@ def test_conv_microsecond(self): def test_asfreq_mult(self): # normal freq to mult freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq) - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == 
expected.freq # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq, how="S") - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # mult freq to normal freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) # ordinal will change because how=E is the default - for freq in ["A", offsets.YearEnd()]: + for freq in ["Y", offsets.YearEnd()]: result = p.asfreq(freq) - expected = Period("2009", freq="A") + expected = Period("2009", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ["A", offsets.YearEnd()]: + for freq in ["Y", offsets.YearEnd()]: result = p.asfreq(freq, how="s") - expected = Period("2007", freq="A") + expected = Period("2007", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2007-12", freq="2M") @@ -760,7 +762,7 @@ def test_asfreq_mult(self): assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2009-12", freq="2M") @@ -778,24 +780,24 @@ def test_asfreq_mult(self): def test_asfreq_combined(self): # normal freq to combined freq - p = Period("2007", freq="H") + p = Period("2007", freq="h") # ordinal will not change - expected = Period("2007", freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["E", "S"]): + expected = Period("2007", freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["E", "S"]): result = p.asfreq(freq, how=how) assert result 
== expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # combined freq to normal freq - p1 = Period(freq="1D1H", year=2007) - p2 = Period(freq="1H1D", year=2007) + p1 = Period(freq="1D1h", year=2007) + p2 = Period(freq="1h1D", year=2007) # ordinal will change because how=E is the default - result1 = p1.asfreq("H") - result2 = p2.asfreq("H") - expected = Period("2007-01-02", freq="H") + result1 = p1.asfreq("h") + result2 = p2.asfreq("h") + expected = Period("2007-01-02", freq="h") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -804,9 +806,9 @@ def test_asfreq_combined(self): assert result2.freq == expected.freq # ordinal will not change - result1 = p1.asfreq("H", how="S") - result2 = p2.asfreq("H", how="S") - expected = Period("2007-01-01", freq="H") + result1 = p1.asfreq("h", how="S") + result2 = p2.asfreq("h", how="S") + expected = Period("2007-01-01", freq="h") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 7d07f327e3978..6c27881e44b56 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -61,9 +61,9 @@ def test_construction(self): assert i1 == i2 - i1 = Period("2005", freq="A") + i1 = Period("2005", freq="Y") i2 = Period("2005") - i3 = Period("2005", freq="a") + i3 = Period("2005", freq="y") assert i1 == i2 assert i1 == i3 @@ -224,7 +224,7 @@ def test_period_constructor_offsets(self): assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( "1/1/2005", freq="M" ) - assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="Y") assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") with tm.assert_produces_warning(FutureWarning, match=bday_msg): 
assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( @@ -315,13 +315,13 @@ def test_invalid_arguments(self): msg = '^Given date string "-2000" not likely a datetime$' with pytest.raises(ValueError, match=msg): - Period("-2000", "A") + Period("-2000", "Y") msg = "day is out of range for month" with pytest.raises(DateParseError, match=msg): - Period("0", "A") + Period("0", "Y") msg = "Unknown datetime string format, unable to parse" with pytest.raises(DateParseError, match=msg): - Period("1/1/-2000", "A") + Period("1/1/-2000", "Y") def test_constructor_corner(self): expected = Period("2007-01", freq="2M") @@ -331,8 +331,8 @@ def test_constructor_corner(self): p = Period("2007-01-01", freq="D") - result = Period(p, freq="A") - exp = Period("2007", freq="A") + result = Period(p, freq="Y") + exp = Period("2007", freq="Y") assert result == exp def test_constructor_infer_freq(self): @@ -340,7 +340,7 @@ def test_constructor_infer_freq(self): assert p.freq == "D" p = Period("2007-01-01 07") - assert p.freq == "H" + assert p.freq == "h" p = Period("2007-01-01 07:10") assert p.freq == "min" @@ -360,11 +360,11 @@ def test_constructor_infer_freq(self): assert p.freq == "us" def test_multiples(self): - result1 = Period("1989", freq="2A") - result2 = Period("1989", freq="A") + result1 = Period("1989", freq="2Y") + result2 = Period("1989", freq="Y") assert result1.ordinal == result2.ordinal - assert result1.freqstr == "2A-DEC" - assert result2.freqstr == "A-DEC" + assert result1.freqstr == "2Y-DEC" + assert result2.freqstr == "Y-DEC" assert result1.freq == offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() @@ -390,7 +390,7 @@ def test_period_cons_quarterly(self, month): @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = f"A-{month}" + freq = f"Y-{month}" exp = Period("1989", freq=freq) stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) @@ -428,7 +428,7 
@@ def test_period_from_ordinal(self): assert p == res assert isinstance(res, Period) - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_construct_from_nat_string_and_freq(self, freq): per = Period("NaT", freq=freq) assert per is NaT @@ -449,7 +449,7 @@ def test_period_cons_nat(self): p = Period(iNaT, freq="3D") assert p is NaT - p = Period(iNaT, freq="1D1H") + p = Period(iNaT, freq="1D1h") assert p is NaT p = Period("NaT") @@ -491,14 +491,14 @@ def test_period_cons_mult(self): def test_period_cons_combined(self): p = [ ( - Period("2011-01", freq="1D1H"), - Period("2011-01", freq="1H1D"), - Period("2011-01", freq="H"), + Period("2011-01", freq="1D1h"), + Period("2011-01", freq="1h1D"), + Period("2011-01", freq="h"), ), ( - Period(ordinal=1, freq="1D1H"), - Period(ordinal=1, freq="1H1D"), - Period(ordinal=1, freq="H"), + Period(ordinal=1, freq="1D1h"), + Period(ordinal=1, freq="1h1D"), + Period(ordinal=1, freq="h"), ), ] @@ -507,49 +507,49 @@ def test_period_cons_combined(self): assert p2.ordinal == p3.ordinal assert p1.freq == offsets.Hour(25) - assert p1.freqstr == "25H" + assert p1.freqstr == "25h" assert p2.freq == offsets.Hour(25) - assert p2.freqstr == "25H" + assert p2.freqstr == "25h" assert p3.freq == offsets.Hour() - assert p3.freqstr == "H" + assert p3.freqstr == "h" result = p1 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p1 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" - msg = 
"Frequency must be positive, because it represents span: -25H" + msg = "Frequency must be positive, because it represents span: -25h" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1D1H") + Period("2011-01", freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1H1D") + Period("2011-01", freq="-1h1D") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1D1H") + Period(ordinal=1, freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1H1D") + Period(ordinal=1, freq="-1h1D") msg = "Frequency must be positive, because it represents span: 0D" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="0D0H") + Period("2011-01", freq="0D0h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="0D0H") + Period(ordinal=1, freq="0D0h") # You can only combine together day and intraday offsets msg = "Invalid frequency: 1W1D" @@ -584,7 +584,7 @@ def test_period_constructor_nanosecond(self, day, hour, sec_float, expected): def test_period_large_ordinal(self, hour): # Issue #36430 # Integer overflow for Period over the maximum timestamp - p = Period(ordinal=2562048 + hour, freq="1H") + p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour @@ -621,7 +621,7 @@ def test_to_timestamp_mult(self): "ignore:Period with BDay freq is deprecated:FutureWarning" ) def test_to_timestamp(self): - p = Period("1982", freq="A") + p = Period("1982", freq="Y") start_ts = p.to_timestamp(how="S") aliases = ["s", "StarT", "BEGIn"] for a in aliases: @@ -635,7 +635,7 @@ def test_to_timestamp(self): assert end_ts == p.to_timestamp("D", how=a) assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "s"] + from_lst = ["Y", "Q", "M", "W", "B", "D", "h", "Min", "s"] def _ex(p): if p.freq == "B": @@ -653,12 +653,12 @@ def _ex(p): # Frequency other than daily - p = Period("1985", freq="A") + p = Period("1985", freq="Y") - result 
= p.to_timestamp("H", how="end") + result = p.to_timestamp("h", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp("3H", how="end") + result = p.to_timestamp("3h", how="end") assert result == expected result = p.to_timestamp("min", how="end") @@ -672,13 +672,13 @@ def _ex(p): assert result == expected expected = datetime(1985, 1, 1) - result = p.to_timestamp("H", how="start") + result = p.to_timestamp("h", how="start") assert result == expected result = p.to_timestamp("min", how="start") assert result == expected result = p.to_timestamp("s", how="start") assert result == expected - result = p.to_timestamp("3H", how="start") + result = p.to_timestamp("3h", how="start") assert result == expected result = p.to_timestamp("5s", how="start") assert result == expected @@ -731,8 +731,8 @@ def test_to_timestamp_microsecond(self, ts, expected, freq): ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "ms"), ("2000-12-15 13:45:26", "s", "2000-12-15 13:45:26", "s"), ("2000-12-15 13:45:26", "min", "2000-12-15 13:45", "min"), - ("2000-12-15 13:45:26", "H", "2000-12-15 13:00", "H"), - ("2000-12-15", "Y", "2000", "A-DEC"), + ("2000-12-15 13:45:26", "h", "2000-12-15 13:00", "h"), + ("2000-12-15", "Y", "2000", "Y-DEC"), ("2000-12-15", "Q", "2000Q4", "Q-DEC"), ("2000-12-15", "M", "2000-12", "M"), ("2000-12-15", "W", "2000-12-11/2000-12-17", "W-SUN"), @@ -763,7 +763,7 @@ def test_strftime(self): class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_is_leap_year(self, freq): # GH 13727 p = Period("2000-01-01 00:00:00", freq=freq) @@ -808,7 +808,7 @@ def test_period_deprecated_freq(self): "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], 
- "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "h": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], "min": ["minute", "MINUTE", "MINUTELY", "minutely"], "s": ["sec", "SEC", "SECOND", "SECONDLY", "second"], "ms": ["MILLISECOND", "MILLISECONDLY", "millisecond"], @@ -861,7 +861,7 @@ def test_inner_bounds_start_and_end_time(self, bound, offset, period_property): assert getattr(period, period_property).floor("s") == expected def test_start_time(self): - freq_lst = ["A", "Q", "M", "D", "H", "min", "s"] + freq_lst = ["Y", "Q", "M", "D", "h", "min", "s"] xp = datetime(2012, 1, 1) for f in freq_lst: p = Period("2012", freq=f) @@ -871,7 +871,7 @@ def test_start_time(self): assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) def test_end_time(self): - p = Period("2012", freq="A") + p = Period("2012", freq="Y") def _ex(*args): return Timestamp(Timestamp(datetime(*args)).as_unit("ns")._value - 1) @@ -891,7 +891,7 @@ def _ex(*args): xp = _ex(2012, 1, 2) assert xp == p.end_time - p = Period("2012", freq="H") + p = Period("2012", freq="h") xp = _ex(2012, 1, 1, 1) assert xp == p.end_time @@ -909,11 +909,11 @@ def _ex(*args): xp = _ex(2012, 1, 16) assert xp == p.end_time - p = Period("2012", freq="1D1H") + p = Period("2012", freq="1D1h") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time - p = Period("2012", freq="1H1D") + p = Period("2012", freq="1h1D") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time @@ -936,7 +936,7 @@ def _ex(*args): def test_properties_annually(self): # Test properties on Periods with annually frequency. - a_date = Period(freq="A", year=2007) + a_date = Period(freq="Y", year=2007) assert a_date.year == 2007 def test_properties_quarterly(self): @@ -1023,8 +1023,8 @@ def test_properties_daily(self): def test_properties_hourly(self): # Test properties on Periods with hourly frequency. 
- h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) + h_date1 = Period(freq="h", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2h", year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: assert h_date.year == 2007 @@ -1036,7 +1036,7 @@ def test_properties_hourly(self): assert h_date.hour == 0 assert h_date.days_in_month == 31 assert ( - Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + Period(freq="h", year=2012, month=2, day=1, hour=0).days_in_month == 29 ) def test_properties_minutely(self): @@ -1196,11 +1196,11 @@ def test_add_sub_td64_nat(self, unit): nat - per def test_sub_delta(self): - left, right = Period("2011", freq="A"), Period("2007", freq="A") + left, right = Period("2011", freq="Y"), Period("2007", freq="Y") result = left - right assert result == 4 * right.freq - msg = r"Input has different freq=M from Period\(freq=A-DEC\)" + msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): left - Period("2007-01", freq="M") @@ -1316,7 +1316,7 @@ def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): def test_add_offset(self): # freq is DateOffset - for freq in ["A", "2A", "3A"]: + for freq in ["Y", "2Y", "3Y"]: p = Period("2011", freq=freq) exp = Period("2013", freq=freq) assert p + offsets.YearEnd(2) == exp @@ -1411,7 +1411,7 @@ def test_add_offset(self): with pytest.raises(IncompatibleFrequency, match=msg): o + p - for freq in ["H", "2H", "3H"]: + for freq in ["h", "2h", "3h"]: p = Period("2011-04-01 09:00", freq=freq) exp = Period("2011-04-03 09:00", freq=freq) @@ -1467,7 +1467,7 @@ def test_sub_offset(self): ] ) - for freq in ["A", "2A", "3A"]: + for freq in ["Y", "2Y", "3Y"]: p = Period("2011", freq=freq) assert p - offsets.YearEnd(2) == Period("2009", freq=freq) @@ -1516,7 +1516,7 @@ def test_sub_offset(self): with pytest.raises(IncompatibleFrequency, 
match=msg): p - o - for freq in ["H", "2H", "3H"]: + for freq in ["h", "2h", "3h"]: p = Period("2011-04-01 09:00", freq=freq) assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) @@ -1589,7 +1589,7 @@ def test_small_year_parsing(): def test_negone_ordinals(): - freqs = ["A", "M", "Q", "D", "H", "min", "s"] + freqs = ["Y", "M", "Q", "D", "h", "min", "s"] period = Period(ordinal=-1, freq="D") for freq in freqs: diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 287b7557f50f9..44ce5c79db348 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -9,6 +9,7 @@ import pytest from pandas._libs.missing import NA +from pandas.compat.numpy import np_long from pandas.core.dtypes.common import is_scalar @@ -102,9 +103,9 @@ def test_comparison_ops(comparison_op, other): -0.0, False, np.bool_(False), - np.int_(0), + np_long(0), np.float64(0), - np.int_(-0), + np_long(-0), np.float64(-0), ], ) @@ -123,7 +124,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float64(1)] + "value", [1, 1.0, True, np.bool_(True), np_long(1), np.float64(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -133,14 +134,14 @@ def test_rpow_special(value, asarray): if asarray: result = result[0] - elif not isinstance(value, (np.float64, np.bool_, np.int_)): + elif not isinstance(value, (np.float64, np.bool_, np_long)): # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) assert result == value -@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float64(-1)]) +@pytest.mark.parametrize("value", [-1, -1.0, np_long(-1), np.float64(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: diff --git 
a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 7bd9e5fc5e293..0d876fbb9bde8 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -208,8 +208,8 @@ def test_construction(): assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2) # GH#11995: unicode - expected = Timedelta("1H") - result = Timedelta("1H") + expected = Timedelta("1h") + result = Timedelta("1h") assert result == expected assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index b1483342eb6e4..ef780a090c12d 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -673,7 +673,7 @@ def test_to_numpy_alias(self): ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), - ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("h", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), ("d", Timedelta("1 days"), Timedelta("-1 days")), ], ) @@ -991,7 +991,7 @@ def test_total_seconds_precision(self): def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" - assert Timedelta(days=1, hours=6).resolution_string == "H" + assert Timedelta(days=1, hours=6).resolution_string == "h" assert Timedelta(days=1, minutes=6).resolution_string == "min" assert Timedelta(days=1, seconds=6).resolution_string == "s" assert Timedelta(days=1, milliseconds=6).resolution_string == "ms" @@ -1043,6 +1043,7 @@ def test_timedelta_attribute_precision(): @pytest.mark.parametrize( "unit,unit_depr", [ + ("h", "H"), ("min", "T"), ("s", "S"), ("ms", "L"), @@ -1050,8 +1051,8 @@ def test_timedelta_attribute_precision(): 
("us", "U"), ], ) -def test_units_t_l_u_n_deprecated(unit, unit_depr): - # GH 52536 +def test_units_H_T_S_L_N_U_deprecated(unit, unit_depr): + # GH#52536 msg = f"'{unit_depr}' is deprecated and will be removed in a future version." expected = Timedelta(1, unit=unit) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index e501bd93bc1c6..b7d5bbe71269a 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -45,7 +45,7 @@ def test_round_division_by_zero_raises(self): ("20130201 12:00:00", "D", "20130202"), ("20130104 12:00:00", "D", "20130105"), ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), - ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "h", "2000-01-05 05:00:00"), ("2000-01-05 05:09:15.13", "s", "2000-01-05 05:09:15"), ], ) @@ -141,7 +141,7 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): ("2018-01-01 00:04:00", "4min", "2018-01-01 00:04:00"), ("2018-01-01 00:15:00", "15min", "2018-01-01 00:15:00"), ("2018-01-01 00:20:00", "20min", "2018-01-01 00:20:00"), - ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), + ("2018-01-01 03:00:00", "3h", "2018-01-01 03:00:00"), ], ) @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) @@ -181,30 +181,30 @@ def test_round_dst_border_ambiguous(self, method, unit): ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") ts = ts.as_unit(unit) # - result = getattr(ts, method)("H", ambiguous=True) + result = getattr(ts, method)("h", ambiguous=True) assert result == ts assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - result = getattr(ts, method)("H", ambiguous=False) + result = getattr(ts, method)("h", ambiguous=False) expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( "Europe/Madrid" ) assert result == expected assert result._creso == getattr(NpyDatetimeUnit, 
f"NPY_FR_{unit}").value - result = getattr(ts, method)("H", ambiguous="NaT") + result = getattr(ts, method)("h", ambiguous="NaT") assert result is NaT msg = "Cannot infer dst time" with pytest.raises(pytz.AmbiguousTimeError, match=msg): - getattr(ts, method)("H", ambiguous="raise") + getattr(ts, method)("h", ambiguous="raise") @pytest.mark.parametrize( "method, ts_str, freq", [ ["ceil", "2018-03-11 01:59:00-0600", "5min"], ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], + ["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) @pytest.mark.parametrize( diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index fba16749c026e..d7d33ae058af8 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -345,30 +345,30 @@ def test_dt_round_tz_ambiguous(self, method): ) df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") # infer - result = getattr(df1.date.dt, method)("H", ambiguous="infer") + result = getattr(df1.date.dt, method)("h", ambiguous="infer") expected = df1["date"] tm.assert_series_equal(result, expected) # bool-array - result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) + result = getattr(df1.date.dt, method)("h", ambiguous=[True, False, False]) tm.assert_series_equal(result, expected) # NaT - result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + result = getattr(df1.date.dt, method)("h", ambiguous="NaT") expected = df1["date"].copy() expected.iloc[0:2] = pd.NaT tm.assert_series_equal(result, expected) # raise with tm.external_error_raised(pytz.AmbiguousTimeError): - getattr(df1.date.dt, method)("H", ambiguous="raise") + getattr(df1.date.dt, method)("h", ambiguous="raise") @pytest.mark.parametrize( "method, ts_str, freq", [ ["ceil", "2018-03-11 01:59:00-0600", "5min"], ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], + 
["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): @@ -598,7 +598,7 @@ def test_strftime_dt64_microsecond_resolution(self): tm.assert_series_equal(result, expected) def test_strftime_period_hours(self): - ser = Series(period_range("20130101", periods=4, freq="H")) + ser = Series(period_range("20130101", periods=4, freq="h")) result = ser.dt.strftime("%Y/%m/%d %H:%M:%S") expected = Series( [ @@ -776,8 +776,8 @@ class TestSeriesPeriodValuesDtAccessor: [Period("2016-01", freq="M"), Period("2016-02", freq="M")], [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], [ - Period("2016-01-01 00:00:00", freq="H"), - Period("2016-01-01 01:00:00", freq="H"), + Period("2016-01-01 00:00:00", freq="h"), + Period("2016-01-01 01:00:00", freq="h"), ], [ Period("2016-01-01 00:00:00", freq="M"), diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 317967bcbb7ff..84cf80fa1ffce 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -76,7 +76,7 @@ def test_getitem_setitem_datetime_tz(tz_source): N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz=tzget("US/Eastern")) + rng = date_range("1/1/1990", periods=N, freq="h", tz=tzget("US/Eastern")) ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -107,7 +107,7 @@ def test_getitem_setitem_datetime_tz(tz_source): def test_getitem_setitem_datetimeindex(): N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") + rng = date_range("1/1/1990", periods=N, freq="h", tz="US/Eastern") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04:00:00"] @@ -213,7 +213,7 @@ def test_getitem_setitem_datetimeindex(): def test_getitem_setitem_periodindex(): N = 50 - rng = 
period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04"] @@ -450,7 +450,7 @@ def test_indexing(): def test_getitem_str_month_with_datetimeindex(): # GH3546 (not including times on the last day) - idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="h") ts = Series(range(len(idx)), index=idx) expected = ts["2013-05"] tm.assert_series_equal(expected, ts) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 458988491aae8..479e74703bc0e 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -137,7 +137,7 @@ def test_getitem_pydatetime_tz(self, tzstr): tz = timezones.maybe_get_tz(tzstr) index = date_range( - start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="h", tz=tzstr ) ts = Series(index=index, data=index.hour) time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 0fa28920d41bd..a52d87b1a0457 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -327,7 +327,7 @@ def test_multilevel_preserve_name(lexsorted_two_level_string_multiindex, indexer [ date_range("2014-01-01", periods=20, freq="MS"), period_range("2014-01", periods=20, freq="M"), - timedelta_range("0", periods=20, freq="H"), + timedelta_range("0", periods=20, freq="h"), ], ) def test_slice_with_negative_step(index): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index f419ff9384042..5fcd3a19dcaa4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ 
b/pandas/tests/series/indexing/test_setitem.py @@ -79,7 +79,7 @@ def test_setitem_tuple_with_datetimetz_values(self): @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(self, tz, indexer_sli): - orig = Series(date_range("2016-01-01", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-01-01", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( @@ -117,7 +117,7 @@ def test_setitem_with_tz(self, tz, indexer_sli): def test_setitem_with_tz_dst(self, indexer_sli): # GH#14146 trouble setting values near DST boundary tz = "US/Eastern" - orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-11-06", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index e1b3dd4888ef5..cb60cd2e5bcf3 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -193,7 +193,7 @@ def test_align_with_dataframe_method(method): def test_align_dt64tzindex_mismatched_tzs(): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") ser = Series(np.random.default_rng(2).standard_normal(len(idx1)), index=idx1) ser_central = ser.tz_convert("US/Central") # different timezones convert to UTC @@ -204,7 +204,7 @@ def test_align_dt64tzindex_mismatched_tzs(): def test_align_periodindex(join_type): - rng = period_range("1/1/2000", "1/1/2010", freq="A") + rng = period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) # TODO: assert something? 
@@ -240,10 +240,10 @@ def test_align_left_different_named_levels(): result_left, result_right = left.align(right) expected_left = Series( - [2], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) expected_right = Series( - [1], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) tm.assert_series_equal(result_left, expected_left) tm.assert_series_equal(result_right, expected_right) diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 31c264d74d063..2acc2921e5efc 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -118,7 +118,7 @@ def test_with_nan(self): def test_periodindex(self): # array or list or dates N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) ts.iloc[15:30] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="37min") @@ -133,7 +133,7 @@ def test_periodindex(self): lb = ts.index[14] ub = ts.index[30] - pix = PeriodIndex(result.index.values, freq="H") + pix = PeriodIndex(result.index.values, freq="h") mask = (pix >= lb) & (pix < ub) rs = result[mask] assert (rs == ts[lb]).all() diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b6c409397c9fb..03fc6cba2902a 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -403,12 +403,12 @@ def test_astype_unicode(self): # bytes with obj.decode() instead of str(obj) item = "野菜食べないとやばい" ser = Series([item.encode()]) - result = ser.astype("unicode") + result = ser.astype(np.str_) expected = Series([item]) tm.assert_series_equal(result, expected) for ser in 
test_series: - res = ser.astype("unicode") + res = ser.astype(np.str_) expec = ser.map(str) tm.assert_series_equal(res, expec) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index aabed794ac557..89b6f9b01bc66 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -107,7 +107,7 @@ def test_combine_first_timezone_series_with_empty_series(self): time_index = date_range( datetime(2021, 1, 1, 1), datetime(2021, 1, 1, 10), - freq="H", + freq="h", tz="Europe/Rome", ) s1 = Series(range(10), index=time_index) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d1c79d0f00365..f621604faae4b 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -265,3 +265,11 @@ def test_convert_dtypes_pyarrow_to_np_nullable(self): result = ser.convert_dtypes(dtype_backend="numpy_nullable") expected = pd.Series(range(2), dtype="Int32") tm.assert_series_equal(result, expected) + + def test_convert_dtypes_pyarrow_null(self): + # GH#55346 + pa = pytest.importorskip("pyarrow") + ser = pd.Series([None, None]) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index 549f429f09d35..f8bbd4c25a4c0 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -780,7 +780,7 @@ def test_series_interpolate_intraday(self): exp = ts.reindex(new_index).interpolate(method="time") - index = date_range("1/1/2012", periods=4, freq="12H") + index = date_range("1/1/2012", periods=4, freq="12h") ts = Series([0, 12, 24, 36], index) new_index = index.append(index + 
pd.DateOffset(hours=1)).sort_values() result = ts.reindex(new_index).interpolate(method="time") diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 783e18e541ad8..ae6c62e95f696 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -494,14 +494,14 @@ def test_map_categorical_na_action(na_action, expected): def test_map_datetimetz(): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + values = pd.date_range("2011-01-01", "2011-01-02", freq="h").tz_localize( "Asia/Tokyo" ) s = Series(values, name="XX") # keep tz result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index f3075c116883a..0923a2d42ce10 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -197,7 +197,7 @@ def test_reindex_int(datetime_series): # NO NaNs introduced reindexed_int = int_ts.reindex(int_ts.index[::2]) - assert reindexed_int.dtype == np.int_ + assert reindexed_int.dtype == np.dtype(int) def test_reindex_bool(datetime_series): @@ -329,7 +329,7 @@ def test_reindex_fill_value_datetimelike_upcast(dtype, fill_value, using_array_m def test_reindex_datetimeindexes_tz_naive_and_aware(): # GH 8306 idx = date_range("20131101", tz="America/Chicago", periods=7) - newidx = date_range("20131103", periods=10, freq="H") + newidx = date_range("20131103", periods=10, freq="h") s = Series(range(7), index=idx) msg = ( r"Cannot compare dtypes datetime64\[ns, America/Chicago\] " diff --git a/pandas/tests/series/methods/test_to_dict.py b/pandas/tests/series/methods/test_to_dict.py index 4c3d9592eebe3..41c01f4537f23 100644 --- 
a/pandas/tests/series/methods/test_to_dict.py +++ b/pandas/tests/series/methods/test_to_dict.py @@ -13,12 +13,12 @@ class TestSeriesToDict: ) def test_to_dict(self, mapping, datetime_series): # GH#16122 - result = Series(datetime_series.to_dict(mapping), name="ts") + result = Series(datetime_series.to_dict(into=mapping), name="ts") expected = datetime_series.copy() expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) - from_method = Series(datetime_series.to_dict(collections.Counter)) + from_method = Series(datetime_series.to_dict(into=collections.Counter)) from_constructor = Series(collections.Counter(datetime_series.items())) tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 8547fd6988791..e9eb906a9cf10 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -153,7 +153,7 @@ class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted def test_add_series_with_period_index(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) result = ts + ts[::2] @@ -164,7 +164,7 @@ def test_add_series_with_period_index(self): result = ts + _permute(ts[::2]) tm.assert_series_equal(result, expected) - msg = "Input has different freq=D from Period\\(freq=A-DEC\\)" + msg = "Input has different freq=D from Period\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): ts + ts.asfreq("D", how="end") @@ -457,7 +457,7 @@ def test_ser_flex_cmp_return_dtypes_empty(self, opname): def test_ser_cmp_result_names(self, names, comparison_op): # datetime64 dtype op = comparison_op - dti = date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) + dti = date_range("1949-06-07 03:00:00", freq="h", periods=5, 
name=names[0]) ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] @@ -713,7 +713,7 @@ def test_compare_series_interval_keyword(self): class TestTimeSeriesArithmetic: def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") perm = np.random.default_rng(2).permutation(100)[:90] ser1 = Series( @@ -737,7 +737,7 @@ def test_series_add_tz_mismatch_converts_to_utc(self): tm.assert_series_equal(result, expected) def test_series_add_aware_naive_raises(self): - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ser_utc = ser.tz_localize("utc") @@ -941,8 +941,8 @@ def test_series_varied_multiindex_alignment(): expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( - [("x", 1, "a"), ("x", 2, "a"), ("y", 1, "a"), ("y", 2, "a")], - names=["xy", "num", "ab"], + [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)], + names=["ab", "xy", "num"], ), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d45c655a4c0a2..4f9050be100ca 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -335,8 +335,8 @@ def test_constructor_index_dtype(self, dtype): [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) @@ -1306,7 +1306,7 @@ def test_construct_from_ints_including_iNaT_scalar_period_dtype(self): assert isna(series[2]) def test_constructor_period_incompatible_frequency(self): 
- data = [Period("2000", "D"), Period("2001", "A")] + data = [Period("2000", "D"), Period("2001", "Y")] result = Series(data) assert result.dtype == object assert result.tolist() == data @@ -2158,7 +2158,7 @@ def test_constructor_no_pandas_array(self, using_array_manager): @td.skip_array_manager_invalid_test def test_from_array(self): - result = Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) + result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False result = Series(pd.array(["2015"], dtype="datetime64[ns]")) @@ -2166,7 +2166,7 @@ def test_from_array(self): @td.skip_array_manager_invalid_test def test_from_list_dtype(self): - result = Series(["1H", "2H"], dtype="timedelta64[ns]") + result = Series(["1h", "2h"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False result = Series(["2015"], dtype="datetime64[ns]") diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 1e1ac100b21bf..fbdf843a998bb 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -50,7 +50,7 @@ def test_td64_sum_empty(skipna): def test_td64_summation_overflow(): # GH#9442 - ser = Series(pd.date_range("20130101", periods=100000, freq="H")) + ser = Series(pd.date_range("20130101", periods=100000, freq="h")) ser[0] += pd.Timedelta("1s 1ms") # mean diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index f294885fb8f4d..86474a38d29fb 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -21,6 +21,16 @@ class TestSeriesRepr: + def test_multilevel_name_print_0(self): + # GH#55415 None does not get printed, but 0 does + # (matching DataFrame and flat index behavior) + mi = pd.MultiIndex.from_product([range(2, 3), range(3, 4)], names=[0, None]) + ser = Series(1.5, index=mi) + + res = repr(ser) + expected = "0 \n2 3 1.5\ndtype: float64" + assert res == expected + def 
test_multilevel_name_print(self, lexsorted_two_level_string_multiindex): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") @@ -247,7 +257,7 @@ def test_index_repr_in_frame_with_nan(self): assert repr(s) == exp def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") + rng = date_range("1/1/1850", "1/1/1950", freq="Y-DEC") rng.format() ts = Series(1, index=rng) repr(ts) @@ -348,7 +358,7 @@ def test_categorical_series_repr(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" +Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -374,12 +384,12 @@ def test_categorical_series_repr_ordered(self): 8 8 9 9 dtype: category -Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" +Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" assert repr(s) == exp def test_categorical_series_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -392,7 +402,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -407,7 +417,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp def test_categorical_series_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -420,7 +430,7 @@ def test_categorical_series_repr_datetime_ordered(self): 
assert repr(s) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -435,7 +445,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp def test_categorical_series_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -443,7 +453,7 @@ def test_categorical_series_repr_period(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(s) == exp @@ -461,7 +471,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp def test_categorical_series_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -469,7 +479,7 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(s) == exp diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 93fe9b05adb4f..b0406dbfa3469 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2817,7 
+2817,7 @@ class TestToDatetimeInferFormat: def test_to_datetime_infer_datetime_format_consistent_format( self, cache, test_format ): - ser = Series(date_range("20000101", periods=50, freq="H")) + ser = Series(date_range("20000101", periods=50, freq="h")) s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 417bf0e90201b..fa92590fb5ec1 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -13,7 +13,7 @@ @pytest.mark.parametrize( "freqstr,exp_freqstr", - [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("H", "s")], + [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("h", "s")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): off = to_offset(freqstr) @@ -27,11 +27,11 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): @pytest.mark.parametrize( "freqstr,expected", [ - ("A", "year"), + ("Y", "year"), ("Q", "quarter"), ("M", "month"), ("D", "day"), - ("H", "hour"), + ("h", "hour"), ("min", "minute"), ("s", "second"), ("ms", "millisecond"), @@ -43,7 +43,7 @@ def test_get_attrname_from_abbrev(freqstr, expected): assert Resolution.get_reso_from_freqstr(freqstr).attrname == expected -@pytest.mark.parametrize("freq", ["D", "H", "min", "s", "ms", "us", "ns"]) +@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) def test_get_freq_roundtrip2(freq): obj = Resolution.get_reso_from_freqstr(freq) result = _attrname_to_abbrevs[obj.attrname] @@ -55,9 +55,9 @@ def test_get_freq_roundtrip2(freq): [ ((1.5, "min"), (90, "s")), ((62.4, "min"), (3744, "s")), - ((1.04, "H"), (3744, "s")), + ((1.04, "h"), (3744, "s")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "us")), + ((0.342931, "h"), (1234551600, "us")), ((1.2345, "D"), (106660800, "ms")), ], ) @@ -73,7 +73,7 @@ def test_resolution_bumping(args, expected): [ 
(0.5, "ns"), # Too much precision in the input can prevent. - (0.3429324798798269273987982, "H"), + (0.3429324798798269273987982, "h"), ], ) def test_cat(args): @@ -86,7 +86,7 @@ def test_cat(args): @pytest.mark.parametrize( "freqstr,expected", [ - ("1H", "2021-01-01T09:00:00"), + ("1h", "2021-01-01T09:00:00"), ("1D", "2021-01-02T08:00:00"), ("1W", "2021-01-03T08:00:00"), ("1ME", "2021-01-31T08:00:00"), @@ -99,9 +99,9 @@ def test_compatibility(freqstr, expected): assert ts_np + do == np.datetime64(expected) -@pytest.mark.parametrize("freq", ["T", "S", "L", "N", "U"]) -def test_units_t_l_deprecated_from__attrname_to_abbrevs(freq): - # GH 52536 +@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) +def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): + # GH#52536 msg = f"'{freq}' is deprecated and will be removed in a future version." with tm.assert_produces_warning(FutureWarning, match=msg): diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 82cceeac2cd25..51d0dd298f841 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -38,7 +38,7 @@ @pytest.fixture( params=[ (timedelta(1), "D"), - (timedelta(hours=1), "H"), + (timedelta(hours=1), "h"), (timedelta(minutes=1), "min"), (timedelta(seconds=1), "s"), (np.timedelta64(1, "ns"), "ns"), @@ -52,7 +52,7 @@ def base_delta_code_pair(request): freqs = ( [f"Q-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] + + [f"{annual}-{month}" for annual in ["Y", "BA"] for month in MONTHS] + ["ME", "BM", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] @@ -167,7 +167,7 @@ def test_monthly_ambiguous(): def test_annual_ambiguous(): rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) - assert rng.inferred_freq == "A-JAN" + assert rng.inferred_freq == 
"Y-JAN" @pytest.mark.parametrize("count", range(1, 5)) @@ -220,7 +220,7 @@ def test_infer_freq_index(freq, expected): "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], - "H": [ + "h": [ "2011-12-31 22:00", "2011-12-31 23:00", "2012-01-01 00:00", @@ -255,7 +255,7 @@ def test_infer_freq_tz_series(tz_naive_fixture): ) @pytest.mark.parametrize( "freq", - ["H", "3H", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], + ["h", "3h", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], ) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 @@ -265,7 +265,7 @@ def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): def test_infer_freq_tz_transition_custom(): - index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + index = date_range("2013-11-03", periods=5, freq="3h").tz_localize( "America/Chicago" ) assert index.inferred_freq is None @@ -274,7 +274,7 @@ def test_infer_freq_tz_transition_custom(): @pytest.mark.parametrize( "data,expected", [ - # Hourly freq in a day must result in "H" + # Hourly freq in a day must result in "h" ( [ "2014-07-01 09:00", @@ -284,7 +284,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-01 13:00", "2014-07-01 14:00", ], - "H", + "h", ), ( [ @@ -300,7 +300,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-02 10:00", "2014-07-02 11:00", ], - "BH", + "bh", ), ( [ @@ -316,7 +316,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-07 10:00", "2014-07-07 11:00", ], - "BH", + "bh", ), ( [ @@ -345,7 +345,7 @@ def test_infer_freq_tz_transition_custom(): "2014-07-08 15:00", "2014-07-08 16:00", ], - "BH", + "bh", ), ], ) @@ -359,7 +359,7 @@ def test_not_monotonic(): rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) rng = rng[::-1] - assert rng.inferred_freq == "-1A-JAN" + assert 
rng.inferred_freq == "-1Y-JAN" def test_non_datetime_index2(): @@ -479,18 +479,18 @@ def test_series_datetime_index(freq): "Q@JAN", "Q@FEB", "Q@MAR", - "A@JAN", - "A@FEB", - "A@MAR", - "A@APR", - "A@MAY", - "A@JUN", - "A@JUL", - "A@AUG", - "A@SEP", - "A@OCT", - "A@NOV", - "A@DEC", + "Y@JAN", + "Y@FEB", + "Y@MAR", + "Y@APR", + "Y@MAY", + "Y@JUN", + "Y@JUL", + "Y@AUG", + "Y@SEP", + "Y@OCT", + "Y@NOV", + "Y@DEC", "Y@JAN", "WOM@1MON", "WOM@2MON", diff --git a/pandas/tests/tseries/offsets/test_business_hour.py b/pandas/tests/tseries/offsets/test_business_hour.py index 319cc053d5d7d..f0d065f8bb7ef 100644 --- a/pandas/tests/tseries/offsets/test_business_hour.py +++ b/pandas/tests/tseries/offsets/test_business_hour.py @@ -148,17 +148,17 @@ def test_repr( offset9, offset10, ): - assert repr(offset1) == "" - assert repr(offset2) == "<3 * BusinessHours: BH=09:00-17:00>" - assert repr(offset3) == "<-1 * BusinessHour: BH=09:00-17:00>" - assert repr(offset4) == "<-4 * BusinessHours: BH=09:00-17:00>" - - assert repr(offset5) == "" - assert repr(offset6) == "" - assert repr(offset7) == "<-2 * BusinessHours: BH=21:30-06:30>" - assert repr(offset8) == "" - assert repr(offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>" - assert repr(offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>" + assert repr(offset1) == "" + assert repr(offset2) == "<3 * BusinessHours: bh=09:00-17:00>" + assert repr(offset3) == "<-1 * BusinessHour: bh=09:00-17:00>" + assert repr(offset4) == "<-4 * BusinessHours: bh=09:00-17:00>" + + assert repr(offset5) == "" + assert repr(offset6) == "" + assert repr(offset7) == "<-2 * BusinessHours: bh=21:30-06:30>" + assert repr(offset8) == "" + assert repr(offset9) == "<3 * BusinessHours: bh=09:00-13:00,22:00-03:00>" + assert repr(offset10) == "<-1 * BusinessHour: bh=13:00-17:00,23:00-02:00>" def test_with_offset(self, dt): expected = Timestamp("2014-07-01 13:00") @@ -947,9 +947,9 @@ def test_apply_nanoseconds(self): assert_offset_equal(offset, base, 
expected) def test_datetimeindex(self): - idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH") - idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH") - idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH") + idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="bh") + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="bh") + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="bh") expected = DatetimeIndex( [ "2014-07-04 15:00", @@ -965,14 +965,14 @@ def test_datetimeindex(self): "2014-07-08 09:00", "2014-07-08 10:00", ], - freq="BH", + freq="bh", ) for idx in [idx1, idx2, idx3]: tm.assert_index_equal(idx, expected) - idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH") - idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH") - idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH") + idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="bh") + idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="bh") + idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="bh") expected = idx1 for idx in [idx1, idx2, idx3]: @@ -980,13 +980,13 @@ def test_datetimeindex(self): def test_short_datetimeindex_creation(self): # gh-49835 - idx4 = date_range(start="2014-07-01 10:00", freq="BH", periods=1) - expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="BH") + idx4 = date_range(start="2014-07-01 10:00", freq="bh", periods=1) + expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="bh") tm.assert_index_equal(idx4, expected4) def test_bday_ignores_timedeltas(self): - idx = date_range("2010/02/01", "2010/02/10", freq="12H") - t1 = idx + BDay(offset=Timedelta(3, unit="H")) + idx = date_range("2010/02/01", "2010/02/10", freq="12h") + t1 = idx + BDay(offset=Timedelta(3, unit="h")) expected = DatetimeIndex( [ diff --git a/pandas/tests/tseries/offsets/test_custom_business_hour.py 
b/pandas/tests/tseries/offsets/test_custom_business_hour.py index 38b5d74fe170f..55a184f95c2d8 100644 --- a/pandas/tests/tseries/offsets/test_custom_business_hour.py +++ b/pandas/tests/tseries/offsets/test_custom_business_hour.py @@ -69,8 +69,8 @@ def test_different_normalize_equals(self, _offset): assert offset != offset2 def test_repr(self, offset1, offset2): - assert repr(offset1) == "" - assert repr(offset2) == "" + assert repr(offset1) == "" + assert repr(offset2) == "" def test_with_offset(self, dt): expected = Timestamp("2014-07-01 13:00") diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index a7e9854c38f18..7f96ea98fa047 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -811,7 +811,7 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ["ME", "MS", "BM", "BMS", "D", "B", "H", "min", "s", "ms", "us"] + lst = ["ME", "MS", "BM", "BMS", "D", "B", "h", "min", "s", "ms", "us"] for k in lst: assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... @@ -839,7 +839,7 @@ def test_rule_code(self): "NOV", "DEC", ] - base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + base_lst = ["Y", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -858,7 +858,7 @@ def test_freq_offsets(): class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
- month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + month_prefixes = ["Y", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index c1ab0ba0b5e6f..cefe449f3484d 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -66,14 +66,14 @@ def test_tz_localize_to_utc_copies(): def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture): tz = tz_aware_fixture - tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz) - naive_didx = date_range("2014-03-01", "2015-01-10", freq="H") + tz_didx = date_range("2014-03-01", "2015-01-10", freq="h", tz=tz) + naive_didx = date_range("2014-03-01", "2015-01-10", freq="h") _compare_utc_to_local(tz_didx) _compare_local_to_utc(tz_didx, naive_didx) -@pytest.mark.parametrize("freq", ["D", "A"]) +@pytest.mark.parametrize("freq", ["D", "Y"]) def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): tz = tz_aware_fixture tz_didx = date_range("2018-01-01", "2020-01-01", freq=freq, tz=tz) diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 83f28f6b5dc01..effd3b4b8b4e5 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -16,10 +16,8 @@ (offsets.QuarterEnd(startingMonth=12).freqstr, "DEC"), ("Q-JAN", "JAN"), (offsets.QuarterEnd(startingMonth=1).freqstr, "JAN"), - ("A-DEC", "DEC"), ("Y-DEC", "DEC"), (offsets.YearEnd().freqstr, "DEC"), - ("A-MAY", "MAY"), ("Y-MAY", "MAY"), (offsets.YearEnd(month=5).freqstr, "MAY"), ], diff --git a/pandas/tests/tslibs/test_npy_units.py b/pandas/tests/tslibs/test_npy_units.py new file mode 100644 index 0000000000000..6d05dc79fbb2c --- /dev/null +++ b/pandas/tests/tslibs/test_npy_units.py @@ -0,0 +1,27 @@ +import numpy as np + +from pandas._libs.tslibs.dtypes import 
abbrev_to_npy_unit +from pandas._libs.tslibs.vectorized import is_date_array_normalized + +# a datetime64 ndarray which *is* normalized +day_arr = np.arange(10, dtype="i8").view("M8[D]") + + +class TestIsDateArrayNormalized: + def test_is_date_array_normalized_day(self): + arr = day_arr + abbrev = "D" + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + def test_is_date_array_normalized_seconds(self): + abbrev = "s" + arr = day_arr.astype(f"M8[{abbrev}]") + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + arr[0] += np.timedelta64(1, abbrev) + result2 = is_date_array_normalized(arr.view("i8"), None, unit) + assert result2 is False diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index ec3579109e7a4..425decc14251a 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -138,8 +138,8 @@ def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): "date_str,freq,expected", [ ("2013Q2", None, datetime(2013, 4, 1)), - ("2013Q2", "A-APR", datetime(2012, 8, 1)), - ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ("2013Q2", "Y-APR", datetime(2012, 8, 1)), + ("2013-Q2", "Y-DEC", datetime(2013, 4, 1)), ], ) def test_parsers_quarterly_with_freq(date_str, freq, expected): @@ -148,7 +148,7 @@ def test_parsers_quarterly_with_freq(date_str, freq, expected): @pytest.mark.parametrize( - "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] + "date_str", ["2Q 2005", "2Q-200Y", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] ) def test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 99f0a82d6711e..149817357fbd6 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -24,17 +24,17 @@ 
def get_freq_code(freqstr: str) -> int: @pytest.mark.parametrize( "freq1,freq2,expected", [ - ("D", "H", 24), + ("D", "h", 24), ("D", "min", 1440), ("D", "s", 86400), ("D", "ms", 86400000), ("D", "us", 86400000000), ("D", "ns", 86400000000000), - ("H", "min", 60), - ("H", "s", 3600), - ("H", "ms", 3600000), - ("H", "us", 3600000000), - ("H", "ns", 3600000000000), + ("h", "min", 60), + ("h", "s", 3600), + ("h", "ms", 3600000), + ("h", "us", 3600000000), + ("h", "ns", 3600000000000), ("min", "s", 60), ("min", "ms", 60000), ("min", "us", 60000000), @@ -54,7 +54,7 @@ def test_intra_day_conversion_factors(freq1, freq2, expected): @pytest.mark.parametrize( - "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] + "freq,expected", [("Y", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] ) def test_period_ordinal_start_values(freq, expected): # information for Jan. 1, 1970. diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index bc3e06646b235..82b0c78002972 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -53,7 +53,7 @@ def test_to_offset_negative(freqstr, expected): "-us", "3us1", "-2-3us", - "-2D:3H", + "-2D:3h", "1.5.0s", "2SMS-15-15", "2SMS-15D", @@ -105,12 +105,12 @@ def test_to_offset_tuple_unsupported(): @pytest.mark.parametrize( "freqstr,expected", [ - ("2D 3H", offsets.Hour(51)), - ("2 D3 H", offsets.Hour(51)), - ("2 D 3 H", offsets.Hour(51)), - (" 2 D 3 H ", offsets.Hour(51)), - (" H ", offsets.Hour()), - (" 3 H ", offsets.Hour(3)), + ("2D 3h", offsets.Hour(51)), + ("2 D3 h", offsets.Hour(51)), + ("2 D 3 h", offsets.Hour(51)), + (" 2 D 3 h ", offsets.Hour(51)), + (" h ", offsets.Hour()), + (" 3 h ", offsets.Hour(3)), ], ) def test_to_offset_whitespace(freqstr, expected): @@ -119,7 +119,7 @@ def test_to_offset_whitespace(freqstr, expected): @pytest.mark.parametrize( - "freqstr,expected", [("00H 00min 01s", 1), ("-00H 03min 14s", -194)] + "freqstr,expected", [("00h 
00min 01s", 1), ("-00h 03min 14s", -194)] ) def test_to_offset_leading_zero(freqstr, expected): result = to_offset(freqstr) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b8e0173ee131f..a23c91df5eef6 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1239,15 +1239,15 @@ def test_dont_mutate_obj_after_slicing(self): df = DataFrame( { "id": ["a", "a", "b", "b", "b"], - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": range(5), } ) - grp = df.groupby("id").rolling("1H", on="timestamp") + grp = df.groupby("id").rolling("1h", on="timestamp") result = grp.count() expected_df = DataFrame( { - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": [1.0] * 5, }, index=MultiIndex.from_arrays( @@ -1262,7 +1262,7 @@ def test_dont_mutate_obj_after_slicing(self): index=MultiIndex.from_arrays( [ ["a", "a", "b", "b", "b"], - date_range("2021-9-1", periods=5, freq="H"), + date_range("2021-9-1", periods=5, freq="h"), ], names=["id", "timestamp"], ), diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index b6f2365afb457..2258a4106fe92 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -392,7 +392,7 @@ def test_pairwise_with_series(self, pairwise_frames, pairwise_target_frame, f): def test_corr_freq_memory_error(self): # GH 31789 s = Series(range(5), index=date_range("2020", periods=5)) - result = s.rolling("12H").corr(s) + result = s.rolling("12h").corr(s) expected = Series([np.nan] * 5, index=date_range("2020", periods=5)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index caea3e98f262f..482c8992feb13 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ 
b/pandas/tests/window/test_timeseries_window.py @@ -599,12 +599,12 @@ def test_all2(self, arithmetic_win_operators): # more sophisticated comparison of integer vs. # time-based windowing df = DataFrame( - {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="h") ) # in-range data dft = df.between_time("09:00", "16:00") - r = dft.rolling(window="5H") + r = dft.rolling(window="5h") result = getattr(r, f)() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 7aa245341cbdd..0ed0fe4b87576 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -56,7 +56,7 @@ TimedeltaIndex, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -# --------------------------------------------------------------------- +# -------------------------------------------------------------------- # Offset related functions _need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] @@ -66,7 +66,7 @@ key = f"{_prefix}-{_m}" OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix] -for _prefix in ["A", "Q"]: +for _prefix in ["Y", "Q"]: for _m in MONTHS: _alias = f"{_prefix}-{_m}" OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias @@ -229,7 +229,7 @@ def get_freq(self) -> str | None: # Business hourly, maybe. 17: one day / 65: one weekend if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return "BH" + return "bh" # Possibly intraday frequency. 
Here we use the # original .asi8 values as the modified values @@ -243,7 +243,7 @@ def get_freq(self) -> str | None: pps = ppm // 60 if _is_multiple(delta, pph): # Hours - return _maybe_add_count("H", delta / pph) + return _maybe_add_count("h", delta / pph) elif _is_multiple(delta, ppm): # Minutes return _maybe_add_count("min", delta / ppm) @@ -345,7 +345,7 @@ def _get_annual_rule(self) -> str | None: if pos_check is None: return None else: - return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) + return {"cs": "AS", "bs": "BAS", "ce": "Y", "be": "BA"}.get(pos_check) def _get_quarterly_rule(self) -> str | None: if len(self.mdiffs) > 1: @@ -457,21 +457,21 @@ def is_subperiod(source, target) -> bool: return _quarter_months_conform( get_rule_month(source), get_rule_month(target) ) - return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(target): - return source in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(target): - return source in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return source in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(target): - return source in {target, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return source in {target, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif target == "B": - return source in {"B", "H", "min", "s", "ms", "us", "ns"} + return source in {"B", "h", "min", "s", "ms", "us", "ns"} elif target == "C": - return source in {"C", "H", "min", "s", "ms", "us", "ns"} + return source in {"C", "h", "min", "s", "ms", "us", "ns"} elif target == "D": - return source in {"D", "H", "min", "s", "ms", "us", "ns"} - elif target == "H": - return source in {"H", "min", "s", "ms", "us", "ns"} + return source in {"D", "h", "min", "s", "ms", "us", "ns"} + elif target == "h": 
+ return source in {"h", "min", "s", "ms", "us", "ns"} elif target == "min": return source in {"min", "s", "ms", "us", "ns"} elif target == "s": @@ -515,21 +515,21 @@ def is_superperiod(source, target) -> bool: smonth = get_rule_month(source) tmonth = get_rule_month(target) return _quarter_months_conform(smonth, tmonth) - return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(source): - return target in {"D", "C", "B", "M", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(source): - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(source): - return target in {source, "D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {source, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "B": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "C": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "D": - return target in {"D", "C", "B", "H", "min", "s", "ms", "us", "ns"} - elif source == "H": - return target in {"H", "min", "s", "ms", "us", "ns"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} + elif source == "h": + return target in {"h", "min", "s", "ms", "us", "ns"} elif source == "min": return target in {"min", "s", "ms", "us", "ns"} elif source == "s": @@ -560,7 +560,7 @@ def _maybe_coerce_freq(code) -> str: assert code is not None if isinstance(code, DateOffset): code = freq_to_period_freqstr(1, code.name) - if code in {"min", "s", "ms", "us", "ns"}: + if code in {"h", "min", "s", "ms", "us", "ns"}: return code else: return 
code.upper() @@ -574,7 +574,7 @@ def _quarter_months_conform(source: str, target: str) -> bool: def _is_annual(rule: str) -> bool: rule = rule.upper() - return rule == "A" or rule.startswith("A-") + return rule == "Y" or rule.startswith("Y-") def _is_quarterly(rule: str) -> bool: diff --git a/pyproject.toml b/pyproject.toml index 89432c2353ea8..a8388a9ff52de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -505,6 +505,8 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", + # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged + "ignore:.*In the future `np.long` will be defined as.*:FutureWarning", ] junit_family = "xunit2" markers = [ diff --git a/setup.py b/setup.py index 663bbd3952eab..db3717efb738d 100755 --- a/setup.py +++ b/setup.py @@ -418,6 +418,9 @@ def maybe_cythonize(extensions, *args, **kwargs): kwargs["nthreads"] = parsed.parallel build_ext.render_templates(_pxifiles) + if debugging_symbols_requested: + kwargs["gdb_debug"] = True + return cythonize(extensions, *args, **kwargs) diff --git a/tooling/debug/Dockerfile.pandas-debug b/tooling/debug/Dockerfile.pandas-debug new file mode 100644 index 0000000000000..00e10a85d7ab9 --- /dev/null +++ b/tooling/debug/Dockerfile.pandas-debug @@ -0,0 +1,35 @@ +FROM ubuntu:latest + +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y build-essential git valgrind + +# cpython dev install +RUN git clone -b 3.10 --depth 1 https://github.com/python/cpython.git /clones/cpython +RUN apt-get install -y libbz2-dev libffi-dev libssl-dev zlib1g-dev liblzma-dev libsqlite3-dev libreadline-dev +RUN cd /clones/cpython && ./configure --with-pydebug && CFLAGS="-g3" make -s -j$(nproc) && make install + +# gdb installation +RUN apt-get install -y wget libgmp-dev +RUN cd /tmp && wget 
http://mirrors.kernel.org/sourceware/gdb/releases/gdb-12.1.tar.gz && tar -zxf gdb-12.1.tar.gz +RUN cd /tmp/gdb-12.1 && ./configure --with-python=python3 && make -j$(nproc) && make install +RUN rm -r /tmp/gdb-12.1 + +# pandas dependencies +RUN python3 -m pip install \ + cython \ + hypothesis \ + ninja \ + numpy \ + meson \ + meson-python \ + pytest \ + pytest-asyncio \ + python-dateutil \ + pytz \ + versioneer[toml] + +# At the time this docker image was built, there was a bug/limitation +# with meson where only having a python3 executable and not python +# would cause the build to fail. This symlink could be removed if +# users stick to always calling python3 within the container +RUN ln -s /usr/local/bin/python3 /usr/local/bin/python diff --git a/tooling/debug/README b/tooling/debug/README new file mode 100644 index 0000000000000..111a958ff5ef5 --- /dev/null +++ b/tooling/debug/README @@ -0,0 +1,19 @@ +The Docker image here helps to set up an isolated environment containing a debug version of Python and a gdb installation which the Cython debugger can work with. + +If you have internet access, you can pull a pre-built image via + +```sh +docker pull pandas/pandas-debug +``` + +To build the image locally, you can do + +```sh +docker build . -t pandas/pandas-debug -f Dockerfile.pandas-debug +``` + +For pandas developers, you can push a new copy of the image to dockerhub via + +```sh +docker push pandas/pandas-debug +``` diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 561503de416a5..e6a6b3a8531ca 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -345,6 +345,29 @@ which pandas excels. ## IO +### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas) + +NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly. 
+ +It supports the following data types: + +- pandas data types +- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) +- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats) + +The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). + +Example: + +```python +import ntv_pandas as npd + +jsn = df.npd.to_json(table=False) # save df as a JSON-value (format Table Schema if table is True else format NTV ) +df = npd.read_json(jsn) # load a JSON-value as a `DataFrame` + +df.equals(npd.read_json(df.npd.to_json(df))) # `True` in any case, whether `table=True` or not +``` + ### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas) BCPandas provides high performance writes from pandas to Microsoft SQL Server, diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md new file mode 100644 index 0000000000000..3ae34f4bb6ebb --- /dev/null +++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md @@ -0,0 +1,479 @@ +# PDEP-12: Compact and reversible JSON interface + +- Created: 16 June 2023 +- Status: Rejected +- Discussion: + [#53252](https://github.com/pandas-dev/pandas/issues/53252) + [#55038](https://github.com/pandas-dev/pandas/issues/55038) +- Author: [Philippe THOMY](https://github.com/loco-philippe) +- Revision: 3 + +##### Summary + +- [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract) + - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description) + - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description) +- [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope) +- [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation) + - [Why is it important to have a compact and reversible JSON 
interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?) + - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?) + - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?) +- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description) + - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing) + - [Correspondence between TableSchema and pandas](./0012-compact-and-reversible-JSON-interface.md/#Correspondence-between-TableSchema-and-pandas) + - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format) + - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion) +- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact) + - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage) + - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility) + - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework) + - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do) +- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation) + - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules) + - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options) +- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.) 
+- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/#Synthesis) +- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision) +- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline) +- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history) + +------------------------- + +## Abstract + +### Problem description + +The `dtype` and "Python type" are not explicitly taken into account in the current JSON interface. + +So, the JSON interface is not always reversible and has inconsistencies related to the consideration of the `dtype`. + +Another consequence is the partial application of the Table Schema specification in the `orient="table"` option (6 Table Schema data types are taken into account out of the 24 defined). + +Some JSON-interface problems are detailed in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_json_pandas.ipynb#Current-Json-interface) + +### Feature Description + +To have a simple, compact and reversible solution, I propose to use the [JSON-NTV format (Named and Typed Value)](https://github.com/loco-philippe/NTV#readme) - which integrates the notion of type - and its JSON-TAB variation for tabular data (the JSON-NTV format is defined in an [IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/) (not yet an RFC !!) ). + +This solution makes it possible to include a large number of types (not necessarily pandas `dtype`), which in turn provides: + +- a Table Schema JSON interface (`orient="table"`) which respects the Table Schema specification (going from 6 types to 20 types), +- a global JSON interface for all pandas data formats. + +#### Global JSON interface example + +In the example below, a DataFrame with several data types is converted to JSON. + +The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility). + +With the existing JSON interface, this conversion is not possible. 
+ +This example uses `ntv_pandas` module defined in the [ntv-pandas repository](https://github.com/loco-philippe/ntv-pandas#readme). + +Data example: + +```python +In [1]: from shapely.geometry import Point + from datetime import date + import pandas as pd + import ntv_pandas as npd + +In [2]: data = {'index': [100, 200, 300, 400, 500, 600], + 'dates::date': [date(1964,1,1), date(1985,2,5), date(2022,1,21), date(1964,1,1), date(1985,2,5), date(2022,1,21)], + 'value': [10, 10, 20, 20, 30, 30], + 'value32': pd.Series([12, 12, 22, 22, 32, 32], dtype='int32'), + 'res': [10, 20, 30, 10, 20, 30], + 'coord::point': [Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4), Point(5,6)], + 'names': pd.Series(['john', 'eric', 'judith', 'mila', 'hector', 'maria'], dtype='string'), + 'unique': True } + +In [3]: df = pd.DataFrame(data).set_index('index') + +In [4]: df +Out[4]: dates::date value value32 res coord::point names unique + index + 100 1964-01-01 10 12 10 POINT (1 2) john True + 200 1985-02-05 10 12 20 POINT (3 4) eric True + 300 2022-01-21 20 22 30 POINT (5 6) judith True + 400 1964-01-01 20 22 10 POINT (7 8) mila True + 500 1985-02-05 30 32 20 POINT (3 4) hector True + 600 2022-01-21 30 32 30 POINT (5 6) maria True +``` + +JSON representation + +```python +In [5]: df_to_json = npd.to_json(df) + pprint(df_to_json, width=120) +Out[5]: {':tab': {'coord::point': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [3.0, 4.0], [5.0, 6.0]], + 'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'], + 'index': [100, 200, 300, 400, 500, 600], + 'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'], + 'res': [10, 20, 30, 10, 20, 30], + 'unique': [True, True, True, True, True, True], + 'value': [10, 10, 20, 20, 30, 30], + 'value32::int32': [12, 12, 22, 22, 32, 32]}} +``` + +Reversibility + +```python +In [5]: df_from_json = npd.read_json(df_to_json) + print('df created from JSON is equal to initial df ? 
', df_from_json.equals(df)) +Out[5]: df created from JSON is equal to initial df ? True +``` + +Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb) + +#### Table Schema JSON interface example + +In the example below, a DataFrame with several Table Schema data types is converted to JSON. + +The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility). + +With the existing Table Schema JSON interface, this conversion is not possible. + +```python +In [1]: from shapely.geometry import Point + from datetime import date + +In [2]: df = pd.DataFrame({ + 'end february::date': ['date(2023,2,28)', 'date(2024,2,29)', 'date(2025,2,28)'], + 'coordinates::point': ['Point([2.3, 48.9])', 'Point([5.4, 43.3])', 'Point([4.9, 45.8])'], + 'contact::email': ['john.doe@table.com', 'lisa.minelli@schema.com', 'walter.white@breaking.com'] + }) + +In [3]: df +Out[3]: end february::date coordinates::point contact::email + 0 2023-02-28 POINT (2.3 48.9) john.doe@table.com + 1 2024-02-29 POINT (5.4 43.3) lisa.minelli@schema.com + 2 2025-02-28 POINT (4.9 45.8) walter.white@breaking.com +``` + +JSON representation + +```python +In [4]: df_to_table = npd.to_json(df, table=True) + pprint(df_to_table, width=140, sort_dicts=False) +Out[4]: {'schema': {'fields': [{'name': 'index', 'type': 'integer'}, + {'name': 'end february', 'type': 'date'}, + {'name': 'coordinates', 'type': 'geopoint', 'format': 'array'}, + {'name': 'contact', 'type': 'string', 'format': 'email'}], + 'primaryKey': ['index'], + 'pandas_version': '1.4.0'}, + 'data': [{'index': 0, 'end february': '2023-02-28', 'coordinates': [2.3, 48.9], 'contact': 'john.doe@table.com'}, + {'index': 1, 'end february': '2024-02-29', 'coordinates': [5.4, 43.3], 'contact': 'lisa.minelli@schema.com'}, + {'index': 2, 'end february': '2025-02-28', 'coordinates': [4.9, 45.8], 'contact': 'walter.white@breaking.com'}]} +``` 
+ +Reversibility + +```python +In [5]: df_from_table = npd.read_json(df_to_table) + print('df created from JSON is equal to initial df ? ', df_from_table.equals(df)) +Out[5]: df created from JSON is equal to initial df ? True +``` + +Several other examples are provided in the [linked NoteBook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_table_pandas.ipynb) + +## Scope + +The objective is to make available the proposed JSON interface for any type of data and for `orient="table"` option or a new option `orient="ntv"`. + +The proposed interface is compatible with existing data. + +## Motivation + +### Why extend the `orient=table` option to other data types? + +- The Table Schema specification defines 24 data types, 6 are taken into account in the pandas interface + +### Why is it important to have a compact and reversible JSON interface ? + +- a reversible interface provides an exchange format. +- a textual exchange format facilitates exchanges between platforms (e.g. OpenData) +- a JSON exchange format can be used at API level + +### Is it relevant to take an extended type into account ? + +- it avoids the addition of an additional data schema +- it increases the semantic scope of the data processed by pandas +- it is an answer to several issues (e.g. #12997, #14358, #16492, #35420, #35464, #36211, #39537, #49585, #50782, #51375, #52595, #53252) +- the use of a complementary type avoids having to modify the pandas data model + +### Is this only useful for pandas ? + +- the JSON-TAB format is applicable to tabular data and multi-dimensional data. +- this JSON interface can therefore be used for any application using tabular or multi-dimensional data. This would allow for example reversible data exchanges between pandas - DataFrame and Xarray - DataArray (Xarray issue under construction) [see example DataFrame / DataArray](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Multidimensional-data). 
+ +## Description + +The proposed solution is based on several key points: + +- data typing +- correspondence between TableSchema and pandas +- JSON format for tabular data +- conversion to and from JSON format + +### Data typing + +Data types are defined and managed in the NTV project (name, JSON encoder and decoder). + +Pandas `dtype` are compatible with NTV types : + +| **pandas dtype** | **NTV type** | +|--------------------|------------| +| intxx | intxx | +| uintxx | uintxx | +| floatxx | floatxx | +| datetime[ns] | datetime | +| datetime[ns, ] | datetimetz | +| timedelta[ns] | durationiso| +| string | string | +| boolean | boolean | + +Note: + +- datetime with timezone is a single NTV type (string ISO8601) +- `CategoricalDtype` and `SparseDtype` are included in the tabular JSON format +- `object` `dtype` is depending on the context (see below) +- `PeriodDtype` and `IntervalDtype` are to be defined + +JSON types (implicit or explicit) are converted in `dtype` following pandas JSON interface: + +| **JSON type** | **pandas dtype** | +|----------------|-------------------| +| number | int64 / float64 | +| string | string / object | +| array | object | +| object | object | +| true, false | boolean | +| null | NaT / NaN / None | + +Note: + +- if an NTV type is defined, the `dtype` is adjusted accordingly +- the consideration of null type data needs to be clarified + +The other NTV types are associated with `object` `dtype`. + +### Correspondence between TableSchema and pandas + +The TableSchema typing is carried by two attributes `format` and `type`. 
+ +The table below shows the correspondence between TableSchema format / type and pandas NTV type / dtype: + +| **format / type** | **NTV type / dtype** | +|--------------------|----------------------| +| default / datetime | / datetime64[ns] | +| default / number | / float64 | +| default / integer | / int64 | +| default / boolean | / bool | +| default / string | / object | +| default / duration | / timedelta64[ns] | +| email / string | email / string | +| uri / string | uri / string | +| default / object | object / object | +| default / array | array / object | +| default / date | date / object | +| default / time | time / object | +| default / year | year / int64 | +| default / yearmonth| month / int64 | +| array / geopoint | point / object | +| default / geojson | geojson / object | + +Note: + +- other TableSchema formats are defined and are to be studied (uuid, binary, topojson, specific format for geopoint and datation) +- the first six rows correspond to the existing interface + +### JSON format + +The JSON format for the TableSchema interface is the existing one. + +The JSON format for the Global interface is defined in the [JSON-TAB](https://github.com/loco-philippe/NTV/blob/main/documentation/JSON-TAB-standard.pdf) specification. +It includes the naming rules originally defined in the [JSON-ND project](https://github.com/glenkleidon/JSON-ND) and support for categorical data. +The specification has to be updated to include sparse data. + +### Conversion + +When data is associated with a non-`object` `dtype`, pandas conversion methods are used. +Otherwise, NTV conversion is used. 
+ +#### pandas -> JSON + +- `NTV type` is not defined : use `to_json()` +- `NTV type` is defined and `dtype` is not `object` : use `to_json()` +- `NTV type` is defined and `dtype` is `object` : use NTV conversion (if pandas conversion does not exist) + +#### JSON -> pandas + +- `NTV type` is compatible with a `dtype` : use `read_json()` +- `NTV type` is not compatible with a `dtype` : use NTV conversion (if pandas conversion does not exist) + +## Usage and Impact + +### Usage + +It seems to me that this proposal responds to important issues: + +- having an efficient text format for data exchange + + The alternative CSV format is not reversible and obsolete (last revision in 2005). Current CSV tools do not comply with the standard. + +- taking into account "semantic" data in pandas objects + +- having a complete Table Schema interface + +### Compatibility + +Interface can be used without NTV type (compatibility with existing data - [see examples](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb#Appendix-:-Series-tests)) + +If the interface is made available through a new `orient` option in the JSON interface, the use of the feature is decoupled from the other features. + +### Impacts on the pandas framework + +Initially, the impacts are very limited: + +- modification of the `name` of `Series` or `DataFrame columns` (no functional impact), +- addition of an option in the Json interface (e.g. `orient='ntv'`) and of the associated methods (no functional interference with the other methods) + +In later stages, several developments could be considered: + +- validation of the `name` of `Series` or `DataFrame columns` , +- management of the NTV type as a "complementary-object-dtype" +- functional extensions depending on the NTV type + +### Risk to do / risk not to do + +The JSON-NTV format and the JSON-TAB format are not (yet) recognized and used formats. The risk for pandas is that this function is not used (no functional impacts). 
+ +On the other hand, the early use by pandas will allow a better consideration of the expectations and needs of pandas as well as a reflection on the evolution of the types supported by pandas. + +## Implementation + +### Modules + +Two modules are defined for NTV: + +- json-ntv + + this module manages NTV data without dependency to another module + +- ntvconnector + + those modules manage the conversion between objects and JSON data. They have dependency with objects modules (e.g. connectors with shapely location have dependency with shapely). + +The pandas integration of the JSON interface requires importing only the json-ntv module. + +### Implementation options + +The interface can be implemented as NTV connector (`SeriesConnector` and `DataFrameConnector`) and as a new pandas JSON interface `orient` option. + +Several pandas implementations are possible: + +1. External: + + In this implementation, the interface is available only in the NTV side. + This option means that this evolution of the JSON interface is not useful or strategic for pandas. + +2. NTV side: + + In this implementation, the interface is available in the both sides and the conversion is located inside NTV. + This option is the one that minimizes the impacts on the pandas side + +3. pandas side: + + In this implementation, the interface is available in the both sides and the conversion is located inside pandas. + This option allows pandas to keep control of this evolution + +4. pandas restricted: + + In this implementation, the pandas interface and the conversion are located inside pandas and only for non-object `dtype`. + This option makes it possible to offer a compact and reversible interface while prohibiting the introduction of types incompatible with the existing `dtype` + +## F.A.Q. + +**Q: Does `orient="table"` not do what you are proposing already?** + +**A**: In principle, yes, this option takes into account the notion of type. 
+ +But this is very limited (see examples added in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)) : + +- **Types and Json interface** + - the only way to keep the types in the json interface is to use the `orient='table'` option + - a few dtypes are not allowed in json-table interface : period, timedelta64, interval + - allowed types are not always kept in json-table interface + - data with 'object' dtype is kept only if data is string + - with categorical dtype, the underlying dtype is not included in json interface +- **Data compactness** + - json-table interface is not compact (in the example in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#data-compactness)) the size is triple or quadruple the size of the compact format +- **Reversibility** + - Interface is reversible only with a few dtypes : int64, float64, bool, string, datetime64 and partially categorical +- **External types** + - the interface does not accept external types + - Table-schema defines 20 data types but the `orient="table"` interface takes into account 5 data types (see [table](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Converting-table-schema-type-to-pandas-dtype)) + - to integrate external types, it is necessary to first create ExtensionArray and ExtensionDtype objects + +The current interface is not compatible with the data structure defined by table-schema. For this to be possible, it is necessary to integrate a "type extension" like the one proposed (this has moreover been partially achieved with the notion of `extDtype` found in the interface for several formats). + +**Q: In general, we should only have 1 `"table"` format for pandas in read_json/to_json. There is also the issue of backwards compatibility if we do change the format. The fact that the table interface is buggy is not a reason to add a new interface (I'd rather fix those bugs). 
Can the existing format be adapted in a way that fixes the type issues/issues with roundtripping?** + +**A**: I will add two additional remarks: + +- the types defined in Tableschema are partially taken into account (examples of types not taken into account in the interface: string-uri, array, date, time, year, geopoint, string-email), +- the `read_json()` interface also works with the following data: `{'simple': [1,2,3] }` (contrary to what is indicated in the documentation) but it is impossible with `to_json()` to recreate this simple json. + +I think that the problem cannot be limited to bug fixes and that a clear strategy must be defined for the Json interface in particular with the gradual abandonment in open-data solutions of the obsolete CSV format in favor of a Json format. + +As stated, the proposed solution addresses several shortcomings of the current interface and could simply fit into the pandas environment (the other option would be to consider that the Json interface is a peripheral function of pandas and can remain external to pandas) regardless of the `orient='table'` option. + +It is nevertheless possible to merge the proposed format and the `orient='table'` format in order to have an explicit management of the notion of `extDtype` + +**Q: As far as I can tell, JSON NTV is not in any form a standardised JSON format. I believe that pandas (and geopandas, which is where I came from to this issue) should try to follow either de facto or de jure standards and do not opt in for a file format that does not have any community support at this moment. This can obviously change in the future and that is where this PR should be revised. 
Why would pandas use this standard?** + +**A**: As indicated in the issue (and detailed in [the attached Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)), the json interface is not reversible (`to_json` then `read_json` does not always return the initial object) and several shortcomings and bugs are present. The main cause of this problem is that the data type is not taken into account in the JSON format (or very partially with the `orient='table'` option). + +The proposal made answers this problem ([the example at the beginning of Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#0---Simple-example) simply and clearly illustrates the interest of the proposal). + +Regarding the underlying JSON-NTV format, its impact is quite low for tabular data (it is limited to adding the type in the field name). +Nevertheless, the question is relevant: The JSON-NTV format ([IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/)) is a shared, documented, supported and implemented format; the community support is indeed limited for the moment, but it only asks to expand! + +## Synthesis + +To conclude, + +- if it is important (or strategic) to have a reversible JSON interface for any type of data, the proposal can be allowed, +- if not, a third-party package listed in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) that reads/writes this format to/from pandas DataFrames should be considered + +## Core team decision + +Vote was open from September 11 to September 26: + +- Final tally is 0 approvals, 5 abstentions, 7 disapprove. The quorum has been met. The PDEP fails. 
+ +**Disapprove comments** : + +- 1 Given the newness of the proposed JSON NTV format, I would support (as described in the PDEP): "if not, a third-party package listed in the ecosystem that reads/writes this format to/from pandas DataFrames should be considered" +- 2 Same reason as -1-, this should be a third party package for now +- 3 Not mature enough, and not clear what the market size would be. +- 4 for the same reason I left in the PDEP: "I think this (JSON-NTV format) does not meet the bar of being a commonly used format for implementation within pandas" +- 5 agree with -4- +- 6 agree with the other core-dev responders. I think work in the existing json interface is extremely valuable. A number of the original issues raised are just bug fixes / extensions of already existing functionality. Trying to start anew is likely not worth the migration effort. That said if a format is well supported in the community we can reconsider in the future (obviously json is well supported but the actual specification detailed here is too new / not accepted as a standard) +- 7 while I do think having a more comprehensive JSON format would be worthwhile, making a new format part of pandas means an implicit endorsement of a standard that is still being reviewed by the broader community. + +**Decision**: + +- add the `ntv-pandas` package in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) +- revisit this PDEP at a later stage, for example in 1/2 to 1 year (based on the evolution of the Internet draft [JSON semantic format (JSON-NTV)](https://www.ietf.org/archive/id/draft-thomy-json-ntv-01.html) and the usage of the [ntv-pandas](https://github.com/loco-philippe/ntv-pandas#readme)) + +## Timeline + +Not applicable + +## PDEP History + +- 16 June 2023: Initial draft +- 22 July 2023: Add F.A.Q. +- 06 September 2023: Add Table Schema extension +- 01 October 2023: Add Core team decision