diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a36420556ae24..d87fa5203bd52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,53 +23,53 @@ jobs: - name: Looking for unwanted patterns run: ci/code_checks.sh patterns - if: true + if: always() - name: Setup environment and build pandas run: ci/setup_env.sh - if: true + if: always() - name: Linting run: | source activate pandas-dev ci/code_checks.sh lint - if: true + if: always() - name: Dependencies consistency run: | source activate pandas-dev ci/code_checks.sh dependencies - if: true + if: always() - name: Checks on imported code run: | source activate pandas-dev ci/code_checks.sh code - if: true + if: always() - name: Running doctests run: | source activate pandas-dev ci/code_checks.sh doctests - if: true + if: always() - name: Docstring validation run: | source activate pandas-dev ci/code_checks.sh docstrings - if: true + if: always() - name: Typing validation run: | source activate pandas-dev ci/code_checks.sh typing - if: true + if: always() - name: Testing docstring validation script run: | source activate pandas-dev pytest --capture=no --strict scripts - if: true + if: always() - name: Running benchmarks run: | @@ -87,7 +87,7 @@ jobs: else echo "Benchmarks did not run, no changes detected" fi - if: true + if: always() - name: Publish benchmarks artifact uses: actions/upload-artifact@master @@ -95,3 +95,65 @@ jobs: name: Benchmarks log path: asv_bench/benchmarks.log if: failure() + + web_and_docs: + name: Web and docs + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: ci/setup_env.sh + + - name: Build website + run: | + source activate pandas-dev + python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: | + source activate pandas-dev + doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} + + # This can be removed when the ipython directive fails when there are errors, + # including the `tee sphinx.log` in the previous step (https://github.com/ipython/ipython/issues/11547) + - name: Check ipython directive errors + run: "! 
grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" + + - name: Merge website and docs + run: | + mkdir -p pandas_web/docs + cp -r web/build/* pandas_web/ + cp -r doc/build/html/* pandas_web/docs/ + if: github.event_name == 'push' + + - name: Install Rclone + run: sudo apt install rclone -y + if: github.event_name == 'push' + + - name: Set up Rclone + run: | + RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf + mkdir -p `dirname $RCLONE_CONFIG_PATH` + echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH + echo "type = swift" >> $RCLONE_CONFIG_PATH + echo "env_auth = false" >> $RCLONE_CONFIG_PATH + echo "auth_version = 3" >> $RCLONE_CONFIG_PATH + echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH + echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH + echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH + echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH + echo "domain = default" >> $RCLONE_CONFIG_PATH + echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH + echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH + echo "region = BHS" >> $RCLONE_CONFIG_PATH + if: github.event_name == 'push' + + - name: Sync web + run: rclone sync pandas_web ovh_cloud_pandas_web:dev + if: github.event_name == 'push' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b34f5dfdd1a83..809764a20a713 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,8 +11,20 @@ repos: language: python_venv additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 + rev: v4.3.21 hooks: - id: isort language: python_venv exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + # We run mypy over all files because of: + # * changes in type definitions may affect non-touched files. + # * Running it with `mypy pandas` and the filenames will lead to + # spurious duplicate module errors, + # see also https://github.com/pre-commit/mirrors-mypy/issues/5 + pass_filenames: false + args: + - pandas diff --git a/.travis.yml b/.travis.yml index 0c7740295b637..a11cd469e9b9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: python -python: 3.5 +python: 3.7 # To turn off cached cython files and compiler cache # set NOCACHE-true @@ -48,17 +48,12 @@ matrix: - mysql - postgresql - # In allow_failures - env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" services: - mysql - postgresql - allow_failures: - - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" - before_install: - echo "before_install" # set non-blocking IO on travis diff --git a/RELEASE.md b/RELEASE.md index efd075dabcba9..7924ffaff561f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,5 +2,5 @@ Release Notes ============= The list of changes to Pandas between each release can be found -[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full +[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. 
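As a rough illustration of the new mypy pre-commit hook above (which type-checks the whole ``pandas`` package rather than only the touched files), here is a minimal sketch of the kind of error it reports — the function below is hypothetical and not part of this diff:

.. code-block:: python

    # Hypothetical annotated function: running `mypy pandas` via the new hook
    # flags the return statement, since a str is returned where the signature
    # promises an int.
    def add_one(x: int) -> int:
        return str(x + 1)  # error: Incompatible return value type (got "str", expected "int")
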
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c04bbf53a86a6..cd1a31d4eaf34 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -7,7 +7,7 @@ "project": "pandas", // The project's homepage - "project_url": "http://pandas.pydata.org/", + "project_url": "https://pandas.pydata.org/", // The URL of the source code repository for the project being // benchmarked @@ -122,5 +122,8 @@ ".*": "0409521665" }, "regression_thresholds": { - } + }, + "build_command": + ["python setup.py build -j4", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7d97f2c740acb..0f3b3838de1b2 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,8 @@ from pandas._libs import lib import pandas as pd -from pandas.util import testing as tm + +from .pandas_vb_common import tm for imp in ["pandas.util", "pandas.tools.hashing"]: try: diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 43b1b31a0bfe8..1dcd52ac074a6 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.api.types import union_categoricals diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index a9e45cad22d27..7c43485f5ef45 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp -import pandas.util.testing as tm + +from .pandas_vb_common import tm def no_change(arr): diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 1deca8fe3aad0..2b24bab85bc57 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.tseries.offsets import Nano, Hour diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index ae6c07107f4a0..2187668c96ca4 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -4,7 +4,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class GetNumericData: diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 860c6cc6192bb..e266d871f5bc6 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -2,7 +2,8 @@ from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import ( @@ -24,7 +25,7 @@ except ImportError: from pandas import algos try: - from pandas.util.testing import test_parallel + from pandas._testing import test_parallel have_real_test_parallel = True except ImportError: diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d51c53e2264f1..28e0dcc5d9b13 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -13,7 +13,8 @@ date_range, period_range, 
) -import pandas.util.testing as tm + +from .pandas_vb_common import tm method_blacklist = { "object": { diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index d69799eb70040..103141545504b 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -12,7 +12,8 @@ Series, date_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SetOperations: diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c78c2fa92827e..087fe3916845b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -17,7 +17,8 @@ option_context, period_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class NumericSeriesIndexing: @@ -131,6 +132,7 @@ def setup(self): self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 self.bool_obj_indexer = self.bool_indexer.astype(object) + self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean") def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ -144,6 +146,9 @@ def time_boolean_rows(self): def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] + def time_boolean_rows_boolean(self): + self.df[self.boolean_indexer] + class DataFrameNumericIndexing: def setup(self): diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index e85b3bd2c7687..1a8d5ede52512 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Series, to_numeric -import pandas.util.testing as tm -from .pandas_vb_common import lib, numeric_dtypes +from .pandas_vb_common import lib, numeric_dtypes, tm class NumericInferOps: diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b8e8630e663ee..9bcd125f56bbb 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -5,9 +5,8 @@ import numpy as np from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ToCSV(BaseIO): diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 75d87140488e3..80af2cff41769 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -6,7 +6,8 @@ from odf.text import P from pandas import DataFrame, ExcelWriter, date_range, read_excel -import pandas.util.testing as tm + +from ..pandas_vb_common import tm def _generate_dataframe(): diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 88c1a3dc48ea4..4ca399a293a4b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, HDFStore, date_range, read_hdf -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class HDFStoreDataFrame(BaseIO): diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 8f037e94e0095..f478bf2aee0ba 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, concat, date_range, read_json, timedelta_range -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ReadJSON(BaseIO): @@ -132,6 
+131,30 @@ def peakmem_to_json_wide(self, orient, frame): df.to_json(self.fname, orient=orient) +class ToJSONISO(BaseIO): + fname = "__test__.json" + params = [["split", "columns", "index", "values", "records"]] + param_names = ["orient"] + + def setup(self, orient): + N = 10 ** 5 + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") + self.df = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + + def time_iso_format(self, orient): + self.df.to_json(orient=orient, date_format="iso") + + class ToJSONLines(BaseIO): fname = "__test__.json" diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 12620656dd2bf..4ca9a82ae4827 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_pickle -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Pickle(BaseIO): diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 6cc7f56ae3d65..b71bb832280b9 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -4,7 +4,8 @@ from sqlalchemy import create_engine from pandas import DataFrame, date_range, read_sql_query, read_sql_table -import pandas.util.testing as tm + +from ..pandas_vb_common import tm class SQL: diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index f3125f8598418..9faafa82ff46e 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_stata -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Stata(BaseIO): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 5cf9f6336ba0c..1333b3a0f0560 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import merge_ordered diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 5a396c9f0deff..0e188c58012fa 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, RangeIndex, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class GetLoc: diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 1faf13329110d..6da2b2270c04a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -13,6 +13,13 @@ except (ImportError, TypeError, ValueError): pass +# Compatibility import for the testing module +try: + import pandas._testing as tm # noqa +except ImportError: + import pandas.util.testing as tm # noqa + + numeric_dtypes = [ np.int64, np.int32, diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index cd450f801c805..03394e6fe08cb 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ 
-1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range -import pandas.util.testing as tm -from .pandas_vb_common import lib +from .pandas_vb_common import lib, tm class Reindex: diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a3f1d92545c3f..57c625ced8a43 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,7 +3,8 @@ import numpy as np from pandas import NaT, Series, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SeriesConstructor: diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index f30b2482615bd..d7fb2775376c0 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, Series -import pandas.util.testing as tm + +from .pandas_vb_common import tm class Methods: diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index cb0b17e3553a4..55e8e839f4fae 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -19,18 +19,24 @@ jobs: ENV_FILE: ci/deps/azure-36-minimum_versions.yaml CONDA_PY: "36" PATTERN: "not slow and not network" + py36_locale_slow_old_np: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + # pandas does not use the language (zh_CN), but should support different encodings (utf8) + # we should test with encodings different from utf8, but Ubuntu doesn't seem to support any + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" py36_locale: ENV_FILE: ci/deps/azure-36-locale.yaml CONDA_PY: "36" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "it_IT.UTF-8" + LANG: "it_IT.utf8" + LC_ALL: "it_IT.utf8" + EXTRA_APT: "language-pack-it" py36_32bit: ENV_FILE: ci/deps/azure-36-32bit.yaml @@ -42,7 +48,9 @@ jobs: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" + EXTRA_APT: "language-pack-zh-hans" py37_np_dev: ENV_FILE: ci/deps/azure-37-numpydev.yaml @@ -54,10 +62,16 @@ steps: - script: | - if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - echo "Creating Environment" - ci/setup_env.sh + if [ "$(uname)" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y libc6-dev-i386 $EXTRA_APT + fi + displayName: 'Install extra packages' + + - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' + displayName: 'Set conda path' + + - script: ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 03529bd6569c6..187a5db99802f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -i + python setup.py build_ext -q -i -j 4 python -m pip install --no-build-isolation -e . 
displayName: 'Build' diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 46ace2dd9d70e..83ceb11dfcbf4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,6 +100,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of not concatenated strings' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . + else + $BASE_DIR/scripts/validate_string_concatenation.py . + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + echo "isort --version-number" isort --version-number @@ -131,8 +139,8 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" # Checks for test suite - # Check for imports from pandas.util.testing instead of `import pandas.util.testing as tm` - invgrep -R --include="*.py*" -E "from pandas.util.testing import" pandas/tests + # Check for imports from pandas._testing instead of `import pandas._testing as tm` + invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from pandas.util import testing as tm" pandas/tests RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -290,8 +298,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/string_.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/string_.py + MSG='Doctests arrays'; echo $MSG + pytest -q --doctest-modules \ + pandas/core/arrays/string_.py \ + pandas/core/arrays/integer.py \ + pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests arrays/boolean.py' ; echo $MSG diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 4f4c4524cb4dd..810554632a507 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -9,6 +9,7 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 + - pytest-asyncio - hypothesis>=3.58.0 - pytest-azurepipelines @@ -26,7 +27,7 @@ dependencies: - openpyxl # lowest supported version of pyarrow (putting it here instead of in # azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.12 + - pyarrow=0.13 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 2bb2b00319382..48ac50c001715 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - pytest-azurepipelines # pandas dependencies - - beautifulsoup4==4.6.0 + - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - matplotlib=2.2.2 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index a10fa0904a451..111ba6b020bc7 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -8,6 +8,7 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 + - pytest-asyncio - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index f393ed84ecf63..3bbbdb4cf32ad 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml 
@@ -22,7 +22,7 @@ dependencies: - numexpr - numpy=1.14 - openpyxl - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pytables - python-dateutil==2.6.1 - pytz diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 7fa9dee7445a6..663c55492e69e 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -22,7 +22,7 @@ dependencies: - numpy=1.15.* - openpyxl - jinja2 - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 928896efd5fc4..62be1075b3337 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -24,6 +24,7 @@ dependencies: - numexpr - numpy=1.14.* - openpyxl + - pyarrow=0.14 - pytables - python-dateutil - pytz diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index c1403f8eb8409..a46001c58d165 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -30,10 +30,8 @@ dependencies: - openpyxl<=3.0.1 # https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke - pandas-gbq - # https://github.com/pydata/pandas-gbq/issues/271 - - google-cloud-bigquery<=1.11 - psycopg2 - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pymysql - pytables - python-snappy diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 0b68164e5767e..8020680d617d7 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,17 +5,6 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE" - export LANG="$LOCALE_OVERRIDE" - PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` - if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then - echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" - # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed - # exit 1 - fi -fi - if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 2b488295b5cc2..db28eaea8956e 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,15 +1,15 @@ #!/bin/bash -e # edit the locale file if needed -if [ -n "$LOCALE_OVERRIDE" ]; then +if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" sed -i "$SEDC" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py echo - sudo locale-gen "$LOCALE_OVERRIDE" fi MINICONDA_DIR="$HOME/miniconda3" diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index f92090fecccf3..47f63c11d0567 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,5 +36,5 @@ test: about: - home: http://pandas.pydata.org + home: https://pandas.pydata.org license: BSD diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst new file mode 100644 index 0000000000000..2fc2f1fb6ee8d --- /dev/null +++ b/doc/source/development/code_style.rst @@ -0,0 +1,129 @@ +.. _code_style: + +{{ header }} + +======================= +pandas code style guide +======================= + +.. 
contents:: Table of contents: :local: Patterns ======== foo.__class__ ------------- *pandas* uses 'type(foo)' instead of 'foo.__class__' as it makes the code more readable. For example: **Good:** .. code-block:: python foo = "bar" type(foo) **Bad:** .. code-block:: python foo = "bar" foo.__class__ String formatting ================= Concatenated strings -------------------- f-strings ~~~~~~~~~ *pandas* uses f-string formatting instead of '%' and '.format()' string formatters. The convention when using f-strings on a string that is concatenated over several lines is to prefix only the lines containing values that need to be interpreted. For example: **Good:** .. code-block:: python foo = "old_function" bar = "new_function" my_warning_message = ( f"Warning, {foo} is deprecated, " "please use the new and way better " f"{bar}" ) **Bad:** .. code-block:: python foo = "old_function" bar = "new_function" my_warning_message = ( f"Warning, {foo} is deprecated, " f"please use the new and way better " f"{bar}" ) White spaces ~~~~~~~~~~~~ Put the whitespace only at the end of the previous line, so there is no whitespace at the beginning of the concatenated string. For example: **Good:** .. code-block:: python example_string = ( "Some long concatenated string, " "with good placement of the " "whitespaces" ) **Bad:** .. code-block:: python example_string = ( "Some long concatenated string," " with bad placement of the" " whitespaces" ) Representation function (aka 'repr()') -------------------------------------- *pandas* uses 'repr()' instead of '%r' and '!r'. The use of 'repr()' will only happen when the value is not an obvious string. For example: **Good:** .. code-block:: python value = str f"Unknown received value, got: {repr(value)}" **Good:** .. code-block:: python value = str f"Unknown received type, got: '{type(value).__name__}'" diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index d7b3e159f8ce7..2dc5ed07544d1 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -434,7 +434,7 @@ The utility script ``scripts/validate_docstrings.py`` can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of methods documented in ``doc/source/api.rst`` (which is used to generate -the `API Reference `_ page) +the `API Reference `_ page) and the actual public methods. This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. @@ -569,8 +569,7 @@ do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -Additional standards are outlined on the `code style wiki -page `_. +Additional standards are outlined on the `pandas code style guide `_ Optional dependencies --------------------- @@ -636,6 +635,8 @@ many errors as possible, but it may not correct *all* of them. Thus, it is recommended that you run ``cpplint`` to double check and make any other style fixes manually. +.. 
_contributing.code-formatting: + Python (PEP8 / black) ~~~~~~~~~~~~~~~~~~~~~ @@ -657,19 +658,8 @@ apply ``black`` as you edit files. You should use a ``black`` version >= 19.10b0 as previous versions are not compatible with the pandas codebase. -Optionally, you may wish to setup `pre-commit hooks `_ -to automatically run ``black`` and ``flake8`` when you make a git commit. This -can be done by installing ``pre-commit``:: - - pip install pre-commit - -and then running:: - - pre-commit install - -from the root of the pandas repository. Now ``black`` and ``flake8`` will be run -each time you commit changes. You can skip these checks with -``git commit --no-verify``. +If you wish to run these checks automatically, we encourage you to use +:ref:`pre-commits ` instead. One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this command will catch any stylistic errors in your changes specifically, but @@ -677,7 +667,7 @@ be beware it may not catch all of them. For example, if you delete the only usage of an imported function, it is stylistically incorrect to import an unused function. However, style-checking the diff will not catch this because the actual import is not part of the diff. Thus, for completeness, you should -run this command, though it will take longer:: +run this command, though it may take longer:: git diff upstream/master --name-only -- "*.py" | xargs -r flake8 @@ -695,6 +685,8 @@ behaviour as follows:: This will get all the files being changed by the PR (and ending with ``.py``), and run ``flake8`` on them, one after the other. +Note that these commands can be run analogously with ``black``. + .. _contributing.import-formatting: Import formatting ~~~~~~~~~~~~~~~~~ @@ -717,7 +709,6 @@ A summary of our current import sections ( in order ): Imports are alphabetically sorted within these sections. - As part of :ref:`Continuous Integration ` checks we run:: isort --recursive --check-only pandas @@ -741,8 +732,37 @@ to automatically format imports correctly. This will modify your local copy of t The `--recursive` flag can be passed to sort all files in a directory. +Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: + + git diff upstream/master --name-only -- "*.py" | xargs -r isort + +Similar caveats apply if you are on OSX or Windows. + You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. +.. _contributing.pre-commit: + +Pre-Commit +~~~~~~~~~~ + +You can run many of these styling checks manually as we have described above. However, +we encourage you to use `pre-commit hooks `_ instead +to automatically run ``black``, ``flake8``, and ``isort`` when you make a git commit. This +can be done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the pandas repository. Now all of the styling checks will be +run each time you commit changes without your needing to run each one manually. +In addition, using this pre-commit hook will also allow you to more easily +remain up-to-date with our code checks as they change. + +Note that if needed, you can skip these checks with ``git commit --no-verify``. Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -957,7 +977,7 @@ inspiration. If your test requires working with files or network connectivity, there is more information on the `testing page `_ of the wiki. 
-The ``pandas.util.testing`` module has many special ``assert`` functions that +The ``pandas._testing`` module has many special ``assert`` functions that make it easier to make statements about whether Series or DataFrame objects are equivalent. The easiest way to verify that your code is correct is to explicitly construct the result you expect, then compare the actual result to @@ -1143,7 +1163,7 @@ If your change involves checking that a warning is actually emitted, use .. code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm df = pd.DataFrame() @@ -1364,6 +1384,7 @@ some common prefixes along with general guidelines for when to use them: * TST: Additions/updates to tests * BLD: Updates to the build process/scripts * PERF: Performance improvement +* TYP: Type annotations * CLN: Code cleanup The following defines how a commit message should be structured. Please reference the diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 34bc5f44eb0c0..cb32f0e1ee475 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -22,39 +22,39 @@ Next example gives an idea on how a docstring looks like: .. code-block:: python def add(num1, num2): - """ - Add up two integer numbers. - - This function simply wraps the `+` operator, and does not - do anything interesting, except for illustrating what is - the docstring of a very simple function. - - Parameters - ---------- - num1 : int - First number to add - num2 : int - Second number to add - - Returns - ------- - int - The sum of `num1` and `num2` - - See Also - -------- - subtract : Subtract one integer from another - - Examples - -------- - >>> add(2, 2) - 4 - >>> add(25, 0) - 25 - >>> add(10, -10) - 0 - """ - return num1 + num2 + """ + Add up two integer numbers. + + This function simply wraps the `+` operator, and does not + do anything interesting, except for illustrating what is + the docstring of a very simple function. + + Parameters + ---------- + num1 : int + First number to add + num2 : int + Second number to add + + Returns + ------- + int + The sum of `num1` and `num2` + + See Also + -------- + subtract : Subtract one integer from another + + Examples + -------- + >>> add(2, 2) + 4 + >>> add(25, 0) + 25 + >>> add(10, -10) + 0 + """ + return num1 + num2 Some standards exist about docstrings, so they are easier to read, and they can be exported to other formats such as html or pdf. @@ -399,7 +399,7 @@ DataFrame: * DataFrame * pandas.Index * pandas.Categorical -* pandas.SparseArray +* pandas.arrays.SparseArray If the exact type is not relevant, but must be compatible with a numpy array, array-like can be specified. If Any type that can be iterated is diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index 757b197c717e6..f8a6bb6deb52d 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -13,6 +13,7 @@ Development :maxdepth: 2 contributing + code_style maintaining internals extending diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index 00598830e2fe9..fafe63d80249c 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -129,20 +129,6 @@ Some specific goals include * Improve the overall organization of the documentation and specific subsections of the documentation to make navigation and finding content easier. 
-Package docstring validation ---------------------------- - -To improve the quality and consistency of pandas docstrings, we've developed -tooling to check docstrings in a variety of ways. -https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py -contains the checks. - -Like many other projects, pandas uses the -`numpydoc `__ style for writing -docstrings. With the collaboration of the numpydoc maintainers, we'd like to -move the checks to a package other than pandas so that other projects can easily -use them as well. - Performance monitoring ---------------------- diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 48c722bc16a86..7bd5ba7ecdf0b 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -244,8 +244,8 @@ Pandas DataFrames with timeseries indexes. `pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the -`Thomson Dataworks Enterprise (DWE/Datastream) `__ -SOAP API to return indexed Pandas DataFrames with financial data. +`Refinitiv Datastream (DWS) `__ +REST API to return indexed Pandas DataFrames with financial data. This package requires valid credentials for this API (non free). `pandaSDMX `__ @@ -327,6 +327,21 @@ PyTables, h5py, and pymongo to move data between non pandas formats. Its graph based approach is also extensible by end users for custom formats that may be too specific for the core of odo. +`Pandarallel `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. +It also displays progress bars. + +.. code:: python + + from pandarallel import pandarallel + + pandarallel.initialize(progress_bar=True) + + # df.apply(func) + df.parallel_apply(func) + `Ray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -380,4 +395,3 @@ Library Accessor Classes .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest .. _pdvega: https://altair-viz.github.io/pdvega/ - diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 66e500131b316..3055a22129b91 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -697,8 +697,9 @@ Plotting See the :ref:`Plotting ` docs. +We use the standard convention for referencing the matplotlib API: + .. ipython:: python - :suppress: import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index d489d35dc1226..4fef5efbd1551 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1937,21 +1937,36 @@ See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party libraries that have implemented an extension. -The following table lists all of pandas extension types. See the respective +The following table lists all of pandas extension types. For methods requiring ``dtype`` +arguments, strings can be specified as indicated. See the respective 
-=================== ========================= ================== ============================= ============================= -Kind of Data Data Type Scalar Array Documentation -=================== ========================= ================== ============================= ============================= -tz-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` -Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` -period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` -sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` -intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` -nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` -Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` -Boolean (with NA) :class:`BooleanDtype` :class:`bool` :class:`arrays.BooleanArray` :ref:`api.arrays.bool` -=================== ========================= ================== ============================= ============================= ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Kind of Data | Data Type | Scalar | Array | String Aliases | Documentation | ++===================+===========================+====================+===============================+=========================================+===============================+ +| tz-aware datetime | :class:`DatetimeTZDtype` | :class:`Timestamp` | :class:`arrays.DatetimeArray` | ``'datetime64[ns, ]'`` | :ref:`timeseries.timezone` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Categorical | :class:`CategoricalDtype` | (none) | :class:`Categorical` | ``'category'`` | :ref:`categorical` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| period | :class:`PeriodDtype` | :class:`Period` | :class:`arrays.PeriodArray` | ``'period[]'``, | :ref:`timeseries.periods` | +| (time spans) | | | | ``'Period[]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| sparse | :class:`SparseDtype` | (none) | :class:`arrays.SparseArray` | ``'Sparse'``, ``'Sparse[int]'``, | :ref:`sparse` | +| | | | | ``'Sparse[float]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| intervals | :class:`IntervalDtype` | :class:`Interval` | :class:`arrays.IntervalArray` | ``'interval'``, ``'Interval'``, | :ref:`advanced.intervalindex` | +| | | | | ``'Interval[]'``, | | +| | | | | ``'Interval[datetime64[ns, ]]'``, | | +| | | | | ``'Interval[timedelta64[]]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| nullable integer + :class:`Int64Dtype`, ... 
| (none) | :class:`arrays.IntegerArray` | ``'Int8'``, ``'Int16'``, ``'Int32'``, | :ref:`integer_na` | +| | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``, | | +| | | | | ``'UInt32'``, ``'UInt64'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Strings | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` | :ref:`text` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ Pandas has two ways to store strings. diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 69bb700c97b15..4e284fe7b5968 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -629,7 +629,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index db687386329bb..fec6bae1e0330 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -617,7 +617,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index a07fcbd8b67c4..8bd271815549d 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -676,11 +676,11 @@ similar to an ndarray: # only show the first 5 rows df[:5].T +.. _dsintro.numpy_interop: + DataFrame interoperability with NumPy functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _dsintro.numpy_interop: - Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions can be used with no issues on Series and DataFrame, assuming the data within are numeric: @@ -741,7 +741,7 @@ implementation takes precedence and a Series is returned. np.maximum(ser, idx) NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays, -for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible, +for example :class:`arrays.SparseArray` (see :ref:`sparse.calculation`). If possible, the ufunc is applied without converting the underlying data to an ndarray. Console display diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 212f3636d0a98..1ed0e8f635b58 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -15,7 +15,7 @@ pandas' own :ref:`10 Minutes to pandas<10min>`. 
More complex recipes are in the :ref:`Cookbook`. -A handy pandas `cheat sheet `_. +A handy pandas `cheat sheet `_. Community guides ================ diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 9cea68530fbe7..10705787dfedf 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -109,6 +109,7 @@ See the :ref:`overview` for more detail about what's in the library. * :doc:`development/index` * :doc:`development/contributing` + * :doc:`development/code_style` * :doc:`development/internals` * :doc:`development/extending` * :doc:`development/developer` diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index cf14d28772f4c..c71350ecd73b3 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -12,7 +12,8 @@ For most data types, pandas uses NumPy arrays as the concrete objects contained with a :class:`Index`, :class:`Series`, or :class:`DataFrame`. -For some data types, pandas extends NumPy's type system. +For some data types, pandas extends NumPy's type system. String aliases for these types +can be found at :ref:`basics.dtypes`. =================== ========================= ================== ============================= Kind of Data Pandas Data Type Scalar Array @@ -443,13 +444,13 @@ Sparse data ----------- Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may -be stored efficiently as a :class:`SparseArray`. +be stored efficiently as a :class:`arrays.SparseArray`. .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - SparseArray + arrays.SparseArray .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..c072237850d82 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -59,3 +59,16 @@ objects. api.extensions.ExtensionArray.nbytes api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape + +Additionally, we have some utility methods for ensuring your object +behaves correctly. + +.. autosummary:: + :toctree: api/ + + api.indexers.check_bool_array_indexer + + +The sentinel ``pandas.api.extensions.no_default`` is used as the default +value in some methods. Use an ``is`` comparison to check if the user +provides a non-default value. diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4c9df35ea8d9d..01aa6c60e3b2f 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -273,6 +273,8 @@ Metadata :attr:`DataFrame.attrs` is a dictionary for storing global metadata for this DataFrame. +.. warning:: ``DataFrame.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 0961acc43f301..0d9e0b0f4c668 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -18,6 +18,8 @@ Working with options set_option option_context +.. _api.general.testing: + Testing functions ----------------- .. 
autosummary:: @@ -26,6 +28,7 @@ Testing functions testing.assert_frame_equal testing.assert_series_equal testing.assert_index_equal + testing.assert_extension_array_equal Exceptions and warnings ----------------------- diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 0639730e2dcde..4ad6a7b014532 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -525,6 +525,8 @@ Metadata :attr:`Series.attrs` is a dictionary for storing global metadata for this Series. +.. warning:: ``Series.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 8223b831ebe2d..d6f5c0c758b60 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -565,19 +565,15 @@ When working with an ``Index`` object directly, rather than via a ``DataFrame``, mi2 = mi.rename("new name", level=0) mi2 -.. warning:: - Prior to pandas 1.0.0, you could also set the names of a ``MultiIndex`` - by updating the name of a level. +You cannot set the names of the MultiIndex via a level. - .. code-block:: none +.. ipython:: python + :okexcept: - >>> mi.levels[0].name = 'name via level' - >>> mi.names[0] # only works for older pandas - 'name via level' + mi.levels[0].name = "name via level" - As of pandas 1.0, this will *silently* fail to update the names - of the MultiIndex. Use :meth:`Index.set_names` instead. +Use :meth:`Index.set_names` instead. Sorting a ``MultiIndex`` ------------------------ diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index e0f676d3072fc..5276bc6142206 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -14,6 +14,29 @@ Nullable Boolean Data Type .. versionadded:: 1.0.0 + +.. _boolean.indexing: + +Indexing with NA values +----------------------- + +pandas does not allow indexing with NA values. Attempting to do so +will raise a ``ValueError``. + +.. ipython:: python + :okexcept: + + s = pd.Series([1, 2, 3]) + mask = pd.array([True, False, pd.NA], dtype="boolean") + s[mask] + +The missing values will need to be explicitly filled with True or False prior +to using the array as a mask. + +.. ipython:: python + + s[mask.fillna(False)] + .. _boolean.kleene: Kleene Logical Operations diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 37637bbdb38e6..f581d183b9413 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -406,10 +406,10 @@ Levels ****** `Prepending a level to a multiindex -`__ +`__ `Flatten Hierarchical columns -`__ +`__ .. _cookbook.missing_data: @@ -430,13 +430,13 @@ Fill forward a reversed timeseries df.reindex(df.index[::-1]).ffill() `cumsum reset at NaN values -`__ +`__ Replace ******* `Using replace with backrefs -`__ +`__ .. _cookbook.grouping: @@ -446,7 +446,7 @@ Grouping The :ref:`grouping ` docs. `Basic grouping with apply -`__ +`__ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to all the columns @@ -462,7 +462,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group -`__ +`__ .. ipython:: python @@ -470,7 +470,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.get_group('cat') `Apply to different items in a group -`__ +`__ .. 
ipython:: python @@ -486,7 +486,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to expected_df `Expanding apply -`__ +`__ .. ipython:: python @@ -502,7 +502,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to `Replacing some values with mean of the rest of a group -`__ +`__ .. ipython:: python @@ -516,7 +516,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.transform(replace) `Sort groups by aggregated data -`__ +`__ .. ipython:: python @@ -533,7 +533,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to sorted_df `Create multiple aggregated columns -`__ +`__ .. ipython:: python @@ -550,7 +550,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to ts `Create a value counts column and reassign back to the DataFrame -`__ +`__ .. ipython:: python @@ -561,7 +561,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Shift groups of the values in a column based on the index -`__ +`__ .. ipython:: python @@ -575,7 +575,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Select row with maximum value from each group -`__ +`__ .. ipython:: python @@ -587,7 +587,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df_count `Grouping like Python's itertools.groupby -`__ +`__ .. ipython:: python @@ -599,19 +599,19 @@ Expanding data ************** `Alignment and to-date -`__ +`__ `Rolling Computation window based on values instead of counts -`__ +`__ `Rolling Mean by Time Interval -`__ +`__ Splitting ********* `Splitting a frame -`__ +`__ Create a list of dataframes, split using a delineation based on logic included in rows. @@ -635,7 +635,7 @@ Pivot The :ref:`Pivot ` docs. `Partial sums and subtotals -`__ +`__ .. ipython:: python @@ -649,7 +649,7 @@ The :ref:`Pivot ` docs. table.stack('City') `Frequency table like plyr in R -`__ +`__ .. ipython:: python @@ -675,7 +675,7 @@ The :ref:`Pivot ` docs. 'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data -`__ +`__ To create year and month cross tabulation: @@ -691,7 +691,7 @@ Apply ***** `Rolling apply to organize - Turning embedded lists into a MultiIndex frame -`__ +`__ .. ipython:: python @@ -707,7 +707,7 @@ Apply df_orgz `Rolling apply with a DataFrame returning a Series -`__ +`__ Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned @@ -727,7 +727,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc s `Rolling apply with a DataFrame returning a Scalar -`__ +`__ Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) @@ -753,26 +753,26 @@ Timeseries ---------- `Between times -`__ +`__ `Using indexer between time -`__ +`__ `Constructing a datetime range that excludes weekends and includes only certain times -`__ +`__ `Vectorized Lookup -`__ +`__ `Aggregation and plotting time series `__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? -`__ +`__ `Dealing with duplicates when reindexing a timeseries to a specified frequency -`__ +`__ Calculate the first day of the month for each entry in a DatetimeIndex @@ -795,7 +795,7 @@ The :ref:`Resample ` docs. 
`__ `Valid frequency arguments to Grouper -`__ +`__ `Grouping using a MultiIndex `__ @@ -804,15 +804,15 @@ The :ref:`Resample ` docs. `__ `Resampling with custom periods -`__ +`__ `Resample intraday frame without adding new days -`__ +`__ `Resample minute data -`__ +`__ -`Resample with groupby `__ +`Resample with groupby `__ .. _cookbook.merge: @@ -822,7 +822,7 @@ Merge The :ref:`Concat ` docs. The :ref:`Join ` docs. `Append two dataframes with overlapping index (emulate R rbind) -`__ +`__ .. ipython:: python @@ -855,16 +855,16 @@ Depending on df construction, ``ignore_index`` may be needed suffixes=('_L', '_R')) `How to set the index and join -`__ +`__ `KDB like asof join -`__ +`__ `Join with a criteria based on the values -`__ +`__ `Using searchsorted to merge based on values inside a range -`__ +`__ .. _cookbook.plotting: @@ -874,31 +874,31 @@ Plotting The :ref:`Plotting ` docs. `Make Matplotlib look like R -`__ +`__ `Setting x-axis major and minor labels -`__ +`__ `Plotting multiple charts in an ipython notebook -`__ +`__ `Creating a multi-line plot -`__ +`__ `Plotting a heatmap -`__ +`__ `Annotate a time-series plot -`__ +`__ `Annotate a time-series plot #2 -`__ +`__ `Generate Embedded plots in excel files using Pandas, Vincent and xlsxwriter `__ `Boxplot for each quartile of a stratifying variable -`__ +`__ .. ipython:: python @@ -918,7 +918,7 @@ Data In/Out ----------- `Performance comparison of SQL vs HDF5 -`__ +`__ .. _cookbook.csv: @@ -930,25 +930,25 @@ The :ref:`CSV ` docs `read_csv in action `__ `appending to a csv -`__ +`__ `Reading a csv chunk-by-chunk -`__ +`__ `Reading only certain rows of a csv chunk-by-chunk -`__ +`__ `Reading the first few lines of a frame -`__ +`__ Reading a file that is compressed but not by ``gzip/bz2`` (the native compressed formats which ``read_csv`` understands). This example shows a ``WinZipped`` file, but is a general application of opening the file within a context manager and using that handle to read. `See here -`__ +`__ `Inferring dtypes from a file -`__ +`__ `Dealing with bad lines `__ @@ -960,7 +960,7 @@ using that handle to read. `__ `Write a multi-row index CSV without writing duplicates -`__ +`__ .. _cookbook.csv.multiple_files: @@ -1069,7 +1069,7 @@ SQL The :ref:`SQL ` docs `Reading from databases with SQL -`__ +`__ .. _cookbook.excel: @@ -1079,7 +1079,7 @@ Excel The :ref:`Excel ` docs `Reading from a filelike handle -`__ +`__ `Modifying formatting in XlsxWriter output `__ @@ -1090,7 +1090,7 @@ HTML **** `Reading HTML tables from a server that cannot handle the default request -header `__ +header `__ .. _cookbook.hdf: @@ -1100,54 +1100,54 @@ HDFStore The :ref:`HDFStores ` docs `Simple queries with a Timestamp Index -`__ +`__ `Managing heterogeneous data using a linked multiple table hierarchy `__ `Merging on-disk tables with millions of rows -`__ +`__ `Avoiding inconsistencies when writing to a store from multiple processes/threads -`__ +`__ De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from csv file and creating a store by chunks, with date parsing as well. 
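A minimal sketch of that chunked pattern (the file names, the ``df`` key, and the ``date`` column below are illustrative, not taken from the linked recipe):

.. code-block:: python

    import pandas as pd

    # Build the store chunk-by-chunk from a large CSV, parsing dates as we go;
    # each parsed chunk is appended to the same on-disk table.
    store = pd.HDFStore("store.h5", mode="w")
    for chunk in pd.read_csv("data.csv", chunksize=100_000, parse_dates=["date"]):
        store.append("df", chunk, data_columns=["date"])
    store.close()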
`See here -`__ +`__ `Creating a store chunk-by-chunk from a csv file -`__ +`__ `Appending to a store, while creating a unique index -`__ +`__ `Large Data work flows -`__ +`__ `Reading in a sequence of files, then providing a global unique index to a store while appending -`__ +`__ `Groupby on a HDFStore with low group density -`__ +`__ `Groupby on a HDFStore with high group density -`__ +`__ `Hierarchical queries on a HDFStore -`__ +`__ `Counting with a HDFStore -`__ +`__ `Troubleshoot HDFStore exceptions -`__ +`__ `Setting min_itemsize with strings -`__ +`__ `Using ptrepack to create a completely-sorted-index on a store -`__ +`__ Storing Attributes to a group node @@ -1305,7 +1305,7 @@ The :ref:`Timedeltas ` docs. datetime.timedelta(minutes=5) + s `Adding and subtracting deltas and dates -`__ +`__ .. ipython:: python @@ -1322,7 +1322,7 @@ The :ref:`Timedeltas ` docs. df.dtypes `Another example -`__ +`__ Values can be set to NaT using np.nan, similar to datetime diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 0229331127441..a8cdf4a61073d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -668,7 +668,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 77568f3bcb244..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,10 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with @@ -23,6 +27,9 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` implemented within pandas. @@ -39,6 +46,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -78,6 +91,9 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +Operations +---------- + Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another dtype if needed. @@ -123,3 +139,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. Slicing a single element that's missing will return +:attr:`pandas.NA` + +.. 
ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c32b009948fda..55bbf6848820b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1153,7 +1153,7 @@ To completely override the default values that are recognized as missing, specify .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Let us consider some examples: @@ -1519,7 +1519,7 @@ rows will skip the intervening rows. .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) @@ -2066,6 +2066,8 @@ The Numpy parameter +++++++++++++++++++ .. note:: + This parameter has been deprecated as of version 1.0.0 and will raise a ``FutureWarning``. + This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc. If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff @@ -2088,6 +2090,7 @@ data: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2102,6 +2105,7 @@ The speedup is less noticeable for smaller datasets: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2629,7 +2633,7 @@ that contain URLs. url_df = pd.DataFrame({ 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'http://pandas.pydata.org']}) + 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) print(url_df.to_html(render_links=True)) .. ipython:: python @@ -3877,6 +3881,8 @@ specified in the format: ``<float>(<unit>)``, where float may be signed (and fractional) store.append('dftd', dftd, data_columns=True) store.select('dftd', "C<'-3.5D'") +.. _io.query_multi: + Query MultiIndex ++++++++++++++++ @@ -4646,10 +4652,10 @@ Several caveats. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Unsupported types include ``Interval`` and actual Python object types. These will raise a helpful error message + on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0. * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data - type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, + type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols, see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
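For instance, a minimal sketch of directing the engine explicitly (the file name is illustrative):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # pick a specific engine rather than relying on the default
    df.to_parquet("example.parquet", engine="pyarrow")
    df.to_parquet("example.parquet", engine="fastparquet")
    # engine="auto" (the default) tries pyarrow first, then falls back to fastparquet
    df.to_parquet("example.parquet", engine="auto")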
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index da593bcb6e923..abbb6feef6056 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -920,3 +920,29 @@ filling missing values beforehand. A similar situation occurs when using Series or DataFrame objects in ``if`` statements, see :ref:`gotchas.truth`. + +NumPy ufuncs +------------ + +:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs +work with ``NA``, and generally return ``NA``: + +.. ipython:: python + + np.log(pd.NA) + np.add(pd.NA, 1) + +.. warning:: + + Currently, ufuncs involving an ndarray and ``NA`` will return an + object-dtype filled with NA values. + + .. ipython:: python + + a = np.array([1, 2, 3]) + np.greater(a, pd.NA) + + The return type here may change to return a different array type + in the future. + +See :ref:`dsintro.numpy_interop` for more on ufuncs. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8583a9312b690..b28354cd8b5f2 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -14,7 +14,7 @@ Reshaping by pivoting DataFrame objects .. ipython:: python :suppress: - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 def unpivot(frame): @@ -38,7 +38,7 @@ For the curious here is how the above ``DataFrame`` was created: .. code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 0611c6334937f..43bb4966ec5bf 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -26,7 +26,7 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on .. ipython:: python :suppress: - from pandas.util.testing import _make_timeseries + from pandas._testing import _make_timeseries # Make a random in-memory dataset ts = _make_timeseries(freq="30S", seed=0) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index c258a8840b714..8588fac4a18d0 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -15,7 +15,7 @@ can be chosen, including 0) is omitted. The compressed values are not actually s arr = np.random.randn(10) arr[2:-2] = np.nan - ts = pd.Series(pd.SparseArray(arr)) + ts = pd.Series(pd.arrays.SparseArray(arr)) ts Notice the dtype, ``Sparse[float64, nan]``. The ``nan`` means that elements in the @@ -51,7 +51,7 @@ identical to their dense counterparts. SparseArray ----------- -:class:`SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` +:class:`arrays.SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` for storing an array of sparse values (see :ref:`basics.dtypes` for more on extension arrays). It is a 1-dimensional ndarray-like object storing only values distinct from the ``fill_value``: @@ -61,7 +61,7 @@ only values distinct from the ``fill_value``: arr = np.random.randn(10) arr[2:5] = np.nan arr[7:8] = np.nan - sparr = pd.SparseArray(arr) + sparr = pd.arrays.SparseArray(arr) sparr A sparse array can be converted to a regular (dense) ndarray with :meth:`numpy.asarray` @@ -144,7 +144,7 @@ to ``SparseArray`` and get a ``SparseArray`` as a result. .. ipython:: python - arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + arr = pd.arrays.SparseArray([1., np.nan, np.nan, -2., np.nan]) np.abs(arr) @@ -153,7 +153,7 @@ the correct dense result. .. 
ipython:: python - arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + arr = pd.arrays.SparseArray([1., -1, -1, -2., -1], fill_value=-1) np.abs(arr) np.abs(arr).to_dense() @@ -194,7 +194,7 @@ From an array-like, use the regular :class:`Series` or .. ipython:: python # New way - pd.DataFrame({"A": pd.SparseArray([0, 1])}) + pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, @@ -256,10 +256,10 @@ Instead, you'll need to ensure that the values being assigned are sparse .. ipython:: python - df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) + df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) df['B'] = [0, 0] # remains dense df['B'].dtype - df['B'] = pd.SparseArray([0, 0]) + df['B'] = pd.arrays.SparseArray([0, 0]) df['B'].dtype The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 633827eb79f46..02550eab86913 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1063,7 +1063,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](http://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 7a8400d124b22..88c86ac212f11 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -87,8 +87,9 @@ l. For ``StringDtype``, :ref:`string accessor methods` .. ipython:: python - s.astype(object).str.count("a") - s.astype(object).dropna().str.count("a") + s2 = pd.Series(["a", None, "b"], dtype="object") + s2.str.count("a") + s2.dropna().str.count("a") When NA values are present, the output dtype is float64. Similarly for methods returning boolean values. diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 86ff338536f80..823e177f3e05e 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -236,7 +236,7 @@ I/O enhancements .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 6242c40d44bf8..4f9ab761334e7 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -224,7 +224,7 @@ Enhancements .. code-block:: ipython - In [28]: import pandas.util.testing as tm + In [28]: import pandas._testing as tm In [29]: panel = tm.makePanel(5) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index b328e549e8899..95e354e425143 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -852,7 +852,7 @@ Other notable API changes: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead - See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy - ``merge``, ``DataFrame.merge``, and ``ordered_merge`` now return the same type as the ``left`` argument (:issue:`7737`). diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index b58eabaed6127..292351c709940 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -172,7 +172,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index fc638e35ed88b..855d0b8695bb1 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -528,7 +528,7 @@ Deprecations `seaborn `_ for similar but more refined functionality (:issue:`3445`). The documentation includes some examples how to convert your existing code - from ``rplot`` to seaborn `here `__. + from ``rplot`` to seaborn `here `__. - The ``pandas.sandbox.qtpandas`` interface is deprecated and will be removed in a future version. We refer users to the external package `pandas-qt `_. (:issue:`9615`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index a7174c6325f86..d3f96d4185d65 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1279,7 +1279,7 @@ Bug Fixes - Removed ``millisecond`` property of ``DatetimeIndex``. This would always raise a ``ValueError`` (:issue:`12019`). - Bug in ``Series`` constructor with read-only data (:issue:`11502`) -- Removed ``pandas.util.testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) +- Removed ``pandas._testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) - Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`) - Bug in ``.style`` indexes and MultiIndexes not appearing (:issue:`11655`) - Bug in ``to_msgpack`` and ``from_msgpack`` which did not correctly serialize or deserialize ``NaT`` (:issue:`12307`). diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 6f6446c3f74e1..6eb509a258430 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1225,6 +1225,7 @@ Previously, sparse data were ``float64`` dtype by default, even if all inputs we As of v0.19.0, sparse data keeps the input dtype, and uses more appropriate ``fill_value`` defaults (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). .. 
ipython:: python + :okwarning: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) pd.SparseArray([True, False, False, False]) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index e7dc6150ffcb1..ceb1c7f27231b 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -1360,7 +1360,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [133]: import pandas.util.testing as tm + In [133]: import pandas._testing as tm In [134]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index f33943e423b25..71969c4de6b02 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -470,7 +470,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 @@ -927,7 +927,7 @@ Other API changes - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytables standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) -- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) +- Removed the ``@slow`` decorator from ``pandas._testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - Moved definition of ``MergeError`` to the ``pandas.errors`` module. - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f4c283ea742f7..b9e1b5060d1da 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -648,7 +648,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [75]: import pandas.util.testing as tm + In [75]: import pandas._testing as tm In [76]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b6b91983b8267..b18d022349001 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -354,6 +354,7 @@ When passed DataFrames whose values are sparse, :func:`concat` will now return a :class:`Series` or :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`25702`). .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) @@ -910,6 +911,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. 
**New way** .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 0, 1, 2])}) df.dtypes diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst index f73a3f956f42e..f7f54198a0f82 100644 --- a/doc/source/whatsnew/v0.25.3.rst +++ b/doc/source/whatsnew/v0.25.3.rst @@ -19,4 +19,4 @@ Groupby/resample/rolling Contributors ~~~~~~~~~~~~ -.. contributors:: v0.25.2..HEAD +.. contributors:: v0.25.2..v0.25.3 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e453b79ba0047..655b1196fc669 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1,40 +1,29 @@ -.. _whatsnew_1000: +.. _whatsnew_100: What's new in 1.0.0 (??) ------------------------ -.. warning:: +These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog +including other versions of pandas. - Starting with the 1.x series of releases, pandas only supports Python 3.6.1 and higher. +.. note:: -New Deprecation Policy -~~~~~~~~~~~~~~~~~~~~~~ - -Starting with Pandas 1.0.0, pandas will adopt a version of `SemVer`_. + The pandas 1.0 release removed a lot of functionality that was deprecated + in previous releases (see :ref:`below ` + for an overview). It is recommended to first upgrade to pandas 0.25 and to + ensure your code is working without warnings, before upgrading to pandas + 1.0. -Historically, pandas has used a "rolling" deprecation policy, with occasional -outright breaking API changes. Where possible, we would deprecate the behavior -we'd like to change, giving an option to adopt the new behavior (via a keyword -or an alternative method), and issuing a warning for users of the old behavior. -Sometimes, a deprecation was not possible, and we would make an outright API -breaking change. -We'll continue to *introduce* deprecations in major and minor releases (e.g. -1.0.0, 1.1.0, ...). Those deprecations will be *enforced* in the next major -release. +New Deprecation Policy +~~~~~~~~~~~~~~~~~~~~~~ -Note that *behavior changes* and *API breaking changes* are not identical. API -breaking changes will only be released in major versions. If we consider a -behavior to be a bug, and fixing that bug induces a behavior change, we'll -release that change in a minor release. This is a sometimes difficult judgment -call that we'll do our best on. +Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +version releases. Briefly, -This doesn't mean that pandas' pace of development will slow down. In the `2019 -Pandas User Survey`_, about 95% of the respondents said they considered pandas -"stable enough". This indicates there's an appetite for new features, even if it -comes at the cost of break API. The difference is that now API breaking changes -will be accompanied with a bump in the major version number (e.g. pandas 1.5.1 --> 2.0.0). +* Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) +* Deprecations will be enforced in major releases (e.g. 1.0.0, 2.0.0, 3.0.0, ...) +* API-breaking changes will be made only in major releases (except for experimental features) See :ref:`policies.version` for more. @@ -43,20 +32,63 @@ See :ref:`policies.version` for more. {{ header }} -These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog -including other versions of pandas. - +.. --------------------------------------------------------------------------- Enhancements ~~~~~~~~~~~~ +.. 
_whatsnew_100.NA: + +Experimental ``NA`` scalar to denote missing values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A new ``pd.NA`` value (singleton) is introduced to represent scalar missing +values. Up to now, pandas has used several values to represent missing data: ``np.nan`` for float data, ``np.nan`` or +``None`` for object-dtype data, and ``pd.NaT`` for datetime-like data. The +goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types. ``pd.NA`` is currently used by the nullable integer and boolean +data types and the new string data type (:issue:`28095`). + +.. warning:: + + Experimental: the behaviour of ``pd.NA`` can still change without warning. + +For example, creating a Series using the nullable integer dtype: + +.. ipython:: python + + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + +Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. +In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" +or "unknown" in comparison operations: + +.. ipython:: python + + np.nan > 1 + pd.NA > 1 + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*). For example: + +.. ipython:: python + + pd.NA | True + +For more, see :ref:`NA section ` in the user guide on missing +data. + + .. _whatsnew_100.string: Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`StringDtype`, an extension type dedicated to string data. -Previously, strings were typically stored in object-dtype NumPy arrays. +Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`) .. warning:: @@ -102,59 +134,15 @@ String accessor methods returning integers will return a value with :class:`Int6 We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. -.. _whatsnew_100.NA: - -Experimental ``NA`` scalar to denote missing values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A new ``pd.NA`` value (singleton) is introduced to represent scalar missing -values. Up to now, ``np.nan`` is used for this for float data, ``np.nan`` or -``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The -goal of ``pd.NA`` is provide a "missing" indicator that can be used -consistently across data types. For now, the nullable integer and boolean -data types and the new string data type make use of ``pd.NA`` (:issue:`28095`). - -.. warning:: - - Experimental: the behaviour of ``pd.NA`` can still change without warning. - -For example, creating a Series using the nullable integer dtype: - -.. ipython:: python - - s = pd.Series([1, 2, None], dtype="Int64") - s - s[2] - -Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. -In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" -or "unknown" in comparison operations: - -.. ipython:: python - - np.nan > 1 - pd.NA > 1 - -For logical operations, ``pd.NA`` follows the rules of the -`three-valued logic `__ (or -*Kleene logic*). For example: - -.. ipython:: python - - pd.NA | True - -For more, see :ref:`NA section ` in the user guide on missing -data. - .. _whatsnew_100.boolean: Boolean data type with missing values support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`BooleanDtype` / :class:`~arrays.BooleanArray`, an extension -type dedicated to boolean data that can hold missing values. 
With the default -``'bool`` data type based on a numpy bool array, the column can only hold -True or False values and not missing values. This new :class:`BooleanDtype` +type dedicated to boolean data that can hold missing values. With the default +``bool`` data type, based on a bool-dtype NumPy array, the column can only hold +``True`` or ``False``, and not missing values. This new :class:`~arrays.BooleanArray` can store missing values as well by keeping track of this in a separate mask. (:issue:`29555`, :issue:`30095`) @@ -169,18 +157,18 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s -.. _whatsnew_1000.numba_rolling_apply: +.. _whatsnew_100.numba_rolling_apply: Using Numba in ``rolling.apply`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added an ``engine`` keyword to :meth:`~Rolling.apply` that allows the user to execute the +We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the routine using `Numba `__ instead of Cython. Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger (1 million rows or greater). For more details, see :ref:`rolling apply documentation ` (:issue:`28987`) -.. _whatsnew_1000.custom_window: +.. _whatsnew_100.custom_window: Defining custom windows for rolling operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -191,12 +179,25 @@ method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate indices used for each window during the rolling aggregation. For more details and example usage, see the :ref:`custom window rolling documentation ` -.. _whatsnew_1000.enhancements.other: +.. _whatsnew_100.to_markdown: + +Converting to Markdown +^^^^^^^^^^^^^^^^^^^^^^ + +We've added :meth:`~DataFrame.to_markdown` for creating a markdown table (:issue:`11052`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=['a', 'a', 'b']) + print(df.to_markdown()) + +.. _whatsnew_100.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the @@ -212,17 +213,24 @@ Other enhancements - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) -- Roundtripping DataFrames with nullable integer or string data types to parquet +- Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). + now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). 
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) +- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) -- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) +- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`) - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) -- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` added (:issue:`11052`) - +- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`) +- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`) +- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`) +- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) +- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) +- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) + Build Changes ^^^^^^^^^^^^^ @@ -232,12 +240,14 @@ cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :i a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. -.. _whatsnew_1000.api_breaking: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_1000.api_breaking.MultiIndex._names: +.. _whatsnew_100.api_breaking.MultiIndex._names: Avoid using names from ``MultiIndex.levels`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -255,10 +265,10 @@ For backwards compatibility, you can still *access* the names via the levels. mi.levels[0].name However, it is no longer possible to *update* the names of the ``MultiIndex`` -via the name of the level. The following will **silently** fail to update the -name of the ``MultiIndex`` +via the level. .. ipython:: python + :okexcept: mi.levels[0].name = "new name" mi.names @@ -285,52 +295,107 @@ New repr for :class:`~pandas.arrays.IntervalArray` closed='right', dtype='interval[int64]') - *pandas 1.0.0* .. 
ipython:: python pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +``DataFrame.rename`` now only accepts one positional argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) +- :meth:`DataFrame.rename` would previously accept positional arguments that would lead + to ambiguous or undefined behavior. From pandas 1.0, only the very first argument, which + maps labels to their new names along the default axis, is allowed to be passed by position + (:issue:`29136`). -- :meth:`SeriesGroupBy.count` -- :meth:`SeriesGroupBy.size` -- :meth:`SeriesGroupBy.nunique` -- :meth:`SeriesGroupBy.nth` +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + FutureWarning: ...Use named arguments to resolve ambiguity... + Out[2]: + 2 + 1 1 + +*pandas 1.0.0* .. ipython:: python + :okexcept: - df = pd.DataFrame({ - "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), - "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), - "value": [0.1] * 4, - }) - df + df.rename({0: 1}, {0: 2}) +Note that errors will now be raised when conflicting or potentially ambiguous arguments are provided. *pandas 0.25.x* .. code-block:: ipython - In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + In [1]: df.rename({0: 1}, index={0: 2}) + Out[1]: + 0 + 1 1 + + In [2]: df.rename(mapper={0: 1}, index={0: 2}) Out[2]: - cat_1 cat_2 - A A 1 - B 1 - B A 1 - B 1 - Name: value, dtype: int64 + 0 + 2 1 + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + df.rename({0: 1}, index={0: 2}) + df.rename(mapper={0: 1}, index={0: 2}) + +You can still change the axis along which the first positional argument is applied by +supplying the ``axis`` keyword argument. + +.. ipython:: python + + df.rename({0: 1}) + df.rename({0: 1}, axis=1) + +If you would like to update both the index and column labels, be sure to use the respective +keywords. + +.. ipython:: python + + df.rename(index={0: 1}, columns={0: 2}) +Extended verbose info output for :class:`~pandas.DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`) + +*pandas 0.25.x* + +.. code-block:: python + + >>> df = pd.DataFrame({"int_col": [1, 2, 3], + ... "text_col": ["a", "b", "c"], + ... "float_col": [0.0, 0.1, 0.2]}) + >>> df.info(verbose=True) + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 3 columns): + int_col 3 non-null int64 + text_col 3 non-null object + float_col 3 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 152.0+ bytes *pandas 1.0.0* .. ipython:: python - df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + df = pd.DataFrame({"int_col": [1, 2, 3], + "text_col": ["a", "b", "c"], + "float_col": [0.0, 0.1, 0.2]}) + df.info(verbose=True) :meth:`pandas.array` inference changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -365,6 +430,111 @@ The following methods now also correctly output values for unobserved categories As a reminder, you can specify the ``dtype`` to disable all inference. 
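As a short sketch of that reminder, contrast inference with an explicit ``dtype``:

.. code-block:: python

    import pandas as pd

    # without a dtype, pd.array infers a nullable extension type from the values
    pd.array([1, 2, None])                   # IntegerArray, dtype "Int64"
    # an explicit dtype disables the inference
    pd.array([1, 2, None], dtype="float64")  # float64 values, missing entry as NaN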
+:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a + a[2] + +This has a few API-breaking consequences. + +**Converting to a NumPy ndarray** + +When converting to a NumPy array missing values will be ``pd.NA``, which cannot +be converted to a float. So calling ``np.asarray(integer_array, dtype="float")`` +will now raise. + +*pandas 0.25.x* + +.. code-block:: python + + >>> np.asarray(a, dtype="float") + array([ 1., 2., nan]) + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + np.asarray(a, dtype="float") + +Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. + +.. ipython:: python + + a.to_numpy(dtype="float", na_value=np.nan) + +**value_counts returns a nullable integer dtype** + +:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable +integer dtype for the values. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + dtype('int64') + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`. + +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now returns a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. + By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -402,7 +572,14 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`. DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. Series([], dtype: float64) -.. _whatsnew_1000.api_breaking.deps: +.. _whatsnew_100.api_breaking.python: + +Increased minimum version for Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). + +.. 
_whatsnew_100.api_breaking.deps: Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -447,7 +624,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | openpyxl | 2.5.7 | X | +-----------------+-----------------+---------+ -| pyarrow | 0.12.0 | X | +| pyarrow | 0.13.0 | X | +-----------------+-----------------+---------+ | pymysql | 0.7.1 | | +-----------------+-----------------+---------+ @@ -470,14 +647,13 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. - -.. _whatsnew_1000.api.other: +.. _whatsnew_100.api.other: Other API changes ^^^^^^^^^^^^^^^^^ - Bumped the minimum supported version of ``s3fs`` from 0.0.8 to 0.3.0 (:issue:`28616`) -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) +- :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). @@ -486,22 +662,25 @@ Other API changes - Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`). Now, pandas custom formatters will only be applied to plots created by pandas, through :meth:`~DataFrame.plot`. Previously, pandas' formatters would be applied to all plots created *after* a :meth:`~DataFrame.plot`. - See :ref:`units registration ` for more. + See :ref:`units registration ` for more. - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- Added ```` to the list of default NA values for :meth:`read_csv` (:issue:`30821`) -.. _whatsnew_1000.api.documentation: +.. _whatsnew_100.api.documentation: Documentation Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). -- Added sub-section Query MultiIndex in IO tools user guide (:issue:`28791`) +- Added sub-section on :ref:`io.query_multi` for HDF5 datasets (:issue:`28791`). + +.. --------------------------------------------------------------------------- -.. _whatsnew_1000.deprecations: +.. 
_whatsnew_100.deprecations: Deprecations ~~~~~~~~~~~~ @@ -514,31 +693,66 @@ Deprecations - :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`) - :meth:`DateOffset.isAnchored` and :meth:`DatetOffset.onOffset` are deprecated and will be removed in a future version, use :meth:`DateOffset.is_anchored` and :meth:`DateOffset.is_on_offset` instead (:issue:`30340`) - ``pandas.tseries.frequencies.get_offset`` is deprecated and will be removed in a future version, use ``pandas.tseries.frequencies.to_offset`` instead (:issue:`4205`) -- :meth:`Categorical.take_nd` is deprecated, use :meth:`Categorical.take` instead (:issue:`27745`) +- :meth:`Categorical.take_nd` and :meth:`CategoricalIndex.take_nd` are deprecated, use :meth:`Categorical.take` and :meth:`CategoricalIndex.take` instead (:issue:`27745`) - The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`) - The parameter ``label`` in :func:`lreshape` has been deprecated and will be removed in a future version (:issue:`29742`) - ``pandas.core.index`` has been deprecated and will be removed in a future version, the public classes are available in the top-level namespace (:issue:`19711`) - :func:`pandas.json_normalize` is now exposed in the top-level namespace. Usage of ``json_normalize`` as ``pandas.io.json.json_normalize`` is now deprecated and it is recommended to use ``json_normalize`` as :func:`pandas.json_normalize` instead (:issue:`27586`). +- The ``numpy`` argument of :meth:`pandas.read_json` is deprecated (:issue:`28512`). - :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_feather`, and :meth:`DataFrame.to_parquet` argument "fname" is deprecated, use "path" instead (:issue:`23574`) - The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) +- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`). +- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`) +- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) +- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) -.. _whatsnew_1000.prior_deprecations: +**Selecting Columns from a Grouped DataFrame** +When selecting columns from a :class:`DataFrameGroupBy` object, passing individual keys (or a tuple of keys) inside single brackets is deprecated, +a list of items should be used instead. (:issue:`23566`) For example: -Removed SparseSeries and SparseDataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
code-block:: ipython + + df = pd.DataFrame({ + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": np.random.randn(8), + "C": np.random.randn(8), + }) + g = df.groupby('A') + + # single key, returns SeriesGroupBy + g['B'] + + # tuple of single key, returns SeriesGroupBy + g[('B',)] + + # tuple of multiple keys, returns DataFrameGroupBy, raises FutureWarning + g[('B', 'C')] + + # multiple keys passed directly, returns DataFrameGroupBy, raises FutureWarning + # (implicitly converts the passed strings into a single tuple) + g['B', 'C'] + + # proper way, returns DataFrameGroupBy + g[['B', 'C']] + +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Removed SparseSeries and SparseDataFrame** ``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method have been removed (:issue:`28425`). We recommend using a ``Series`` or ``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help with migrating existing code. -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_1000.matplotlib_units: +.. _whatsnew_100.matplotlib_units: **Matplotlib unit registration** @@ -560,40 +774,40 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. **Other removals** - Removed the previously deprecated keyword "index" from :func:`read_stata`, :class:`StataReader`, and :meth:`StataReader.read`, use "index_col" instead (:issue:`17328`) -- Removed :meth:`StataReader.data` method, use :meth:`StataReader.read` instead (:issue:`9493`) -- Removed :func:`pandas.plotting._matplotlib.tsplot`, use :meth:`Series.plot` instead (:issue:`19980`) -- :func:`pandas.tseries.converter.register` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) +- Removed ``StataReader.data`` method, use :meth:`StataReader.read` instead (:issue:`9493`) +- Removed ``pandas.plotting._matplotlib.tsplot``, use :meth:`Series.plot` instead (:issue:`19980`) +- ``pandas.tseries.converter.register`` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) - :meth:`Series.plot` no longer accepts positional arguments, pass keyword arguments instead (:issue:`30003`) - :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passinig a tuple instead (:issue:`30003`) - Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`) - :class:`TimedeltaIndex` and :class:`DatetimeIndex` no longer accept non-nanosecond dtype strings like "timedelta64" or "datetime64", use "timedelta64[ns]" and "datetime64[ns]" instead (:issue:`24806`) - Changed the default "skipna" argument in :func:`pandas.api.types.infer_dtype` from ``False`` to ``True`` (:issue:`24050`) -- Removed :attr:`Series.ix` and :attr:`DataFrame.ix` (:issue:`26438`) -- Removed :meth:`Index.summary` (:issue:`18217`) +- Removed ``Series.ix`` and ``DataFrame.ix`` (:issue:`26438`) +- Removed ``Index.summary`` (:issue:`18217`) - Removed the previously deprecated keyword "fastpath" from the :class:`Index` constructor (:issue:`23110`) -- Removed :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) -- Removed :meth:`Series.compound` and :meth:`DataFrame.compound` (:issue:`26405`) +- Removed 
``Series.get_value``, ``Series.set_value``, ``DataFrame.get_value``, ``DataFrame.set_value`` (:issue:`17739`) +- Removed ``Series.compound`` and ``DataFrame.compound`` (:issue:`26405`) - Changed the default "inplace" argument in :meth:`DataFrame.set_index` and :meth:`Series.set_axis` from ``None`` to ``False`` (:issue:`27600`) -- Removed :attr:`Series.cat.categorical`, :attr:`Series.cat.index`, :attr:`Series.cat.name` (:issue:`24751`) +- Removed ``Series.cat.categorical``, ``Series.cat.index``, ``Series.cat.name`` (:issue:`24751`) - Removed the previously deprecated keyword "box" from :func:`to_datetime` and :func:`to_timedelta`; in addition these now always returns :class:`DatetimeIndex`, :class:`TimedeltaIndex`, :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`24486`) - :func:`to_timedelta`, :class:`Timedelta`, and :class:`TimedeltaIndex` no longer allow "M", "y", or "Y" for the "unit" argument (:issue:`23264`) -- Removed the previously deprecated keyword "time_rule" from (non-public) :func:`offsets.generate_range`, which has been moved to :func:`core.arrays._ranges.generate_range` (:issue:`24157`) +- Removed the previously deprecated keyword "time_rule" from (non-public) ``offsets.generate_range``, which has been moved to :func:`core.arrays._ranges.generate_range` (:issue:`24157`) - :meth:`DataFrame.loc` or :meth:`Series.loc` with listlike indexers and missing labels will no longer reindex (:issue:`17295`) - :meth:`DataFrame.to_excel` and :meth:`Series.to_excel` with non-existent columns will no longer reindex (:issue:`17295`) - Removed the previously deprecated keyword "join_axes" from :func:`concat`; use ``reindex_like`` on the result instead (:issue:`22318`) - Removed the previously deprecated keyword "by" from :meth:`DataFrame.sort_index`, use :meth:`DataFrame.sort_values` instead (:issue:`10726`) -- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`18529`) - Passing ``datetime64`` data to :class:`TimedeltaIndex` or ``timedelta64`` data to ``DatetimeIndex`` now raises ``TypeError`` (:issue:`23539`, :issue:`23937`) - Passing ``int64`` values to :class:`DatetimeIndex` and a timezone now interprets the values as nanosecond timestamps in UTC, not wall times in the given timezone (:issue:`24559`) - A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`) -- Removed :meth:`Index.contains`, use ``key in index`` instead (:issue:`30103`) +- Removed ``Index.contains``, use ``key in index`` instead (:issue:`30103`) - Addition and subtraction of ``int`` or integer-arrays is no longer allowed in :class:`Timestamp`, :class:`DatetimeIndex`, :class:`TimedeltaIndex`, use ``obj + n * obj.freq`` instead of ``obj + n`` (:issue:`22535`) -- Removed :meth:`Series.ptp` (:issue:`21614`) -- Removed :meth:`Series.from_array` (:issue:`18258`) -- Removed :meth:`DataFrame.from_items` (:issue:`18458`) -- Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`) -- Removed :meth:`Series.asobject` (:issue:`18477`) -- Removed :meth:`DataFrame.as_blocks`, :meth:`Series.as_blocks`, `DataFrame.blocks`, :meth:`Series.blocks` (:issue:`17656`) 
+- Removed ``Series.ptp`` (:issue:`21614`) +- Removed ``Series.from_array`` (:issue:`18258`) +- Removed ``DataFrame.from_items`` (:issue:`18458`) +- Removed ``DataFrame.as_matrix``, ``Series.as_matrix`` (:issue:`18458`) +- Removed ``Series.asobject`` (:issue:`18477`) +- Removed ``DataFrame.as_blocks``, ``Series.as_blocks``, ``DataFrame.blocks``, ``Series.blocks`` (:issue:`17656`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - :meth:`Series.where` with ``Categorical`` dtype (or :meth:`DataFrame.where` with ``Categorical`` column) no longer allows setting new categories (:issue:`24114`) @@ -601,37 +815,37 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated keyword "verify_integrity" from the :class:`DatetimeIndex` and :class:`TimedeltaIndex` constructors (:issue:`23919`) - Removed the previously deprecated keyword "fastpath" from ``pandas.core.internals.blocks.make_block`` (:issue:`19265`) - Removed the previously deprecated keyword "dtype" from :meth:`Block.make_block_same_class` (:issue:`19434`) -- Removed :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) -- Removed :meth:`MultiIndex.to_hierarchical` (:issue:`21613`) -- Removed :attr:`MultiIndex.labels`, use :attr:`MultiIndex.codes` instead (:issue:`23752`) +- Removed ``ExtensionArray._formatting_values``. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- Removed ``MultiIndex.to_hierarchical`` (:issue:`21613`) +- Removed ``MultiIndex.labels``, use :attr:`MultiIndex.codes` instead (:issue:`23752`) - Removed the previously deprecated keyword "labels" from the :class:`MultiIndex` constructor, use "codes" instead (:issue:`23752`) -- Removed :meth:`MultiIndex.set_labels`, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`) +- Removed ``MultiIndex.set_labels``, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`) - Removed the previously deprecated keyword "labels" from :meth:`MultiIndex.set_codes`, :meth:`MultiIndex.copy`, :meth:`MultiIndex.drop`, use "codes" instead (:issue:`23752`) - Removed support for legacy HDF5 formats (:issue:`29787`) - Passing a dtype alias (e.g. 
'datetime64[ns, UTC]') to :class:`DatetimeTZDtype` is no longer allowed, use :meth:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`) - Removed the previously deprecated keyword "skip_footer" from :func:`read_excel`; use "skipfooter" instead (:issue:`18836`) - :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`) - Removed the previously deprecated keyword "convert_datetime64" from :meth:`DataFrame.to_records` (:issue:`18902`) -- Removed :meth:`IntervalIndex.from_intervals` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- Removed ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) - Changed the default "keep_tz" argument in :meth:`DatetimeIndex.to_series` from ``None`` to ``True`` (:issue:`23739`) -- Removed :func:`api.types.is_period` and :func:`api.types.is_datetimetz` (:issue:`23917`) +- Removed ``api.types.is_period`` and ``api.types.is_datetimetz`` (:issue:`23917`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) -- Removed :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`) +- Removed ``pandas.tseries.plotting.tsplot`` (:issue:`18627`) - Removed the previously deprecated keywords "reduce" and "broadcast" from :meth:`DataFrame.apply` (:issue:`18577`) -- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) +- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas._testing`` (:issue:`29174`) - Removed the previously deprecated ``FrozenNDArray`` class in ``pandas.core.indexes.frozen`` (:issue:`29335`) - Removed the previously deprecated keyword "nthreads" from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) -- Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) -- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) -- Removed :meth:`Series.valid`; use :meth:`Series.dropna` instead (:issue:`18800`) -- Removed :attr:`DataFrame.is_copy`, :attr:`Series.is_copy` (:issue:`18812`) -- Removed :meth:`DataFrame.get_ftype_counts`, :meth:`Series.get_ftype_counts` (:issue:`18243`) -- Removed :meth:`DataFrame.ftypes`, :meth:`Series.ftypes`, :meth:`Series.ftype` (:issue:`26744`) -- Removed :meth:`Index.get_duplicates`, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) -- Removed :meth:`Series.clip_upper`, :meth:`Series.clip_lower`, :meth:`DataFrame.clip_upper`, :meth:`DataFrame.clip_lower` (:issue:`24203`) +- Removed ``Index.is_lexsorted_for_tuple`` (:issue:`29305`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`29608`) +- Removed ``Series.valid``; use :meth:`Series.dropna` instead (:issue:`18800`) +- Removed ``DataFrame.is_copy``, ``Series.is_copy`` (:issue:`18812`) +- Removed ``DataFrame.get_ftype_counts``, ``Series.get_ftype_counts`` (:issue:`18243`) +- Removed ``DataFrame.ftypes``, ``Series.ftypes``, ``Series.ftype`` (:issue:`26744`) +- Removed ``Index.get_duplicates``, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) +- Removed 
``Series.clip_upper``, ``Series.clip_lower``, ``DataFrame.clip_upper``, ``DataFrame.clip_lower`` (:issue:`24203`) - Removed the ability to alter :attr:`DatetimeIndex.freq`, :attr:`TimedeltaIndex.freq`, or :attr:`PeriodIndex.freq` (:issue:`20772`) -- Removed :attr:`DatetimeIndex.offset` (:issue:`20730`) -- Removed :meth:`DatetimeIndex.asobject`, :meth:`TimedeltaIndex.asobject`, :meth:`PeriodIndex.asobject`, use ``astype(object)`` instead (:issue:`29801`) +- Removed ``DatetimeIndex.offset`` (:issue:`20730`) +- Removed ``DatetimeIndex.asobject``, ``TimedeltaIndex.asobject``, ``PeriodIndex.asobject``, use ``astype(object)`` instead (:issue:`29801`) - Removed the previously deprecated keyword "order" from :func:`factorize` (:issue:`19751`) - Removed the previously deprecated keyword "encoding" from :func:`read_stata` and :meth:`DataFrame.to_stata` (:issue:`21400`) - Changed the default "sort" argument in :func:`concat` from ``None`` to ``False`` (:issue:`20613`) @@ -640,36 +854,39 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`) - Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`) - Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`) -- Removed :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`) +- Removed ``Series.nonzero``, use ``to_numpy().nonzero()`` instead (:issue:`24048`) - Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`) - Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`) -- Removed :meth:`Series.put` (:issue:`27106`) -- Removed :attr:`Series.real`, :attr:`Series.imag` (:issue:`27106`) -- Removed :meth:`Series.to_dense`, :meth:`DataFrame.to_dense` (:issue:`26684`) -- Removed :meth:`Index.dtype_str`, use ``str(index.dtype)`` instead (:issue:`27106`) +- Removed ``Series.put`` (:issue:`27106`) +- Removed ``Series.real``, ``Series.imag`` (:issue:`27106`) +- Removed ``Series.to_dense``, ``DataFrame.to_dense`` (:issue:`26684`) +- Removed ``Index.dtype_str``, use ``str(index.dtype)`` instead (:issue:`27106`) - :meth:`Categorical.ravel` returns a :class:`Categorical` instead of a ``ndarray`` (:issue:`27199`) - The 'outer' method on Numpy ufuncs, e.g. 
``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`)
-- Removed :meth:`Series.get_dtype_counts` and :meth:`DataFrame.get_dtype_counts` (:issue:`27145`)
+- Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`)
- Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`)
- Changed the default value for the ``raw`` argument in ``Series.rolling().apply()``, ``DataFrame.rolling().apply()``, ``Series.expanding().apply()``, and ``DataFrame.expanding().apply()`` from ``None`` to ``False`` (:issue:`20584`)
- Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`)
- Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`)
-- Removed :attr:`Series.base`, :attr:`Index.base`, :attr:`Categorical.base`, :attr:`Series.flags`, :attr:`Index.flags`, :attr:`PeriodArray.flags`, :attr:`Series.strides`, :attr:`Index.strides`, :attr:`Series.itemsize`, :attr:`Index.itemsize`, :attr:`Series.data`, :attr:`Index.data` (:issue:`20721`)
+- Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`)
- Changed :meth:`Timedelta.resolution` to match the behavior of the standard library ``datetime.timedelta.resolution``, for the old behavior, use :meth:`Timedelta.resolution_string` (:issue:`26839`)
-- Removed :attr:`Timestamp.weekday_name`, :attr:`DatetimeIndex.weekday_name`, and :attr:`Series.dt.weekday_name` (:issue:`18164`)
+- Removed ``Timestamp.weekday_name``, ``DatetimeIndex.weekday_name``, and ``Series.dt.weekday_name`` (:issue:`18164`)
- Removed the previously deprecated keyword "errors" in :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` (:issue:`22644`)
- Changed the default "ordered" argument in :class:`CategoricalDtype` from ``None`` to ``False`` (:issue:`26336`)
- :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` now require "labels" as the first argument and "axis" as an optional named parameter (:issue:`30089`)
-- Removed :func:`to_msgpack`, :func:`read_msgpack`, :meth:`DataFrame.to_msgpack`, :meth:`Series.to_msgpack` (:issue:`27103`)
-- Removed :meth:`Series.compress` (:issue:`21930`)
+- Removed ``to_msgpack``, ``read_msgpack``, ``DataFrame.to_msgpack``, ``Series.to_msgpack`` (:issue:`27103`)
+- Removed ``Series.compress`` (:issue:`21930`)
- Removed the previously deprecated keyword "fill_value" from :meth:`Categorical.fillna`, use "value" instead (:issue:`19269`)
- Removed the previously deprecated keyword "data" from :func:`andrews_curves`, use "frame" instead (:issue:`6956`)
- Removed the previously deprecated keyword "data" from :func:`parallel_coordinates`, use "frame" instead (:issue:`6956`)
- Removed the previously deprecated keyword "colors" from :func:`parallel_coordinates`, use "color" instead (:issue:`6956`)
- Removed the previously deprecated keywords "verbose" and "private_key" from :func:`read_gbq` (:issue:`30200`)
+- Calling ``np.array`` and ``np.asarray`` on tz-aware :class:`Series` and :class:`DatetimeIndex` will now return an object
array of tz-aware :class:`Timestamp` (:issue:`24596`) - -.. _whatsnew_1000.performance: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -689,7 +906,9 @@ Performance improvements - Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) - Performance improvement in :func:`~pandas.api.types.infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) -.. _whatsnew_1000.bug_fixes: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.bug_fixes: Bug fixes ~~~~~~~~~ @@ -711,6 +930,11 @@ Categorical :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) - Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception (:issue:`30227`) +- The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + * :meth:`core.groupby.SeriesGroupBy.count` + * :meth:`core.groupby.SeriesGroupBy.size` + * :meth:`core.groupby.SeriesGroupBy.nunique` + * :meth:`core.groupby.SeriesGroupBy.nth` Datetimelike @@ -719,16 +943,18 @@ Datetimelike - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :func:`to_datetime` where passing arrays of malformed ``str`` with errors="coerce" could incorrectly lead to raising ``ValueError`` (:issue:`28299`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :meth:`core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) +- Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) -- Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) +- Bug in :func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` 
encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
- Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
+- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
- Bug in :func:`pandas.to_datetime` failing for ``deque`` objects when using ``cache=True`` (the default) (:issue:`29403`)
@@ -736,7 +962,12 @@ Datetimelike
- Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`)
- Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`)
- Bug where :meth:`DataFrame.append` would remove the timezone-awareness of new data (:issue:`30238`)
+- Bug in :meth:`Series.cummin` and :meth:`Series.cummax` with timezone-aware dtype incorrectly dropping its timezone (:issue:`15553`)
- Bug in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` where inplace addition and subtraction did not actually operate inplace (:issue:`24115`)
+- Bug in :func:`pandas.to_datetime` when called with ``Series`` storing ``IntegerArray`` raising ``TypeError`` instead of returning ``Series`` (:issue:`30050`)
+- Bug in :func:`date_range` with custom business hours as ``freq`` and given number of ``periods`` (:issue:`30593`)
+- Bug in :class:`PeriodIndex` comparisons incorrectly casting integers to :class:`Period` objects, inconsistent with the :class:`Period` comparison behavior (:issue:`30722`)
+- Bug in :meth:`DatetimeIndex.insert` raising a ``ValueError`` instead of a ``TypeError`` when trying to insert a timezone-aware :class:`Timestamp` into a timezone-naive :class:`DatetimeIndex`, or vice-versa (:issue:`30806`)

Timedelta
^^^^^^^^^
@@ -765,6 +996,8 @@ Numeric
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be cast to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
- Bug in :meth:`Series.interpolate` with ``method='index'`` and an unsorted index previously returning incorrect results (:issue:`21037`)
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
+- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
+- Bug in :class:`DataFrame` cumulative operations (e.g.
cumsum, cummax) incorrectly casting to object-dtype (:issue:`19296`)

Conversion
^^^^^^^^^^
@@ -775,7 +1008,7 @@ Conversion

Strings
^^^^^^^
-- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty Series would return an object dtype instead of bool (:issue:`29624`)
+- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty ``Series`` would return an ``object`` dtype instead of ``bool`` (:issue:`29624`)
-
@@ -784,6 +1017,9 @@ Interval
- Bug in :meth:`IntervalIndex.get_indexer` where a :class:`Categorical` or :class:`CategoricalIndex` ``target`` would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in ``pandas.core.dtypes.cast.infer_dtype_from_scalar`` where passing ``pandas_dtype=True`` did not infer :class:`IntervalDtype` (:issue:`30337`)
+- Bug in :class:`Series` constructor where constructing a ``Series`` from a ``list`` of :class:`Interval` objects resulted in ``object`` dtype instead of :class:`IntervalDtype` (:issue:`23563`)
+- Bug in :class:`IntervalDtype` where the ``kind`` attribute was incorrectly set as ``None`` instead of ``"O"`` (:issue:`30568`)
+- Bug in :class:`IntervalIndex`, :class:`~arrays.IntervalArray`, and :class:`Series` with interval data where equality comparisons were incorrect (:issue:`24112`)

Indexing
^^^^^^^^
@@ -795,9 +1031,13 @@ Indexing
- Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`)
- Bug where indexing with ``.loc`` did not work when the index was a :class:`CategoricalIndex` with non-string categories (:issue:`17569`, :issue:`30225`)
-- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
+- :meth:`Index.get_indexer_non_unique` could fail with ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`)
- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
- Bug in :meth:`DataFrame.loc` with incorrect dtype when setting a Categorical value in a 1-row DataFrame (:issue:`25495`)
+- Bug in :meth:`MultiIndex.get_loc` not finding missing values when the input includes missing values (:issue:`19132`)
+- Bug in :meth:`Series.__setitem__` incorrectly assigning values with a boolean indexer when the length of the new data matches the number of ``True`` values and the new data is not a ``Series`` or an ``np.array`` (:issue:`30567`)
+- Bug in indexing with a :class:`PeriodIndex` incorrectly accepting integers representing years, use e.g.
``ser.loc["2007"]`` instead of ``ser.loc[2007]`` (:issue:`30763`)

Missing
^^^^^^^
@@ -808,7 +1048,7 @@ Missing

MultiIndex
^^^^^^^^^^
-- Constructior for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`)
+- Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`)
- Series and MultiIndex ``.drop`` with ``MultiIndex`` raise an exception if labels are not present in the given level (:issue:`8594`)
-
@@ -833,43 +1073,47 @@ I/O
- Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`)
- Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`)
- :meth:`read_gbq` now accepts ``progress_bar_type`` to display a progress bar while the data downloads. (:issue:`29857`)
+- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by ``record_path`` would raise a ``TypeError`` (:issue:`30148`)
+- :func:`read_excel` now accepts binary data (:issue:`15914`)
+- Bug in :meth:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`)

Plotting
^^^^^^^^
- Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`)
--
- Bug in :meth:`DataFrame.plot` not able to plot when no rows (:issue:`27758`)
- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`)
- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`)
- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` producing wrong xlim on the x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`)
-- Bug where :meth:`DataFrame.boxplot` would not accept a `color` parameter like `DataFrame.plot.box` (:issue:`26214`)
+- Bug where :meth:`DataFrame.boxplot` would not accept a ``color`` parameter like :meth:`DataFrame.plot.box` (:issue:`26214`)
- Bug in the ``xticks`` argument being ignored for :meth:`DataFrame.plot.bar` (:issue:`14119`)
- :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`)
- :meth:`DataFrame.plot` now allows a ``backend`` keyword argument to allow changing between backends in one session (:issue:`28619`).
- Bug in color validation incorrectly raising for non-color styles (:issue:`29122`).
+- Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`)
- Bug in :meth:`DataFrame.hist` where ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`).
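A minimal sketch of the new per-call ``backend`` keyword from the Plotting notes above, assuming matplotlib is installed (illustrative, not part of the diff):

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": [3, 1, 2]})

# GH 28619: the plotting backend can now be chosen per call instead of
# only globally through the "plotting.backend" option.
ax = df.plot(x="x", y="y", backend="matplotlib")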
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
--
+- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`)
- Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`)
-- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`)
+- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`)
- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
- Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue:`15584`).
- Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`).
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
-- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` not able to use a lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously an error was raised if the same function was applied to the same column; now this is allowed if the newly assigned names are different. (:issue:`28426`)
-- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479)
-- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
+- :meth:`core.groupby.SeriesGroupBy.value_counts` can now handle the case where the :class:`Grouper` makes empty groups (:issue:`28479`)
+- Bug in :meth:`core.window.rolling.Rolling.quantile` ignoring ``interpolation`` keyword argument when used within a groupby (:issue:`28779`)
- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`)
-- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
- Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`)
- Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
- Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`)
+- Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causing a ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)

Reshaping
^^^^^^^^^
@@ -882,18 +1126,20 @@ Reshaping
- :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`)
- Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value.
Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`).
- Better error message in :func:`get_dummies` when ``columns`` isn't a list-like value (:issue:`28383`)
-- Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`)
+- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. (:issue:`25760`, :issue:`28956`)
+- Bug in :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`)
- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
- Bug in :meth:`DataFrame.replace` that caused a non-numeric replacer's dtype not to be respected (:issue:`26632`)
- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
--
+- Improved error message and docstring in :func:`cut` and :func:`qcut` when ``labels=True`` (:issue:`13318`)
+- Bug in :meth:`DataFrame.unstack` where the ``fill_value`` argument was not respected with a list of levels (:issue:`30740`)

Sparse
^^^^^^
- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
--
+- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
-

ExtensionArray
@@ -901,7 +1147,7 @@ ExtensionArray
- Bug in :class:`arrays.PandasArray` when setting a scalar string (:issue:`28118`, :issue:`28150`).
- Bug where nullable integers could not be compared to strings (:issue:`28930`)
-- Bug where :class:`DataFrame` constructor raised ValueError with list-like data and ``dtype`` specified (:issue:`30280`)
+- Bug where :class:`DataFrame` constructor raised ``ValueError`` with list-like data and ``dtype`` specified (:issue:`30280`)

Other
@@ -912,15 +1158,26 @@ Other
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used for otherwise-invalid identifiers, such as names that start with a digit, are Python keywords, or contain single-character operators.
(:issue:`27017`) +- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) -- Fixed :class:`IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by 0 (:issue:`27398`) -- Fixed ``pow`` operations for :class:`IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) +- Fixed :class:`~arrays.IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by ``0`` (:issue:`27398`) +- Fixed ``pow`` operations for :class:`~arrays.IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`) -- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:29069`) +- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`) +- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`) +- Bug in :meth:`DataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`) +- Bug where :meth:`DataFrame.itertuples` would incorrectly determine whether or not namedtuples could be used for dataframes of 255 columns (:issue:`28282`) +- Handle nested NumPy ``object`` arrays in :func:`testing.assert_series_equal` for ExtensionArray implementations (:issue:`30841`) +- Bug in :class:`Index` constructor incorrectly allowing 2-dimensional input arrays (:issue:`13601`, :issue:`27125`) -.. _whatsnew_1000.contributors: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ + +.. contributors:: v0.25.3..v1.0.0rc0 diff --git a/environment.yml b/environment.yml index 46fb5e7a19078..e244350a0bea0 100644 --- a/environment.yml +++ b/environment.yml @@ -55,6 +55,7 @@ dependencies: - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 + - pytest-asyncio # downstream tests - seaborn @@ -70,7 +71,7 @@ dependencies: - blosc - bottleneck>=1.2.1 - ipykernel - - ipython>=5.6.0,<=7.10.1 # see gh-30527 + - ipython>=7.11.1 - jinja2 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 diff --git a/pandas/__init__.py b/pandas/__init__.py index 99495d4b7dcb6..491bcb21f245d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -39,8 +39,6 @@ "the C extensions first." ) -from datetime import datetime - from pandas._config import ( get_option, set_option, @@ -117,7 +115,7 @@ DataFrame, ) -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -140,6 +138,7 @@ qcut, ) +import pandas.api from pandas.util._print_versions import show_versions from pandas.io.api import ( @@ -210,6 +209,19 @@ class Panel: return Panel + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. 
" + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) + + from datetime import datetime as dt + + return dt + elif name == "np": warnings.warn( @@ -234,6 +246,19 @@ class Panel: return type(name, (), {}) + elif name == "SparseArray": + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray + + return _SparseArray + raise AttributeError(f"module 'pandas' has no attribute '{name}'") @@ -264,6 +289,7 @@ def __getattr__(self, item): FutureWarning, stacklevel=2, ) + try: return getattr(self.np, item) except AttributeError: @@ -271,6 +297,73 @@ def __getattr__(self, item): np = __numpy() + class __Datetime(type): + + from datetime import datetime as dt + + datetime = dt + + def __getattr__(cls, item): + cls.emit_warning() + + try: + return getattr(cls.datetime, item) + except AttributeError: + raise AttributeError(f"module datetime has no attribute {item}") + + def __instancecheck__(cls, other): + return isinstance(other, cls.datetime) + + class __DatetimeSub(metaclass=__Datetime): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from datetime import datetime as dt + + return dt(*args, **kwargs) + + datetime = __DatetimeSub + + class __SparseArray(type): + + from pandas.core.arrays.sparse import SparseArray as sa + + SparseArray = sa + + def __instancecheck__(cls, other): + return isinstance(other, cls.SparseArray) + + class __SparseArraySub(metaclass=__SparseArray): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from pandas.core.arrays.sparse import SparseArray as sa + + return sa(*args, **kwargs) + + SparseArray = __SparseArraySub + + # module level doc-string __doc__ = """ pandas - a powerful data analysis and manipulation library for Python diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 6844df495547a..0a3009f74492f 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -197,7 +197,7 @@ def __setattr__(self, key, val): else: raise OptionError("You can only set the value of existing options") - def __getattr__(self, key): + def __getattr__(self, key: str): prefix = object.__getattribute__(self, "prefix") if prefix: prefix += "." diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 067b7c503baab..ef319f4447565 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -1,6 +1,7 @@ """ Unopinionated display configuration. """ + import locale import sys @@ -11,7 +12,7 @@ _initial_defencoding = None -def detect_console_encoding(): +def detect_console_encoding() -> str: """ Try to find the most capable encoding supported by the console. slightly modified from the way IPython handles the same issue. 
diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index dd1d4948aa6e3..0d68e78372d8a 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -12,7 +12,7 @@ @contextmanager -def set_locale(new_locale, lc_var=locale.LC_ALL): +def set_locale(new_locale, lc_var: int = locale.LC_ALL): """ Context manager for temporarily setting a locale. @@ -44,7 +44,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): locale.setlocale(lc_var, current_locale) -def can_set_locale(lc, lc_var=locale.LC_ALL): +def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool: """ Check to see if we can set a locale, and subsequently get the locale, without raising an Exception. @@ -58,7 +58,7 @@ def can_set_locale(lc, lc_var=locale.LC_ALL): Returns ------- - is_valid : bool + bool Whether the passed locale can be set """ diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 420e08a3d68d4..995fabbedcb5d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -116,7 +116,7 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, IF {{True if c_type_in == c_type_out != "object" else False}}: cdef: - {{c_type_out}} *v + const {{c_type_out}} *v {{c_type_out}} *o # GH#3130 diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d735890f7d07e..5298d8c5ed34e 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -70,6 +70,12 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): # null, stringify and encode data = str(val).encode(encoding) + elif isinstance(val, tuple): + # GH#28969 we could have a tuple, but need to ensure that + # the tuple entries are themselves hashable before converting + # to str + hash(val) + data = str(val).encode(encoding) else: raise TypeError(f"{val} of type {type(val)} is not a valid type " "for hashing, must be string or null") diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 6e68a687de94a..59ba1705d2dbb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,7 +1,7 @@ cimport cython from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free +from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdlib cimport malloc, free diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ac8172146d351..28d269a9a809e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -447,7 +447,6 @@ cdef class DatetimeEngine(Int64Engine): conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') except TypeError: - self._date_check_type(val) raise KeyError(val) if loc == len(values) or values[loc] != conv: @@ -470,12 +469,6 @@ cdef class DatetimeEngine(Int64Engine): val = maybe_datetimelike_to_i8(val) return self.mapping.get_item(val) except (TypeError, ValueError): - self._date_check_type(val) - raise KeyError(val) - - cdef inline _date_check_type(self, object val): - hash(val) - if not util.is_integer_object(val): raise KeyError(val) def get_indexer(self, values): diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 4293108ea7ec2..1166768472449 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -326,7 +326,7 @@ cdef class Interval(IntervalMixin): def __hash__(self): return hash((self.left, self.right, self.closed)) - def __contains__(self, key): + def __contains__(self, key) -> bool: if _interval_like(key): 
raise TypeError("__contains__ not defined for two intervals") return ((self.left < key if self.open_left else self.left <= key) and diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 333c05f7c0dc5..d09413bfa5210 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -6,12 +6,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.algos import is_monotonic -ctypedef fused scalar_t: - float64_t - float32_t +ctypedef fused int_scalar_t: int64_t - int32_t + float64_t + +ctypedef fused uint_scalar_t: uint64_t + float64_t + +ctypedef fused scalar_t: + int_scalar_t + uint_scalar_t # ---------------------------------------------------------------------- # IntervalTree @@ -128,7 +133,12 @@ cdef class IntervalTree(IntervalMixin): result = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) elif result.data.n > old_len + 1: @@ -150,7 +160,12 @@ cdef class IntervalTree(IntervalMixin): missing = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) missing.append(i) @@ -194,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset): {{py: nodes = [] -for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: +for dtype in ['float64', 'int64', 'uint64']: for closed, cmp_left, cmp_right in [ ('left', '<=', '<'), ('right', '<', '<='), @@ -202,19 +217,26 @@ for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: ('neither', '<', '<')]: cmp_left_converse = '<' if cmp_left == '<=' else '<=' cmp_right_converse = '<' if cmp_right == '<=' else '<=' + if dtype.startswith('int'): + fused_prefix = 'int_' + elif dtype.startswith('uint'): + fused_prefix = 'uint_' + elif dtype.startswith('float'): + fused_prefix = '' nodes.append((dtype, dtype.title(), closed, closed.title(), cmp_left, cmp_right, cmp_left_converse, - cmp_right_converse)) + cmp_right_converse, + fused_prefix)) }} NODE_CLASSES = {} {{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, - cmp_left_converse, cmp_right_converse in nodes}} + cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree @@ -317,7 +339,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar_t point): + cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. 
""" diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 089a7a04abb63..719db5c03f07f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -19,7 +19,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_Check, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, float32_t, float64_t, @@ -524,8 +524,11 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: # we are either not equal or both nan # I think None == None will be true here try: - if not (PyObject_RichCompareBool(x, y, Py_EQ) or - (x is None or is_nan(x)) and (y is None or is_nan(y))): + if PyArray_Check(x) and PyArray_Check(y): + if not array_equivalent_object(x, y): + return False + elif not (PyObject_RichCompareBool(x, y, Py_EQ) or + (x is None or is_nan(x)) and (y is None or is_nan(y))): return False except TypeError as err: # Avoid raising TypeError on tzawareness mismatch @@ -2232,13 +2235,14 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -_no_default = object() +# Note: no_default is exported to the public API in pandas.api.extensions +no_default = object() #: Sentinel indicating the default value. @cython.boundscheck(False) @cython.wraparound(False) def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, - object na_value=_no_default, object dtype=object): + object na_value=no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2269,7 +2273,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: - if na_value is _no_default: + if na_value is no_default: val = arr[i] else: val = na_value diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index f1cfa0978c3a0..26653438356b1 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -14,6 +14,7 @@ from pandas._libs.tslibs.np_datetime cimport ( get_timedelta64_value, get_datetime64_value) from pandas._libs.tslibs.nattype cimport ( checknull_with_nat, c_NaT as NaT, is_null_datetimelike) +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit @@ -290,16 +291,29 @@ cdef inline bint is_null_period(v): # Implementation of NA singleton -def _create_binary_propagating_op(name, divmod=False): +def _create_binary_propagating_op(name, is_divmod=False): def method(self, other): if (other is C_NA or isinstance(other, str) - or isinstance(other, (numbers.Number, np.bool_))): - if divmod: + or isinstance(other, (numbers.Number, np.bool_)) + or isinstance(other, np.ndarray) and not other.shape): + # Need the other.shape clause to handle NumPy scalars, + # since we do a setitem on `out` below, which + # won't work for NumPy scalars. 
+ if is_divmod: return NA, NA else: return NA + elif isinstance(other, np.ndarray): + out = np.empty(other.shape, dtype=object) + out[:] = NA + + if is_divmod: + return out, out.copy() + else: + return out + return NotImplemented method.__name__ = name @@ -340,10 +354,7 @@ class NAType(C_NAType): return NAType._instance def __repr__(self) -> str: - return "NA" - - def __str__(self) -> str: - return "NA" + return "" def __bool__(self): raise TypeError("boolean value of NA is ambiguous") @@ -369,8 +380,8 @@ class NAType(C_NAType): __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") __mod__ = _create_binary_propagating_op("__mod__") __rmod__ = _create_binary_propagating_op("__rmod__") - __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True) - __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True) + __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) + __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) # __lshift__ and __rshift__ are not implemented __eq__ = _create_binary_propagating_op("__eq__") @@ -397,6 +408,8 @@ class NAType(C_NAType): return type(other)(1) else: return NA + elif isinstance(other, np.ndarray): + return np.where(other == 0, other.dtype.type(1), NA) return NotImplemented @@ -408,6 +421,8 @@ class NAType(C_NAType): return other else: return NA + elif isinstance(other, np.ndarray): + return np.where((other == 1) | (other == -1), other, NA) return NotImplemented @@ -440,6 +455,31 @@ class NAType(C_NAType): __rxor__ = __xor__ + __array_priority__ = 1000 + _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + types = self._HANDLED_TYPES + (NAType,) + for x in inputs: + if not isinstance(x, types): + return NotImplemented + + if method != "__call__": + raise ValueError(f"ufunc method '{method}' not supported for NA") + result = maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is NotImplemented: + # For a NumPy ufunc that's not a binop, like np.logaddexp + index = [i for i, x in enumerate(inputs) if x is NA][0] + result = np.broadcast_arrays(*inputs)[index] + if result.ndim == 0: + result = result.item() + if ufunc.nout > 1: + result = (NA,) * ufunc.nout + + return result + C_NA = NAType() # C-visible NA = C_NA # Python-visible diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx new file mode 100644 index 0000000000000..f6ecef2038cf3 --- /dev/null +++ b/pandas/_libs/ops_dispatch.pyx @@ -0,0 +1,94 @@ +DISPATCHED_UFUNCS = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + "or", + "xor", + "and", +} +UFUNC_ALIASES = { + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", + "bitwise_or": "or", + "bitwise_and": "and", + "bitwise_xor": "xor", +} + +# For op(., Array) -> Array.__r{op}__ +REVERSED_NAMES = { + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": "__eq__", + "ne": "__ne__", +} + + +def maybe_dispatch_ufunc_to_dunder_op( + object self, object ufunc, str method, *inputs, **kwargs +): + """ + Dispatch a ufunc to the equivalent dunder method. 
+ + Parameters + ---------- + self : ArrayLike + The array whose dunder method we dispatch to + ufunc : Callable + A NumPy ufunc + method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} + inputs : ArrayLike + The input arrays. + kwargs : Any + The additional keyword arguments, e.g. ``out``. + + Returns + ------- + result : Any + The result of applying the ufunc + """ + # special has the ufuncs we dispatch to the dunder op on + + op_name = ufunc.__name__ + op_name = UFUNC_ALIASES.get(op_name, op_name) + + def not_implemented(*args, **kwargs): + return NotImplemented + + if (method == "__call__" + and op_name in DISPATCHED_UFUNCS + and kwargs.get("out") is None): + if isinstance(inputs[0], type(self)): + name = f"__{op_name}__" + return getattr(self, name, not_implemented)(inputs[1]) + else: + name = REVERSED_NAMES.get(op_name, f"__r{op_name}__") + result = getattr(self, name, not_implemented)(inputs[0]) + return result + else: + return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1b566af7a5437..377d49f2bbd29 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,6 +2,7 @@ # See LICENSE for the license import bz2 import gzip +import io import os import sys import time @@ -171,12 +172,9 @@ cdef extern from "parser/tokenizer.h": int64_t skip_first_N_rows int64_t skipfooter # pick one, depending on whether the converter requires GIL - float64_t (*double_converter_nogil)(const char *, char **, - char, char, char, - int, int *, int *) nogil - float64_t (*double_converter_withgil)(const char *, char **, - char, char, char, - int, int *, int *) + float64_t (*double_converter)(const char *, char **, + char, char, char, + int, int *, int *) nogil # error handling char *warn_msg @@ -469,16 +467,11 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 - # - # Our current roundtrip implementation requires the GIL. - self.parser.double_converter_nogil = NULL - self.parser.double_converter_withgil = round_trip + self.parser.double_converter = round_trip elif float_precision == "high": - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = precise_xstrtod + self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = xstrtod + self.parser.double_converter = xstrtod if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) @@ -645,11 +638,10 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if b'utf-16' in (self.encoding or b''): - # we need to read utf-16 through UTF8Recoder. - # if source is utf-16, convert source to utf-8 by UTF8Recoder. 
- source = icom.UTF8Recoder(source, - self.encoding.decode('utf-8')) + if self.encoding and isinstance(source, io.BufferedIOBase): + source = io.TextIOWrapper( + source, self.encoding.decode('utf-8'), newline='') + self.encoding = b'utf-8' self.c_encoding = self.encoding @@ -1377,6 +1369,7 @@ STR_NA_VALUES = { "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", @@ -1663,22 +1656,12 @@ cdef _try_double(parser_t *parser, int64_t col, result = np.empty(lines, dtype=np.float64) data = result.data na_fset = kset_float64_from_list(na_flist) - if parser.double_converter_nogil != NULL: # if it can run without the GIL - with nogil: - error = _try_double_nogil(parser, parser.double_converter_nogil, - col, line_start, line_end, - na_filter, na_hashset, use_na_flist, - na_fset, NA, data, &na_count) - else: - assert parser.double_converter_withgil != NULL - error = _try_double_nogil(parser, - parser.double_converter_withgil, + with nogil: + error = _try_double_nogil(parser, parser.double_converter, col, line_start, line_end, na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + kh_destroy_float64(na_fset) if error != 0: return None, None diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0019fc4b36d20..8571761f77265 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,3 +1,4 @@ +from copy import copy from distutils.version import LooseVersion from cython import Py_ssize_t @@ -15,7 +16,7 @@ from numpy cimport (ndarray, cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.lib import maybe_convert_objects +from pandas._libs.lib import maybe_convert_objects, is_scalar cdef _check_result_array(object obj, Py_ssize_t cnt): @@ -492,14 +493,19 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is chunk.index: - piece = piece.copy(deep='all') - else: + if piece.index is not chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. 
an int pass + if not is_scalar(piece): + # Need to copy data to avoid appending references + if hasattr(piece, "copy"): + piece = piece.copy(deep="all") + else: + piece = copy(piece) + results.append(piece) # If the data was modified inplace we need to diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9f2b26b0dea19..2188ff6b0d464 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1774,11 +1774,18 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { + // This is called from a nogil block in parsers.pyx + // so need to explicitly get GIL before Python calls + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + double r = PyOS_string_to_double(p, q, 0); if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); + + PyGILState_Release(gstate); return r; } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b37de47662feb..4fd2065c07100 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -155,11 +155,8 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - // pick one, depending on whether the converter requires GIL - double (*double_converter_nogil)(const char *, char **, - char, char, char, int, int *, int *); - double (*double_converter_withgil)(const char *, char **, - char, char, char, int, int *, int *); + double (*double_converter)(const char *, char **, + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -226,6 +223,8 @@ double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); + +// GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); int to_boolean(const char *item, uint8_t *val); diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 05c3ae4096ad5..8d04874b4c9bf 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -154,6 +154,8 @@ enum JSTYPES { JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; typedef void * JSOBJ; @@ -290,6 +292,8 @@ typedef struct __JSONObjectDecoder { JSOBJ (*newTrue)(void *prv); JSOBJ (*newFalse)(void *prv); JSOBJ (*newNull)(void *prv); + JSOBJ (*newPosInf)(void *prv); + JSOBJ (*newNegInf)(void *prv); JSOBJ (*newObject)(void *prv, void *decoder); JSOBJ (*endObject)(void *prv, JSOBJ obj); JSOBJ (*newArray)(void *prv, void *decoder); diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 26b00c0cacd31..4eb18ee13d70b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,9 +127,16 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') { + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) 
== 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { offset++; intNeg = -1; + if (*(offset) == 'I') { + goto DECODE_INF; + } overflowLimit = LLONG_MIN; } @@ -281,6 +288,48 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } } +DECODE_NAN: + offset++; + if (*(offset++) != 'a') goto SET_NAN_ERROR; + if (*(offset++) != 'N') goto SET_NAN_ERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); + +SET_NAN_ERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + +DECODE_INF: + offset++; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'f') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 't') goto SET_INF_ERROR; + if (*(offset++) != 'y') goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } + +SET_INF_ERROR: + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } + + BREAK_EXP_LOOP: // FIXME: Check for arithmetic overflow here ds->lastType = JT_DOUBLE; @@ -1070,6 +1119,8 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { case '7': case '8': case '9': + case 'I': + case 'N': case '-': return decode_numeric(ds); diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 7a2e5a584443a..b2fc788478864 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -459,6 +459,10 @@ JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); } + +JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); } + JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } @@ -502,10 +506,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectDecoder dec = { Object_newString, Object_objectAddKey, Object_arrayAddItem, Object_newTrue, Object_newFalse, Object_newNull, - Object_newObject, Object_endObject, Object_newArray, - Object_endArray, Object_newInteger, Object_newLong, - Object_newDouble, Object_releaseObject, PyObject_Malloc, - PyObject_Free, PyObject_Realloc}; + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newDouble, + Object_releaseObject, PyObject_Malloc, PyObject_Free, + PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 389e040866f72..c413a16f8d5f0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; -PyObject *cls_timestamp; PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -166,7 +165,6 @@ void 
*initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -178,9 +176,8 @@ void *initObjToJSON(void) { Py_DECREF(mod_nattype); } - /* Initialise numpy API and use 2/3 compatible return */ + /* Initialise numpy API */ import_array(); - return NUMPY_IMPORT_ARRAY_RETVAL; } static TypeContext *createTypeContext(void) { @@ -243,65 +240,39 @@ static int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; - values = PyObject_GetAttrString(obj, "values"); PRINTMARK(); - if (values && !PyArray_CheckExact(values)) { - - if (PyObject_HasAttrString(values, "to_numpy")) { - values = PyObject_CallMethod(values, "to_numpy", NULL); - } - - if (PyObject_HasAttrString(values, "values")) { - PyObject *subvals = get_values(values); - PyErr_Clear(); - PRINTMARK(); - // subvals are sometimes missing a dimension - if (subvals) { - PyArrayObject *reshape = (PyArrayObject *)subvals; - PyObject *shape = PyObject_GetAttrString(obj, "shape"); - PyArray_Dims dims; - PRINTMARK(); - - if (!shape || !PyArray_IntpConverter(shape, &dims)) { - subvals = NULL; - } else { - subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); - PyDimMem_FREE(dims.ptr); - } - Py_DECREF(reshape); - Py_XDECREF(shape); - } - Py_DECREF(values); - values = subvals; - } else { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - - if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { + if (PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "_internal_get_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values && PyObject_HasAttrString(obj, "get_block_values")) { + if ((values == NULL) && PyObject_HasAttrString(obj, "get_block_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "get_block_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values) { + if (values == NULL) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; PRINTMARK(); @@ -408,22 +379,18 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); } -/* returns a char* and mutates the pointer to *len */ -static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *len) { +/* Converts the int64_t representation of a datetime to ISO; mutates len */ +static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { npy_datetimestruct dts; int ret_code; - int64_t longVal = GET_TC(tc)->longValue; - pandas_datetime_to_datetimestruct(longVal, NPY_FR_ns, &dts); + pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts); - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; *len = 
(size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); if (result == NULL) { PyErr_NoMemory(); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; return NULL; } @@ -431,7 +398,6 @@ static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, if (ret_code != 0) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; PyObject_Free(result); } @@ -441,30 +407,33 @@ static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, return result; } +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return int64ToIso(GET_TC(tc)->longValue, base, len); +} + static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { scaleNanosecToUnit(&dt, base); return dt; } -static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) { +/* Convert PyDatetime To ISO C-string. mutates len */ +static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, + size_t *len) { npy_datetimestruct dts; int ret; - if (!PyDateTime_Check(obj)) { - // TODO: raise TypeError - } - ret = convert_pydatetime_to_datetimestruct(obj, &dts); if (ret != 0) { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, "Could not convert PyDateTime to numpy datetime"); } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; return NULL; } - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; *len = (size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); ret = make_iso_8601_datetime(&dts, result, *len, base); @@ -473,7 +442,6 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) { PRINTMARK(); PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; PyObject_Free(result); return NULL; } @@ -484,6 +452,19 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) { return result; } +/* JSON callback */ +static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { + + if (!PyDateTime_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected datetime object"); + return NULL; + } + + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return PyDateTimeToIso(obj, base, len); +} + static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; int ret; @@ -1518,7 +1499,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. 
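    /* Editorial note, not part of the original patch: "steals a reference"
     * means ownership of `labels` transfers to this function, so a caller
     * must NOT Py_DECREF it afterwards. A hypothetical call site under that
     * contract (names `obj`, `enc`, `num` are illustrative only):
     *
     *     PyObject *labels = PyObject_GetAttrString(obj, "index");  // new ref
     *     char **encoded = NpyArr_encodeLabels((PyArrayObject *)labels,
     *                                          enc, num);
     *     // no Py_DECREF(labels): the callee consumed the reference
     */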
PyObject *item = NULL; - npy_intp i, stride, len; + size_t len; + npy_intp i, stride; char **ret; char *dataptr, *cLabel; int type_num; @@ -1559,8 +1541,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // TODO: for any matches on type_num (date and timedeltas) should use a - // vectorized solution to convert to epoch or iso formats + // TODO: vectorized timedelta solution if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); @@ -1583,54 +1564,36 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = (char *)PyUnicode_AsUTF8(iso); Py_DECREF(iso); len = strlen(cLabel); - } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || - PyDate_Check(item)) { - PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); - if (ts == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + } else if (PyTypeNum_ISDATETIME(type_num)) { + NPY_DATETIMEUNIT base = enc->datetimeUnit; + npy_int64 longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); } - + castfunc(dataptr, &longVal, 1, NULL, NULL); if (enc->datetimeIso) { - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + cLabel = int64ToIso(longVal, base, &len); + } else { + if (!scaleNanosecToUnit(&longVal, base)) { + // TODO: This gets hit but somehow doesn't cause errors + // need to clean up (elsewhere in module as well) } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_INT64_FMT, longVal); len = strlen(cLabel); + } + } else if (PyDateTime_Check(item) || PyDate_Check(item)) { + NPY_DATETIMEUNIT base = enc->datetimeUnit; + if (enc->datetimeIso) { + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); } else { - npy_int64 value; - // TODO: refactor to not duplicate what goes on in - // beginTypeContext - if (PyObject_HasAttrString(ts, "value")) { - PRINTMARK(); - value = get_long_attr(ts, "value"); - } else { - PRINTMARK(); - value = total_seconds(ts) * - 1000000000LL; // nanoseconds per second - } - Py_DECREF(ts); - - NPY_DATETIMEUNIT unit = enc->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - char buf[21] = {0}; // 21 chars for 2**63 as string - cLabel = buf; - sprintf(buf, "%" NPY_INT64_FMT, value); + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + PyDateTimeToEpoch(item, base)); len = strlen(cLabel); } } else { // Fallback to string representation @@ -1740,7 +1703,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = NpyDateTimeToIso; + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; // Currently no way to pass longVal to iso function, so use // state management GET_TC(tc)->longValue = longVal; @@ -1815,7 +1778,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = PyDateTimeToIso; + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { PRINTMARK(); @@ -1841,7 
+1804,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToUTF8 = PyDateTimeToIso; + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { PRINTMARK(); diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index 39320d73d0cab..4a88fb7a4e849 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -65,35 +65,15 @@ static PyMethodDef ujsonMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "_libjson", - 0, /* m_doc */ - -1, /* m_size */ - ujsonMethods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL /* m_free */ +static PyModuleDef moduledef = { + .m_base = PyModuleDef_HEAD_INIT, + .m_name = "_libjson", + .m_methods = ujsonMethods }; -#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) -#define PYMODULE_CREATE() PyModule_Create(&moduledef) -#define MODINITERROR return NULL -PYMODINITFUNC { - PyObject *module; - PyObject *version_string; +PyMODINIT_FUNC PyInit_json(void) { + initObjToJSON(); // TODO: clean up, maybe via tp_free? + return PyModuleDef_Init(&moduledef); - initObjToJSON(); - module = PYMODULE_CREATE(); - - if (module == NULL) { - MODINITERROR; - } - - version_string = PyUnicode_FromString(UJSON_VERSION); - PyModule_AddObject(module, "__version__", version_string); - - return module; } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 026bd7a44a509..5a30b71a6fea1 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -123,7 +123,7 @@ cpdef assert_almost_equal(a, b, if isiterable(a): if not isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -134,12 +134,12 @@ cpdef assert_almost_equal(a, b, if a_is_ndarray and b_is_ndarray: na, nb = a.size, b.size if a.shape != b.shape: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail raise_assert_detail( obj, f'{obj} shapes are different', a.shape, b.shape) if check_dtype and not is_dtype_equal(a.dtype, b.dtype): - from pandas.util.testing import assert_attr_equal + from pandas._testing import assert_attr_equal assert_attr_equal('dtype', a, b, obj=obj) if array_equivalent(a, b, strict_nan=True): @@ -149,7 +149,7 @@ cpdef assert_almost_equal(a, b, na, nb = len(a), len(b) if na != nb: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail # if we have a small diff set, print it if abs(na - nb) < 10: @@ -168,7 +168,7 @@ cpdef assert_almost_equal(a, b, diff += 1 if is_unequal: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail msg = (f"{obj} values are different " f"({np.round(diff * 100.0 / na, 5)} %)") raise_assert_detail(obj, msg, lobj, robj) @@ -176,7 +176,7 @@ cpdef assert_almost_equal(a, b, return True elif isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index cbe6dd6c2322d..53e3354ca8eb6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -120,8 +120,7 @@ def ints_to_pydatetime(const int64_t[:] arr, 
object tz=None, object freq=None,
     elif box == "datetime":
         func_create = create_datetime_from_ts
     else:
-        raise ValueError("box must be one of 'datetime', 'date', 'time' or"
-                         " 'timestamp'")
+        raise ValueError("box must be one of 'datetime', 'date', 'time' or 'timestamp'")

     if is_utc(tz) or tz is None:
         for i in range(n):
@@ -296,10 +295,15 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None,
     return result

-def array_with_unit_to_datetime(ndarray values, object unit,
+def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit,
                                 str errors='coerce'):
     """
-    convert the ndarray according to the unit
+    Convert the ndarray to datetime according to the time unit.
+
+    This function converts an array of objects into a numpy array of
+    datetime64[ns]. It returns the converted array and also returns the
+    timezone offset.

     if errors:
       - raise: return converted values or raise OutOfBoundsDatetime
           if out of range on the conversion, or OverflowError for a
           timedelta that cannot be converted
       - ignore: return non-convertible values as the same unit
       - coerce: NaT for non-convertibles

+    Parameters
+    ----------
+    values : ndarray of object
+        Date-like objects to convert
+    mask : ndarray of bool
+        Not-a-time mask for conversion from non-nullable integer types;
+        can be None
+    unit : object
+        Time unit to use during conversion
+    errors : str, default 'coerce'
+        Error behavior when parsing
+
     Returns
     -------
     result : ndarray of m8 values
@@ -316,7 +332,6 @@ def array_with_unit_to_datetime(ndarray values, object unit,
         Py_ssize_t i, j, n=len(values)
         int64_t m
         ndarray[float64_t] fvalues
-        ndarray mask
        bint is_ignore = errors=='ignore'
        bint is_coerce = errors=='coerce'
        bint is_raise = errors=='raise'
@@ -329,9 +344,13 @@ def array_with_unit_to_datetime(ndarray values, object unit,
     if unit == 'ns':
         if issubclass(values.dtype.type, np.integer):
-            return values.astype('M8[ns]'), tz
-        # This will return a tz
-        return array_to_datetime(values.astype(object), errors=errors)
+            result = values.astype('M8[ns]')
+        else:
+            result, tz = array_to_datetime(values.astype(object), errors=errors)
+        if mask is not None:
+            iresult = result.view('i8')
+            iresult[mask] = NPY_NAT
+        return result, tz

     m = cast_from_unit(None, unit)

@@ -343,7 +362,9 @@ def array_with_unit_to_datetime(ndarray values, object unit,
     if values.dtype.kind == "i":
         # Note: this condition makes the casting="same_kind" redundant
         iresult = values.astype('i8', casting='same_kind', copy=False)
-        mask = iresult == NPY_NAT
+        # If no mask, fill mask by comparing to NPY_NAT constant
+        if mask is None:
+            mask = iresult == NPY_NAT
         iresult[mask] = 0
         fvalues = iresult.astype('f8') * m
         need_to_iterate = False
diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd
index 0b77948027ad7..36e6b14be182a 100644
--- a/pandas/_libs/tslibs/conversion.pxd
+++ b/pandas/_libs/tslibs/conversion.pxd
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-from cpython.datetime cimport datetime, tzinfo
+from cpython.datetime cimport datetime

 from numpy cimport int64_t, int32_t
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 76a694c64e1fb..67c0f0cc33ab8 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -5,6 +5,9 @@ from cpython.object cimport (
 from cpython.datetime cimport (datetime,
                                PyDateTime_Check, PyDelta_Check,
                                PyDateTime_IMPORT)
+
+from cpython.version cimport PY_MINOR_VERSION
+
 PyDateTime_IMPORT

 import numpy as np
@@ -19,6 +22,7 @@ from
pandas._libs.tslibs.util cimport ( get_nat, is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object) + # ---------------------------------------------------------------------- # Constants nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'} @@ -427,6 +431,10 @@ class NaTType(_NaT): tzname = _make_error_func('tzname', datetime) utcoffset = _make_error_func('utcoffset', datetime) + # "fromisocalendar" was introduced in 3.8 + if PY_MINOR_VERSION >= 8: + fromisocalendar = _make_error_func('fromisocalendar', datetime) + # ---------------------------------------------------------------------- # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 742403883f7dd..5508b208de00a 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -278,8 +278,8 @@ def array_strptime(object[:] values, object fmt, "the ISO year directive '%G' and a weekday " "directive '%A', '%a', '%w', or '%u'.") else: - raise ValueError("ISO week directive '%V' is incompatible with" - " the year directive '%Y'. Use the ISO year " + raise ValueError("ISO week directive '%V' is incompatible with " + "the year directive '%Y'. Use the ISO year " "'%G' instead.") # If we know the wk of the year and what day of that wk, we can figure diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 86a9d053730b8..abe7f9e5b4105 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -814,9 +814,9 @@ default 'raise' 'shift_backward') if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + raise ValueError("The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object") if self.tzinfo is None: # tz naive, localize diff --git a/pandas/_testing.py b/pandas/_testing.py new file mode 100644 index 0000000000000..1fdc5d478aaf6 --- /dev/null +++ b/pandas/_testing.py @@ -0,0 +1,2745 @@ +import bz2 +from collections import Counter +from contextlib import contextmanager +from datetime import datetime +from functools import wraps +import gzip +import os +from shutil import rmtree +import string +import tempfile +from typing import Any, List, Optional, Union, cast +import warnings +import zipfile + +import numpy as np +from numpy.random import rand, randn + +from pandas._config.localization import ( # noqa:F401 + can_set_locale, + get_locales, + set_locale, +) + +import pandas._libs.testing as _testing +from pandas._typing import FilePathOrBuffer, FrameOrSeries +from pandas.compat import _get_lzma_file, _import_lzma + +from pandas.core.dtypes.common import ( + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) +from pandas.core.algorithms import take_1d +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + IntervalArray, + 
PeriodArray, + TimedeltaArray, + period_array, +) + +from pandas.io.common import urlopen +from pandas.io.formats.printing import pprint_thing + +lzma = _import_lzma() + +N = 30 +K = 4 +_RAISE_NETWORK_ERROR_DEFAULT = False + +# set testing_mode +_testing_mode_warnings = (DeprecationWarning, ResourceWarning) + + +def set_testing_mode(): + # set the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) + + +def reset_testing_mode(): + # reset the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) + + +set_testing_mode() + + +def reset_display_options(): + """ + Reset the display options for printing and representing objects. + """ + pd.reset_option("^display.", silent=True) + + +def round_trip_pickle( + obj: Any, path: Optional[FilePathOrBuffer] = None +) -> FrameOrSeries: + """ + Pickle an object and then read it again. + + Parameters + ---------- + obj : any object + The object to pickle and then re-read. + path : str, path object or file-like object, default None + The path where the pickled object is written and then read. + + Returns + ------- + pandas object + The original object that was pickled and then re-read. + """ + _path = path + if _path is None: + _path = f"__{rands(10)}__.pickle" + with ensure_clean(_path) as temp_path: + pd.to_pickle(obj, temp_path) + return pd.read_pickle(temp_path) + + +def round_trip_pathlib(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a pathlib.Path and read it back + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + Path = pytest.importorskip("pathlib").Path + if path is None: + path = "___pathlib___" + with ensure_clean(path) as path: + writer(Path(path)) + obj = reader(Path(path)) + return obj + + +def round_trip_localpath(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a py.path LocalPath and read it back. + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + LocalPath = pytest.importorskip("py.path").local + if path is None: + path = "___localpath___" + with ensure_clean(path) as path: + writer(LocalPath(path)) + obj = reader(LocalPath(path)) + return obj + + +@contextmanager +def decompress_file(path, compression): + """ + Open a compressed file and return a file object. + + Parameters + ---------- + path : str + The path where the file is read from. 
+ + compression : {'gzip', 'bz2', 'zip', 'xz', None} + Name of the decompression to use + + Returns + ------- + file object + """ + if compression is None: + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = _get_lzma_file(lzma)(path, "rb") + elif compression == "zip": + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + try: + yield f + finally: + f.close() + if compression == "zip": + zip_file.close() + + +def write_to_compressed(compression, path, data, dest="test"): + """ + Write data to a compressed file. + + Parameters + ---------- + compression : {'gzip', 'bz2', 'zip', 'xz'} + The compression type to use. + path : str + The file path to write the data. + data : str + The data to write. + dest : str, default "test" + The destination file (for ZIP only) + + Raises + ------ + ValueError : An invalid compression value was passed in. + """ + if compression == "zip": + import zipfile + + compress_method = zipfile.ZipFile + elif compression == "gzip": + import gzip + + compress_method = gzip.GzipFile + elif compression == "bz2": + import bz2 + + compress_method = bz2.BZ2File + elif compression == "xz": + compress_method = _get_lzma_file(lzma) + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + if compression == "zip": + mode = "w" + args = (dest, data) + method = "writestr" + else: + mode = "wb" + args = (data,) + method = "write" + + with compress_method(path, mode=mode) as f: + getattr(f, method)(*args) + + +def assert_almost_equal( + left, + right, + check_dtype: Union[bool, str] = "equiv", + check_less_precise: Union[bool, int] = False, + **kwargs, +): + """ + Check that the left and right objects are approximately equal. + + By approximately equal, we refer to objects that are numbers or that + contain numbers which may be equivalent to specific levels of precision. + + Parameters + ---------- + left : object + right : object + check_dtype : bool or {'equiv'}, default 'equiv' + Check dtype if both a and b are the same type. If 'equiv' is passed in, + then `RangeIndex` and `Int64Index` are also considered equivalent + when doing type checking. + check_less_precise : bool or int, default False + Specify comparison precision. 5 digits (False) or 3 digits (True) + after decimal points are compared. If int, then specify the number + of digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. 
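+
+    Examples
+    --------
+    Illustrative doctests added editorially (not part of the original
+    change); both calls pass silently under the precision rules above:
+
+    >>> import pandas._testing as tm
+    >>> tm.assert_almost_equal(1.0, 1.0000001)  # within 5 digits (default)
+    >>> tm.assert_almost_equal(1.0, 1.0004, check_less_precise=True)  # 3 digits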
+ """ + if isinstance(left, pd.Index): + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.Series): + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.DataFrame): + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + else: + # Other sequences. + if check_dtype: + if is_number(left) and is_number(right): + # Do not compare numeric classes, like np.float64 and float. + pass + elif is_bool(left) and is_bool(right): + # Do not compare bool classes, like np.bool_ and bool. + pass + else: + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): + obj = "numpy array" + else: + obj = "Input" + assert_class_equal(left, right, obj=obj) + _testing.assert_almost_equal( + left, + right, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + +def _check_isinstance(left, right, cls): + """ + Helper method for our assert_* methods that ensures that + the two objects being compared have the right type before + proceeding with the comparison. + + Parameters + ---------- + left : The first object being compared. + right : The second object being compared. + cls : The class type to check against. + + Raises + ------ + AssertionError : Either `left` or `right` is not an instance of `cls`. + """ + cls_name = cls.__name__ + + if not isinstance(left, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(left)} instead" + ) + if not isinstance(right, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(right)} instead" + ) + + +def assert_dict_equal(left, right, compare_keys: bool = True): + + _check_isinstance(left, right, dict) + _testing.assert_dict_equal(left, right, compare_keys=compare_keys) + + +def randbool(size=(), p: float = 0.5): + return rand(*size) <= p + + +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) + + +def rands_array(nchars, size, dtype="O"): + """ + Generate an array of byte strings. + """ + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def randu_array(nchars, size, dtype="O"): + """ + Generate an array of unicode strings. + """ + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def rands(nchars): + """ + Generate one random byte string. + + See `rands_array` if you want to create an array of random strings. + + """ + return "".join(np.random.choice(RANDS_CHARS, nchars)) + + +def randu(nchars): + """ + Generate one random unicode string. + + See `randu_array` if you want to create an array of random unicode strings. 
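+
+    Examples
+    --------
+    Editorial doctest (not part of the original change); the output is
+    random, so only its length is checked:
+
+    >>> len(randu(8))
+    8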
+
+    """
+    return "".join(np.random.choice(RANDU_CHARS, nchars))
+
+
+def close(fignum=None):
+    from matplotlib.pyplot import get_fignums, close as _close
+
+    if fignum is None:
+        for fignum in get_fignums():
+            _close(fignum)
+    else:
+        _close(fignum)
+
+
+# -----------------------------------------------------------------------------
+# contextmanager to ensure the file cleanup
+
+
+@contextmanager
+def ensure_clean(filename=None, return_filelike=False):
+    """
+    Get a temporary file path and agree to remove it on close.
+
+    Parameters
+    ----------
+    filename : str (optional)
+        If None, creates a temporary file which is then removed when out of
+        scope. If passed, creates a temporary file with filename as the ending.
+    return_filelike : bool (default False)
+        If True, returns a file-like which is *always* cleaned. Necessary for
+        savefig and other functions which want to append extensions.
+    """
+    filename = filename or ""
+    fd = None
+
+    if return_filelike:
+        f = tempfile.TemporaryFile(suffix=filename)
+        try:
+            yield f
+        finally:
+            f.close()
+    else:
+        # don't generate tempfile if using a path with directory specified
+        if len(os.path.dirname(filename)):
+            raise ValueError("Can't pass a qualified name to ensure_clean()")
+
+        try:
+            fd, filename = tempfile.mkstemp(suffix=filename)
+        except UnicodeEncodeError:
+            import pytest
+
+            pytest.skip("no unicode file names on this system")
+
+        try:
+            yield filename
+        finally:
+            try:
+                os.close(fd)
+            except OSError:
+                print(f"Couldn't close file descriptor: {fd} (file: {filename})")
+            try:
+                if os.path.exists(filename):
+                    os.remove(filename)
+            except OSError as e:
+                print(f"Exception on removing file: {e}")
+
+
+@contextmanager
+def ensure_clean_dir():
+    """
+    Get a temporary directory path and agree to remove it on close.
+
+    Yields
+    ------
+    Temporary directory path
+    """
+    directory_name = tempfile.mkdtemp(suffix="")
+    try:
+        yield directory_name
+    finally:
+        try:
+            rmtree(directory_name)
+        except OSError:
+            pass
+
+
+@contextmanager
+def ensure_safe_environment_variables():
+    """
+    Get a context manager to safely set environment variables.
+
+    All changes will be undone on close, hence environment variables set
+    within this contextmanager will neither persist nor change global state.
+    """
+    saved_environ = dict(os.environ)
+    try:
+        yield
+    finally:
+        os.environ.clear()
+        os.environ.update(saved_environ)
+
+
+# -----------------------------------------------------------------------------
+# Comparators
+
+
+def equalContents(arr1, arr2) -> bool:
+    """
+    Check whether the sets of unique elements of arr1 and arr2 are equivalent.
+    """
+    return frozenset(arr1) == frozenset(arr2)
+
+
+def assert_index_equal(
+    left: Index,
+    right: Index,
+    exact: Union[bool, str] = "equiv",
+    check_names: bool = True,
+    check_less_precise: Union[bool, int] = False,
+    check_exact: bool = True,
+    check_categorical: bool = True,
+    obj: str = "Index",
+) -> None:
+    """
+    Check that left and right Index are equal.
+
+    Parameters
+    ----------
+    left : Index
+    right : Index
+    exact : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical. If 'equiv', then RangeIndex can be substituted for
+        Int64Index as well.
+    check_names : bool, default True
+        Whether to check the names attribute.
+    check_less_precise : bool or int, default False
+        Specify comparison precision. Only used when check_exact is False.
+        5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare.
+    check_exact : bool, default True
+        Whether to compare number exactly.
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
+    obj : str, default 'Index'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    """
+    __tracebackhide__ = True
+
+    def _check_types(l, r, obj="Index"):
+        if exact:
+            assert_class_equal(l, r, exact=exact, obj=obj)
+
+            # Skip exact dtype checking when `check_categorical` is False
+            if check_categorical:
+                assert_attr_equal("dtype", l, r, obj=obj)
+
+            # allow string-like to have different inferred_types
+            if l.inferred_type in ("string", "unicode"):
+                assert r.inferred_type in ("string", "unicode")
+            else:
+                assert_attr_equal("inferred_type", l, r, obj=obj)
+
+    def _get_ilevel_values(index, level):
+        # accept level number only
+        unique = index.levels[level]
+        level_codes = index.codes[level]
+        filled = take_1d(unique._values, level_codes, fill_value=unique._na_value)
+        values = unique._shallow_copy(filled, name=index.names[level])
+        return values
+
+    # instance validation
+    _check_isinstance(left, right, Index)
+
+    # class / dtype comparison
+    _check_types(left, right, obj=obj)
+
+    # level comparison
+    if left.nlevels != right.nlevels:
+        msg1 = f"{obj} levels are different"
+        msg2 = f"{left.nlevels}, {left}"
+        msg3 = f"{right.nlevels}, {right}"
+        raise_assert_detail(obj, msg1, msg2, msg3)
+
+    # length comparison
+    if len(left) != len(right):
+        msg1 = f"{obj} length are different"
+        msg2 = f"{len(left)}, {left}"
+        msg3 = f"{len(right)}, {right}"
+        raise_assert_detail(obj, msg1, msg2, msg3)
+
+    # MultiIndex special comparison for more user-friendly error messages
+    if left.nlevels > 1:
+        left = cast(MultiIndex, left)
+        right = cast(MultiIndex, right)
+
+        for level in range(left.nlevels):
+            # cannot use get_level_values here because it can change dtype
+            llevel = _get_ilevel_values(left, level)
+            rlevel = _get_ilevel_values(right, level)
+
+            lobj = f"MultiIndex level [{level}]"
+            assert_index_equal(
+                llevel,
+                rlevel,
+                exact=exact,
+                check_names=check_names,
+                check_less_precise=check_less_precise,
+                check_exact=check_exact,
+                obj=lobj,
+            )
+            # get_level_values may change dtype
+            _check_types(left.levels[level], right.levels[level], obj=obj)
+
+    # skip exact index checking when `check_categorical` is False
+    if check_exact and check_categorical:
+        if not left.equals(right):
+            diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left)
+            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
+            raise_assert_detail(obj, msg, left, right)
+    else:
+        _testing.assert_almost_equal(
+            left.values,
+            right.values,
+            check_less_precise=check_less_precise,
+            check_dtype=exact,
+            obj=obj,
+            lobj=left,
+            robj=right,
+        )
+
+    # metadata comparison
+    if check_names:
+        assert_attr_equal("names", left, right, obj=obj)
+    if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex):
+        assert_attr_equal("freq", left, right, obj=obj)
+    if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex):
+        assert_interval_array_equal(left.values, right.values)
+
+    if check_categorical:
+        if is_categorical_dtype(left) or is_categorical_dtype(right):
+            assert_categorical_equal(left.values, right.values, obj=f"{obj} category")
+
+
+def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"):
+    """
+    Check that the classes of two objects are equal.
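+
+    Under ``exact="equiv"``, Int64Index and RangeIndex count as equivalent.
+    An editorial doctest (not part of the original change):
+
+    >>> import pandas as pd
+    >>> from pandas._testing import assert_class_equal
+    >>> assert_class_equal(pd.RangeIndex(3), pd.Int64Index([0, 1, 2]),
+    ...                    exact="equiv")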
+    """
+    __tracebackhide__ = True
+
+    def repr_class(x):
+        if isinstance(x, Index):
+            # return Index as it is to include values in the error message
+            return x
+
+        try:
+            return type(x).__name__
+        except AttributeError:
+            return repr(type(x))
+
+    if exact == "equiv":
+        if type(left) != type(right):
+            # allow equivalence of Int64Index/RangeIndex
+            types = {type(left).__name__, type(right).__name__}
+            if len(types - {"Int64Index", "RangeIndex"}):
+                msg = f"{obj} classes are not equivalent"
+                raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
+    elif exact:
+        if type(left) != type(right):
+            msg = f"{obj} classes are different"
+            raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
+
+
+def assert_attr_equal(attr, left, right, obj="Attributes"):
+    """Check attributes are equal. Both objects must have the attribute.
+
+    Parameters
+    ----------
+    attr : str
+        Attribute name being compared.
+    left : object
+    right : object
+    obj : str, default 'Attributes'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
+    """
+    __tracebackhide__ = True
+
+    left_attr = getattr(left, attr)
+    right_attr = getattr(right, attr)
+
+    if left_attr is right_attr:
+        return True
+    elif (
+        is_number(left_attr)
+        and np.isnan(left_attr)
+        and is_number(right_attr)
+        and np.isnan(right_attr)
+    ):
+        # np.nan
+        return True
+
+    try:
+        result = left_attr == right_attr
+    except TypeError:
+        # datetimetz on rhs may raise TypeError
+        result = False
+    if not isinstance(result, bool):
+        result = result.all()
+
+    if result:
+        return True
+    else:
+        msg = f'Attribute "{attr}" are different'
+        raise_assert_detail(obj, msg, left_attr, right_attr)
+
+
+def assert_is_valid_plot_return_object(objs):
+    import matplotlib.pyplot as plt
+
+    if isinstance(objs, (pd.Series, np.ndarray)):
+        for el in objs.ravel():
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {repr(type(el).__name__)}"
+            )
+            assert isinstance(el, (plt.Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            "Artist instance, tuple, or dict, 'objs' is a "
+            f"{repr(type(objs).__name__)}"
+        )
+        assert isinstance(objs, (plt.Artist, tuple, dict)), msg
+
+
+def isiterable(obj):
+    return hasattr(obj, "__iter__")
+
+
+def assert_is_sorted(seq):
+    """Assert that the sequence is sorted."""
+    if isinstance(seq, (Index, Series)):
+        seq = seq.values
+    # sorting does not change precisions
+    assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+
+
+def assert_categorical_equal(
+    left, right, check_dtype=True, check_category_order=True, obj="Categorical"
+):
+    """Test that Categoricals are equivalent.
+
+    Parameters
+    ----------
+    left : Categorical
+    right : Categorical
+    check_dtype : bool, default True
+        Check that integer dtype of the codes are the same
+    check_category_order : bool, default True
+        Whether the order of the categories should be compared, which
+        implies identical integer codes. If False, only the resulting
+        values are compared. The ordered attribute is
+        checked regardless.
+    obj : str, default 'Categorical'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    """
+    _check_isinstance(left, right, Categorical)
+
+    if check_category_order:
+        assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories")
+        assert_numpy_array_equal(
+            left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes",
+        )
+    else:
+        assert_index_equal(
+            left.categories.sort_values(),
+            right.categories.sort_values(),
+            obj=f"{obj}.categories",
+        )
+        assert_index_equal(
+            left.categories.take(left.codes),
+            right.categories.take(right.codes),
+            obj=f"{obj}.values",
+        )
+
+    assert_attr_equal("ordered", left, right, obj=obj)
+
+
+def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"):
+    """Test that two IntervalArrays are equivalent.
+
+    Parameters
+    ----------
+    left, right : IntervalArray
+        The IntervalArrays to compare.
+    exact : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical. If 'equiv', then RangeIndex can be substituted for
+        Int64Index as well.
+    obj : str, default 'IntervalArray'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    """
+    _check_isinstance(left, right, IntervalArray)
+
+    assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left")
+    assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.right")
+    assert_attr_equal("closed", left, right, obj=obj)
+
+
+def assert_period_array_equal(left, right, obj="PeriodArray"):
+    _check_isinstance(left, right, PeriodArray)
+
+    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values")
+    assert_attr_equal("freq", left, right, obj=obj)
+
+
+def assert_datetime_array_equal(left, right, obj="DatetimeArray"):
+    __tracebackhide__ = True
+    _check_isinstance(left, right, DatetimeArray)
+
+    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data")
+    assert_attr_equal("freq", left, right, obj=obj)
+    assert_attr_equal("tz", left, right, obj=obj)
+
+
+def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"):
+    __tracebackhide__ = True
+    _check_isinstance(left, right, TimedeltaArray)
+    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data")
+    assert_attr_equal("freq", left, right, obj=obj)
+
+
+def raise_assert_detail(obj, message, left, right, diff=None):
+    __tracebackhide__ = True
+
+    if isinstance(left, np.ndarray):
+        left = pprint_thing(left)
+    elif is_categorical_dtype(left):
+        left = repr(left)
+
+    if isinstance(right, np.ndarray):
+        right = pprint_thing(right)
+    elif is_categorical_dtype(right):
+        right = repr(right)
+
+    msg = f"""{obj} are different
+
+{message}
+[left]: {left}
+[right]: {right}"""
+
+    if diff is not None:
+        msg += f"\n[diff]: {diff}"
+
+    raise AssertionError(msg)
+
+
+def assert_numpy_array_equal(
+    left,
+    right,
+    strict_nan=False,
+    check_dtype=True,
+    err_msg=None,
+    check_same=None,
+    obj="numpy array",
+):
+    """
+    Check that two numpy arrays are equivalent.
+
+    Parameters
+    ----------
+    left, right : numpy.ndarray or iterable
+        The two arrays to be compared.
+    strict_nan : bool, default False
+        If True, consider NaN and None to be different.
+    check_dtype : bool, default True
+        Check dtype if both a and b are np.ndarray.
+    err_msg : str, default None
+        If provided, used as assertion message.
+    check_same : None|'copy'|'same', default None
+        Ensure left and right refer/do not refer to the same memory area.
+ obj : str, default 'numpy array' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + # Show a detailed error message when classes are different + assert_class_equal(left, right, obj=obj) + # both classes must be an np.ndarray + _check_isinstance(left, right, np.ndarray) + + def _get_base(obj): + return obj.base if getattr(obj, "base", None) is not None else obj + + left_base = _get_base(left) + right_base = _get_base(right) + + if check_same == "same": + if left_base is not right_base: + raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") + elif check_same == "copy": + if left_base is right_base: + raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") + + def _raise(left, right, err_msg): + if err_msg is None: + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shapes are different", left.shape, right.shape, + ) + + diff = 0 + for l, r in zip(left, right): + # count up differences + if not array_equivalent(l, r, strict_nan=strict_nan): + diff += 1 + + diff = diff * 100.0 / left.size + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + + raise AssertionError(err_msg) + + # compare shape and values + if not array_equivalent(left, right, strict_nan=strict_nan): + _raise(left, right, err_msg) + + if check_dtype: + if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): + assert_attr_equal("dtype", left, right, obj=obj) + + +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): + """Check that left and right ExtensionArrays are equal. + + Parameters + ---------- + left, right : ExtensionArray + The two arrays to compare + check_dtype : bool, default True + Whether to check if the ExtensionArray dtypes are identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default False + Whether to compare number exactly. + + Notes + ----- + Missing values are checked separately from valid values. + A mask of missing values is computed for each and checked to match. + The remaining all-valid values are cast to object dtype and checked. 
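+
+    Examples
+    --------
+    An editorial illustration (not part of the original change), using the
+    nullable integer extension dtype:
+
+    >>> import pandas as pd
+    >>> from pandas._testing import assert_extension_array_equal
+    >>> a = pd.array([1, 2, None], dtype="Int64")
+    >>> b = pd.array([1, 2, None], dtype="Int64")
+    >>> assert_extension_array_equal(a, b)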
+ """ + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" + if check_dtype: + assert_attr_equal("dtype", left, right, obj="ExtensionArray") + + if hasattr(left, "asi8") and type(right) == type(left): + # Avoid slow object-dtype comparisons + assert_numpy_array_equal(left.asi8, right.asi8) + return + + left_na = np.asarray(left.isna()) + right_na = np.asarray(right.isna()) + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") + + left_valid = np.asarray(left[~left_na].astype(object)) + right_valid = np.asarray(right[~right_na].astype(object)) + if check_exact: + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") + else: + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) + + +# This could be refactored to use the NDFrame.equals method +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): + """ + Check that left and right Series are equal. + + Parameters + ---------- + left : Series + right : Series + check_dtype : bool, default True + Whether to check the Series dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_series_type : bool, default True + Whether to check the Series class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Series' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, Series) + + if check_series_type: + # ToDo: There are some tests using rhs is sparse + # lhs is dense. 
Should use assert_class_equal in future
+        assert isinstance(left, type(right))
+        # assert_class_equal(left, right, obj=obj)
+
+    # length comparison
+    if len(left) != len(right):
+        msg1 = f"{len(left)}, {left.index}"
+        msg2 = f"{len(right)}, {right.index}"
+        raise_assert_detail(obj, "Series length are different", msg1, msg2)
+
+    # index comparison
+    assert_index_equal(
+        left.index,
+        right.index,
+        exact=check_index_type,
+        check_names=check_names,
+        check_less_precise=check_less_precise,
+        check_exact=check_exact,
+        check_categorical=check_categorical,
+        obj=f"{obj}.index",
+    )
+
+    if check_dtype:
+        # We want to skip exact dtype checking when `check_categorical`
+        # is False. We'll still raise if only one is a `Categorical`,
+        # regardless of `check_categorical`
+        if (
+            is_categorical_dtype(left)
+            and is_categorical_dtype(right)
+            and not check_categorical
+        ):
+            pass
+        else:
+            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
+
+    if check_exact:
+        assert_numpy_array_equal(
+            left._internal_get_values(),
+            right._internal_get_values(),
+            check_dtype=check_dtype,
+            obj=str(obj),
+        )
+    elif check_datetimelike_compat:
+        # we want to check only if we have compat dtypes
+        # e.g. integer and M|m are NOT compat, but we can simply check
+        # the values in that case
+        if needs_i8_conversion(left) or needs_i8_conversion(right):
+
+            # datetimelike may have different objects (e.g. datetime.datetime
+            # vs Timestamp) but will compare equal
+            if not Index(left.values).equals(Index(right.values)):
+                msg = (
+                    f"[datetimelike_compat=True] {left.values} "
+                    f"is not equal to {right.values}."
+                )
+                raise AssertionError(msg)
+        else:
+            assert_numpy_array_equal(
+                left._internal_get_values(),
+                right._internal_get_values(),
+                check_dtype=check_dtype,
+            )
+    elif is_interval_dtype(left) or is_interval_dtype(right):
+        assert_interval_array_equal(left.array, right.array)
+    elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype):
+        # .values is an ndarray, but ._values is the ExtensionArray.
+        # TODO: Use .array
+        assert is_extension_array_dtype(right.dtype)
+        assert_extension_array_equal(left._values, right._values)
+    elif (
+        is_extension_array_dtype(left)
+        and not is_categorical_dtype(left)
+        and is_extension_array_dtype(right)
+        and not is_categorical_dtype(right)
+    ):
+        assert_extension_array_equal(left.array, right.array)
+    else:
+        _testing.assert_almost_equal(
+            left._internal_get_values(),
+            right._internal_get_values(),
+            check_less_precise=check_less_precise,
+            check_dtype=check_dtype,
+            obj=str(obj),
+        )
+
+    # metadata comparison
+    if check_names:
+        assert_attr_equal("name", left, right, obj=obj)
+
+    if check_categorical:
+        if is_categorical_dtype(left) or is_categorical_dtype(right):
+            assert_categorical_equal(left.values, right.values, obj=f"{obj} category")
+
+
+# This could be refactored to use the NDFrame.equals method
+def assert_frame_equal(
+    left,
+    right,
+    check_dtype=True,
+    check_index_type="equiv",
+    check_column_type="equiv",
+    check_frame_type=True,
+    check_less_precise=False,
+    check_names=True,
+    by_blocks=False,
+    check_exact=False,
+    check_datetimelike_compat=False,
+    check_categorical=True,
+    check_like=False,
+    obj="DataFrame",
+):
+    """
+    Check that left and right DataFrame are equal.
+
+    This function is intended to compare two DataFrames and output any
+    differences. It is mostly intended for use in unit tests.
+    Additional parameters allow varying the strictness of the
+    equality checks performed.
+ + Parameters + ---------- + left : DataFrame + First DataFrame to compare. + right : DataFrame + Second DataFrame to compare. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool or {'equiv'}, default 'equiv' + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + by_blocks : bool, default False + Specify how to compare internal data. If False, compare by columns. + If True, compare by blocks. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. + Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + See Also + -------- + assert_series_equal : Equivalent method for asserting Series equality. + DataFrame.equals : Check DataFrame equality. + + Examples + -------- + This example shows comparing two DataFrames that are equal + but with columns of differing dtypes. + + >>> from pandas._testing import assert_frame_equal + >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) + >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + + df1 equals itself. + + >>> assert_frame_equal(df1, df1) + + df1 differs from df2 as column 'b' is of a different type. + + >>> assert_frame_equal(df1, df2) + Traceback (most recent call last): + ... + AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different + + Attribute "dtype" are different + [left]: int64 + [right]: float64 + + Ignore differing dtypes in columns with check_dtype. 
+ + >>> assert_frame_equal(df1, df2, check_dtype=False) + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, DataFrame) + + if check_frame_type: + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # shape comparison + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + ) + + if check_like: + left, right = left.reindex_like(right), right + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + # column comparison + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.columns", + ) + + # compare by blocks + if by_blocks: + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() + for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): + assert dtype in lblocks + assert dtype in rblocks + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) + + # compare by columns + else: + for i, col in enumerate(left.columns): + assert col in right + lcol = left.iloc[:, i] + rcol = right.iloc[:, i] + assert_series_equal( + lcol, + rcol, + check_dtype=check_dtype, + check_index_type=check_index_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_names=check_names, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + obj=f'{obj}.iloc[:, {i}] (column name="{col}")', + ) + + +def assert_equal(left, right, **kwargs): + """ + Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. + + Parameters + ---------- + left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray + The two items to be compared. + **kwargs + All keyword arguments are passed through to the underlying assert method. + """ + __tracebackhide__ = True + + if isinstance(left, pd.Index): + assert_index_equal(left, right, **kwargs) + elif isinstance(left, pd.Series): + assert_series_equal(left, right, **kwargs) + elif isinstance(left, pd.DataFrame): + assert_frame_equal(left, right, **kwargs) + elif isinstance(left, IntervalArray): + assert_interval_array_equal(left, right, **kwargs) + elif isinstance(left, PeriodArray): + assert_period_array_equal(left, right, **kwargs) + elif isinstance(left, DatetimeArray): + assert_datetime_array_equal(left, right, **kwargs) + elif isinstance(left, TimedeltaArray): + assert_timedelta_array_equal(left, right, **kwargs) + elif isinstance(left, ExtensionArray): + assert_extension_array_equal(left, right, **kwargs) + elif isinstance(left, np.ndarray): + assert_numpy_array_equal(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + assert left == right + else: + raise NotImplementedError(type(left)) + + +def box_expected(expected, box_cls, transpose=True): + """ + Helper function to wrap the expected output of a test in a given box_class. 
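+
+    For example (editorial note, not part of the original change),
+    ``box_expected([1, 2], pd.Series)`` is equivalent to ``pd.Series([1, 2])``,
+    while ``box_expected([1, 2], pd.DataFrame)`` yields a single-row frame
+    because of the transpose applied below.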
+
+    Parameters
+    ----------
+    expected : np.ndarray, Index, Series
+    box_cls : {Index, Series, DataFrame}
+
+    Returns
+    -------
+    subclass of box_cls
+    """
+    if box_cls is pd.Index:
+        expected = pd.Index(expected)
+    elif box_cls is pd.Series:
+        expected = pd.Series(expected)
+    elif box_cls is pd.DataFrame:
+        expected = pd.Series(expected).to_frame()
+        if transpose:
+            # for vector operations, we need a DataFrame to be a single-row,
+            # not a single-column, in order to operate against non-DataFrame
+            # vectors of the same length.
+            expected = expected.T
+    elif box_cls is PeriodArray:
+        # the PeriodArray constructor is not as flexible as period_array
+        expected = period_array(expected)
+    elif box_cls is DatetimeArray:
+        expected = DatetimeArray(expected)
+    elif box_cls is TimedeltaArray:
+        expected = TimedeltaArray(expected)
+    elif box_cls is np.ndarray:
+        expected = np.array(expected)
+    elif box_cls is to_array:
+        expected = to_array(expected)
+    else:
+        raise NotImplementedError(box_cls)
+    return expected
+
+
+def to_array(obj):
+    # temporary implementation until we get pd.array in place
+    if is_period_dtype(obj):
+        return period_array(obj)
+    elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj):
+        return DatetimeArray._from_sequence(obj)
+    elif is_timedelta64_dtype(obj):
+        return TimedeltaArray._from_sequence(obj)
+    else:
+        return np.array(obj)
+
+
+# -----------------------------------------------------------------------------
+# Sparse
+
+
+def assert_sp_array_equal(
+    left,
+    right,
+    check_dtype=True,
+    check_kind=True,
+    check_fill_value=True,
+    consolidate_block_indices=False,
+):
+    """Check that the left and right SparseArray are equal.
+
+    Parameters
+    ----------
+    left : SparseArray
+    right : SparseArray
+    check_dtype : bool, default True
+        Whether to check the data dtype is identical.
+    check_kind : bool, default True
+        Whether to just compare the kind of the sparse index for each column.
+    check_fill_value : bool, default True
+        Whether to check that left.fill_value matches right.fill_value.
+    consolidate_block_indices : bool, default False
+        Whether to consolidate contiguous blocks for sparse arrays with
+        a BlockIndex. Some operations, e.g. concat, will end up with
+        block indices that could be consolidated. Setting this to True will
+        create a new BlockIndex for that array, with consolidated
+        block indices.
+    """
+
+    _check_isinstance(left, right, pd.arrays.SparseArray)
+
+    assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype)
+
+    # SparseIndex comparison
+    assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex)
+    assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex)
+
+    if not check_kind:
+        left_index = left.sp_index.to_block_index()
+        right_index = right.sp_index.to_block_index()
+    else:
+        left_index = left.sp_index
+        right_index = right.sp_index
+
+    if consolidate_block_indices and left.kind == "block":
+        # we'll probably remove this hack...
+        left_index = left_index.to_int_index().to_block_index()
+        right_index = right_index.to_int_index().to_block_index()
+
+    if not left_index.equals(right_index):
+        raise_assert_detail(
+            "SparseArray.index", "index are not equal", left_index, right_index
+        )
+    else:
+        # Just ensure a
+        pass
+
+    if check_fill_value:
+        assert_attr_equal("fill_value", left, right)
+    if check_dtype:
+        assert_attr_equal("dtype", left, right)
+    assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype)
+
+
+# -----------------------------------------------------------------------------
+# Others
+
+
+def assert_contains_all(iterable, dic):
+    for k in iterable:
+        assert k in dic, f"Did not contain item: {repr(k)}"
+
+
+def assert_copy(iter1, iter2, **eql_kwargs):
+    """
+    iter1, iter2: iterables that produce elements
+    comparable with assert_almost_equal
+
+    Checks that the elements are equal, but not
+    the same object. (Does not check that items
+    in sequences are also not the same object)
+    """
+    for elem1, elem2 in zip(iter1, iter2):
+        assert_almost_equal(elem1, elem2, **eql_kwargs)
+        msg = (
+            f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be "
+            "different objects, but they were the same object."
+        )
+        assert elem1 is not elem2, msg
+
+
+def getCols(k):
+    return string.ascii_uppercase[:k]
+
+
+# make index
+def makeStringIndex(k=10, name=None):
+    return Index(rands_array(nchars=10, size=k), name=name)
+
+
+def makeUnicodeIndex(k=10, name=None):
+    return Index(randu_array(nchars=10, size=k), name=name)
+
+
+def makeCategoricalIndex(k=10, n=3, name=None, **kwargs):
+    """ make a length k index of n categories """
+    x = rands_array(nchars=4, size=n)
+    return CategoricalIndex(
+        Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs
+    )
+
+
+def makeIntervalIndex(k=10, name=None, **kwargs):
+    """ make a length k IntervalIndex """
+    x = np.linspace(0, 100, num=(k + 1))
+    return IntervalIndex.from_breaks(x, name=name, **kwargs)
+
+
+def makeBoolIndex(k=10, name=None):
+    if k == 1:
+        return Index([True], name=name)
+    elif k == 2:
+        return Index([False, True], name=name)
+    return Index([False, True] + [False] * (k - 2), name=name)
+
+
+def makeIntIndex(k=10, name=None):
+    return Index(list(range(k)), name=name)
+
+
+def makeUIntIndex(k=10, name=None):
+    return Index([2 ** 63 + i for i in range(k)], name=name)
+
+
+def makeRangeIndex(k=10, name=None, **kwargs):
+    return RangeIndex(0, k, 1, name=name, **kwargs)
+
+
+def makeFloatIndex(k=10, name=None):
+    values = sorted(np.random.random_sample(k)) - np.random.random_sample(1)
+    return Index(values * (10 ** np.random.randint(0, 9)), name=name)
+
+
+def makeDateIndex(k=10, freq="B", name=None, **kwargs):
+    dt = datetime(2000, 1, 1)
+    dr = bdate_range(dt, periods=k, freq=freq, name=name)
+    return DatetimeIndex(dr, name=name, **kwargs)
+
+
+def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs):
+    return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs)
+
+
+def makePeriodIndex(k=10, name=None, **kwargs):
+    dt = datetime(2000, 1, 1)
+    dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs)
+    return dr
+
+
+def makeMultiIndex(k=10, names=None, **kwargs):
+    return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs)
+
+
+_names = [
+    "Alice",
+    "Bob",
+    "Charlie",
+    "Dan",
+    "Edith",
+    "Frank",
+    "George",
+    "Hannah",
+    "Ingrid",
+    "Jerry",
+    "Kevin",
+    "Laura",
+    "Michael",
+    "Norbert",
+    "Oliver",
+    "Patricia",
+    "Quinn",
+    "Ray",
+    "Sarah",
+    "Tim",
+    "Ursula",
+    "Victor",
+    "Wendy",
+    "Xavier",
+    "Yvonne",
+    "Zelda",
+]
+
+
+def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
+    """
+    Make a DataFrame with a DatetimeIndex.
+
+    Parameters
+    ----------
+    start : str or Timestamp, default "2000-01-01"
+        The start of the index. Passed to date_range with `freq`.
+    end : str or Timestamp, default "2000-12-31"
+        The end of the index. Passed to date_range with `freq`.
+    freq : str or Freq
+        The frequency to use for the DatetimeIndex.
+    seed : int, optional
+        The random state seed.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame with the following columns:
+
+        * name : object dtype with string names
+        * id : int dtype with Poisson-distributed values
+        * x, y : float dtype
+
+    Examples
+    --------
+    >>> _make_timeseries()
+                  id    name         x         y
+    timestamp
+    2000-01-01   982   Frank  0.031261  0.986727
+    2000-01-02  1025   Edith -0.086358 -0.032920
+    2000-01-03   982   Edith  0.473177  0.298654
+    2000-01-04  1009   Sarah  0.534344 -0.750377
+    2000-01-05   963   Zelda -0.271573  0.054424
+    ...          ...     ...       ...       ...
+    2000-12-27   980  Ingrid -0.132333 -0.422195
+    2000-12-28   972   Frank -0.376007 -0.298687
+    2000-12-29  1009  Ursula -0.865047 -0.503133
+    2000-12-30  1000  Hannah -0.063757 -0.507336
+    2000-12-31   972     Tim -0.869120  0.531685
+    """
+    index = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
+    n = len(index)
+    state = np.random.RandomState(seed)
+    columns = {
+        "name": state.choice(_names, size=n),
+        "id": state.poisson(1000, size=n),
+        "x": state.rand(n) * 2 - 1,
+        "y": state.rand(n) * 2 - 1,
+    }
+    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
+    if df.index[-1] == end:
+        df = df.iloc[:-1]
+    return df
+
+
+def all_index_generator(k=10):
+    """Generator which can be iterated over to get instances of all the various
+    index classes.
+
+    Parameters
+    ----------
+    k : int
+        Length of each of the index instances.
+    """
+    all_make_index_funcs = [
+        makeIntIndex,
+        makeFloatIndex,
+        makeStringIndex,
+        makeUnicodeIndex,
+        makeDateIndex,
+        makePeriodIndex,
+        makeTimedeltaIndex,
+        makeBoolIndex,
+        makeRangeIndex,
+        makeIntervalIndex,
+        makeCategoricalIndex,
+    ]
+    for make_index_func in all_make_index_funcs:
+        yield make_index_func(k=k)
+
+
+def index_subclass_makers_generator():
+    make_index_funcs = [
+        makeDateIndex,
+        makePeriodIndex,
+        makeTimedeltaIndex,
+        makeRangeIndex,
+        makeIntervalIndex,
+        makeCategoricalIndex,
+        makeMultiIndex,
+    ]
+    for make_index_func in make_index_funcs:
+        yield make_index_func
+
+
+def all_timeseries_index_generator(k=10):
+    """Generator which can be iterated over to get instances of all the classes
+    which represent time-series.
+
+    Parameters
+    ----------
+    k : int
+        Length of each of the index instances.
+    """
+    make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex]
+    for make_index_func in make_index_funcs:
+        yield make_index_func(k=k)
+
+
+# make series
+def makeFloatSeries(name=None):
+    index = makeStringIndex(N)
+    return Series(randn(N), index=index, name=name)
+
+
+def makeStringSeries(name=None):
+    index = makeStringIndex(N)
+    return Series(randn(N), index=index, name=name)
+
+
+def makeObjectSeries(name=None):
+    data = makeStringIndex(N)
+    data = Index(data, dtype=object)
+    index = makeStringIndex(N)
+    return Series(data, index=index, name=name)
+
+
+def getSeriesData():
+    index = makeStringIndex(N)
+    return {c: Series(randn(N), index=index) for c in getCols(K)}
+
+
+def makeTimeSeries(nper=None, freq="B", name=None):
+    if nper is None:
+        nper = N
+    return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name)
+
+
+def makePeriodSeries(nper=None, name=None):
+    if nper is None:
+        nper = N
+    return Series(randn(nper), index=makePeriodIndex(nper), name=name)
+
+
+def getTimeSeriesData(nper=None, freq="B"):
+    return {c: makeTimeSeries(nper, freq) for c in getCols(K)}
+
+
+def getPeriodData(nper=None):
+    return {c: makePeriodSeries(nper) for c in getCols(K)}
+
+
+# make frame
+def makeTimeDataFrame(nper=None, freq="B"):
+    data = getTimeSeriesData(nper, freq)
+    return DataFrame(data)
+
+
+def makeDataFrame():
+    data = getSeriesData()
+    return DataFrame(data)
+
+
+def getMixedTypeDict():
+    index = Index(["a", "b", "c", "d", "e"])
+
+    data = {
+        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
+        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
+        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
+        "D": bdate_range("1/1/2009", periods=5),
+    }
+
+    return index, data
+
+
+def makeMixedDataFrame():
+    return DataFrame(getMixedTypeDict()[1])
+
+
+def makePeriodFrame(nper=None):
+    data = getPeriodData(nper)
+    return DataFrame(data)
+
+
+def makeCustomIndex(
+    nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None
+):
+    """Create an index/multiindex with given dimensions, levels, names, etc.
+
+    nentries - number of entries in index
+    nlevels - number of levels (> 1 produces multiindex)
+    prefix - a string prefix for labels
+    names - (Optional), bool or list of strings. If True will use default
+       names, if False will use no names, if a list is given, the name of
+       each level in the index will be taken from the list.
+    ndupe_l - (Optional), list of ints, the number of rows for which the
+       label will be repeated at the corresponding level, you can specify just
+       the first few, the rest will use the default ndupe_l of 1.
+       len(ndupe_l) <= nlevels.
+    idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
+       If idx_type is not None, `idx_nlevels` must be 1.
+       "i"/"f" creates an integer/float index,
+       "s"/"u" creates a string/unicode index,
+       "dt" creates a datetime index,
+       "td" creates a timedelta index.
+
+       If unspecified, string labels will be generated.
+    """
+
+    if ndupe_l is None:
+        ndupe_l = [1] * nlevels
+    assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
+    assert names is None or names is False or names is True or len(names) == nlevels
+    assert idx_type is None or (
+        idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1
+    )
+
+    if names is True:
+        # build default names
+        names = [prefix + str(i) for i in range(nlevels)]
+    if names is False:
+        # pass None to index constructor for no name
+        names = None
+
+    # make singleton case uniform
+    if isinstance(names, str) and nlevels == 1:
+        names = [names]
+
+    # specific 1D index type requested?
+    idx_func = dict(
+        i=makeIntIndex,
+        f=makeFloatIndex,
+        s=makeStringIndex,
+        u=makeUnicodeIndex,
+        dt=makeDateIndex,
+        td=makeTimedeltaIndex,
+        p=makePeriodIndex,
+    ).get(idx_type)
+    if idx_func:
+        idx = idx_func(nentries)
+        # but we need to fill in the name
+        if names:
+            idx.name = names[0]
+        return idx
+    elif idx_type is not None:
+        raise ValueError(
+            f"{repr(idx_type)} is not a legal value for `idx_type`, "
+            "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'."
+        )
+
+    if len(ndupe_l) < nlevels:
+        ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
+    assert len(ndupe_l) == nlevels
+
+    assert all(x > 0 for x in ndupe_l)
+
+    tuples = []
+    for i in range(nlevels):
+
+        def keyfunc(x):
+            import re
+
+            numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
+            return [int(num) for num in numeric_tuple]
+
+        # build a list of lists to create the index from
+        div_factor = nentries // ndupe_l[i] + 1
+        cnt = Counter()
+        for j in range(div_factor):
+            label = f"{prefix}_l{i}_g{j}"
+            cnt[label] = ndupe_l[i]
+        # cute Counter trick
+        result = sorted(cnt.elements(), key=keyfunc)[:nentries]
+        tuples.append(result)
+
+    tuples = list(zip(*tuples))
+
+    # convert tuples to index
+    if nentries == 1:
+        # we have a single level of tuples, i.e. a regular Index
+        index = Index(tuples[0], name=names[0])
+    elif nlevels == 1:
+        name = None if names is None else names[0]
+        index = Index((x[0] for x in tuples), name=name)
+    else:
+        index = MultiIndex.from_tuples(tuples, names=names)
+    return index
+
+
+def makeCustomDataframe(
+    nrows,
+    ncols,
+    c_idx_names=True,
+    r_idx_names=True,
+    c_idx_nlevels=1,
+    r_idx_nlevels=1,
+    data_gen_f=None,
+    c_ndupe_l=None,
+    r_ndupe_l=None,
+    dtype=None,
+    c_idx_type=None,
+    r_idx_type=None,
+):
+    """
+    nrows, ncols - number of data rows/cols
+    c_idx_names, r_idx_names - False/True/list of strings, yields no names,
+        default names or uses the provided names for the levels of the
+        corresponding index. You can provide a single string when
+        c_idx_nlevels == 1.
+    c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex
+    r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex
+    data_gen_f - a function f(row,col) which returns the data value
+        at that position, the default generator used yields values of the form
+        "RxCy" based on position.
+    c_ndupe_l, r_ndupe_l - list of integers, determines the number
+        of duplicates for each label at a given level of the corresponding
+        index. The default `None` value produces a multiplicity of 1 across
+        all levels, i.e. a unique index. Will accept a partial list of length
+        N < idx_nlevels, for just the first N levels. If ndupe doesn't divide
+        nrows/ncol, the last label might have lower multiplicity.
+    dtype - passed to the DataFrame constructor as is, in case you wish to
+        have more control in conjunction with a custom `data_gen_f`
+    r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td".
+ If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a timedelta index. + + if unspecified, string labels will be generated. + + Examples: + + # 5 row, 3 columns, default names on both, single index on both axis + >> makeCustomDataframe(5,3) + + # make the data a random int between 1 and 100 + >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) + + # 2-level multiindex on rows with each label duplicated + # twice on first level, default names on both axis, single + # index on both axis + >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) + + # DatetimeIndex on row, index with unicode labels on columns + # no names on either axis + >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, + r_idx_type="dt",c_idx_type="u") + + # 4-level multindex on rows with names provided, 2-level multindex + # on columns with default labels and default names. + >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, + r_idx_names=["FEE","FI","FO","FAM"], + c_idx_nlevels=2) + + >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + """ + + assert c_idx_nlevels > 0 + assert r_idx_nlevels > 0 + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) + + # by default, generate data based on location + if data_gen_f is None: + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] + + return DataFrame(data, index, columns, dtype=dtype) + + +def _create_missing_idx(nrows, ncols, density, random_state=None): + if random_state is None: + random_state = np.random + else: + random_state = np.random.RandomState(random_state) + + # below is cribbed from scipy.sparse + size = int(np.round((1 - density) * nrows * ncols)) + # generate a few more to ensure unique values + min_rows = 5 + fac = 1.02 + extra_size = min(size + min_rows, fac * size) + + def _gen_unique_rand(rng, _extra_size): + ind = rng.rand(int(_extra_size)) + return np.unique(np.floor(ind * nrows * ncols))[:size] + + ind = _gen_unique_rand(random_state, extra_size) + while ind.size < size: + extra_size *= 1.05 + ind = _gen_unique_rand(random_state, extra_size) + + j = np.floor(ind * 1.0 / nrows).astype(int) + i = (ind - j * nrows).astype(int) + return i.tolist(), j.tolist() + + +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + Parameters + ---------- + Density : float, optional + Float in (0, 1) that gives the percentage of non-missing numbers in + the DataFrame. + random_state : {np.random.RandomState, int}, optional + Random number generator or random seed. + + See makeCustomDataframe for descriptions of the rest of the parameters. 
+ """ + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.values[i, j] = np.nan + return df + + +def makeMissingDataframe(density=0.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) + df.values[i, j] = np.nan + return df + + +def optional_args(decorator): + """allows a decorator to take optional positional and keyword arguments. + Assumes that taking a single, callable, positional argument means that + it is decorating a function, i.e. something like this:: + + @my_decorator + def function(): pass + + Calls decorator with decorator(f, *args, **kwargs)""" + + @wraps(decorator) + def wrapper(*args, **kwargs): + def dec(f): + return decorator(f, *args, **kwargs) + + is_decorating = not kwargs and len(args) == 1 and callable(args[0]) + if is_decorating: + f = args[0] + args = [] + return dec(f) + else: + return dec + + return wrapper + + +# skip tests on exceptions with this message +_network_error_messages = ( + # 'urlopen error timed out', + # 'timeout: timed out', + # 'socket.timeout: timed out', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", +) + +# or this e.errno/e.reason.errno +_network_errno_vals = ( + 101, # Network is unreachable + 111, # Connection refused + 110, # Connection timed out + 104, # Connection reset Error + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out +) + +# Both of the above shouldn't mask real issues such as 404's +# or refused connections (changed DNS). +# But some tests (test_data yahoo) contact incredibly flakey +# servers. + +# and conditionally raise on exception types in _get_default_network_errors + + +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): + """Try to connect to the given url. True if succeeds, False if IOError + raised + + Parameters + ---------- + url : basestring + The URL to try to connect to + + Returns + ------- + connectable : bool + Return True if no IOError (unable to connect) or URLError (bad url) was + raised + """ + + if error_classes is None: + error_classes = _get_default_network_errors() + + try: + with urlopen(url): + pass + except error_classes: + return False + else: + return True + + +@optional_args +def network( + t, + url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=None, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): + """ + Label a test as requiring network connection and, if an error is + encountered, only raise if it does not find a network connection. 
+
+    In comparison to ``network``, this assumes an added contract to your test:
+    you must assert that, under normal conditions, your test will ONLY fail if
+    it does not have network connectivity.
+
+    You can call this in 3 ways: as a standard decorator, with keyword
+    arguments, or with a positional argument that is the url to check.
+
+    Parameters
+    ----------
+    t : callable
+        The test requiring network connectivity.
+    url : str
+        The url to test via ``pandas.io.common.urlopen`` to check
+        for connectivity. Defaults to 'http://www.google.com'.
+    raise_on_error : bool
+        If True, never catches errors.
+    check_before_test : bool
+        If True, checks connectivity before running the test case.
+    error_classes : tuple or Exception
+        Error classes to ignore. If not in ``error_classes``, raises the error.
+        Defaults to IOError. Be careful about changing the error classes here.
+    skip_errnos : iterable of int
+        Any exception that has .errno or .reason.errno set to one
+        of these values will be skipped with an appropriate
+        message.
+    _skip_on_messages : iterable of string
+        Any exception e for which one of the strings is
+        a substring of str(e) will be skipped with an appropriate
+        message. Intended to suppress errors where an errno isn't available.
+
+    Notes
+    -----
+    * ``raise_on_error`` supersedes ``check_before_test``
+
+    Returns
+    -------
+    t : callable
+        The decorated test ``t``, with checks for connectivity errors.
+
+    Examples
+    --------
+
+    Tests decorated with @network will fail if it's possible to make a network
+    connection to another URL (defaults to google.com)::
+
+      >>> from pandas._testing import network
+      >>> from pandas.io.common import urlopen
+      >>> @network
+      ... def test_network():
+      ...     with urlopen("rabbit://bonanza.com"):
+      ...         pass
+      Traceback
+         ...
+      URLError: <urlopen error unknown url type: rabbit>
+
+    You can specify alternative URLs::
+
+      >>> @network("http://www.yahoo.com")
+      ... def test_something_with_yahoo():
+      ...     raise IOError("Failure Message")
+      >>> test_something_with_yahoo()
+      Traceback (most recent call last):
+      ...
+      IOError: Failure Message
+
+    If you set check_before_test, it will check the url first and not run the
+    test on failure::
+
+      >>> @network("failing://url.blaher", check_before_test=True)
+      ... def test_something():
+      ...     print("I ran!")
+      ...     raise ValueError("Failure")
+      >>> test_something()
+      Traceback (most recent call last):
+      ...
+
+    Errors not related to networking will always be raised.
+    """
+    from pytest import skip
+
+    if error_classes is None:
+        error_classes = _get_default_network_errors()
+
+    t.network = True
+
+    @wraps(t)
+    def wrapper(*args, **kwargs):
+        if check_before_test and not raise_on_error:
+            if not can_connect(url, error_classes):
+                skip()
+        try:
+            return t(*args, **kwargs)
+        except Exception as err:
+            errno = getattr(err, "errno", None)
+            if not errno and hasattr(err, "reason"):
+                # some errors (e.g. URLError) wrap the errno in 'reason'
+                errno = getattr(err.reason, "errno", None)
+
+            if errno in skip_errnos:
+                skip(f"Skipping test due to known errno and error {err}")
+
+            e_str = str(err)
+
+            if any(m.lower() in e_str.lower() for m in _skip_on_messages):
+                skip(
+                    f"Skipping test because exception message is known and error {err}"
+                )
+
+            if not isinstance(err, error_classes):
+                raise
+
+            if raise_on_error or can_connect(url, error_classes):
+                raise
+            else:
+                skip(f"Skipping test due to lack of connectivity and error {err}")
+
+    return wrapper
+
+
+with_connectivity_check = network
+
+
+@contextmanager
+def assert_produces_warning(
+    expected_warning=Warning,
+    filter_level="always",
+    clear=None,
+    check_stacklevel=True,
+    raise_on_extra_warnings=True,
+):
+    """
+    Context manager for running code expected to either raise a specific
+    warning, or not raise any warnings. Verifies that the code raises the
+    expected warning, and that it does not raise any other unexpected
+    warnings. It is basically a wrapper around ``warnings.catch_warnings``.
+
+    Parameters
+    ----------
+    expected_warning : {Warning, False, None}, default Warning
+        The type of warning expected. ``Warning`` is the base
+        class for all warnings. To check that no warning is returned,
+        specify ``False`` or ``None``.
+    filter_level : str or None, default "always"
+        Specifies whether warnings are ignored, displayed, or turned
+        into errors.
+        Valid values are:
+
+        * "error" - turns matching warnings into exceptions
+        * "ignore" - discard the warning
+        * "always" - always emit a warning
+        * "default" - print the warning the first time it is generated
+          from each location
+        * "module" - print the warning the first time it is generated
+          from each module
+        * "once" - print the warning the first time it is generated
+
+    clear : str, default None
+        If not ``None`` then remove any previously raised warnings from
+        the ``__warningregistry__`` to ensure that no warning messages are
+        suppressed by this context manager. If ``None`` is specified,
+        the ``__warningregistry__`` keeps track of which warnings have been
+        shown, and does not show them again.
+    check_stacklevel : bool, default True
+        If True, displays the line that called the function containing
+        the warning to show where the function is called. Otherwise, the
+        line that implements the function is displayed.
+    raise_on_extra_warnings : bool, default True
+        Whether extra warnings not of the type `expected_warning` should
+        cause the test to fail.
+
+    Examples
+    --------
+    >>> import warnings
+    >>> with assert_produces_warning():
+    ...     warnings.warn(UserWarning())
+    ...
+    >>> with assert_produces_warning(False):
+    ...     warnings.warn(RuntimeWarning())
+    ...
+    Traceback (most recent call last):
+        ...
+    AssertionError: Caused unexpected warning(s): ['RuntimeWarning'].
+    >>> with assert_produces_warning(UserWarning):
+    ...     warnings.warn(RuntimeWarning())
+    Traceback (most recent call last):
+        ...
+    AssertionError: Did not see expected warning of class 'UserWarning'.
+
+    .. warning:: This is *not* thread-safe.
+    """
+    __tracebackhide__ = True
+
+    with warnings.catch_warnings(record=True) as w:
+
+        if clear is not None:
+            # make sure that we are clearing these warnings
+            # if they have happened before
+            # to guarantee that we will catch them
+            if not is_list_like(clear):
+                clear = [clear]
+            for m in clear:
+                try:
+                    m.__warningregistry__.clear()
+                except AttributeError:
+                    # module may not have __warningregistry__
+                    pass
+
+        saw_warning = False
+        warnings.simplefilter(filter_level)
+        yield w
+        extra_warnings = []
+
+        for actual_warning in w:
+            if expected_warning and issubclass(
+                actual_warning.category, expected_warning
+            ):
+                saw_warning = True
+
+                if check_stacklevel and issubclass(
+                    actual_warning.category, (FutureWarning, DeprecationWarning)
+                ):
+                    from inspect import getframeinfo, stack
+
+                    caller = getframeinfo(stack()[2][0])
+                    msg = (
+                        "Warning not set with correct stacklevel. "
+                        f"File where warning is raised: {actual_warning.filename} != "
+                        f"{caller.filename}. Warning message: {actual_warning.message}"
+                    )
+                    assert actual_warning.filename == caller.filename, msg
+            else:
+                extra_warnings.append(
+                    (
+                        actual_warning.category.__name__,
+                        actual_warning.message,
+                        actual_warning.filename,
+                        actual_warning.lineno,
+                    )
+                )
+        if expected_warning:
+            msg = (
+                f"Did not see expected warning of class "
+                f"{repr(expected_warning.__name__)}"
+            )
+            assert saw_warning, msg
+        if raise_on_extra_warnings and extra_warnings:
+            raise AssertionError(
+                f"Caused unexpected warning(s): {repr(extra_warnings)}"
+            )
+
+
+class RNGContext:
+    """
+    Context manager to set the numpy random number generator seed. Restores
+    the original state upon exiting the context manager.
+
+    Parameters
+    ----------
+    seed : int
+        Seed for numpy.random.seed
+
+    Examples
+    --------
+    ::
+
+        with RNGContext(42):
+            np.random.randn()
+    """
+
+    def __init__(self, seed):
+        self.seed = seed
+
+    def __enter__(self):
+
+        self.start_state = np.random.get_state()
+        np.random.seed(self.seed)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+
+        np.random.set_state(self.start_state)
+
+
+@contextmanager
+def with_csv_dialect(name, **kwargs):
+    """
+    Context manager to temporarily register a CSV dialect for parsing CSV.
+
+    Parameters
+    ----------
+    name : str
+        The name of the dialect.
+    kwargs : mapping
+        The parameters for the dialect.
+
+    Raises
+    ------
+    ValueError : the name of the dialect conflicts with a builtin one.
+
+    See Also
+    --------
+    csv : Python's CSV library.
+    """
+    import csv
+
+    _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}
+
+    if name in _BUILTIN_DIALECTS:
+        raise ValueError("Cannot override builtin dialect.")
+
+    csv.register_dialect(name, **kwargs)
+    yield
+    csv.unregister_dialect(name)
+
+
+@contextmanager
+def use_numexpr(use, min_elements=None):
+    from pandas.core.computation import expressions as expr
+
+    if min_elements is None:
+        min_elements = expr._MIN_ELEMENTS
+
+    olduse = expr._USE_NUMEXPR
+    oldmin = expr._MIN_ELEMENTS
+    expr.set_use_numexpr(use)
+    expr._MIN_ELEMENTS = min_elements
+    yield
+    expr._MIN_ELEMENTS = oldmin
+    expr.set_use_numexpr(olduse)
+
+
+def test_parallel(num_threads=2, kwargs_list=None):
+    """Decorator to run the same function multiple times in parallel.
+
+    Parameters
+    ----------
+    num_threads : int, optional
+        The number of times the function is run in parallel.
+    kwargs_list : list of dicts, optional
+        The list of kwargs to update original
+        function kwargs on different threads.
+
+    Notes
+    -----
+    This decorator does not pass the return value of the decorated function.
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper + + +class SubclassedSeries(Series): + _metadata = ["testattr", "name"] + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + +class SubclassedDataFrame(DataFrame): + _metadata = ["testattr"] + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + +class SubclassedCategorical(Categorical): + @property + def _constructor(self): + return SubclassedCategorical + + +@contextmanager +def set_timezone(tz: str): + """ + Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... + 'EDT' + """ + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() + + orig_tz = os.environ.get("TZ") + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) + + +def _make_skipna_wrapper(alternative, skipna_alternative=None): + """ + Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. + skipna_alternative : function + The function to be called on the original array + + Returns + ------- + function + """ + if skipna_alternative: + + def skipna_wrapper(x): + return skipna_alternative(x.values) + + else: + + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper + + +def convert_rows_list_to_csv_str(rows_list: List[str]): + """ + Convert list of CSV rows to single CSV-formatted string for current OS. + + This method is used for creating expected value of to_csv() method. + + Parameters + ---------- + rows_list : List[str] + Each element represents the row of csv. + + Returns + ------- + str + Expected output of to_csv() in current OS. 
+ """ + sep = os.linesep + expected = sep.join(rows_list) + sep + return expected diff --git a/pandas/_typing.py b/pandas/_typing.py index 69b08c581cff9..171b76b4d2c4b 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -2,10 +2,14 @@ from typing import ( IO, TYPE_CHECKING, + Any, AnyStr, + Callable, Collection, Dict, + Hashable, List, + Mapping, Optional, TypeVar, Union, @@ -21,23 +25,49 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.series import Series # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 + from pandas.core.series import Series # noqa: F401 + from pandas.core.frame import DataFrame # noqa: F401 +# array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) ArrayLike = TypeVar("ArrayLike", "ExtensionArray", np.ndarray) + +# scalars + +PythonScalar = Union[str, int, float, bool] DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") +PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar = Union[PythonScalar, PandasScalar] + +# other + Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] +# FrameOrSeriesUnion means either a DataFrame or a Series. E.g. +# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series +# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed +# in, either a DataFrame or a Series is returned. +FrameOrSeriesUnion = Union["DataFrame", "Series"] + +# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is +# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a +# Series is passed into a function, a Series is always returned and if a DataFrame is +# passed in, a DataFrame is always returned. FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") -Scalar = Union[str, int, float, bool] + Axis = Union[str, int] +Label = Optional[Hashable] +Level = Union[Label, int] Ordered = Optional[bool] -JSONSerializable = Union[Scalar, List, Dict] - +JSONSerializable = Union[PythonScalar, List, Dict] Axes = Collection +# For functions like rename that convert one label to another +Renamer = Union[Mapping[Label, Any], Callable[[Label], Label]] + # to maintain type information across generic functions and parametrization -_T = TypeVar("_T") +T = TypeVar("T") diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index d0a26864a1102..bebbb38b4aefa 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,2 +1,2 @@ """ public toolkit API """ -from . import extensions, indexers, types # noqa +from pandas.api import extensions, indexers, types # noqa diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..3019dd0e9b371 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,13 +1,27 @@ -"""Public API for extending pandas objects.""" -from pandas.core.dtypes.dtypes import ( # noqa: F401 - ExtensionDtype, - register_extension_dtype, -) +""" +Public API for extending pandas objects. 
+""" + +from pandas._libs.lib import no_default -from pandas.core.accessor import ( # noqa: F401 +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + +from pandas.core.accessor import ( register_dataframe_accessor, register_index_accessor, register_series_accessor, ) -from pandas.core.algorithms import take # noqa: F401 -from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin + +__all__ = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", +] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index a5d6bc07da3eb..10654eb0888ee 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -1,2 +1,8 @@ -"""Public API for Rolling Window Indexers""" -from pandas.core.window.indexers import BaseIndexer # noqa: F401 +""" +Public API for Rolling Window Indexers. +""" + +from pandas.core.indexers import check_bool_array_indexer +from pandas.core.window.indexers import BaseIndexer + +__all__ = ["check_bool_array_indexer", "BaseIndexer"] diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index f32e1abe28cc1..3495b493707c2 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -1,12 +1,23 @@ -""" public toolkit API """ +""" +Public toolkit API. +""" -from pandas._libs.lib import infer_dtype # noqa: F401 +from pandas._libs.lib import infer_dtype from pandas.core.dtypes.api import * # noqa: F403, F401 -from pandas.core.dtypes.concat import union_categoricals # noqa: F401 -from pandas.core.dtypes.dtypes import ( # noqa: F401 +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, ) + +__all__ = [ + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", +] diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index bac976333ef0a..7aeb0327139f1 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -16,7 +16,7 @@ "odfpy": "1.3.0", "openpyxl": "2.5.7", "pandas_gbq": "0.8.0", - "pyarrow": "0.12.0", + "pyarrow": "0.13.0", "pytables": "3.4.2", "pytest": "5.0.1", "s3fs": "0.3.0", diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index 479eddf0c0536..588bd24ddf797 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -1,15 +1,24 @@ -from collections import ChainMap +from typing import ChainMap, MutableMapping, TypeVar, cast +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") -class DeepChainMap(ChainMap): - def __setitem__(self, key, value): + +class DeepChainMap(ChainMap[_KT, _VT]): + """Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. 
+ """ + + def __setitem__(self, key: _KT, value: _VT) -> None: for mapping in self.maps: - if key in mapping: - mapping[key] = value + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) + if key in mutable_mapping: + mutable_mapping[key] = value return - self.maps[0][key] = value + cast(MutableMapping[_KT, _VT], self.maps[0])[key] = value - def __delitem__(self, key): + def __delitem__(self, key: _KT) -> None: """ Raises ------ @@ -17,7 +26,8 @@ def __delitem__(self, key): If `key` doesn't exist. """ for mapping in self.maps: + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) if key in mapping: - del mapping[key] + del mutable_mapping[key] return raise KeyError(key) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index fffe09a74571e..50f234cbf9419 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -33,13 +33,26 @@ class CompatValidator: - def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): + def __init__( + self, + defaults, + fname=None, + method: Optional[str] = None, + max_fname_arg_count=None, + ): self.fname = fname self.method = method self.defaults = defaults self.max_fname_arg_count = max_fname_arg_count - def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): + def __call__( + self, + args, + kwargs, + fname=None, + max_fname_arg_count=None, + method: Optional[str] = None, + ) -> None: if args or kwargs: fname = self.fname if fname is None else fname max_fname_arg_count = ( @@ -169,13 +182,6 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() -COMPRESS_DEFAULTS["axis"] = None -COMPRESS_DEFAULTS["out"] = None -validate_compress = CompatValidator( - COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 -) - CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None @@ -307,7 +313,7 @@ def validate_take_with_convert(convert, args, kwargs): ) -def validate_window_func(name, args, kwargs): +def validate_window_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -322,7 +328,7 @@ def validate_window_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_rolling_func(name, args, kwargs): +def validate_rolling_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -337,7 +343,7 @@ def validate_rolling_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_expanding_func(name, args, kwargs): +def validate_expanding_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -352,7 +358,7 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs, allowed=None): +def validate_groupby_func(name, args, kwargs, allowed=None) -> None: """ 'args' and 'kwargs' should be empty, except for allowed kwargs because all of @@ -366,16 +372,15 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - f"numpy operations are not valid with " - f"groupby. Use .groupby(...).{name}() " - f"instead" + "numpy operations are not valid with groupby. 
" + f"Use .groupby(...).{name}() instead" ) RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") -def validate_resampler_func(method, args, kwargs): +def validate_resampler_func(method: str, args, kwargs) -> None: """ 'args' and 'kwargs' should be empty because all of their necessary parameters are explicitly listed in @@ -392,7 +397,7 @@ def validate_resampler_func(method, args, kwargs): raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis): +def validate_minmax_axis(axis: Optional[int]) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index e8fd390456f82..0a1a1376bfc8d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -169,9 +169,9 @@ def __new__(cls) -> "DataFrame": # type: ignore # our Unpickler sub-class to override methods and some dispatcher -# functions for compat - +# functions for compat and uses a non-public class of the pickle module. +# error: Name 'pkl._Unpickler' is not defined class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass diff --git a/pandas/conftest.py b/pandas/conftest.py index 0a3bf31cf9666..0c964452df5da 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,4 @@ +from collections import abc from datetime import date, time, timedelta, timezone from decimal import Decimal import operator @@ -14,8 +15,8 @@ import pandas as pd from pandas import DataFrame +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm hypothesis.settings.register_profile( "ci", @@ -64,25 +65,28 @@ def pytest_runtest_setup(item): pytest.skip("skipping high memory test since --run-high-memory was not set") -# Configurations for all tests and all test modules - - @pytest.fixture(autouse=True) def configure_tests(): + """ + Configure settings for all tests and test modules. + """ pd.set_option("chained_assignment", "raise") -# For running doctests: make np and pd names available - - @pytest.fixture(autouse=True) def add_imports(doctest_namespace): + """ + Make `np` and `pd` names available for doctests. + """ doctest_namespace["np"] = np doctest_namespace["pd"] = pd @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def spmatrix(request): + """ + Yields scipy sparse matrix classes. + """ from scipy import sparse return getattr(sparse, request.param + "_matrix") @@ -91,8 +95,8 @@ def spmatrix(request): @pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") def axis(request): """ - Fixture for returning the axis numbers of a DataFrame. - """ + Fixture for returning the axis numbers of a DataFrame. + """ return request.param @@ -236,6 +240,10 @@ def all_boolean_reductions(request): @pytest.fixture(params=list(_cython_table)) def cython_table_items(request): + """ + Yields a tuple of a function and its corresponding name. Correspond to + the list of aggregator "Cython functions" used on selected table items. + """ return request.param @@ -336,6 +344,9 @@ def writable(request): @pytest.fixture(scope="module") def datetime_tz_utc(): + """ + Yields the UTC timezone object from the datetime module. + """ return timezone.utc @@ -357,6 +368,9 @@ def join_type(request): @pytest.fixture def strict_data_files(pytestconfig): + """ + Returns the configuration for the test setting `--strict-data-files`. 
+ """ return pytestconfig.getoption("--strict-data-files") @@ -894,3 +908,38 @@ def index_or_series(request): See GH#29725 """ return request.param + + +@pytest.fixture +def dict_subclass(): + """ + Fixture for a dictionary subclass. + """ + + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + return TestSubDict + + +@pytest.fixture +def non_mapping_dict_subclass(): + """ + Fixture for a non-mapping dictionary subclass. + """ + + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict + + def __getitem__(self, key): + return self._data.__getitem__(key) + + def __iter__(self): + return self._data.__iter__() + + def __len__(self): + return self._data.__len__() + + return TestNonDictMapping diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42cfd9d54ac19..39e8e9008a844 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -50,6 +50,9 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices +if TYPE_CHECKING: + from pandas import Series + _shared_docs: Dict[str, str] = {} @@ -651,7 +654,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> ABCSeries: +) -> "Series": """ Compute a histogram of the counts of non-null values. @@ -793,7 +796,7 @@ def duplicated(values, keep="first") -> np.ndarray: return f(values, keep=keep) -def mode(values, dropna: bool = True) -> ABCSeries: +def mode(values, dropna: bool = True) -> "Series": """ Returns the mode(s) of an array. 
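The algorithms.py changes above replace runtime-only imports used in
annotations with a TYPE_CHECKING-guarded import plus string annotations, so a
return type can name Series without triggering a circular import at runtime.
A minimal sketch of the pattern; the function name is illustrative and not
part of this patch:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by type checkers; never imported at runtime.
        from pandas import Series


    def mode_like(values, dropna: bool = True) -> "Series":
        # The quoted annotation is resolved lazily, so the real Series
        # class is only needed while type checking.
        import pandas as pd

        return pd.Series(values).mode(dropna=dropna)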
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index df26cd94b5ed9..bf3469924a700 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,16 +1,36 @@ -from .base import ( # noqa: F401 +from pandas.core.arrays.base import ( ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, try_cast_to_ea, ) -from .boolean import BooleanArray # noqa: F401 -from .categorical import Categorical # noqa: F401 -from .datetimes import DatetimeArray # noqa: F401 -from .integer import IntegerArray, integer_array # noqa: F401 -from .interval import IntervalArray # noqa: F401 -from .numpy_ import PandasArray, PandasDtype # noqa: F401 -from .period import PeriodArray, period_array # noqa: F401 -from .sparse import SparseArray # noqa: F401 -from .string_ import StringArray # noqa: F401 -from .timedeltas import TimedeltaArray # noqa: F401 +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.integer import IntegerArray, integer_array +from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +from pandas.core.arrays.period import PeriodArray, period_array +from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.timedeltas import TimedeltaArray + +__all__ = [ + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", + "try_cast_to_ea", + "BooleanArray", + "Categorical", + "DatetimeArray", + "IntegerArray", + "integer_array", + "IntervalArray", + "PandasArray", + "PandasDtype", + "PeriodArray", + "period_array", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py new file mode 100644 index 0000000000000..e0d33bebeb421 --- /dev/null +++ b/pandas/core/arrays/_arrow_utils.py @@ -0,0 +1,124 @@ +from distutils.version import LooseVersion +import json + +import numpy as np +import pyarrow + +from pandas.core.arrays.interval import _VALID_CLOSED + +_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") + + +def pyarrow_array_to_numpy_and_mask(arr, dtype): + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. 
+ + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask + + +if _pyarrow_version_ge_015: + # the pyarrow extension types are only available for pyarrow 0.15+ + + class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + # register the type with a dummy instance + _period_type = ArrowPeriodType("D") + pyarrow.register_extension_type(_period_type) + + class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + # register the type with a dummy instance + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 96a4eb1b3bf32..9723343ea7af5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -350,6 +351,39 @@ def 
__iter__(self): for i in range(len(self)): yield self[i] + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index a8fcd6d03847c..fa1cbc87cc5c1 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -15,7 +15,6 @@ is_extension_array_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, @@ -27,8 +26,8 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops -from pandas.core.algorithms import take -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin + +from .masked import BaseMaskedArray if TYPE_CHECKING: from pandas._typing import Scalar @@ -60,6 +59,8 @@ class BooleanDtype(ExtensionDtype): BooleanDtype """ + name = "boolean" + @property def na_value(self) -> "Scalar": """ @@ -79,19 +80,6 @@ def type(self) -> Type: def kind(self) -> str: return "b" - @property - def name(self) -> str: - """ - The alias for BooleanDtype is ``'boolean'``. - """ - return "boolean" - - @classmethod - def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "boolean": - return cls() - return super().construct_from_string(string) - @classmethod def construct_array_type(cls) -> "Type[BooleanArray]": return BooleanArray @@ -206,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False): return values, mask -class BooleanArray(ExtensionArray, ExtensionOpsMixin): +class BooleanArray(BaseMaskedArray): """ Array of boolean (True/False) data with missing values. 
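The hunk above adds a baseline ExtensionArray.to_numpy. A hedged usage sketch
of the behavior it enables (results described in comments, not verified
doctest output):

    import numpy as np
    import pandas as pd

    arr = pd.array([True, False, None], dtype="boolean")

    # default: an object-dtype ndarray that preserves the missing value
    arr.to_numpy()

    # na_value swaps missing entries for a chosen sentinel before returning;
    # object dtype is used here so the sentinel needs no cast
    arr.to_numpy(dtype=object, na_value=np.nan)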
@@ -256,10 +244,13 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
 
     >>> pd.array([True, False, None], dtype="boolean")
     <BooleanArray>
-    [True, False, NA]
+    [True, False, <NA>]
     Length: 3, dtype: boolean
     """
 
+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value = False
+
     def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
         if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
             raise TypeError(
@@ -304,59 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
     def _from_factorized(cls, values, original: "BooleanArray"):
         return cls._from_sequence(values, dtype=original.dtype)
 
-    def _formatter(self, boxed=False):
-        return str
-
-    def __getitem__(self, item):
-        if is_integer(item):
-            if self._mask[item]:
-                return self.dtype.na_value
-            return self._data[item]
-        return type(self)(self._data[item], self._mask[item])
-
-    def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
-        """
-        Coerce to an ndarray of object dtype or bool dtype (if force_bool=True).
-
-        Parameters
-        ----------
-        dtype : dtype, default object
-            The numpy dtype to convert to
-        na_value : scalar, optional
-            Scalar missing value indicator to use in numpy array. Defaults
-            to the native missing value indicator of this array (pd.NA).
-        """
-        if dtype is None:
-            dtype = object
-        if is_bool_dtype(dtype):
-            if not self.isna().any():
-                return self._data
-            else:
-                raise ValueError(
-                    "cannot convert to bool numpy array in presence of missing values"
-                )
-        data = self._data.astype(dtype)
-        data[self._mask] = na_value
-        return data
-
-    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
-
-    def __array__(self, dtype=None):
-        """
-        the array interface, return my values
-        We return an object array here to preserve our scalar values
-        """
-        # by default (no dtype specified), return an object array
-        return self._coerce_to_ndarray(dtype=dtype)
-
-    def __arrow_array__(self, type=None):
-        """
-        Convert myself into a pyarrow Array.
-        """
-        import pyarrow as pa
-
-        return pa.array(self._data, mask=self._mask, type=type)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -404,40 +342,6 @@ def reconstruct(x):
         else:
             return reconstruct(result)
 
-    def __iter__(self):
-        for i in range(len(self)):
-            if self._mask[i]:
-                yield self.dtype.na_value
-            else:
-                yield self._data[i]
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # we always fill with False internally
-        # to avoid upcasting
-        data_fill_value = False if isna(fill_value) else fill_value
-        result = take(
-            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
-        )
-
-        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
-
-        # if we are filling
-        # we only fill where the indexer is null
-        # not existing missing values
-        # TODO(jreback) what if we have a non-na float as a fill value?
- if allow_fill and notna(fill_value): - fill_mask = np.asarray(indexer) == -1 - result[fill_mask] = fill_value - mask = mask ^ fill_mask - - return type(self)(result, mask, copy=False) - - def copy(self): - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() - return type(self)(data, mask, copy=False) - def __setitem__(self, key, value): _is_scalar = is_scalar(value) if _is_scalar: @@ -451,26 +355,6 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self): - return len(self._data) - - @property - def nbytes(self): - return self._data.nbytes + self._mask.nbytes - - def isna(self): - return self._mask - - @property - def _na_value(self): - return self._dtype.na_value - - @classmethod - def _concat_same_type(cls, to_concat): - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask) - def astype(self, dtype, copy=True): """ Cast to a NumPy array or ExtensionArray with 'dtype'. @@ -503,7 +387,7 @@ def astype(self, dtype, copy=True): if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) @@ -515,7 +399,7 @@ def astype(self, dtype, copy=True): ) # for integer, error if there are missing values if is_integer_dtype(dtype): - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) @@ -523,54 +407,8 @@ def astype(self, dtype, copy=True): if is_float_dtype(dtype): na_value = np.nan # coerce - data = self._coerce_to_ndarray(na_value=na_value) - return astype_nansafe(data, dtype, copy=None) - - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. 
-
-        Returns
-        -------
-        counts : Series
-
-        See Also
-        --------
-        Series.value_counts
-
-        """
-
-        from pandas import Index, Series
-
-        # compute counts on the data with no nans
-        data = self._data[~self._mask]
-        value_counts = Index(data).value_counts()
-        array = value_counts.values
-
-        # TODO(extension)
-        # if we have allow Index to hold an ExtensionArray
-        # this is easier
-        index = value_counts.index.values.astype(bool).astype(object)
-
-        # if we want nans, count the mask
-        if not dropna:
-
-            # TODO(extension)
-            # appending to an Index *always* infers
-            # w/o passing the dtype
-            array = np.append(array, [self._mask.sum()])
-            index = Index(
-                np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object
-            )
-
-        return Series(array, index=index)
+        data = self.to_numpy(na_value=na_value)
+        return astype_nansafe(data, dtype, copy=False)
 
     def _values_for_argsort(self) -> np.ndarray:
         """
@@ -643,7 +481,7 @@ def any(self, skipna: bool = True, **kwargs):
         >>> pd.array([True, False, pd.NA]).any(skipna=False)
         True
         >>> pd.array([False, False, pd.NA]).any(skipna=False)
-        NA
+        <NA>
         """
         kwargs.pop("axis", None)
         nv.validate_any((), kwargs)
@@ -708,7 +546,7 @@ def all(self, skipna: bool = True, **kwargs):
         required (whether ``pd.NA`` is True or False influences the result):
 
         >>> pd.array([True, True, pd.NA]).all(skipna=False)
-        NA
+        <NA>
         >>> pd.array([True, False, pd.NA]).all(skipna=False)
         False
         """
@@ -730,7 +568,6 @@ def all(self, skipna: bool = True, **kwargs):
     @classmethod
     def _create_logical_method(cls, op):
         def logical_method(self, other):
-
            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
                 # Rely on pandas to unbox and dispatch to us.
                 return NotImplemented
@@ -755,9 +592,8 @@ def logical_method(self, other):
 
            if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)):
                 raise TypeError(
-                    "'other' should be pandas.NA or a bool. Got {} instead.".format(
-                        type(other).__name__
-                    )
+                    "'other' should be pandas.NA or a bool. "
+                    f"Got {type(other).__name__} instead."
                 )
 
            if not other_is_scalar and len(self) != len(other):
@@ -772,14 +608,17 @@ def logical_method(self, other):
 
             return BooleanArray(result, mask)
 
-        name = "__{name}__".format(name=op.__name__)
+        name = f"__{op.__name__}__"
         return set_function_name(logical_method, name, cls)
 
     @classmethod
     def _create_comparison_method(cls, op):
         def cmp_method(self, other):
+            from pandas.arrays import IntegerArray
 
-            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
+            if isinstance(
+                other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray)
+            ):
                 # Rely on pandas to unbox and dispatch to us.
                return NotImplemented
@@ -819,7 +658,7 @@ def cmp_method(self, other):
 
             return BooleanArray(result, mask, copy=False)
 
-        name = "__{name}__".format(name=op.__name__)
+        name = f"__{op.__name__}__"
         return set_function_name(cmp_method, name, cls)
 
     def _reduce(self, name, skipna=True, **kwargs):
@@ -922,7 +761,7 @@ def boolean_arithmetic_method(self, other):
 
             return self._maybe_mask_result(result, mask, other, op_name)
 
-        name = "__{name}__".format(name=op_name)
+        name = f"__{op_name}__"
         return set_function_name(boolean_arithmetic_method, name, cls)
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 4d6be8221557d..2806635211459 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1,6 +1,6 @@
 import operator
 from shutil import get_terminal_size
-from typing import Type, Union, cast
+from typing import Dict, Hashable, List, Type, Union, cast
 from warnings import warn
 
 import numpy as np
@@ -8,7 +8,7 @@
 from pandas._config import get_option
 
 from pandas._libs import algos as libalgos, hashtable as htable
-from pandas._typing import ArrayLike, Dtype, Ordered
+from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import (
     Appender,
@@ -39,24 +39,28 @@
 )
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
-from pandas.core.dtypes.inference import is_hashable
+from pandas.core.dtypes.inference import is_array_like, is_hashable
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import ops
 from pandas.core.accessor import PandasDelegate, delegate_names
 import pandas.core.algorithms as algorithms
 from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
+from pandas.core.arrays.base import (
+    ExtensionArray,
+    _extension_array_shared_docs,
+    try_cast_to_ea,
+)
 from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
+from pandas.core.indexers import check_bool_array_indexer
 from pandas.core.missing import interpolate_2d
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.sorting import nargsort
 
 from pandas.io.formats import console
 
-from .base import ExtensionArray, _extension_array_shared_docs, try_cast_to_ea
-
 
 def _cat_compare_op(op):
     opname = f"__{op.__name__}__"
@@ -232,7 +236,7 @@ class Categorical(ExtensionArray, PandasObject):
         `categories` attribute (which in turn is the `categories` argument, if
         provided).
     dtype : CategoricalDtype
-        An instance of ``CategoricalDtype`` to use for this categorical
+        An instance of ``CategoricalDtype`` to use for this categorical.
 
         .. versionadded:: 0.21.0
 
@@ -272,7 +276,7 @@ class Categorical(ExtensionArray, PandasObject):
     Notes
     -----
     See the `user guide
-    `_
+    `_
     for more.
 
     Examples
@@ -302,7 +306,7 @@ class Categorical(ExtensionArray, PandasObject):
     __array_priority__ = 1000
     _dtype = CategoricalDtype(ordered=False)
     # tolist is not actually deprecated, just suppressed in the __dir__
-    _deprecations = PandasObject._deprecations | frozenset(["tolist", "itemsize"])
+    _deprecations = PandasObject._deprecations | frozenset(["tolist"])
     _typ = "categorical"
 
     def __init__(
@@ -511,7 +515,7 @@ def itemsize(self) -> int:
         """
         return self.categories.itemsize
 
-    def tolist(self) -> list:
+    def tolist(self) -> List[Scalar]:
         """
         Return a list of the values.
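Tying the ``BooleanArray`` comparison and naming changes above together: comparisons now return a masked ``BooleanArray`` rather than a plain ndarray, and ``pd.NA`` propagates through the mask. A hedged sketch of the resulting behaviour (hypothetical session, assuming this patch is applied):

    import pandas as pd

    a = pd.array([True, False, None], dtype="boolean")
    a == True
    # <BooleanArray>
    # [True, False, <NA>]
    # Length: 3, dtype: boolean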
@@ -1260,7 +1264,7 @@ def shift(self, periods, fill_value=None): return self.from_codes(codes, dtype=self.dtype) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1873,7 +1877,7 @@ def __iter__(self): """ return iter(self._internal_get_values().tolist()) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. """ @@ -1883,7 +1887,7 @@ def __contains__(self, key): return contains(self, key, container=self._codes) - def _tidy_repr(self, max_vals=10, footer=True): + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ @@ -1920,7 +1924,7 @@ def _repr_categories(self): category_strs = [x.strip() for x in category_strs] return category_strs - def _repr_categories_info(self): + def _repr_categories_info(self) -> str: """ Returns a string representation of the footer. """ @@ -1950,11 +1954,11 @@ def _repr_categories_info(self): # replace to simple save space by return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" - def _repr_footer(self): + def _repr_footer(self) -> str: info = self._repr_categories_info() return f"Length: {len(self)}\n{info}" - def _get_repr(self, length=True, na_rep="NaN", footer=True): + def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str: from pandas.io.formats import format as fmt formatter = fmt.CategoricalFormatter( @@ -1996,10 +2000,17 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] - else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + + if is_list_like(key) and not is_array_like(key): + key = np.asarray(key) + + if com.is_bool_indexer(key): + key = check_bool_array_indexer(self, key) + + result = self._codes[key] + if result.ndim > 1: + return result + return self._constructor(result, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): """ @@ -2067,7 +2078,7 @@ def __setitem__(self, key, value): lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer - def _reverse_indexer(self): + def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. 
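For orientation, ``_reverse_indexer`` (typed above as ``Dict[Hashable, np.ndarray]``) maps each category to the integer positions holding its code. A rough, illustrative equivalent using only public attributes (not the actual implementation, which uses a grouped sort for speed):

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(list("abca"))
    # Positions of each category's code; mirrors cat._reverse_indexer().
    {c: np.flatnonzero(cat.codes == i) for i, c in enumerate(cat.categories)}
    # {'a': array([0, 3]), 'b': array([1]), 'c': array([2])}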
@@ -2097,8 +2108,8 @@ def _reverse_indexer(self): self.codes.astype("int64"), categories.size ) counts = counts.cumsum() - result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, result)) + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(categories, _result)) return result # reduction ops # diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ceeaf018eb5f3..d7c508c890a46 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -11,6 +11,7 @@ from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 from pandas._typing import DatetimeLikeScalar +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution @@ -27,27 +28,101 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_offsetlike, is_period_dtype, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodArray, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import missing, nanops +from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import make_invalid_op +from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick -from .base import ExtensionArray, ExtensionOpsMixin + +def _datetimelike_array_cmp(cls, op): + """ + Wrap comparison operations to convert Timestamp/Timedelta/Period-like to + boxed scalars/arrays. + """ + opname = f"__{op.__name__}__" + nat_result = opname == "__ne__" + + @unpack_zerodim_and_defer(opname) + def wrapper(self, other): + + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + return invalid_comparison(self, other, op) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + self._check_compatible_with(other) + + other_i8 = self._unbox_scalar(other) + + result = op(self.view("i8"), other_i8) + if isna(other): + result.fill(nat_result) + + elif not is_list_like(other): + return invalid_comparison(self, other, op) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + if isinstance(other, list): + # TODO: could use pd.Index to do inference? 
+                other = np.array(other)
+
+            if not isinstance(other, (np.ndarray, type(self))):
+                return invalid_comparison(self, other, op)
+
+            if is_object_dtype(other):
+                # We have to use comp_method_OBJECT_ARRAY instead of numpy
+                # comparison otherwise it would fail to raise when
+                # comparing tz-aware and tz-naive
+                with np.errstate(all="ignore"):
+                    result = ops.comp_method_OBJECT_ARRAY(
+                        op, self.astype(object), other
+                    )
+                o_mask = isna(other)
+
+            elif not type(self)._is_recognized_dtype(other.dtype):
+                return invalid_comparison(self, other, op)
+
+            else:
+                # For PeriodDType this casting is unnecessary
+                other = type(self)._from_sequence(other)
+                self._check_compatible_with(other)
+
+                result = op(self.view("i8"), other.view("i8"))
+                o_mask = other._isnan
+
+            if o_mask.any():
+                result[o_mask] = nat_result
+
+        if self._hasnans:
+            result[self._isnan] = nat_result
+
+        return result
+
+    return set_function_name(wrapper, opname, cls)
 
 
 class AttributesMixin:
@@ -109,7 +184,7 @@ def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) ->
         raise AbstractMethodError(self)
 
     def _check_compatible_with(
-        self, other: Union[Period, Timestamp, Timedelta, NaTType]
+        self, other: Union[Period, Timestamp, Timedelta, NaTType], setitem: bool = False
     ) -> None:
         """
         Verify that `self` and `other` are compatible.
@@ -123,6 +198,9 @@ def _check_compatible_with(
         Parameters
         ----------
         other
+        setitem : bool, default False
+            For __setitem__ we may have stricter compatibility restrictions than
+            for comparisons.
 
         Raises
         ------
@@ -289,16 +367,19 @@ class TimelikeOps:
 
     def _round(self, freq, mode, ambiguous, nonexistent):
         # round the local times
-        values = _ensure_datetimelike_to_i8(self)
+        if is_datetime64tz_dtype(self):
+            # operate on naive timestamps, then convert back to aware
+            naive = self.tz_localize(None)
+            result = naive._round(freq, mode, ambiguous, nonexistent)
+            aware = result.tz_localize(
+                self.tz, ambiguous=ambiguous, nonexistent=nonexistent
+            )
+            return aware
+
+        values = self.view("i8")
         result = round_nsint64(values, mode, freq)
         result = self._maybe_mask_results(result, fill_value=NaT)
-
-        dtype = self.dtype
-        if is_datetime64tz_dtype(self):
-            dtype = None
-        return self._ensure_localized(
-            self._simple_new(result, dtype=dtype), ambiguous, nonexistent
-        )
+        return self._simple_new(result, dtype=self.dtype)
 
     @Appender((_round_doc + _round_example).format(op="round"))
     def round(self, freq, ambiguous="raise", nonexistent="raise"):
@@ -400,7 +481,7 @@ def _formatter(self, boxed=False):
     def nbytes(self):
         return self._data.nbytes
 
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None) -> np.ndarray:
         # used for Timedelta/DatetimeArray, overwritten by PeriodArray
         if is_object_dtype(dtype):
             return np.array(list(self), dtype=object)
@@ -437,7 +518,7 @@ def __getitem__(self, key):
             return type(self)(val, dtype=self.dtype)
 
         if com.is_bool_indexer(key):
-            key = np.asarray(key, dtype=bool)
+            key = check_bool_array_indexer(self, key)
             if key.all():
                 key = slice(0, None, None)
             else:
@@ -462,8 +543,6 @@ def __getitem__(self, key):
         if result.ndim > 1:
             # To support MPL which performs slicing with 2 dim
             # even though it only has 1 dim by definition
-            if is_period:
-                return self._simple_new(result, dtype=self.dtype, freq=freq)
             return result
 
         return self._simple_new(result, dtype=self.dtype, freq=freq)
@@ -500,10 +579,10 @@ def __setitem__(
                 return
 
             value = type(self)._from_sequence(value, dtype=self.dtype)
-            self._check_compatible_with(value)
+            self._check_compatible_with(value, setitem=True)
             value = value.asi8
        elif isinstance(value, self._scalar_type):
-            self._check_compatible_with(value)
+            self._check_compatible_with(value, setitem=True)
             value = self._unbox_scalar(value)
         elif is_valid_nat_for_dtype(value, self.dtype):
             value = iNaT
@@ -588,7 +667,17 @@ def _validate_fill_value(self, fill_value):
         ------
         ValueError
         """
-        raise AbstractMethodError(self)
+        if isna(fill_value):
+            fill_value = iNaT
+        elif isinstance(fill_value, self._recognized_scalars):
+            self._check_compatible_with(fill_value)
+            fill_value = self._scalar_type(fill_value)
+            fill_value = self._unbox_scalar(fill_value)
+        else:
+            raise ValueError(
+                f"'fill_value' should be a {self._scalar_type}. Got '{fill_value}'."
+            )
+        return fill_value
 
     def take(self, indices, allow_fill=False, fill_value=None):
         if allow_fill:
@@ -831,7 +920,7 @@ def freq(self, value):
     @property
     def freqstr(self):
         """
-        Return the frequency object as a string if its set, otherwise None
+        Return the frequency object as a string if it's set, otherwise None.
         """
         if self.freq is None:
             return None
@@ -921,6 +1010,7 @@ def _is_unique(self):
 
     # ------------------------------------------------------------------
     # Arithmetic Methods
+    _create_comparison_method = classmethod(_datetimelike_array_cmp)
 
     # pow is invalid for all three subclasses; TimedeltaArray will override
     #  the multiplication and division ops
@@ -1075,8 +1165,6 @@ def _sub_period_array(self, other):
                 f"cannot subtract {other.dtype}-dtype from {type(self).__name__}"
             )
 
-        if len(self) != len(other):
-            raise ValueError("cannot subtract arrays/indices of unequal length")
         if self.freq != other.freq:
             msg = DIFFERENT_FREQ.format(
                 cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr
@@ -1093,47 +1181,13 @@
         new_values[mask] = NaT
         return new_values
 
-    def _addsub_int_array(self, other, op):
-        """
-        Add or subtract array-like of integers equivalent to applying
-        `_time_shift` pointwise.
-
-        Parameters
-        ----------
-        other : Index, ExtensionArray, np.ndarray
-            integer-dtype
-        op : {operator.add, operator.sub}
-
-        Returns
-        -------
-        result : same class as self
-        """
-        # _addsub_int_array is overridden by PeriodArray
-        assert not is_period_dtype(self)
-        assert op in [operator.add, operator.sub]
-
-        if self.freq is None:
-            # GH#19123
-            raise NullFrequencyError("Cannot shift with no freq")
-
-        elif isinstance(self.freq, Tick):
-            # easy case where we can convert to timedelta64 operation
-            td = Timedelta(self.freq)
-            return op(self, td * other)
-
-        # We should only get here with DatetimeIndex; dispatch
-        # to _addsub_offset_array
-        assert not is_timedelta64_dtype(self)
-        return op(self, np.array(other) * self.freq)
-
-    def _addsub_offset_array(self, other, op):
+    def _addsub_object_array(self, other: np.ndarray, op):
         """
         Add or subtract array-like of DateOffset objects
 
         Parameters
         ----------
-        other : Index, np.ndarray
-            object-dtype containing pd.DateOffset objects
+        other : np.ndarray[object]
         op : {operator.add, operator.sub}
 
         Returns
@@ -1157,7 +1211,12 @@
         kwargs = {}
         if not is_period_dtype(self):
             kwargs["freq"] = "infer"
-        return self._from_sequence(res_values, **kwargs)
+        try:
+            res = type(self)._from_sequence(res_values, **kwargs)
+        except ValueError:
+            # e.g.
we've passed a Timestamp to TimedeltaArray + res = res_values + return res def _time_shift(self, periods, freq=None): """ @@ -1220,9 +1279,9 @@ def __add__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.add) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) @@ -1275,9 +1334,9 @@ def __sub__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.sub) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) @@ -1352,45 +1411,6 @@ def __isub__(self, other): # type: ignore self._freq = result._freq return self - # -------------------------------------------------------------- - # Comparison Methods - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - """ - Ensure that we are re-localized. - - This is for compat as we can then call this on all datetimelike - arrays generally (ignored for Period/Timedelta) - - Parameters - ---------- - arg : Union[DatetimeLikeArray, DatetimeIndexOpsMixin, ndarray] - ambiguous : str, bool, or bool-ndarray, default 'raise' - nonexistent : str, default 'raise' - from_utc : bool, default False - If True, localize the i8 ndarray to UTC first before converting to - the appropriate tz. If False, localize directly to the tz. - - Returns - ------- - localized array - """ - - # reconvert to local tz - tz = getattr(self, "tz", None) - if tz is not None: - if not isinstance(arg, type(self)): - arg = self._simple_new(arg) - if from_utc: - arg = arg.tz_localize("UTC").tz_convert(self.tz) - else: - arg = arg.tz_localize( - self.tz, ambiguous=ambiguous, nonexistent=nonexistent - ) - return arg - # -------------------------------------------------------------- # Reductions @@ -1503,6 +1523,8 @@ def mean(self, skipna=True): return self._box_func(result) +DatetimeLikeArrayMixin._add_comparison_ops() + # ------------------------------------------------------------------- # Shared Constructor Helpers @@ -1626,38 +1648,3 @@ def maybe_infer_freq(freq): freq_infer = True freq = None return freq, freq_infer - - -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - Helper for coercing an input scalar or array to i8. - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. 
- - Returns - ------- - i8 1d array - """ - from pandas import Index - - if lib.is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, (ABCPeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)): - # convert tz if needed - if getattr(other, "tz", None) is not None: - if to_utc: - other = other.tz_convert("UTC") - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view("i8") - except TypeError: - # period array cannot be coerced to int - other = Index(other) - return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index eb762a23d684d..e42402b307f28 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -18,14 +18,13 @@ timezones, tzconversion, ) -import pandas.compat as compat from pandas.errors import PerformanceWarning -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, @@ -42,13 +41,10 @@ from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset from pandas.tseries.offsets import Day, Tick @@ -74,22 +70,6 @@ def tz_to_dtype(tz): return DatetimeTZDtype(tz=tz) -def _to_M8(key, tz=None): - """ - Timestamp-like => dt64 - """ - if not isinstance(key, Timestamp): - # this also converts strings - key = Timestamp(key) - if key.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - key = key.tz_convert(tz) - else: - key = key.tz_localize(tz) - - return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE) - - def _field_accessor(name, field, docstring=None): def f(self): values = self.asi8 @@ -130,87 +110,6 @@ def f(self): return property(f) -def _dt_array_cmp(cls, op): - """ - Wrap comparison operations to convert datetime-like to datetime64 - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - - if isinstance(other, (datetime, np.datetime64, str)): - if isinstance(other, (datetime, np.datetime64)): - # GH#18435 strings get a pass from tzawareness compat - self._assert_tzawareness_compat(other) - - try: - other = _to_M8(other, tz=self.tz) - except ValueError: - # string that cannot be parsed to Timestamp - return invalid_comparison(self, other, op) - - result = op(self.asi8, other.view("i8")) - if isna(other): - result.fill(nat_result) - elif lib.is_scalar(other) or np.ndim(other) == 0: - return invalid_comparison(self, other, op) - elif len(other) != len(self): - raise ValueError("Lengths must match") - else: - if isinstance(other, list): - try: - other = type(self)._from_sequence(other) - except ValueError: - other = np.array(other, dtype=np.object_) - elif not isinstance( - other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray) - ): - # Following Timestamp convention, __eq__ is all-False - # and __ne__ is all True, others raise TypeError. 
- return invalid_comparison(self, other, op) - - if is_object_dtype(other): - # We have to use comp_method_OBJECT_ARRAY instead of numpy - # comparison otherwise it would fail to raise when - # comparing tz-aware and tz-naive - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY( - op, self.astype(object), other - ) - o_mask = isna(other) - elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): - # e.g. is_timedelta64_dtype(other) - return invalid_comparison(self, other, op) - else: - self._assert_tzawareness_compat(other) - if isinstance(other, (ABCIndexClass, ABCSeries)): - other = other.array - - if ( - is_datetime64_dtype(other) - and not is_datetime64_ns_dtype(other) - or not hasattr(other, "asi8") - ): - # e.g. other.dtype == 'datetime64[s]' - # or an object-dtype ndarray - other = type(self)._from_sequence(other) - - result = op(self.view("i8"), other.view("i8")) - o_mask = other._isnan - - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. @@ -230,12 +129,12 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps The datetime data. For DatetimeArray `values` (or a Series or Index boxing one), - `dtype` and `freq` will be extracted from `values`, with - precedence given to + `dtype` and `freq` will be extracted from `values`. dtype : numpy.dtype or DatetimeTZDtype Note that the only NumPy dtype allowed is 'datetime64[ns]'. freq : str or Offset, optional + The frequency. copy : bool, default False Whether to copy the underlying array of values. @@ -250,6 +149,8 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps _typ = "datetimearray" _scalar_type = Timestamp + _recognized_scalars = (datetime, np.datetime64) + _is_recognized_dtype = is_datetime64_any_dtype # define my properties & methods for delegation _bool_ops = [ @@ -327,7 +228,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): raise TypeError(msg) elif values.tz: dtype = values.dtype - # freq = validate_values_freq(values, freq) + if freq is None: freq = values.freq values = values._data @@ -335,8 +236,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if not isinstance(values, np.ndarray): msg = ( f"Unexpected type '{type(values).__name__}'. 'values' must be " - "a DatetimeArray ndarray, or Series or Index containing one of" - " those." + "a DatetimeArray ndarray, or Series or Index containing one of those." ) raise ValueError(msg) if values.ndim not in [1, 2]: @@ -556,11 +456,14 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return - if not timezones.tz_compare(self.tz, other.tz): - raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") + self._assert_tzawareness_compat(other) + if setitem: + # Stricter check for setitem vs comparison methods + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError(f"Timezones don't match. 
'{self.tz} != {other.tz}'") def _maybe_clear_freq(self): self._freq = None @@ -643,7 +546,7 @@ def _resolution(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object @@ -700,20 +603,6 @@ def astype(self, dtype, copy=True): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) - # ---------------------------------------------------------------- - # ExtensionArray Interface - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (datetime, np.datetime64)): - self._assert_tzawareness_compat(fill_value) - fill_value = Timestamp(fill_value).value - else: - raise ValueError(f"'fill_value' should be a Timestamp. Got '{fill_value}'.") - return fill_value - # ----------------------------------------------------------------- # Rendering Methods @@ -729,8 +618,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ----------------------------------------------------------------- # Comparison Methods - _create_comparison_method = classmethod(_dt_array_cmp) - def _has_same_tz(self, other): zzone = self._timezone @@ -1782,9 +1669,6 @@ def to_julian_date(self): ) -DatetimeArray._add_comparison_ops() - - # ------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3f5a4ca49702f..cb1e7115cd3c2 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,10 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -24,12 +24,12 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops -from pandas.core.algorithms import take -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric +from .masked import BaseMaskedArray + class _IntegerDtype(ExtensionDtype): """ @@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = np.nan + na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -90,6 +90,7 @@ def construct_array_type(cls): def __from_arrow__(self, array): """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask if isinstance(array, pyarrow.Array): chunks = [array] @@ -99,18 +100,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - buflist = arr.buffers() - data = np.frombuffer(buflist[1], dtype=self.type)[ - arr.offset : arr.offset + len(arr) - ] - bitmask = buflist[0] - if bitmask is not None: - mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask] - ) - mask = np.asarray(mask) - else: - mask = np.ones(len(arr), dtype=bool) + data, mask = 
pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
             int_arr = IntegerArray(data.copy(), ~mask, copy=False)
             results.append(int_arr)
 
@@ -257,12 +247,17 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
     return values, mask
 
 
-class IntegerArray(ExtensionArray, ExtensionOpsMixin):
+class IntegerArray(BaseMaskedArray):
     """
     Array of integer (optional missing) values.
 
     .. versionadded:: 0.24.0
 
+    .. versionchanged:: 1.0.0
+
+       Now uses :attr:`pandas.NA` as the missing value rather
+       than :attr:`numpy.nan`.
+
     .. warning::
 
        IntegerArray is currently experimental, and its API or internal
@@ -306,22 +301,25 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin):
 
     >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
     >>> int_array
     <IntegerArray>
-    [1, NaN, 3]
+    [1, <NA>, 3]
     Length: 3, dtype: Int32
 
     String aliases for the dtypes are also available. They are capitalized.
 
     >>> pd.array([1, None, 3], dtype='Int32')
     <IntegerArray>
-    [1, NaN, 3]
+    [1, <NA>, 3]
     Length: 3, dtype: Int32
 
     >>> pd.array([1, None, 3], dtype='UInt16')
     <IntegerArray>
-    [1, NaN, 3]
+    [1, <NA>, 3]
     Length: 3, dtype: UInt16
     """
 
+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value = 1
+
     @cache_readonly
     def dtype(self):
         return _dtypes[str(self._data.dtype)]
@@ -358,48 +356,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)
 
-    def _formatter(self, boxed=False):
-        def fmt(x):
-            if isna(x):
-                return "NaN"
-            return str(x)
-
-        return fmt
-
-    def __getitem__(self, item):
-        if is_integer(item):
-            if self._mask[item]:
-                return self.dtype.na_value
-            return self._data[item]
-        return type(self)(self._data[item], self._mask[item])
-
-    def _coerce_to_ndarray(self):
-        """
-        coerce to an ndarary of object dtype
-        """
-
-        # TODO(jreback) make this better
-        data = self._data.astype(object)
-        data[self._mask] = self._na_value
-        return data
-
-    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
-
-    def __array__(self, dtype=None):
-        """
-        the array interface, return my values
-        We return an object array here to preserve our scalar values
-        """
-        return self._coerce_to_ndarray()
-
-    def __arrow_array__(self, type=None):
-        """
-        Convert myself into a pyarrow Array.
-        """
-        import pyarrow as pa
-
-        return pa.array(self._data, mask=self._mask, type=type)
-
     _HANDLED_TYPES = (np.ndarray, numbers.Number)
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -447,40 +403,6 @@ def reconstruct(x):
         else:
             return reconstruct(result)
 
-    def __iter__(self):
-        for i in range(len(self)):
-            if self._mask[i]:
-                yield self.dtype.na_value
-            else:
-                yield self._data[i]
-
-    def take(self, indexer, allow_fill=False, fill_value=None):
-        # we always fill with 1 internally
-        # to avoid upcasting
-        data_fill_value = 1 if isna(fill_value) else fill_value
-        result = take(
-            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
-        )
-
-        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
-
-        # if we are filling
-        # we only fill where the indexer is null
-        # not existing missing values
-        # TODO(jreback) what if we have a non-na float as a fill value?
- if allow_fill and notna(fill_value): - fill_mask = np.asarray(indexer) == -1 - result[fill_mask] = fill_value - mask = mask ^ fill_mask - - return type(self)(result, mask, copy=False) - - def copy(self): - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() - return type(self)(data, mask, copy=False) - def __setitem__(self, key, value): _is_scalar = is_scalar(value) if _is_scalar: @@ -494,26 +416,6 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self) -> int: - return len(self._data) - - @property - def nbytes(self): - return self._data.nbytes + self._mask.nbytes - - def isna(self): - return self._mask - - @property - def _na_value(self): - return np.nan - - @classmethod - def _concat_same_type(cls, to_concat): - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask) - def astype(self, dtype, copy=True): """ Cast to a NumPy array or IntegerArray with 'dtype'. @@ -545,8 +447,14 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() - return astype_nansafe(data, dtype, copy=None) + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) @property def _ndarray_values(self) -> np.ndarray: @@ -559,52 +467,10 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), - dtype=object, - ) - - return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self.to_numpy(na_value=np.nan), np.nan def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. 
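To make the nullable-integer changes above concrete, a brief hedged sketch (hypothetical session, assuming a pandas build with this patch; previously missing values surfaced as ``np.nan``):

    import numpy as np
    import pandas as pd

    ints = pd.array([1, None, 3], dtype="Int64")
    ints[1] is pd.NA           # True: scalar access now yields pd.NA
    # astype to float implies na_value=np.nan, per the branch above.
    ints.astype("float64")     # array([ 1., nan,  3.])
    # The same conversion, spelled explicitly via to_numpy.
    ints.to_numpy(dtype="float64", na_value=np.nan)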
@@ -629,9 +495,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray + mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -643,25 +511,35 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op_name}__") - result = method(other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) if result is NotImplemented: result = invalid_comparison(self._data, other, op) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) @@ -673,7 +551,8 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + # We explicitly use NaN within reductions. + data[mask] = np.nan op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) @@ -739,12 +618,13 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - # nans propagate if omask is None: mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: mask = self._mask | omask @@ -754,20 +634,23 @@ def integer_arithmetic_method(self, other): # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. 
mask = np.where((self._data == 0) & ~self._mask, False, mask) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": @@ -790,6 +673,11 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + Attributes ---------- None diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cea059fb22be1..37d2baed2c09e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -17,6 +17,8 @@ is_integer_dtype, is_interval, is_interval_dtype, + is_list_like, + is_object_dtype, is_scalar, is_string_dtype, is_timedelta64_dtype, @@ -37,6 +39,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com +from pandas.core.construction import array from pandas.core.indexes.base import ensure_index _VALID_CLOSED = {"left", "right", "both", "neither"} @@ -105,7 +108,7 @@ Notes ----- See the `user guide -`_ +`_ for more. %(examples)s\ @@ -497,8 +500,11 @@ def __getitem__(self, value): # scalar if not isinstance(left, ABCIndexClass): - if isna(left): + if is_scalar(left) and isna(left): return self._fill_value + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) return self._shallow_copy(left, right) @@ -547,6 +553,58 @@ def __setitem__(self, key, value): right.values[key] = value_right self._right = right + def __eq__(self, other): + # ensure pandas array for list-like and eliminate non-interval scalars + if is_list_like(other): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other = array(other) + elif not isinstance(other, Interval): + # non-interval scalar -> no matches + return np.zeros(len(self), dtype=bool) + + # determine the dtype of the elements we want to compare + if isinstance(other, Interval): + other_dtype = "interval" + elif not is_categorical_dtype(other): + other_dtype = other.dtype + else: + # for categorical defer to categories for dtype + other_dtype = other.categories.dtype + + # extract intervals if we have interval categories with matching closed + if is_interval_dtype(other_dtype): + if self.closed != other.categories.closed: + return np.zeros(len(self), dtype=bool) + other = other.categories.take(other.codes) + + # interval-like -> need same closed and matching endpoints + if is_interval_dtype(other_dtype): + if self.closed != other.closed: + return np.zeros(len(self), dtype=bool) + return (self.left == other.left) & (self.right == other.right) + + # non-interval/non-object dtype -> no matches + if not is_object_dtype(other_dtype): + return np.zeros(len(self), dtype=bool) + + # object dtype -> iteratively check for intervals + result = np.zeros(len(self), dtype=bool) + for i, obj in enumerate(other): + # need object to be an Interval with same closed and endpoints + if ( + isinstance(obj, Interval) + and self.closed == obj.closed + and self.left[i] == obj.left + and self.right[i] == obj.right + ): + result[i] = True + + return result + + def __ne__(self, other): + return ~self.__eq__(other) + def fillna(self, 
value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -1008,7 +1066,7 @@ def is_non_overlapping_monotonic(self): ) # Conversion - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') @@ -1026,6 +1084,59 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError: + raise TypeError( + "Conversion to arrow with subtype '{}' " + "is not supported".format(self.dtype.subtype) + ) + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) " + "attributes".format( + self.dtype.subtype, type.subtype, self.closed, type.closed + ) + ) + else: + raise TypeError( + "Not supported to convert IntervalArray to '{0}' type".format(type) + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + _interval_shared_docs[ "to_tuples" ] = """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py new file mode 100644 index 0000000000000..47605413ff1a6 --- /dev/null +++ b/pandas/core/arrays/masked.py @@ -0,0 +1,250 @@ +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib, missing as libmissing + +from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): + """ + Base class for masked arrays (which use _data and _mask to store the data). 
+
+    numpy based
+    """
+
+    _data: np.ndarray
+    _mask: np.ndarray
+
+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value: "Scalar"
+
+    def __getitem__(self, item):
+        if is_integer(item):
+            if self._mask[item]:
+                return self.dtype.na_value
+            return self._data[item]
+
+        elif com.is_bool_indexer(item):
+            item = check_bool_array_indexer(self, item)
+
+        return type(self)(self._data[item], self._mask[item])
+
+    def __iter__(self):
+        for i in range(len(self)):
+            if self._mask[i]:
+                yield self.dtype.na_value
+            else:
+                yield self._data[i]
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def to_numpy(
+        self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
+    ):
+        """
+        Convert to a NumPy Array.
+
+        By default converts to an object-dtype NumPy array. Specify the `dtype` and
+        `na_value` keywords to customize the conversion.
+
+        Parameters
+        ----------
+        dtype : dtype, default object
+            The numpy dtype to convert to.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            the array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary. This is typically
+            only possible when no missing values are present and `dtype`
+            is the equivalent numpy dtype.
+        na_value : scalar, optional
+            Scalar missing value indicator to use in numpy array. Defaults
+            to the native missing value indicator of this array (pd.NA).
+
+        Returns
+        -------
+        numpy.ndarray
+
+        Examples
+        --------
+        An object-dtype is the default result
+
+        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
+        >>> a.to_numpy()
+        array([True, False, <NA>], dtype=object)
+
+        When no missing values are present, an equivalent dtype can be used.
+
+        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
+        array([ True, False])
+        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
+        array([1, 2])
+
+        However, requesting such dtype will raise a ValueError if
+        missing values are present and the default missing value :attr:`NA`
+        is used.
+
+        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
+        >>> a
+        <BooleanArray>
+        [True, False, <NA>]
+        Length: 3, dtype: boolean
+
+        >>> a.to_numpy(dtype="bool")
+        Traceback (most recent call last):
+        ...
+        ValueError: cannot convert to bool numpy array in presence of missing values
+
+        Specify a valid `na_value` instead
+
+        >>> a.to_numpy(dtype="bool", na_value=False)
+        array([ True, False, False])
+        """
+        if na_value is lib.no_default:
+            na_value = libmissing.NA
+        if dtype is None:
+            dtype = object
+        if self._hasna:
+            if (
+                not (is_object_dtype(dtype) or is_string_dtype(dtype))
+                and na_value is libmissing.NA
+            ):
+                raise ValueError(
+                    f"cannot convert to '{dtype}'-dtype NumPy array "
+                    "with missing values. Specify an appropriate 'na_value' "
+                    "for this dtype."
+                )
+            # don't pass copy to astype -> always need a copy since we are mutating
+            data = self._data.astype(dtype)
+            data[self._mask] = na_value
+        else:
+            data = self._data.astype(dtype, copy=copy)
+        return data
+
+    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
+
+    def __array__(self, dtype=None) -> np.ndarray:
+        """
+        the array interface, return my values
+        We return an object array here to preserve our scalar values
+        """
+        return self.to_numpy(dtype=dtype)
+
+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+ """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + + def isna(self): + return self._mask + + @property + def _na_value(self): + return self.dtype.na_value + + @property + def nbytes(self): + return self._data.nbytes + self._mask.nbytes + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) + return cls(data, mask) + + def take(self, indexer, allow_fill=False, fill_value=None): + # we always fill with 1 internally + # to avoid upcasting + data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) + + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask, copy=False) + + def copy(self): + data, mask = self._data, self._mask + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, copy=False) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + from pandas.arrays import IntegerArray + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(object) + + # if we want nans, count the mask + if dropna: + counts = value_counts.values + else: + counts = np.empty(len(value_counts) + 1, dtype="int64") + counts[:-1] = value_counts + counts[-1] = self._mask.sum() + + index = Index( + np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + dtype=object, + ) + + mask = np.zeros(len(counts), dtype="bool") + counts = IntegerArray(counts, mask) + + return Series(counts, index=index) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index deec30dfe34ff..4db3d3010adaf 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,11 +17,12 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d -from .base import ExtensionArray, ExtensionOpsMixin - class PandasDtype(ExtensionDtype): """ @@ -181,7 +182,7 @@ def dtype(self): # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> 
np.ndarray: return np.asarray(self._ndarray, dtype=dtype) _HANDLED_TYPES = (np.ndarray, numbers.Number) @@ -234,6 +235,9 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + result = self._ndarray[item] if not lib.is_scalar(item): result = type(self)(result) @@ -416,27 +420,15 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False): - """ - Convert the PandasArray to a :class:`numpy.ndarray`. - - By default, this requires no coercion or copying of data. - - Parameters - ---------- - dtype : numpy.dtype - The NumPy dtype to pass to :func:`numpy.asarray`. - copy : bool, default False - Whether to copy the underlying data. - - Returns - ------- - ndarray - """ + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): result = np.asarray(self._ndarray, dtype=dtype) - if copy and result is self._ndarray: + + if (copy or na_value is not lib.no_default) and result is self._ndarray: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result @Appender(ExtensionArray.searchsorted.__doc__) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index df057ce5a0104..697d759206ff9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -20,15 +20,13 @@ period_asfreq_arr, ) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds -import pandas.compat as compat -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, - is_list_like, is_period_dtype, pandas_dtype, ) @@ -44,7 +42,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick @@ -61,49 +58,6 @@ def f(self): return property(f) -def _period_array_cmp(cls, op): - """ - Wrap comparison operations to convert Period-like to PeriodDtype - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - ordinal_op = getattr(self.asi8, opname) - - if is_list_like(other) and len(other) != len(self): - raise ValueError("Lengths must match") - - if isinstance(other, Period): - self._check_compatible_with(other) - - result = ordinal_op(other.ordinal) - elif isinstance(other, cls): - self._check_compatible_with(other) - - result = ordinal_op(other.asi8) - - mask = self._isnan | other._isnan - if mask.any(): - result[mask] = nat_result - - return result - elif other is NaT: - result = np.empty(len(self.asi8), dtype=bool) - result.fill(nat_result) - else: - other = Period(other, freq=self.freq) - result = ordinal_op(other.ordinal) - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. 
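With the hand-rolled `_period_array_cmp` wrapper removed, comparisons presumably route through the shared datetime-like machinery; the observable behavior should stay as sketched here (an illustration, not part of the diff):

import pandas as pd

arr = pd.period_range("2000Q1", periods=3, freq="Q").array
arr == pd.Period("2000Q1", freq="Q")   # array([ True, False, False])
arr == pd.Period("2000-01", freq="M")  # raises IncompatibleFrequency: mismatched freq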
@@ -159,6 +113,8 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): __array_priority__ = 1000 _typ = "periodarray" # ABCPeriodArray _scalar_type = Period + _recognized_scalars = (Period,) + _is_recognized_dtype = is_period_dtype # Names others delegate to us _other_ops: List[str] = [] @@ -203,12 +159,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): if isinstance(values, type(self)): if freq is not None and freq != values.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=values.freq.freqstr, - other_freq=freq.freqstr, - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(values, freq) values, freq = values._data, values.freq values = np.array(values, dtype="int64", copy=copy) @@ -307,11 +258,11 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return if self.freqstr != other.freqstr: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # -------------------------------------------------------------------- # Data / Attributes @@ -320,7 +271,7 @@ def _check_compatible_with(self, other): def dtype(self): return self._dtype - # read-only property overwriting read/write + # error: Read-only property cannot override read-write property [misc] @property # type: ignore def freq(self): """ @@ -328,10 +279,36 @@ def freq(self): """ return self.dtype.freq - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: # overriding DatetimelikeArray return np.array(list(self), dtype=object) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + if type is not None: + if pyarrow.types.is_integer(type): + return pyarrow.array(self._data, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different" + f" 'freq' ({self.freqstr} vs {type.freq})" + ) + else: + raise TypeError( + f"Not supported to convert PeriodArray to '{type}' type" + ) + + period_type = ArrowPeriodType(self.freqstr) + storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) + # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -440,8 +417,9 @@ def to_timestamp(self, freq=None, how="start"): ---------- freq : str or DateOffset, optional Target frequency. The default is 'D' for week or longer, - 'S' otherwise + 'S' otherwise. how : {'s', 'e', 'start', 'end'} + Whether to use the start or end of the time period being converted. 
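A short sketch of what the new `__arrow_array__` hook enables, assuming pyarrow >= 0.15.1 (the first release that understands this protocol and extension types):

import pandas as pd
import pyarrow as pa

periods = pd.period_range("2020-01", periods=3, freq="M").array

# pa.array() detects __arrow_array__ and builds an ArrowPeriodType("M")
# extension array backed by int64 ordinal storage
pa.array(periods)

# requesting an integer type yields the plain ordinals, with missing values masked
pa.array(periods, type=pa.int64())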
Returns ------- @@ -476,21 +454,8 @@ # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def _formatter(self, boxed=False): - if boxed: - return str - return "'{}'".format - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, Period): - self._check_compatible_with(fill_value) - fill_value = fill_value.ordinal - else: - raise ValueError(f"'fill_value' should be a Period. Got '{fill_value}'.") - return fill_value + def _values_for_argsort(self): + return self._data # -------------------------------------------------------------------- @@ -530,17 +495,20 @@ def asfreq(self, freq=None, how="E"): Parameters ---------- freq : str - a frequency + A frequency. how : str {'E', 'S'} - 'E', 'END', or 'FINISH' for end, - 'S', 'START', or 'BEGIN' for start. Whether the elements should be aligned to the end - or start within pa period. January 31st ('END') vs. - January 1st ('START') for example. + or start within the period. + + * 'E', 'END', or 'FINISH' for end, + * 'S', 'START', or 'BEGIN' for start. + + January 31st ('END') vs. January 1st ('START') for example. Returns ------- - new : Period Array/Index with the new frequency + Period Array/Index + Constructed with the new frequency. Examples -------- @@ -582,6 +550,11 @@ # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types @@ -615,7 +588,6 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------ # Arithmetic Methods - _create_comparison_method = classmethod(_period_array_cmp) def _sub_datelike(self, other): assert other is not NaT @@ -634,12 +606,23 @@ def _sub_period(self, other): return new_data - @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any], - ) -> ABCPeriodArray: + self, other: np.ndarray, op: Callable[[Any, Any], Any], + ) -> "PeriodArray": + """ + Add or subtract array of integers; equivalent to applying + `_time_shift` pointwise. + + Parameters + ---------- + other : np.ndarray[integer-dtype] + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + assert op in [operator.add, operator.sub] if op is operator.sub: other = -other @@ -652,7 +635,7 @@ def _add_offset(self, other): assert not isinstance(other, Tick) base = libfrequencies.get_base_alias(other.rule_code) if base != self.freq.rule_code: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here @@ -720,7 +703,7 @@ def _add_delta(self, other): """ if not isinstance(self.freq, Tick): # We cannot add timedelta-like to non-tick PeriodArray - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) new_ordinals = super()._add_delta(other) return type(self)(new_ordinals, freq=self.freq) @@ -772,16 +755,10 @@ def _check_timedeltalike_freq_compat(self, other): # by which will be added to self.
return delta - _raise_on_incompatible(self, other) - - def _values_for_argsort(self): - return self._data - + raise raise_on_incompatible(self, other) -PeriodArray._add_comparison_ops() - -def _raise_on_incompatible(left, right): +def raise_on_incompatible(left, right): """ Helper function to render a consistent error message when raising IncompatibleFrequency. @@ -789,14 +766,15 @@ def _raise_on_incompatible(left, right): Parameters ---------- left : PeriodArray - right : DateOffset, Period, ndarray, or timedelta-like + right : None, DateOffset, Period, ndarray, or timedelta-like - Raises - ------ + Returns + ------- IncompatibleFrequency + Exception to be raised by the caller. """ # GH#24283 error message format depends on whether right is scalar - if isinstance(right, np.ndarray): + if isinstance(right, np.ndarray) or right is None: other_freq = None elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)): other_freq = right.freqstr @@ -806,7 +784,7 @@ def _raise_on_incompatible(left, right): msg = DIFFERENT_FREQ.format( cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq ) - raise IncompatibleFrequency(msg) + return IncompatibleFrequency(msg) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index 75f3819fb19fd..e928db499a771 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -1,5 +1,10 @@ # flake8: noqa: F401 -from .accessor import SparseAccessor, SparseFrameAccessor -from .array import BlockIndex, IntIndex, SparseArray, _make_index -from .dtype import SparseDtype +from pandas.core.arrays.sparse.accessor import SparseAccessor, SparseFrameAccessor +from pandas.core.arrays.sparse.array import ( + BlockIndex, + IntIndex, + SparseArray, + _make_index, +) +from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index c207b96a8d308..92c05f44d677c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -7,9 +7,8 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.accessor import PandasDelegate, delegate_names - -from .array import SparseArray -from .dtype import SparseDtype +from pandas.core.arrays.sparse.array import SparseArray +from pandas.core.arrays.sparse.dtype import SparseDtype class BaseAccessor: @@ -163,7 +162,7 @@ def to_dense(self): Examples -------- - >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0])) >>> series 0 0 1 1 @@ -216,7 +215,7 @@ def from_spmatrix(cls, data, index=None, columns=None): ------- DataFrame Each column of the DataFrame is stored as a - :class:`SparseArray`. + :class:`arrays.SparseArray`. 
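The rename from `_raise_on_incompatible` (which raised) to `raise_on_incompatible` (which returns the exception) is a deliberate idiom: every call site now spells `raise raise_on_incompatible(...)`, so the non-returning branch stays visible to readers and to mypy. A generic sketch of the pattern, with hypothetical names rather than the pandas API:

class IncompatibleFrequency(ValueError):
    pass

def incompatible(own_freq: str, other_freq: str) -> IncompatibleFrequency:
    # build and *return* the exception; the caller writes the raise
    return IncompatibleFrequency(
        f"Input has different freq={other_freq} from target freq={own_freq}"
    )

# call site: the explicit raise marks this branch as non-returning
raise incompatible("D", "M")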
Examples -------- @@ -251,7 +250,7 @@ def to_dense(self): Examples -------- - >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) >>> df.sparse.to_dense() A 0 0 diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 548f2bf702e60..e2562a375515d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -39,6 +39,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array @@ -48,8 +49,6 @@ import pandas.io.formats.printing as printing -from .dtype import SparseDtype - # ---------------------------------------------------------------------------- # Array @@ -403,7 +402,7 @@ def from_spmatrix(cls, data): -------- >>> import scipy.sparse >>> mat = scipy.sparse.coo_matrix((4, 1)) - >>> pd.SparseArray.from_spmatrix(mat) + >>> pd.arrays.SparseArray.from_spmatrix(mat) [0.0, 0.0, 0.0, 0.0] Fill: 0.0 IntIndex @@ -428,7 +427,7 @@ def from_spmatrix(cls, data): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None, copy=True): + def __array__(self, dtype=None, copy=True) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -738,6 +737,9 @@ def value_counts(self, dropna=True): # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") @@ -766,7 +768,9 @@ def __getitem__(self, key): else: key = np.asarray(key) - if com.is_bool_indexer(key) and len(self) == len(key): + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) @@ -1074,7 +1078,7 @@ def map(self, mapper): Examples -------- - >>> arr = pd.SparseArray([0, 1, 2]) + >>> arr = pd.arrays.SparseArray([0, 1, 2]) >>> arr.apply(lambda x: x + 10) [10, 11, 12] Fill: 10 diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index e3e0064c84da3..6f15681cab87e 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -64,7 +64,7 @@ class SparseDtype(ExtensionDtype): # hash(nan) is (sometimes?) 0. _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if isinstance(dtype, type(self)): if fill_value is None: @@ -175,7 +175,7 @@ def construct_array_type(cls): ------- type """ - from .array import SparseArray + from pandas.core.arrays.sparse.array import SparseArray return SparseArray diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 278ad1027d489..88d63071c360f 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,8 +3,6 @@ Currently only includes to_coo helpers. 
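These helpers back `Series.sparse.to_coo`; roughly, assuming scipy is installed:

import pandas as pd

idx = pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)])
s = pd.Series([3.0, 0.0, 2.0], index=idx).astype("Sparse[float]")

# sort_labels exercises the plain-dict label numbering shown above
A, rows, cols = s.sparse.to_coo(row_levels=[0], column_levels=[1], sort_labels=True)
A.todense()  # 2x2 scipy coo_matrix; rows ['a', 'b'], columns [0, 1]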
""" -from collections import OrderedDict - from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series @@ -46,14 +44,13 @@ def get_indexers(levels): # labels_to_i[:] = np.arange(labels_to_i.shape[0]) def _get_label_to_i_dict(labels, sort_labels=False): - """ Return OrderedDict of unique labels to number. + """ Return dict of unique labels to number. Optionally sort by label. """ labels = Index(map(tuple, labels)).unique().tolist() # squish if sort_labels: labels = sorted(labels) - d = OrderedDict((k, i) for i, k in enumerate(labels)) - return d + return {k: i for i, k in enumerate(labels)} def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index de254f662bb32..84130132de4dc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -47,6 +47,8 @@ class StringDtype(ExtensionDtype): StringDtype """ + name = "string" + #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA @@ -54,19 +56,6 @@ class StringDtype(ExtensionDtype): def type(self) -> Type: return str - @property - def name(self) -> str: - """ - The alias for StringDtype is ``'string'``. - """ - return "string" - - @classmethod - def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "string": - return cls() - return super().construct_from_string(string) - @classmethod def construct_array_type(cls) -> "Type[StringArray]": return StringArray @@ -142,7 +131,7 @@ class StringArray(PandasArray): -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', NA, 'data.'] + ['This is', 'some text', , 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string @@ -157,7 +146,7 @@ class StringArray(PandasArray): >>> pd.array(["a", None, "c"], dtype="string") == "a" - [True, NA, False] + [True, , False] Length: 3, dtype: boolean """ @@ -264,7 +253,7 @@ def _reduce(self, name, skipna=True, **kwargs): def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + return value_counts(self._ndarray, dropna=dropna).astype("Int64") # Overrride parent because we have different return types. @classmethod diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b95dfc9ba7580..516a271042c9b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -11,9 +11,7 @@ parse_timedelta_unit, precision_from_unit, ) -import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _NS_DTYPE, @@ -21,7 +19,6 @@ is_dtype_equal, is_float_dtype, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar, is_string_dtype, @@ -40,17 +37,12 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick -from . 
import datetimelike as dtl - -_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" - def _is_convertible_to_td(key): return isinstance(key, (Tick, timedelta, np.timedelta64, str)) @@ -72,54 +64,6 @@ def f(self): return property(f) -def _td_array_cmp(cls, op): - """ - Wrap comparison operations to convert timedelta-like to timedelta64 - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - - if _is_convertible_to_td(other) or other is NaT: - try: - other = Timedelta(other) - except ValueError: - # failed to parse as timedelta - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.value) - if isna(other): - result.fill(nat_result) - - elif not is_list_like(other): - return invalid_comparison(self, other, op) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = type(self)._from_sequence(other)._data - except (ValueError, TypeError): - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.view("i8")) - result = com.values_from_object(result) - - o_mask = np.array(isna(other)) - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. @@ -155,6 +99,9 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" _scalar_type = Timedelta + _recognized_scalars = (timedelta, np.timedelta64, Tick) + _is_recognized_dtype = is_timedelta64_dtype + __array_priority__ = 1000 # define my properties & methods for delegation _other_ops: List[str] = [] @@ -213,8 +160,8 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if not isinstance(values, np.ndarray): msg = ( - f"Unexpected type '{type(values).__name__}'. 'values' must be a" - " TimedeltaArray ndarray, or Series or Index containing one of those." + f"Unexpected type '{type(values).__name__}'. 'values' must be a " + "TimedeltaArray ndarray, or Series or Index containing one of those." ) raise ValueError(msg) if values.ndim not in [1, 2]: @@ -332,7 +279,7 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timedelta(value) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. pass @@ -342,16 +289,6 @@ def _maybe_clear_freq(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): - fill_value = Timedelta(fill_value).value - else: - raise ValueError(f"'fill_value' should be a Timedelta. 
Got '{fill_value}'.") - return fill_value - def astype(self, dtype, copy=True): # We handle # --> timedelta64[ns] @@ -378,6 +315,9 @@ def astype(self, dtype, copy=True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + # ---------------------------------------------------------------- + # Reductions + def sum( self, axis=None, @@ -442,7 +382,7 @@ def _formatter(self, boxed=False): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) @@ -451,8 +391,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None): # ---------------------------------------------------------------- # Arithmetic Methods - _create_comparison_method = classmethod(_td_array_cmp) - def _add_offset(self, other): assert not isinstance(other, Tick) raise TypeError( @@ -507,13 +445,13 @@ def _add_datetimelike_scalar(self, other): dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) - def _addsub_offset_array(self, other, op): - # Add or subtract Array-like of DateOffset objects + def _addsub_object_array(self, other, op): + # Add or subtract Array-like of objects try: # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. Incompatible classes will raise AttributeError, # which we re-raise as TypeError - return super()._addsub_offset_array(other, op) + return super()._addsub_object_array(other, op) except AttributeError: raise TypeError( f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" @@ -948,9 +886,6 @@ def f(x): return result -TimedeltaArray._add_comparison_ops() - - # --------------------------------------------------------------------- # Constructor Helpers @@ -1127,7 +1062,7 @@ def _validate_td64_dtype(dtype): raise ValueError(msg) if not is_dtype_equal(dtype, _TD_DTYPE): - raise ValueError(_BAD_DTYPE.format(dtype=dtype)) + raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") return dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 948b80fef4032..66d7cd59dcfa4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -8,6 +8,7 @@ import numpy as np import pandas._libs.lib as lib +from pandas._typing import T from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -18,7 +19,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, + is_dict_like, is_extension_array_dtype, is_list_like, is_object_dtype, @@ -86,6 +87,14 @@ def __sizeof__(self): # object's 'sizeof' return super().__sizeof__() + def _ensure_type(self: T, obj) -> T: + """Ensure that an object has same type as self. + + Used by type checkers. + """ + assert isinstance(obj, type(self)), type(obj) + return obj + class NoNewAttributesMixin: """Mixin which prevents adding new attributes. 
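The new `_ensure_type` helper encodes a small typing idiom: a method that builds a new object of `type(self)` can assert the result and keep a precise return type for mypy. A standalone sketch of the same idiom, using a hypothetical class and mirroring `T` from `pandas._typing`:

from typing import TypeVar

T = TypeVar("T")

class Container:
    def _ensure_type(self: T, obj) -> T:
        # the runtime assert doubles as a narrowing hint for type checkers
        assert isinstance(obj, type(self)), type(obj)
        return obj

    def copy(self) -> "Container":
        return self._ensure_type(Container())  # inferred as Container, not Any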
@@ -597,7 +606,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 _deprecations: FrozenSet[str] = frozenset( - ["tolist", "item"] # tolist is not deprecated, just suppressed in the __dir__ + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) def transpose(self, *args, **kwargs): @@ -618,24 +627,6 @@ def transpose(self, *args, **kwargs): """, ) - @property - def _is_homogeneous_type(self) -> bool: - """ - Whether the object has a single dtype. - - By definition, Series and Index are always considered homogeneous. - A MultiIndex may or may not be homogeneous, depending on the - dtypes of the levels. - - See Also - -------- - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. - MultiIndex._is_homogeneous_type : Whether all the levels of a - MultiIndex have the same dtype. - """ - return True - @property def shape(self): """ @@ -724,6 +715,8 @@ def array(self) -> ExtensionArray: period PeriodArray interval IntervalArray IntegerNA IntegerArray + string StringArray + boolean BooleanArray datetime64[ns, tz] DatetimeArray ================== ============================= @@ -775,7 +768,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -790,6 +783,17 @@ def to_numpy(self, dtype=None, copy=False): another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 Returns ------- @@ -859,16 +863,21 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if is_datetime64tz_dtype(self.dtype) and dtype is None: - # note: this is going to change very soon. - # I have a WIP PR making this unnecessary, but it's - # a bit out of scope for the DatetimeArray PR. - dtype = "object" + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy - if copy: + if copy or na_value is not lib.no_default: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value return result @property @@ -1107,8 +1116,8 @@ def _map_values(self, mapper, na_action=None): # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types - if isinstance(mapper, dict): - if hasattr(mapper, "__missing__"): + if is_dict_like(mapper): + if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). 
dict_with_default = mapper diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..f0fcb736586d6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,11 +9,12 @@ from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Iterable, Union +from typing import Any, Collection, Iterable, Union import numpy as np from pandas._libs import lib, tslibs +from pandas._typing import T from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -110,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool: Returns ------- bool + Whether `key` is a valid boolean indexer. Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. + + See Also + -------- + check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarray. """ - na_msg = "cannot index with vector containing NA / NaN values" + na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -270,7 +277,7 @@ def maybe_make_list(obj): return obj -def maybe_iterable_to_list(obj: Union[Iterable, Any]) -> Union[list, Any]: +def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T]: """ If obj is Iterable but not list-like, consume into list. """ diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 57348ad3b81a0..a1b1cffdd1d76 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -2,10 +2,12 @@ """ from functools import partial, wraps +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np +from pandas._typing import FrameOrSeries from pandas.errors import PerformanceWarning from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -15,22 +17,27 @@ from pandas.core.computation.common import result_type_many -def _align_core_single_unary_op(term): +def _align_core_single_unary_op( + term, +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: + + typ: Union[partial, Type[FrameOrSeries]] + axes: Optional[Dict[str, int]] = None + if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = (typ,) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) - if not hasattr(term.value, "axes"): - ret += (None,) - else: - ret += (_zip_axes_from_type(typ, term.value.axes),) - return ret + return typ, axes -def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} +def _zip_axes_from_type( + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()} return axes diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 994f470942cd1..19a8898a2987c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -4,9 +4,6 @@ from pandas._config import get_option -# A token value Python's tokenizer probably will never use. 
-_BACKTICK_QUOTED_STRING = 100 - def _ensure_decoded(s): """ @@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def _remove_spaces_column_name(name): - """ - Check if name contains any spaces, if it contains any spaces - the spaces will be removed and an underscore suffix is added. - """ - if not isinstance(name, str) or " " not in name: - return name - - return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" - - class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dbfd6c04eee32..9c5388faae1bd 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -3,6 +3,7 @@ """ import abc +from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object from pandas.core.computation.ops import _mathops, _reductions @@ -53,7 +54,7 @@ def convert(self) -> str: """ return printing.pprint_thing(self.expr) - def evaluate(self): + def evaluate(self) -> object: """ Run the engine on the expression. @@ -62,7 +63,7 @@ def evaluate(self): Returns ------- - obj : object + object The result of the passed expression. """ if not self._is_aligned: @@ -101,12 +102,6 @@ class NumExprEngine(AbstractEngine): has_neg_frac = True - def __init__(self, expr): - super().__init__(expr) - - def convert(self) -> str: - return str(super().convert()) - def _evaluate(self): import numexpr as ne @@ -128,14 +123,14 @@ class PythonEngine(AbstractEngine): has_neg_frac = False - def __init__(self, expr): - super().__init__(expr) - def evaluate(self): return self.expr() - def _evaluate(self): + def _evaluate(self) -> None: pass -_engines = {"numexpr": NumExprEngine, "python": PythonEngine} +_engines: Dict[str, Type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 2e5a563b815b3..51892b8c02d87 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -5,19 +5,21 @@ """ import tokenize +from typing import Optional import warnings -from pandas._libs.lib import _no_default +from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines -from pandas.core.computation.expr import Expr, _parsers, tokenize_string +from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing -def _check_engine(engine): +def _check_engine(engine: Optional[str]) -> str: """ Make sure a valid engine is passed. 
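With both engines registered in the typed `_engines` mapping, engine selection at the user level looks like this (a small sketch; `numexpr` is an optional dependency):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
pd.eval("a + b", engine="python", local_dict={"a": df["a"], "b": df["b"]})
df.eval("a + b")  # engine=None picks numexpr when installed, else "python"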
@@ -168,8 +170,8 @@ def _check_for_locals(expr: str, stack_level: int, parser: str): def eval( expr, parser="pandas", - engine=None, - truediv=_no_default, + engine: Optional[str] = None, + truediv=no_default, local_dict=None, global_dict=None, resolvers=(), @@ -286,7 +288,7 @@ def eval( inplace = validate_bool_kwarg(inplace, "inplace") - if truediv is not _no_default: + if truediv is not no_default: warnings.warn( "The `truediv` parameter in pd.eval is deprecated and will be " "removed in a future version.", @@ -337,8 +339,8 @@ def eval( if parsed_expr.assigner is None: if multi_line: raise ValueError( - "Multi-line expressions are only valid" - " if all expressions contain an assignment" + "Multi-line expressions are only valid " + "if all expressions contain an assignment" ) elif inplace: raise ValueError("Cannot operate inplace if there is no assignment") diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 9b422b28c3c27..1350587b5ca90 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -3,19 +3,13 @@ import ast from functools import partial, reduce -from io import StringIO -import itertools as it -import operator +from keyword import iskeyword import tokenize from typing import Optional, Type import numpy as np import pandas.core.common as com -from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, - _remove_spaces_column_name, -) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, @@ -34,38 +28,12 @@ _unary_ops_syms, is_term, ) +from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing -def tokenize_string(source: str): - """ - Tokenize a Python source code string. - - Parameters - ---------- - source : str - A Python source code string - """ - line_reader = StringIO(source).readline - token_generator = tokenize.generate_tokens(line_reader) - - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted - # string. - for toknum, tokval, _, _, _ in token_generator: - if tokval == "`": - tokval = " ".join( - it.takewhile( - lambda tokval: tokval != "`", - map(operator.itemgetter(1), token_generator), - ) - ) - toknum = _BACKTICK_QUOTED_STRING - yield toknum, tokval - - def _rewrite_assign(tok): """Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. @@ -133,31 +101,6 @@ def _replace_locals(tok): return toknum, tokval -def _clean_spaces_backtick_quoted_names(tok): - """Clean up a column name if surrounded by backticks. - - Backtick quoted string are indicated by a certain tokval value. If a string - is a backtick quoted token it will processed by - :func:`_remove_spaces_column_name` so that the parser can find this - string when the query is executed. - See also :meth:`NDFrame._get_space_character_free_column_resolver`. 
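The user-visible behavior this refactor preserves (and, with the replacement table added below, extends) is backtick quoting in `DataFrame.query` and `DataFrame.eval`; a sketch:

import pandas as pd

df = pd.DataFrame({"total sales": [10, 20], "region!": ["a", "b"]})
df.query("`total sales` > 10")  # spaces were already supported
df.query("`region!` == 'b'")    # special characters work via the new replacements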
- - Parameters - ---------- - tok : tuple of int, str - ints correspond to the all caps constants in the tokenize module - - Returns - ------- - t : tuple of int, str - Either the input or token or the replacement values - """ - toknum, tokval = tok - if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, _remove_spaces_column_name(tokval) - return toknum, tokval - - def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -172,10 +115,7 @@ def _preparse( source: str, f=_compose( - _replace_locals, - _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names, + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks ), ): """Compose a collection of tokenization functions @@ -426,8 +366,6 @@ def visit(self, node, **kwargs): try: node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: - from keyword import iskeyword - if any(iskeyword(x) for x in clean.split()): e.msg = "Python keyword not valid identifier in numexpr query" raise e @@ -781,9 +719,7 @@ def __init__( parser, preparser=partial( _preparse, - f=_compose( - _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names - ), + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), ), ): super().__init__(env, engine, parser, preparser) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py new file mode 100644 index 0000000000000..ce213c8532834 --- /dev/null +++ b/pandas/core/computation/parsing.py @@ -0,0 +1,190 @@ +""":func:`~pandas.eval` source string parsing functions +""" + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import Iterator, Tuple + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a valid Python identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + then terminate and not find the backtick. + It can also happen for characters that fall outside the range (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # token.tok_name contains a readable description of the replacement string. + special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + # The ignore here is because of a bug in mypy that is resolved in 0.740 + for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. Terminates parser and won't find backtick.
+ # "#": "_HASH_", + } + ) + + name = "".join(special_characters_replacements.get(char, char) for char in name) + name = "BACKTICK_QUOTED_STRING_" + name + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`_create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: str) -> str: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose for this function is to see what happens to the name of + identifier if it goes to the process of being parsed a Python code + inside a backtick quoted string and than being cleaned + (removed of any special characters). + + Parameters + ---------- + name : str + Name to be cleaned. + + Returns + ------- + name : str + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not catched and propogates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> Tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). 
+ """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") + else: + yield toknum, tokval diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4d27bcf2845f1..be652ca0e6a36 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -533,7 +533,7 @@ def __init__( self._visitor = None # capture the environment if needed - local_dict = DeepChainMap() + local_dict: DeepChainMap[Any, Any] = DeepChainMap() if isinstance(where, PyTablesExpr): local_dict = where.env.scope diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ba0a4d81a88d3..afdd8a01ee003 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -300,14 +300,15 @@ def table_schema_cb(key): _enable_data_resource_formatter(cf.get_option(key)) -def is_terminal(): +def is_terminal() -> bool: """ Detect if Python is running in a terminal. Returns True if Python is running in a terminal or False if not. """ try: - ip = get_ipython() + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore except NameError: # assume standard Python interpreter in a terminal return True else: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index cc8311cf3e21d..203ef3ec75c8f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -497,13 +497,8 @@ def sanitize_array( if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "period": - from pandas.core.arrays import period_array - - try: - subarr = period_array(subarr) - except IncompatibleFrequency: - pass + if inferred in {"interval", "period"}: + subarr = array(subarr) return subarr diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index cb0912cbcf880..051affd0af1f9 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,6 +1,6 @@ # flake8: noqa -from .common import ( +from pandas.core.dtypes.common import ( is_array_like, is_bool, is_bool_dtype, diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 1dda51da49ffb..1b4e7062b38e5 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -236,6 +236,10 @@ def construct_from_string(cls, string: str): """ if not isinstance(string, str): raise TypeError(f"Expects a string, got {type(string).__name__}") + + # error: Non-overlapping equality check (left operand type: "str", right + # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap] + assert isinstance(cls.name, str), (cls, type(cls.name)) if string != cls.name: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") return cls() @@ -276,10 +280,12 @@ def is_dtype(cls, dtype) -> bool: return False elif isinstance(dtype, cls): return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False @property def _is_numeric(self) -> bool: 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 946070f8fad98..1dbdb8dbba48b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,9 +6,10 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg -from .common import ( +from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, @@ -41,8 +42,13 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from .dtypes import DatetimeTZDtype, ExtensionDtype, IntervalDtype, PeriodDtype -from .generic import ( +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, @@ -50,8 +56,8 @@ ABCPeriodIndex, ABCSeries, ) -from .inference import is_list_like -from .missing import isna, notna +from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.missing import isna, notna _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -409,6 +415,14 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT + elif not isinstance(fill_value, datetime): + dtype = np.dtype(np.object_) + elif fill_value.tzinfo is None: + dtype = np.dtype(np.object_) + elif not tz_compare(fill_value.tzinfo, dtype.tz): + # TODO: sure we want to cast here? + dtype = np.dtype(np.object_) + elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value @@ -814,6 +828,8 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if is_object_dtype(dtype): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) # allow frequency conversions @@ -826,6 +842,8 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if is_object_dtype(dtype): return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) if dtype not in [_INT64_DTYPE, _TD_DTYPE]: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dc22a79a2f3fe..f62f03be9b732 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -18,7 +18,6 @@ ) from pandas.core.dtypes.generic import ( ABCCategorical, - ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, @@ -172,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: try: return arr.astype("uint64", copy=copy, casting="safe") # type: ignore except TypeError: + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) return arr.astype("float64", copy=copy) @@ -193,12 +194,11 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: """ if not is_scalar(value): raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") - msg = "Wrong type {} for value {}" try: new_value = int(value) assert new_value == value except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(type(value), value)) + raise TypeError(f"Wrong type {type(value)} for value {value}") return new_value @@ -270,9 +270,9 @@ def is_sparse(arr) -> bool: -------- Returns `True` if the 
parameter is a 1-D pandas sparse array. - >>> is_sparse(pd.SparseArray([0, 0, 1, 0])) + >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0])) True - >>> is_sparse(pd.Series(pd.SparseArray([0, 0, 1, 0]))) + >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0]))) True Returns `False` if the parameter is not sparse. @@ -319,7 +319,7 @@ def is_scipy_sparse(arr) -> bool: >>> from scipy.sparse import bsr_matrix >>> is_scipy_sparse(bsr_matrix([1, 2, 3])) True - >>> is_scipy_sparse(pd.SparseArray([1, 2, 3])) + >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3])) False """ @@ -368,37 +368,6 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_offsetlike(arr_or_obj) -> bool: - """ - Check if obj or all elements of list-like is DateOffset - - Parameters - ---------- - arr_or_obj : object - - Returns - ------- - boolean - Whether the object is a DateOffset or listlike of DatetOffsets - - Examples - -------- - >>> is_offsetlike(pd.DateOffset(days=1)) - True - >>> is_offsetlike('offset') - False - >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()]) - True - >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()])) - False - """ - if isinstance(arr_or_obj, ABCDateOffset): - return True - elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): - return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) - return False - - def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. @@ -633,7 +602,14 @@ def is_string_dtype(arr_or_dtype) -> bool: # TODO: gh-15585: consider making the checks stricter. def condition(dtype) -> bool: - return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + + def is_excluded_dtype(dtype) -> bool: + """ + These have kind = "O" but aren't string dtypes so need to be explicitly excluded + """ + is_excluded_checks = (is_period_dtype, is_interval_dtype) + return any(is_excluded(dtype) for is_excluded in is_excluded_checks) return _is_dtype(arr_or_dtype, condition) @@ -1492,7 +1468,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: True >>> is_bool_dtype(pd.Categorical([True, False])) True - >>> is_bool_dtype(pd.SparseArray([True, False])) + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) True """ if arr_or_dtype is None: @@ -1554,7 +1530,7 @@ def is_extension_type(arr) -> bool: True >>> is_extension_type(pd.Series(cat)) True - >>> is_extension_type(pd.SparseArray([1, 2, 3])) + >>> is_extension_type(pd.arrays.SparseArray([1, 2, 3])) True >>> from scipy.sparse import bsr_matrix >>> is_extension_type(bsr_matrix([1, 2, 3])) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7b3e7d4f42121..cd4b5af4588e5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -220,7 +220,7 @@ def union_categoricals( ----- To learn more about categories, see `link - `__ + `__ Examples -------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b77cd34700f10..93522abc3a48f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,10 +9,9 @@ from pandas._libs.tslibs import NaT, Period, Timestamp, timezones from pandas._typing import Ordered +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass - -from .base import ExtensionDtype -from .inference import 
is_bool, is_list_like +from pandas.core.dtypes.inference import is_bool, is_list_like str_type = str @@ -436,12 +435,11 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self) -> str_type: - tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = "None, " else: data = self.categories._format_data(name=type(self).__name__) - return tpl.format(data=data, ordered=self.ordered) + return f"CategoricalDtype(categories={data}ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -882,7 +880,11 @@ def construct_from_string(cls, string): return cls(freq=string) except ValueError: pass - raise TypeError(f"Cannot construct a 'PeriodDtype' from '{string}'") + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) def __str__(self) -> str_type: return self.name @@ -946,6 +948,26 @@ def construct_array_type(cls): return PeriodArray + def __from_arrow__(self, array): + """Construct PeriodArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import PeriodArray + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64") + parr = PeriodArray(data.copy(), freq=self.freq, copy=False) + parr[~mask] = NaT + results.append(parr) + + return PeriodArray._concat_same_type(results) + @register_extension_dtype class IntervalDtype(PandasExtensionDtype): @@ -974,7 +996,7 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind: Optional[str_type] = None + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 103 @@ -1117,3 +1139,22 @@ def is_dtype(cls, dtype) -> bool: else: return False return super().is_dtype(dtype) + + def __from_arrow__(self, array): + """Construct IntervalArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + left = np.asarray(arr.storage.field("left"), dtype=self.subtype) + right = np.asarray(arr.storage.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) + results.append(iarr) + + return IntervalArray._concat_same_type(results) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index aa0f7d2aba1fc..4c3f8b7374465 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -4,7 +4,10 @@ # define abstract base classes to enable isinstance type checking on our # objects def create_pandas_abc_type(name, attr, comp): - @classmethod + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index fc22d5be1ca69..fb579f2f58a57 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,7 +9,7 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from .common import ( +from pandas.core.dtypes.common import ( _NS_DTYPE, 
_TD_DTYPE, ensure_object, @@ -31,7 +31,7 @@ needs_i8_conversion, pandas_dtype, ) -from .generic import ( +from pandas.core.dtypes.generic import ( ABCDatetimeArray, ABCExtensionArray, ABCGeneric, @@ -40,7 +40,7 @@ ABCSeries, ABCTimedeltaArray, ) -from .inference import is_list_like +from pandas.core.dtypes.inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar @@ -212,7 +212,7 @@ def _use_inf_as_na(key): This approach to setting global module values is discussed and approved here: - * http://stackoverflow.com/questions/4859217/ + * https://stackoverflow.com/questions/4859217/ programmatically-creating-variables-in-python/4859312#4859312 """ flag = get_option(key) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc39b264d1598..676b78573399c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,6 +16,7 @@ from textwrap import dedent from typing import ( IO, + TYPE_CHECKING, Any, FrozenSet, Hashable, @@ -37,7 +38,8 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas._typing import Axes, Dtype, FilePathOrBuffer +from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer +from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -100,6 +102,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -125,6 +128,9 @@ from pandas.io.formats.printing import pprint_thing import pandas.plotting +if TYPE_CHECKING: + from pandas.io.formats.style import Styler + # --------------------------------------------------------------------- # Docstring templates @@ -140,11 +146,12 @@ Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index - levels and/or column labels + levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels + levels and/or index labels. .. versionchanged:: 0.23.0 + Allow specifying index or column level names.""", versionadded_to_excel="", optional_labels="""labels : array-like, optional @@ -395,7 +402,7 @@ def _constructor(self) -> Type["DataFrame"]: _constructor_sliced: Type[Series] = Series _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) - _accessors: Set[str] = set() + _accessors: Set[str] = {"sparse"} @property def _constructor_expanddim(self): @@ -815,7 +822,7 @@ def to_string( # ---------------------------------------------------------------------- @property - def style(self): + def style(self) -> "Styler": """ Returns a Styler object. @@ -890,10 +897,10 @@ def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield k, self._ixs(i, axis=1) @Appender(_shared_docs["items"]) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield from self.items() - def iterrows(self): + def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. 
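# Illustration of the TYPE_CHECKING guard added to frame.py above: the Styler
# type is imported only for annotations, so ``style`` can be annotated without
# a runtime import cycle (frame -> style -> frame). A minimal sketch of the
# same pattern with stand-in names, not the pandas modules themselves:
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # evaluated only by type checkers, never at runtime
    from decimal import Decimal

def to_decimal(value: float) -> "Decimal":
    from decimal import Decimal  # deferred runtime import breaks the cycle
    return Decimal(str(value))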
@@ -974,7 +981,8 @@ def itertuples(self, index=True, name="Pandas"): ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. - With a large number of columns (>255), regular tuples are returned. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). Examples -------- @@ -1017,8 +1025,9 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python 3 supports at most 255 arguments to constructor - if name is not None and len(self.columns) + index < 256: + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) @@ -1157,7 +1166,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": """ Construct DataFrame from dict of array-like or dicts. @@ -1237,7 +1246,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1441,7 +1450,7 @@ def to_gbq( location=None, progress_bar=True, credentials=None, - ): + ) -> None: """ Write a DataFrame to a Google BigQuery table. @@ -1546,7 +1555,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ): + ) -> "DataFrame": """ Convert structured or record ndarray to DataFrame. @@ -1668,7 +1677,9 @@ def from_records( return cls(mgr) - def to_records(self, index=True, column_dtypes=None, index_dtypes=None): + def to_records( + self, index=True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: """ Convert DataFrame to a NumPy record array. 
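# Sketch of the itertuples() gate changed below: CPython < 3.7 capped
# collections.namedtuple at 255 fields, so very wide frames had to fall back
# to plain tuples. ``PY37`` here stands in for ``pandas.compat.PY37``.
import collections
import sys

PY37 = sys.version_info >= (3, 7)

def make_row_factory(name, fields, index=True):
    # fall back to plain tuples only on old interpreters with too many fields
    can_return_named_tuples = PY37 or len(fields) + index < 255
    if name is not None and can_return_named_tuples:
        return collections.namedtuple(name, fields, rename=True)._make
    return tuple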
@@ -1833,7 +1844,7 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None): + def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) @@ -1929,14 +1940,17 @@ def to_stata( >>> df.to_stata('animals.dta') # doctest: +SKIP """ kwargs = {} - if version not in (114, 117): - raise ValueError("Only formats 114 and 117 supported.") + if version not in (114, 117, 118): + raise ValueError("Only formats 114, 117 and 118 are supported.") if version == 114: if convert_strl is not None: - raise ValueError("strl support is only available when using format 117") + raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter else: - from pandas.io.stata import StataWriter117 as statawriter + if version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: + from pandas.io.stata import StataWriter118 as statawriter kwargs["convert_strl"] = convert_strl @@ -1954,7 +1968,7 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path): + def to_feather(self, path) -> None: """ Write out the binary feather-format for DataFrames. @@ -1984,7 +1998,7 @@ def to_feather(self, path): @Substitution(klass="DataFrame") @Appender(_shared_docs["to_markdown"]) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs ) -> Optional[str]: kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") @@ -2006,7 +2020,7 @@ def to_parquet( index=None, partition_cols=None, **kwargs, - ): + ) -> None: """ Write a DataFrame to the binary parquet format. @@ -2141,9 +2155,10 @@ def to_html( A ``border=border`` attribute is included in the opening `` tag. Default ``pd.options.display.html.border``. encoding : str, default "utf-8" - Set character encoding + Set character encoding. .. versionadded:: 1.0 + table_id : str, optional A css id is included in the opening `
` tag if specified. @@ -2196,7 +2211,7 @@ def to_html( def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ): + ) -> None: """ Print a concise summary of a DataFrame. @@ -2269,9 +2284,11 @@ def info( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - int_col 5 non-null int64 - text_col 5 non-null object - float_col 5 non-null float64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2310,9 +2327,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2320,9 +2339,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB """ @@ -2341,6 +2362,7 @@ def info( return cols = self.columns + col_count = len(self.columns) # hack if max_cols is None: @@ -2349,36 +2371,76 @@ def info( max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows) + show_counts = (col_count <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts - exceeds_info_cols = len(self.columns) > max_cols + exceeds_info_cols = col_count > max_cols def _verbose_repr(): lines.append(f"Data columns (total {len(self.columns)} columns):") - space = max(len(pprint_thing(k)) for k in self.columns) + 4 + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space counts = None - tmpl = "{count}{dtype}" + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({len(cols)} != {len(counts)})" ) - tmpl = "{count} non-null {dtype}" + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, 
space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - dtypes = self.dtypes for i, col in enumerate(self.columns): - dtype = dtypes.iloc[i] + dtype = self.dtypes.iloc[i] col = pprint_thing(col) + line_no = _put_str(f" {i}", space_num) count = "" if show_counts: count = counts.iloc[i] lines.append( - _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): @@ -2424,7 +2486,7 @@ def _sizeof_fmt(num, size_qualifier): lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(buf, lines) - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> Series: """ Return the memory usage of each column in bytes. @@ -2518,7 +2580,7 @@ def memory_usage(self, index=True, deep=False): ) return result - def transpose(self, *args, copy: bool = False): + def transpose(self, *args, copy: bool = False) -> "DataFrame": """ Transpose index and columns. @@ -3010,18 +3072,27 @@ def query(self, expr, inplace=False, **kwargs): Parameters ---------- expr : str - The query string to evaluate. You can refer to variables + The query string to evaluate. + + You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. - .. versionadded:: 0.25.0 - - You can refer to column names that contain spaces by surrounding - them in backticks. + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + In short, any name that is not a valid Python identifier can be + escaped this way. See the notes below for more details. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + .. versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Backtick quoting extended to cover characters other than spaces. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3076,6 +3147,32 @@ def query(self, expr, inplace=False, **kwargs): For further details and examples see the ``query`` documentation in :ref:`indexing `. + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a valid Python identifier. + This can lead to the following problems. + + During parsing, a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed in a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace other than the space character, + as well as the hash sign (which is used for comments) and the backtick + itself (the backtick cannot be escaped). + + As a special case, quotes that form a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. 
+ + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), @@ -3225,14 +3322,15 @@ def eval(self, expr, inplace=False, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = self._get_space_character_free_column_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers if "target" not in kwargs: kwargs["target"] = self kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) + return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None): + def select_dtypes(self, include=None, exclude=None) -> "DataFrame": """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3362,7 +3460,7 @@ def extract_unique_dtypes_from_dtypes_set( return self.iloc[:, keep_these.values] - def insert(self, loc, column, value, allow_duplicates=False): + def insert(self, loc, column, value, allow_duplicates=False) -> None: """ Insert column into DataFrame at specified location. @@ -3382,7 +3480,7 @@ def insert(self, loc, column, value, allow_duplicates=False): value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs): + def assign(self, **kwargs) -> "DataFrame": r""" Assign new columns to a DataFrame. @@ -3565,7 +3663,7 @@ def _series(self): for idx, item in enumerate(self.columns) } - def lookup(self, row_labels, col_labels): + def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. @@ -3673,7 +3771,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value): + def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": """ We are guaranteed non-Nones in the axes. """ @@ -3707,7 +3805,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ): + ) -> "DataFrame": return super().align( other, join=join, @@ -3734,13 +3832,13 @@ def align( ("tolerance", None), ], ) - def reindex(self, *args, **kwargs): + def reindex(self, *args, **kwargs) -> "DataFrame": axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) - return super().reindex(**kwargs) + return self._ensure_type(super().reindex(**kwargs)) def drop( self, @@ -3888,7 +3986,19 @@ def drop( "mapper", [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], ) - def rename(self, *args, **kwargs): + def rename( + self, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional["DataFrame"]: + """ Alter axes labels. 
@@ -3997,12 +4107,16 @@ def rename(self, *args, **kwargs): 2 2 5 4 3 6 """ - axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename") - kwargs.update(axes) - # Pop these, since the values are in `kwargs` under different names - kwargs.pop("axis", None) - kwargs.pop("mapper", None) - return super().rename(**kwargs) + return super().rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.fillna.__doc__) @@ -4014,8 +4128,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["DataFrame"]: return super().fillna( value=value, method=method, @@ -4023,7 +4136,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -4046,9 +4158,9 @@ def replace( ) @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift( - periods=periods, freq=freq, axis=axis, fill_value=fill_value + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + return self._ensure_type( + super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) ) def set_index( @@ -4153,7 +4265,7 @@ def set_index( "one-dimensional arrays." ) - missing = [] + missing: List[Optional[Hashable]] = [] for col in keys: if isinstance( col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) @@ -4190,7 +4302,7 @@ def set_index( else: arrays.append(self.index) - to_remove = [] + to_remove: List[Optional[Hashable]] = [] for col in keys: if isinstance(col, ABCMultiIndex): for n in range(col.nlevels): @@ -4486,19 +4598,19 @@ def _maybe_casted_values(index, labels=None): # Reindex-based selection methods @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "DataFrame": return super().isna() @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "DataFrame": return super().isnull() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "DataFrame": return super().notna() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "DataFrame": return super().notnull() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -4829,6 +4941,7 @@ def sort_index( kind="quicksort", na_position="last", sort_remaining=True, + ignore_index: bool = False, ): # TODO: this can be combined with Series.sort_index impl as @@ -4879,12 +4992,15 @@ def sort_index( # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - def nlargest(self, n, columns, keep="first"): + def nlargest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. @@ -4993,7 +5109,7 @@ def nlargest(self, n, columns, keep="first"): """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first"): + def nsmallest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in ascending order. 
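# Usage sketch for the ``ignore_index`` flag wired into sort_index above
# (new in 1.0.0): the sorted result is relabeled 0..n-1 instead of keeping
# the sorted labels. Assumes a pandas build containing this change.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 5, 7])
out = df.sort_index(ignore_index=True)
# rows are ordered by the old labels [5, 7, 10] first, then out.index
# becomes RangeIndex(start=0, stop=3)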
@@ -5094,7 +5210,7 @@ def nsmallest(self, n, columns, keep="first"): self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0): + def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": """ Swap levels i and j in a MultiIndex on a particular axis. @@ -5116,7 +5232,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0): + def reorder_levels(self, order, axis=0) -> "DataFrame": """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -5130,7 +5246,7 @@ def reorder_levels(self, order, axis=0): Returns ------- - type of caller (new object) + DataFrame """ axis = self._get_axis_number(axis) if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover @@ -5148,7 +5264,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join="outer", level=level, copy=False) + # at this point we have `self._indexed_same(other)` if fill_value is None: # since _arith_op may be called in a loop, avoid function call @@ -5164,14 +5280,15 @@ def _arith_op(left, right): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - if ops.should_series_dispatch(this, other, func): + if ops.should_series_dispatch(self, other, func): # iterate over columns - new_data = ops.dispatch_to_series(this, other, _arith_op) + new_data = ops.dispatch_to_series(self, other, _arith_op) else: with np.errstate(all="ignore"): - res_values = _arith_op(this.values, other.values) - new_data = dispatch_fill_zeros(func, this.values, other.values, res_values) - return this._construct_result(new_data) + res_values = _arith_op(self.values, other.values) + new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) + + return new_data def _combine_match_index(self, other, func): # at this point we have `self.index.equals(other.index)` @@ -5203,7 +5320,9 @@ def _construct_result(self, result) -> "DataFrame": out.columns = self.columns return out - def combine(self, other, func, fill_value=None, overwrite=True): + def combine( + self, other: "DataFrame", func, fill_value=None, overwrite=True + ) -> "DataFrame": """ Perform column-wise combine with another DataFrame. @@ -5370,7 +5489,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other): + def combine_first(self, other: "DataFrame") -> "DataFrame": """ Update null elements with value in the same location in `other`. @@ -5448,7 +5567,7 @@ def combiner(x, y): def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" - ): + ) -> None: """ Modify in place using non-NA values from another DataFrame. @@ -5599,6 +5718,82 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 
'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" @@ -5707,7 +5902,7 @@ def update( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None): + def pivot(self, index=None, columns=None, values=None) -> "DataFrame": from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -5854,7 +6049,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ): + ) -> "DataFrame": from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -6284,7 +6479,7 @@ def melt( var_name=None, value_name="value", col_level=None, - ): + ) -> "DataFrame": from pandas.core.reshape.melt import melt return melt( @@ -6299,7 +6494,7 @@ def melt( # ---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1, axis=0): + def diff(self, periods=1, axis=0) -> "DataFrame": """ First discrete difference of element. @@ -6507,7 +6702,7 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate @Appender(_shared_docs["transform"] % _shared_doc_kwargs) - def transform(self, func, axis=0, *args, **kwargs): + def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T @@ -6662,7 +6857,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func): + def applymap(self, func) -> "DataFrame": """ Apply a function to a Dataframe elementwise. @@ -6731,7 +6926,9 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False, sort=False): + def append( + self, other, ignore_index=False, verify_integrity=False, sort=False + ) -> "DataFrame": """ Append rows of `other` to the end of caller, returning a new object. 
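# The groupby() hunk above moves the implementation onto DataFrame so the
# return type can be annotated as DataFrameGroupBy. A quick behavioural
# sketch using the import path from the diff (requires pandas):
import pandas as pd
from pandas.core.groupby.generic import DataFrameGroupBy

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
assert isinstance(df.groupby("key"), DataFrameGroupBy)

try:
    df.groupby()  # neither ``by`` nor ``level``
except TypeError:
    pass  # "You have to supply one of 'by' and 'level'"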
@@ -6829,8 +7026,8 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): other = Series(other) if other.name is None and not ignore_index: raise TypeError( - "Can only append a Series if ignore_index=True" - " or if the Series has a name" + "Can only append a Series if ignore_index=True " + "or if the Series has a name" ) index = Index([other.name], name=self.index.name) @@ -6858,7 +7055,7 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): from pandas.core.reshape.concat import concat if isinstance(other, (list, tuple)): - to_concat = [self] + other + to_concat = [self, *other] else: to_concat = [self, other] return concat( @@ -6868,7 +7065,9 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): sort=sort, ) - def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): + def join( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ) -> "DataFrame": """ Join columns of another DataFrame. @@ -7059,7 +7258,7 @@ def merge( copy=True, indicator=False, validate=None, - ): + ) -> "DataFrame": from pandas.core.reshape.merge import merge return merge( @@ -7078,7 +7277,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "DataFrame": """ Round a DataFrame to a variable number of decimal places. @@ -7192,7 +7391,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1): + def corr(self, method="pearson", min_periods=1) -> "DataFrame": """ Compute pairwise correlation of columns, excluding NA/null values. @@ -7280,7 +7479,7 @@ def corr(self, method="pearson", min_periods=1): return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods=None): + def cov(self, min_periods=None) -> "DataFrame": """ Compute pairwise covariance of columns, excluding NA/null values. @@ -7390,7 +7589,7 @@ def cov(self, min_periods=None): return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False, method="pearson"): + def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: """ Compute pairwise correlation. @@ -7662,6 +7861,26 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + return out + if numeric_only is None: values = self.values try: @@ -7726,7 +7945,7 @@ def _get_data(axis_matters): result = Series(result, index=labels) return result - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. 
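# Sketch of the reduction fast path added above: when numeric_only is set,
# each column is reduced independently and the per-position results are
# assembled into a Series that is then relabeled with the column names,
# mirroring how the BlockManager._reduce dict is handled. Illustrative only,
# no pandas internals used.
import numpy as np
import pandas as pd

def reduce_columns(df: pd.DataFrame, op=np.nansum) -> pd.Series:
    res = {i: op(col.to_numpy()) for i, (_, col) in enumerate(df.items())}
    out = pd.Series(res, index=range(len(res)))
    out.index = df.columns  # swap positional keys for column labels
    return out

frame = pd.DataFrame({"a": [1.0, np.nan], "b": [3.0, 4.0]})
# reduce_columns(frame) -> a: 1.0, b: 7.0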
@@ -7766,7 +7985,7 @@ def nunique(self, axis=0, dropna=True): """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - def idxmin(self, axis=0, skipna=True): + def idxmin(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of minimum over requested axis. @@ -7775,7 +7994,7 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7804,7 +8023,7 @@ def idxmin(self, axis=0, skipna=True): result = [index[i] if i >= 0 else np.nan for i in indices] return Series(result, index=self._get_agg_axis(axis)) - def idxmax(self, axis=0, skipna=True): + def idxmax(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of maximum over requested axis. @@ -7813,7 +8032,7 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7853,7 +8072,7 @@ def _get_agg_axis(self, axis_num): else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True): + def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": """ Get the mode(s) of each element along the selected axis. @@ -8036,7 +8255,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): return result - def to_timestamp(self, freq=None, how="start", axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -8070,7 +8289,7 @@ def to_timestamp(self, freq=None, how="start", axis=0, copy=True): return self._constructor(new_data) - def to_period(self, freq=None, axis=0, copy=True): + def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -8104,7 +8323,7 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) - def isin(self, values): + def isin(self, values) -> "DataFrame": """ Whether each element in the DataFrame is contained in values. 
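# DataFrame.isin, annotated above to return a DataFrame: membership is
# checked element-wise, and passing a dict restricts the check to matching
# columns. Usage sketch:
import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]},
                  index=["falcon", "dog"])
mask = df.isin({"num_wings": [0, 3]})
# num_legs is all False (not in the dict); num_wings is [False, True]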
@@ -8171,12 +8390,14 @@ def isin(self, values): from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) - return concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, + return self._ensure_type( + concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) ) elif isinstance(values, Series): if not values.index.is_unique: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 08c7f38ce4c82..04ce424edbee4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -19,6 +19,7 @@ Sequence, Set, Tuple, + Type, Union, ) import warnings @@ -29,7 +30,15 @@ from pandas._config import config from pandas._libs import Timestamp, iNaT, lib, properties -from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries, JSONSerializable +from pandas._typing import ( + Axis, + Dtype, + FilePathOrBuffer, + FrameOrSeries, + JSONSerializable, + Level, + Renamer, +) from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -105,10 +114,6 @@ Name or list of names to sort by""", ) -# sentinel value to use as kwarg in place of None when None has special meaning -# and needs to be distinguished from a user explicitly passing None. -sentinel = object() - def _single_replace(self, to_replace, method, inplace, limit): """ @@ -143,7 +148,7 @@ def _single_replace(self, to_replace, method, inplace, limit): bool_t = bool # Need alias because NDFrame has def bool: -class NDFrame(PandasObject, SelectionMixin): +class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a size-mutable, labeled data structure @@ -172,7 +177,7 @@ class NDFrame(PandasObject, SelectionMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"]) + _deprecations: FrozenSet[str] = frozenset(["get_values"]) _metadata: List[str] = [] _is_copy = None _data: BlockManager @@ -234,6 +239,10 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): def attrs(self) -> Dict[Optional[Hashable], Any]: """ Dictionary of global attributes on this object. + + .. warning:: + + attrs is experimental and may change without warning. """ if self._attrs is None: self._attrs = {} @@ -262,7 +271,7 @@ def _validate_dtype(self, dtype): # Construction @property - def _constructor(self): + def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: """Used when a manipulation result has the same dimensions as the original. """ @@ -298,7 +307,7 @@ def _constructor_expanddim(self): _AXIS_LEN: int @classmethod - def _setup_axes(cls, axes: List[str], docs: Dict[str, str]): + def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None: """ Provide axes setup for the major PandasObjects. 
@@ -373,7 +382,7 @@ def _construct_axes_from_arguments( return axes, kwargs @classmethod - def _from_axes(cls, data, axes, **kwargs): + def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries: # for construction from BlockManager if isinstance(data, BlockManager): return cls(data, **kwargs) @@ -423,7 +432,7 @@ def _get_block_manager_axis(cls, axis): return m - axis return axis - def _get_axis_resolvers(self, axis): + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -453,22 +462,31 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_index_resolvers(self): - d = {} + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + from pandas.core.computation.parsing import clean_column_name + + d: Dict[str, ABCSeries] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) - return d - def _get_space_character_free_column_resolvers(self): - """Return the space character free column resolvers of a dataframe. + return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} - Column names with spaces are 'cleaned up' so that they can be referred - to by backtick quoting. + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. Used in :meth:`DataFrame.eval`. """ - from pandas.core.computation.common import _remove_spaces_column_name + from pandas.core.computation.parsing import clean_column_name - return {_remove_spaces_column_name(k): v for k, v in self.items()} + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} + + return { + clean_column_name(k): v for k, v in self.items() if not isinstance(k, int) + } @property def _info_axis(self): @@ -486,7 +504,7 @@ def shape(self) -> Tuple[int, ...]: return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) @property - def axes(self): + def axes(self) -> List[Index]: """ Return index label(s) of the internal NDFrame """ @@ -639,11 +657,11 @@ def set_axis(self, labels, axis=0, inplace=False): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis, labels): + def _set_axis(self, axis, labels) -> None: self._data.set_axis(axis, labels) self._clear_item_cache() - def swapaxes(self, axis1, axis2, copy=True): + def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: """ Interchange axes and swap values axes appropriately. @@ -668,7 +686,7 @@ def swapaxes(self, axis1, axis2, copy=True): return self._constructor(new_values, *new_axes).__finalize__(self) - def droplevel(self, level, axis=0): + def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return DataFrame with requested index / column level(s) removed. @@ -728,7 +746,7 @@ def droplevel(self, level, axis=0): result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item): + def pop(self: FrameOrSeries, item) -> FrameOrSeries: """ Return item and drop from frame. Raise KeyError if not found. 
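# What the ``clean_column_name`` resolvers above enable: backtick-quoted
# names in query()/eval() may now contain spaces, operators, or a leading
# digit (pandas >= 1.0). A usage sketch:
import pandas as pd

df = pd.DataFrame({"a a": [1, 2, 3], "b-b": [4, 5, 6]})
res = df.query("`a a` + `b-b` > 6")
# keeps the rows where the two oddly named columns sum to more than 6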
@@ -889,7 +907,7 @@ def squeeze(self, axis=None): ) ] - def swaplevel(self, i=-2, j=-1, axis=0): + def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries: """ Swap levels i and j in a MultiIndex on a particular axis @@ -911,7 +929,18 @@ def swaplevel(self, i=-2, j=-1, axis=0): # ---------------------------------------------------------------------- # Rename - def rename(self, *args, **kwargs): + def rename( + self: FrameOrSeries, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional[FrameOrSeries]: """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left @@ -1024,44 +1053,46 @@ def rename(self, *args, **kwargs): See the :ref:`user guide ` for more. """ - axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - copy = kwargs.pop("copy", True) - inplace = kwargs.pop("inplace", False) - level = kwargs.pop("level", None) - axis = kwargs.pop("axis", None) - errors = kwargs.pop("errors", "ignore") - if axis is not None: - # Validate the axis - self._get_axis_number(axis) - - if kwargs: - raise TypeError( - "rename() got an unexpected keyword " - f'argument "{list(kwargs.keys())[0]}"' - ) - - if com.count_not_none(*axes.values()) == 0: + if mapper is None and index is None and columns is None: raise TypeError("must pass an index to rename") - self._consolidate_inplace() + if index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + elif mapper is not None: + raise TypeError( + "Cannot specify both 'mapper' and any of 'index' or 'columns'" + ) + else: + # use the mapper argument + if axis and self._get_axis_number(axis) == 1: + columns = mapper + else: + index = mapper + result = self if inplace else self.copy(deep=copy) - # start in the axis order to eliminate too many copies - for axis in range(self._AXIS_LEN): - v = axes.get(self._AXIS_NAMES[axis]) - if v is None: + for axis_no, replacements in enumerate((index, columns)): + if replacements is None: continue - f = com.get_rename_function(v) - baxis = self._get_block_manager_axis(axis) + + ax = self._get_axis(axis_no) + baxis = self._get_block_manager_axis(axis_no) + f = com.get_rename_function(replacements) + if level is not None: - level = self.axes[axis]._get_level_number(level) + level = ax._get_level_number(level) # GH 13473 - if not callable(v): - indexer = self.axes[axis].get_indexer_for(v) + if not callable(replacements): + indexer = ax.get_indexer_for(replacements) if errors == "raise" and len(indexer[indexer == -1]): missing_labels = [ - label for index, label in enumerate(v) if indexer[index] == -1 + label + for index, label in enumerate(replacements) + if indexer[index] == -1 ] raise KeyError(f"{missing_labels} not found in axis") @@ -1072,11 +1103,12 @@ def rename(self, *args, **kwargs): if inplace: self._update_inplace(result._data) + return None else: return result.__finalize__(self) @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) - def rename_axis(self, mapper=sentinel, **kwargs): + def rename_axis(self, mapper=lib.no_default, **kwargs): """ Set the name of the axis for the index or columns. 
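# The rename() signature above makes index/columns keyword-only and rejects
# mixing them with the positional mapper. Equivalent spellings and the new
# error, as a sketch:
import pandas as pd

df = pd.DataFrame({"A": [1], "B": [2]})
r1 = df.rename(columns={"A": "a"})
r2 = df.rename({"A": "a"}, axis=1)  # mapper routed to columns via axis

try:
    df.rename(mapper={"A": "a"}, columns={"A": "a"})
except TypeError:
    pass  # "Cannot specify both 'mapper' and any of 'index' or 'columns'"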
@@ -1201,7 +1233,7 @@ class name monkey 2 2 """ axes, kwargs = self._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel + (), kwargs, sentinel=lib.no_default ) copy = kwargs.pop("copy", True) inplace = kwargs.pop("inplace", False) @@ -1217,7 +1249,7 @@ class name inplace = validate_bool_kwarg(inplace, "inplace") - if mapper is not sentinel: + if mapper is not lib.no_default: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or ( is_list_like(mapper) and not is_dict_like(mapper) @@ -1233,7 +1265,7 @@ class name for axis in range(self._AXIS_LEN): v = axes.get(self._AXIS_NAMES[axis]) - if v is sentinel: + if v is lib.no_default: continue non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) if non_mapper: @@ -1473,10 +1505,10 @@ def bool(self): self.__nonzero__() - def __abs__(self): + def __abs__(self: FrameOrSeries) -> FrameOrSeries: return self.abs() - def __round__(self, decimals=0): + def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: return self.round(decimals) # ------------------------------------------------------------------------- @@ -1872,7 +1904,7 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__ = 1000 - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return com.values_from_object(self) def __array_wrap__(self, result, context=None): @@ -2123,7 +2155,7 @@ def to_excel( inf_rep="inf", verbose=True, freeze_panes=None, - ): + ) -> None: df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter @@ -2347,7 +2379,7 @@ def to_hdf( data_columns: Optional[List[str]] = None, errors: str = "strict", encoding: str = "UTF-8", - ): + ) -> None: """ Write the contained data to an HDF5 file using HDFStore. @@ -2691,7 +2723,9 @@ def to_pickle( to_pickle(self, path, compression=compression, protocol=protocol) - def to_clipboard(self, excel: bool_t = True, sep: Optional[str] = None, **kwargs): + def to_clipboard( + self, excel: bool_t = True, sep: Optional[str] = None, **kwargs + ) -> None: r""" Copy object to the system clipboard. @@ -3178,7 +3212,10 @@ def to_csv( @classmethod def _create_indexer(cls, name: str, indexer) -> None: - """Create an indexer like _name in the class.""" + """Create an indexer like _name in the class. + + Kept for compatibility with geopandas. To be removed in the future. See GH27258 + """ if getattr(cls, name, None) is None: _indexer = functools.partial(indexer, name) setattr(cls, name, property(_indexer, doc=indexer.__doc__)) @@ -3259,7 +3296,9 @@ def _clear_item_cache(self) -> None: # ---------------------------------------------------------------------- # Indexing Methods - def take(self, indices, axis=0, is_copy: bool_t = True, **kwargs): + def take( + self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs + ) -> FrameOrSeries: """ Return the elements in the given *positional* indices along an axis. @@ -3276,6 +3315,8 @@ def take(self, indices, axis=0, is_copy: bool_t = True, **kwargs): selecting rows, ``1`` means that we are selecting columns. is_copy : bool, default True Whether to return a copy of the original object or not. + + .. deprecated:: 1.0.0 **kwargs For compatibility with :meth:`numpy.take`. Has no effect on the output. 
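# Sketch of the sentinel swap above: ``lib.no_default`` replaces the module
# local ``sentinel = object()`` so APIs can tell "argument omitted" apart
# from an explicit None. A plain object() shows the mechanics:
no_default = object()  # stand-in for pandas._libs.lib.no_default

def rename_axis_like(mapper=no_default):
    if mapper is no_default:
        return "omitted"
    return f"explicit: {mapper!r}"

assert rename_axis_like() == "omitted"
assert rename_axis_like(None) == "explicit: None"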
@@ -3334,6 +3375,16 @@ class max_speed 1 monkey mammal NaN 3 lion mammal 80.5 """ + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future.", + FutureWarning, + stacklevel=2, + ) + else: + is_copy = True + nv.validate_take(tuple(), kwargs) self._consolidate_inplace() @@ -3542,7 +3593,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise AbstractMethodError(self) - def _slice(self, slobj: slice, axis=0, kind=None): + def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries: """ Construct a slice of this container. @@ -3648,7 +3699,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3659,7 +3710,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "DataFrame.\n" "Try using .loc[row_indexer,col_indexer] = value " "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3668,7 +3719,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): elif value == "warn": warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) - def __delitem__(self, key): + def __delitem__(self, key) -> None: """ Delete item """ @@ -3730,13 +3781,13 @@ def _is_view(self): return self._data.is_view def reindex_like( - self, + self: FrameOrSeries, other, method: Optional[str] = None, copy: bool_t = True, limit=None, tolerance=None, - ): + ) -> FrameOrSeries: """ Return an object with matching indices as other object. @@ -3878,7 +3929,9 @@ def drop( else: return obj - def _drop_axis(self, labels, axis, level=None, errors: str = "raise"): + def _drop_axis( + self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" + ) -> FrameOrSeries: """ Drop labels from specified axis. Used in the ``drop`` method internally. @@ -3948,7 +4001,7 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: self._data = getattr(result, "_data", result) self._maybe_update_cacher(verify_is_copy=verify_is_copy) - def add_prefix(self, prefix: str): + def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: """ Prefix labels with string `prefix`. @@ -4005,9 +4058,9 @@ def add_prefix(self, prefix: str): f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore - def add_suffix(self, suffix: str): + def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ Suffix labels with string `suffix`. @@ -4064,11 +4117,10 @@ def add_suffix(self, suffix: str): f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore def sort_values( self, - by=None, axis=0, ascending=True, inplace: bool_t = False, @@ -4178,6 +4230,7 @@ def sort_index( kind: str = "quicksort", na_position: str = "last", sort_remaining: bool_t = True, + ignore_index: bool_t = False, ): """ Sort object by labels (along an axis). 
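# The take() hunk above deprecates ``is_copy``: any non-None value now emits
# a FutureWarning, and take will always return a copy going forward. Sketch:
import warnings
import pandas as pd

df = pd.DataFrame({"x": range(4)})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df.take([0, 2], is_copy=True)
assert any(issubclass(w.category, FutureWarning) for w in caught)

df.take([0, 2])  # preferred spelling, no warning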
@@ -4204,6 +4257,10 @@ def sort_index( sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -4227,7 +4284,7 @@ def sort_index( new_axis = labels.take(sort_index) return self.reindex(**{axis_name: new_axis}) - def reindex(self, *args, **kwargs): + def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: """ Conform %(klass)s to new index with optional filling logic. @@ -4475,7 +4532,9 @@ def reindex(self, *args, **kwargs): axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self) - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + def _reindex_axes( + self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy + ) -> FrameOrSeries: """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4511,12 +4570,12 @@ def _reindex_multi(self, axes, copy, fill_value): raise AbstractMethodError(self) def _reindex_with_indexers( - self, + self: FrameOrSeries, reindexers, fill_value=None, copy: bool_t = False, allow_dups: bool_t = False, - ): + ) -> FrameOrSeries: """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated @@ -4548,12 +4607,12 @@ def _reindex_with_indexers( return self._constructor(new_data).__finalize__(self) def filter( - self, + self: FrameOrSeries, items=None, like: Optional[str] = None, regex: Optional[str] = None, axis=None, - ): + ) -> FrameOrSeries: """ Subset the dataframe rows or columns according to the specified index labels. @@ -4652,6 +4711,9 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: on position. It is useful for quickly testing if your object has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + Parameters ---------- n : int, default 5 @@ -4659,7 +4721,7 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: Returns ------- - obj_head : same type as caller + same type as caller The first `n` rows of the caller object. See Also @@ -4699,6 +4761,17 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 0 alligator 1 bee 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot """ return self.iloc[:n] @@ -4711,6 +4784,9 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: position. It is useful for quickly verifying data, for example, after sorting or appending rows. + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + Parameters ---------- n : int, default 5 @@ -4758,6 +4834,17 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 6 shark 7 whale 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra """ if n == 0: @@ -4765,14 +4852,14 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: return self.iloc[-n:] def sample( - self, + self: FrameOrSeries, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None, - ): + ) -> FrameOrSeries: """ Return a random sample of items from an axis of object. 
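# The head()/tail() docs above now spell out negative ``n``: head(-n) keeps
# all rows but the last n, tail(-n) all rows but the first n. Quick check:
import pandas as pd

s = pd.Series(range(6))
assert s.head(-2).tolist() == [0, 1, 2, 3]  # drops the last two
assert s.tail(-2).tolist() == [2, 3, 4, 5]  # drops the first two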
@@ -4960,7 +5047,7 @@ def sample( ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis, is_copy=False) + return self.take(locs, axis=axis) _shared_docs[ "pipe" @@ -5311,7 +5398,7 @@ def _get_bool_data(self): # Internal Interface Methods @property - def values(self): + def values(self) -> np.ndarray: """ Return a Numpy representation of the DataFrame. @@ -5388,16 +5475,16 @@ def values(self): return self._data.as_array(transpose=self._AXIS_REVERSED) @property - def _values(self): + def _values(self) -> np.ndarray: """internal implementation""" return self.values @property - def _get_values(self): + def _get_values(self) -> np.ndarray: # compat return self.values - def _internal_get_values(self): + def _internal_get_values(self) -> np.ndarray: """ Return an ndarray after converting sparse values to dense. @@ -5461,7 +5548,9 @@ def _to_dict_of_blocks(self, copy: bool_t = True): for k, v, in self._data.to_dict(copy=copy).items() } - def astype(self, dtype, copy: bool_t = True, errors: str = "raise"): + def astype( + self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" + ) -> FrameOrSeries: """ Cast a pandas object to a specified dtype ``dtype``. @@ -5769,7 +5858,7 @@ def _convert( ) ).__finalize__(self) - def infer_objects(self): + def infer_objects(self: FrameOrSeries) -> FrameOrSeries: """ Attempt to infer better dtypes for object columns. @@ -6372,8 +6461,8 @@ def replace( if not is_dict_like(to_replace): if not is_dict_like(regex): raise TypeError( - 'If "to_replace" and "value" are both None' - ' and "to_replace" is not a list, then ' + 'If "to_replace" and "value" are both None ' + 'and "to_replace" is not a list, then ' "regex must be a mapping" ) to_replace = regex @@ -6387,9 +6476,8 @@ def replace( if any(are_mappings): if not all(are_mappings): raise TypeError( - "If a nested mapping is passed, all values" - " of the top level mapping must be " - "mappings" + "If a nested mapping is passed, all values " + "of the top level mapping must be mappings" ) # passed a nested dict/Series to_rep_dict = {} @@ -6909,8 +6997,7 @@ def asof(self, where, subset=None): if not is_list: start = self.index[0] if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal + where = Period(where, freq=self.index.freq) if where < start: if not is_series: @@ -6955,7 +7042,8 @@ def asof(self, where, subset=None): # mask the missing missing = locs == -1 - data = self.take(locs, is_copy=False) + d = self.take(locs) + data = d.copy() data.index = where data.loc[missing] = np.nan return data if is_list else data.iloc[-1] @@ -7026,11 +7114,11 @@ def asof(self, where, subset=None): """ @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) _shared_docs[ @@ -7096,11 +7184,11 @@ def isnull(self): """ @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): @@ -7152,14 +7240,14 @@ def 
_clip_with_one_bound(self, threshold, method, axis, inplace): return self.where(subset, threshold, axis=axis, inplace=inplace) def clip( - self, + self: FrameOrSeries, lower=None, upper=None, axis=None, inplace: bool_t = False, *args, **kwargs, - ): + ) -> FrameOrSeries: """ Trim values at input threshold(s). @@ -7273,19 +7361,10 @@ def clip( return result - def groupby( - self, - by=None, - axis=0, - level=None, - as_index: bool_t = True, - sort: bool_t = True, - group_keys: bool_t = True, - squeeze: bool_t = False, - observed: bool_t = False, - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7330,9 +7409,8 @@ def groupby( Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. See Also -------- @@ -7342,79 +7420,17 @@ def groupby( Notes ----- See the `user guide - `_ for more. - - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import get_groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - return get_groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - ) + `_ for more. + """ def asfreq( - self, + self: FrameOrSeries, freq, method=None, how: Optional[str] = None, normalize: bool_t = False, fill_value=None, - ): + ) -> FrameOrSeries: """ Convert TimeSeries to specified frequency. @@ -7453,7 +7469,7 @@ def asfreq( Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -7517,7 +7533,9 @@ def asfreq( fill_value=fill_value, ) - def at_time(self, time, asof: bool_t = False, axis=None): + def at_time( + self: FrameOrSeries, time, asof: bool_t = False, axis=None + ) -> FrameOrSeries: """ Select values at particular time of day (e.g. 9:30AM). 
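Hoisting the groupby docstring into `_shared_docs["groupby"]` lets `DataFrame.groupby` and `Series.groupby` each render the same template with their own class name, via the `Appender(_shared_docs[...] % _shared_doc_kwargs)` pattern already used for `isna`/`notna` in this file. A stripped-down sketch of that mechanism (all names here are invented for illustration, not the real decorator):

_shared_docs = {}
_shared_docs["groupby"] = """
Group %(klass)s using a mapper or by a Series of columns.

Returns
-------
%(klass)sGroupBy
"""

def appender(template):
    # Tiny stand-in for pandas.util._decorators.Appender: append the
    # already-substituted template to the function's docstring.
    def decorator(func):
        func.__doc__ = (func.__doc__ or "") + template
        return func
    return decorator

class DataFrame:
    @appender(_shared_docs["groupby"] % {"klass": "DataFrame"})
    def groupby(self, by=None):
        ...

print(DataFrame.groupby.__doc__)  # mentions DataFrame and DataFrameGroupBy

The payoff is that the worked examples can then live on the concrete `DataFrame.groupby`/`Series.groupby` overrides instead of being duplicated in `generic.py`, which is why the long example section is deleted from this shared template.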
@@ -7574,13 +7592,13 @@ def at_time(self, time, asof: bool_t = False, axis=None): return self.take(indexer, axis=axis) def between_time( - self, + self: FrameOrSeries, start_time, end_time, include_start: bool_t = True, include_end: bool_t = True, axis=None, - ): + ) -> FrameOrSeries: """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -7730,7 +7748,7 @@ def resample( for more. To learn more about the offset strings, please see `this link - `__. + `__. Examples -------- @@ -7949,7 +7967,7 @@ def resample( level=level, ) - def first(self, offset): + def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset initial periods of time series data based on a date offset. @@ -8011,7 +8029,7 @@ def first(self, offset): return self.loc[:end] - def last(self, offset): + def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset final periods of time series data based on a date offset. @@ -8370,8 +8388,12 @@ def _align_frame( ) if method is not None: - left = left.fillna(axis=fill_axis, method=method, limit=limit) - right = right.fillna(axis=fill_axis, method=method, limit=limit) + left = self._ensure_type( + left.fillna(method=method, axis=fill_axis, limit=limit) + ) + right = self._ensure_type( + right.fillna(method=method, axis=fill_axis, limit=limit) + ) # if DatetimeIndex have different tz, convert to UTC if is_datetime64tz_dtype(left.index): @@ -8864,7 +8886,9 @@ def mask( """ @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: if periods == 0: return self.copy() @@ -8915,7 +8939,9 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: return new_obj.__finalize__(self) - def tshift(self, periods: int = 1, freq=None, axis=0): + def tshift( + self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + ) -> FrameOrSeries: """ Shift the time index, using the index's frequency if available. @@ -9358,7 +9384,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): # ---------------------------------------------------------------------- # Numeric Methods - def abs(self): + def abs(self: FrameOrSeries) -> FrameOrSeries: """ Return a Series/DataFrame with absolute numeric value of each element. @@ -9427,7 +9453,9 @@ def abs(self): """ return np.abs(self) - def describe(self, percentiles=None, include=None, exclude=None): + def describe( + self: FrameOrSeries, percentiles=None, include=None, exclude=None + ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -9763,7 +9791,7 @@ def describe_1d(data): ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows - names = [] + names: List[Optional[Hashable]] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: @@ -9892,20 +9920,29 @@ def describe_1d(data): """ @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): + def pct_change( + self: FrameOrSeries, + periods=1, + fill_method="pad", + limit=None, + freq=None, + **kwargs, + ) -> FrameOrSeries: # TODO: Not sure if above is correct - need someone to confirm. 
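        # For intuition (illustrative numbers, not part of the hunk's logic):
        # pct_change returns (x[t] / x[t - periods]) - 1, so a series
        # [10.0, 12.0, 9.0] becomes [NaN, 0.2, -0.25], since
        # 12/10 - 1 == 0.2 and 9/12 - 1 == -0.25. The code below only
        # controls how missing values are filled first and how a
        # freq-based shift is reindexed back.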
axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self else: - data = self.fillna(method=fill_method, limit=limit, axis=axis) + data = self._ensure_type( + self.fillna(method=fill_method, axis=axis, limit=limit) + ) rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 - rs = rs.loc[~rs.index.duplicated()] - rs = rs.reindex_like(data) - if freq is None: - mask = isna(com.values_from_object(data)) - np.putmask(rs.values, mask, np.nan) + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): @@ -11086,44 +11123,67 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - y = com.values_from_object(self).copy() - d = self._construct_axes_dict() - d["copy"] = False + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. + orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) - if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. 
- orig_dtype = y.dtype - if accum_func == np.minimum.accumulate: - # Note: the accum_func comparison fails as an "is" comparison - # Note that "y" is always a copy, so we can safely modify it - mask = isna(self) - y = y.view("i8") - y[mask] = np.iinfo(np.int64).max - - result = accum_func(y.view("i8"), axis).view(orig_dtype) - if skipna: - mask = isna(self) - np.putmask(result, mask, iNaT) - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result - if self.ndim == 1: - # restore dt64tz dtype - d["dtype"] = self.dtype - - elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - mask = isna(self) - np.putmask(y, mask, mask_a) - result = accum_func(y, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(y, axis) + result = self._data.apply(na_accum_func) + d = self._construct_axes_dict() + d["copy"] = False return self._constructor(result, **d).__finalize__(self) return set_function_name(cum_func, name, cls) @@ -11160,8 +11220,3 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs ) return set_function_name(logical_func, name, cls) - - -# install the indexes -for _name, _indexer in indexing.get_indexers_list(): - NDFrame._create_indexer(_name, _indexer) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 252f20ed40068..0c5d2658978b4 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,7 +1,11 @@ -from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, - NamedAgg, - SeriesGroupBy, -) -from pandas.core.groupby.groupby import GroupBy # noqa: F401 -from pandas.core.groupby.grouper import Grouper # noqa: F401 +from pandas.core.groupby.generic import DataFrameGroupBy, NamedAgg, SeriesGroupBy +from pandas.core.groupby.groupby import GroupBy +from pandas.core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6b110a0c80c07..c49677fa27a31 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,7 +5,7 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ -from collections import OrderedDict, abc, defaultdict, namedtuple +from collections import abc, defaultdict, namedtuple import copy from functools import partial from textwrap import dedent @@ -14,6 +14,7 @@ TYPE_CHECKING, Any, Callable, + Dict, FrozenSet, Iterable, List, @@ -24,6 +25,7 @@ Union, cast, ) +import warnings import numpy as np @@ -306,7 +308,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results = OrderedDict() + results = {} for name, func in arg: obj = self @@ -325,7 +327,7 @@ def _aggregate_multiple_funcs(self, arg): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
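The `OrderedDict`-to-plain-`dict` changes in `groupby/generic.py` here (and the matching annotation updates below) lean on the fact that plain `dict` preserves insertion order, a language guarantee since Python 3.7 and a CPython implementation detail in 3.6. The one behavioral difference worth remembering is equality, which is order-sensitive only for `OrderedDict`; assuming none of the touched call sites compares these dicts to each other (the hunks only build and read them), the swap is a pure simplification:

>>> from collections import OrderedDict
>>> results = {}
>>> for name in ["b", "a"]:
...     results[name] = name.upper()
>>> list(results)  # insertion order preserved by plain dict
['b', 'a']
>>> {"a": 1, "b": 2} == {"b": 2, "a": 1}  # dict equality ignores order
True
>>> OrderedDict(a=1, b=2) == OrderedDict(b=2, a=1)  # OrderedDict does not
False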
@@ -443,7 +445,7 @@ def _get_index() -> Index: return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): - result = OrderedDict() + result = {} for name, group in self: group.name = name @@ -809,6 +811,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): periods=periods, fill_method=fill_method, limit=limit, freq=freq ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -1119,7 +1124,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: axis = self.axis obj = self._obj_with_exclusions - result: OrderedDict = OrderedDict() + result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) @@ -1136,7 +1141,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result: OrderedDict = OrderedDict() + result: Dict[Union[int, str], NDFrame] = {} cannot_agg = [] for item in obj: data = obj[item] @@ -1574,6 +1579,19 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) + def __getitem__(self, key): + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise warning + warnings.warn( + "Indexing with multiple keys (implicitly converted to a tuple " + "of keys) will be deprecated, use a list instead.", + FutureWarning, + stacklevel=2, + ) + return super().__getitem__(key) + def _gotitem(self, key, ndim: int, subset=None): """ sub-classes to define @@ -1874,7 +1892,7 @@ def _normalize_keyword_aggregation(kwargs): Normalize user-provided "named aggregation" kwargs. Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. + to the old Dict[str, List[scalar]]]. Parameters ---------- @@ -1892,11 +1910,11 @@ def _normalize_keyword_aggregation(kwargs): Examples -------- >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + ({'input': ['sum']}, ('output',), [('input', 'sum')]) """ # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. - # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # TODO: aggspec type: typing.Dict[str, List[AggScalar]] # May be hitting https://github.com/python/mypy/issues/5958 # saying it doesn't have an attribute __name__ aggspec = defaultdict(list) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 227547daf3668..233bdd11b372b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -236,7 +236,7 @@ class providing the base-class of operations. 
Notes ----- See more `here -`_ +`_ Examples -------- @@ -325,7 +325,7 @@ def f(self): f.__name__ = "plot" return self._groupby.apply(f) - def __getattr__(self, name): + def __getattr__(self, name: str): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) @@ -485,8 +485,8 @@ def get_converter(s): except KeyError: # turns out it wasn't a tuple msg = ( - "must supply a same-length tuple to get_group" - " with multiple grouping keys" + "must supply a same-length tuple to get_group " + "with multiple grouping keys" ) raise ValueError(msg) @@ -570,7 +570,7 @@ def _set_result_index_ordered(self, result): def _dir_additions(self): return self.obj._dir_additions() | self._apply_whitelist - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: @@ -2099,17 +2099,17 @@ def rank( Parameters ---------- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. ascending : bool, default True False for ranks by high (1) to low (N). na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. pct : bool, default False Compute percentage rank of data within each group. axis : int, default 0 @@ -2362,6 +2362,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 axis=axis, ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -2377,6 +2380,8 @@ def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2390,6 +2395,10 @@ def head(self, n=5): A B 0 1 2 2 5 6 + >>> df.groupby('A').head(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array() < n @@ -2405,6 +2414,8 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. 
+ Returns ------- Series or DataFrame @@ -2418,6 +2429,10 @@ def tail(self, n=5): A B 1 a 2 3 b 2 + >>> df.groupby('A').tail(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n @@ -2528,9 +2543,9 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, -): +) -> GroupBy: - klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] + klass: Type[GroupBy] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2c224a1bef338..0b89e702c9867 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -3,7 +3,7 @@ split-apply-combine paradigm. """ -from typing import Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Tuple import numpy as np @@ -34,8 +34,7 @@ class Grouper: """ - A Grouper allows the user to specify a groupby instruction for a target - object. + A Grouper allows the user to specify a groupby instruction for an object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target @@ -47,17 +46,18 @@ class Grouper: Parameters ---------- key : str, defaults to None - groupby key, which selects the grouping column of the target + Groupby key, which selects the grouping column of the target. level : name/number, defaults to None - the level for the target index + The level for the target index. freq : str / frequency object, defaults to None This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification of available frequencies, please see `here - `_. - axis : number/name of the axis, defaults to 0 + `_. + axis : str, int, defaults to 0 + Number/name of the axis. sort : bool, default to False - whether to sort the resulting labels + Whether to sort the resulting labels. closed : {'left' or 'right'} Closed end of interval. Only when `freq` parameter is passed. label : {'left' or 'right'} @@ -194,7 +194,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, is_copy=False) + obj = obj.take(indexer, axis=self.axis) self.obj = obj self.grouper = ax @@ -419,7 +419,7 @@ def _make_codes(self) -> None: self._group_index = uniques @cache_readonly - def groups(self) -> dict: + def groups(self) -> Dict[Hashable, np.ndarray]: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) @@ -605,8 +605,8 @@ def is_in_obj(gpr) -> bool: if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]})" - " must be same length" + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " + "must be same length" ) # create the Grouping diff --git a/pandas/core/index.py b/pandas/core/index.py index a9c8e6731a17e..8cff53d7a8b74 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -27,4 +27,5 @@ "pandas.core.index is deprecated and will be removed in a future version. 
" "The public classes are available in the top-level namespace.", FutureWarning, + stacklevel=2, ) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index f75087ca3b505..4d45769d2fea9 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,6 +3,8 @@ """ import numpy as np +from pandas._typing import AnyArrayLike + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int: elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") + + +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check if `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length, and the + dtype is validated. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + + See Also + -------- + api.types.is_bool_dtype : Check if `key` is of boolean dtype. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.array([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 1904456848396..4072d06b9427c 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -198,6 +198,7 @@ def conv(i): result = indexes[0] if hasattr(result, "union_many"): + # DatetimeIndex return result.union_many(indexes[1:]) else: for other in indexes[1:]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a3808f6f4a37e..62e3fd28f6684 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import FrozenSet, Hashable, Optional, Union +from typing import Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -107,6 +107,11 @@ def cmp_method(self, other): if is_object_dtype(self) and isinstance(other, ABCCategorical): left = type(other)(self._values, dtype=other.dtype) return op(left, other) + elif is_object_dtype(self) and isinstance(other, ExtensionArray): + # e.g. 
PeriodArray + with np.errstate(all="ignore"): + result = op(self.values, other) + elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): @@ -240,6 +245,10 @@ def _outer_indexer(self, left, right): _data: Union[ExtensionArray, np.ndarray] _id = None _name: Optional[Hashable] = None + # MultiIndex.levels previously allowed setting the index name. We + # don't allow this anymore, and raise if it happens rather than + # failing silently. + _no_setting_name: bool = False _comparables = ["name"] _attributes = ["name"] _is_numeric_dtype = False @@ -268,11 +277,11 @@ def __new__( cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, ) -> "Index": - from .range import RangeIndex + from pandas.core.indexes.range import RangeIndex from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex - from .numeric import Float64Index, Int64Index, UInt64Index - from .interval import IntervalIndex - from .category import CategoricalIndex + from pandas.core.indexes.numeric import Float64Index, Int64Index, UInt64Index + from pandas.core.indexes.interval import IntervalIndex + from pandas.core.indexes.category import CategoricalIndex name = maybe_extract_name(name, data, cls) @@ -291,11 +300,15 @@ def __new__( return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - elif ( - is_interval_dtype(data) or is_interval_dtype(dtype) - ) and not is_object_dtype(dtype): - closed = kwargs.get("closed", None) - return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + elif is_interval_dtype(data) or is_interval_dtype(dtype): + closed = kwargs.pop("closed", None) + if is_dtype_equal(_o_dtype, dtype): + return IntervalIndex( + data, name=name, copy=copy, closed=closed, **kwargs + ).astype(object) + return IntervalIndex( + data, dtype=dtype, name=name, copy=copy, closed=closed, **kwargs + ) elif ( is_datetime64_any_dtype(data) @@ -325,8 +338,10 @@ def __new__( else: return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) - elif is_period_dtype(data) and not is_object_dtype(dtype): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + elif is_period_dtype(data) or is_period_dtype(dtype): + if is_dtype_equal(_o_dtype, dtype): + return PeriodIndex(data, copy=False, name=name, **kwargs).astype(object) + return PeriodIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -349,41 +364,8 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. 
- return Float64Index(data, copy=copy, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": - pass - else: - data = data.astype(dtype) - else: - data = np.array(data, dtype=dtype, copy=copy) + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? # maybe coerce to a sub-class if is_signed_integer_dtype(data.dtype): @@ -403,45 +385,17 @@ def __new__( subarr = subarr.copy() if dtype is None: - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "integer": - try: - return cls._try_convert_to_int_index(subarr, copy, name, dtype) - except ValueError: - pass - - return Index(subarr, copy=copy, dtype=object, name=name) - elif inferred in ["floating", "mixed-integer-float", "integer-na"]: - # TODO: Returns IntegerArray for integer-na case in the future - return Float64Index(subarr, copy=copy, name=name) - elif inferred == "interval": - try: - return IntervalIndex(subarr, name=name, copy=copy) - except ValueError: - # GH27172: mixed closed Intervals --> object dtype - pass - elif inferred == "boolean": - # don't support boolean explicitly ATM - pass - elif inferred != "string": - if inferred.startswith("datetime"): - try: - return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except (ValueError, OutOfBoundsDatetime): - # GH 27011 - # If we have mixed timezones, just send it - # down the base constructor - pass - - elif inferred.startswith("timedelta"): - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) - elif inferred == "period": - try: - return PeriodIndex(subarr, name=name, **kwargs) - except IncompatibleFrequency: - pass + new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) + if new_dtype is not None: + return cls( + new_data, dtype=new_dtype, copy=False, name=name, **kwargs + ) + if kwargs: raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") return cls._simple_new(subarr, name, **kwargs) elif hasattr(data, "__array__"): @@ -457,7 +411,7 @@ def __new__( if data and all(isinstance(e, tuple) for e in data): # we must be all tuples, otherwise don't construct # 10697 - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get("names") @@ -646,7 +600,7 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The array interface, return my values. """ @@ -657,7 +611,7 @@ def __array_wrap__(self, result, context=None): Gets called after a ufunc. 
""" result = lib.item_from_zerodim(result) - if is_bool_dtype(result) or lib.is_scalar(result): + if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: return result attrs = self._get_attributes_dict() @@ -728,7 +682,7 @@ def astype(self, dtype, copy=True): return self.copy() if copy else self elif is_categorical_dtype(dtype): - from .category import CategoricalIndex + from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) @@ -736,11 +690,10 @@ def astype(self, dtype, copy=True): return Index(np.asarray(self), dtype=dtype, copy=copy) try: - return Index( - self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype - ) + casted = self.values.astype(dtype, copy=copy) except (TypeError, ValueError): raise TypeError(f"Cannot cast {type(self).__name__} to dtype {dtype}") + return Index(casted, name=self.name, dtype=dtype) _index_shared_docs[ "take" @@ -1214,6 +1167,12 @@ def name(self): @name.setter def name(self, value): + if self._no_setting_name: + # Used in MultiIndex.levels to avoid silently ignoring name updates. + raise RuntimeError( + "Cannot set name on a level of a MultiIndex. Use " + "'MultiIndex.set_names' instead." + ) maybe_extract_name(value, None, type(self)) self._name = value @@ -1557,7 +1516,7 @@ def droplevel(self, level=0): result._name = new_names[0] return result else: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex( levels=new_levels, @@ -1769,35 +1728,6 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_Index, (type(self), d), None - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ - - if isinstance(state, dict): - self._data = state.pop("data") - for k, v in state.items(): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self._name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - # -------------------------------------------------------------------- # Null Handling Methods @@ -1990,7 +1920,7 @@ def dropna(self, how="any"): raise ValueError(f"invalid how option: {how}") if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy(self._values[~self._isnan]) return self._shallow_copy() # -------------------------------------------------------------------- @@ -2353,11 +2283,11 @@ def _union(self, other, sort): return other._get_reconciled_name_object(self) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): + if is_datetime64tz_dtype(self): lvals = self._ndarray_values else: lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): + if is_datetime64tz_dtype(other): rvals = other._ndarray_values else: rvals = other._values @@ -2456,14 +2386,8 @@ def intersection(self, other, sort=False): return this.intersection(other, sort=sort) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + lvals = self._values + rvals = 
other._values if self.is_monotonic and other.is_monotonic: try: @@ -2482,18 +2406,13 @@ def intersection(self, other, sort=False): indexer = indexer[indexer != -1] taken = other.take(indexer) + res_name = get_op_result_name(self, other) if sort is None: taken = algos.safe_sort(taken.values) - if self.name != other.name: - name = None - else: - name = self.name - return self._shallow_copy(taken, name=name) - - if self.name != other.name: - taken.name = None + return self._shallow_copy(taken, name=res_name) + taken.name = res_name return taken def difference(self, other, sort=None): @@ -2622,11 +2541,11 @@ def symmetric_difference(self, other, result_name=None, sort=None): left_indexer = np.setdiff1d( np.arange(this.size), common_indexer, assume_unique=True ) - left_diff = this.values.take(left_indexer) + left_diff = this._values.take(left_indexer) # {other} minus {this} right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + right_diff = other._values.take(right_indexer) the_diff = concat_compat([left_diff, right_diff]) if sort is None: @@ -2910,12 +2829,12 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) @@ -2923,11 +2842,11 @@ def _convert_scalar_indexer(self, key, kind=None): if len(self) and not isinstance(self, ABCMultiIndex): # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) + # is positional indexing (eg. 
.loc on with a float) # or label indexing if we are using a type able # to be represented in the index - if kind in ["getitem", "ix"] and is_float(key): + if kind in ["getitem"] and is_float(key): if not self.is_floating(): return self._invalid_indexer("label", key) @@ -2963,12 +2882,12 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key: slice, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # validate iloc if kind == "iloc": @@ -3107,7 +3026,7 @@ def _convert_index_indexer(self, keyarr): @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): if ( - kind in [None, "iloc", "ix"] + kind in [None, "iloc"] and is_integer_dtype(keyarr) and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex) @@ -3411,7 +3330,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return join_index def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex from pandas.core.reshape.merge import _restore_dropped_levels_multijoin # figure out join names @@ -3432,8 +3351,13 @@ def _join_multi(self, other, how, return_indexers=True): ldrop_names = list(self_names - overlap) rdrop_names = list(other_names - overlap) - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + # if only the order differs + if not len(ldrop_names + rdrop_names): + self_jnlevels = self + other_jnlevels = other.reorder_levels(self.names) + else: + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) # Join left and right # Join on same leveled multi-index frames is supported @@ -3513,7 +3437,7 @@ def _join_level( MultiIndex will not be changed; otherwise, it will tie out with `other`. """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex def _get_leaf_sorter(labels): """ @@ -3839,50 +3763,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desired - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. 
- try: - res = data.astype("u8", copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor @@ -4024,6 +3904,9 @@ def __getitem__(self, key): key = com.values_from_object(key) result = getitem(key) if not is_scalar(result): + if np.ndim(result) > 1: + deprecate_ndim_indexing(result) + return result return promote(result) else: return result @@ -4594,7 +4477,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values): + def groupby(self, values) -> Dict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. @@ -4605,7 +4488,7 @@ def groupby(self, values): Returns ------- - groups : dict + dict {group name -> group labels} """ @@ -4641,7 +4524,7 @@ def map(self, mapper, na_action=None): a MultiIndex will be returned. """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex new_values = super()._map_values(mapper, na_action=na_action) @@ -4821,7 +4704,7 @@ def _validate_indexer(self, form, key, kind): If we are positional indexer, validate that we have appropriate typed bounds must be an integer. """ - assert kind in ["ix", "loc", "getitem", "iloc"] + assert kind in ["loc", "getitem", "iloc"] if key is None: pass @@ -4842,7 +4725,7 @@ def _validate_indexer(self, form, key, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -4855,15 +4738,14 @@ def _validate_indexer(self, form, key, kind): @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them if is_float(label): - if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): - self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) # we are trying to find integer bounds on a non-integer based index # this is rejected (generally .loc gets you here) @@ -4897,14 +4779,14 @@ def get_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- int Index of label. """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if side not in ("left", "right"): raise ValueError( @@ -4964,7 +4846,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): If None, defaults to the end. step : int, defaults None If None, defaults to 1. 
- kind : {'ix', 'loc', 'getitem'} or None + kind : {'loc', 'getitem'} or None Returns ------- @@ -5362,7 +5244,7 @@ def ensure_index_from_sequences(sequences, names=None): -------- ensure_index """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex if len(sequences) == 1: if names is not None: @@ -5423,7 +5305,7 @@ def ensure_index(index_like, copy=False): converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_arrays(converted) else: @@ -5486,3 +5368,186 @@ def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: raise TypeError(f"{cls.__name__}.name must be a hashable type") return name + + +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _maybe_cast_data_without_dtype(subarr): + """ + If we have an arraylike input but no passed dtype, try to infer + a supported dtype. 
+ + Parameters + ---------- + subarr : np.ndarray, Index, or Series + + Returns + ------- + converted : np.ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + """ + # Runtime import needed bc IntervalArray imports Index + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + DatetimeArray, + TimedeltaArray, + ) + + inferred = lib.infer_dtype(subarr, skipna=False) + + if inferred == "integer": + try: + data = _try_convert_to_int_array(subarr, False, None) + return data, data.dtype + except ValueError: + pass + + return subarr, object + + elif inferred in ["floating", "mixed-integer-float", "integer-na"]: + # TODO: Returns IntegerArray for integer-na case in the future + return subarr, np.float64 + + elif inferred == "interval": + try: + data = IntervalArray._from_sequence(subarr, copy=False) + return data, data.dtype + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass + elif inferred == "boolean": + # don't support boolean explicitly ATM + pass + elif inferred != "string": + if inferred.startswith("datetime"): + try: + data = DatetimeArray._from_sequence(subarr, copy=False) + return data, data.dtype + except (ValueError, OutOfBoundsDatetime): + # GH 27011 + # If we have mixed timezones, just send it + # down the base constructor + pass + + elif inferred.startswith("timedelta"): + data = TimedeltaArray._from_sequence(subarr, copy=False) + return data, data.dtype + elif inferred == "period": + try: + data = PeriodArray._from_sequence(subarr) + return data, data.dtype + except IncompatibleFrequency: + pass + + return subarr, subarr.dtype + + +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. + """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. + try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError + + +def deprecate_ndim_indexing(result): + if np.ndim(result) > 1: + # GH#27125 indexer like idx[:, None] expands dim, but we + # cannot do that and keep an index, so return ndarray + # Deprecation GH#30588 + warnings.warn( + "Support for multi-dimensional indexing (e.g. `index[:, None]`) " + "on an Index is deprecated and will be removed in a future " + "version. 
Convert to a numpy array before indexing instead.", + DeprecationWarning, + stacklevel=3, + ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ba476f9e25ee6..a247a986fcb55 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,5 @@ -import operator -from typing import Any +from typing import Any, List +import warnings import numpy as np @@ -8,9 +8,7 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 from pandas._typing import AnyArrayLike -import pandas.compat as compat -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, @@ -26,10 +24,10 @@ from pandas.core import accessor from pandas.core.algorithms import take_1d from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains -from pandas.core.base import _shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.extension import ExtensionIndex import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -37,6 +35,12 @@ _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) +@accessor.delegate_names( + delegate=Categorical, + accessors=["codes", "categories", "ordered"], + typ="property", + overwrite=True, +) @accessor.delegate_names( delegate=Categorical, accessors=[ @@ -50,11 +54,17 @@ "as_unordered", "min", "max", + "is_dtype_equal", + "tolist", + "_internal_get_values", + "_reverse_indexer", + "searchsorted", + "argsort", ], typ="method", overwrite=True, ) -class CategoricalIndex(Index, accessor.PandasDelegate): +class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. @@ -121,7 +131,7 @@ class CategoricalIndex(Index, accessor.PandasDelegate): Notes ----- See the `user guide - `_ + `_ for more. 
Examples @@ -147,6 +157,20 @@ class CategoricalIndex(Index, accessor.PandasDelegate): _typ = "categoricalindex" + _raw_inherit = { + "argsort", + "_internal_get_values", + "tolist", + "codes", + "categories", + "ordered", + "_reverse_indexer", + "searchsorted", + } + + codes: np.ndarray + categories: Index + @property def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need @@ -164,13 +188,7 @@ def _engine_type(self): # Constructors def __new__( - cls, - data=None, - categories=None, - ordered=None, - dtype=None, - copy=False, - name=None, + cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None ): dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) @@ -243,16 +261,15 @@ def _create_categorical(cls, data, dtype=None): return data @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) values = cls._create_categorical(values, dtype=dtype) result._data = values result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() + result._no_setting_name = False return result # -------------------------------------------------------------------- @@ -361,38 +378,12 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def itemsize(self): - # Size of the items in categories, not codes. - return self.values.itemsize - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) + # We use _shallow_copy rather than the Index implementation + # (which uses _constructor) in order to preserve dtype. return self._shallow_copy(result, name=name) - def _internal_get_values(self): - # override base Index version to get the numpy array representation of - # the underlying Categorical - return self._data._internal_get_values() - - def tolist(self): - return self._data.tolist() - - @property - def codes(self): - return self._data.codes - - @property - def categories(self): - return self._data.categories - - @property - def ordered(self): - return self._data.ordered - - def _reverse_indexer(self): - return self._data._reverse_indexer() - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. @@ -401,7 +392,7 @@ def __contains__(self, key) -> bool: return contains(self, key, container=self._engine) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return np.array(self._data, dtype=dtype) @@ -417,7 +408,7 @@ def astype(self, dtype, copy=True): if dtype == self.dtype: return self.copy() if copy else self - return super().astype(dtype=dtype, copy=copy) + return Index.astype(self, dtype=dtype, copy=copy) @cache_readonly def _isnan(self): @@ -429,9 +420,6 @@ def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) - def argsort(self, *args, **kwargs): - return self.values.argsort(*args, **kwargs) - @cache_readonly def _engine(self): # we are going to look things up with the codes themselves. 
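The hand-written `codes`/`categories`/`tolist`/`searchsorted` wrappers deleted in this file are regenerated by the `accessor.delegate_names` decorators stacked on the class above, which forward attribute access to the underlying `Categorical` stored in `self._data`. A minimal sketch of that delegation pattern (simplified, not the real `PandasDelegate` machinery):

def delegate_names(accessors, typ):
    # Attach pass-through accessors that forward to self._data.
    def decorator(cls):
        for name in accessors:
            if typ == "property":
                # Bind `name` via a default argument to avoid the
                # late-binding closure pitfall inside the loop.
                setattr(cls, name,
                        property(lambda self, _n=name: getattr(self._data, _n)))
            else:  # typ == "method"
                def make_method(_n):
                    def method(self, *args, **kwargs):
                        return getattr(self._data, _n)(*args, **kwargs)
                    method.__name__ = _n
                    return method
                setattr(cls, name, make_method(name))
        return cls
    return decorator

@delegate_names(accessors=["codes", "ordered"], typ="property")
@delegate_names(accessors=["tolist"], typ="method")
class TinyCategoricalIndex:
    def __init__(self, data):
        self._data = data  # a Categorical-like object

>>> import pandas as pd
>>> idx = TinyCategoricalIndex(pd.Categorical(["a", "b", "a"]))
>>> idx.codes
array([0, 1, 0], dtype=int8)
>>> idx.tolist()
['a', 'b', 'a']

The real decorator additionally receives the `delegate` class and an `overwrite` flag, and the `_raw_inherit` set above marks which delegated results `_delegate_method` should return raw instead of re-wrapping in a new `CategoricalIndex`.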
@@ -440,19 +428,6 @@ def _engine(self): codes = self.codes return self._engine_type(lambda: codes, len(self)) - # introspection - @cache_readonly - def is_unique(self) -> bool: - return self._engine.is_unique - - @property - def is_monotonic_increasing(self): - return self._engine.is_monotonic_increasing - - @property - def is_monotonic_decreasing(self) -> bool: - return self._engine.is_monotonic_decreasing - @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: @@ -539,11 +514,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - @Substitution(klass="CategoricalIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - return self._data.searchsorted(value, side=side, sorter=sorter) - @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -583,6 +553,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ibase.ensure_index(target) + missing: List[int] if self.equals(target): indexer = None missing = [] @@ -732,23 +703,14 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - taken = self._assert_take_fillable( - self.codes, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1, + def take_nd(self, *args, **kwargs): + """Alias for `take`""" + warnings.warn( + "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead", + FutureWarning, + stacklevel=2, ) - return self._create_from_codes(taken) - - def is_dtype_equal(self, other): - return self._data.is_dtype_equal(other) - - take_nd = take + return self.take(*args, **kwargs) @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): @@ -881,34 +843,10 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _codes_for_groupby(self, sort, observed): - """ Return a Categorical adjusted for groupby """ - return self.values._codes_for_groupby(sort, observed) - - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - - def _make_compare(op): - opname = f"__{op.__name__}__" - - def _evaluate_compare(self, other): - with np.errstate(all="ignore"): - result = op(self.array, other) - if isinstance(result, ABCSeries): - # Dispatch to pd.Categorical returned NotImplemented - # and we got a Series back; down-cast to ndarray - result = result._values - return result - - return compat.set_function_name(_evaluate_compare, opname, cls) - - cls.__eq__ = _make_compare(operator.eq) - cls.__ne__ = _make_compare(operator.ne) - cls.__lt__ = _make_compare(operator.lt) - cls.__gt__ = _make_compare(operator.gt) - cls.__le__ = _make_compare(operator.le) - cls.__ge__ = _make_compare(operator.ge) + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._values, name) + return prop # no wrapping for now def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ @@ -916,7 +854,7 @@ def _delegate_method(self, name, *args, **kwargs): if 
"inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if is_scalar(res): + if is_scalar(res) or name in self._raw_inherit: return res return CategoricalIndex(res, name=self.name) @@ -924,4 +862,3 @@ def _delegate_method(self, name, *args, **kwargs): CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() -CategoricalIndex._add_comparison_methods() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7ba04fc9d2fea..9eb5ed7cb0911 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,12 +2,13 @@ Base and utility classes for tseries type pandas objects. """ import operator -from typing import List, Set +from typing import List, Optional, Set import numpy as np -from pandas._libs import NaT, iNaT, lib +from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import timezones from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly @@ -15,24 +16,32 @@ from pandas.core.dtypes.common import ( ensure_int64, is_bool_dtype, + is_categorical_dtype, is_dtype_equal, is_float, is_integer, is_list_like, is_period_dtype, is_scalar, + needs_i8_conversion, ) +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna -from pandas.core import algorithms, ops +from pandas.core import algorithms from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin, - _ensure_datetimelike_to_i8, -) +from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.extension import ( + ExtensionIndex, + inherit_names, + make_wrapped_arith_op, +) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta from pandas.tseries.frequencies import DateOffset, to_offset @@ -40,139 +49,59 @@ _index_doc_kwargs = dict(ibase._index_doc_kwargs) -def ea_passthrough(array_method): +def _join_i8_wrapper(joinf, with_indexers: bool = True): """ - Make an alias for a method of the underlying ExtensionArray. - - Parameters - ---------- - array_method : method on an Array class - - Returns - ------- - method + Create the join wrapper methods. 
""" - def method(self, *args, **kwargs): - return array_method(self._data, *args, **kwargs) - - method.__name__ = array_method.__name__ - method.__doc__ = array_method.__doc__ - return method - - -def _make_wrapped_arith_op(opname): - def method(self, other): - meth = getattr(self._data, opname) - result = meth(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - method.__name__ = opname - return method - + @staticmethod # type: ignore + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + left = left.view("i8") + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + right = right.view("i8") -class DatetimeTimedeltaMixin: - """ - Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, - but not PeriodIndex - """ - - def _set_freq(self, freq): - """ - Set the _freq attribute on our underlying DatetimeArray. + results = joinf(left, right) + if with_indexers: + # dtype should be timedelta64[ns] for TimedeltaIndex + # and datetime64[ns] for DatetimeIndex + dtype = left.dtype.base - Parameters - ---------- - freq : DateOffset, None, or "infer" - """ - # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, DateOffset): - # Always valid. In the TimedeltaIndex case, we assume this - # is a Tick offset. - pass - else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results - self._data._freq = freq + return wrapper -class DatetimeIndexOpsMixin(ExtensionOpsMixin): +@inherit_names( + ["inferred_freq", "_isnan", "_resolution", "resolution"], + DatetimeLikeArrayMixin, + cache=True, +) +@inherit_names( + ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"], + DatetimeLikeArrayMixin, +) +class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. """ _data: ExtensionArray + freq: Optional[DateOffset] + freqstr: Optional[str] + _resolution: int + _bool_ops: List[str] = [] + _field_ops: List[str] = [] - # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are - # properties there. They can be made into cache_readonly for Index - # subclasses bc they are immutable - inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget # type: ignore - ) - _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget # type: ignore - ) - resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - - _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) - __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) - mean = ea_passthrough(DatetimeLikeArrayMixin.mean) - - @property - def freq(self): - """ - Return the frequency object if it is set, otherwise None. - """ - return self._data.freq - - @property - def freqstr(self): - """ - Return the frequency object as a string if it is set, otherwise None. 
- """ - return self._data.freqstr - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - - result = self._data.unique() - - # Note: if `self` is already unique, then self.unique() should share - # a `freq` with self. If not already unique, then self.freq must be - # None, so again sharing freq is correct. - return self._shallow_copy(result._data) - - @classmethod - def _create_comparison_method(cls, op): - """ - Create a comparison method that dispatches to ``cls.values``. - """ - - def wrapper(self, other): - if isinstance(other, ABCSeries): - # the arrays defer to Series for comparison ops but the indexes - # don't, so we have to unwrap here. - other = other._values - - result = op(self._data, maybe_unwrap_index(other)) - return result - - wrapper.__doc__ = op.__doc__ - wrapper.__name__ = f"__{op.__name__}__" - return wrapper @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values + def is_all_dates(self) -> bool: + return True # ------------------------------------------------------------------------ # Abstract data attributes @@ -182,11 +111,6 @@ def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data - @property # type: ignore # https://github.com/python/mypy/issues/1362 - @Appender(DatetimeLikeArrayMixin.asi8.__doc__) - def asi8(self): - return self._data.asi8 - def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. @@ -203,7 +127,7 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -226,55 +150,8 @@ def equals(self, other): # have different timezone return False - elif is_period_dtype(self): - if not is_period_dtype(other): - return False - if self.freq != other.freq: - return False - return np.array_equal(self.asi8, other.asi8) - @staticmethod - def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ - Create the join wrapper methods. 
- """ - from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin - - @staticmethod - def wrapper(left, right): - if isinstance( - left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - left = left.view("i8") - if isinstance( - right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - right = right.view("i8") - results = joinf(left, right) - if with_indexers: - join_index, left_indexer, right_indexer = results - join_index = join_index.view(dtype) - return join_index, left_indexer, right_indexer - return results - - return wrapper - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, "tz", None): - # ensure_localized is only relevant for tz-aware DTI - result = self._data._ensure_localized( - arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc - ) - return type(self)._simple_new(result, name=self.name) - return arg - - def _box_values(self, values): - return self._data._box_values(values) - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): try: @@ -342,18 +219,10 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if isinstance(maybe_slice, slice): return self[maybe_slice] - taken = self._assert_take_fillable( - self.asi8, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=iNaT, + return ExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs ) - # keep freq in PeriodArray/Index, reset otherwise - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(taken, freq=freq) - _can_hold_na = True _na_value = NaT @@ -519,10 +388,10 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem @@ -531,56 +400,27 @@ def _convert_scalar_indexer(self, key, kind=None): is_flt = is_float(key) if kind in ["loc"] and (is_int or is_flt): self._invalid_indexer("index", key) - elif kind in ["ix", "getitem"] and is_flt: + elif kind in ["getitem"] and is_flt: self._invalid_indexer("index", key) return super()._convert_scalar_indexer(key, kind=kind) - @classmethod - def _add_datetimelike_methods(cls): - """ - Add in the datetimelike methods (as we may have to override the - superclass). 
- """ - - def __add__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__add__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__add__ = __add__ - - def __radd__(self, other): - # alias for __add__ - return self.__add__(other) - - cls.__radd__ = __radd__ - - def __sub__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__sub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__sub__ = __sub__ - - def __rsub__(self, other): - result = self._data.__rsub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__rsub__ = __rsub__ - - __pow__ = _make_wrapped_arith_op("__pow__") - __rpow__ = _make_wrapped_arith_op("__rpow__") - __mul__ = _make_wrapped_arith_op("__mul__") - __rmul__ = _make_wrapped_arith_op("__rmul__") - __floordiv__ = _make_wrapped_arith_op("__floordiv__") - __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__") - __mod__ = _make_wrapped_arith_op("__mod__") - __rmod__ = _make_wrapped_arith_op("__rmod__") - __divmod__ = _make_wrapped_arith_op("__divmod__") - __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") - __truediv__ = _make_wrapped_arith_op("__truediv__") - __rtruediv__ = _make_wrapped_arith_op("__rtruediv__") + __add__ = make_wrapped_arith_op("__add__") + __radd__ = make_wrapped_arith_op("__radd__") + __sub__ = make_wrapped_arith_op("__sub__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + __truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") def isin(self, values, level=None): """ @@ -606,79 +446,29 @@ def isin(self, values, level=None): return algorithms.isin(self.asi8, values.asi8) - def intersection(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - - if self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return self.copy() - if len(other) == 0: - return other.copy() - - if not isinstance(other, type(self)): - result = Index.intersection(self, other, sort=sort) - if isinstance(result, type(self)): - if result.freq is None: - result._set_freq("infer") - return result - - elif ( - other.freq is None - or self.freq is None - or other.freq != self.freq - or not other.freq.is_anchored() - or (not self.is_monotonic or not other.is_monotonic) - ): - result = Index.intersection(self, other, sort=sort) + @Appender(_index_shared_docs["where"] % _index_doc_kwargs) + def where(self, cond, other=None): + values = self.view("i8") - # Invalidate the freq of `result`, which may not be correct at - # this point, depending on the values. 
+ if is_scalar(other) and isna(other): + other = NaT.value - result._set_freq(None) - if hasattr(self, "tz"): - result = self._shallow_copy( - result._values, name=result.name, tz=result.tz, freq=None - ) - else: - result = self._shallow_copy(result._values, name=result.name, freq=None) - if result.freq is None: - result._set_freq("infer") - return result - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other else: - left, right = other, self + # Do type inference if necessary up front + # e.g. we passed PeriodIndex.values and got an ndarray of Periods + other = Index(other) - # after sorting, the intersection always starts with the right index - # and ends with the index of which the last elements is smallest - end = min(left[-1], right[-1]) - start = right[0] + if is_categorical_dtype(other): + # e.g. we have a Categorical holding self.dtype + if needs_i8_conversion(other.categories): + other = other._internal_get_values() - if end < start: - return type(self)(data=[]) - else: - lslice = slice(*left.slice_locs(start, end)) - left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) + if not is_dtype_equal(self.dtype, other.dtype): + raise TypeError(f"Where requires matching dtype, not {other.dtype}") - @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) + other = other.view("i8") - @Appender(_index_shared_docs["where"] % _index_doc_kwargs) - def where(self, cond, other=None): - other = _ensure_datetimelike_to_i8(other, to_utc=True) - values = _ensure_datetimelike_to_i8(self, to_utc=True) result = np.where(cond, values, other).astype("i8") - - result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result) def _summary(self, name=None): @@ -732,18 +522,6 @@ def _concat_same_dtype(self, to_concat, name): return self._simple_new(new_data, **attribs) - @Appender(_index_shared_docs["astype"]) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype) and copy is False: - # Ensure that self.astype(self.dtype) is self - return self - - new_values = self._data.astype(dtype, copy=copy) - - # pass copy=False because any copying will be done in the - # _data.astype call above - return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. 
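Note: the rewritten `where` above works entirely on the int64 view of the data: a scalar NA becomes the `iNaT` sentinel, array-likes are coerced through `Index` and checked for dtype compatibility, and the `np.where` result is re-wrapped with `_shallow_copy`. A minimal standalone sketch of the same i8 round-trip for a tz-naive `DatetimeIndex` (the `where_i8` helper is hypothetical and dtype validation is omitted):

```python
import numpy as np
import pandas as pd

def where_i8(dti: pd.DatetimeIndex, cond: np.ndarray) -> pd.DatetimeIndex:
    values = dti.asi8                 # int64 view of datetime64[ns]
    other = np.int64(pd.NaT.value)    # scalar NA -> iNaT sentinel
    result = np.where(cond, values, other).astype("i8")
    return pd.DatetimeIndex(result.view("M8[ns]"), name=dti.name)

dti = pd.date_range("2020-01-01", periods=3)
where_i8(dti, np.array([True, False, True]))
# DatetimeIndex(['2020-01-01', 'NaT', '2020-01-03'], dtype='datetime64[ns]', freq=None)
```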
@@ -777,45 +555,339 @@ def shift(self, periods=1, freq=None): result = self._data._time_shift(periods, freq=freq) return type(self)(result, name=self.name) + # -------------------------------------------------------------------- + # List-like Methods -def wrap_arithmetic_op(self, other, result): - if result is NotImplemented: - return NotImplemented - - if isinstance(result, tuple): - # divmod, rdivmod - assert len(result) == 2 - return ( - wrap_arithmetic_op(self, other, result[0]), - wrap_arithmetic_op(self, other, result[1]), - ) + def delete(self, loc): + new_i8s = np.delete(self.asi8, loc) - if not isinstance(result, Index): - # Index.__new__ will choose appropriate subclass for dtype - result = Index(result) + freq = None + if is_period_dtype(self): + freq = self.freq + elif is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq - res_name = ops.get_op_result_name(self, other) - result.name = res_name - return result + return self._shallow_copy(new_i8s, freq=freq) -def maybe_unwrap_index(obj): +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex """ - If operating against another Index object, we need to unwrap the underlying - data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray - implementation, otherwise we will incorrectly return NotImplemented. - Parameters - ---------- - obj : object + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique - Returns - ------- - unwrapped object - """ - if isinstance(obj, ABCIndexClass): - return obj._data - return obj + def _set_freq(self, freq): + """ + Set the _freq attribute on our underlying DatetimeArray. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaIndex case, we assume this + # is a Tick offset. + pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + self._data._freq = freq + + def _shallow_copy(self, values=None, **kwargs): + if values is None: + values = self._data + if isinstance(values, type(self)): + values = values._data + + attributes = self._get_attributes_dict() + + if "freq" not in kwargs and self.freq is not None: + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.freq is None: + del attributes["freq"] + + attributes.update(kwargs) + return self._simple_new(values, **attributes) + + # -------------------------------------------------------------------- + # Set Operation Methods + + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx._set_freq(None) + return new_idx + + def intersection(self, other, sort=False): + """ + Specialized intersection for DatetimeIndex/TimedeltaIndex. 
+ + May be much faster than Index.intersection + + Parameters + ---------- + other : Same type as self or array-like + sort : False or None, default False + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + .. versionchanged:: 0.25.0 + + The `sort` keyword is added + + Returns + ------- + y : Index or same type as self + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if len(self) == 0: + return self.copy() + if len(other) == 0: + return other.copy() + + if not isinstance(other, type(self)): + result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + result._set_freq("infer") + return result + + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.is_anchored() + or (not self.is_monotonic or not other.is_monotonic) + ): + result = Index.intersection(self, other, sort=sort) + + # Invalidate the freq of `result`, which may not be correct at + # this point, depending on the values. + + result._set_freq(None) + result = self._shallow_copy( + result._data, name=result.name, dtype=result.dtype, freq=None + ) + if result.freq is None: + result._set_freq("infer") + return result + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _can_fast_union(self, other) -> bool: + if not isinstance(other, type(self)): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + freq) or right_start in left + except ValueError: + # if we are comparing a freq that does not propagate timezones + # this will raise + return False + + def _fast_union(self, other, sort=None): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + elif sort is False: + # TDIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side="left") + right_chunk = right.values[:loc] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + left, right = other, self + + left_end = left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side="right") + right_chunk = right.values[loc:] + dates = concat_compat((left.values, right_chunk)) 
+ return self._shallow_copy(dates) + else: + return left + + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other, sort=sort) + else: + result = Index._union(this, other, sort=sort) + if isinstance(result, type(self)): + assert result._data.dtype == this.dtype + if result.freq is None: + result._set_freq("infer") + return result + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) + _left_indexer_unique = _join_i8_wrapper( + libjoin.left_join_indexer_unique, with_indexers=False + ) + + def join( + self, other, how: str = "left", level=None, return_indexers=False, sort=False + ): + """ + See Index.join + """ + if self._is_convertible_to_index_for_join(other): + try: + other = type(self)(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other + + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if ( + isinstance(other, type(self)) + and self.freq == other.freq + and self._can_fast_union(other) + ): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + kwargs = {} + if hasattr(self, "tz"): + kwargs["tz"] = getattr(other, "tz", None) + return self._simple_new(joined, name, **kwargs) class DatetimelikeDelegateMixin(PandasDelegate): @@ -825,8 +897,6 @@ class DatetimelikeDelegateMixin(PandasDelegate): Functionality is delegated from the Index class to an Array class. A few things can be customized - * _delegate_class : type - The class being delegated to. * _delegated_methods, delegated_properties : List The list of property / method names being delegated.
* raw_methods : Set @@ -843,10 +913,6 @@ class DatetimelikeDelegateMixin(PandasDelegate): _raw_properties: Set[str] = set() _data: ExtensionArray - @property - def _delegate_class(self): - raise AbstractMethodError - def _delegate_property_get(self, name, *args, **kwargs): result = getattr(self._data, name) if name not in self._raw_properties: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 108e24ffee820..75515949d1855 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,30 +1,21 @@ -from datetime import datetime, time, timedelta +from datetime import datetime, time, timedelta, tzinfo import operator +from typing import Optional import warnings import numpy as np from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts -import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.dtypes.common import ( - _NS_DTYPE, - ensure_int64, - is_float, - is_integer, - is_list_like, - is_scalar, -) -from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( DatetimeArray, - _to_M8, tz_to_dtype, validate_tz_from_dtype, ) @@ -32,12 +23,10 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, DatetimeTimedeltaMixin, - ea_passthrough, ) -from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.extension import inherit_names from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools @@ -70,8 +59,14 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # We also have a few "extra" attrs, which may or may not be raw, # which we don't want to expose in the .dt accessor.
_extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] - _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] - _extra_raw_properties = ["_box_func", "tz", "tzinfo"] + _extra_raw_methods = [ + "to_pydatetime", + "_local_timestamps", + "_has_same_tz", + "_format_native_types", + "__iter__", + ] + _extra_raw_properties = ["_box_func", "tz", "tzinfo", "dtype"] _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties _delegated_methods = ( DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods @@ -82,9 +77,19 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): | set(_extra_raw_properties) ) _raw_methods = set(_extra_raw_methods) - _delegate_class = DatetimeArray +@inherit_names(["_timezone", "is_normalized", "_resolution"], DatetimeArray, cache=True) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + ], + DatetimeArray, +) @delegate_names( DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" ) @@ -92,11 +97,9 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): DatetimeArray, DatetimeDelegateMixin._delegated_methods, typ="method", - overwrite=False, + overwrite=True, ) -class DatetimeIndex( - DatetimeTimedeltaMixin, DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin -): +class DatetimeIndex(DatetimeTimedeltaMixin, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -195,41 +198,21 @@ class DatetimeIndex( Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. """ _typ = "datetimeindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True - _tz = None - _freq = None _comparables = ["name", "freqstr", "tz"] _attributes = ["name", "tz", "freq"] _is_numeric_dtype = False _infer_as_myclass = True - # Use faster implementation given we know we have DatetimeArrays - __iter__ = DatetimeArray.__iter__ - # some things like freq inference make use of these attributes. - _bool_ops = DatetimeArray._bool_ops - _object_ops = DatetimeArray._object_ops - _field_ops = DatetimeArray._field_ops - _datetimelike_ops = DatetimeArray._datetimelike_ops - _datetimelike_methods = DatetimeArray._datetimelike_methods + tz: Optional[tzinfo] # -------------------------------------------------------------------- # Constructors @@ -302,6 +285,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): result = object.__new__(cls) result._data = dtarr result.name = name + result._no_setting_name = False # For groupby perf. 
See note in indexes/base about _index_data result._index_data = dtarr._data result._reset_identity() @@ -309,43 +293,9 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): # -------------------------------------------------------------------- - def __array__(self, dtype=None): - if ( - dtype is None - and isinstance(self._data, DatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" + def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._data, dtype=dtype) - @property - def dtype(self): - return self._data.dtype - - @property - def tz(self): - # GH 18595 - return self._data.tz - - @tz.setter - def tz(self, value): - # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError( - "Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate" - ) - - tzinfo = tz - @cache_readonly def _is_dates_only(self) -> bool: """ @@ -368,55 +318,14 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ - if isinstance(state, dict): - super().__setstate__(state) - - elif isinstance(state, tuple): - - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - freq = own_state[1] - tz = timezones.tz_standardize(own_state[2]) - dtype = tz_to_dtype(tz) - dtarr = DatetimeArray._simple_new(data, freq=freq, dtype=dtype) - - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - dtarr = DatetimeArray(data) - - self._data = dtarr - self._reset_identity() - - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - def _convert_for_op(self, value): """ Convert value to be insertable to ndarray. 
""" if self._has_same_tz(value): - return _to_M8(value) + return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - new_idx._set_freq(None) - return new_idx - # -------------------------------------------------------------------- # Rendering Methods @@ -424,15 +333,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return libts.ints_to_pydatetime(self.asi8, self.tz) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_datetime64_from_values - - fmt = _get_format_datetime64_from_values(self, date_format) - - return libts.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep - ) - @property def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 @@ -443,35 +343,6 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def _union(self, other, sort): - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) - - if not isinstance(other, DatetimeIndex): - try: - other = DatetimeIndex(other) - except TypeError: - pass - - this, other = self._maybe_utc_convert(other) - - if this._can_fast_union(other): - return this._fast_union(other, sort=sort) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, DatetimeIndex): - # TODO: we shouldn't be setting attributes like this; - # in all the tests this equality already holds - result._data._dtype = this.dtype - if result.freq is None and ( - this.freq is not None or other.freq is not None - ): - result._data._freq = to_offset(result.inferred_freq) - return result - def union_many(self, others): """ A bit of a hack to accelerate unioning a collection of indexes. @@ -502,102 +373,6 @@ def union_many(self, others): this._data._dtype = dtype return this - def _can_fast_union(self, other) -> bool: - if not isinstance(other, DatetimeIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - try: - return (right_start == left_end + freq) or right_start in left - except (ValueError): - - # if we are comparing a freq that does not propagate timezones - # this will raise - return False - - def _fast_union(self, other, sort=None): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # Both DTIs are monotonic. 
Check if they are already - # in the "correct" order - if self[0] <= other[0]: - left, right = self, other - # DTIs are not in the "correct" order and we don't want - # to sort but want to remove overlaps - elif sort is False: - left, right = self, other - left_start = left[0] - loc = right.searchsorted(left_start, side="left") - right_chunk = right.values[:loc] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - # DTIs are not in the "correct" order and we want - # to sort - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # TODO: consider re-implementing freq._should_cache for fastpath - - # concatenate dates - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - - def intersection(self, other, sort=False): - """ - Specialized intersection for DatetimeIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - Returns - ------- - Index or DatetimeIndex or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name, freq=None, tz=self.tz) @@ -610,7 +385,7 @@ def _get_time_micros(self): values = self._data._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=lib._no_default, index=None, name=None): + def to_series(self, keep_tz=lib.no_default, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index. 
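Note: the `_can_fast_union`/`_fast_union` pair removed from datetimes.py here now lives on `DatetimeTimedeltaMixin` (see the earlier hunk). The underlying invariant: two monotonic indexes sharing a regular `freq` can be unioned by slicing and concatenating, provided the later range starts no more than one `freq` step past the end of the earlier one. A simplified standalone version of the check, not the pandas method, assuming `left` starts first:

```python
import pandas as pd

def can_fast_union(left: pd.DatetimeIndex, right: pd.DatetimeIndex) -> bool:
    # Assumes left[0] <= right[0]; the real code swaps the operands first.
    if left.freq is None or left.freq != right.freq:
        return False
    if not (left.is_monotonic and right.is_monotonic):
        return False
    # The ranges only need to "adjoin", not overlap: right may begin
    # exactly one freq step after left ends.
    return right[0] == left[-1] + left.freq or right[0] in left

left = pd.date_range("2020-01-01", periods=3, freq="D")   # 01-01 .. 01-03
right = pd.date_range("2020-01-04", periods=3, freq="D")  # 01-04 .. 01-06
can_fast_union(left, right)  # True: the ranges adjoin
```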
@@ -655,7 +430,7 @@ def to_series(self, keep_tz=lib._no_default, index=None, name=None): if name is None: name = self.name - if keep_tz is not lib._no_default: + if keep_tz is not lib.no_default: if keep_tz: warnings.warn( "The 'keep_tz' keyword in DatetimeIndex.to_series " @@ -712,68 +487,6 @@ def snap(self, freq="S"): # we know it conforms; skip check return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join( - self, other, how: str = "left", level=None, return_indexers=False, sort=False - ): - """ - See Index.join - """ - if ( - not isinstance(other, DatetimeIndex) - and len(other) > 0 - and other.inferred_type - not in ( - "floating", - "integer", - "integer-na", - "mixed-integer", - "mixed-integer-float", - "mixed", - ) - ): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join( - this, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, DatetimeIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, "tz", None) - return self._simple_new(joined, name, tz=tz) - def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. @@ -927,15 +640,7 @@ def get_value(self, series, key): know what you're doing """ - if isinstance(key, datetime): - - # needed to localize naive datetimes - if self.tz is not None: - if key.tzinfo is not None: - key = Timestamp(key).tz_convert(self.tz) - else: - key = Timestamp(key).tz_localize(self.tz) - + if isinstance(key, (datetime, np.datetime64)): return self.get_value_maybe_box(series, key) if isinstance(key, time): @@ -943,7 +648,7 @@ def get_value(self, series, key): return series.take(locs) try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) + value = Index.get_value(self, series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -955,6 +660,8 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) except (TypeError, ValueError, KeyError): raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key): # needed to localize naive datetimes @@ -1035,7 +742,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -1045,7 +752,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ----- Value of `side` parameter should be validated in caller. 
""" - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if is_float(label) or isinstance(label, time) or is_integer(label): self._invalid_indexer("slice", label) @@ -1123,46 +830,31 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): else: raise - # -------------------------------------------------------------------- - # Wrapping DatetimeArray - - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore - is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore - _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - - _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - elif result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - assert isinstance(result, np.ndarray), result - return result - return type(self)(result, name=self.name) - - @property - def _box_func(self): - return lambda x: Timestamp(x, tz=self.tz) - # -------------------------------------------------------------------- @Substitution(klass="DatetimeIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_NS_DTYPE, copy=False) - else: - value = _to_M8(value, tz=self.tz) + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self._data)(value) + self._data._check_compatible_with(value) - return self.values.searchsorted(value, side=side) + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) + + elif not isinstance(value, DatetimeArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + + return self._data.searchsorted(value, side=side) def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @@ -1173,10 +865,6 @@ def inferred_type(self) -> str: # sure we can't have ambiguous indexing return "datetime64" - @property - def is_all_dates(self) -> bool: - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -1192,16 +880,21 @@ def insert(self, loc, item): ------- new_index : Index """ - if is_scalar(item) and isna(item): + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): # GH 18295 item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. 
timedeltat64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) - if isinstance(item, (datetime, np.datetime64)): - self._assert_can_do_op(item) - if not self._has_same_tz(item) and not isna(item): - raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: if item is NaT: @@ -1210,47 +903,21 @@ def insert(self, loc, item): freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq - item = _to_M8(item, tz=self.tz) + item = item.asm8 try: - new_dates = np.concatenate( + new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_dates, freq=freq) + return self._shallow_copy(new_i8s, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError("cannot insert DatetimeIndex with incompatible label") - - def delete(self, loc): - """ - Make a new DatetimeIndex with passed location(s) deleted. - - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. - - Returns - ------- - new_index : DatetimeIndex - """ - new_dates = np.delete(self.asi8, loc) - - freq = None - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return self._shallow_copy(new_dates, freq=freq) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) def indexer_at_time(self, time, asof=False): """ @@ -1340,10 +1007,8 @@ def indexer_between_time( return mask.nonzero()[0] -DatetimeIndex._add_comparison_ops() DatetimeIndex._add_numeric_methods_disabled() DatetimeIndex._add_logical_methods_disabled() -DatetimeIndex._add_datetimelike_methods() def date_range( @@ -1356,7 +1021,7 @@ def date_range( name=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex. @@ -1405,7 +1070,7 @@ def date_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1522,7 +1187,7 @@ def bdate_range( holidays=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex, with business day as the default frequency. @@ -1576,7 +1241,7 @@ def bdate_range( desired. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py new file mode 100644 index 0000000000000..58fcce7e59be7 --- /dev/null +++ b/pandas/core/indexes/extension.py @@ -0,0 +1,242 @@ +""" +Shared methods for Index subclasses backed by ExtensionArray. 
+""" +from typing import List + +import numpy as np + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ensure_platform_int, is_dtype_equal +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.arrays import ExtensionArray +from pandas.core.indexes.base import Index, deprecate_ndim_indexing +from pandas.core.ops import get_op_result_name + + +def inherit_from_data(name: str, delegate, cache: bool = False): + """ + Make an alias for a method of the underlying ExtensionArray. + + Parameters + ---------- + name : str + Name of an attribute the class should inherit from its EA parent. + delegate : class + cache : bool, default False + Whether to convert wrapped properties into cache_readonly + + Returns + ------- + attribute, method, property, or cache_readonly + """ + + attr = getattr(delegate, name) + + if isinstance(attr, property): + if cache: + method = cache_readonly(attr.fget) + + else: + + def fget(self): + return getattr(self._data, name) + + def fset(self, value): + setattr(self._data, name, value) + + fget.__name__ = name + fget.__doc__ = attr.__doc__ + + method = property(fget, fset) + + elif not callable(attr): + # just a normal attribute, no wrapping + method = attr + + else: + + def method(self, *args, **kwargs): + result = attr(self._data, *args, **kwargs) + return result + + method.__name__ = name + method.__doc__ = attr.__doc__ + return method + + +def inherit_names(names: List[str], delegate, cache: bool = False): + """ + Class decorator to pin attributes from an ExtensionArray to a Index subclass. + + Parameters + ---------- + names : List[str] + delegate : class + cache : bool, default False + """ + + def wrapper(cls): + for name in names: + meth = inherit_from_data(name, delegate, cache=cache) + setattr(cls, name, meth) + + return cls + + return wrapper + + +def _make_wrapped_comparison_op(opname): + """ + Create a comparison method that dispatches to ``._data``. + """ + + def wrapper(self, other): + if isinstance(other, ABCSeries): + # the arrays defer to Series for comparison ops but the indexes + # don't, so we have to unwrap here. + other = other._values + + other = _maybe_unwrap_index(other) + + op = getattr(self._data, opname) + return op(other) + + wrapper.__name__ = opname + return wrapper + + +def make_wrapped_arith_op(opname): + def method(self, other): + meth = getattr(self._data, opname) + result = meth(_maybe_unwrap_index(other)) + return _wrap_arithmetic_op(self, other, result) + + method.__name__ = opname + return method + + +def _wrap_arithmetic_op(self, other, result): + if result is NotImplemented: + return NotImplemented + + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return ( + _wrap_arithmetic_op(self, other, result[0]), + _wrap_arithmetic_op(self, other, result[1]), + ) + + if not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + + res_name = get_op_result_name(self, other) + result.name = res_name + return result + + +def _maybe_unwrap_index(obj): + """ + If operating against another Index object, we need to unwrap the underlying + data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray + implementation, otherwise we will incorrectly return NotImplemented. 
+ + Parameters + ---------- + obj : object + + Returns + ------- + unwrapped object + """ + if isinstance(obj, Index): + return obj._data + return obj + + +class ExtensionIndex(Index): + """ + Index subclass for indexes backed by ExtensionArray. + """ + + _data: ExtensionArray + + __eq__ = _make_wrapped_comparison_op("__eq__") + __ne__ = _make_wrapped_comparison_op("__ne__") + __lt__ = _make_wrapped_comparison_op("__lt__") + __gt__ = _make_wrapped_comparison_op("__gt__") + __le__ = _make_wrapped_comparison_op("__le__") + __ge__ = _make_wrapped_comparison_op("__ge__") + + def __getitem__(self, key): + result = self._data[key] + if isinstance(result, type(self._data)): + return type(self)(result, name=self.name) + + # Includes cases where we get a 2D ndarray back for MPL compat + deprecate_ndim_indexing(result) + return result + + def __iter__(self): + return self._data.__iter__() + + @property + def _ndarray_values(self) -> np.ndarray: + return self._data._ndarray_values + + def dropna(self, how="any"): + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + return self._shallow_copy(self._data[~self._isnan]) + return self._shallow_copy() + + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + result = self._data.repeat(repeats, axis=axis) + return self._shallow_copy(result) + + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + + taken = self._assert_take_fillable( + self._data, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + return type(self)(taken, name=self.name) + + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + result = self._data.unique() + return self._shallow_copy(result) + + def _get_unique_index(self, dropna=False): + if self.is_unique and not dropna: + return self + + result = self._data.unique() + if dropna and self.hasnans: + result = result[~result.isna()] + return self._shallow_copy(result) + + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype) and copy is False: + # Ensure that self.astype(self.dtype) is self + return self + + new_values = self._data.astype(dtype, copy=copy) + + # pass copy=False because any copying will be done in the + # _data.astype call above + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ce0716e36cdf3..1c86235f9eaa1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -37,6 +37,7 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna +from pandas.core import accessor from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com @@ -50,6 +51,7 @@ maybe_extract_name, ) from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.extension import ExtensionIndex, inherit_names from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name @@ -103,19 +105,6 @@ def _get_prev_label(label): raise TypeError(f"cannot determine next label for type {repr(type(label))}") -def _get_interval_closed_bounds(interval): - """ - Given an Interval or 
IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -194,7 +183,31 @@ def func(intvidx_self, other, sort=False): ), ) ) -class IntervalIndex(IntervalMixin, Index): +@accessor.delegate_names( + delegate=IntervalArray, + accessors=["length", "size", "left", "right", "mid", "closed", "dtype"], + typ="property", + overwrite=True, +) +@accessor.delegate_names( + delegate=IntervalArray, + accessors=[ + "__array__", + "overlaps", + "contains", + "__len__", + "set_closed", + "to_tuples", + ], + typ="method", + overwrite=True, +) +@inherit_names( + ["is_non_overlapping_monotonic", "mid", "_ndarray_values"], + IntervalArray, + cache=True, +) +class IntervalIndex(IntervalMixin, ExtensionIndex, accessor.PandasDelegate): _typ = "intervalindex" _comparables = ["name"] _attributes = ["name", "closed"] @@ -205,6 +218,8 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + _raw_inherit = {"__array__", "overlaps", "contains"} + # -------------------------------------------------------------------- # Constructors @@ -247,6 +262,7 @@ def _simple_new(cls, array, name, closed=None): result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._no_setting_name = False result._reset_identity() return result @@ -378,98 +394,10 @@ def __contains__(self, key) -> bool: except KeyError: return False - @Appender( - _interval_shared_docs["to_tuples"] - % dict( - return_type="Index", - examples=""" - Examples - -------- - >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) - >>> idx.to_tuples() - Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') - >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') - """, - ) - ) - def to_tuples(self, na_tuple=True): - tuples = self._data.to_tuples(na_tuple=na_tuple) - return Index(tuples) - @cache_readonly def _multiindex(self): return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - @property - def left(self): - """ - Return the left endpoints of each Interval in the IntervalIndex as - an Index. - """ - return self._data._left - - @property - def right(self): - """ - Return the right endpoints of each Interval in the IntervalIndex as - an Index. - """ - return self._data._right - - @property - def closed(self): - """ - Whether the intervals are closed on the left-side, right-side, both or - neither. 
- """ - return self._data._closed - - @Appender( - _interval_shared_docs["set_closed"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - Examples - -------- - >>> index = pd.interval_range(0, 3) - >>> index - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') - >>> index.set_closed('both') - IntervalIndex([[0, 1], [1, 2], [2, 3]], - closed='both', - dtype='interval[int64]') - """ - ), - ) - ) - def set_closed(self, closed): - if closed not in _VALID_CLOSED: - raise ValueError(f"invalid option for 'closed': {closed}") - - # return self._shallow_copy(closed=closed) - array = self._data.set_closed(closed) - return self._simple_new(array, self.name) - - @property - def length(self): - """ - Return an Index with entries denoting the length of each Interval in - the IntervalIndex. - """ - return self._data.length - - @property - def size(self): - # Avoid materializing ndarray[Interval] - return self._data.size - - def __len__(self) -> int: - return len(self.left) - @cache_readonly def values(self): """ @@ -481,16 +409,6 @@ def values(self): def _values(self): return self._data - @cache_readonly - def _ndarray_values(self) -> np.ndarray: - return np.array(self._data) - - def __array__(self, result=None): - """ - The array interface, return my values. - """ - return self._ndarray_values - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result @@ -500,31 +418,13 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None - @Appender(_index_shared_docs["copy"]) - def copy(self, deep=False, name=None): - array = self._data - if deep: - array = array.copy() - attributes = self._get_attributes_dict() - if name is not None: - attributes.update(name=name) - - return self._simple_new(array, **attributes) - @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) - return super().astype(dtype, copy=copy) - - @cache_readonly - def dtype(self): - """ - Return the dtype object of the underlying data. - """ - return self._data.dtype + return Index.astype(self, dtype, copy=copy) @property def inferred_type(self) -> str: @@ -537,29 +437,8 @@ def memory_usage(self, deep: bool = False) -> int: # so return the bytes here return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) - @cache_readonly - def mid(self): - """ - Return the midpoint of each Interval in the IntervalIndex as an Index. 
-        """
-        return self._data.mid
-
-    @cache_readonly
-    def is_monotonic(self) -> bool:
-        """
-        Return True if the IntervalIndex is monotonic increasing (only equal or
-        increasing values), else False
-        """
-        return self.is_monotonic_increasing
-
-    @cache_readonly
-    def is_monotonic_increasing(self) -> bool:
-        """
-        Return True if the IntervalIndex is monotonic increasing (only equal or
-        increasing values), else False
-        """
-        return self._engine.is_monotonic_increasing
-
+    # IntervalTree doesn't have an is_monotonic_decreasing, so have to override
+    # the Index implementation
     @cache_readonly
     def is_monotonic_decreasing(self) -> bool:
         """
@@ -592,11 +471,6 @@ def is_unique(self):

         return True

-    @cache_readonly
-    @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs)
-    def is_non_overlapping_monotonic(self):
-        return self._data.is_non_overlapping_monotonic
-
     @property
     def is_overlapping(self):
         """
@@ -675,26 +549,6 @@ def _convert_list_indexer(self, keyarr, kind=None):

         return locs

-    def _maybe_cast_indexed(self, key):
-        """
-        we need to cast the key, which could be a scalar
-        or an array-like to the type of our subtype
-        """
-        if isinstance(key, IntervalIndex):
-            return key
-
-        subtype = self.dtype.subtype
-        if is_float_dtype(subtype):
-            if is_integer(key):
-                key = float(key)
-            elif isinstance(key, (np.ndarray, Index)):
-                key = key.astype("float64")
-        elif is_integer_dtype(subtype):
-            if is_integer(key):
-                key = int(key)
-
-        return key
-
     def _can_reindex(self, indexer: np.ndarray) -> None:
         """
         Check if we are allowing reindexing with this particular indexer.
@@ -827,34 +681,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False):

         return sub_idx._searchsorted_monotonic(label, side)

-    def _find_non_overlapping_monotonic_bounds(self, key):
-        if isinstance(key, IntervalMixin):
-            start = self._searchsorted_monotonic(
-                key.left, "left", exclude_label=key.open_left
-            )
-            stop = self._searchsorted_monotonic(
-                key.right, "right", exclude_label=key.open_right
-            )
-        elif isinstance(key, slice):
-            # slice
-            start, stop = key.start, key.stop
-            if (key.step or 1) != 1:
-                raise NotImplementedError("cannot slice with a slice step")
-            if start is None:
-                start = 0
-            else:
-                start = self._searchsorted_monotonic(start, "left")
-            if stop is None:
-                stop = len(self)
-            else:
-                stop = self._searchsorted_monotonic(stop, "right")
-        else:
-            # scalar or index-like
-
-            start = self._searchsorted_monotonic(key, "left")
-            stop = self._searchsorted_monotonic(key, "right")
-        return start, stop
-
     def get_loc(
         self, key: Any, method: Optional[str] = None, tolerance=None
     ) -> Union[int, slice, np.ndarray]:
@@ -1146,8 +972,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
         result = self._data.take(
             indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs
         )
-        attributes = self._get_attributes_dict()
-        return self._simple_new(result, **attributes)
+        return self._shallow_copy(result)

     def __getitem__(self, value):
         result = self._data[value]
@@ -1238,44 +1063,6 @@ def equals(self, other) -> bool:
             and self.closed == other.closed
         )

-    @Appender(
-        _interval_shared_docs["contains"]
-        % dict(
-            klass="IntervalIndex",
-            examples=textwrap.dedent(
-                """\
-        >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)])
-        >>> intervals
-        IntervalIndex([(0, 1], (1, 3], (2, 4]],
-              closed='right',
-              dtype='interval[int64]')
-        >>> intervals.contains(0.5)
-        array([ True, False, False])
-        """
-            ),
-        )
-    )
-    def contains(self, other):
-        return
self._data.contains(other) - - @Appender( - _interval_shared_docs["overlaps"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - IntervalIndex([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') - """ - ), - ) - ) - def overlaps(self, other): - return self._data.overlaps(other) - @Appender(_index_shared_docs["intersection"]) @SetopCheck(op_name="intersection") def intersection( @@ -1375,6 +1162,34 @@ def is_all_dates(self) -> bool: # TODO: arithmetic operations + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._data, name) + return prop # no wrapping for now + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the ._data """ + method = getattr(self._data, name) + res = method(*args, **kwargs) + if is_scalar(res) or name in self._raw_inherit: + return res + if isinstance(res, IntervalArray): + return type(self)._simple_new(res, name=self.name) + return Index(res) + + # GH#30817 until IntervalArray implements inequalities, get them from Index + def __lt__(self, other): + return Index.__lt__(self, other) + + def __le__(self, other): + return Index.__le__(self, other) + + def __gt__(self, other): + return Index.__gt__(self, other) + + def __ge__(self, other): + return Index.__ge__(self, other) + IntervalIndex._add_logical_methods_disabled() @@ -1447,7 +1262,7 @@ def interval_range( ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dac9b20104c36..84d7399cc4f2d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,7 +1,6 @@ -from collections import OrderedDict import datetime from sys import getsizeof -from typing import List, Optional +from typing import Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -62,8 +61,6 @@ dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") ) -_no_default_names = object() - class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): """ @@ -206,7 +203,7 @@ class MultiIndex(Index): Notes ----- See the `user guide - `_ + `_ for more. Examples @@ -374,7 +371,7 @@ def _verify_integrity( return new_codes @classmethod - def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default): """ Convert arrays to MultiIndex. @@ -428,7 +425,7 @@ def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): raise ValueError("all arrays must be same length") codes, levels = factorize_from_iterables(arrays) - if names is _no_default_names: + if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] return MultiIndex( @@ -498,7 +495,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod - def from_product(cls, iterables, sortorder=None, names=_no_default_names): + def from_product(cls, iterables, sortorder=None, names=lib.no_default): """ Make a MultiIndex from the cartesian product of multiple iterables. 
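As an illustrative sketch (not part of the patch): ``from_arrays`` and ``from_product`` above swap the private module-level ``_no_default_names`` object for the shared ``lib.no_default`` sentinel, while the "was ``names`` actually passed?" check stays the same. A minimal standalone analogue of the pattern (``infer_names`` is a hypothetical helper; only ``pd.Series`` is assumed from pandas):

    import pandas as pd

    no_default = object()  # stand-in for the shared pandas._libs.lib.no_default

    def infer_names(arrays, names=no_default):
        # names=None deliberately requests unnamed levels; omitting the
        # argument instead falls back to each array's own .name attribute.
        if names is no_default:
            names = [getattr(arr, "name", None) for arr in arrays]
        return names

    s = pd.Series([1, 2], name="foo")
    infer_names([s, [3, 4]])        # ['foo', None] -- names omitted, inferred
    infer_names([s, [3, 4]], None)  # None -- explicitly unnamed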
@@ -549,7 +546,7 @@ def from_product(cls, iterables, sortorder=None, names=_no_default_names): iterables = list(iterables) codes, levels = factorize_from_iterables(iterables) - if names is _no_default_names: + if names is lib.no_default: names = [getattr(it, "name", None) for it in iterables] codes = cartesian_product(codes) @@ -628,6 +625,9 @@ def levels(self): result = [ x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) ] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True return FrozenList(result) @property @@ -659,31 +659,6 @@ def array(self): "'MultiIndex.to_numpy()' to get a NumPy array of tuples." ) - @property - def _is_homogeneous_type(self) -> bool: - """ - Whether the levels of a MultiIndex all have the same dtype. - - This looks at the dtypes of the levels. - - See Also - -------- - Index._is_homogeneous_type : Whether the object has a single - dtype. - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. - - Examples - -------- - >>> MultiIndex.from_tuples([ - ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type - True - >>> MultiIndex.from_tuples([ - ... ('a', 1), ('a', 2)])._is_homogeneous_type - False - """ - return len({x.dtype for x in self.levels}) <= 1 - def _set_levels( self, levels, level=None, copy=False, validate=True, verify_integrity=False ): @@ -743,32 +718,47 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), - ('b', 2)], + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level=0) + >>> idx.set_levels(['a', 'b', 'c'], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')], + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], names=['foo', 'bar']) >>> idx.set_levels(['a', 'b'], level='bar') MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), - (2, 'b')], + (2, 'b'), + (3, 'a'), + (3, 'b')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. 
+ + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -961,7 +951,7 @@ def copy( _set_identity=_set_identity, ) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return self.values @@ -1639,17 +1629,12 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - # Guarantee resulting column order + # Guarantee resulting column order - PY36+ dict maintains insertion order result = DataFrame( - OrderedDict( - [ - ( - (level if lvlname is None else lvlname), - self._get_level_values(level), - ) - for lvlname, level in zip(idx_names, range(len(self.levels))) - ] - ), + { + (level if lvlname is None else lvlname): self._get_level_values(level) + for lvlname, level in zip(idx_names, range(len(self.levels))) + }, copy=False, ) @@ -2086,9 +2071,8 @@ def drop(self, codes, level=None, errors="raise"): elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( - "dropping on a non-lexsorted multi-index" - " without a level parameter may impact " - "performance.", + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", PerformanceWarning, stacklevel=3, ) @@ -2432,7 +2416,53 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): return target, indexer - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + kind : {'loc', 'getitem'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left", kind="loc") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right", kind="loc") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. 
+        """
         if not isinstance(label, tuple):
             label = (label,)
@@ -2507,7 +2537,7 @@ def _partial_tup_index(self, tup, side="left"):
         for k, (lab, lev, labs) in enumerate(zipped):
             section = labs[start:end]

-            if lab not in lev:
+            if lab not in lev and not isna(lab):
                 if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
                     raise TypeError(f"Level type mismatch: {lab}")

@@ -2517,13 +2547,38 @@ def _partial_tup_index(self, tup, side="left"):
                 loc -= 1
             return start + section.searchsorted(loc, side=side)

-            idx = lev.get_loc(lab)
+            idx = self._get_loc_single_level_index(lev, lab)
             if k < n - 1:
                 end = start + section.searchsorted(idx, side="right")
                 start = start + section.searchsorted(idx, side="left")
             else:
                 return start + section.searchsorted(idx, side=side)

+    def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
+        """
+        Return the location of ``key`` in ``level_index``; NA keys map to -1.
+
+        Parameters
+        ----------
+        level_index : Index
+        key : label
+
+        Returns
+        -------
+        loc : int
+            -1 if key is an NA value, otherwise the location of key
+            in the index.
+
+        See Also
+        --------
+        Index.get_loc : The get_loc method for (single-level) index.
+        """
+
+        if is_scalar(key) and isna(key):
+            return -1
+        else:
+            return level_index.get_loc(key)
+
     def get_loc(self, key, method=None):
         """
         Get location for a label or a tuple of labels as an integer, slice or
@@ -2622,7 +2677,9 @@ def _maybe_to_slice(loc):
             loc = np.arange(start, stop, dtype="int64")

         for i, k in enumerate(follow_key, len(lead_key)):
-            mask = self.codes[i][loc] == self.levels[i].get_loc(k)
+            mask = self.codes[i][loc] == self._get_loc_single_level_index(
+                self.levels[i], k
+            )
             if not mask.all():
                 loc = loc[mask]
                 if not len(loc):
@@ -2640,7 +2697,7 @@ def get_loc_level(self, key, level=0, drop_level: bool = True):
         key : label or sequence of labels
         level : int/level name or list thereof, optional
         drop_level : bool, default True
-            if ``False``, the resulting index will not drop any level.
+            If ``False``, the resulting index will not drop any level.
Returns ------- @@ -2850,7 +2907,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = level_index.get_loc(key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted @@ -2901,7 +2958,7 @@ def get_locs(self, seq): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) """ - from .numeric import Int64Index + from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -3345,14 +3402,11 @@ def isin(self, values, level=None): return algos.isin(self.values, values) else: num = self._get_level_number(level) - levs = self.levels[num] - level_codes = self.codes[num] + levs = self.get_level_values(num) - sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(level_codes), dtype=np.bool_) - else: - return np.lib.arraysetops.in1d(level_codes, sought_labels) + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) MultiIndex._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 39cbe5f151262..9a3a021bd801a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -73,6 +73,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): else: subarr = data + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + name = maybe_extract_name(name, data, cls) return cls._simple_new(subarr, name=name) @@ -95,7 +99,7 @@ def _validate_dtype(cls, dtype: Dtype) -> None: @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) @@ -256,7 +260,7 @@ def asi8(self) -> np.ndarray: @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # don't coerce ilocs to integers if kind != "iloc": @@ -313,7 +317,7 @@ def asi8(self) -> np.ndarray: @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # don't coerce ilocs to integers if kind != "iloc": @@ -400,7 +404,7 @@ def astype(self, dtype, copy=True): @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 6465a0c1724af..4e3689078d535 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -5,23 +5,29 @@ from pandas._libs import index as libindex from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution -from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period +from pandas._libs.tslibs.period import Period from pandas.util._decorators import Appender, Substitution, cache_readonly from 
pandas.core.dtypes.common import ( ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, + is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, + is_object_dtype, pandas_dtype, ) from pandas.core.accessor import delegate_names -from pandas.core.algorithms import unique1d -from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq +from pandas.core.arrays.period import ( + PeriodArray, + period_array, + raise_on_incompatible, + validate_dtype_freq, +) from pandas.core.base import _shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase @@ -34,7 +40,8 @@ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ) -from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index +from pandas.core.indexes.datetimes import DatetimeIndex, Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name from pandas.core.tools.datetimes import DateParseError, parse_time_string @@ -65,13 +72,11 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): Delegate from PeriodIndex to PeriodArray. """ - _delegate_class = PeriodArray - _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = set(PeriodArray._datetimelike_methods) | { - "_addsub_int_array", - "strftime", - } - _raw_properties = {"is_leap_year"} + _raw_methods = {"_format_native_types"} + _raw_properties = {"is_leap_year", "freq"} + + _delegated_properties = PeriodArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = set(PeriodArray._datetimelike_methods) | _raw_methods @delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") @@ -80,8 +85,7 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): ) class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ - Immutable ndarray holding ordinal values indicating regular periods in - time such as particular years, quarters, months, etc. + Immutable ndarray holding ordinal values indicating regular periods in time. Index keys are boxed to Period objects which carries the metadata (eg, frequency information). @@ -89,9 +93,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): Parameters ---------- data : array-like (1d int np.ndarray or PeriodArray), optional - Optional period-like data to construct index with + Optional period-like data to construct index with. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. freq : str or period object, optional One of pandas period strings or corresponding objects year : int, array, or Series, default None @@ -102,7 +106,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): minute : int, array, or Series, default None second : int, array, or Series, default None tz : object, default None - Timezone for converting datetime64 data to Periods + Timezone for converting datetime64 data to Periods. 
dtype : str or PeriodDtype, default None Attributes @@ -262,29 +266,20 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): def values(self): return np.asarray(self) - @property - def freq(self) -> DateOffset: - return self._data.freq - def _shallow_copy(self, values=None, **kwargs): # TODO: simplify, figure out type of values if values is None: values = self._data if isinstance(values, type(self)): - values = values._values + values = values._data if not isinstance(values, PeriodArray): - if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype): + if isinstance(values, np.ndarray) and values.dtype == "i8": values = PeriodArray(values, freq=self.freq) else: - # in particular, I would like to avoid period_array here. - # Some people seem to be calling use with unexpected types - # Index.difference -> ndarray[Period] - # DatetimelikeIndexOpsMixin.repeat -> ndarray[ordinal] - # I think that once all of Datetime* are EAs, we can simplify - # this quite a bit. - values = period_array(values, freq=self.freq) + # GH#30713 this should never be reached + raise TypeError(type(values), getattr(values, "dtype", None)) # We don't allow changing `freq` in _shallow_copy. validate_dtype_freq(self.dtype, kwargs.get("freq")) @@ -344,10 +339,7 @@ def _maybe_convert_timedelta(self, other): if base == self.freq.rule_code: return other.n - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically @@ -355,18 +347,11 @@ def _maybe_convert_timedelta(self, other): return other # raise when input doesn't have freq - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=None - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, None) # ------------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object).values @@ -406,17 +391,7 @@ def _int64index(self): # ------------------------------------------------------------------------ # Index Methods - def _coerce_scalar_to_index(self, item): - """ - we need to coerce a scalar to a compat for our index type - - Parameters - ---------- - item : scalar item to coerce - """ - return PeriodIndex([item], **self._get_attributes_dict()) - - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if is_integer_dtype(dtype): return self.asi8 else: @@ -494,26 +469,19 @@ def astype(self, dtype, copy=True, how="start"): @Substitution(klass="PeriodIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, Period): - if value.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=value.freqstr, - ) - raise IncompatibleFrequency(msg) - value = value.ordinal + if isinstance(value, Period) or value is NaT: + self._data._check_compatible_with(value) elif isinstance(value, str): try: - value = Period(value, freq=self.freq).ordinal + value = Period(value, freq=self.freq) except DateParseError: raise KeyError(f"Cannot interpret '{value}' as 
period") + elif not isinstance(value, PeriodArray): + raise TypeError( + "PeriodIndex.searchsorted requires either a Period or PeriodArray" + ) - return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) - - @property - def is_all_dates(self) -> bool: - return True + return self._data.searchsorted(value, side=side, sorter=sorter) @property def is_full(self) -> bool: @@ -541,7 +509,7 @@ def get_value(self, series, key): """ s = com.values_from_object(series) try: - return com.maybe_box(self, super().get_value(s, key), series, key) + value = super().get_value(s, key) except (KeyError, IndexError): if isinstance(key, str): asdt, parsed, reso = parse_time_string(key, self.freq) @@ -573,20 +541,19 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal return com.maybe_box(self, self._int64index.get_value(s, key), series, key) + else: + return com.maybe_box(self, value, series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) - if isinstance(target, PeriodIndex): + if target.freq != self.freq: + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + target = target.asi8 self_index = self._int64index else: @@ -601,39 +568,15 @@ def get_indexer_non_unique(self, target): target = ensure_index(target) if isinstance(target, PeriodIndex): + if target.freq != self.freq: + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + target = target.asi8 - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing - def _get_unique_index(self, dropna=False): - """ - wrap Index._get_unique_index to handle NaT - """ - res = super()._get_unique_index(dropna=dropna) - if dropna: - res = res.dropna() - return res - - @Appender(Index.unique.__doc__) - def unique(self, level=None): - # override the Index.unique method for performance GH#23083 - if level is not None: - # this should never occur, but is retained to make the signature - # match Index.unique - self._validate_index_level(level) - - values = self._ndarray_values - result = unique1d(values) - return self._shallow_copy(result) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -682,7 +625,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} Returns ------- @@ -693,7 +636,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. 
""" - assert kind in ["ix", "loc", "getitem"] + assert kind in ["loc", "getitem"] if isinstance(label, datetime): return Period(label, freq=self.freq) @@ -764,8 +707,7 @@ def _get_string_slice(self, key): t1, t2 = self._parsed_string_to_bounds(reso, parsed) return slice( - self.searchsorted(t1.ordinal, side="left"), - self.searchsorted(t2.ordinal, side="right"), + self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") ) def _convert_tolerance(self, tolerance, target): @@ -808,9 +750,8 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._apply_meta(result), lidx, ridx return self._apply_meta(result) - @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False): - return Index.intersection(self, other, sort=sort) + # ------------------------------------------------------------------------ + # Set Operation Methods def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) @@ -818,51 +759,79 @@ def _assert_can_do_setop(self, other): # *Can't* use PeriodIndexes of different freqs # *Can* use PeriodIndex/DatetimeIndex if isinstance(other, PeriodIndex) and self.freq != other.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - result = self._apply_meta(result) - result.name = name + def intersection(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if not is_dtype_equal(self.dtype, other.dtype): + # TODO: fastpath for if we have a different PeriodDtype + this = self.astype("O") + other = other.astype("O") + return this.intersection(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.intersection(i8other, sort=sort) + + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) return result - def _apply_meta(self, rawarr): - if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) - return rawarr + def difference(self, other, sort=None): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) - def __setstate__(self, state): - """Necessary for making this object picklable""" + if self.equals(other): + # pass an empty PeriodArray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - if isinstance(state, dict): - super().__setstate__(state) + if is_object_dtype(other): + return self.astype(object).difference(other).astype(self.dtype) - elif isinstance(state, tuple): + elif not is_dtype_equal(self.dtype, other.dtype): + return self - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.difference(i8other, sort=sort) - # backcompat - freq = Period._maybe_convert_freq(own_state[1]) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - else: # pragma: no 
cover - data = np.empty(state) - np.ndarray.__setstate__(self, state) - freq = None # ? + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) - data = PeriodArray(data, freq=freq) - self._data = data + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) - else: - raise Exception("invalid pickle state") + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this._union(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self._union(i8other, sort=sort) + + res_name = get_op_result_name(self, other) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - _unpickle_compat = __setstate__ + # ------------------------------------------------------------------------ + + def _apply_meta(self, rawarr): + if not isinstance(rawarr, PeriodIndex): + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) + return rawarr def memory_usage(self, deep=False): result = super().memory_usage(deep=deep) @@ -871,10 +840,8 @@ def memory_usage(self, deep=False): return result -PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() PeriodIndex._add_logical_methods_disabled() -PeriodIndex._add_datetimelike_methods() def period_range( @@ -910,7 +877,7 @@ def period_range( must be specified. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 480a4ae34bfb7..582c257b50ad0 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -3,12 +3,11 @@ import numpy as np -from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib +from pandas._libs import NaT, Timedelta, index as libindex from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( _TD_DTYPE, - ensure_int64, is_float, is_integer, is_list_like, @@ -17,8 +16,7 @@ is_timedelta64_ns_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core.accessor import delegate_names from pandas.core.arrays import datetimelike as dtl @@ -30,10 +28,8 @@ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, DatetimeTimedeltaMixin, - ea_passthrough, ) -from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops import get_op_result_name +from pandas.core.indexes.extension import inherit_names from pandas.tseries.frequencies import to_offset @@ -43,18 +39,28 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we don't want to expose in the .dt accessor. 
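As an illustrative sketch (not part of the patch): the ``_raw_methods`` / ``_raw_properties`` sets above control re-boxing. "Raw" results are handed back exactly as the backing array produced them (for example the scalar reductions ``sum``/``std``/``median``), while every other delegated result is wrapped back into an Index. A toy standalone analogue of that dispatch (``RawVsBoxed`` is a hypothetical class; only public pandas API is assumed):

    import pandas as pd

    class RawVsBoxed:
        _raw_methods = {"sum", "std", "median"}  # scalar reductions stay raw

        def __init__(self, index):
            self._backing = index.array  # e.g. a TimedeltaArray

        def delegate(self, name, *args, **kwargs):
            result = getattr(self._backing, name)(*args, **kwargs)
            if name in self._raw_methods:
                return result        # scalar, returned as-is
            return pd.Index(result)  # re-boxed; dtype picks the Index subclass

    tdi = pd.timedelta_range("1 day", periods=3)
    wrapped = RawVsBoxed(tdi)
    wrapped.delegate("sum")          # Timedelta('6 days'), not an Index
    wrapped.delegate("round", "D")   # comes back as a TimedeltaIndex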
- _delegate_class = TimedeltaArray - _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] - _delegated_methods = TimedeltaArray._datetimelike_methods + [ - "_box_values", - "__neg__", - "__pos__", - "__abs__", - ] - _raw_properties = {"components"} - _raw_methods = {"to_pytimedelta"} + _raw_properties = {"components", "_box_func"} + _raw_methods = {"to_pytimedelta", "sum", "std", "median", "_format_native_types"} + + _delegated_properties = TimedeltaArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = ( + TimedeltaArray._datetimelike_methods + + list(_raw_methods) + + ["_box_values", "__neg__", "__pos__", "__abs__"] + ) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + "_other_ops", + ], + TimedeltaArray, +) @delegate_names( TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" ) @@ -65,11 +71,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): overwrite=True, ) class TimedeltaIndex( - DatetimeTimedeltaMixin, - DatetimeIndexOpsMixin, - dtl.TimelikeOps, - Int64Index, - TimedeltaDelegateMixin, + DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin, ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -120,21 +122,10 @@ class TimedeltaIndex( Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. """ _typ = "timedeltaindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.TimedeltaEngine @@ -143,18 +134,6 @@ def _join_i8_wrapper(joinf, **kwargs): _is_numeric_dtype = True _infer_as_myclass = True - _freq = None - - _bool_ops = TimedeltaArray._bool_ops - _object_ops = TimedeltaArray._object_ops - _field_ops = TimedeltaArray._field_ops - _datetimelike_ops = TimedeltaArray._datetimelike_ops - _datetimelike_methods = TimedeltaArray._datetimelike_methods - _other_ops = TimedeltaArray._other_ops - sum = ea_passthrough(TimedeltaArray.sum) - std = ea_passthrough(TimedeltaArray.std) - median = ea_passthrough(TimedeltaArray.median) - # ------------------------------------------------------------------- # Constructors @@ -223,17 +202,6 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result._reset_identity() return result - # ------------------------------------------------------------------- - - def __setstate__(self, state): - """Necessary for making this object picklable""" - if isinstance(state, dict): - super().__setstate__(state) - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - # ------------------------------------------------------------------- # Rendering Methods @@ -243,33 +211,6 @@ def _formatter_func(self): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import Timedelta64Formatter - - return np.asarray( - Timedelta64Formatter( - values=self, nat_rep=na_rep, justify="all" - ).get_result() - ) - - # ------------------------------------------------------------------- - # Wrapping TimedeltaArray - - # 
Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - @property - def _box_func(self): - return lambda x: Timedelta(x, unit="ns") - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - return type(self)(result, name=self.name) - # ------------------------------------------------------------------- @Appender(_index_shared_docs["astype"]) @@ -285,143 +226,6 @@ def astype(self, dtype, copy=True): return Index(result.astype("i8"), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def _union(self, other, sort): - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super()._union(other, sort=sort) - - if not isinstance(other, TimedeltaIndex): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - this, other = self, other - - if this._can_fast_union(other): - return this._fast_union(other) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, TimedeltaIndex): - if result.freq is None: - result._set_freq("infer") - return result - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - if _is_convertible_to_index(other): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - - return Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def intersection(self, other, sort=False): - """ - Specialized intersection for TimedeltaIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : TimedeltaIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - .. 
versionchanged:: 0.25.0 - - The `sort` keyword is added - - Returns - ------- - y : Index or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - new_idx._set_freq(None) - return new_idx - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, TimedeltaIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined, name=name) - return joined - else: - return self._simple_new(joined, name) - - def _can_fast_union(self, other): - if not isinstance(other, TimedeltaIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - return (right_start == left_end + freq) or right_start in left - - def _fast_union(self, other): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # concatenate - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - def _maybe_promote(self, other): if other.inferred_type == "timedelta": other = TimedeltaIndex(other) @@ -438,7 +242,7 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) + value = Index.get_value(self, series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -450,10 +254,10 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) except (TypeError, ValueError, KeyError): raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) - def get_value_maybe_box(self, series, key): - if not isinstance(key, Timedelta): - key = Timedelta(key) + def get_value_maybe_box(self, series, key: Timedelta): values = self._engine.get_value(com.values_from_object(series), key) return com.maybe_box(self, values, series, key) @@ -506,13 +310,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- label : object """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if isinstance(label, str): parsed = Timedelta(label) @@ -544,11 +348,25 @@ def _partial_td_slice(self, key): @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_TD_DTYPE, copy=False) - else: - value = Timedelta(value).asm8.view(_TD_DTYPE) + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" 
+ ) + value = type(self._data)(value) + self._data._check_compatible_with(value) + + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) + + elif not isinstance(value, TimedeltaArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) - return self.values.searchsorted(value, side=side, sorter=sorter) + return self._data.searchsorted(value, side=side, sorter=sorter) def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @@ -557,10 +375,6 @@ def is_type_compatible(self, typ) -> bool: def inferred_type(self) -> str: return "timedelta64" - @property - def is_all_dates(self) -> bool: - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -577,90 +391,47 @@ def insert(self, loc, item): new_index : Index """ # try to convert if possible - if _is_convertible_to_td(item): - try: - item = Timedelta(item) - except ValueError: - # e.g. str that can't be parsed to timedelta - pass - elif is_scalar(item) and isna(item): + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): # GH 18295 item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. datetime64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) freq = None - if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) # check freq can be preserved on edge cases - if self.freq is not None: - if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq - item = Timedelta(item).asm8.view(_TD_DTYPE) + item = item.asm8 try: - new_tds = np.concatenate( + new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_tds, freq=freq) - + return self._shallow_copy(new_i8s, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError("cannot insert TimedeltaIndex with incompatible label") - - def delete(self, loc): - """ - Make a new TimedeltaIndex with passed location(s) deleted. - - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. 
- - Returns - ------- - new_index : TimedeltaIndex - """ - new_tds = np.delete(self.asi8, loc) - - freq = "infer" - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return TimedeltaIndex(new_tds, name=self.name, freq=freq) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) -TimedeltaIndex._add_comparison_ops() TimedeltaIndex._add_logical_methods_disabled() -TimedeltaIndex._add_datetimelike_methods() - - -def _is_convertible_to_index(other) -> bool: - """ - return a boolean whether I can attempt conversion to a TimedeltaIndex - """ - if isinstance(other, TimedeltaIndex): - return True - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False def timedelta_range( @@ -698,7 +469,7 @@ def timedelta_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b86293e78a80d..ea59a6a49e649 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Hashable, List, Tuple, Union import numpy as np @@ -22,21 +22,13 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.indexers import is_list_like_indexer, length_of_indexer +from pandas.core.indexers import ( + check_bool_array_indexer, + is_list_like_indexer, + length_of_indexer, +) from pandas.core.indexes.api import Index, InvalidIndexError - -# the supported indexers -def get_indexers_list(): - - return [ - ("iloc", _iLocIndexer), - ("loc", _LocIndexer), - ("at", _AtIndexer), - ("iat", _iAtIndexer), - ] - - # "null slice" _NS = slice(None, None) @@ -94,6 +86,486 @@ class IndexingError(Exception): pass +class IndexingMixin: + """Mixin for adding .loc/.iloc/.at/.iat to Datafames and Series. + """ + + @property + def iloc(self) -> "_iLocIndexer": + """ + Purely integer-location based indexing for selection by position. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on some value. + + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. + + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. 
+ + Examples + -------- + + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. + + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> "_LocIndexer": + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label ` + + Raises + ------ + KeyError + If any items are not found. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. + + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. 
+ + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. + + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. + + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. 
+ + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> "_AtIndexer": + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> "_iAtIndexer": + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + class _NDFrameIndexer(_NDFrameIndexerBase): _valid_types: str axis = None @@ -1332,244 +1804,8 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): return self.obj.take(indexer, axis=axis) +@Appender(IndexingMixin.loc.__doc__) class _LocIndexer(_LocationIndexer): - """ - Access a group of rows and columns by label(s) or a boolean array. - - ``.loc[]`` is primarily label based, but may also be used with a - boolean array. - - Allowed inputs are: - - - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is - interpreted as a *label* of the index, and **never** as an - integer position along the index). - - A list or array of labels, e.g. ``['a', 'b', 'c']``. - - A slice object with labels, e.g. ``'a':'f'``. - - .. 
warning:: Note that contrary to usual python slices, **both** the
-          start and the stop are included
-
-    - A boolean array of the same length as the axis being sliced,
-      e.g. ``[True, False, True]``.
-    - A ``callable`` function with one argument (the calling Series or
-      DataFrame) and that returns valid output for indexing (one of the above)
-
-    See more at :ref:`Selection by Label <indexing.label>`
-
-    Raises
-    ------
-    KeyError
-        If any items are not found.
-
-    See Also
-    --------
-    DataFrame.at : Access a single value for a row/column label pair.
-    DataFrame.iloc : Access group of rows and columns by integer position(s).
-    DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the
-        Series/DataFrame.
-    Series.loc : Access group of values using labels.
-
-    Examples
-    --------
-    **Getting values**
-
-    >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
-    ...      index=['cobra', 'viper', 'sidewinder'],
-    ...      columns=['max_speed', 'shield'])
-    >>> df
-                max_speed  shield
-    cobra               1       2
-    viper               4       5
-    sidewinder          7       8
-
-    Single label. Note this returns the row as a Series.
-
-    >>> df.loc['viper']
-    max_speed    4
-    shield       5
-    Name: viper, dtype: int64
-
-    List of labels. Note using ``[[]]`` returns a DataFrame.
-
-    >>> df.loc[['viper', 'sidewinder']]
-                max_speed  shield
-    viper               4       5
-    sidewinder          7       8
-
-    Single label for row and column
-
-    >>> df.loc['cobra', 'shield']
-    2
-
-    Slice with labels for row and single label for column. As mentioned
-    above, note that both the start and stop of the slice are included.
-
-    >>> df.loc['cobra':'viper', 'max_speed']
-    cobra    1
-    viper    4
-    Name: max_speed, dtype: int64
-
-    Boolean list with the same length as the row axis
-
-    >>> df.loc[[False, False, True]]
-                max_speed  shield
-    sidewinder          7       8
-
-    Conditional that returns a boolean Series
-
-    >>> df.loc[df['shield'] > 6]
-                max_speed  shield
-    sidewinder          7       8
-
-    Conditional that returns a boolean Series with column labels specified
-
-    >>> df.loc[df['shield'] > 6, ['max_speed']]
-                max_speed
-    sidewinder          7
-
-    Callable that returns a boolean Series
-
-    >>> df.loc[lambda df: df['shield'] == 8]
-                max_speed  shield
-    sidewinder          7       8
-
-    **Setting values**
-
-    Set value for all items matching the list of labels
-
-    >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
-    >>> df
-                max_speed  shield
-    cobra               1       2
-    viper               4      50
-    sidewinder          7      50
-
-    Set value for an entire row
-
-    >>> df.loc['cobra'] = 10
-    >>> df
-                max_speed  shield
-    cobra              10      10
-    viper               4      50
-    sidewinder          7      50
-
-    Set value for an entire column
-
-    >>> df.loc[:, 'max_speed'] = 30
-    >>> df
-                max_speed  shield
-    cobra              30      10
-    viper              30      50
-    sidewinder         30      50
-
-    Set value for rows matching callable condition
-
-    >>> df.loc[df['shield'] > 35] = 0
-    >>> df
-                max_speed  shield
-    cobra              30      10
-    viper               0       0
-    sidewinder          0       0
-
-    **Getting values on a DataFrame with an index that has integer labels**
-
-    Another example using integers for the index
-
-    >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
-    ...      index=[7, 8, 9], columns=['max_speed', 'shield'])
-    >>> df
-       max_speed  shield
-    7          1       2
-    8          4       5
-    9          7       8
-
-    Slice with integer labels for rows. As mentioned above, note that both
-    the start and stop of the slice are included.
-
-    >>> df.loc[7:9]
-       max_speed  shield
-    7          1       2
-    8          4       5
-    9          7       8
-
-    **Getting values with a MultiIndex**
-
-    A number of examples using a DataFrame with a MultiIndex
-
-    >>> tuples = [
-    ...    ('cobra', 'mark i'), ('cobra', 'mark ii'),
-    ...    ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
-    ...    ('viper', 'mark ii'), ('viper', 'mark iii')
-    ... ]
-    >>> index = pd.MultiIndex.from_tuples(tuples)
-    >>> values = [[12, 2], [0, 4], [10, 20],
-    ...         [1, 4], [7, 1], [16, 36]]
-    >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)
-    >>> df
-                         max_speed  shield
-    cobra      mark i           12       2
-               mark ii           0       4
-    sidewinder mark i           10      20
-               mark ii           1       4
-    viper      mark ii           7       1
-               mark iii         16      36
-
-    Single label. Note this returns a DataFrame with a single index.
-
-    >>> df.loc['cobra']
-             max_speed  shield
-    mark i          12       2
-    mark ii          0       4
-
-    Single index tuple. Note this returns a Series.
-
-    >>> df.loc[('cobra', 'mark ii')]
-    max_speed    0
-    shield       4
-    Name: (cobra, mark ii), dtype: int64
-
-    Single label for row and column. Similar to passing in a tuple, this
-    returns a Series.
-
-    >>> df.loc['cobra', 'mark i']
-    max_speed    12
-    shield        2
-    Name: (cobra, mark i), dtype: int64
-
-    Single tuple. Note using ``[[]]`` returns a DataFrame.
-
-    >>> df.loc[[('cobra', 'mark ii')]]
-                   max_speed  shield
-    cobra mark ii          0       4
-
-    Single tuple for the index with a single label for the column
-
-    >>> df.loc[('cobra', 'mark i'), 'shield']
-    2
-
-    Slice from index tuple to single label
-
-    >>> df.loc[('cobra', 'mark i'):'viper']
-                         max_speed  shield
-    cobra      mark i           12       2
-               mark ii           0       4
-    sidewinder mark i           10      20
-               mark ii           1       4
-    viper      mark ii           7       1
-               mark iii         16      36
-
-    Slice from index tuple to index tuple
-
-    >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')]
-                        max_speed  shield
-    cobra      mark i          12       2
-               mark ii          0       4
-    sidewinder mark i          10      20
-               mark ii          1       4
-    viper      mark ii          7       1
-    """
-
     _valid_types = (
         "labels (MUST BE IN THE INDEX), slices of labels (BOTH "
         "endpoints included! Can be slices of integers if the "
@@ -1728,142 +1964,8 @@ def _getitem_axis(self, key, axis: int):
         return self._get_label(key, axis=axis)
 
 
+@Appender(IndexingMixin.iloc.__doc__)
 class _iLocIndexer(_LocationIndexer):
-    """
-    Purely integer-location based indexing for selection by position.
-
-    ``.iloc[]`` is primarily integer position based (from ``0`` to
-    ``length-1`` of the axis), but may also be used with a boolean
-    array.
-
-    Allowed inputs are:
-
-    - An integer, e.g. ``5``.
-    - A list or array of integers, e.g. ``[4, 3, 0]``.
-    - A slice object with ints, e.g. ``1:7``.
-    - A boolean array.
-    - A ``callable`` function with one argument (the calling Series or
-      DataFrame) and that returns valid output for indexing (one of the above).
-      This is useful in method chains, when you don't have a reference to the
-      calling object, but would like to base your selection on some value.
-
-    ``.iloc`` will raise ``IndexError`` if a requested indexer is
-    out-of-bounds, except *slice* indexers which allow out-of-bounds
-    indexing (this conforms with python/numpy *slice* semantics).
-
-    See more at :ref:`Selection by Position <indexing.integer>`.
-
-    See Also
-    --------
-    DataFrame.iat : Fast integer location scalar accessor.
-    DataFrame.loc : Purely label-location based indexer for selection by label.
-    Series.iloc : Purely integer-location based indexing for
-        selection by position.
-
-    Examples
-    --------
-
-    >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
-    ...           {'a': 100, 'b': 200, 'c': 300, 'd': 400},
-    ...           {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
-    >>> df = pd.DataFrame(mydict)
-    >>> df
-          a     b     c     d
-    0     1     2     3     4
-    1   100   200   300   400
-    2  1000  2000  3000  4000
-
-    **Indexing just the rows**
-
-    With a scalar integer.
-
-    >>> type(df.iloc[0])
-    <class 'pandas.core.series.Series'>
-    >>> df.iloc[0]
-    a    1
-    b    2
-    c    3
-    d    4
-    Name: 0, dtype: int64
-
-    With a list of integers.
-
-    >>> df.iloc[[0]]
-       a  b  c  d
-    0  1  2  3  4
-    >>> type(df.iloc[[0]])
-    <class 'pandas.core.frame.DataFrame'>
-
-    >>> df.iloc[[0, 1]]
-         a    b    c    d
-    0    1    2    3    4
-    1  100  200  300  400
-
-    With a `slice` object.
-
-    >>> df.iloc[:3]
-          a     b     c     d
-    0     1     2     3     4
-    1   100   200   300   400
-    2  1000  2000  3000  4000
-
-    With a boolean mask the same length as the index.
-
-    >>> df.iloc[[True, False, True]]
-          a     b     c     d
-    0     1     2     3     4
-    2  1000  2000  3000  4000
-
-    With a callable, useful in method chains. The `x` passed
-    to the ``lambda`` is the DataFrame being sliced. This selects
-    the rows whose index label is even.
-
-    >>> df.iloc[lambda x: x.index % 2 == 0]
-          a     b     c     d
-    0     1     2     3     4
-    2  1000  2000  3000  4000
-
-    **Indexing both axes**
-
-    You can mix the indexer types for the index and columns. Use ``:`` to
-    select the entire axis.
-
-    With scalar integers.
-
-    >>> df.iloc[0, 1]
-    2
-
-    With lists of integers.
-
-    >>> df.iloc[[0, 2], [1, 3]]
-          b     d
-    0     2     4
-    2  2000  4000
-
-    With `slice` objects.
-
-    >>> df.iloc[1:3, 0:3]
-          a     b     c
-    1   100   200   300
-    2  1000  2000  3000
-
-    With a boolean array whose length matches the columns.
-
-    >>> df.iloc[:, [True, False, True, False]]
-          a     c
-    0     1     3
-    1   100   300
-    2  1000  3000
-
-    With a callable function that expects the Series or DataFrame.
-
-    >>> df.iloc[:, lambda df: [0, 2]]
-          a     c
-    0     1     3
-    1   100   300
-    2  1000  3000
-    """
-
     _valid_types = (
         "integer, integer slice (START point is INCLUDED, END "
         "point is EXCLUDED), listlike of integers, boolean array"
@@ -2091,53 +2193,8 @@ def __setitem__(self, key, value):
 
         self.obj._set_value(*key, takeable=self._takeable)
 
 
+@Appender(IndexingMixin.at.__doc__)
 class _AtIndexer(_ScalarAccessIndexer):
-    """
-    Access a single value for a row/column label pair.
-
-    Similar to ``loc``, in that both provide label-based lookups. Use
-    ``at`` if you only need to get or set a single value in a DataFrame
-    or Series.
-
-    Raises
-    ------
-    KeyError
-        If 'label' does not exist in DataFrame.
-
-    See Also
-    --------
-    DataFrame.iat : Access a single value for a row/column pair by integer
-        position.
-    DataFrame.loc : Access a group of rows and columns by label(s).
-    Series.at : Access a single value using a label.
-
-    Examples
-    --------
-    >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
-    ...                   index=[4, 5, 6], columns=['A', 'B', 'C'])
-    >>> df
-        A   B   C
-    4   0   2   3
-    5   0   4   1
-    6  10  20  30
-
-    Get value at specified row/column pair
-
-    >>> df.at[4, 'B']
-    2
-
-    Set value at specified row/column pair
-
-    >>> df.at[4, 'B'] = 10
-    >>> df.at[4, 'B']
-    10
-
-    Get value within a Series
-
-    >>> df.loc[5].at['B']
-    4
-    """
-
     _takeable = False
 
     def _convert_key(self, key, is_setter: bool = False):
@@ -2166,52 +2223,8 @@ def _convert_key(self, key, is_setter: bool = False):
 
         return key
 
 
+@Appender(IndexingMixin.iat.__doc__)
 class _iAtIndexer(_ScalarAccessIndexer):
-    """
-    Access a single value for a row/column pair by integer position.
-
-    Similar to ``iloc``, in that both provide integer-based lookups. Use
-    ``iat`` if you only need to get or set a single value in a DataFrame
-    or Series.
-
-    Raises
-    ------
-    IndexError
-        When integer position is out of bounds.
-
-    See Also
-    --------
-    DataFrame.at : Access a single value for a row/column label pair.
-    DataFrame.loc : Access a group of rows and columns by label(s).
-    DataFrame.iloc : Access a group of rows and columns by integer position(s).
-
-    Examples
-    --------
-    >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
-    ...                   
columns=['A', 'B', 'C']) - >>> df - A B C - 0 0 2 3 - 1 0 4 1 - 2 10 20 30 - - Get value at specified row/column pair - - >>> df.iat[1, 2] - 1 - - Set value at specified row/column pair - - >>> df.iat[1, 2] = 10 - >>> df.iat[1, 2] - 10 - - Get value within a series - - >>> df.loc[0].iat[1] - 2 - """ - _takeable = True def _convert_key(self, key, is_setter: bool = False): @@ -2224,7 +2237,7 @@ def _convert_key(self, key, is_setter: bool = False): return key -def _tuplify(ndim: int, loc) -> tuple: +def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: """ Given an indexer for the first dimension, create an equivalent tuple for indexing over all dimensions. @@ -2238,9 +2251,10 @@ def _tuplify(ndim: int, loc) -> tuple: ------- tuple """ - tup = [slice(None, None) for _ in range(ndim)] - tup[0] = loc - return tuple(tup) + _tup: List[Union[Hashable, slice]] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) def convert_to_index_sliceable(obj, key): @@ -2308,13 +2322,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = np.asarray(result, dtype=bool) - - # GH26658 - if len(result) != len(index): - raise IndexError( - f"Item wrong length {len(result)} instead of {len(index)}." - ) + result = check_bool_array_indexer(index, result) return result diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 8ac0df2fa4e0a..37a3405554745 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,4 @@ -from .blocks import ( # noqa: F401 +from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, CategoricalBlock, @@ -10,19 +10,38 @@ IntBlock, ObjectBlock, TimeDeltaBlock, + _block_shape, + _safe_reshape, + make_block, ) -from .managers import ( # noqa: F401 +from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks, -) - -from .blocks import _safe_reshape # noqa: F401; io.packers -from .blocks import make_block # noqa: F401; io.pytables, io.packers -from .managers import ( # noqa: F401; reshape.concat, reshape.merge _transform_index, concatenate_block_managers, + create_block_manager_from_arrays, + create_block_manager_from_blocks, ) -from .blocks import _block_shape # noqa:F401; io.pytables +__all__ = [ + "Block", + "BoolBlock", + "CategoricalBlock", + "ComplexBlock", + "DatetimeBlock", + "DatetimeTZBlock", + "ExtensionBlock", + "FloatBlock", + "IntBlock", + "ObjectBlock", + "TimeDeltaBlock", + "_safe_reshape", + "make_block", + "_block_shape", + "BlockManager", + "SingleBlockManager", + "_transform_index", + "concatenate_block_managers", + "create_block_manager_from_arrays", + "create_block_manager_from_blocks", +] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 87c50503ecf11..b30e142293b66 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -380,7 +380,6 @@ def apply(self, func, **kwargs): return nbs if not isinstance(result, Block): - # Exclude the 0-dim case so we can do reductions result = self.make_block(values=_block_shape(result, ndim=self.ndim)) return result @@ -658,9 +657,9 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): if slicer is not None: values = values[:, slicer] mask = isna(values) + itemsize = writers.word_len(na_rep) - if not self.is_object and not quoting: - 
itemsize = writers.word_len(na_rep)
+        if not self.is_object and not quoting and itemsize:
             values = values.astype(f"<U{itemsize}")
         else:
             values = np.array(values, dtype="object")
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ ... @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
+        if isinstance(values, np.ndarray) and values.ndim > 1:
+            # GH#12513 an EA dtype passed with a 2D array, split into
+            #  multiple EAs that view the values
+            values = [values[:, n] for n in range(values.shape[1])]
+        else:
+            values = [values]
+
         if columns is None:
-            columns = [0]
-        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
+            columns = list(range(len(values)))
+        return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
 
     # by definition an array here
     # the dtypes will be coerced to a single dtype
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 9729f172183e7..066689b3e374e 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -34,10 +34,7 @@
 from pandas.core.base import PandasObject
 from pandas.core.indexers import maybe_convert_indices
 from pandas.core.indexes.api import Index, MultiIndex, ensure_index
-
-from pandas.io.formats.printing import pprint_thing
-
-from .blocks import (
+from pandas.core.internals.blocks import (
     Block,
     CategoricalBlock,
     DatetimeTZBlock,
@@ -49,13 +46,15 @@
     get_block_type,
     make_block,
 )
-from .concat import (  # all for concatenate_block_managers
+from pandas.core.internals.concat import (  # all for concatenate_block_managers
     combine_concat_plans,
     concatenate_join_units,
     get_mgr_concatenation_plan,
     is_uniform_join_units,
 )
 
+from pandas.io.formats.printing import pprint_thing
+
 # TODO: flexible with index=None and/or items=None
 
 
@@ -340,6 +339,32 @@ def _verify_integrity(self):
                 f"tot_items: {tot_items}"
             )
 
+    def reduce(self, func, *args, **kwargs):
+        # If 2D, we assume that we're operating column-wise
+        if self.ndim == 1:
+            # we'll be returning a scalar
+            blk = self.blocks[0]
+            return func(blk.values, *args, **kwargs)
+
+        res = {}
+        for blk in self.blocks:
+            bres = func(blk.values, *args, **kwargs)
+
+            if np.ndim(bres) == 0:
+                # EA
+                assert blk.shape[0] == 1
+                new_res = zip(blk.mgr_locs.as_array, [bres])
+            else:
+                assert bres.ndim == 1, bres.shape
+                assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs)
+                new_res = zip(blk.mgr_locs.as_array, bres)
+
+            nr = dict(new_res)
+            assert not any(key in res for key in nr)
+            res.update(nr)
+
+        return res
+
     def apply(self, f, filter=None, **kwargs):
         """
         Iterate over the blocks, collect and create a new BlockManager.
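
For illustration, the bookkeeping that the new ``BlockManager.reduce`` performs can be modeled with plain NumPy arrays: each block holds a 2D array whose rows correspond to frame columns, the reduction produces one value per block row, and ``mgr_locs`` maps those values back to the original column positions. The tuples below are illustrative stand-ins for the internal ``Block`` objects, not pandas API:

    import numpy as np

    # (values, locs): a 2D array per "block" plus the column positions
    # those rows occupy in the original frame.
    blocks = [
        (np.array([[1.0, 2.0], [3.0, 4.0]]), [0, 2]),  # float columns 0 and 2
        (np.array([[10, 20]]), [1]),                   # int column 1
    ]

    res = {}
    for values, locs in blocks:
        bres = values.sum(axis=1)          # one reduced value per block row
        assert len(bres) == len(locs)      # mirrors the shape assertions above
        res.update(zip(locs, bres))

    print([res[i] for i in sorted(res)])   # [3.0, 30, 7.0], in column order
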
@@ -712,16 +737,16 @@ def combine(self, blocks, copy=True):
 
         return type(self)(new_blocks, axes, do_integrity_check=False)
 
-    def get_slice(self, slobj, axis=0):
+    def get_slice(self, slobj: slice, axis: int = 0):
 
         if axis >= self.ndim:
             raise IndexError("Requested axis not found in manager")
 
         if axis == 0:
             new_blocks = self._slice_take_blocks_ax0(slobj)
         else:
-            slicer = [slice(None)] * (axis + 1)
-            slicer[axis] = slobj
-            slicer = tuple(slicer)
+            _slicer = [slice(None)] * (axis + 1)
+            _slicer[axis] = slobj
+            slicer = tuple(_slicer)
             new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
 
         new_axes = list(self.axes)
@@ -731,11 +756,11 @@ def get_slice(self, slobj, axis=0):
         bm._consolidate_inplace()
         return bm
 
-    def __contains__(self, item):
+    def __contains__(self, item) -> bool:
         return item in self.items
 
     @property
-    def nblocks(self):
+    def nblocks(self) -> int:
         return len(self.blocks)
 
     def copy(self, deep=True):
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 1079f516a4e40..6b03e76a1d691 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None):
             try:
                 result = getattr(values, meth)(axis, dtype=dtype_max)
                 result.fill(np.nan)
-            except (AttributeError, TypeError, ValueError, np.core._internal.AxisError):
+            except (AttributeError, TypeError, ValueError):
                 result = np.nan
         else:
             result = getattr(values, meth)(axis)
@@ -1243,8 +1243,14 @@ def nancorr(a, b, method="pearson", min_periods=None):
 
 def get_corr_func(method):
     if method in ["kendall", "spearman"]:
         from scipy.stats import kendalltau, spearmanr
+    elif method in ["pearson"]:
+        pass
     elif callable(method):
         return method
+    else:
+        raise ValueError(
+            f"Unknown method '{method}', expected one of 'kendall', 'spearman', 'pearson'"
+        )
 
     def _pearson(a, b):
         return np.corrcoef(a, b)[0, 1]
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
index be5e53eaa6721..f51d71d5507a0 100644
--- a/pandas/core/ops/__init__.py
+++ b/pandas/core/ops/__init__.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 from pandas._libs import Timedelta, Timestamp, lib
+from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op  # noqa:F401
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype
@@ -31,7 +32,6 @@
 )
 from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY  # noqa:F401
 from pandas.core.ops.common import unpack_zerodim_and_defer
-from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op  # noqa:F401
 from pandas.core.ops.dispatch import should_series_dispatch
 from pandas.core.ops.docstrings import (
     _arith_doc_FRAME,
@@ -302,7 +302,7 @@ def _get_op_name(op, special):
     """
     opname = op.__name__.strip("_")
     if special:
-        opname = "__{opname}__".format(opname=opname)
+        opname = f"__{opname}__"
     return opname
@@ -385,7 +385,7 @@ def column_op(a, b):
             return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))}
 
     elif isinstance(right, ABCSeries) and axis == "columns":
-        # We only get here if called via _combine_frame_series,
+        # We only get here if called via _combine_series_frame,
         # in which case we specifically want to operate row-by-row
         assert right.index.equals(left.columns)
 
@@ -603,9 +603,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=N
         result : DataFrame
     """
     if fill_value is not None:
-        raise NotImplementedError(
-            "fill_value {fill} not supported.".format(fill=fill_value)
-        )
+        raise NotImplementedError(f"fill_value {fill_value} not 
supported.") if axis is None: # default axis is columns @@ -661,15 +659,13 @@ def to_series(right): else: raise ValueError( "Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}".format( - req_shape=left.shape, given_shape=right.shape - ) + f"must be {left.shape}: given {right.shape}" ) elif right.ndim > 2: raise ValueError( "Unable to coerce to Series/DataFrame, dim " - "must be <= 2: {dim}".format(dim=right.shape) + f"must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -702,7 +698,11 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # Another DataFrame pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op - return self._combine_frame(other, pass_op, fill_value, level) + + left, right = self.align(other, join="outer", level=level, copy=False) + new_data = left._combine_frame(right, pass_op, fill_value) + return left._construct_result(new_data) + elif isinstance(other, ABCSeries): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. @@ -763,7 +763,7 @@ def _comp_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - @Appender("Wrapper for comparison method {name}".format(name=op_name)) + @Appender(f"Wrapper for comparison method {op_name}") def f(self, other): other = _align_method_FRAME(self, other, axis=None) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index e0ddd17335175..b84d468fff736 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -246,7 +246,7 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - op_name = "__{op}__".format(op=op.__name__) + op_name = f"__{op.__name__}__" method = getattr(lvalues, op_name) with np.errstate(all="ignore"): res_values = method(rvalues) @@ -254,9 +254,8 @@ def comparison_op( if res_values is NotImplemented: res_values = invalid_comparison(lvalues, rvalues, op) if is_scalar(res_values): - raise TypeError( - "Could not compare {typ} type with Series".format(typ=type(rvalues)) - ) + typ = type(rvalues) + raise TypeError(f"Could not compare {typ} type with Series") return res_values @@ -293,11 +292,10 @@ def na_logical_op(x: np.ndarray, y, op): OverflowError, NotImplementedError, ): + typ = type(y).__name__ raise TypeError( - "Cannot perform '{op}' with a dtyped [{dtype}] array " - "and scalar of type [{typ}]".format( - op=op.__name__, dtype=x.dtype, typ=type(y).__name__ - ) + f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " + f"and scalar of type [{typ}]" ) return result diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 1eb952c1394ac..61a3032c7a02c 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,12 +1,10 @@ """ Functions for defining unary operations. """ -from typing import Any, Callable, Union +from typing import Any, Union import numpy as np -from pandas._typing import ArrayLike - from pandas.core.dtypes.common import ( is_datetime64_dtype, is_extension_array_dtype, @@ -126,94 +124,3 @@ def dispatch_to_extension_op( # on the ExtensionArray res_values = op(left, right) return res_values - - -def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any -): - """ - Dispatch a ufunc to the equivalent dunder method. 
- - Parameters - ---------- - self : ArrayLike - The array whose dunder method we dispatch to - ufunc : Callable - A NumPy ufunc - method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} - inputs : ArrayLike - The input arrays. - kwargs : Any - The additional keyword arguments, e.g. ``out``. - - Returns - ------- - result : Any - The result of applying the ufunc - """ - # special has the ufuncs we dispatch to the dunder op on - special = { - "add", - "sub", - "mul", - "pow", - "mod", - "floordiv", - "truediv", - "divmod", - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "remainder", - "matmul", - "or", - "xor", - "and", - } - aliases = { - "subtract": "sub", - "multiply": "mul", - "floor_divide": "floordiv", - "true_divide": "truediv", - "power": "pow", - "remainder": "mod", - "divide": "div", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - "bitwise_or": "or", - "bitwise_and": "and", - "bitwise_xor": "xor", - } - - # For op(., Array) -> Array.__r{op}__ - flipped = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", - } - - op_name = ufunc.__name__ - op_name = aliases.get(op_name, op_name) - - def not_implemented(*args, **kwargs): - return NotImplemented - - if method == "__call__" and op_name in special and kwargs.get("out") is None: - if isinstance(inputs[0], type(self)): - name = "__{}__".format(op_name) - return getattr(self, name, not_implemented)(inputs[1]) - else: - name = flipped.get(op_name, "__r{}__".format(op_name)) - return getattr(self, name, not_implemented)(inputs[0]) - else: - return NotImplemented diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index 013ff7689b221..cc4a1f11edd2b 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -30,11 +30,8 @@ def invalid_comparison(left, right, op): elif op is operator.ne: res_values = np.ones(left.shape, dtype=bool) else: - raise TypeError( - "Invalid comparison between dtype={dtype} and {typ}".format( - dtype=left.dtype, typ=type(right).__name__ - ) - ) + typ = type(right).__name__ + raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}") return res_values @@ -52,10 +49,8 @@ def make_invalid_op(name: str): """ def invalid_op(self, other=None): - raise TypeError( - "cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__) - ) + typ = type(self).__name__ + raise TypeError(f"cannot perform {name} with this index type: {typ}") invalid_op.__name__ = name return invalid_op diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 8c66eea270c76..c04658565f235 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -102,7 +102,8 @@ def f(self, other): return self - f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) + name = method.__name__.strip("__") + f.__name__ = f"__i{name}__" return f new_methods.update( @@ -214,7 +215,7 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): ) if special: - dunderize = lambda x: "__{name}__".format(name=x.strip("_")) + dunderize = lambda x: f"__{x.strip('_')}__" else: dunderize = lambda x: x new_methods = {dunderize(k): v for k, v in new_methods.items()} diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 96a615d488bf2..5039ffab33fbd 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -27,7 +27,7 @@ from pandas.core.dtypes.common 
import is_float_dtype, is_integer_dtype, is_scalar -from .roperator import rdivmod, rfloordiv, rmod +from pandas.core.ops.roperator import rdivmod, rfloordiv, rmod def fill_zeros(result, x, y): diff --git a/pandas/core/ops/roperator.py b/pandas/core/ops/roperator.py index 4cb02238aea16..e6691ddf8984e 100644 --- a/pandas/core/ops/roperator.py +++ b/pandas/core/ops/roperator.py @@ -34,9 +34,8 @@ def rmod(left, right): # formatting operation; this is a TypeError # otherwise perform the op if isinstance(right, str): - raise TypeError( - "{typ} cannot perform the operation mod".format(typ=type(left).__name__) - ) + typ = type(left).__name__ + raise TypeError(f"{typ} cannot perform the operation mod") return right % left diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9ae0aa930779b..0e43880dfda07 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -96,7 +96,7 @@ def __str__(self) -> str: ) return f"{type(self).__name__} [{', '.join(attrs)}]" - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self._attributes: @@ -131,7 +131,7 @@ def ax(self): return self.groupby.ax @property - def _typ(self): + def _typ(self) -> str: """ Masquerade for compat as a Series or a DataFrame. """ @@ -140,7 +140,7 @@ def _typ(self): return "dataframe" @property - def _from_selection(self): + def _from_selection(self) -> bool: """ Is the resampling from a DataFrame column or MultiIndex level. """ @@ -316,7 +316,7 @@ def _downsample(self, f): def _upsample(self, f, limit=None, fill_value=None): raise AbstractMethodError(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim: int, subset=None): """ Sub-classes to define. Return a sliced object. @@ -1076,10 +1076,9 @@ def _upsample(self, method, limit=None, fill_value=None): raise AssertionError("axis must be 0") if self._from_selection: raise ValueError( - "Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like" + "Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to datetime-like" ) ax = self.ax @@ -1135,9 +1134,9 @@ def _convert_obj(self, obj): if self._from_selection: # see GH 14008, GH 12871 msg = ( - "Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index" + "Resampling from level= or on= selection " + "with a PeriodIndex is not currently supported, " + "use .set_index(...) 
to explicitly set index"
             )
             raise NotImplementedError(msg)
 
@@ -1407,7 +1406,7 @@ def _get_resampler(self, obj, kind=None):
             f"but got an instance of '{type(ax).__name__}'"
         )
 
-    def _get_grouper(self, obj, validate=True):
+    def _get_grouper(self, obj, validate: bool = True):
         # create the resampler and return our binner
         r = self._get_resampler(obj)
         r._set_binner()
@@ -1587,7 +1586,10 @@ def _get_period_bins(self, ax):
             rng += freq_mult
             # adjust bin edge indexes to account for base
             rng -= bin_shift
-        bins = memb.searchsorted(rng, side="left")
+
+        # Wrap in PeriodArray for PeriodArray.searchsorted
+        prng = type(memb._data)(rng, dtype=memb.dtype)
+        bins = memb.searchsorted(prng, side="left")
 
         if nat_count > 0:
             # NaT handling as in pandas._lib.lib.generate_bins_dt64()
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index cea70012b47ea..502b8d1941fdf 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -2,10 +2,12 @@
 concat routines
 """
 
-from typing import List
+from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload
 
 import numpy as np
 
+from pandas._typing import FrameOrSeriesUnion
+
 from pandas import DataFrame, Index, MultiIndex, Series
 from pandas.core.arrays.categorical import (
     factorize_from_iterable,
@@ -26,8 +28,27 @@
 # Concatenate DataFrame objects
 
 
+@overload
+def concat(
+    objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]],
+    axis=0,
+    join: str = "outer",
+    ignore_index: bool = False,
+    keys=None,
+    levels=None,
+    names=None,
+    verify_integrity: bool = False,
+    sort: bool = False,
+    copy: bool = True,
+) -> "DataFrame":
+    ...
+
+
+@overload
 def concat(
-    objs,
+    objs: Union[
+        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
+    ],
     axis=0,
     join: str = "outer",
     ignore_index: bool = False,
@@ -37,7 +58,24 @@ def concat(
     verify_integrity: bool = False,
     sort: bool = False,
     copy: bool = True,
-):
+) -> FrameOrSeriesUnion:
+    ...
+
+
+def concat(
+    objs: Union[
+        Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion]
+    ],
+    axis=0,
+    join="outer",
+    ignore_index: bool = False,
+    keys=None,
+    levels=None,
+    names=None,
+    verify_integrity: bool = False,
+    sort: bool = False,
+    copy: bool = True,
+) -> FrameOrSeriesUnion:
     """
     Concatenate pandas objects along a particular axis with optional
     set logic along the other axes.
@@ -109,7 +147,7 @@ def concat(
     A walkthrough of how this method fits in with other tools for combining
     pandas objects can be found `here
-    <http://pandas.pydata.org/pandas-docs/stable/merging.html>`__.
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
 
     
Examples -------- @@ -313,8 +351,8 @@ def __init__( for obj in objs: if not isinstance(obj, (Series, DataFrame)): msg = ( - "cannot concatenate object of type '{typ}';" - " only Series and DataFrame objs are valid".format(typ=type(obj)) + "cannot concatenate object of type '{typ}'; " + "only Series and DataFrame objs are valid".format(typ=type(obj)) ) raise TypeError(msg) @@ -364,8 +402,8 @@ def __init__( self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: raise AssertionError( - "axis must be between 0 and {ndim}, input was" - " {axis}".format(ndim=sample.ndim, axis=axis) + "axis must be between 0 and {ndim}, input was " + "{axis}".format(ndim=sample.ndim, axis=axis) ) # if we have mixed ndims, then convert to highest ndim @@ -472,17 +510,12 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self): + def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() - new_axes = [None] * ndim - - for i in range(ndim): - if i == self.axis: - continue - new_axes[i] = self._get_comb_axis(i) - - new_axes[self.axis] = self._get_concat_axis() - return new_axes + return [ + self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + for i in range(ndim) + ] def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) @@ -501,7 +534,7 @@ def _get_concat_axis(self) -> Index: idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names: List = [None] * len(self.objs) + names: List[Optional[Hashable]] = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): @@ -615,8 +648,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( - "Cannot concat indices that do" - " not have the same number of levels" + "Cannot concat indices that do " + "not have the same number of levels" ) # also copies diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 38bda94489d01..d4ccb19fc0dda 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,8 +51,8 @@ def melt( missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'id_vars' are not present" - " in the DataFrame: {missing}" + "The following 'id_vars' are not present " + "in the DataFrame: {missing}" "".format(missing=list(missing)) ) else: @@ -73,8 +73,8 @@ def melt( missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'value_vars' are not present in" - " the DataFrame: {missing}" + "The following 'value_vars' are not present in " + "the DataFrame: {missing}" "".format(missing=list(missing)) ) frame = frame.loc[:, id_vars + value_vars] @@ -192,7 +192,9 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): +def wide_to_long( + df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> DataFrame: r""" Wide panel to long format. Less flexible but more user-friendly than melt. 
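
The ``@overload`` stubs added to ``concat`` above encode, for type checkers, what the function already guarantees at runtime: an all-DataFrame input always yields a DataFrame, while the general signature may return either a Series or a DataFrame. A quick runtime illustration of that contract, using only public pandas API:

    import pandas as pd

    df1 = pd.DataFrame({"a": [1]})
    df2 = pd.DataFrame({"a": [2]})
    s = pd.Series([3], name="a")

    assert isinstance(pd.concat([df1, df2]), pd.DataFrame)  # DataFrame-only overload
    assert isinstance(pd.concat([s, s]), pd.Series)         # union overload: Series in, Series out
    assert isinstance(pd.concat([df1, s]), pd.DataFrame)    # mixing promotes to DataFrame
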
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 37ec05c40940e..5f92e4a88b568 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,6 +41,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -68,7 +69,7 @@ def merge( copy: bool = True, indicator: bool = False, validate=None, -): +) -> "DataFrame": op = _MergeOperation( left, right, @@ -113,6 +114,7 @@ def _groupby_and_merge( by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf @@ -132,7 +134,7 @@ def _groupby_and_merge( try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: @@ -183,7 +185,7 @@ def merge_ordered( fill_method=None, suffixes=("_x", "_y"), how: str = "outer", -): +) -> "DataFrame": """ Perform merge with optional filling/interpolation. @@ -317,7 +319,7 @@ def merge_asof( tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", -): +) -> "DataFrame": """ Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. @@ -1244,32 +1246,32 @@ def _validate(self, validate: str): if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: raise MergeError( - "Merge keys are not unique in either left" - " or right dataset; not a one-to-one merge" + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" ) elif not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-one merge" + "Merge keys are not unique in left dataset; " + "not a one-to-one merge" ) elif not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in right dataset; " + "not a one-to-one merge" ) elif validate in ["one_to_many", "1:m"]: if not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-many merge" + "Merge keys are not unique in left dataset; " + "not a one-to-many merge" ) elif validate in ["many_to_one", "m:1"]: if not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a many-to-one merge" + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" ) elif validate in ["many_to_many", "m:m"]: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 4b21045cd0217..b443ba142369c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Callable, Dict, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Union import numpy as np @@ -35,12 +35,12 @@ def pivot_table( dropna=True, margins_name="All", observed=False, -): +) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): - pieces = [] + pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( @@ -148,7 +148,7 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast="infer") + table = table._ensure_type(table.fillna(fill_value, downcast="infer")) if margins: if dropna: 
@@ -426,7 +426,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data: "DataFrame", index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": if values is None: cols = [columns] if index is None else [index, columns] append = index is None @@ -459,7 +459,7 @@ def crosstab( margins_name: str = "All", dropna: bool = True, normalize=False, -): +) -> "DataFrame": """ Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 004bd0199eb58..97f416e32d07b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,5 +1,6 @@ from functools import partial import itertools +from typing import List import numpy as np @@ -357,7 +358,7 @@ def _unstack_multiple(data, clocs, fill_value=None): result = data for i in range(len(clocs)): val = clocs[i] - result = result.unstack(val) + result = result.unstack(val, fill_value=fill_value) clocs = [v if i > v else v - 1 for v in clocs] return result @@ -755,7 +756,7 @@ def get_dummies( sparse=False, drop_first=False, dtype=None, -): +) -> "DataFrame": """ Convert categorical variable into dummy/indicator variables. @@ -899,7 +900,7 @@ def check_len(item, name): if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns - with_dummies = [] + with_dummies: List[DataFrame] = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index ceb4e3290ff75..2e3eb9170b15c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -4,7 +4,6 @@ import numpy as np from pandas._libs import Timedelta, Timestamp -from pandas._libs.interval import Interval from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( @@ -16,6 +15,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, + is_list_like, is_scalar, is_timedelta64_dtype, ) @@ -66,11 +66,12 @@ def cut( ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels : array or bool, optional + labels : array or False, default None Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). - This argument is ignored when `bins` is an IntervalIndex. + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -287,10 +288,10 @@ def qcut( q : int or list-like of int Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. - labels : array or bool, default None + labels : array or False, default None Used as labels for the resulting bins. Must be of the same length as the resulting bins. If False, return only integer indicators of the - bins. + bins. If True, raises an error. retbins : bool, optional Whether to return the (bins, labels) or not. Can be useful if bins is given as a scalar. 
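
The reworded ``labels`` documentation for ``cut``/``qcut`` (``None`` derives interval labels, ``False`` returns integer indicators, ``True`` is rejected) pairs with the explicit validation added to ``_bins_to_cuts`` just below. A short demonstration of the documented behavior:

    import pandas as pd

    ages = [6, 15, 40]
    print(pd.cut(ages, bins=[0, 18, 65]))                # labels=None -> interval labels
    print(pd.cut(ages, bins=[0, 18, 65], labels=False))  # -> [0 0 1]
    print(pd.cut(ages, bins=[0, 18, 65], labels=['minor', 'adult']))
    try:
        pd.cut(ages, bins=[0, 18, 65], labels=True)      # neither None, False nor list-like
    except ValueError as err:
        print(err)  # rejected by the new check in _bins_to_cuts
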
@@ -392,15 +393,23 @@ def _bins_to_cuts( has_nas = na_mask.any() if labels is not False: - if labels is None: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + elif labels is None: labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) + else: if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) + if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) @@ -516,17 +525,11 @@ def _format_labels( adjust = lambda x: x - 10 ** (-precision) breaks = [formatter(b) for b in bins] - labels = IntervalIndex.from_breaks(breaks, closed=closed) - if right and include_lowest: - # we will adjust the left hand side by precision to - # account that we are all right closed - v = adjust(labels[0].left) - - i = IntervalIndex([Interval(v, labels[0].right, closed="right")]) - labels = i.append(labels[1:]) + # adjust lhs of first interval by precision to account for being right closed + breaks[0] = adjust(breaks[0]) - return labels + return IntervalIndex.from_breaks(breaks, closed=closed) def _preprocess_for_cut(x): diff --git a/pandas/core/series.py b/pandas/core/series.py index 15fc712672717..ed338700f1011 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4,7 +4,18 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import IO, Any, Callable, Hashable, List, Optional +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Hashable, + Iterable, + List, + Optional, + Tuple, + Type, +) import warnings import numpy as np @@ -12,6 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs +from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -33,7 +45,6 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeArray, ABCDatetimeIndex, ABCSeries, ABCSparseArray, @@ -59,7 +70,7 @@ is_empty_data, sanitize_array, ) -from pandas.core.generic import _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -81,6 +92,9 @@ import pandas.io.formats.format as fmt import pandas.plotting +if TYPE_CHECKING: + from pandas.core.frame import DataFrame + __all__ = ["Series"] _shared_doc_kwargs = dict( @@ -160,7 +174,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _typ = "series" - _metadata: List[str] = [] + _name: Optional[Hashable] + _metadata: List[str] = ["name"] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations @@ -182,6 +197,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def __init__( self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False ): + # we are called internally, so short-circuit if fastpath: @@ -250,7 +266,7 @@ def __init__( else: data = data.reindex(index, copy=copy) data = data._data - elif isinstance(data, dict): + elif is_dict_like(data): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -355,11 +371,11 @@ def _init_dict(self, data, index=None, dtype=None): 
# ---------------------------------------------------------------------- @property - def _constructor(self): + def _constructor(self) -> Type["Series"]: return Series @property - def _constructor_expanddim(self): + def _constructor_expanddim(self) -> Type["DataFrame"]: from pandas.core.frame import DataFrame return DataFrame @@ -371,7 +387,7 @@ def _can_hold_na(self): _index = None - def _set_axis(self, axis, labels, fastpath=False): + def _set_axis(self, axis, labels, fastpath=False) -> None: """ Override generic, we want to set the _typ here. """ @@ -425,13 +441,13 @@ def dtypes(self): @property def name(self) -> Optional[Hashable]: - return self.attrs.get("name", None) + return self._name @name.setter def name(self, value: Optional[Hashable]) -> None: if not is_hashable(value): raise TypeError("Series.name must be a hashable type") - self.attrs["name"] = value + object.__setattr__(self, "_name", value) @property def values(self): @@ -516,7 +532,7 @@ def __len__(self) -> int: """ return len(self._data) - def view(self, dtype=None): + def view(self, dtype=None) -> "Series": """ Create a new view of the Series. @@ -669,7 +685,7 @@ def construct_return(result): else: return construct_return(result) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -716,21 +732,6 @@ def __array__(self, dtype=None): array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - if ( - dtype is None - and isinstance(self.array, ABCDatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- @@ -743,7 +744,7 @@ def __array__(self, dtype=None): # ---------------------------------------------------------------------- - def _unpickle_series_compat(self, state): + def _unpickle_series_compat(self, state) -> None: if isinstance(state, dict): self._data = state["_data"] self.name = state["name"] @@ -774,7 +775,7 @@ def _unpickle_series_compat(self, state): # indexers @property - def axes(self): + def axes(self) -> List[Index]: """ Return a list of the row axis labels. """ @@ -784,7 +785,7 @@ def axes(self): # Indexing Methods @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, is_copy=False, **kwargs): + def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -830,7 +831,7 @@ def _ixs(self, i: int, axis: int = 0): else: return values[i] - def _slice(self, slobj: slice, axis: int = 0, kind=None): + def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series": slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") return self._get_values(slobj) @@ -1114,7 +1115,7 @@ def _set_value(self, label, value, takeable: bool = False): def _is_mixed_type(self): return False - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> "Series": """ Repeat elements of a Series. 
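
With ``name`` now stored directly in the ``_name`` slot (and listed in ``_metadata``) rather than in ``attrs``, the public behavior is unchanged: the name must still be hashable. For example:

    import pandas as pd

    s = pd.Series([1, 2], name="speed")
    print(s.name)        # 'speed'
    s.name = ("a", "b")  # any hashable value is accepted
    try:
        s.name = ["a", "b"]  # lists are unhashable
    except TypeError as err:
        print(err)  # Series.name must be a hashable type
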
@@ -1431,15 +1432,15 @@ def to_string( """ ) @Substitution(klass="Series") - @Appender(_shared_docs["to_markdown"]) + @Appender(generic._shared_docs["to_markdown"]) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs ) -> Optional[str]: return self.to_frame().to_markdown(buf, mode, **kwargs) # ---------------------------------------------------------------------- - def items(self): + def items(self) -> Iterable[Tuple[Label, Any]]: """ Lazily iterate over (index, value) tuples. @@ -1469,13 +1470,13 @@ def items(self): return zip(iter(self.index), iter(self)) @Appender(items.__doc__) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Label, Any]]: return self.items() # ---------------------------------------------------------------------- # Misc public methods - def keys(self): + def keys(self) -> Index: """ Return alias for index. @@ -1521,7 +1522,7 @@ def to_dict(self, into=dict): into_c = com.standardize_mapping(into) return into_c(self.items()) - def to_frame(self, name=None): + def to_frame(self, name=None) -> "DataFrame": """ Convert Series to DataFrame. @@ -1553,7 +1554,7 @@ def to_frame(self, name=None): return df - def _set_name(self, name, inplace=False): + def _set_name(self, name, inplace=False) -> "Series": """ Set the Series name. @@ -1568,6 +1569,89 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... 
['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods @@ -1612,7 +1696,7 @@ def count(self, level=None): out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype="int64").__finalize__(self) - def mode(self, dropna=True): + def mode(self, dropna=True) -> "Series": """ Return the mode(s) of the dataset. @@ -1697,7 +1781,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep="first", inplace=False): + def drop_duplicates(self, keep="first", inplace=False) -> "Series": """ Return Series with duplicate values removed. @@ -1774,7 +1858,7 @@ def drop_duplicates(self, keep="first", inplace=False): """ return super().drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> "Series": """ Indicate duplicate Series values. @@ -1993,7 +2077,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): return np.nan return self.index[i] - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "Series": """ Round each value in a Series to the given number of decimals. @@ -2088,7 +2172,7 @@ def quantile(self, q=0.5, interpolation="linear"): # scalar return result.iloc[0] - def corr(self, other, method="pearson", min_periods=None): + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute correlation with `other` Series, excluding missing values. @@ -2141,7 +2225,7 @@ def corr(self, other, method="pearson", min_periods=None): f"'{method}' was supplied" ) - def cov(self, other, min_periods=None): + def cov(self, other, min_periods=None) -> float: """ Compute covariance with Series, excluding missing values. @@ -2170,7 +2254,7 @@ def cov(self, other, min_periods=None): return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) - def diff(self, periods=1): + def diff(self, periods=1) -> "Series": """ First discrete difference of element. @@ -2234,7 +2318,7 @@ def diff(self, periods=1): result = algorithms.diff(com.values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) - def autocorr(self, lag=1): + def autocorr(self, lag=1) -> float: """ Compute the lag-N autocorrelation. 
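
``Series.groupby`` is now implemented on ``Series`` itself, returning a ``SeriesGroupBy`` and validating up front that one of ``by`` or ``level`` is supplied. A small usage check:

    import pandas as pd

    ser = pd.Series([390.0, 350.0, 30.0, 20.0],
                    index=["Falcon", "Falcon", "Parrot", "Parrot"],
                    name="Max Speed")
    print(ser.groupby(level=0).mean())  # Falcon 370.0, Parrot 25.0
    try:
        ser.groupby()                   # neither `by` nor `level` supplied
    except TypeError as err:
        print(err)  # You have to supply one of 'by' and 'level'
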
@@ -2377,7 +2461,7 @@ def searchsorted(self, value, side="left", sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, ignore_index=False, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False) -> "Series": """ Concatenate two or more Series. @@ -2454,8 +2538,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat.extend(to_append) else: to_concat = [self, to_append] - return concat( - to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + return self._ensure_type( + concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) ) def _binop(self, other, func, level=None, fill_value=None): @@ -2497,7 +2583,7 @@ def _binop(self, other, func, level=None, fill_value=None): ret = ops._construct_result(self, result, new_index, name) return ret - def combine(self, other, func, fill_value=None): + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2594,7 +2680,7 @@ def combine(self, other, func, fill_value=None): new_values = try_cast_to_ea(self._values, new_values) return self._constructor(new_values, index=new_index, name=new_name) - def combine_first(self, other): + def combine_first(self, other) -> "Series": """ Combine Series values, choosing the calling Series's values first. @@ -2634,7 +2720,7 @@ def combine_first(self, other): return this.where(notna(this), other) - def update(self, other): + def update(self, other) -> None: """ Modify Series in place using non-NA values from passed Series. Aligns on index. @@ -2693,10 +2779,10 @@ def sort_values( self, axis=0, ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, ): """ Sort by the values. @@ -2880,6 +2966,7 @@ def sort_index( kind="quicksort", na_position="last", sort_remaining=True, + ignore_index: bool = False, ): """ Sort Series by index labels. @@ -2908,6 +2995,10 @@ def sort_index( sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -3035,19 +3126,22 @@ def sort_index( new_values = self._values.take(indexer) result = self._constructor(new_values, index=new_index) + if ignore_index: + result.index = ibase.default_index(len(result)) + if inplace: self._update_inplace(result) else: return result.__finalize__(self) - def argsort(self, axis=0, kind="quicksort", order=None): + def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. Parameters ---------- - axis : int + axis : {0 or "index"} Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. 
See np.sort for more @@ -3078,7 +3172,7 @@ def argsort(self, axis=0, kind="quicksort", order=None): np.argsort(values, kind=kind), index=self.index, dtype="int64" ).__finalize__(self) - def nlargest(self, n=5, keep="first"): + def nlargest(self, n=5, keep="first") -> "Series": """ Return the largest `n` elements. @@ -3176,7 +3270,7 @@ def nlargest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest(self, n=5, keep="first"): + def nsmallest(self, n=5, keep="first") -> "Series": """ Return the smallest `n` elements. @@ -3273,7 +3367,7 @@ def nsmallest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() - def swaplevel(self, i=-2, j=-1, copy=True): + def swaplevel(self, i=-2, j=-1, copy=True) -> "Series": """ Swap levels i and j in a :class:`MultiIndex`. @@ -3296,7 +3390,7 @@ def swaplevel(self, i=-2, j=-1, copy=True): self ) - def reorder_levels(self, order): + def reorder_levels(self, order) -> "Series": """ Rearrange index levels using input order. @@ -3420,7 +3514,7 @@ def unstack(self, level=-1, fill_value=None): # ---------------------------------------------------------------------- # function application - def map(self, arg, na_action=None): + def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -3430,7 +3524,7 @@ def map(self, arg, na_action=None): Parameters ---------- - arg : function, dict, or Series + arg : function, collections.abc.Mapping subclass or Series Mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the @@ -3498,7 +3592,7 @@ def map(self, arg, na_action=None): new_values = super()._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim, subset=None) -> "Series": """ Sub-classes to define. Return a sliced object. @@ -3816,7 +3910,16 @@ def align( broadcast_axis=broadcast_axis, ) - def rename(self, index=None, **kwargs): + def rename( + self, + index=None, + *, + axis=None, + copy=True, + inplace=False, + level=None, + errors="ignore", + ): """ Alter Series index labels or name. @@ -3830,6 +3933,8 @@ def rename(self, index=None, **kwargs): Parameters ---------- + axis : {0 or "index"} + Unused. Accepted for compatability with DataFrame method only. index : scalar, hashable sequence, dict-like or function, optional Functions or dict-like are transformations to apply to the index. @@ -3847,6 +3952,7 @@ def rename(self, index=None, **kwargs): See Also -------- + DataFrame.rename : Corresponding DataFrame method. Series.rename_axis : Set the name of the axis. Examples @@ -3873,12 +3979,12 @@ def rename(self, index=None, **kwargs): 5 3 dtype: int64 """ - kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - if callable(index) or is_dict_like(index): - return super().rename(index=index, **kwargs) + return super().rename( + index, copy=copy, inplace=inplace, level=level, errors=errors + ) else: - return self._set_name(index, inplace=kwargs.get("inplace")) + return self._set_name(index, inplace=inplace) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) @@ -3894,7 +4000,7 @@ def drop( level=None, inplace=False, errors="raise", - ): + ) -> "Series": """ Return Series with specified index labels removed. 
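Not part of the patch: a short usage sketch of the reworked Series.rename a few hunks above. Dict-like or callable arguments are treated as index transformations and routed to super().rename; anything else is treated as a scalar name and routed to _set_name, with the remaining options now keyword-only:

import pandas as pd

s = pd.Series([1, 2, 3])
s.rename("my_name")           # scalar: only the Series name changes
s.rename(lambda x: x ** 2)    # callable: applied to each index label
s.rename({1: 3, 2: 5})        # dict-like: index labels remapped
s.rename(str, inplace=False)  # other options must now be passed by keyword
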
@@ -4005,8 +4111,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["Series"]: return super().fillna( value=value, method=method, @@ -4014,7 +4119,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) @@ -4037,7 +4141,7 @@ def replace( ) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4096,7 +4200,7 @@ def memory_usage(self, index=True, deep=False): v += self.index.memory_usage(deep=deep) return v - def isin(self, values): + def isin(self, values) -> "Series": """ Check whether `values` are contained in Series. @@ -4152,7 +4256,7 @@ def isin(self, values): result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__(self) - def between(self, left, right, inclusive=True): + def between(self, left, right, inclusive=True) -> "Series": """ Return boolean Series equivalent to left <= series <= right. @@ -4228,19 +4332,19 @@ def between(self, left, right, inclusive=True): return lmask & rmask @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "Series": return super().isna() @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "Series": return super().isnull() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "Series": return super().notna() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "Series": return super().notnull() def dropna(self, axis=0, inplace=False, how=None): @@ -4334,7 +4438,7 @@ def dropna(self, axis=0, inplace=False, how=None): # ---------------------------------------------------------------------- # Time series-oriented methods - def to_timestamp(self, freq=None, how="start", copy=True): + def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. @@ -4359,7 +4463,7 @@ def to_timestamp(self, freq=None, how="start", copy=True): new_index = self.index.to_timestamp(freq=freq, how=how) return self._constructor(new_values, index=new_index).__finalize__(self) - def to_period(self, freq=None, copy=True): + def to_period(self, freq=None, copy=True) -> "Series": """ Convert Series from DatetimeIndex to PeriodIndex with desired frequency (inferred from index if not passed). @@ -4397,9 +4501,7 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes( - ["index"], docs={"index": "The index (axis labels) of the Series."}, -) +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) Series._add_numeric_operations() Series._add_series_or_dataframe_operations() diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 02f4eb47ba914..f8d9eeb211a1e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -438,8 +438,8 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): if regex.groups > 0: warnings.warn( - "This pattern has match groups. To actually get the" - " groups, use str.extract.", + "This pattern has match groups. 
To actually get the " + "groups, use str.extract.", UserWarning, stacklevel=3, ) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f193865d90b71..cfa42d764ee44 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -38,6 +38,7 @@ ) from pandas.core.dtypes.missing import notna +from pandas.arrays import IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique @@ -316,8 +317,21 @@ def _convert_listlike_datetimes( elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, "values", arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + arg = getattr(arg, "_values", arg) + + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + # Explicitly pass NaT mask to array_with_unit_to_datetime + mask = arg.isna() + arg = arg._ndarray_values + else: + mask = None + + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, mask, unit, errors=errors + ) + if errors == "ignore": from pandas import Index @@ -631,7 +645,7 @@ def to_datetime( dtype: datetime64[ns] If a date does not meet the `timestamp limitations - `_, passing errors='ignore' will return the original input instead of raising any exception. diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fa3582755a202..43655fa3ea913 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -85,11 +85,12 @@ def hash_pandas_object( if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) - if isinstance(obj, ABCIndexClass): + elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) + elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 5b467b03c1fc2..64ec0e68e11b0 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -105,7 +105,7 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries) ): - X, Y = _prep_binary(arg1, arg2) + X, Y = prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): @@ -152,7 +152,7 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = results[j][i] else: results[i][j] = f( - *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) ) from pandas import concat @@ -213,7 +213,7 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'pairwise' is not True/False") else: results = { - i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + i: f(*prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) @@ -250,31 +250,10 @@ def _get_center_of_mass(comass, span, halflife, alpha): return float(comass) -def _offset(window, center): +def calculate_center_offset(window): if not is_integer(window): window = len(window) - offset = (window - 1) / 2.0 if center else 0 - try: - return int(offset) - except TypeError: - return offset.astype(int) - - -def _require_min_periods(p): - def _check_func(minp, window): - if minp is None: - return 
window - else: - return max(p, minp) - - return _check_func - - -def _use_window(minp, window): - if minp is None: - return window - else: - return minp + return int((window - 1) / 2.0) def calculate_min_periods( @@ -312,7 +291,7 @@ def calculate_min_periods( return max(min_periods, floor) -def _zsqrt(x): +def zsqrt(x): with np.errstate(all="ignore"): result = np.sqrt(x) mask = x < 0 @@ -327,7 +306,7 @@ def _zsqrt(x): return result -def _prep_binary(arg1, arg2): +def prep_binary(arg1, arg2): if not isinstance(arg2, type(arg1)): raise Exception("Input arrays must be of the same type!") @@ -336,3 +315,12 @@ def _prep_binary(arg1, arg2): Y = arg2 + 0 * arg1 return X, Y + + +def get_weighted_roll_func(cfunc: Callable) -> Callable: + def func(arg, window, min_periods=None): + if min_periods is None: + min_periods = len(window) + return cfunc(arg, window, min_periods) + + return func diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index baecba7e78384..37e3cd42f2115 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -9,8 +9,13 @@ from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.base import DataError -from pandas.core.window.common import _doc_template, _get_center_of_mass, _shared_docs -from pandas.core.window.rolling import _flex_binary_moment, _Rolling, _zsqrt +from pandas.core.window.common import ( + _doc_template, + _get_center_of_mass, + _shared_docs, + zsqrt, +) +from pandas.core.window.rolling import _flex_binary_moment, _Rolling _bias_template = """ Parameters @@ -89,7 +94,7 @@ class EWM(_Rolling): (if adjust is True), and 1-alpha and alpha (if adjust is False). More details can be found at - http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows Examples -------- @@ -269,7 +274,7 @@ def std(self, bias=False, *args, **kwargs): Exponential weighted moving stddev. """ nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(bias=bias, **kwargs)) + return zsqrt(self.var(bias=bias, **kwargs)) vol = std @@ -314,7 +319,7 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): inputs. In the case of missing elements, only complete pairwise observations will be used. bias : bool, default False - Use a standard estimation bias correction + Use a standard estimation bias correction. **kwargs Keyword arguments to be passed into func. 
""" @@ -390,7 +395,7 @@ def _cov(x, y): cov = _cov(x_values, y_values) x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) + corr = cov / zsqrt(x_var * y_var) return X._wrap_result(corr) return _flex_binary_moment( diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0fa24a0ba1b5a..921cdb3c2523f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -32,7 +32,7 @@ class BaseIndexer: - """Base class for window bounds calculations""" + """Base class for window bounds calculations.""" def __init__( self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5b0fbbb3518d2..f612826132fd7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -24,7 +24,6 @@ is_integer_dtype, is_list_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ( @@ -43,11 +42,11 @@ WindowGroupByMixin, _doc_template, _flex_binary_moment, - _offset, _shared_docs, - _use_window, - _zsqrt, + calculate_center_offset, calculate_min_periods, + get_weighted_roll_func, + zsqrt, ) from pandas.core.window.indexers import ( BaseIndexer, @@ -184,7 +183,7 @@ def _gotitem(self, key, ndim, subset=None): self._selection = key return self - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: @@ -252,19 +251,6 @@ def __iter__(self): url = "https://github.com/pandas-dev/pandas/issues/11704" raise NotImplementedError(f"See issue #11704 {url}") - def _get_index(self) -> Optional[np.ndarray]: - """ - Return integer representations as an ndarray if index is frequency. - - Returns - ------- - None or ndarray - """ - - if self.is_freq_type: - return self._on.asi8 - return None - def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: """Convert input to numpy arrays for Cython routines""" if values is None: @@ -305,17 +291,6 @@ def _wrap_result(self, result, block=None, obj=None): if isinstance(result, np.ndarray): - # coerce if necessary - if block is not None: - if is_timedelta64_dtype(block.values.dtype): - # TODO: do we know what result.dtype is at this point? - # i.e. can we just do an astype? - from pandas import to_timedelta - - result = to_timedelta(result.ravel(), unit="ns").values.reshape( - result.shape - ) - if result.ndim == 1: from pandas import Series @@ -384,14 +359,11 @@ def _center_window(self, result, window) -> np.ndarray: if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. 
of argument dimensions") - offset = _offset(window, True) + offset = calculate_center_offset(window) if offset > 0: - if isinstance(result, (ABCSeries, ABCDataFrame)): - result = result.slice_shift(-offset, axis=self.axis) - else: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) return result def _get_roll_func(self, func_name: str) -> Callable: @@ -424,17 +396,15 @@ def _get_cython_func_type(self, func: str) -> Callable: return self._get_roll_func(f"{func}_variable") return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) - def _get_window_indexer( - self, index_as_array: Optional[np.ndarray], window: int - ) -> BaseIndexer: + def _get_window_indexer(self, window: int) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return VariableWindowIndexer(index_array=index_as_array, window_size=window) - return FixedWindowIndexer(index_array=index_as_array, window_size=window) + return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) + return FixedWindowIndexer(window_size=window) def _apply( self, @@ -476,8 +446,7 @@ def _apply( blocks, obj = self._create_blocks() block_list = list(blocks) - index_as_array = self._get_index() - window_indexer = self._get_window_indexer(index_as_array, window) + window_indexer = self._get_window_indexer(window) results = [] exclude: List[Scalar] = [] @@ -498,7 +467,7 @@ def _apply( continue # calculation function - offset = _offset(window, center) if center else 0 + offset = calculate_center_offset(window) if center else 0 additional_nans = np.array([np.nan] * offset) if not is_weighted: @@ -856,7 +825,7 @@ class Window(_Window): changed to the center of the window by setting ``center=True``. To learn more about the offsets & frequency strings, please see `this link - `__. + `__. The recognized win_types are: @@ -903,6 +872,17 @@ class Window(_Window): 3 NaN 4 NaN + Rolling sum with a window length of 2, using the 'gaussian' + window type (note how we need to specify std). + + >>> df.rolling(2, win_type='gaussian').sum(std=3) + B + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN + Rolling sum with a window length of 2, min_periods defaults to the window length. @@ -1051,15 +1031,6 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. 
return sig.get_window(win_type, window, False).astype(float) - def _get_weighted_roll_func( - self, cfunc: Callable, check_minp: Callable, **kwargs - ) -> Callable: - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, len(window)) - return cfunc(arg, window, minp, **kwargs) - - return func - _agg_see_also_doc = dedent( """ See Also @@ -1127,7 +1098,7 @@ def aggregate(self, func, *args, **kwargs): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_weighted_sum") - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) return self._apply( window_func, center=self.center, is_weighted=True, name="sum", **kwargs ) @@ -1137,7 +1108,7 @@ def sum(self, *args, **kwargs): def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = self._get_roll_func("roll_weighted_mean") - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) return self._apply( window_func, center=self.center, is_weighted=True, name="mean", **kwargs ) @@ -1147,7 +1118,7 @@ def mean(self, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) kwargs.pop("name", None) return self._apply( window_func, center=self.center, is_weighted=True, name="var", **kwargs @@ -1157,7 +1128,7 @@ def var(self, ddof=1, *args, **kwargs): @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(ddof=ddof, name="std", **kwargs)) + return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class _Rolling(_Window): @@ -1211,8 +1182,6 @@ class _Rolling_and_Expanding(_Rolling): def count(self): blocks, obj = self._create_blocks() - # Validate the index - self._get_index() window = self._get_window() window = min(window, len(obj)) if not self.center else window @@ -1307,7 +1276,7 @@ def apply( kwargs.pop("_level", None) kwargs.pop("floor", None) window = self._get_window() - offset = _offset(window, self.center) + offset = calculate_center_offset(window) if self.center else 0 if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") @@ -1478,7 +1447,7 @@ def std(self, ddof=1, *args, **kwargs): window_func = self._get_cython_func_type("roll_var") def zsqrt_func(values, begin, end, min_periods): - return _zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) # ddof passed again for compat with groupby.rolling return self._apply( diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 518b940ec5da3..34e8e03d8771e 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -69,8 +69,8 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover kwargs["engine"] = "python" elif len(sep) > 1 and kwargs.get("engine") == "c": warnings.warn( - "read_clipboard with regex separator does not work" - " properly with c engine" + "read_clipboard with regex separator does not work " + "properly with c engine" ) return read_csv(StringIO(text), sep=sep, **kwargs) diff --git a/pandas/io/common.py b/pandas/io/common.py index 43cd7d81ae4cd..771a302d647ec 100644 --- 
a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,25 +1,13 @@ """Common IO api utilities""" import bz2 -import codecs from collections import abc import gzip from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import ( - IO, - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - Tuple, - Union, -) +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union from urllib.parse import ( # noqa urlencode, urljoin, @@ -538,24 +526,3 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline - - -class UTF8Recoder(abc.Iterator): - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ - - def __init__(self, f: BinaryIO, encoding: str): - self.reader = codecs.getreader(encoding)(f) - - def read(self, bytes: int = -1) -> bytes: - return self.reader.read(bytes).encode("utf-8") - - def readline(self) -> bytes: - return self.reader.readline().encode("utf-8") - - def __next__(self) -> bytes: - return next(self.reader).encode("utf-8") - - def close(self): - self.reader.close() diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 553334407d12e..04015a08bce2f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -40,7 +40,7 @@ Parameters ---------- -io : str, ExcelFile, xlrd.Book, path object or file-like object +io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.xlsx``. @@ -297,9 +297,7 @@ def read_excel( for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError( - "read_excel() got an unexpected keyword argument `{}`".format(arg) - ) + raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`") if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -352,6 +350,8 @@ def __init__(self, filepath_or_buffer): self.book = self.load_workbook(filepath_or_buffer) elif isinstance(filepath_or_buffer, str): self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -429,7 +429,7 @@ def parse( for asheetname in sheets: if verbose: - print("Reading sheet {sheet}".format(sheet=asheetname)) + print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) @@ -528,8 +528,10 @@ def parse( class ExcelWriter(metaclass=abc.ABCMeta): """ - Class for writing DataFrame objects into excel sheets, default is to use - xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage. + Class for writing DataFrame objects into excel sheets. + + Default is to use xlwt for xls, openpyxl for xlsx. + See DataFrame.to_excel for typical usage. Parameters ---------- @@ -543,7 +545,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None Format string for datetime objects written into Excel files. - (e.g. 'YYYY-MM-DD HH:MM:SS') + (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' File mode to use (write or append). 
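Not part of the patch: the new bytes branch in ExcelFile above means raw bytes now work as `io`, matching the updated signature docs. A minimal sketch, assuming a local data.xlsx exists:

import pandas as pd

with open("data.xlsx", "rb") as f:
    content = f.read()

# The bytes are wrapped in BytesIO and handed to the engine's load_workbook.
df = pd.read_excel(content)
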
@@ -622,11 +624,11 @@ def __new__(cls, path, engine=None, **kwargs): ext = "xlsx" try: - engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) + raise ValueError(f"No engine for filetype: '{ext}'") cls = get_writer(engine) return object.__new__(cls) @@ -757,9 +759,8 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine '{engine}': '{ext}'".format( - engine=pprint_thing(cls.engine), ext=pprint_thing(ext) - ) + msg = "Invalid extension for engine" + f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" raise ValueError(msg) else: return True @@ -802,7 +803,7 @@ def __init__(self, io, engine=None): if engine is None: engine = "xlrd" if engine not in self._engines: - raise ValueError("Unknown engine: {engine}".format(engine=engine)) + raise ValueError(f"Unknown engine: {engine}") self.engine = engine # could be a str, ExcelFile, Book, etc. diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6b9943136664a..ec5f6fcb17ff8 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -156,7 +156,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if cell_value == 0.0 and str(cell) != cell_value: # NA handling + if cell_value == 0.0: # NA handling return str(cell) if convert_float: @@ -178,4 +178,4 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError("Unrecognized type {}".format(cell_type)) + raise ValueError(f"Unrecognized type {cell_type}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7a264ed2b0850..be52523e486af 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -99,7 +99,7 @@ def _convert_to_style_kwargs(cls, style_dict): for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) + _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index ee617d2013136..a084be54dfa10 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -48,7 +48,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) + raise ValueError(f"No Excel writer '{engine_name}'") def _excel2num(x): @@ -76,7 +76,7 @@ def _excel2num(x): cp = ord(c) if cp < ord("A") or cp > ord("Z"): - raise ValueError("Invalid column name: {x}".format(x=x)) + raise ValueError(f"Invalid column name: {x}") index = index * 26 + cp - ord("A") + 1 @@ -154,8 +154,8 @@ def _validate_freeze_panes(freeze_panes): return True raise ValueError( - "freeze_panes must be of form (row, column)" - " where row and column are integers" + "freeze_panes must be of form (row, column) " + "where row and column are integers" ) # freeze_panes wasn't specified, return False so it won't be applied diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 996ae1caa14c8..d102a885cef0a 100644 
--- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -97,20 +97,20 @@ def _style_to_xlwt( if hasattr(item, "items"): if firstlevel: it = [ - "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(line_sep).join(it)) + out = f"{(line_sep).join(it)} " return out else: it = [ - "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(field_sep).join(it)) + out = f"{(field_sep).join(it)} " return out else: - item = "{item}".format(item=item) + item = f"{item}" item = item.replace("True", "on") item = item.replace("False", "off") return item diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 18340bc702378..9b0f100c1b041 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -4,6 +4,7 @@ from functools import reduce import itertools import re +from typing import Callable, Dict, List, Optional, Sequence, Union import warnings import numpy as np @@ -25,7 +26,9 @@ class ExcelCell: __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") __slots__ = __fields__ - def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None): + def __init__( + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + ): self.row = row self.col = col self.val = val @@ -56,7 +59,7 @@ class CSSToExcelConverter: # instancemethods so that users can easily experiment with extensions # without monkey-patching. - def __init__(self, inherited=None): + def __init__(self, inherited: Optional[str] = None): if inherited is not None: inherited = self.compute_css(inherited) @@ -64,7 +67,7 @@ def __init__(self, inherited=None): compute_css = CSSResolver() - def __call__(self, declarations_str: str): + def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: """ Convert CSS declarations to ExcelWriter style. 
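Before the next hunk continues __call__, a rough sketch of the contract the new annotations describe. Not part of the patch, and the nested keys are abbreviated, so treat the commented output as approximate:

from pandas.io.formats.excel import CSSToExcelConverter

converter = CSSToExcelConverter()
style = converter("font-weight: bold; text-align: center")
# Approximately:
# {"alignment": {"horizontal": "center", ...},
#  "font": {"bold": True, ...},
#  "border": {...}, "fill": {...}, "number_format": {...}}
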
@@ -84,7 +87,7 @@ def __call__(self, declarations_str: str): properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props): + def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -95,7 +98,7 @@ def build_xlstyle(self, props): # TODO: handle cell width and height: needs support in pandas.io.excel - def remove_none(d): + def remove_none(d: Dict[str, str]) -> None: """Remove key where value is None, through nested dicts""" for k, v in list(d.items()): if v is None: @@ -118,7 +121,7 @@ def remove_none(d): # OpenXML also has 'justify', 'distributed' } - def build_alignment(self, props): + def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), @@ -130,7 +133,7 @@ def build_alignment(self, props): ), } - def build_border(self, props): + def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: return { side: { "style": self._border_style( @@ -142,7 +145,7 @@ def build_border(self, props): for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style, width): + def _border_style(self, style: Optional[str], width): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -191,7 +194,7 @@ def _border_style(self, style, width): return "dashed" return "mediumDashed" - def build_fill(self, props): + def build_fill(self, props: Dict[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") @@ -215,7 +218,7 @@ def build_fill(self, props): } ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} - def build_font(self, props): + def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: size = props.get("font-size") if size is not None: assert size.endswith("pt") @@ -311,7 +314,7 @@ def build_font(self, props): "white": "FFFFFF", } - def color_to_excel(self, val): + def color_to_excel(self, val: Optional[str]): if val is None: return None if val.startswith("#") and len(val) == 7: @@ -323,7 +326,7 @@ def color_to_excel(self, val): except KeyError: warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) - def build_number_format(self, props): + def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: return {"format_code": props.get("number-format")} @@ -366,15 +369,15 @@ class ExcelFormatter: def __init__( self, df, - na_rep="", - float_format=None, - cols=None, - header=True, - index=True, - index_label=None, - merge_cells=False, - inf_rep="inf", - style_converter=None, + na_rep: str = "", + float_format: Optional[str] = None, + cols: Optional[Sequence] = None, + header: Union[bool, List[str]] = True, + index: bool = True, + index_label: Union[str, Sequence, None] = None, + merge_cells: bool = False, + inf_rep: str = "inf", + style_converter: Optional[Callable] = None, ): self.rowcounter = 0 self.na_rep = na_rep @@ -442,10 +445,8 @@ def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( - "Writing to Excel with MultiIndex" - " columns and no index " - "('index'=False) is not yet " - "implemented." + "Writing to Excel with MultiIndex columns and no " + "index ('index'=False) is not yet implemented." 
) has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) @@ -540,7 +541,6 @@ def _format_header(self): return itertools.chain(gen, gen2) def _format_body(self): - if isinstance(self.df.index, ABCMultiIndex): return self._format_hierarchical_rows() else: @@ -716,8 +716,7 @@ def write( num_rows, num_cols = self.df.shape if num_rows > self.max_rows or num_cols > self.max_cols: raise ValueError( - "This sheet is too large! Your sheet size is: " - f"{num_rows}, {num_cols} " + f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} " f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1b18e0fc3f0fa..6adf69a922000 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -231,7 +231,7 @@ def __init__( self, series: "Series", buf: Optional[IO[str]] = None, - length: bool = True, + length: Union[bool, str] = True, header: bool = True, index: bool = True, na_rep: str = "NaN", @@ -281,7 +281,9 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + series = series._ensure_type( + concat((series.iloc[:row_num], series.iloc[-row_num:])) + ) self.tr_row_num = row_num else: self.tr_row_num = None @@ -450,7 +452,7 @@ def _get_adjustment() -> TextAdjustment: class TableFormatter: - show_dimensions: bool + show_dimensions: Union[bool, str] is_truncated: bool formatters: formatters_type columns: Index @@ -554,7 +556,7 @@ def __init__( max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, - show_dimensions: bool = False, + show_dimensions: Union[bool, str] = False, decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, @@ -577,8 +579,8 @@ def __init__( else: raise ValueError( ( - "Formatters length({flen}) should match" - " DataFrame number of columns({dlen})" + "Formatters length({flen}) should match " + "DataFrame number of columns({dlen})" ).format(flen=len(formatters), dlen=len(frame.columns)) ) self.na_rep = na_rep @@ -1228,7 +1230,7 @@ def _format(x): if x is None: return "None" elif x is NA: - return "NA" + return str(NA) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): @@ -1276,7 +1278,7 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) # float_format is expected to be a string # formatter should be used to pass a function diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index b88478b3da181..b46b2f6c671d6 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -2,12 +2,13 @@ Module for formatting output data in HTML. 
""" -from collections import OrderedDict from textwrap import dedent from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option +from pandas._libs import lib + from pandas.core.dtypes.generic import ABCMultiIndex from pandas import option_context @@ -138,10 +139,9 @@ def _write_cell( else: start_tag = "<{kind}>".format(kind=kind) - esc: Union[OrderedDict[str, str], Dict] if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) + esc = {"&": r"&", "<": r"<", ">": r">"} else: esc = {} @@ -247,7 +247,7 @@ def _write_col_header(self, indent: int) -> None: if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default else: sentinel = False levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) @@ -453,7 +453,7 @@ def _write_hierarchical_rows( if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0c9d2d54d3065..8570875569e44 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -15,6 +15,7 @@ from pandas._config import get_option +from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender @@ -1272,9 +1273,9 @@ def bar( color = [color[0], color[0]] elif len(color) > 2: raise ValueError( - "`color` must be string or a list-like" - " of length 2: [`color_neg`, `color_pos`]" - " (eg: color=['#d65f5f', '#5fba7d'])" + "`color` must be string or a list-like " + "of length 2: [`color_neg`, `color_pos`] " + "(eg: color=['#d65f5f', '#5fba7d'])" ) subset = _maybe_numeric_slice(self.data, subset) @@ -1475,8 +1476,7 @@ def _get_level_lengths(index, hidden_elements=None): Result is a dictionary of (level, initial_position): span """ - sentinel = object() - levels = index.format(sparsify=sentinel, adjoin=False, names=False) + levels = index.format(sparsify=lib.no_default, adjoin=False, names=False) if hidden_elements is None: hidden_elements = [] @@ -1492,10 +1492,10 @@ def _get_level_lengths(index, hidden_elements=None): for j, row in enumerate(lvl): if not get_option("display.multi_sparse"): lengths[(i, j)] = 1 - elif (row != sentinel) and (j not in hidden_elements): + elif (row is not lib.no_default) and (j not in hidden_elements): last_label = j lengths[(i, last_label)] = 1 - elif row != sentinel: + elif row is not lib.no_default: # even if its hidden, keep track of it in case # length >1 and later elements are visible last_label = j diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d9711f4f4626a..69ebc470fba6f 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,6 +1,11 @@ """ Google BigQuery support """ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + from pandas.compat._optional import import_optional_dependency +if TYPE_CHECKING: + from pandas import DataFrame + def _try_import(): # since pandas is a dependency of pandas-gbq @@ -14,21 +19,21 @@ def _try_import(): def read_gbq( - query, - project_id=None, - index_col=None, - col_order=None, - reauth=False, - auth_local_webserver=False, - dialect=None, - location=None, - configuration=None, + query: str, + project_id: Optional[str] = None, + index_col: Optional[str] = None, + col_order: Optional[List[str]] = None, + reauth: bool 
= False, + auth_local_webserver: bool = False, + dialect: Optional[str] = None, + location: Optional[str] = None, + configuration: Optional[Dict[str, Any]] = None, credentials=None, - use_bqstorage_api=None, + use_bqstorage_api: Optional[bool] = None, private_key=None, verbose=None, - progress_bar_type=None, -): + progress_bar_type: Optional[str] = None, +) -> "DataFrame": """ Load data from Google BigQuery. @@ -157,7 +162,7 @@ def read_gbq( """ pandas_gbq = _try_import() - kwargs = {} + kwargs: Dict[str, Union[str, bool]] = {} # START: new kwargs. Don't populate unless explicitly set. if use_bqstorage_api is not None: @@ -183,20 +188,20 @@ def read_gbq( def to_gbq( - dataframe, - destination_table, - project_id=None, - chunksize=None, - reauth=False, - if_exists="fail", - auth_local_webserver=False, - table_schema=None, - location=None, - progress_bar=True, + dataframe: "DataFrame", + destination_table: str, + project_id: Optional[str] = None, + chunksize: Optional[int] = None, + reauth: bool = False, + if_exists: str = "fail", + auth_local_webserver: bool = False, + table_schema: Optional[List[Dict[str, str]]] = None, + location: Optional[str] = None, + progress_bar: bool = True, credentials=None, verbose=None, private_key=None, -): +) -> None: pandas_gbq = _try_import() pandas_gbq.to_gbq( dataframe, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f5008f0c311ad..12ce5e4a62d24 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -11,6 +11,7 @@ from pandas._libs.tslibs import iNaT from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ensure_str, is_period_dtype @@ -24,11 +25,10 @@ infer_compression, stringify_path, ) +from pandas.io.json._normalize import convert_to_line_delimits +from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer -from ._normalize import convert_to_line_delimits -from ._table_schema import build_table_schema, parse_table_schema - loads = json.loads dumps = json.dumps @@ -346,6 +346,7 @@ def _write( return serialized +@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) def read_json( path_or_buf=None, orient=None, @@ -459,6 +460,8 @@ def read_json( non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. + .. deprecated:: 1.0.0 + precise_float : bool, default False Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (False) is to use fast but @@ -479,7 +482,7 @@ def read_json( chunksize : int, optional Return JsonReader object for iteration. See the `line-delimited json docs - `_ + `_ for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. 
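Not part of the patch: a small sketch of the chunksize behaviour documented above. With lines=True and a chunksize, read_json returns a JsonReader to iterate over rather than a single DataFrame; separately, passing the `numpy` keyword now emits a FutureWarning via the @deprecate_kwarg decorator added above:

import pandas as pd
from io import StringIO

data = '{"a": 1}\n{"a": 2}\n{"a": 3}\n'

reader = pd.read_json(StringIO(data), lines=True, chunksize=2)
for chunk in reader:
    print(chunk)  # DataFrames of up to 2 rows each
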
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index aa14c3f3a63f3..c0596c984575a 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -3,13 +3,14 @@ from collections import defaultdict import copy -from typing import DefaultDict, Dict, List, Optional, Union +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union import numpy as np from pandas._libs.writers import convert_json_to_lines from pandas.util._decorators import deprecate +import pandas as pd from pandas import DataFrame @@ -112,13 +113,13 @@ def nested_to_record( def _json_normalize( data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, meta_prefix: Optional[str] = None, record_prefix: Optional[str] = None, errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, -): +) -> "DataFrame": """ Normalize semi-structured JSON data into a flat table. @@ -229,14 +230,23 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field(js, spec): - result = js + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." + ) + return result if isinstance(data, list) and not data: @@ -265,21 +275,21 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - meta = [m if isinstance(m, list) else [m] for m in meta] + _meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records: List = [] lengths = [] meta_vals: DefaultDict = defaultdict(list) - meta_keys = [sep.join(val) for val in meta] + meta_keys = [sep.join(val) for val in _meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) @@ -296,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 87bfd6030ec31..5f23b95c10f8e 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,9 +18,9 @@ is_string_dtype, is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import DataFrame -from pandas.api.types import CategoricalDtype import pandas.core.common as com loads = json.loads diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f68347f042086..3a686a1a3b122 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -76,6 +76,9 @@ def __init__(self): ) import pyarrow.parquet + # import utils to register the pyarrow extension types + import pandas.core.arrays._arrow_utils # noqa + self.api = pyarrow def write( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3235001e14dff..b4eb2fb1411d0 100755 --- 
a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import BufferedIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -62,7 +62,6 @@ from pandas.core.tools import datetimes as tools from pandas.io.common import ( - UTF8Recoder, get_filepath_or_buffer, get_handle, infer_compression, @@ -84,7 +83,7 @@ into chunks. Additional help can be found in the online docs for -`IO Tools `_. +`IO Tools `_. Parameters ---------- @@ -272,7 +271,7 @@ chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs - `_ + `_ for more information on ``iterator`` and ``chunksize``. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and @@ -612,9 +611,9 @@ def parser_f( if delim_whitespace and delimiter != default_sep: raise ValueError( - "Specified a delimiter with both sep and" - " delim_whitespace=True; you can only" - " specify one." + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only " + "specify one." ) if engine is not None: @@ -685,7 +684,7 @@ def parser_f( read_csv = Appender( _doc_read_csv_and_table.format( func_name="read_csv", - summary=("Read a comma-separated values (csv) file into DataFrame."), + summary="Read a comma-separated values (csv) file into DataFrame.", _default_sep="','", ) )(read_csv) @@ -715,7 +714,7 @@ def read_fwf( into chunks. Additional help can be found in the `online docs for IO Tools - `_. + `_. Parameters ---------- @@ -956,8 +955,8 @@ def _clean_options(self, options, engine): if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( - "the 'c' engine does not support" - " sep=None with delim_whitespace=False" + "the 'c' engine does not support " + "sep=None with delim_whitespace=False" ) engine = "python" elif sep is not None and len(sep) > 1: @@ -1120,9 +1119,9 @@ def _make_engine(self, engine="c"): klass = FixedWidthFieldParser else: raise ValueError( - f"Unknown engine: {engine} (valid options are" - ' "c", "python", or' - ' "python-fwf")' + f"Unknown engine: {engine} (valid options are " + '"c", "python", or ' + '"python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1868,12 +1867,18 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): - # if source is utf-16 plain text, convert source to utf-8 + encoding = kwds.get("encoding") + + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds["encoding"]) + + # Handle the file object with universal line mode enabled. + # We will handle the newline character ourselves later on. 
+ if isinstance(src, BufferedIOBase): + src = TextIOWrapper(src, encoding=encoding, newline="") + kwds["encoding"] = "utf-8" # #2442 diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6ce52da21b4e8..e51f24b551f31 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,13 +1,20 @@ """ pickle compat """ import pickle +from typing import Any, Optional import warnings +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc -from pandas.io.common import get_handle, stringify_path +from pandas.io.common import get_filepath_or_buffer, get_handle -def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle( + obj: Any, + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, +): """ Pickle (serialize) object to file. @@ -15,11 +22,17 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): ---------- obj : any object Any python object. - path : str - File path where the pickled object will be stored. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be stored. + + .. versionchanged:: 1.0.0 + Accept URL. URL has to be of S3 or GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible @@ -63,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "wb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression, mode="wb" + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -73,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass -def read_pickle(path, compression="infer"): +def read_pickle( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" +): """ Load pickled pandas object (or any object) from file. @@ -86,13 +110,17 @@ def read_pickle(path, compression="infer"): Parameters ---------- - path : str - File path where the pickled object will be loaded. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be loaded from. + + .. versionchanged:: 1.0.0 + Accept URL. URL is not limited to S3 and GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', - or '.zip' respectively, and no decompression otherwise. 
- Set to None for no decompression. + If 'infer' and 'filepath_or_buffer' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression). If 'infer' and 'filepath_or_buffer' is not path-like, then use + None (= no decompression). Returns ------- @@ -134,8 +162,12 @@ def read_pickle(path, compression="infer"): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "rb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -159,3 +191,8 @@ def read_pickle(path, compression="infer"): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index db8d9eb669c20..d61d1cf7f0257 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1018,7 +1018,7 @@ def put( data_columns : list, default None List of columns to create as data columns, or True to use all columns. See `here - `__. + `__. encoding : str, default None Provide an encoding for strings. dropna : bool, default False, do not write an ALL nan row to @@ -1138,7 +1138,7 @@ def append( List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here - `__. + `__. min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan representation chunksize : size to chunk the writing @@ -1215,9 +1215,8 @@ def append_to_multiple( """ if axes is not None: raise TypeError( - "axes is currently not accepted as a parameter to" - " append_to_multiple; you can create the " - "tables independently instead" + "axes is currently not accepted as a parameter to append_to_multiple; " + "you can create the tables independently instead" ) if not isinstance(d, dict): @@ -1459,7 +1458,7 @@ def copy( data = self.select(k) if isinstance(s, Table): - index: Union[bool, list] = False + index: Union[bool, List[str]] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -3548,9 +3547,8 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): if not v.is_indexed: if v.type.startswith("complex"): raise TypeError( - "Columns containing complex values can be stored " - "but cannot" - " be indexed when using table format. Either use " + "Columns containing complex values can be stored but " + "cannot be indexed when using table format. Either use " "fixed format, set index=False, or do not include " "the columns containing complex values to " "data_columns when initializing the table."
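For orientation, a minimal usage sketch of the behavior the pickle changes above enable: path-like inputs keep inferring compression from the file extension, while raw buffers make "infer" resolve to no compression (`fp_or_buf` and `should_close` are internal plumbing). The file name below is illustrative only.

    import io

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})

    # Path-like input: compression is inferred from the ".gz" extension.
    df.to_pickle("frame.pkl.gz")
    assert pd.read_pickle("frame.pkl.gz").equals(df)

    # Buffer input: "infer" resolves to None, so plain pickle bytes are written.
    buf = io.BytesIO()
    df.to_pickle(buf)
    buf.seek(0)
    assert pd.read_pickle(buf).equals(df)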
diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py index fa6b29a1a3fcc..8f81352e6aecb 100644 --- a/pandas/io/sas/__init__.py +++ b/pandas/io/sas/__init__.py @@ -1 +1 @@ -from .sasreader import read_sas # noqa +from pandas.io.sas.sasreader import read_sas # noqa diff --git a/pandas/io/spss.py b/pandas/io/spss.py index cf682ec72f284..cdbe14e9fe927 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -3,7 +3,8 @@ from pandas.compat._optional import import_optional_dependency -from pandas.api.types import is_list_like +from pandas.core.dtypes.inference import is_list_like + from pandas.core.api import DataFrame diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b619ea93b981d..f4527994db0d2 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -241,7 +241,7 @@ def read_sql_table( try: meta.reflect(only=[table_name], views=True) except sqlalchemy.exc.InvalidRequestError: - raise ValueError("Table {name} not found".format(name=table_name)) + raise ValueError(f"Table {table_name} not found") pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( @@ -256,7 +256,7 @@ def read_sql_table( if table is not None: return table else: - raise ValueError("Table {name} not found".format(name=table_name), con) + raise ValueError(f"Table {table_name} not found", con) def read_sql_query( @@ -498,7 +498,7 @@ def to_sql( .. versionadded:: 0.24.0 """ if if_exists not in ("fail", "replace", "append"): - raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) + raise ValueError(f"'{if_exists}' is not valid for if_exists") pandas_sql = pandasSQL_builder(con, schema=schema) @@ -625,7 +625,7 @@ def __init__( self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError("Could not init table '{name}'".format(name=name)) + raise ValueError(f"Could not init table '{name}'") def exists(self): return self.pd_sql.has_table(self.name, self.schema) @@ -643,18 +643,14 @@ def _execute_create(self): def create(self): if self.exists(): if self.if_exists == "fail": - raise ValueError( - "Table '{name}' already exists.".format(name=self.name) - ) + raise ValueError(f"Table '{self.name}' already exists.") elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass else: - raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists) - ) + raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: self._execute_create() @@ -689,7 +685,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError("duplicate name in index/columns: {0}".format(err)) + raise ValueError(f"duplicate name in index/columns: {err}") else: temp = self.frame @@ -732,7 +728,7 @@ def insert(self, chunksize=None, method=None): elif callable(method): exec_insert = partial(method, self) else: - raise ValueError("Invalid parameter `method`: {}".format(method)) + raise ValueError(f"Invalid parameter `method`: {method}") keys, data_list = self.insert_data() @@ -786,7 +782,8 @@ def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None cols = [self.table.c[n] for n in columns] if self.index is not None: - [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] + for idx in self.index[::-1]: + cols.insert(0, self.table.c[idx]) sql_select = select(cols) else: sql_select = self.table.select() @@ -826,7 +823,7 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise 
ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels) + f"levels, which is {nlevels}" ) else: return index_label @@ -839,7 +836,7 @@ def _index_name(self, index, index_label): return ["index"] else: return [ - l if l is not None else "level_{0}".format(i) + l if l is not None else f"level_{i}" for i, l in enumerate(self.frame.index.names) ] @@ -1304,10 +1301,7 @@ def to_sql( for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError( - "The type of {column} is not a " - "SQLAlchemy type ".format(column=col) - ) + raise ValueError(f"The type of {col} is not a SQLAlchemy type") table = SQLTable( name, @@ -1331,11 +1325,11 @@ def to_sql( ) if name not in table_names: msg = ( - "The provided table name '{0}' is not found exactly as " + f"The provided table name '{name}' is not found exactly as " "such in the database after writing the table, possibly " "due to case sensitivity issues. Consider using lower " "case table names." - ).format(name) + ) warnings.warn(msg, UserWarning) @property @@ -1395,14 +1389,12 @@ def _get_unicode_name(name): try: uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: - raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name) - ) + raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") return uname def _get_valid_sqlite_name(name): - # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ + # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python # Ensure the string can be encoded as UTF-8. # Ensure the string does not include any NUL characters. @@ -1456,13 +1448,14 @@ def insert_statement(self): escape = _get_valid_sqlite_name if self.index is not None: - [names.insert(0, idx) for idx in self.index[::-1]] + for idx in self.index[::-1]: + names.insert(0, idx) bracketed_names = [escape(column) for column in names] col_names = ",".join(bracketed_names) wildcards = ",".join([wld] * len(names)) - insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( - table=escape(self.name), columns=col_names, wld=wildcards + insert_statement = ( + f"INSERT INTO {escape(self.name)} ({col_names}) VALUES ({wildcards})" ) return insert_statement @@ -1496,9 +1489,7 @@ def _create_table_setup(self): keys = self.keys cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( - "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br - ) + f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" ) create_stmts = [ @@ -1599,14 +1590,11 @@ def execute(self, *args, **kwargs): self.con.rollback() except Exception as inner_exc: # pragma: no cover ex = DatabaseError( - "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc) + f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback" ) raise ex from inner_exc - ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) - ) + ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}") raise ex from exc @staticmethod @@ -1731,11 +1719,7 @@ def to_sql( if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError( - "{column} ({type!s}) not a string".format( - column=col, type=my_type - ) - ) + raise ValueError(f"{col} ({my_type}) not a string") table = SQLiteTable( name, @@ -1755,9 +1739,7 @@ def 
has_table(self, name, schema=None): # esc_name = escape(name) wld = "?" - query = ( - "SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" - ).format(wld=wld) + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" return len(self.execute(query, [name]).fetchall()) > 0 @@ -1765,7 +1747,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) + drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1f8c6968359c1..b216ee80c3940 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -85,7 +85,7 @@ iterator : bool, default False Return StataReader object.""" -_read_stata_doc = """ +_read_stata_doc = f""" Read Stata file into DataFrame. Parameters @@ -100,10 +100,10 @@ By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. -%s -%s -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} Returns ------- @@ -125,33 +125,24 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... do_something(chunk) -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, - _iterator_params, -) +""" -_read_method_doc = """\ +_read_method_doc = f"""\ Reads observations from Stata file, converting them into a dataframe Parameters ---------- nrows : int Number of lines to read from data file, if None read whole file. -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} Returns ------- DataFrame -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, -) - +""" -_stata_reader_doc = """\ +_stata_reader_doc = f"""\ Class for reading Stata dta files. Parameters @@ -161,14 +152,10 @@ implementing a binary read() functions. .. versionadded:: 0.23.0 support for pathlib, py.path. 
-%s -%s -%s -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, -) +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +""" @Appender(_read_stata_doc) @@ -370,7 +357,7 @@ def convert_delta_safe(base, deltas, unit): month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) else: - raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt)) + raise ValueError(f"Date fmt {fmt} not understood") if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT @@ -465,9 +452,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt) - ) + raise ValueError(f"Format {fmt} is not a known Stata date format") conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack("= 2 ** 53: - ws = precision_loss_doc % ("uint64", "float64") + ws = precision_loss_doc.format("uint64", "float64") data[col] = data[col].astype(dtype) @@ -585,25 +570,21 @@ def _cast_to_stata_types(data): else: data[col] = data[col].astype(np.float64) if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): - ws = precision_loss_doc % ("int64", "float64") + ws = precision_loss_doc.format("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): raise ValueError( - "Column {col} has a maximum value of " - "infinity which is outside the range " - "supported by Stata.".format(col=col) + f"Column {col} has a maximum value of infinity which is outside " + "the range supported by Stata." ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: raise ValueError( - "Column {col} has a maximum value " - "({val}) outside the range supported by " - "Stata ({float64_max})".format( - col=col, val=value, float64_max=float64_max - ) + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" ) if ws: @@ -618,26 +599,18 @@ class StataValueLabel: Parameters ---------- - value : int8, int16, int32, float32 or float64 - The Stata missing value code - - Attributes - ---------- - string : string - String representation of the Stata missing value - value : int8, int16, int32, float32 or float64 - The original encoded missing value - - Methods - ------- - generate_value_label - + catarray : Categorical + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. 
""" - def __init__(self, catarray): + def __init__(self, catarray, encoding="latin-1"): + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") self.labname = catarray.name - + self._encoding = encoding categories = catarray.cat.categories self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) @@ -656,7 +629,7 @@ def __init__(self, catarray): value_label_mismatch_doc.format(catarray.name), ValueLabelTypeMismatch, ) - + category = category.encode(encoding) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding self.val.append(vl[0]) @@ -683,31 +656,31 @@ def _encode(self, s): """ return s.encode(self._encoding) - def generate_value_label(self, byteorder, encoding): + def generate_value_label(self, byteorder): """ + Generate the binary representation of the value labals. + Parameters ---------- byteorder : str Byte order of the output - encoding : str - File encoding Returns ------- value_label : bytes Bytes containing the formatted value label """ - - self._encoding = encoding + encoding = self._encoding bio = BytesIO() - null_string = "\x00" null_byte = b"\x00" # len bio.write(struct.pack(byteorder + "i", self.len)) # labname - labname = self._encode(_pad_bytes(self.labname[:32], 33)) + labname = self.labname[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) bio.write(labname) # padding - 3 bytes @@ -731,7 +704,7 @@ def generate_value_label(self, byteorder, encoding): # txt - Text labels, null terminated for text in self.txt: - bio.write(self._encode(text + null_string)) + bio.write(text + null_byte) bio.seek(0) return bio.read() @@ -1007,6 +980,22 @@ def __init__(self): "typedef", "typename", "virtual", + "_all", + "_N", + "_skip", + "_b", + "_pi", + "str#", + "in", + "_pred", + "strL", + "_coef", + "_rc", + "using", + "_cons", + "_se", + "with", + "_n", ) @@ -1192,7 +1181,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]".format(typ)) + raise ValueError(f"cannot convert stata types [{typ}]") typlist = [f(x) for x in raw_typlist] @@ -1202,7 +1191,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]".format(typ)) + raise ValueError(f"cannot convert stata dtype [{typ}]") dtyplist = [f(x) for x in raw_typlist] @@ -1330,19 +1319,13 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata types [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_types = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata types [{invalid_types}]") try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata dtypes [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_dtypes = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") if self.format_version > 108: self.varlist = [ @@ -1415,12 +1398,13 @@ def _decode(self, s): except UnicodeDecodeError: # GH 25960, fallback to handle incorrect format produced when 117 # files are converted to 118 files in Stata - msg = """ + encoding = self._encoding + msg = f""" One or more strings in the dta file could not be decoded using {encoding}, and so 
the fallback encoding of latin-1 is being used. This can happen when a file has been incorrectly encoded by Stata or some other software. You should verify the string values returned are correct.""" - warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) + warnings.warn(msg, UnicodeWarning) return s.decode("latin-1") def _read_value_labels(self): @@ -1794,7 +1778,7 @@ def _do_convert_categoricals( repeats = list(vc.index[vc > 1]) repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 - msg = """ + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. @@ -1805,7 +1789,7 @@ def _do_convert_categoricals( The repeated labels are: {repeats} """ - raise ValueError(msg.format(col=col, repeats=repeats)) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) @@ -1874,13 +1858,15 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError("Endianness {endian} not understood".format(endian=endianness)) + raise ValueError(f"Endianness {endianness} not understood") def _pad_bytes(name, length): """ Take a char string and pads it with null bytes until it's length chars. """ + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) return name + "\x00" * (length - len(name)) @@ -1906,7 +1892,7 @@ def _convert_datetime_to_stata_type(fmt): ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError(f"Format {fmt} not implemented") def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1956,9 +1942,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype == np.int8: return 251 else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): @@ -1985,24 +1969,12 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False if force_strl: return "%9s" if dtype.type == np.object_: - inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ("string", "unicode") or len(column) == 0): - raise ValueError( - "Column `{col}` cannot be exported.\n\nOnly " - "string-like object arrays containing all " - "strings or a mix of strings and None can be " - "exported. Object arrays containing only null " - "values are prohibited. 
Other object types" - "cannot be exported and must first be converted " - "to one of the supported " - "types.".format(col=column.name) - ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" else: - raise ValueError(excessive_string_length_error % column.name) + raise ValueError(excessive_string_length_error.format(column.name)) return "%" + str(max(itemsize, 1)) + "s" elif dtype == np.float64: return "%10.0g" @@ -2013,9 +1985,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False elif dtype == np.int8 or dtype == np.int16: return "%8.0g" else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") class StataWriter(StataParser): @@ -2043,8 +2013,6 @@ class StataWriter(StataParser): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime @@ -2086,6 +2054,7 @@ class StataWriter(StataParser): """ _max_string_length = 244 + _encoding = "latin-1" def __init__( self, @@ -2101,7 +2070,6 @@ def __init__( super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2136,7 +2104,8 @@ def _prepare_categoricals(self, data): data_formatted = [] for col, col_is_cat in zip(data, is_cat): if col_is_cat: - self._value_labels.append(StataValueLabel(data[col])) + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) dtype = data[col].cat.codes.dtype if dtype == np.int64: raise ValueError( @@ -2181,6 +2150,36 @@ def _update_strl_names(self): """No-op, forward compatibility""" pass + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + def _check_column_names(self, data): """ Checks column names to ensure that they are valid Stata column names. 
@@ -2204,14 +2203,7 @@ def _check_column_names(self, data): if not isinstance(name, str): name = str(name) - for c in name: - if ( - (c < "A" or c > "Z") - and (c < "a" or c > "z") - and (c < "0" or c > "9") - and c != "_" - ): - name = name.replace(c, "_") + name = self._validate_variable_name(name) # Variable name must not be a reserved word if name in self.RESERVED_WORDS: @@ -2251,7 +2243,7 @@ def _check_column_names(self, data): orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = "{0} -> {1}".format(orig_name, name) + msg = f"{orig_name} -> {name}" conversion_warning.append(msg) ws = invalid_name_doc.format("\n ".join(conversion_warning)) @@ -2262,12 +2254,12 @@ def _check_column_names(self, data): return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): - self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) - self.typlist.append(_dtype_to_stata_type(dtype, data[col])) + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) def _prepare_pandas(self, data): # NOTE: we might need a different API / class for pandas objects so @@ -2311,17 +2303,57 @@ def _prepare_pandas(self, data): new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) - self._set_formats_and_types(data, dtypes) + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: self.fmtlist[key] = self._convert_dates[key] + def _encode_strings(self): + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type == np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. 
Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded.values)) + <= self._max_string_length + ): + self.data[col] = encoded + def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) + self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() self._write_variable_types() self._write_varnames() @@ -2344,9 +2376,8 @@ def write_file(self): os.unlink(self._fname) except OSError: warnings.warn( - "This save was not successful but {0} could not " - "be deleted. This file is not " - "valid.".format(self._fname), + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", ResourceWarning, ) raise exc @@ -2392,7 +2423,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2494,9 +2525,8 @@ def _write_variable_labels(self): is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + "can be encoded in Latin-1" ) self._write(_pad_bytes(label, 81)) else: @@ -2527,9 +2557,9 @@ def _prepare_data(self): typ = typlist[i] if typ <= self._max_string_length: data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) - stype = "S{type}".format(type=typ) + stype = f"S{typ}" dtypes[col] = stype - data[col] = data[col].str.encode(self._encoding).astype(stype) + data[col] = data[col].astype(stype) else: dtype = data[col].dtype if not native_byteorder: @@ -2715,12 +2745,6 @@ def generate_table(self): return gso_table, gso_df - def _encode(self, s): - """ - Python 3 compatibility shim - """ - return s.encode(self._encoding) - def generate_blob(self, gso_table): """ Generates the binary blob of GSOs that is written to the dta file. 
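The `_encode_strings` step above can be seen in isolation with a plain pandas operation: string columns are encoded to bytes up front so the later fixed-width handling works on bytes directly. A small sketch, assuming an ordinary object-dtype Series:

    import pandas as pd

    s = pd.Series(["a", "longer label", None], dtype=object)
    encoded = s.str.encode("latin-1")
    # Strings become bytes objects; missing values stay missing.
    assert encoded[0] == b"a"
    assert encoded[1] == b"longer label"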
@@ -2860,6 +2884,7 @@ class StataWriter117(StataWriter): """ _max_string_length = 2045 + _dta_version = 117 def __init__( self, @@ -2906,18 +2931,21 @@ def _write_header(self, data_label=None, time_stamp=None): self._file.write(bytes("", "utf-8")) bio = BytesIO() # ds_format - 117 - bio.write(self._tag(bytes("117", "utf-8"), "release")) + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) # number of vars, 2 bytes assert self.nvar < 2 ** 16 bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) - # number of obs, 4 bytes - bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) # data label 81 bytes, char, null terminated label = data_label[:80] if data_label is not None else "" - label_len = struct.pack(byteorder + "B", len(label)) - label = label_len + bytes(label, "utf-8") + label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(label)) + label = label_len + label bio.write(self._tag(label, "label")) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm @@ -2947,7 +2975,7 @@ def _write_header(self, data_label=None, time_stamp=None): + time_stamp.strftime(" %Y %H:%M") ) # '\x11' added due to inspection of Stata file - ts = b"\x11" + bytes(ts, "utf8") + ts = b"\x11" + bytes(ts, "utf-8") bio.write(self._tag(ts, "timestamp")) bio.seek(0) self._file.write(self._tag(bio.read(), "header")) @@ -2994,9 +3022,11 @@ def _write_variable_types(self): def _write_varnames(self): self._update_map("varnames") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 for name in self.varlist: name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "varnames")) @@ -3008,21 +3038,24 @@ def _write_sortlist(self): def _write_formats(self): self._update_map("formats") bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 for fmt in self.fmtlist: - bio.write(_pad_bytes_new(fmt, 49)) + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) bio.seek(0) self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): self._update_map("value_label_names") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 for i in range(self.nvar): # Use variable name when categorical name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "value_label_names")) @@ -3031,7 +3064,9 @@ def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new("", 81) + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) if self._variable_labels is None: for _ 
in range(self.nvar): @@ -3045,14 +3080,15 @@ label = self._variable_labels[col] if len(label) > 80: raise ValueError("Variable labels must be 80 characters or fewer") - is_latin1 = all(ord(c) < 256 for c in label) - if not is_latin1: + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" ) - bio.write(_pad_bytes_new(label, 81)) + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) else: bio.write(blank) bio.seek(0) @@ -3084,7 +3120,7 @@ def _write_value_labels(self): self._update_map("value_labels") bio = BytesIO() for vl in self._value_labels: - lab = vl.generate_value_label(self._byteorder, self._encoding) + lab = vl.generate_value_label(self._byteorder) lab = self._tag(lab, "lbl") bio.write(lab) bio.seek(0) @@ -3114,19 +3150,140 @@ def _convert_strls(self, data): ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols) + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): force_strl = col in self._convert_strl fmt = _dtype_to_default_stata_fmt( - dtype, data[col], dta_version=117, force_strl=force_strl + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, ) self.fmtlist.append(fmt) - self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl)) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriter118(StataWriter117): + """ + A class for writing Stata binary dta files in Stata 15 format (118) + + DTA 118 format files support unicode string data (both fixed and strL + formats). Unicode is also supported in value labels, variable labels and + the dataset label. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list + List of column names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL.
+ Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + + Returns + ------- + StataWriter118 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + nor datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriter118 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriter118('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriter118('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding = "utf-8" + _dta_version = 118 + + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 118 supports most unicode characters. The only limitation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. + """ + # High code points appear to be acceptable + for c in name: + if ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) or 128 <= ord(c) < 256: + name = name.replace(c, "_") + + return name diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index f9a692b0559ca..27b1d55fe1bd6 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Dict, Type + from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, @@ -26,7 +28,10 @@ ) from pandas.plotting._matplotlib.tools import table -PLOT_CLASSES = { +if TYPE_CHECKING: + from pandas.plotting._matplotlib.core import MPLPlot # noqa: F401 + +PLOT_CLASSES: Dict[str, Type["MPLPlot"]] = { "line": LinePlot, "bar": BarPlot, "barh": BarhPlot, diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6da13f188357c..2d68bb46a8ada 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -229,10 +229,9 @@ def _validate_color_args(self): for char in s: if char in matplotlib.colors.BASE_COLORS: raise ValueError( - "Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. Please" - " use one or the other or pass 'style' " - "without a color symbol" + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. 
Please use one or the other or " + "pass 'style' without a color symbol" ) def _iter_data(self, data=None, keep_index=False, fillna=None): @@ -395,6 +394,10 @@ def _compute_plot_data(self): include_type = [np.number] exclude_type = ["timedelta"] + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category"]) + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) try: @@ -866,10 +869,13 @@ def __init__(self, data, x, y, **kwargs): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires y column to be numeric") + + # Scatter plot allows plotting object data + if self._kind == "hexbin": + if len(self.data[x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7f208436ddc4a..ccd42d3940431 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -453,7 +453,7 @@ def __delitem__(self, key): raise ValueError(f"Cannot remove default parameter {key}") return super().__delitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: key = self._get_canonical_key(key) return super().__contains__(key) diff --git a/pandas/testing.py b/pandas/testing.py index acae47367d997..0445fa5b5efc0 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -1,11 +1,17 @@ -# flake8: noqa - """ Public testing utility functions. 
""" -from pandas.util.testing import ( +from pandas._testing import ( + assert_extension_array_equal, assert_frame_equal, assert_index_equal, assert_series_equal, ) + +__all__ = [ + "assert_extension_array_equal", + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", +] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index d586dbbb72831..8b897524cb053 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,8 +1,9 @@ +import sys from typing import List import pandas as pd from pandas import api, compat -import pandas.util.testing as tm +import pandas._testing as tm class Base: @@ -42,7 +43,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules: List[str] = [] + deprecated_modules: List[str] = ["np", "datetime"] # misc misc = ["IndexSlice", "NaT", "NA"] @@ -67,7 +68,6 @@ class TestPDApi(Base): "RangeIndex", "UInt64Index", "Series", - "SparseArray", "SparseDtype", "StringDtype", "Timedelta", @@ -90,18 +90,20 @@ class TestPDApi(Base): "UInt64Dtype", "NamedAgg", ] - if not compat.PY37: - classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) - deprecated_modules.append("np") # these are already deprecated; awaiting removal deprecated_classes: List[str] = [] # these should be deprecated in the future - deprecated_classes_in_future: List[str] = [] + deprecated_classes_in_future: List[str] = ["SparseArray"] + + if not compat.PY37: + classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) + # deprecated_modules.extend(["np", "datetime"]) + # deprecated_classes_in_future.extend(["SparseArray"]) # external modules exposed in pandas namespace - modules = ["datetime"] + modules: List[str] = [] # top-level functions funcs = [ @@ -193,6 +195,7 @@ class TestPDApi(Base): "_np_version_under1p16", "_np_version_under1p17", "_np_version_under1p18", + "_testing", "_tslib", "_typing", "_version", @@ -200,41 +203,65 @@ class TestPDApi(Base): def test_api(self): - self.check( - pd, + checkthese = ( self.lib + self.misc + self.modules - + self.deprecated_modules + self.classes - + self.deprecated_classes - + self.deprecated_classes_in_future + self.funcs + self.funcs_option + self.funcs_read + self.funcs_json + self.funcs_to - + self.deprecated_funcs_in_future - + self.deprecated_funcs - + self.private_modules, - self.ignored, + + self.private_modules ) + if not compat.PY37: + checkthese.extend( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.deprecated_funcs_in_future + + self.deprecated_funcs + ) + self.check(pd, checkthese, self.ignored) def test_depr(self): - deprecated = ( + deprecated_list = ( self.deprecated_modules + self.deprecated_classes + self.deprecated_classes_in_future + self.deprecated_funcs + self.deprecated_funcs_in_future ) - for depr in deprecated: + for depr in deprecated_list: with tm.assert_produces_warning(FutureWarning): - if compat.PY37: - getattr(pd, depr) - else: - deprecated = getattr(pd, depr) - deprecated.__getattr__(dir(deprecated)[-1]) + deprecated = getattr(pd, depr) + if not compat.PY37: + if depr == "datetime": + deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) + elif depr == "SparseArray": + deprecated([]) + else: + deprecated.__getattr__(dir(deprecated)[-1]) + + +def test_datetime(): + from datetime import datetime + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert datetime(2015, 1, 2, 0, 0) == pd.datetime(2015, 1, 2, 0, 0) + + assert 
isinstance(pd.datetime(2015, 1, 2, 0, 0), pd.datetime) + + +def test_sparsearray(): + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert isinstance(pd.array([1, 2, 3], dtype="Sparse"), pd.SparseArray) def test_np(): @@ -254,9 +281,33 @@ def test_api(self): class TestTesting(Base): - funcs = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] + funcs = [ + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", + "assert_extension_array_equal", + ] def test_testing(self): from pandas import testing self.check(testing, self.funcs) + + def test_util_testing_deprecated(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + + with tm.assert_produces_warning(FutureWarning) as m: + import pandas.util.testing # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def test_util_testing_deprecated_direct(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + with tm.assert_produces_warning(FutureWarning) as m: + from pandas.util.testing import assert_series_equal # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 97480502f192c..31423c03dee34 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -1,5 +1,5 @@ +import pandas._testing as tm from pandas.api import types -import pandas.util.testing as tm from .test_api import Base diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index bc02a1e76a695..83d19b8a20ac3 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -5,7 +5,7 @@ import pytest from pandas import DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm def assert_invalid_addsub_type(left, right, msg=None): @@ -70,7 +70,7 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "Invalid comparison between" + msg = "Invalid comparison between|Cannot compare type|not supported between" with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 64588af3e3053..577093c0f2967 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # ------------------------------------------------------------------ # Helper Functions diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index afce374aebe05..d3f9ac4f3f8b2 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -26,15 +26,13 @@ Timestamp, date_range, ) -import pandas.core.arrays.datetimelike as dtl -from pandas.core.indexes.datetimes import _to_M8 +import pandas._testing as tm from pandas.core.ops import roperator from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, ) -import pandas.util.testing as tm # ------------------------------------------------------------------ # Comparisons @@ -86,6 +84,52 @@ def 
test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra dtarr = tm.box_expected(rng, box_with_array) assert_invalid_comparison(dtarr, other, box_with_array) + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.timedelta_range("1ns", periods=10).array, + np.array(pd.timedelta_range("1ns", periods=10)), + list(pd.timedelta_range("1ns", periods=10)), + pd.timedelta_range("1 Day", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_dt64arr_cmp_arraylike_invalid(self, other, tz_naive_fixture): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data + assert_invalid_comparison(dta, other, tm.to_array) + + def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture): + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="h", periods=5, tz=tz)._data + + other = np.array([0, 1, 2, dta[3], pd.Timedelta(days=1)]) + result = dta == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dta != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + dta < other + with pytest.raises(TypeError, match=msg): + dta > other + with pytest.raises(TypeError, match=msg): + dta <= other + with pytest.raises(TypeError, match=msg): + dta >= other + def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture @@ -296,7 +340,7 @@ class TestDatetimeIndexComparisons: def test_comparators(self, op): index = tm.makeDateIndex(100) element = index[len(index) // 2] - element = _to_M8(element) + element = Timestamp(element).to_datetime64() arr = np.array(index) arr_result = op(arr, element) @@ -1332,7 +1376,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other @@ -1361,7 +1405,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1369,11 +1413,11 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res = dtarr - other expected = 
DatetimeIndex( [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -2298,7 +2342,7 @@ def test_dti_addsub_offset_arraylike( xbox = get_upcast_box(box, other) - with tm.assert_produces_warning(PerformanceWarning, clear=[dtl]): + with tm.assert_produces_warning(PerformanceWarning): res = op(dti, other) expected = DatetimeIndex( @@ -2307,6 +2351,32 @@ def test_dti_addsub_offset_arraylike( expected = tm.box_expected(expected, xbox) tm.assert_equal(res, expected) + @pytest.mark.parametrize("other_box", [pd.Index, np.array]) + def test_dti_addsub_object_arraylike( + self, tz_naive_fixture, box_with_array, other_box + ): + tz = tz_naive_fixture + + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + other = other_box([pd.offsets.MonthEnd(), pd.Timedelta(days=4)]) + xbox = get_upcast_box(box_with_array, other) + + expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + result = dtarr + other + tm.assert_equal(result, expected) + + expected = pd.DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + with tm.assert_produces_warning(warn): + result = dtarr - other + tm.assert_equal(result, expected) + @pytest.mark.parametrize("years", [-1, 0, 1]) @pytest.mark.parametrize("months", [-2, 0, 2]) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py new file mode 100644 index 0000000000000..f9e1a515277d5 --- /dev/null +++ b/pandas/tests/arithmetic/test_interval.py @@ -0,0 +1,273 @@ +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_list_like + +import pandas as pd +from pandas import ( + Categorical, + Index, + Interval, + IntervalIndex, + Period, + Series, + Timedelta, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])), + (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])), + ( + timedelta_range("0 days", periods=3).insert(4, pd.NaT), + timedelta_range("1 day", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3).insert(4, pd.NaT), + date_range("20170102", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT), + date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +@pytest.fixture +def array(left_right_dtypes): + """ + Fixture to generate an IntervalArray of various dtypes containing NA if possible + """ + left, right = left_right_dtypes + return IntervalArray.from_arrays(left, right) + + +def create_categorical_intervals(left, right, closed="right"): + return Categorical(IntervalIndex.from_arrays(left, right, closed)) + + +def create_series_intervals(left, right, closed="right"): + return Series(IntervalArray.from_arrays(left, right, closed)) + + +def create_series_categorical_intervals(left, right, closed="right"): + return Series(Categorical(IntervalIndex.from_arrays(left, right, closed))) + + +class TestComparison: + @pytest.fixture(params=[operator.eq, 
operator.ne]) + def op(self, request): + return request.param + + @pytest.fixture( + params=[ + IntervalArray.from_arrays, + IntervalIndex.from_arrays, + create_categorical_intervals, + create_series_intervals, + create_series_categorical_intervals, + ], + ids=[ + "IntervalArray", + "IntervalIndex", + "Categorical[Interval]", + "Series[Interval]", + "Series[Categorical[Interval]]", + ], + ) + def interval_constructor(self, request): + """ + Fixture for all pandas native interval constructors. + To be used as the LHS of IntervalArray comparisons. + """ + return request.param + + def elementwise_comparison(self, op, array, other): + """ + Helper that performs elementwise comparisons between `array` and `other` + """ + other = other if is_list_like(other) else [other] * len(array) + return np.array([op(x, y) for x, y in zip(array, other)]) + + def test_compare_scalar_interval(self, op, array): + # matches first interval + other = array[0] + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # matches on a single endpoint but not both + other = Interval(array.left[0], array.right[1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = Interval(0, 1, closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_na(self, op, array, nulls_fixture): + result = op(array, nulls_fixture) + expected = self.elementwise_comparison(op, array, nulls_fixture) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + 0, + 1.0, + True, + "foo", + Timestamp("2017-01-01"), + Timestamp("2017-01-01", tz="US/Eastern"), + Timedelta("0 days"), + Period("2017-01-01", "D"), + ], + ) + def test_compare_scalar_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval( + self, op, array, interval_constructor, + ): + # same endpoints + other = interval_constructor(array.left, array.right) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # different endpoints + other = interval_constructor(array.left[::-1], array.right[::-1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # all nan endpoints + other = interval_constructor([np.nan] * 4, [np.nan] * 4) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval_mixed_closed( + self, op, interval_constructor, closed, other_closed + ): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = interval_constructor(range(2), range(1, 3), closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + ( + Interval(0, 1), + Interval(Timedelta("1 day"), Timedelta("2 days")), + Interval(4, 5, 
"both"), + Interval(10, 20, "neither"), + ), + (0, 1.5, Timestamp("20170103"), np.nan), + ( + Timestamp("20170102", tz="US/Eastern"), + Timedelta("2 days"), + "baz", + pd.NaT, + ), + ], + ) + def test_compare_list_like_object(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_nan(self, op, array, nulls_fixture): + other = [nulls_fixture] * 4 + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + np.arange(4, dtype="int64"), + np.arange(4, dtype="float64"), + date_range("2017-01-01", periods=4), + date_range("2017-01-01", periods=4, tz="US/Eastern"), + timedelta_range("0 days", periods=4), + period_range("2017-01-01", periods=4, freq="D"), + Categorical(list("abab")), + Categorical(date_range("2017-01-01", periods=4)), + pd.array(list("abcd")), + pd.array(["foo", 3.14, None, object()]), + ], + ids=lambda x: str(x.dtype), + ) + def test_compare_list_like_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("length", [1, 3, 5]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, list]) + def test_compare_length_mismatch_errors(self, op, other_constructor, length): + array = IntervalArray.from_arrays(range(4), range(1, 5)) + other = other_constructor([Interval(0, 1)] * length) + with pytest.raises(ValueError, match="Lengths must match to compare"): + op(array, other) + + @pytest.mark.parametrize( + "constructor, expected_type, assert_func", + [ + (IntervalIndex, np.array, tm.assert_numpy_array_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_index_series_compat(self, op, constructor, expected_type, assert_func): + # IntervalIndex/Series that rely on IntervalArray for comparisons + breaks = range(4) + index = constructor(IntervalIndex.from_breaks(breaks)) + + # scalar comparisons + other = index[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = breaks[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + # list-like comparisons + other = IntervalArray.from_breaks(breaks) + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = [index[0], breaks[0], "foo"] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 9733d589ee93b..f55e2b98ee912 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -12,8 +12,8 @@ import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm def adjust_negative_zero(zero, expected): @@ -65,13 +65,16 @@ def test_df_numeric_cmp_dt64_raises(self): # GH#8932, GH#22163 ts = pd.Timestamp.now() df = pd.DataFrame({"x": range(5)}) - with pytest.raises(TypeError): + + msg = "Invalid comparison between dtype=int64 and Timestamp" + + with 
pytest.raises(TypeError, match=msg): df > ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): df < ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts < df - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts > df assert not (df == ts).any().any() diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index f9c1de115b3a4..799ef3492e53f 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import Series, Timestamp +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm # ------------------------------------------------------------------ # Comparisons diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index f0edcd11567d2..abb667260f094 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -11,12 +11,14 @@ import pandas as pd from pandas import Period, PeriodIndex, Series, period_range +import pandas._testing as tm from pandas.core import ops from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm from pandas.tseries.frequencies import to_offset +from .common import assert_invalid_comparison + # ------------------------------------------------------------------ # Comparisons @@ -39,11 +41,93 @@ def test_compare_zerodim(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) + @pytest.mark.parametrize( + "scalar", ["foo", pd.Timestamp.now(), pd.Timedelta(days=4)] + ) + def test_compare_invalid_scalar(self, box_with_array, scalar): + # comparison with scalar that cannot be interpreted as a Period + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, scalar, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("1D", periods=4).array, + np.arange(4), + np.arange(4).astype(np.float64), + list(range(4)), + ], + ) + def test_compare_invalid_listlike(self, box_with_array, other): + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, other, box_with_array) + + @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) + def test_compare_object_dtype(self, box_with_array, other_box): + pi = pd.period_range("2000", periods=5) + parr = tm.box_expected(pi, box_with_array) + + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + other = other_box(pi) + + expected = np.array([True, True, True, True, True]) + expected = tm.box_expected(expected, xbox) + + result = parr == other + tm.assert_equal(result, expected) + result = parr <= other + tm.assert_equal(result, expected) + result = parr >= other + tm.assert_equal(result, expected) + + result = parr != other + tm.assert_equal(result, ~expected) + result = parr < other + tm.assert_equal(result, ~expected) + result = parr > other + tm.assert_equal(result, ~expected) + + other = other_box(pi[::-1]) + + expected = np.array([False, False, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr == other + tm.assert_equal(result, expected) + + expected = np.array([True, True, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr <= other + tm.assert_equal(result, 
expected) + + expected = np.array([False, False, True, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr >= other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr != other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr < other + tm.assert_equal(result, expected) + + expected = np.array([False, False, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr > other + tm.assert_equal(result, expected) + class TestPeriodIndexComparisons: # TODO: parameterize over boxes - @pytest.mark.parametrize("other", ["2017", 2017]) + @pytest.mark.parametrize("other", ["2017", pd.Period("2017", freq="D")]) def test_eq(self, other): idx = PeriodIndex(["2017", "2017", "2018"], freq="D") expected = np.array([True, True, False]) @@ -51,6 +135,34 @@ def test_eq(self, other): tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "other", + [ + 2017, + [2017, 2017, 2017], + np.array([2017, 2017, 2017]), + np.array([2017, 2017, 2017], dtype=object), + pd.Index([2017, 2017, 2017]), + ], + ) + def test_eq_integer_disallowed(self, other): + # match Period semantics by not treating integers as Periods + + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + expected = np.array([False, False, False]) + result = idx == other + + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises(TypeError): + idx < other + with pytest.raises(TypeError): + idx > other + with pytest.raises(TypeError): + idx <= other + with pytest.raises(TypeError): + idx >= other + def test_pi_cmp_period(self): idx = period_range("2007-01", periods=20, freq="M") @@ -1036,6 +1148,26 @@ def test_parr_add_sub_index(self): expected = pi - pi tm.assert_index_equal(result, expected) + def test_parr_add_sub_object_array(self): + pi = pd.period_range("2000-12-31", periods=3, freq="D") + parr = pi.array + + other = np.array([pd.Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = pd.PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + ).array + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = pd.PeriodIndex(["2000-12-30"] * 3, freq="D").array + tm.assert_equal(result, expected) + class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index cc337f8fdd7ce..158da37aa7239 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -18,12 +18,12 @@ Timestamp, timedelta_range, ) +import pandas._testing as tm from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, ) -import pandas.util.testing as tm # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons @@ -76,6 +76,49 @@ def test_td64_comparisons_invalid(self, box_with_array, invalid): assert_invalid_comparison(obj, invalid, box) + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.date_range("1970-01-01", periods=10, tz="UTC").array, + 
np.array(pd.date_range("1970-01-01", periods=10)), + list(pd.date_range("1970-01-01", periods=10)), + pd.date_range("1970-01-01", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_td64arr_cmp_arraylike_invalid(self, other): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + + rng = timedelta_range("1 days", periods=10)._data + assert_invalid_comparison(rng, other, tm.to_array) + + def test_td64arr_cmp_mixed_invalid(self): + rng = timedelta_range("1 days", periods=5)._data + + other = np.array([0, 1, 2, rng[3], pd.Timestamp.now()]) + result = rng == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + rng < other + with pytest.raises(TypeError, match=msg): + rng > other + with pytest.raises(TypeError, match=msg): + rng <= other + with pytest.raises(TypeError, match=msg): + rng >= other + class TestTimedelta64ArrayComparisons: # TODO: All of these need to be parametrized over box @@ -1469,6 +1512,40 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): with tm.assert_produces_warning(PerformanceWarning): anchored - tdi + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + tdi = pd.timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box_with_array) + + other = np.array( + [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] + ) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = tdarr + other + + expected = pd.Index( + [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + with tm.assert_produces_warning(warn): + tdarr - other + + with tm.assert_produces_warning(warn): + result = other - tdarr + + expected = pd.Index( + [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + class TestTimedeltaArraylikeMulDivOps: # Tests for timedelta64[ns] diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 870a0a5db175e..52640044565fc 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("ordered", [True, False]) @@ -177,3 +177,7 @@ def test_take_nd_deprecated(self): cat = pd.Categorical(["a", "b", "c"]) with tm.assert_produces_warning(FutureWarning): cat.take_nd([0, 1]) + + ci = pd.Index(cat) + with tm.assert_produces_warning(FutureWarning): + ci.take_nd([0, 1]) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 4122a64a64516..90fcf12093909 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ 
b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,8 +6,8 @@ from pandas.compat import PYPY from pandas import Categorical, Index, NaT, Series, date_range +import pandas._testing as tm from pandas.api.types import is_scalar -import pandas.util.testing as tm class TestCategoricalAnalytics: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 82f2fe1ab8fb6..f49f70f5acf77 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -4,9 +4,9 @@ import pytest from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +import pandas._testing as tm from pandas.core.arrays.categorical import _recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalAPI: @@ -87,8 +87,8 @@ def test_rename_categories(self): def test_rename_categories_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( - "new categories need to have the same number of items as the" - " old categories!" + "new categories need to have the same number of items as the " + "old categories!" ) with pytest.raises(ValueError, match=msg): cat.rename_categories(new_categories) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6c8b654c1955c..70a23e9748dd1 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -24,7 +24,7 @@ period_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalConstructors: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 85bf385b029a3..19746d7d72162 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalDtypes: diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 37dea53f792cb..85d5a6a3dc3ac 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -3,9 +3,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalIndexingWithFactor(TestCategorical): @@ -157,8 +157,8 @@ def test_categories_assigments(self): def test_categories_assigments_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( - "new categories need to have the same number of items" - " as the old categories!" + "new categories need to have the same number of items " + "as the old categories!" 
) with pytest.raises(ValueError, match=msg): cat.categories = new_categories diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 3037ac79cd592..211bf091ee17d 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, Index, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalMissing: diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 10e33bf70dc66..8643e7f6f89c1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalOpsWithFactor(TestCategorical): @@ -172,8 +172,8 @@ def test_comparison_with_unknown_scalars(self): cat = Categorical([1, 2, 3], ordered=True) msg = ( - "Cannot compare a Categorical for op __{}__ with a scalar," - " which is not a category" + "Cannot compare a Categorical for op __{}__ with a scalar, " + "which is not a category" ) with pytest.raises(TypeError, match=msg.format("lt")): cat < 4 diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 9321813b42b33..d08c4b47dd3cb 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -147,8 +147,6 @@ def test_categorical_repr_datetime(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) - # TODO(wesm): exceeding 80 characters in the console is not good - # behavior exp = ( "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n" diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index a0b09e19ece6e..2a0ef043bf9a9 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalSort: diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index cfc7b8541302f..b80d0ff41aba6 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -1,5 +1,5 @@ from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalSubclassing: diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 29bd5252dbe3a..f66c327e9967d 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,16 +1,19 @@ import pytest -import pandas.util.testing as tm +from pandas.util._test_decorators import async_mark + +import pandas._testing as tm class TestCategoricalWarnings: - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") 
from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = Categorical([])" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("c.", 1)) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 655a6e717119b..e046d87780bb4 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Index, @@ -11,8 +13,8 @@ date_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture( @@ -103,3 +105,110 @@ def test_repr(): "Length: 2, closed: right, dtype: interval[int64]" ) assert result == expected + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +def test_arrow_array(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="different 'subtype'"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pyarrow_skip +@pytest.mark.parametrize( + "breaks", + [[0, 1, 2, 3], 
pd.date_range("2017", periods=4, freq="D")], + ids=["int", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index a55c33c2f22e9..b4de80dc00a4e 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -3,8 +3,8 @@ import pytest from pandas import Interval, IntervalIndex, Timedelta, Timestamp +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture(params=[IntervalArray, IntervalIndex]) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index eab174862818c..d8a1831cd61ec 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -6,7 +6,8 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class TestSeriesAccessor: @@ -31,7 +32,7 @@ def test_accessor_raises(self): def test_from_spmatrix(self, format, labels, dtype): import scipy.sparse - sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) @@ -48,7 +49,7 @@ def test_from_spmatrix(self, format, labels, dtype): def test_from_spmatrix_columns(self, columns): import scipy.sparse - dtype = pd.SparseDtype("float64", 0.0) + dtype = SparseDtype("float64", 0.0) mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) @@ -67,9 +68,9 @@ def test_to_coo(self): def test_to_dense(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)), - "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)), - "C": pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)), + "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)), + "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)), + "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)), }, index=["b", "a"], ) @@ -82,8 +83,8 @@ def test_to_dense(self): def test_density(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0, 2, 1], fill_value=0), - "B": pd.SparseArray([0, 1, 1, 1], fill_value=0), + "A": SparseArray([1, 0, 2, 1], fill_value=0), + "B": SparseArray([0, 1, 1, 1], fill_value=0), } ) res = df.sparse.density @@ -99,9 +100,7 @@ def test_series_from_coo(self, dtype, dense_index): A = scipy.sparse.eye(3, format="coo", dtype=dtype) result = pd.Series.sparse.from_coo(A, dense_index=dense_index) index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - expected = pd.Series( - pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index - ) + expected = 
pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index) if dense_index: expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) @@ -117,3 +116,8 @@ def test_series_from_coo_incorrect_format_raises(self): TypeError, match="Expected coo_matrix. Got csr_matrix instead." ): pd.Series.sparse.from_coo(m) + + def test_with_column_named_sparse(self): + # https://github.com/pandas-dev/pandas/issues/30758 + df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index f1d2803ce5505..76442a63ccb0f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -4,9 +4,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core import ops -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype @pytest.fixture(params=["integer", "block"]) @@ -24,7 +24,7 @@ def mix(request): class TestSparseArrayArithmetics: _base = np.array - _klass = pd.SparseArray + _klass = SparseArray def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) @@ -391,15 +391,15 @@ def test_mixed_array_comparison(self, kind): @pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = op(arr, [0, 1]) - expected = op(arr, pd.SparseArray([0, 1])) + expected = op(arr, SparseArray([0, 1])) tm.assert_sp_array_equal(result, expected) def test_with_dataframe(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) df = pd.DataFrame([[1, 2], [3, 4]]) result = arr.__add__(df) assert result is NotImplemented @@ -407,7 +407,7 @@ def test_with_dataframe(): def test_with_zerodim_ndarray(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = arr * np.array(2) expected = arr * 2 @@ -416,23 +416,23 @@ def test_with_zerodim_ndarray(): @pytest.mark.parametrize("ufunc", [np.abs, np.exp]) @pytest.mark.parametrize( - "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] + "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])] ) def test_ufuncs(ufunc, arr): result = ufunc(arr) fill_value = ufunc(arr.fill_value) - expected = pd.SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) + expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) tm.assert_sp_array_equal(result, expected) @pytest.mark.parametrize( "a, b", [ - (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), ], ) @pytest.mark.parametrize("ufunc", [np.add, np.greater]) @@ -440,12 +440,12 @@ def test_binary_ufuncs(ufunc, a, b): # can't say anything about fill value here. 
result = ufunc(a, b) expected = ufunc(np.asarray(a), np.asarray(b)) - assert isinstance(result, pd.SparseArray) + assert isinstance(result, SparseArray) tm.assert_numpy_array_equal(np.asarray(result), expected) def test_ndarray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) ndarray += sparray expected = np.array([0, 3, 2, 3]) @@ -453,19 +453,19 @@ def test_ndarray_inplace(): def test_sparray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) sparray += ndarray - expected = pd.SparseArray([0, 3, 2, 3], fill_value=0) + expected = SparseArray([0, 3, 2, 3], fill_value=0) tm.assert_sp_array_equal(sparray, expected) @pytest.mark.parametrize("fill_value", [True, False]) def test_invert(fill_value): arr = np.array([True, False, False, True]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = ~sparray - expected = pd.SparseArray(~arr, fill_value=not fill_value) + expected = SparseArray(~arr, fill_value=not fill_value) tm.assert_sp_array_equal(result, expected) @@ -473,7 +473,7 @@ def test_invert(fill_value): @pytest.mark.parametrize("op", [operator.pos, operator.neg]) def test_unary_op(op, fill_value): arr = np.array([0, 1, np.nan, 2]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = op(sparray) - expected = pd.SparseArray(op(arr), fill_value=op(fill_value)) + expected = SparseArray(op(arr), fill_value=op(fill_value)) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 0aaf294378bf7..baca18239b929 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import isna +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm @pytest.fixture(params=["integer", "block"]) @@ -470,7 +470,7 @@ def test_astype(self): arr.astype("Sparse[i8]") def test_astype_bool(self): - a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) @@ -682,7 +682,7 @@ def test_getslice_tuple(self): dense[4:, :] def test_boolean_slice_empty(self): - arr = pd.SparseArray([0, 1, 2]) + arr = SparseArray([0, 1, 2]) res = arr[[False, False, False]] assert res.dtype == arr.dtype @@ -828,12 +828,12 @@ def test_fillna_overlap(self): def test_nonzero(self): # Tests regression #21172. 
- sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) - sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) @@ -1086,11 +1086,11 @@ def test_ufunc_args(self): @pytest.mark.parametrize("fill_value", [0.0, np.nan]) def test_modf(self, fill_value): # https://github.com/pandas-dev/pandas/issues/26946 - sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) + sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) r1, r2 = np.modf(sparse) e1, e2 = np.modf(np.asarray(sparse)) - tm.assert_sp_array_equal(r1, pd.SparseArray(e1, fill_value=fill_value)) - tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) + tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value)) def test_nbytes_integer(self): arr = SparseArray([1, 0, 0, 0, 2], kind="integer") @@ -1106,7 +1106,7 @@ def test_nbytes_block(self): assert result == 24 def test_asarray_datetime64(self): - s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"])) + s = SparseArray(pd.to_datetime(["2012", None, None, "2013"])) np.asarray(s) def test_density(self): @@ -1208,7 +1208,7 @@ def test_first_fill_value_loc(arr, loc): ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): - a = pd.SparseArray(arr, fill_value=fill_value).unique() + a = SparseArray(arr, fill_value=fill_value).unique() b = pd.Series(arr).unique() assert isinstance(a, SparseArray) a = np.asarray(a) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index 4ad1aa60e7b4f..f1697dc9ff7ce 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -1,17 +1,17 @@ import numpy as np import pytest -import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray class TestSparseArrayConcat: @pytest.mark.parametrize("kind", ["integer", "block"]) def test_basic(self, kind): - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=kind) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=kind) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) # Can't make any assertions about the sparse index itself # since we aren't don't merge sparse blocs across arrays # in to_concat @@ -22,10 +22,10 @@ def test_basic(self, kind): @pytest.mark.parametrize("kind", ["integer", "block"]) def test_uses_first_kind(self, kind): other = "integer" if kind == "block" else "block" - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=other) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=other) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind diff --git a/pandas/tests/arrays/sparse/test_libsparse.py 
b/pandas/tests/arrays/sparse/test_libsparse.py index 7a85ccf271e76..a2f861d378e67 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -7,8 +7,8 @@ import pandas.util._test_decorators as td from pandas import Series +import pandas._testing as tm from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index -import pandas.util.testing as tm TEST_LENGTH = 20 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c3f342f16a0bf..33e68f029922e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -6,17 +6,19 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm -def test_repr_with_NA(): - a = pd.array(["a", pd.NA, "b"], dtype="string") - for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]: - assert "NA" in repr(obj) and "NaN" not in repr(obj) - assert "NA" in str(obj) and "NaN" not in str(obj) - if hasattr(obj, "_repr_html_"): - html_repr = obj._repr_html_() - assert "NA" in html_repr and "NaN" not in html_repr +def test_repr(): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + expected = " A\n0 a\n1 <NA>\n2 b" + assert repr(df) == expected + + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected def test_none_to_nan(): @@ -237,3 +239,14 @@ def test_arrow_roundtrip(): tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA + + +def test_value_counts_na(): + arr = pd.array(["a", "b", "a", pd.NA], dtype="string") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index f2a4e73e7b6ad..b1b5a9482e34f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -8,29 +8,34 @@ from pandas.core.dtypes.dtypes import registry import pandas as pd +import pandas._testing as tm from pandas.api.extensions import register_extension_dtype from pandas.api.types import is_scalar +from pandas.arrays import ( + BooleanArray, + DatetimeArray, + IntegerArray, + IntervalArray, + SparseArray, + StringArray, + TimedeltaArray, +) from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal -import pandas.util.testing as tm @pytest.mark.parametrize( "data, dtype, expected", [ # Basic NumPy defaults.
- ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])), + ([1, 2], None, IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - ( - np.array([1, 2], dtype="int64"), - None, - pd.arrays.IntegerArray._from_sequence([1, 2]), - ), + (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -49,37 +54,33 @@ ( [1, 2], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( np.array([1, 2], dtype="datetime64[ns]"), None, - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( pd.DatetimeIndex(["2000", "2001"]), np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( pd.DatetimeIndex(["2000", "2001"]), None, - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), # Datetime (tz-aware) ( ["2000", "2001"], pd.DatetimeTZDtype(tz="CET"), - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -87,17 +88,17 @@ ( ["1H", "2H"], np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), None, - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), # Category (["a", "b"], "category", pd.Categorical(["a", "b"])), @@ -110,27 +111,19 @@ ( [pd.Interval(1, 2), pd.Interval(3, 4)], "interval", - pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]), + IntervalArray.from_tuples([(1, 2), (3, 4)]), ), # Sparse - ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")), + ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")), # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])), - ( - ["a", None], - pd.StringDtype(), - pd.arrays.StringArray._from_sequence(["a", None]), - ), + (["a", None], "string", StringArray._from_sequence(["a", None])), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),), # Boolean - ([True, None], "boolean", pd.arrays.BooleanArray._from_sequence([True, None])), - ( - [True, None], - pd.BooleanDtype(), - pd.arrays.BooleanArray._from_sequence([True, None]), - ), + ([True, None], "boolean", BooleanArray._from_sequence([True, None])), + ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -181,31 
+174,28 @@ def test_array_copy(): period_array(["2000", "2001"], freq="D"), ), # interval - ( - [pd.Interval(0, 1), pd.Interval(1, 2)], - pd.arrays.IntervalArray.from_breaks([0, 1, 2]), - ), + ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),), # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( np.array([1, 2], dtype="M8[ns]"), - pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + DatetimeArray(np.array([1, 2], dtype="M8[ns]")), ), ( np.array([1, 2], dtype="M8[us]"), - pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), ), # datetimetz ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -214,30 +204,30 @@ def test_array_copy(): datetime.datetime(2000, 1, 1, tzinfo=cet), datetime.datetime(2001, 1, 1, tzinfo=cet), ], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + DatetimeArray._from_sequence(["2000", "2001"], tz=cet), ), # timedelta ( [pd.Timedelta("1H"), pd.Timedelta("2H")], - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( np.array([1, 2], dtype="m8[ns]"), - pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), ), ( np.array([1, 2], dtype="m8[us]"), - pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), + TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), # integer - ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])), - ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])), + ([1, 2], IntegerArray._from_sequence([1, 2])), + ([1, None], IntegerArray._from_sequence([1, None])), # string - (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), - (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), + (["a", "b"], StringArray._from_sequence(["a", "b"])), + (["a", None], StringArray._from_sequence(["a", None])), # Boolean - ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])), - ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])), + ([True, False], BooleanArray._from_sequence([True, False])), + ([True, None], BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index abec4b42c0ffb..cc8d0cdcb518d 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -6,10 +6,10 @@ import pandas.util._test_decorators as td import pandas as pd +import pandas._testing as tm from pandas.arrays import BooleanArray from pandas.core.arrays.boolean import coerce_to_array from pandas.tests.extension.base import BaseOpsUtil -import pandas.util.testing as tm def make_data(): @@ -251,6 +251,87 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 <NA>" + assert repr(df) == expected + + expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean" + assert
repr(df.A) == expected + + expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype="<U5") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + def test_astype(): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -265,6 +346,10 @@ def test_astype(): expected = np.array([1, 0, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) + result = arr.astype("str") + expected = np.array(["True", "False", "<NA>"], dtype="object") + tm.assert_numpy_array_equal(result, expected) + # no missing values arr = pd.array([True, False, True], dtype="boolean") result = arr.astype("int64") @@ -783,3 +868,14 @@ def test_astype(): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.BooleanDtype) tm.assert_frame_equal(result, df) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True,
False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index e9c64d04ec860..fa45db93c6102 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,13 +4,14 @@ import pytest from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.util.testing as tm # TODO: more freq variants @@ -758,3 +759,38 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): for nat in non_casting_nats: with pytest.raises(TypeError): array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index d5ec473f4c74d..5608ab5fbd9db 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -9,9 +9,9 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import sequence_to_dt64ns -import pandas.util.testing as tm class TestDatetimeArrayConstructor: @@ -173,7 +173,7 @@ def test_tz_setter_raises(self): def test_setitem_different_tz_raises(self): data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) - with pytest.raises(ValueError, match="None"): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): arr[0] = pd.Timestamp("2000") with pytest.raises(ValueError, match="US/Central"): @@ -282,6 +282,77 @@ def test_array_interface(self): ) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_different_tz(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + if index: + arr = pd.Index(arr) + + expected = arr.searchsorted(arr[2]) + result = arr.searchsorted(arr[2].tz_convert("UTC")) + assert result == expected + + expected = arr.searchsorted(arr[2:6]) + result = arr.searchsorted(arr[2:6].tz_convert("UTC")) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_tzawareness_compat(self, index): + 
data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + mismatch = arr.tz_localize("Asia/Tokyo") + + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch[0]) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch) + + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr[0]) + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr) + + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.timedelta64("NaT"), + pd.Timedelta(days=2), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10 ** 9, + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize( + "index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail( + reason="Raises ValueError instead of TypeError", raises=ValueError + ), + ), + ], + ) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "searchsorted requires compatible dtype or scalar" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestSequenceToDT64NS: def test_tz_dtype_mismatch_raises(self): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index e534c93c69f68..0c8980c43c370 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -6,6 +6,7 @@ from pandas.core.dtypes.generic import ABCIndexClass import pandas as pd +import pandas._testing as tm from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar from pandas.core.arrays import IntegerArray, integer_array from pandas.core.arrays.integer import ( @@ -19,7 +20,6 @@ UInt64Dtype, ) from pandas.tests.extension.base import BaseOpsUtil -import pandas.util.testing as tm def make_data(): @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "<IntegerArray>\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "<IntegerArray>\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -108,13 +108,19 @@ class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series( + data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) + ) tm.assert_series_equal(result, expected) # from int / list @@ -156,10 +162,13 @@ def _check_op(self, s, op_name, other, exc=None): # 1 ** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(~s.isna() & (s == 1), False, mask) elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + other_is_one = other == 1 + if isinstance(other_is_one,
pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( @@ -208,20 +217,27 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -255,21 +271,18 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -359,9 +372,9 @@ def test_pow_scalar(self): expected = pd.array([0, 1, None, 2], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = a ** pd.NA - # expected = pd.array([None, 1, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = a ** pd.NA + expected = pd.array([None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = a ** np.nan expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64") @@ -376,9 +389,9 @@ def test_pow_scalar(self): expected = pd.array([1, 1, 1, 1], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = pd.NA ** a - # expected = pd.array([1, None, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = np.nan ** a expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") @@ -406,10 +419,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -417,22 +430,61 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA + expected = expected.astype("boolean") 
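# [editor's note -- illustrative sketch, not part of the patch] The expected
# values above encode the new comparison semantics for nullable integers,
# assuming pandas >= 1.0: results use the nullable "boolean" dtype and pd.NA
# propagates, instead of missing slots defaulting to the __ne__ answer.
import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
s == 1  # [True, <NA>, False], dtype: boolean
s != 1  # [False, <NA>, True], dtype: boolean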
tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) def test_no_shared_mask(self, data): result = data + 1 @@ -442,20 +494,21 @@ def test_compare_to_string(self, any_nullable_int_dtype): # GH 28930 s = pd.Series([1, None], dtype=any_nullable_int_dtype) result = s == "a" - expected = pd.Series([False, False]) + expected = pd.Series([False, pd.NA], dtype="boolean") self.assert_series_equal(result, expected) def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): # GH 28930 - s1 = pd.Series([1, 2, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, 2, 3], dtype="int") + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") method = getattr(s1, all_compare_operators) result = method(2) method = getattr(s2, all_compare_operators) - expected = method(2) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA self.assert_series_equal(result, expected) @@ -543,6 +596,17 @@ def test_astype(self, all_data): expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) + def test_astype_to_larger_numpy(self): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def 
test_astype_specific_casting(self, dtype): s = pd.Series([1, 2, 3], dtype="Int64") @@ -572,12 +636,54 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) + @pytest.mark.parametrize("in_series", [True, False]) + def test_to_numpy_na_nan(self, in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("in_series", [True, False]) + @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) + def test_to_numpy_dtype(self, dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) + def test_to_numpy_na_raises(self, dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + def test_astype_str(self): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", "<NA>"], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NaN\n1 1" + expected = " A\n0 <NA>\n1 1" assert result == expected @@ -593,7 +699,7 @@ def test_conversions(data_missing): # we assert that we are exactly equal # including type conversions of scalars result = df["A"].astype("object").values - expected = np.array([np.nan, 1], dtype=object) + expected = np.array([pd.NA, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) for r, e in zip(result, expected): @@ -756,7 +862,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, False]) + expected = pd.Series([False, True, None], dtype="boolean") tm.assert_series_equal(result, expected) result = df.A + df.B @@ -820,7 +926,7 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert float NaN to integer" + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values."
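# [editor's note -- illustrative sketch, not part of the patch] What the new
# message above describes, assuming pandas >= 1.0: a plain astype to a NumPy
# integer dtype has no way to represent pd.NA, so it raises, and to_numpy()
# with an explicit na_value is the supported escape hatch.
import pandas as pd

a = pd.array([1, 2, None], dtype="Int64")
# a.astype("uint32")  # would raise: cannot convert to 'uint32'-dtype NumPy array ...
a.to_numpy(dtype="uint32", na_value=0)  # array([1, 2, 0], dtype=uint32)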
with pytest.raises(ValueError, match=msg): arr.astype("uint32") @@ -895,7 +1001,9 @@ def test_arrow_array(data): import pyarrow as pa arr = pa.array(data) - expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) @@ -931,6 +1039,17 @@ def test_stat_method(pandasmethname, kwargs): assert expected == result +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 7a150c35fea09..86793c4ec50dd 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -6,9 +6,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.arrays import PandasArray from pandas.core.arrays.numpy_ import PandasDtype -import pandas.util.testing as tm @pytest.fixture( @@ -226,3 +226,25 @@ def test_setitem_no_coercion(): arr = PandasArray(np.array([1, 2, 3])) with pytest.raises(ValueError, match="int"): arr[0] = "a" + + # With a value that we do coerce, check that we coerce the value + # and not the underlying array. + arr[0] = 2.5 + assert isinstance(arr[0], (int, np.integer)), type(arr[0]) + + +def test_setitem_preserves_views(): + # GH#28150, see also extension test of the same name + arr = PandasArray(np.array([1, 2, 3])) + view1 = arr.view() + view2 = arr[:] + view3 = np.asarray(arr) + + arr[0] = 9 + assert view1[0] == 9 + assert view2[0] == 9 + assert view3[0] == 9 + + arr[-1] = 2.5 + view1[-1] = 5 + assert arr[-1] == 5 diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 252f278242fcc..1f4351c7e20ee 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -3,12 +3,13 @@ from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd +import pandas._testing as tm from pandas.core.arrays import PeriodArray, period_array -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # Dtype @@ -323,3 +324,91 @@ def test_min_max_empty(self, skipna): result = arr.max(skipna=skipna) assert result is pd.NaT + + +# ---------------------------------------------------------------------------- +# Arrow interaction + +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + p1 = ArrowPeriodType("D") + p2 = ArrowPeriodType("D") + p3 = ArrowPeriodType("M") + + assert p1.freq == "D" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +@pytest.mark.parametrize( + "data, freq", + [ + (pd.date_range("2017", periods=3), "D"), + (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + ], +) +def test_arrow_array(data, freq): +
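# [editor's note -- illustrative sketch, not part of the patch] What the
# Arrow tests below exercise, assuming pyarrow >= 0.15.1 (per the skip
# above): a PeriodArray converts to an Arrow extension array whose int64
# storage holds the period ordinals, and NaT maps to an Arrow null.
import pandas as pd
import pyarrow as pa

periods = pd.period_range("2000", periods=3, freq="D").array
pa.array(periods).storage  # Int64Array of ordinals, with nulls where NaT was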
import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + periods = period_array(data, freq=freq) + result = pa.array(periods) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == freq + expected = pa.array(periods.asi8, type="int64") + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(periods, type=pa.int64()) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(periods, type="float64") + + with pytest.raises(TypeError, match="different 'freq'"): + pa.array(periods, type=ArrowPeriodType("T")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + + result = pa.array(arr) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == "D" + expected = pa.array([1, None, 3], type="int64") + assert result.storage.equals(expected) + + +@pyarrow_skip +def test_arrow_table_roundtrip(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 8d54ea564e1c2..62cb4766171a4 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm class TestTimedeltaArrayConstructor: @@ -140,6 +140,42 @@ def test_setitem_objects(self, obj): arr[0] = obj assert arr[0] == pd.Timedelta(seconds=1) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.datetime64("NaT"), + pd.Timestamp.now(), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + (np.arange(10) * 24 * 3600 * 10 ** 9).view("datetime64[ns]"), + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize( + "index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail( + reason="Raises ValueError instead of TypeError", raises=ValueError + ), + ), + ], + ) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = TimedeltaArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "searchsorted requires compatible dtype or scalar" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestReductions: @pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"]) diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index a9e0473ac067a..0b7274399aafc 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import DataFrame, Index, Series +import pandas._testing as tm from pandas.core.accessor import PandasDelegate from pandas.core.base import NoNewAttributesMixin, PandasObject -import 
pandas.util.testing as tm class TestPandasDelegate: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8fa52af832907..07a15d0619bb6 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -6,8 +6,15 @@ import pandas as pd from pandas import CategoricalIndex, Series, Timedelta, Timestamp -from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import ( + DatetimeArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + TimedeltaArray, +) class TestToIterable: @@ -177,14 +184,10 @@ def test_iter_box(self): ), ( pd.PeriodIndex([2018, 2019], freq="A"), - pd.core.arrays.PeriodArray, + PeriodArray, pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), ), - ( - pd.IntervalIndex.from_breaks([0, 1, 2]), - pd.core.arrays.IntervalArray, - "interval", - ), + (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",), # This test is currently failing for datetime64[ns] and timedelta64[ns]. # The NumPy type system is sufficient for representing these types, so # we just use NumPy for Series / DataFrame columns of these types (so @@ -270,8 +273,8 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): (pd.Categorical(["a", "b"]), "_codes"), (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), (pd.core.arrays.integer_array([0, np.nan]), "_data"), - (pd.core.arrays.IntervalArray.from_breaks([0, 1]), "_left"), - (pd.SparseArray([0, 1]), "_sparse_values"), + (IntervalArray.from_breaks([0, 1]), "_left"), + (SparseArray([0, 1]), "_sparse_values"), (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), # tz-aware Datetime ( @@ -288,7 +291,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): def test_array(array, attr, index_or_series): box = index_or_series if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) + pytest.skip(f"No index type for {array.dtype}") result = box(array, copy=False).array if attr: @@ -315,13 +318,13 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( - pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), + IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), ), - (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), # tz-naive datetime ( DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), @@ -354,7 +357,7 @@ def test_to_numpy(array, expected, index_or_series): thing = box(array) if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) + pytest.skip(f"No index type for {array.dtype}") result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) @@ -401,3 +404,36 @@ def test_to_numpy_dtype(as_series): result = obj.to_numpy(dtype="M8[ns]") expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype, na_value, expected", + [ + ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]), + ( + [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT], + None, + pd.Timestamp("2000"), + [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + ), + ], +) 
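# [editor's note, not part of the patch] The second case above passes a
# pd.Timestamp as na_value: to_numpy() coerces the fill value together with
# the data, so a datetime Series containing NaT can still be converted to a
# plain "M8[ns]" ndarray. A sketch, assuming pandas >= 1.0:
#
#     s = pd.Series([pd.Timestamp("2000"), pd.NaT])
#     s.to_numpy(na_value=pd.Timestamp("2000"))  # no NaT left in the result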
+@pytest.mark.parametrize("container", [pd.Series, pd.Index]) # type: ignore +def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): + s = container(values) + result = s.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array(expected) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_kwargs_raises(): + # numpy + s = pd.Series([1, 2, 3]) + match = r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 4231aa844f282..2693eb12dda71 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -29,8 +29,8 @@ TimedeltaIndex, Timestamp, ) +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm class Ops: @@ -62,8 +62,8 @@ def setup_method(self, method): self.unicode_series = Series(arr, index=self.unicode_index, name="a") types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] - self.indexes = [getattr(self, "{}_index".format(t)) for t in types] - self.series = [getattr(self, "{}_series".format(t)) for t in types] + self.indexes = [getattr(self, f"{t}_index") for t in types] + self.series = [getattr(self, f"{t}_series") for t in types] # To test narrow dtypes, we use narrower *data* elements, not *index* elements index = self.int_index @@ -79,7 +79,7 @@ def setup_method(self, method): self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] - self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types] + self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] self.objs = self.indexes + self.series + self.narrow_series diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 8438eea84baa8..7f68abb92ba43 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -16,6 +16,7 @@ import pandas as pd from pandas import DataFrame, Series, compat, date_range +import pandas._testing as tm from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -33,7 +34,6 @@ _special_case_arith_ops_syms, _unary_math_ops, ) -import pandas.util.testing as tm @pytest.fixture( @@ -339,8 +339,8 @@ def check_floor_division(self, lhs, arith1, rhs): self.check_equal(res, expected) else: msg = ( - r"unsupported operand type\(s\) for //: 'VariableNode' and" - " 'VariableNode'" + r"unsupported operand type\(s\) for //: 'VariableNode' and " + "'VariableNode'" ) with pytest.raises(TypeError, match=msg): pd.eval( diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 20a5be0c8a289..e815a90207a08 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -8,6 +8,8 @@ from pandas.compat import is_platform_windows +import pandas as pd + _all_locales = get_locales() or [] _current_locale = locale.getlocale() @@ -56,21 +58,21 @@ def test_get_locales_prefix(): @_skip_if_only_one_locale -def test_set_locale(): +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", 
"UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): if all(x is None for x in _current_locale): # Not sure why, but on some Travis runs with pytest, # getlocale() returned (None, None). pytest.skip("Current locale is not set.") - locale_override = os.environ.get("LOCALE_OVERRIDE", None) - - if locale_override is None: - lang, enc = "it_CH", "UTF-8" - elif locale_override == "C": - lang, enc = "en_US", "ascii" - else: - lang, enc = locale_override.split(".") - enc = codecs.lookup(enc).name new_locale = lang, enc @@ -91,3 +93,13 @@ def test_set_locale(): # Once we exit the "with" statement, locale should be back to what it was. current_locale = locale.getlocale() assert current_locale == _current_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index 71f41fcf5b447..cc823a3d6e02c 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -2,7 +2,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm def test_cast_1d_array_like_from_scalar_categorical(): diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index 620e74f80d5fb..fe271392122a2 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.cast import construct_1d_ndarray_preserving_na -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 99afabfa42a04..d6e6ed3022b75 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas import DatetimeIndex, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 37fa003668435..2744cfa8ddc62 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -19,7 +19,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 0939e35bd64fa..69f8f46356a4d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -8,7 +8,6 @@ import pytest from pandas._libs.tslibs import NaT -from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( @@ -406,7 +405,6 @@ def test_maybe_promote_any_with_datetime64( _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -@pytest.mark.xfail(reason="Fails to upcast to object") def test_maybe_promote_datetimetz_with_any_numpy_dtype( 
tz_aware_fixture, any_numpy_dtype_reduced ): @@ -427,11 +425,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) - from dateutil.tz import tzlocal - - if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") - # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -441,7 +434,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix expected_dtype = dtype else: expected_dtype = np.dtype(object) - pytest.xfail("fails to cast to object") _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index 49e850f3e87b5..bb7a7d059c7ee 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 667ee467f2f29..097e83d93ee71 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List import numpy as np @@ -5,6 +6,7 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.cast import astype_nansafe import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -13,8 +15,11 @@ IntervalDtype, PeriodDtype, ) +from pandas.core.dtypes.missing import isna import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -23,7 +28,6 @@ UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES, ) -import pandas.util.testing as tm # EA & Actual Dtypes @@ -179,7 +183,7 @@ def test_is_object(): "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) def test_is_sparse(check_scipy): - assert com.is_sparse(pd.SparseArray([1, 2, 3])) + assert com.is_sparse(SparseArray([1, 2, 3])) assert not com.is_sparse(np.array([1, 2, 3])) @@ -195,7 +199,7 @@ def test_is_scipy_sparse(): assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) - assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) + assert not com.is_scipy_sparse(SparseArray([1, 2, 3])) def test_is_categorical(): @@ -488,7 +492,7 @@ def test_is_numeric_v_string_like(): def test_is_datetimelike_v_numeric(): - dt = np.datetime64(pd.datetime(2017, 1, 1)) + dt = np.datetime64(datetime(2017, 1, 1)) assert not com.is_datetimelike_v_numeric(1, 1) assert not com.is_datetimelike_v_numeric(dt, dt) @@ -573,7 +577,7 @@ def test_is_extension_type(check_scipy): cat = pd.Categorical([1, 2, 3]) assert com.is_extension_type(cat) assert com.is_extension_type(pd.Series(cat)) - assert com.is_extension_type(pd.SparseArray([1, 2, 3])) + assert com.is_extension_type(SparseArray([1, 2, 3])) assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -602,7 +606,7 @@ def test_is_extension_array_dtype(check_scipy): cat = pd.Categorical([1, 2, 3]) assert com.is_extension_array_dtype(cat) assert 
com.is_extension_array_dtype(pd.Series(cat)) - assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(SparseArray([1, 2, 3])) assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -625,18 +629,6 @@ def test_is_complex_dtype(): assert com.is_complex_dtype(np.array([1 + 1j, 5])) -def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) - assert com.is_offsetlike(pd.offsets.MonthEnd()) - assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) - - assert not com.is_offsetlike(pd.Timedelta(1)) - assert not com.is_offsetlike(np.array([1 + 1j, 5])) - - # mixed case - assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) - - @pytest.mark.parametrize( "input_param,result", [ @@ -676,7 +668,8 @@ def test__get_dtype(input_param, result): (None, "Cannot deduce dtype from null object"), (1, "data type not understood"), (1.2, "data type not understood"), - ("random string", 'data type "random string" not understood'), + # numpy dev changed from double-quotes to single quotes + ("random string", "data type [\"']random string[\"'] not understood"), (pd.DataFrame([1, 2]), "data type not understood"), ], ) @@ -721,3 +714,42 @@ def test__get_dtype_fails(input_param, expected_error_message): ) def test__is_dtype_type(input_param, result): assert com._is_dtype_type(input_param, lambda tipo: tipo == result) + + +@pytest.mark.parametrize("val", [np.datetime64("NaT"), np.timedelta64("NaT")]) +@pytest.mark.parametrize("typ", [np.int64]) +def test_astype_nansafe(val, typ): + arr = np.array([val]) + + msg = "Cannot convert NaT values to integer" + with pytest.raises(ValueError, match=msg): + astype_nansafe(arr, dtype=typ) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +@pytest.mark.parametrize( + "to_type", + [ + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int32, + np.float16, + np.float32, + ], +) +def test_astype_datetime64_bad_dtype_raises(from_type, to_type): + arr = np.array([from_type("2018")]) + + with pytest.raises(TypeError, match="cannot astype"): + astype_nansafe(arr, dtype=to_type) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +def test_astype_object_preserves_datetime_na(from_type): + arr = np.array([from_type("NaT")]) + result = astype_nansafe(arr, dtype="object") + + assert isna(result)[0] diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 4dee6e3e92a7f..fddd6239df309 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -27,8 +27,8 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, IntervalIndex, Series, date_range -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class Base: @@ -408,6 +408,9 @@ def test_construction_from_string(self): with pytest.raises(TypeError): PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") + with pytest.raises(TypeError, match="list"): + PeriodDtype.construct_from_string([1, 2, 3]) + def test_is_dtype(self): assert PeriodDtype.is_dtype(self.dtype) assert PeriodDtype.is_dtype("period[D]") @@ -685,6 +688,10 @@ def test_caching(self): tm.round_trip_pickle(dtype) assert len(IntervalDtype._cache) == 0 + def test_not_string(self): + # GH30568: though IntervalDtype has 
object kind, it cannot be string + assert not is_string_dtype(IntervalDtype()) + class TestCategoricalDtypeParametrized: @pytest.mark.parametrize( @@ -907,7 +914,7 @@ def test_registry_find(dtype, expected): (pd.Series([1, 2]), False), (np.array([True, False]), True), (pd.Series([True, False]), True), - (pd.SparseArray([True, False]), True), + (SparseArray([True, False]), True), (SparseDtype(bool), True), ], ) @@ -917,7 +924,7 @@ def test_is_bool_dtype(dtype, expected): def test_is_bool_dtype_sparse(): - result = is_bool_dtype(pd.Series(pd.SparseArray([True, False]))) + result = is_bool_dtype(pd.Series(SparseArray([True, False]))) assert result is True diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index c17a8997a9b8f..2c8631ac2d71d 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -5,7 +5,7 @@ from pandas.core.dtypes import generic as gt import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestABCClasses: @@ -17,7 +17,7 @@ class TestABCClasses: categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1]) categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) - sparse_array = pd.SparseArray(np.random.randn(10)) + sparse_array = pd.arrays.SparseArray(np.random.randn(10)) datetime_array = pd.core.arrays.DatetimeArray(datetime_index) timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 343dcc6849af6..d022b0e97877a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -52,8 +52,8 @@ Timestamp, isna, ) +import pandas._testing as tm from pandas.core.arrays import IntegerArray -import pandas.util.testing as tm @pytest.fixture(params=[True, False], ids=str) @@ -240,7 +240,7 @@ def __getitem__(self, key): if has_contains: - def __contains__(self, key): + def __contains__(self, key) -> bool: return self.d.__contains__(key) d = DictLike({1: 2}) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 5e7c6e4b48682..7ba59786bb0fa 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,6 +1,5 @@ from datetime import datetime from decimal import Decimal -from warnings import catch_warnings, filterwarnings import numpy as np import pytest @@ -23,7 +22,7 @@ import pandas as pd from pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm now = pd.Timestamp.now() utcnow = pd.Timestamp.now("UTC") @@ -295,6 +294,11 @@ def test_array_equivalent(): np.array([np.nan, None], dtype="object"), np.array([np.nan, None], dtype="object"), ) + # Check the handling of nested arrays in array_equivalent_object + assert array_equivalent( + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + ) assert array_equivalent( np.array([np.nan, 1 + 1j], dtype="complex"), np.array([np.nan, 1 + 1j], dtype="complex"), @@ -315,23 +319,21 @@ def test_array_equivalent(): assert not array_equivalent( TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) ) - with catch_warnings(): - filterwarnings("ignore", "Converting timezone", FutureWarning) - assert array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - 
DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - DatetimeIndex([1, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="CET"), - DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index e88c63b19003f..94dd09d3eb053 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -2,10 +2,10 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm -pytest.importorskip("pyarrow", minversion="0.12.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index baedcf0dd9088..abd5c1f386dc5 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,7 +2,7 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.12.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowStringDtype # isort:skip diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index 2f808d20acd31..144b0825b39a2 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class BaseExtensionTests: diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7146443bf8de5..58859fc6ac54c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd from pandas.core.internals import ObjectBlock @@ -21,3 +23,12 @@ def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 9a442f346c19f..b6c12b5844086 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -16,8 +16,7 @@ def test_name(self, dtype): def test_kind(self, dtype): valid = set("biufcmMOSUV") - if dtype.kind is not None: - assert dtype.kind in valid + assert dtype.kind in valid def test_construct_from_string_own_name(self, dtype): result = dtype.construct_from_string(dtype.name) @@ -38,6 +37,9 @@ def 
test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + def test_is_not_string_type(self, dtype): return not pd.api.types.is_string_dtype(dtype) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 71c7fbb986267..dc1f62c4c97c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,45 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolean_array_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + with pytest.raises(ValueError): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError): + s[mask] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index dc926d2ff6ab4..94d0ef7bbea84 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index a29f6deeffae6..cdea96334be2a 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 973088cb72e7a..1e427c6319cab 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.sorting import nargsort -import pandas.util.testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 21bbb365ab0f3..2393d2edcd2c6 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 8766bb771f8a2..6f433d659575a 100644 --- a/pandas/tests/extension/base/reduce.py +++ 
b/pandas/tests/extension/base/reduce.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 89c9ed3674a66..ec21898852888 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -94,6 +94,19 @@ def test_concat_columns(self, data, na_value): result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) + def test_concat_extension_arrays_copy_false(self, data, na_value): + # GH 20756 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": data[3:7]}) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": data[3:7], + } + ) + result = pd.concat([df1, df2], axis=1, copy=False) + self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bb6bb02b462e2..0bb8aede6298c 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -186,3 +186,12 @@ def test_setitem_scalar_key_sequence_raise(self, data): arr = data[:5].copy() with pytest.raises(ValueError): arr[0] = arr[[0, 1]] + + def test_setitem_preserves_views(self, data): + # GH#28150 setitem shouldn't swap the underlying data + view1 = data.view() + view2 = data[:] + + data[0] = data[1] + assert view1[0] == data[1] + assert view2[0] == data[1] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 74f1e3cfbaf20..85bd5f7a33fe1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd -from pandas.api.extensions import register_extension_dtype +from pandas.api.extensions import no_default, register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin @@ -84,6 +84,12 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # if not all( @@ -109,6 +115,15 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: + # array, slice. 
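# [editor's note, not part of the patch] Sketch of the helper's contract as
# exercised by the new base getitem tests: pd.api.indexers.check_bool_array_indexer
# validates a boolean mask against the array being indexed -- a wrong-length
# mask raises IndexError, a mask containing pd.NA raises ValueError, and a
# valid mask comes back as a plain ndarray[bool] suitable for NumPy indexing.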
+ if pd.api.types.is_list_like(item): + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + elif pd.api.types.is_integer_dtype(dtype): + item = np.asarray(item, dtype="int") return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b5c3abd8ce8f6..de7c98ab96571 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import DecimalArray, DecimalDtype, make_data, to_decimal @@ -499,3 +499,17 @@ def DecimalArray__array__(self, dtype=None): df[s > 0.5] s.at[0] df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 014581682ac59..17bc2773aad19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,8 @@ import numpy as np -from pandas.core.dtypes.base import ExtensionDtype - -from pandas.core.arrays import ExtensionArray +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype class JSONDtype(ExtensionDtype): @@ -76,17 +75,21 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == "bool": - return self._from_sequence([x for x, m in zip(self, item) if m]) - elif isinstance(item, abc.Iterable): - # fancy indexing - return type(self)([self.data[i] for i in item]) elif isinstance(item, slice) and item == slice(None): # Make sure we get a view return type(self)(self.data) - else: + elif isinstance(item, slice): # slice return type(self)(self.data[item]) + else: + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) def __setitem__(self, key, value): if isinstance(key, numbers.Integral): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 01f2565e2ee58..4d3145109e3c2 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -4,8 +4,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import JSONArray, JSONDtype, make_data diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index a02433da2da12..a7ce0fb097599 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ 
-19,9 +19,9 @@ from pandas.compat.numpy import _np_version_under1p14 import pandas as pd +import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(): @@ -226,6 +226,10 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index dff1e58641ade..336b23e54d74c 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -20,9 +20,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Timestamp +import pandas._testing as tm from pandas.api.types import CategoricalDtype from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 9b5f9d64f6b67..e43650c291200 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -5,8 +5,8 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import ExtensionArray -import pandas.util.testing as tm class DummyDtype(dtypes.ExtensionDtype): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..afb8412f12ea9 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -34,7 +34,7 @@ def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +65,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +75,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +94,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) @@ -142,6 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see 
https://github.com/pandas-dev/pandas/issues/22022 @@ -162,6 +163,16 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -198,7 +209,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.skip(reason="uses nullable integer") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 4fdcf930d224f..2411f6cfbd936 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,7 +147,9 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - pass + @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) class TestPrinting(BaseInterval, base.BasePrintingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 55a617caf28ce..7db38f41d4573 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -4,8 +4,8 @@ from pandas.compat.numpy import _np_version_under1p16 import pandas as pd +import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype -import pandas.util.testing as tm from . 
import base diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 6ebe71e173ec2..198a228b621b4 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -4,9 +4,10 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import SparseArray, SparseDtype +from pandas import SparseDtype +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(fill_value): @@ -132,6 +133,10 @@ def test_concat_columns(self, data, na_value): self._check_unsupported(data) super().test_concat_columns(data, na_value) + def test_concat_extension_arrays_copy_false(self, data, na_value): + self._check_unsupported(data) + super().test_concat_extension_arrays_copy_false(data, na_value) + def test_align(self, data, na_value): self._check_unsupported(data) super().test_align(data, na_value) @@ -231,7 +236,7 @@ def test_combine_le(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray( + SparseArray( [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], fill_value=False, ) @@ -241,7 +246,7 @@ def test_combine_le(self, data_repeated): val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + SparseArray([a <= val for a in list(orig_data1)], fill_value=False) ) self.assert_series_equal(result, expected) @@ -346,7 +351,7 @@ def _compare_other(self, s, data, op_name, other): with np.errstate(all="ignore"): expected = pd.Series( - pd.SparseArray( + SparseArray( op(np.asarray(data), np.asarray(other)), fill_value=result.values.fill_value, ) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 8519c2999ade3..86aed671f1b88 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - pass + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) class TestCasting(base.BaseCastingTests): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 915d6edcd8367..774eb443c45fe 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, NaT, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 314359fc22cc4..a29c193676db2 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIndexingCategorical: diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py index bde35c04acf4f..a1c12be2b0180 100644 --- a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -1,6 +1,6 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range, notna -import pandas.util.testing as tm +import 
pandas._testing as tm class TestDataFrameIndexingDatetimeWithTZ: diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index cd384d6fdbfad..33c0e92845484 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -20,9 +20,10 @@ isna, notna, ) +import pandas._testing as tm +from pandas.arrays import SparseArray import pandas.core.common as com from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -1146,18 +1147,18 @@ def test_setitem_mixed_datetime(self): { "a": [0, 0, 0, 0, 13, 14], "b": [ - pd.datetime(2012, 1, 1), + datetime(2012, 1, 1), 1, "x", "y", - pd.datetime(2013, 1, 1), - pd.datetime(2014, 1, 1), + datetime(2013, 1, 1), + datetime(2014, 1, 1), ], } ) df = pd.DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT - df.loc[0, "b"] = pd.datetime(2012, 1, 1) + df.loc[0, "b"] = datetime(2012, 1, 1) df.loc[1, "b"] = 1 df.loc[[2, 3], "b"] = "x", "y" A = np.array( @@ -1776,7 +1777,7 @@ def test_getitem_ix_float_duplicates(self): def test_getitem_sparse_column(self): # https://github.com/pandas-dev/pandas/issues/23559 - data = pd.SparseArray([0, 1]) + data = SparseArray([0, 1]) df = pd.DataFrame({"A": data}) expected = pd.Series(data, name="A") result = df["A"] @@ -1791,7 +1792,7 @@ def test_getitem_sparse_column(self): def test_setitem_with_sparse_value(self): # GH8131 df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_array = pd.SparseArray([0, 0, 1]) + sp_array = SparseArray([0, 0, 1]) df["new_column"] = sp_array tm.assert_series_equal( df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False @@ -1799,9 +1800,9 @@ def test_setitem_with_sparse_value(self): def test_setitem_with_unaligned_sparse_value(self): df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) + sp_series = pd.Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) df["new_column"] = sp_series - exp = pd.Series(pd.SparseArray([1, 0, 0]), name="new_column") + exp = pd.Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 4fea190f28d7b..df1b128dcd227 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIndexingWhere: diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index fac6a9139462f..d128a51f4b390 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameAppend: @@ -177,3 +177,19 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): result = df.append(df.iloc[0]).iloc[-1] expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, dtype", + [ + ([1], pd.Int64Dtype()), + ([1], pd.CategoricalDtype()), + 
([pd.Interval(left=0, right=5)], pd.IntervalDtype()), + ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), + ([1], pd.SparseDtype()), + ], + ) + def test_other_dtypes(self, data, dtype): + df = pd.DataFrame(data, dtype=dtype) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(data, name=0, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 89be3779e5748..0291be0a4083e 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from pandas import DataFrame, Series, Timestamp, date_range, to_datetime -import pandas.util.testing as tm +from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +import pandas._testing as tm @pytest.fixture @@ -30,6 +30,7 @@ def test_basic(self, date_range_frame): ub = df.index[30] dates = list(dates) + result = df.asof(dates) assert result.notna().all(1).all() @@ -65,6 +66,7 @@ def test_missing(self, date_range_frame): # no match found - `where` value before earliest date in index N = 10 df = date_range_frame.iloc[:N].copy() + result = df.asof("1989-12-31") expected = Series( @@ -78,6 +80,12 @@ def test_missing(self, date_range_frame): ) tm.assert_frame_equal(result, expected) + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + def test_all_nans(self, date_range_frame): # GH 15713 # DataFrame is all nans @@ -132,5 +140,6 @@ def test_time_zone_aware_index(self, stamp, expected): Timestamp("2018-01-01 22:35:10.550+00:00"), ], ) + result = df.asof(stamp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 48444e909ee01..34727da3b95ae 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameClip: diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py index b5d3d60579f54..13a93e3efc48c 100644 --- a/pandas/tests/frame/methods/test_count.py +++ b/pandas/tests/frame/methods/test_count.py @@ -1,5 +1,5 @@ from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCount: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 209b4a800354d..5c13b60aae0d0 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCov: diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 09510fc931546..251563e51e15a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameDescribe: diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py 
index 9293855e79b1c..43c25f4c05c2d 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameDiff: diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 29ab2e1bfd512..fd4bae26ade57 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) @@ -393,6 +393,7 @@ def test_drop_duplicates_inplace(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "origin_dict, output_dict, ignore_index, output_index", [ @@ -403,24 +404,17 @@ def test_drop_duplicates_inplace(): ], ) def test_drop_duplicates_ignore_index( - origin_dict, output_dict, ignore_index, output_index + inplace, origin_dict, output_dict, ignore_index, output_index ): # GH 30114 df = DataFrame(origin_dict) expected = DataFrame(output_dict, index=output_index) - # Test when inplace is False - result = df.drop_duplicates(ignore_index=ignore_index) - tm.assert_frame_equal(result, expected) - - # to verify original dataframe is not mutated - tm.assert_frame_equal(df, DataFrame(origin_dict)) - - # Test when inplace is True - copied_df = df.copy() - - copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) - tm.assert_frame_equal(copied_df, expected) + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) - # to verify that input is unchanged + tm.assert_frame_equal(result_df, expected) tm.assert_frame_equal(df, DataFrame(origin_dict)) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index d5c28a416ffa7..72eec8753315c 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 545a4b5f9421e..76c87ed355492 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_error(): diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 5d7dc5c843ec1..0eb94afc99d94 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIsIn: diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 72299ad6b2bf6..4ce474230b686 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -8,7 +8,7 @@ import pytest 
import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture
diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py index 0c15533c37f01..8f3f37fb9fff7 100644 --- a/pandas/tests/frame/methods/test_pct_change.py +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFramePctChange: @@ -76,3 +76,21 @@ def test_pct_change_periods_freq( rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) tm.assert_frame_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + data = DataFrame( + {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 + ) + result = data.pct_change(fill_method=fill_method) + if fill_method is None: + second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] + else: + second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + expected = DataFrame( + {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, + index=["a", "b"] * 3, + ) + tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index c25b24121d481..64461c08d34f4 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -3,14 +3,14 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameQuantile: def test_quantile_sparse(self): # GH#17198 - s = pd.Series(pd.SparseArray([1, 2])) - s1 = pd.Series(pd.SparseArray([3, 4])) + s = pd.Series(pd.arrays.SparseArray([1, 2])) + s1 = pd.Series(pd.arrays.SparseArray([3, 4])) df = pd.DataFrame({0: s, 1: s1}) result = df.quantile() @@ -103,8 +103,8 @@ def test_quantile_axis_parameter(self): with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) msg = ( - "No axis named column for object type" - " <class 'pandas.core.frame.DataFrame'>" + "No axis named column for object type " + "<class 'pandas.core.frame.DataFrame'>" ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column")
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index efb0c64a4f7ac..bab2db3192b4a 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestRank:
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3b01ae0c3c2e8..aa91e7a489356 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1,23 +1,23 @@ from datetime import datetime from io import StringIO import re -from typing import Dict +from typing import Dict, List, Union import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture -def mix_ab() -> Dict[str, list]: +def mix_ab() -> Dict[str, List[Union[int, str]]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture -def mix_abc() -> Dict[str, list]: +def mix_abc() -> Dict[str, List[Union[float, str]]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
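The pct_change test added above (GH30463) pins down behaviour that is easy to get wrong: pct_change works positionally against the shifted frame, so duplicated index labels are accepted, and fill_method only controls how NaNs are padded before the division. A minimal sketch of the behaviour the test locks in, reusing the test's own data:

    import numpy as np
    import pandas as pd

    data = pd.DataFrame(
        {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]},
        index=["a", "b"] * 3,  # duplicated labels are fine
    )

    # With fill_method="pad", the NaN in column 1 is forward-filled to 1
    # before computing current / previous - 1 row by row.
    result = data.pct_change(fill_method="pad")
    print(result[1].tolist())  # [nan, inf, 0.0, 2.0, 2.0, 1.0]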
diff --git a/pandas/tests/frame/methods/test_round.py b/pandas/tests/frame/methods/test_round.py index 96ac012ce7892..0865e03cedc50 100644 --- a/pandas/tests/frame/methods/test_round.py +++ b/pandas/tests/frame/methods/test_round.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameRound:
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 7fb8fbbc95627..cfb17de892b1c 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameShift:
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 4f311bbaa8eb9..2c25e1f3740a3 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSortIndex: @@ -229,3 +229,92 @@ def test_sort_index_intervalindex(self): ) result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MultiIndex of index + mi = MultiIndex.from_tuples([[2, 1], [3, 4]],
names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index e733c01e01740..96f4d6ed90d6b 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, DataFrame, NaT, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSortValues: @@ -461,6 +461,7 @@ def test_sort_values_na_position_with_categories_raises(self): with pytest.raises(ValueError): df.sort_values(by="c", ascending=False, na_position="bad_position") + @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "original_dict, sorted_dict, ignore_index, output_index", [ @@ -481,24 +482,37 @@ def test_sort_values_na_position_with_categories_raises(self): ], ) def test_sort_values_ignore_index( - self, original_dict, sorted_dict, ignore_index, output_index + self, inplace, original_dict, sorted_dict, ignore_index, output_index ): # GH 30114 df = DataFrame(original_dict) expected = DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} - # Test when inplace is False - sorted_df = df.sort_values("A", ascending=False, ignore_index=ignore_index) - tm.assert_frame_equal(sorted_df, expected) + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + tm.assert_frame_equal(result_df, expected) tm.assert_frame_equal(df, DataFrame(original_dict)) - # Test when inplace is True - copied_df = df.copy() - - copied_df.sort_values( - "A", ascending=False, ignore_index=ignore_index, inplace=True + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = pd.DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } ) - tm.assert_frame_equal(copied_df, expected) - - tm.assert_frame_equal(df, DataFrame(original_dict)) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 556d86bed8f14..7b0adceb57668 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -6,7 +6,7 @@ import pytz from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameToDict: diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 18f77088677ec..d0181f0309af1 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -4,7 +4,7 @@ import pytest from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameToRecords: @@ -235,7 
+235,7 @@ def test_to_records_with_categorical(self): # Check that bad types raise ( dict(index=False, column_dtypes={"A": "int32", "B": "foo"}), - (TypeError, 'data type "foo" not understood'), + (TypeError, "data type [\"']foo[\"'] not understood"), ), ], ) @@ -326,7 +326,7 @@ def __init__(self, **kwargs): def __getitem__(self, key): return self.d.__getitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: return key in self.d def keys(self): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 71843053cf3a8..428b9e5068407 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,5 +1,5 @@ import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestTranspose: diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index a021a99a45a5c..ad86ee1266874 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameTruncate: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 48b373d9c7901..602ea9ca0471a 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -25,7 +25,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameAlterAxes: @@ -1312,7 +1312,7 @@ def test_rename_mapper_multi(self): def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) - result = df.rename(str.lower, columns=str.upper) + result = df.rename(index=str.lower, columns=str.upper) expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) tm.assert_frame_equal(result, expected) @@ -1336,12 +1336,12 @@ def test_rename_axis_style_raises(self): # Multiple targets and axis with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, axis="columns") + df.rename(str.lower, index=str.lower, axis="columns") # Too many targets - over_spec_msg = "Cannot specify all of 'mapper', 'index', 'columns'." 
+ over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, str.lower) + df.rename(str.lower, index=str.lower, columns=str.lower) # Duplicates with pytest.raises(TypeError, match="multiple values"): @@ -1375,16 +1375,42 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_rename_positional(self): + def test_rename_positional_raises(self): + # GH 29136 df = DataFrame(columns=["A", "B"]) - with tm.assert_produces_warning(FutureWarning) as rec: - result = df.rename(None, str.lower) - expected = DataFrame(columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - assert len(rec) == 1 - message = str(rec[0].message) - assert "rename" in message - assert "Use named arguments" in message + msg = r"rename\(\) takes from 1 to 2 positional arguments" + + with pytest.raises(TypeError, match=msg): + df.rename(None, str.lower) + + def test_rename_no_mappings_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "must pass an index to rename" + with pytest.raises(TypeError, match=msg): + df.rename() + + with pytest.raises(TypeError, match=msg): + df.rename(None, index=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None, index=None) + + def test_rename_mapper_and_positional_arguments_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=msg): + df.rename({}, index={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}, index={}) def test_assign_columns(self, float_frame): float_frame["hi"] = "there" @@ -1409,14 +1435,6 @@ def test_set_index_preserve_categorical_dtype(self): result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) - def test_ambiguous_warns(self): - df = DataFrame({"A": [1, 2]}) - with tm.assert_produces_warning(FutureWarning): - df.rename(id, id) - - with tm.assert_produces_warning(FutureWarning): - df.rename({0: 10}, {"A": "B"}) - def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 68d49c05eaa37..910230c737a2a 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -20,9 +20,9 @@ to_datetime, to_timedelta, ) +import pandas._testing as tm import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops -import pandas.util.testing as tm def assert_stat_op_calc( diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 91fb71c9de7a4..9263409f7a7f8 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,9 +5,12 @@ import numpy as np import pytest +from pandas.compat import PY37 +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameMisc: @@ -261,8 +264,27 @@ def test_itertuples(self, float_frame): df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert 
not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) + if PY37: + assert hasattr(tup3, "_fields") + else: + assert not hasattr(tup3, "_fields") + + # GH 28282 + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert hasattr(result_254_columns, "_fields") + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 + if PY37: + assert hasattr(result_255_columns, "_fields") + else: + assert not hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): @@ -518,13 +540,22 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; df = pd.DataFrame()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("df.", 1)) + + def test_attrs(self): + df = pd.DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["version"] = 1 + + result = df.rename(columns=str) + assert result.attrs == {"version": 1} diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index eb98bdc49f976..e98f74e133ea9 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -11,10 +11,10 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError -import pandas.util.testing as tm @pytest.fixture @@ -691,6 +691,18 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) + def test_apply_nested_result_axis_1(self): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + + df = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) + class TestInferOutputShape: # the user has supplied an opaque UDF where @@ -1331,8 +1343,8 @@ def test_agg_cython_table(self, df, func, expected, axis): _get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ - ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), - ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), ], ), ), @@ -1341,6 +1353,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 
5ecbe21d113b5..659b55756c4b6 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int -import pandas.util.testing as tm # ------------------------------------------------------------------- # Comparisons @@ -726,3 +726,14 @@ def test_zero_len_frame_with_series_corner_cases(): result = df + ser expected = df tm.assert_frame_equal(result, expected) + + +def test_frame_single_columns_object_sum_axis_1(): + # GH 13758 + data = { + "One": pd.Series(["A", 1.2, np.nan]), + } + df = pd.DataFrame(data) + result = df.sum(axis=1) + expected = pd.Series(["A", 1.2, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index d6ef3a7600abb..7effa98fd8213 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSelectReindex: diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index eb8febb10a646..d301ed969789e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -15,10 +15,10 @@ date_range, option_context, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray, integer_array from pandas.core.internals import ObjectBlock from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm # Segregated collection of methods that require the BlockManager internal data # structure diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index bfb691a8e75d3..9bad54b051d6c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameConcatCommon: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f3cc11cb7027d..ea1e339f44d93 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1,5 +1,5 @@ from collections import OrderedDict, abc -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta import functools import itertools @@ -25,9 +25,9 @@ date_range, isna, ) -from pandas.arrays import IntervalArray, PeriodArray +import pandas._testing as tm +from pandas.arrays import IntervalArray, PeriodArray, SparseArray from pandas.core.construction import create_series_with_explicit_dtype -import pandas.util.testing as tm MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] MIXED_INT_DTYPES = [ @@ -511,17 +511,17 @@ def test_constructor_with_embedded_frames(self): result = df2.loc[1, 0] tm.assert_frame_equal(result, df1 + 10) - def test_constructor_subclass_dict(self, float_frame): + def test_constructor_subclass_dict(self, float_frame, dict_subclass): # Test for passing dict subclass to constructor data = { - "col1": tm.TestSubDict((x, 10.0 * x) for x in range(10)), - "col2": tm.TestSubDict((x, 20.0 * x) for x in range(10)), + "col1": 
dict_subclass((x, 10.0 * x) for x in range(10)), + "col2": dict_subclass((x, 20.0 * x) for x in range(10)), } df = DataFrame(data) refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) - data = tm.TestSubDict(data.items()) + data = dict_subclass(data.items()) df = DataFrame(data) tm.assert_frame_equal(refdf, df) @@ -2414,7 +2414,7 @@ class List(list): "extension_arr", [ Categorical(list("aabbc")), - pd.SparseArray([1, np.nan, np.nan, np.nan]), + SparseArray([1, np.nan, np.nan, np.nan]), IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")), ], @@ -2425,6 +2425,14 @@ def test_constructor_with_extension_array(self, extension_arr): result = DataFrame(extension_arr) tm.assert_frame_equal(result, expected) + def test_datetime_date_tuple_columns_from_dict(self): + # GH 10863 + v = date.today() + tup = v, v + result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) + expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup]))) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): @@ -2551,3 +2559,11 @@ def test_from_tzaware_mixed_object_array(self): "datetime64[ns, CET]", ] assert (res.dtypes == expected_dtypes).all() + + def test_from_2d_ndarray_with_dtype(self): + # GH#12513 + array_dim2 = np.arange(10).reshape((5, 2)) + df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]") + + expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]") + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index ad2cbff888b2e..b545d6aa8afd3 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -9,7 +9,7 @@ import numpy as np from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCumulativeOps: @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame): # fix issue cummax_xs = datetime_frame.cummax(axis=1) assert np.shape(cummax_xs) == np.shape(datetime_frame) + + def test_cumulative_ops_preserve_dtypes(self): + # GH#19296 dont incorrectly upcast to object + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]}) + + result = df.cumsum() + + expected = DataFrame( + { + "A": Series([1, 3, 6], dtype=np.int64), + "B": Series([1, 3, 6], dtype=np.float64), + "C": df["C"].cumsum(), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index d8d56e90a2f31..06bb040224455 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -18,8 +18,8 @@ date_range, option_context, ) +import pandas._testing as tm from pandas.core.arrays import integer_array -import pandas.util.testing as tm def _check_cast(df, v): diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index a0cbc1456afa4..c6e28f3c64f12 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, period_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index f9a2061aa1ff4..ae0516dd29a1f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -8,8 +8,8 @@ import pandas as 
pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm class TestDataFrameMissingData: @@ -670,8 +670,8 @@ def test_fillna_invalid_value(self, float_frame): float_frame.fillna((1, 2)) # frame with series msg = ( - '"value" parameter must be a scalar, dict or Series, but you' - ' passed a "DataFrame"' + '"value" parameter must be a scalar, dict or Series, but you ' + 'passed a "DataFrame"' ) with pytest.raises(TypeError, match=msg): float_frame.iloc[:, 0].fillna(float_frame) @@ -970,3 +970,16 @@ def test_interp_ignore_all_good(self): # all good result = df[["B", "D"]].interpolate(downcast=None) tm.assert_frame_equal(result, df[["B", "D"]]) + + @pytest.mark.parametrize("axis", [0, 1]) + def test_interp_time_inplace_axis(self, axis): + # GH 9687 + periods = 5 + idx = pd.date_range(start="2014-01-01", periods=periods) + data = np.random.rand(periods, periods) + data[data < 0.5] = np.nan + expected = pd.DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + expected.interpolate(axis=0, method="time", inplace=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 8c0dd67af4e7d..8bc2aa214e035 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm # Column add, remove, delete. diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 8fed695a483f5..32ead406a3e86 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameNonuniqueIndexes: diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index a4f1c0688b144..c727cb398d53e 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm class TestDataFrameUnaryOperators: diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index a545db3365e36..a6b2b334d3ec8 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -14,7 +14,7 @@ period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index d577ff7c71277..703e05998e93c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -8,8 +8,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.computation.check import _NUMEXPR_INSTALLED -import pandas.util.testing as tm PARSERS = "python", "pandas" ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @@ -1048,13 +1048,35 @@ def 
test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting: @pytest.fixture(scope="class") def df(self): + """ + Yields a dataframe with strings that may or may not need escaping + by backticks. The last two columns cannot be escaped by backticks + and should raise a ValueError. + """ yield DataFrame( { "A": [1, 2, 3], "B B": [3, 2, 1], "C C": [4, 5, 6], + "C  C": [7, 4, 3], "C_C": [8, 9, 10], "D_D D": [11, 1, 101], + "E.E": [6, 3, 5], + "F-F": [8, 1, 10], + "1e1": [2, 4, 8], + "def": [10, 11, 2], + "A (x)": [4, 1, 3], + "B(x)": [1, 1, 5], + "B (x)": [2, 7, 4], + " &^ :!€$?(} > <++*'' ": [2, 5, 6], + "": [10, 11, 1], + " A": [4, 7, 9], + "  ": [1, 2, 1], + "it's": [6, 3, 1], + "that's": [9, 1, 8], + "☺": [8, 7, 6], + "foo#bar": [2, 4, 5], + 1: [5, 7, 9], } ) @@ -1093,7 +1115,64 @@ def test_mixed_underscores_and_spaces(self, df): expect = df["A"] + df["D_D D"] tm.assert_series_equal(res, expect) - def backtick_quote_name_with_no_spaces(self, df): + def test_backtick_quote_name_with_no_spaces(self, df): res = df.eval("A + `C_C`") expect = df["A"] + df["C_C"] tm.assert_series_equal(res, expect) + + def test_special_characters(self, df): + res = df.eval("`E.E` + `F-F` - A") + expect = df["E.E"] + df["F-F"] - df["A"] + tm.assert_series_equal(res, expect) + + def test_start_with_digit(self, df): + res = df.eval("A + `1e1`") + expect = df["A"] + df["1e1"] + tm.assert_series_equal(res, expect) + + def test_keyword(self, df): + res = df.eval("A + `def`") + expect = df["A"] + df["def"] + tm.assert_series_equal(res, expect) + + def test_unneeded_quoting(self, df): + res = df.query("`A` > 2") + expect = df[df["A"] > 2] + tm.assert_frame_equal(res, expect) + + def test_parenthesis(self, df): + res = df.query("`A (x)` > 2") + expect = df[df["A (x)"] > 2] + tm.assert_frame_equal(res, expect) + + def test_empty_string(self, df): + res = df.query("`` > 5") + expect = df[df[""] > 5] + tm.assert_frame_equal(res, expect) + + def test_multiple_spaces(self, df): + res = df.query("`C  C` > 5") + expect = df[df["C  C"] > 5] + tm.assert_frame_equal(res, expect) + + def test_start_with_spaces(self, df): + res = df.eval("` A` + `  `") + expect = df[" A"] + df["  "] + tm.assert_series_equal(res, expect) + + def test_lots_of_operators_string(self, df): + res = df.query("` &^ :!€$?(} > <++*'' ` > 4") + expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] + tm.assert_frame_equal(res, expect) + + def test_failing_quote(self, df): + with pytest.raises(SyntaxError): + df.query("`it's` > `that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4")
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 60dce36312145..49e6fe4940e18 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -19,7 +19,7 @@ option_context, period_range, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt @@ -164,13 +164,13 @@ def test_repr_column_name_unicode_truncation_bug(self): "Id": [7117434], "StringCol": ( "Is it possible to modify drop plot code" - " so that the output graph is displayed " + "so that the output graph is displayed " "in iphone simulator, Is it possible to " "modify drop plot code so that the " "output graph is \xe2\x80\xa8displayed " "in iphone simulator.Now we are adding " - "the CSV file externally. 
I want to Call" - " the File through the code.." + "the CSV file externally. I want to Call " + "the File through the code.." ), } ) @@ -205,6 +205,28 @@ def test_info(self, float_frame, datetime_frame): frame.info() frame.info(verbose=False) + def test_info_verbose(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + index = i - start + line_nr = " {} ".format(index) + assert line.startswith(line_nr) + def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) @@ -218,7 +240,9 @@ def test_info_memory(self): RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): - a 2 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes """.format( @@ -262,8 +286,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self): frame.info(buf=io) io.seek(0) lines = io.readlines() - assert "a 1 non-null int64\n" == lines[3] - assert "a 1 non-null float64\n" == lines[4] + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] def test_info_shows_column_dtypes(self): dtypes = [ @@ -283,13 +307,20 @@ def test_info_shows_column_dtypes(self): buf = StringIO() df.info(buf=buf) res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res for i, dtype in enumerate(dtypes): - name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype) + name = " {i:d} {i:d} {n:d} non-null {dtype}".format( + i=i, n=n, dtype=dtype + ) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (10, True)]: + for len_, verbose in [(5, None), (5, False), (12, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context("max_info_columns", 4): buf = StringIO() @@ -297,16 +328,16 @@ def test_info_max_cols(self): res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, verbose in [(10, None), (5, False), (10, True)]: + for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols no exceeded + # max_cols not exceeded with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, max_cols in [(10, 5), (5, 4)]: + for len_, max_cols in [(12, 5), (5, 4)]: # setting truncates with option_context("max_info_columns", 4): buf = StringIO() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5acd681933914..60b7611c8b9be 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameReshape: @@ -424,8 +424,8 @@ def test_stack_mixed_levels(self): # When mixed types are passed and the ints are not level # names, raise msg = ( - "level should contain all level names or all level numbers, not" - " a mixture of the 
two" + "level should contain all level names or all level numbers, not " + "a mixture of the two" ) with pytest.raises(ValueError, match=msg): df2.stack(level=["animal", 0]) @@ -1128,3 +1128,34 @@ def test_stack_timezone_aware_values(): ), ) tm.assert_series_equal(result, expected) + + +def test_unstacking_multi_index_df(): + # see gh-30740 + df = DataFrame( + { + "name": ["Alice", "Bob"], + "score": [9.5, 8], + "employed": [False, True], + "kids": [0, 0], + "gender": ["female", "male"], + } + ) + df = df.set_index(["name", "employed", "kids", "gender"]) + df = df.unstack(["gender"], fill_value=0) + expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0) + result = df.unstack(["employed", "kids"], fill_value=0) + expected = DataFrame( + [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]], + index=Index(["Alice", "Bob"], name="name"), + columns=MultiIndex.from_tuples( + [ + ("score", "female", False, 0), + ("score", "female", True, 0), + ("score", "male", False, 0), + ("score", "male", True, 0), + ], + names=[None, "gender", "employed", "kids"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index b0287d9180859..40526ab27ac9a 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index e1e546256f7cd..4a436d70dc48f 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSubclassing: diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 9985468ac6cd8..e89f4ee07ea00 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -16,7 +16,7 @@ period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index 26ab4ff0ded85..b60f2052a988f 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -11,8 +11,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestDataFrameTimezones: diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 5c39dcc1a7659..aeff92971b42a 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -18,8 +18,8 @@ read_csv, to_datetime, ) +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm from pandas.io.common import get_handle diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 270a7c70a2e81..7fe22e77c5bf3 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from 
.test_generic import Generic @@ -196,7 +196,7 @@ def test_set_attribute(self): def test_to_xarray_index_types(self, index): from xarray import Dataset - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") df = DataFrame( { "a": list("abc"), @@ -222,11 +222,10 @@ def test_to_xarray_index_types(self, index): # idempotency # categoricals are not preserved - # datetimes w/tz are not preserved + # datetimes w/tz are preserved # column names are lost expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal( result.to_dataframe(), @@ -271,7 +270,6 @@ def test_to_xarray(self): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0ff9d7fcdb209..10a1e09a09bf8 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm # ---------------------------------------------------------------------- # Generic types test cases @@ -125,7 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) + msg = f"The truth value of a {self._typ.__name__} is ambiguous" with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -203,9 +203,9 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = "compound dtypes are not implemented in the {} constructor".format( - self._typ.__name__ - ) + msg = "compound dtypes are not implemented" + f"in the {self._typ.__name__} constructor" + with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) @@ -820,6 +820,18 @@ def test_take_invalid_kwargs(self): with pytest.raises(ValueError, match=msg): obj.take(indices, mode="clip") + def test_depr_take_kwarg_is_copy(self): + # GH 27357 + df = DataFrame({"A": [1, 2, 3]}) + msg = ( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future." 
+ ) + with tm.assert_produces_warning(FutureWarning) as w: + df.take([0, 1], is_copy=True) + + assert w[0].message.args[0] == msg + def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index aaf523956aaed..8ad8355f2d530 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from .test_generic import Generic @@ -205,7 +205,7 @@ def finalize(self, other, method=None, **kwargs): def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") s = Series(range(6), index=index(6)) s.index.name = "foo" result = s.to_xarray() diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0d8379407fef7..0b72a61ed84de 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,10 +8,10 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm def test_agg_regression1(tsframe): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5d50c044cf9f5..5ddda264642de 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range +import pandas._testing as tm from pandas.core.groupby.groupby import DataError -import pandas.util.testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 765bc3bab5d4a..52ee3e652501c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -18,15 +18,15 @@ date_range, period_range, ) +import pandas._testing as tm from pandas.core.base import SpecificationError -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing def test_agg_api(): # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame( @@ -473,8 +473,7 @@ def test_agg_timezone_round_trip(): assert result3 == ts dates = [ - pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific") - for i in range(1, 5) + pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) ] df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) grouped = df.groupby("A") diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 5b8cc86513954..8901af7a90acc 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, MultiIndex +import pandas._testing as tm from pandas.core.groupby.base import reduction_kernels, 
transformation_kernels -import pandas.util.testing as tm @pytest.fixture diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0e62569fffeb6..2f2f97f2cd993 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, bdate_range -import pandas.util.testing as tm +import pandas._testing as tm def test_apply_issues(): @@ -265,7 +265,7 @@ def desc3(group): result = group.describe() # names are different - result.index.name = "stat_{:d}".format(len(group)) + result.index.name = f"stat_{len(group):d}" result = result[: len(group)] # weirdo @@ -686,6 +686,17 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) +def test_func_returns_object(): + # GH 28652 + df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) + result = df.groupby("a").apply(lambda g: g.index) + expected = Series( + [pd.Int64Index([1]), pd.Int64Index([2])], index=pd.Int64Index([1, 2], name="a") + ) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], @@ -703,3 +714,41 @@ def test_apply_datetime_issue(group_column_dtlike): ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] ) tm.assert_frame_equal(result, expected) + + +def test_apply_series_return_dataframe_groups(): + # GH 10078 + tdf = DataFrame( + { + "day": { + 0: pd.Timestamp("2015-02-24 00:00:00"), + 1: pd.Timestamp("2015-02-24 00:00:00"), + 2: pd.Timestamp("2015-02-24 00:00:00"), + 3: pd.Timestamp("2015-02-24 00:00:00"), + 4: pd.Timestamp("2015-02-24 00:00:00"), + }, + "userAgent": { + 0: "some UA string", + 1: "some UA string", + 2: "some UA string", + 3: "another UA string", + 4: "some UA string", + }, + "userId": { + 0: "17661101", + 1: "17661101", + 2: "17661101", + 3: "17661101", + 4: "17661101", + }, + } + ) + + def most_common_values(df): + return Series({c: s.value_counts().index[0] for c, s in df.iteritems()}) + + result = tdf.groupby("day").apply(most_common_values)["userId"] + expected = pd.Series( + ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index fcdf599e4ba33..ad71f73e80e64 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import ensure_int64 from pandas import Index, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm def test_series_grouper(): @@ -87,7 +87,7 @@ def _check(dtype): counts = np.zeros(len(out), dtype=np.int64) labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype)) + func = getattr(groupby, f"group_ohlc_{dtype}") func(out, counts, obj[:, None], labels) def _ohlc(group): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 89ffcd9ee313e..9323946581a0d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -15,7 +15,7 @@ Series, qcut, ) -import pandas.util.testing as tm +import pandas._testing as tm def cartesian_product_for_groupers(result, args, names): @@ -497,10 +497,10 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): 
aggr[aggr.isna()] = "missing"
     if not all(label == aggr):
         msg = (
-            "Labels and aggregation results not consistently sorted\n"
-            + "for (ordered={}, observed={}, sort={})\n"
-            + "Result:\n{}"
-        ).format(ordered, observed, sort, result)
+            "Labels and aggregation results not consistently sorted\n"
+            f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
+            f"Result:\n{result}"
+        )
         assert False, msg


@@ -798,14 +798,14 @@ def test_groupby_empty_with_category():


 def test_sort():
-    # http://stackoverflow.com/questions/23814368/sorting-pandas-
+    # https://stackoverflow.com/questions/23814368/sorting-pandas-
     # categorical-labels-after-groupby
     # This should result in a properly sorted Series so that the plot
     # has a sorted x axis
     # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

     df = DataFrame({"value": np.random.randint(0, 10000, 100)})
-    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+    labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
     cat_labels = Categorical(labels, labels)

     df = df.sort_values(by=["value"], ascending=True)
@@ -1330,3 +1330,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
     # If we expect unobserved values to be zero, we also expect the dtype to be int
     if zero_or_nan == 0:
         assert np.issubdtype(result.dtype, np.integer)
+
+
+def test_series_groupby_categorical_aggregation_getitem():
+    # GH 8870
+    d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
+    df = pd.DataFrame(d)
+    cat = pd.cut(df["foo"], np.linspace(0, 20, 5))
+    df["range"] = cat
+    groups = df.groupby(["range", "baz"], as_index=True, sort=True)
+    result = groups["foo"].agg("mean")
+    expected = groups.agg("mean")["foo"]
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py
index 8e9554085b9ee..b4239d7d34a90 100644
--- a/pandas/tests/groupby/test_counting.py
+++ b/pandas/tests/groupby/test_counting.py
@@ -4,7 +4,7 @@
 import pytest

 from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp
-import pandas.util.testing as tm
+import pandas._testing as tm


 class TestCounting:
@@ -197,11 +197,8 @@ def test_ngroup_respects_groupby_order(self):
     @pytest.mark.parametrize(
         "datetimelike",
         [
-            [
-                Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=i))
-                for i in range(1, 4)
-            ],
-            [Timestamp("2016-05-{i:02d} 20:09:25".format(i=i)) for i in range(1, 4)],
+            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
+            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
             [Timedelta(x, unit="h") for x in range(1, 4)],
             [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
         ],
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
index b3ee12b6691d7..c16ad812eb634 100644
--- a/pandas/tests/groupby/test_filters.py
+++ b/pandas/tests/groupby/test_filters.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import DataFrame, Series, Timestamp
-import pandas.util.testing as tm
+import pandas._testing as tm


 def test_filter_series():
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index c41c9b4db053a..97cf1af1d2e9e 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -20,8 +20,9 @@
     date_range,
     isna,
 )
+import pandas._testing as tm
 import pandas.core.nanops as nanops
-from pandas.util import _test_decorators as td, testing as tm
+from pandas.util import _test_decorators as td
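# --- Editorial aside (illustration, not part of the PR's diff) -------------
# Every import hunk in this patch makes the same mechanical move: the
# deprecated public `pandas.util.testing` module is swapped for the private
# `pandas._testing` module. A minimal sketch, assuming third-party test code
# that has to run against both the old and the new layout, of resolving the
# module at import time:
try:
    import pandas._testing as tm  # location introduced by this PR (pandas >= 1.0)
except ImportError:
    import pandas.util.testing as tm  # fallback for older pandas releases

# Either binding exposes the same helpers used throughout these tests,
# e.g. tm.assert_frame_equal, tm.assert_series_equal and
# tm.assert_produces_warning.
# ----------------------------------------------------------------------------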
@pytest.mark.parametrize("agg_func", ["any", "all"]) @@ -102,9 +103,7 @@ def test_builtins_apply(keys, f): result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( - result.shape, ngroups - ) + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg tm.assert_frame_equal( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8f88f68c69f2b..7e374811d1960 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,9 +9,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +import pandas._testing as tm from pandas.core.base import SpecificationError import pandas.core.common as com -import pandas.util.testing as tm def test_repr(): @@ -588,6 +588,20 @@ def test_groupby_multiple_columns(df, op): tm.assert_series_equal(result, expected) +def test_as_index_select_column(): + # GH 5764 + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + result = df.groupby("A", as_index=False)["B"].get_group(1) + expected = pd.Series([2, 4], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + expected = pd.Series( + [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) + ) + tm.assert_series_equal(result, expected) + + def test_groupby_as_index_agg(df): grouped = df.groupby("A", as_index=False) @@ -771,7 +785,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = r"unsupported operand type\(s\) for \+: 'Timestamp'" + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) @@ -921,7 +935,7 @@ def test_mutate_groups(): + ["c"] * 2 + ["d"] * 2 + ["e"] * 2, - "cat3": ["g{}".format(x) for x in range(1, 15)], + "cat3": [f"g{x}" for x in range(1, 15)], "val": np.random.randint(100, size=14), } ) @@ -1715,9 +1729,7 @@ def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = pd.DataFrame( { - "eventDate": pd.date_range( - pd.datetime.today(), periods=20, freq="M" - ).tolist(), + "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(), "thename": range(0, 20), } ) @@ -2011,3 +2023,10 @@ def test_groupby_crash_on_nunique(axis): expected = expected.T tm.assert_frame_equal(result, expected) + + +def test_groupby_list_level(): + # GH 9790 + expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + result = expected.groupby(level=[0]).mean() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index f2af397357e4f..e424913804c33 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -13,8 +13,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm # selection # -------------------------------- @@ -71,14 +71,12 @@ def test_getitem_list_of_columns(self): ) result = df.groupby("A")[["C", "D"]].mean() - result2 = df.groupby("A")["C", "D"].mean() - result3 = df.groupby("A")[df.columns[2:4]].mean() + result2 = df.groupby("A")[df.columns[2:4]].mean() expected = df.loc[:, 
["A", "C", "D"]].groupby("A").mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) def test_getitem_numeric_column_names(self): # GH #13731 @@ -91,14 +89,40 @@ def test_getitem_numeric_column_names(self): } ) result = df.groupby(0)[df.columns[1:3]].mean() - result2 = df.groupby(0)[2, 4].mean() - result3 = df.groupby(0)[[2, 4]].mean() + result2 = df.groupby(0)[[2, 4]].mean() expected = df.loc[:, [0, 2, 4]].groupby(0).mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) + + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby(0)[2, 4].mean() + + def test_getitem_single_list_of_columns(self, df): + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby("A")["C", "D"].mean() + + def test_getitem_single_column(self): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) + + result = df.groupby("A")["C"].mean() + + as_frame = df.loc[:, ["A", "C"]].groupby("A").mean() + as_series = as_frame.iloc[:, 0] + expected = as_series + + tm.assert_series_equal(result, expected) # grouping @@ -701,10 +725,7 @@ def test_get_group(self): g.get_group("foo") with pytest.raises(ValueError, match=msg): g.get_group(("foo")) - msg = ( - "must supply a same-length tuple to get_group with multiple" - " grouping keys" - ) + msg = "must supply a same-length tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): g.get_group(("foo", "bar", "baz")) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index f5c8873ff9417..971a447b84cae 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[["inner"], ["inner", "outer"]]) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index f83b284a35377..0f850f2e94581 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm +import pandas._testing as tm def test_first_last_nth(df): @@ -89,6 +89,25 @@ def test_first_last_nth_dtypes(df_mixed_floats): assert f.dtype == "int64" +def test_first_strings_timestamps(): + # GH 11244 + test = pd.DataFrame( + { + pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], + pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8f0df9051fc73..3461bf6e10662 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -3,8 +3,8 @@ 
import pandas as pd
 from pandas import DataFrame, Series, concat
+import pandas._testing as tm
 from pandas.core.base import DataError
-import pandas.util.testing as tm


 def test_rank_apply():
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 109382d97440e..6b8bd9e805a0c 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -9,9 +9,9 @@
 import pandas as pd
 from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
+import pandas._testing as tm
 from pandas.core.groupby.grouper import Grouper
 from pandas.core.groupby.ops import BinGrouper
-import pandas.util.testing as tm


 class TestGroupBy:
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
index c46180c1d11cd..6c05c4038a829 100644
--- a/pandas/tests/groupby/test_transform.py
+++ b/pandas/tests/groupby/test_transform.py
@@ -18,8 +18,8 @@
     concat,
     date_range,
 )
+import pandas._testing as tm
 from pandas.core.groupby.groupby import DataError
-import pandas.util.testing as tm


 def assert_fp_equal(a, b):
@@ -319,7 +319,7 @@ def test_dispatch_transform(tsframe):

 def test_transform_select_columns(df):
     f = lambda x: x.mean()
-    result = df.groupby("A")["C", "D"].transform(f)
+    result = df.groupby("A")[["C", "D"]].transform(f)

     selection = df[["C", "D"]]
     expected = selection.groupby(df["A"]).transform(f)
@@ -765,9 +765,12 @@ def test_transform_with_non_scalar_group():
     ],
 )
 @pytest.mark.parametrize("agg_func", ["count", "rank", "size"])
-def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
+def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request):
     if agg_func == "size" and isinstance(cols, list):
-        pytest.xfail("'size' transformation not supported with NDFrameGroupy")
+        # https://github.com/pytest-dev/pytest/issues/6300
+        # workaround to xfail fixture/param permutations
+        reason = "'size' transformation not supported with NDFrameGroupy"
+        request.node.add_marker(pytest.mark.xfail(reason=reason))

     # GH 19200
     df = pd.DataFrame(
@@ -874,27 +877,19 @@ def test_pad_stable_sorting(fill_method):
         ),
     ],
 )
-@pytest.mark.parametrize(
-    "periods,fill_method,limit",
-    [
-        (1, "ffill", None),
-        (1, "ffill", 1),
-        (1, "bfill", None),
-        (1, "bfill", 1),
-        (-1, "ffill", None),
-        (-1, "ffill", 1),
-        (-1, "bfill", None),
-        (-1, "bfill", 1),
-    ],
-)
+@pytest.mark.parametrize("periods", [1, -1])
+@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None])
+@pytest.mark.parametrize("limit", [None, 1])
 def test_pct_change(test_series, freq, periods, fill_method, limit):
-    # GH 21200, 21621
+    # GH 21200, 21621, 30463
     vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4]
     keys = ["a", "b"]
     key_v = np.repeat(keys, len(vals))
     df = DataFrame({"key": key_v, "vals": vals * 2})

-    df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
+    df_g = df
+    if fill_method is not None:
+        df_g = getattr(df.groupby("key"), fill_method)(limit=limit)
     grp = df_g.groupby(df.key)

     expected = grp["vals"].obj / grp["vals"].shift(periods) - 1
@@ -967,9 +962,7 @@ def demean_rename(x):
         if isinstance(x, pd.Series):
             return result

-        result = result.rename(
-            columns={c: "{}_demeaned".format(c) for c in result.columns}
-        )
+        result = result.rename(columns={c: f"{c}_demeaned" for c in result.columns})

         return result

@@ -1138,3 +1131,40 @@ def func(grp):

     expected = pd.DataFrame([2, -2, 2, 4], columns=["B"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_transform_lambda_indexing():
+    # GH 7883
+    df = 
pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c76ee09f977b5..c86cb4532bc26 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -10,7 +10,7 @@ import pytest from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime -import pandas.util.testing as tm +import pandas._testing as tm # our starting frame @@ -47,7 +47,7 @@ def seed_df(seed_nans, n, m): keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) - ids.append("{}-{}-{}".format(k, n, m)) + ids.append(f"{k}-{n}-{m}") @pytest.mark.slow diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 48ea2646c52fc..8e387e9202ef6 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -9,12 +9,12 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.groupby.base import ( groupby_other_methods, reduction_kernels, transformation_kernels, ) -import pandas.util.testing as tm AGG_FUNCTIONS = [ "sum", @@ -404,7 +404,7 @@ def test_all_methods_categorized(mframe): # new public method? if new_names: - msg = """ + msg = f""" There are uncatgeorized methods defined on the Grouper class: {names}. @@ -418,19 +418,19 @@ def test_all_methods_categorized(mframe): see the comments in pandas/core/groupby/base.py for guidance on how to fix this test. """ - raise AssertionError(msg.format(names=names)) + raise AssertionError(msg) # removed a public method? all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods print(names) print(all_categorized) if not (names == all_categorized): - msg = """ + msg = f""" Some methods which are supposed to be on the Grouper class are missing: -{names}. +{all_categorized - names}. They're still defined in one of the lists that live in pandas/core/groupby/base.py. 
If you removed a method, you should update them """ - raise AssertionError(msg.format(names=all_categorized - names)) + raise AssertionError(msg) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 9a5f9e40374a3..e027641288bb9 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import Categorical, IntervalIndex +import pandas._testing as tm from pandas.core.indexes.api import CategoricalIndex, Index -import pandas.util.testing as tm from ..common import Base @@ -43,7 +43,7 @@ def test_can_hold_identifiers(self): (lambda idx: ["a", "b"] + idx, "__radd__"), ], ) - def test_disallow_set_ops(self, func, op_name): + def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) @@ -298,8 +298,8 @@ def test_insert(self): # invalid msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): ci.insert(0, "d") @@ -528,8 +528,8 @@ def test_get_indexer(self): tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) msg = ( - "method='pad' and method='backfill' not implemented yet for" - " CategoricalIndex" + "method='pad' and method='backfill' not implemented yet for " + "CategoricalIndex" ) with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") @@ -673,8 +673,8 @@ def test_equals_categorical(self): ci1 == Index(["a", "b", "c"]) msg = ( - "categorical index comparisons must have the same categories" - " and ordered attributes" + "categorical index comparisons must have the same categories " + "and ordered attributes" "|" "Categoricals can only be compared if 'categories' are the same. 
" "Categories are different lengths" @@ -975,3 +975,9 @@ def test_engine_type(self, dtype, engine_type): ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="cannot mask with array containing NA"): + idx[:, None] diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py index f3d580b7215c2..1df0874e2f947 100644 --- a/pandas/tests/indexes/categorical/test_constructors.py +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, CategoricalDtype, CategoricalIndex, Index -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalIndexConstructors: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 68cca473d6bb0..a16017b0e12c0 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -23,9 +23,9 @@ UInt64Index, isna, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm class Base: @@ -875,3 +875,11 @@ def test_engine_reference_cycle(self): nrefs_pre = len(gc.get_referrers(index)) index._engine assert len(gc.get_referrers(index)) == nrefs_pre + + def test_getitem_2d_deprecated(self): + # GH#30588 + idx = self.create_index() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + res = idx[:, None] + + assert isinstance(res, np.ndarray), type(res) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 2a9a8bf8d824f..e3e7ff4093b76 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.indexes.api import Index, MultiIndex -import pandas.util.testing as tm indices_dict = { "unicode": tm.makeUnicodeIndex(100), diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 6eedfca129856..3c72d34d84b28 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .common import Base diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index eabf293ae915f..6139726dc34e4 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -17,7 +17,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 58ab44fba08cf..ffe51dd1fb9f5 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import datetime, timedelta from functools import partial from operator import attrgetter @@ -10,17 +10,9 @@ from pandas._libs.tslibs import OutOfBoundsDatetime, conversion import pandas as pd -from pandas import ( - DatetimeIndex, - Index, - 
Timestamp,
-    date_range,
-    datetime,
-    offsets,
-    to_datetime,
-)
+from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets, to_datetime
+import pandas._testing as tm
 from pandas.core.arrays import DatetimeArray, period_array
-import pandas.util.testing as tm


 class TestDatetimeIndex:
@@ -37,6 +29,25 @@ def test_freq_validation_with_nat(self, dt_cls):
         with pytest.raises(ValueError, match=msg):
             dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D")

+    # TODO: better place for tests shared by DTI/TDI?
+    @pytest.mark.parametrize(
+        "index",
+        [
+            pd.date_range("2016-01-01", periods=5, tz="US/Pacific"),
+            pd.timedelta_range("1 Day", periods=5),
+        ],
+    )
+    def test_shallow_copy_inherits_array_freq(self, index):
+        # If we pass a DTA/TDA to shallow_copy and don't specify a freq,
+        # we should inherit the array's freq, not our own.
+        array = index._data
+
+        arr = array[[0, 3, 2, 4, 1]]
+        assert arr.freq is None
+
+        result = index._shallow_copy(arr)
+        assert result.freq is None
+
     def test_categorical_preserves_tz(self):
         # GH#18664 retain tz when going DTI-->Categorical-->DTI
         # TODO: parametrize over DatetimeIndex/DatetimeArray
@@ -536,15 +547,15 @@ def test_constructor_coverage(self):

         # non-conforming
         msg = (
-            "Inferred frequency None from passed values does not conform"
-            " to passed frequency D"
+            "Inferred frequency None from passed values does not conform "
+            "to passed frequency D"
         )
         with pytest.raises(ValueError, match=msg):
             DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D")

         msg = (
-            "Of the four parameters: start, end, periods, and freq, exactly"
-            " three must be specified"
+            "Of the four parameters: start, end, periods, and freq, exactly "
+            "three must be specified"
         )
         with pytest.raises(ValueError, match=msg):
             date_range(start="2011-01-01", freq="b")
@@ -711,7 +722,6 @@ def test_constructor_timestamp_near_dst(self):
         expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()])
         tm.assert_index_equal(result, expected)

-    # TODO(GH-24559): Remove the xfail for the tz-aware case.
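# --- Editorial aside (illustration, not part of the PR's diff) -------------
# A minimal sketch of the behavior test_shallow_copy_inherits_array_freq
# above pins down, reusing the test's own values; `_data` and `_shallow_copy`
# are pandas-internal names taken directly from that test. Fancy indexing
# reorders the array, so it loses its freq, and the shallow copy keeps the
# array's freq (None) instead of reusing the original index's "D" freq:
import pandas as pd

index = pd.date_range("2016-01-01", periods=5, tz="US/Pacific")
arr = index._data[[0, 3, 2, 4, 1]]  # reordered, so no inferrable freq
assert arr.freq is None
assert index._shallow_copy(arr).freq is None
# ----------------------------------------------------------------------------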
@pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 36cdaa8a6029b..4d0beecbbf5d3 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, bdate_range, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import ( BDay, @@ -945,3 +945,19 @@ def test_range_with_millisecond_resolution(self, start_end): result = pd.date_range(start=start, end=end, periods=2, closed="left") expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) + + +def test_date_range_with_custom_holidays(): + # GH 30593 + freq = pd.offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) + result = pd.date_range(start="2020-11-25 15:00", periods=4, freq=freq) + expected = pd.DatetimeIndex( + [ + "2020-11-25 15:00:00", + "2020-11-25 16:00:00", + "2020-11-27 15:00:00", + "2020-11-27 16:00:00", + ], + freq=freq, + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 03b9502be2735..ca18d6fbea11a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm randn = np.random.randn @@ -393,15 +393,13 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
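# --- Editorial aside (illustration, not part of the PR's diff) -------------
# The asarray hunks below drop their tm.assert_produces_warning wrappers:
# np.asarray on a tz-naive DatetimeIndex never warned, and for tz-aware
# indexes the GH-24559 FutureWarning is gone, with the conversion now
# requested explicitly through the dtype argument. A minimal sketch of the
# resulting behavior, using the same data as the tests:
import numpy as np
import pandas as pd

naive = pd.date_range("2000", periods=2)
assert np.asarray(naive).dtype == "M8[ns]"  # datetime64[ns], no warning

aware = pd.date_range("2000", periods=2, tz="US/Central")
# an explicit dtype yields the UTC wall times as datetime64[ns]
assert np.asarray(aware, dtype="M8[ns]")[0] == np.datetime64("2000-01-01T06")
# object dtype round-trips the tz-aware Timestamps
assert np.asarray(aware, dtype=object)[0] == pd.Timestamp("2000-01-01", tz="US/Central")
# ----------------------------------------------------------------------------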
idx = pd.date_range("2000", periods=2) # M8[ns] by default - with tm.assert_produces_warning(None): - result = np.asarray(idx) + result = np.asarray(idx) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -410,15 +408,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" idx = pd.date_range("2000", periods=2, tz=tz) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(idx) + result = np.asarray(idx, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype="M8[ns]") + result = np.asarray(idx, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -426,8 +421,7 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 2ff6853b98929..da1bd6f091d1a 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -2,7 +2,7 @@ import pytest from pandas import DatetimeIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 33a744cc25ca1..f34019e06fd5f 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_to_native_types(): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 02ea857550a9b..4c600e510790a 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DatetimeIndex, Index, Timestamp, date_range, notna -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, CDay @@ -86,7 +86,9 @@ def test_dti_business_getitem(self): def test_dti_business_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END) - values = rng[:, None] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] tm.assert_numpy_array_equal(values, expected) @@ -110,7 +112,9 @@ def test_dti_custom_getitem(self): def test_dti_custom_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END, freq="C") - values = rng[:, None] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] 
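# --- Editorial aside (illustration, not part of the PR's diff) -------------
# The matplotlib-hackaround hunks above wrap `rng[:, None]` because GH-30588
# deprecates multi-dimensional indexing on an Index. A minimal sketch,
# assuming pandas at the state of this PR, of how downstream code could
# silence the deprecation while keeping the same 2-D ndarray result:
import warnings

import numpy as np
import pandas as pd

rng = pd.bdate_range("2020-01-01", periods=3)
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)  # GH-30588
    values = rng[:, None]  # still returns a 2-D ndarray at this point

np.testing.assert_array_equal(values, rng.values[:, None])
# ----------------------------------------------------------------------------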
tm.assert_numpy_array_equal(values, expected) @@ -132,9 +136,32 @@ def test_where_other(self): i2 = i.copy() i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notna(i2), i2.values) + result = i.where(notna(i2), i2._values) tm.assert_index_equal(result, i2) + def test_where_invalid_dtypes(self): + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + + i2 = dti.copy() + i2 = Index([pd.NaT, pd.NaT] + dti[2:].tolist()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-naive ndarray to tzaware DTI + dti.where(notna(i2), i2.values) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-aware DTI to tznaive DTI + dti.tz_localize(None).where(notna(i2), i2) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.tz_localize(None).to_period("D")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8) + def test_where_tz(self): i = pd.date_range("20130101", periods=3, tz="US/Eastern") result = i.where(notna(i)) @@ -317,7 +344,9 @@ def test_take_fill_value_with_timezone(self): class TestDatetimeIndex: - @pytest.mark.parametrize("null", [None, np.nan, pd.NaT]) + @pytest.mark.parametrize( + "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] + ) @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_nat(self, tz, null): # GH#16537, GH#18295 (test missing) @@ -326,6 +355,12 @@ def test_insert_nat(self, tz, null): res = idx.insert(0, null) tm.assert_index_equal(res, expected) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.timedelta64("NaT")) + def test_insert(self): idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") @@ -403,9 +438,9 @@ def test_insert(self): # see gh-7299 idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, pd.Timestamp("2000-01-04")) - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, datetime(2000, 1, 4)) with pytest.raises(ValueError): idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) @@ -582,6 +617,23 @@ def test_delete_slice(self): assert result.freq == expected.freq assert result.tz == expected.tz + def test_get_value(self): + # specifically make sure we have test for np.datetime64 key + dti = pd.date_range("2016-01-01", periods=3) + + arr = np.arange(6, 8) + + key = dti[1] + + result = dti.get_value(arr, key) + assert result == 7 + + result = dti.get_value(arr, key.to_pydatetime()) + assert result == 7 + + result = dti.get_value(arr, key.to_datetime64()) + assert result == 7 + def test_get_loc(self): idx = pd.date_range("2000-01-01", periods=3) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index c144f2a447ed3..340f53b2868bd 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,4 +1,5 @@ import calendar +from datetime import datetime import locale import unicodedata @@ -6,8 +7,8 @@ import 
pytest import pandas as pd -from pandas import DatetimeIndex, Index, Timestamp, date_range, datetime, offsets -import pandas.util.testing as tm +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets +import pandas._testing as tm class TestTimeSeries: diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 6d94319b33b02..3399c8eaf6750 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index fb032947143d3..ecd4ace705e9e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -16,8 +16,8 @@ bdate_range, date_range, ) +import pandas._testing as tm from pandas.tests.base.test_ops import Ops -import pandas.util.testing as tm from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 8d5aa64a49cf2..e30cc4449e01e 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -16,8 +16,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm class TestSlicing: diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 62383555f6048..84eee2419f0b8 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import to_offset diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 3fb39b2081d83..78188c54b1d85 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -16,7 +16,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd @@ -163,6 +163,21 @@ def test_union_freq_both_none(self, sort): tm.assert_index_equal(result, expected) assert result.freq is None + def test_union_freq_infer(self): + # When taking the union of two DatetimeIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # TimedeltaIndex behavior. 
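# --- Editorial aside (illustration, not part of the PR's diff) -------------
# A minimal sketch of the inference test_union_freq_infer asserts, using the
# same slices as the test: both operands carry freq=None, yet their union
# reassembles the full daily range, so the result comes back with an
# inferred freq of "D":
import pandas as pd

dti = pd.date_range("2016-01-01", periods=5)
left, right = dti[[0, 1, 3, 4]], dti[[2, 3, 1]]
assert left.freq is None and right.freq is None

result = left.union(right)
assert result.equals(dti)
assert result.freq == "D"
# ----------------------------------------------------------------------------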
+ dti = pd.date_range("2016-01-01", periods=5) + left = dti[[0, 1, 3, 4]] + right = dti[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, dti) + assert result.freq == "D" + def test_union_dataframe_index(self): rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") s1 = Series(np.random.randn(len(rng1)), rng1) diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py index 6f8315debdfa9..1c87995931c62 100644 --- a/pandas/tests/indexes/datetimes/test_shift.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndexShift: diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 3f942f9b79428..1505ac1dff29c 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -22,7 +22,7 @@ isna, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class FixedOffset(tzinfo): diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1aaacfc0949c3..fe65653ba6545 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -30,9 +30,9 @@ isna, to_datetime, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools -import pandas.util.testing as tm class TestTimeConversionFormats: @@ -616,8 +616,8 @@ def test_to_datetime_tz(self, cache): pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), ] msg = ( - "Tz-aware datetime.datetime cannot be converted to datetime64" - " unless utc=True" + "Tz-aware datetime.datetime cannot be " + "converted to datetime64 unless utc=True" ) with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) @@ -2291,3 +2291,25 @@ def test_should_cache_errors(unique_share, check_count, err_message): with pytest.raises(AssertionError, match=err_message): tools.should_cache(arg, unique_share, check_count) + + +def test_nullable_integer_to_datetime(): + # Test for #30050 + ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = ser.astype("Int64") + ser_copy = ser.copy() + + res = pd.to_datetime(ser, unit="ns") + + expected = pd.Series( + [ + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + np.datetime64("NaT"), + np.datetime64("2043-01-25 23:56:49.213693952"), + np.datetime64("NaT"), + ] + ) + tm.assert_series_equal(res, expected) + # Check that ser isn't mutated + tm.assert_series_equal(ser, ser_copy) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 708cd8a4579e8..c94af6c0d533e 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -12,7 +12,7 @@ Timestamp, interval_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class Base: @@ -67,7 +67,7 @@ def test_astype_cannot_cast(self, index, dtype): index.astype(dtype) def test_astype_invalid_dtype(self, index): - msg = 'data type "fake_dtype" not understood' + msg = "data type [\"']fake_dtype[\"'] not understood" with pytest.raises(TypeError, match=msg): index.astype("fake_dtype") diff --git a/pandas/tests/indexes/interval/test_base.py 
b/pandas/tests/indexes/interval/test_base.py index 339bdaf79c690..d8c2ba8413cfb 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -2,8 +2,8 @@ import pytest from pandas import IntervalIndex, Series, date_range +import pandas._testing as tm from pandas.tests.indexes.common import Base -import pandas.util.testing as tm class TestBase(Base): @@ -79,3 +79,10 @@ def test_where(self, closed, klass): expected = IntervalIndex([np.nan] + idx[1:].tolist()) result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + idx[:, None] diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 98c1f7c6c2a8a..837c124db2bed 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -19,9 +19,9 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture(params=[None, "foo"]) @@ -164,7 +164,7 @@ def test_generic_errors(self, constructor): constructor(dtype="int64", **filler) # invalid dtype - msg = 'data type "invalid" not understood' + msg = "data type [\"']invalid[\"'] not understood" with pytest.raises(TypeError, match=msg): constructor(dtype="invalid", **filler) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index dcc0c818182ab..7acf5c1e0906c 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, IntervalIndex, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndexRendering: diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 15ea9a6b62c20..1bfc58733a110 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -11,8 +11,8 @@ date_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm class TestGetLoc: @@ -349,8 +349,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) @@ -358,8 +358,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2)) @@ -369,8 +369,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get right slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get right slice bound for non-unique 
label: '
+                "Interval(0, 2, closed='right')\""
             ),
         ):
             index.slice_locs(end=Interval(0, 2))
@@ -378,8 +378,8 @@ def test_slice_locs_with_interval(self):
         with pytest.raises(
             KeyError,
             match=re.escape(
-                '"Cannot get right slice bound for non-unique label:'
-                " Interval(0, 2, closed='right')\""
+                '"Cannot get right slice bound for non-unique label: '
+                "Interval(0, 2, closed='right')\""
             ),
         ):
             index.slice_locs(start=Interval(2, 4), end=Interval(0, 2))
@@ -431,8 +431,8 @@ def test_slice_locs_with_ints_and_floats_errors(self, tuples, query):
         with pytest.raises(
             KeyError,
             match=(
-                "'can only get slices from an IntervalIndex if bounds are"
-                " non-overlapping and all monotonic increasing or decreasing'"
+                "'can only get slices from an IntervalIndex if bounds are "
+                "non-overlapping and all monotonic increasing or decreasing'"
             ),
         ):
             index.slice_locs(start, stop)
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
index 6ad7dfb22f2b3..47a0ba7fe0f21 100644
--- a/pandas/tests/indexes/interval/test_interval.py
+++ b/pandas/tests/indexes/interval/test_interval.py
@@ -17,8 +17,8 @@
     notna,
     timedelta_range,
 )
+import pandas._testing as tm
 import pandas.core.common as com
-import pandas.util.testing as tm
 
 
 @pytest.fixture(scope="class", params=[None, "foo"])
@@ -586,8 +586,8 @@ def test_missing_values(self, closed):
         assert idx.equals(idx2)
 
         msg = (
-            "missing values must be missing in the same location both left"
-            " and right sides"
+            "missing values must be missing in the same location both left "
+            "and right sides"
         )
         with pytest.raises(ValueError, match=msg):
             IntervalIndex.from_arrays(
diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py
index 7891666e6cdba..2f28c33a3bbc6 100644
--- a/pandas/tests/indexes/interval/test_interval_range.py
+++ b/pandas/tests/indexes/interval/test_interval_range.py
@@ -15,7 +15,7 @@
     interval_range,
     timedelta_range,
 )
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 from pandas.tseries.offsets import Day
 
diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py
index f2fca34e083c2..476ec1dd10b4b 100644
--- a/pandas/tests/indexes/interval/test_interval_tree.py
+++ b/pandas/tests/indexes/interval/test_interval_tree.py
@@ -6,7 +6,7 @@
 from pandas._libs.interval import IntervalTree
 
 from pandas import compat
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def skipif_32bit(param):
@@ -20,9 +20,7 @@ def skipif_32bit(param):
     return pytest.param(param, marks=marks)
 
 
-@pytest.fixture(
-    scope="class", params=["int32", "int64", "float32", "float64", "uint64"]
-)
+@pytest.fixture(scope="class", params=["int64", "float64", "uint64"])
 def dtype(request):
     return request.param
 
@@ -39,12 +37,9 @@ def leaf_size(request):
 @pytest.fixture(
     params=[
         np.arange(5, dtype="int64"),
-        np.arange(5, dtype="int32"),
         np.arange(5, dtype="uint64"),
         np.arange(5, dtype="float64"),
-        np.arange(5, dtype="float32"),
         np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"),
-        np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"),
     ]
 )
 def tree(request, leaf_size):
@@ -63,6 +58,18 @@ def test_get_indexer(self, tree):
         ):
             tree.get_indexer(np.array([3.0]))
 
+    @pytest.mark.parametrize(
+        "dtype, target_value, target_dtype",
+        [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
+    )
+    def test_get_indexer_overflow(self, dtype, target_value, target_dtype):
+        left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype)
+        tree = IntervalTree(left, right)
+
+        result = tree.get_indexer(np.array([target_value], dtype=target_dtype))
+        expected = np.array([-1], dtype="intp")
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_get_indexer_non_unique(self, tree):
         indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5]))
 
@@ -82,6 +89,22 @@ def test_get_indexer_non_unique(self, tree):
         expected = np.array([2], dtype="intp")
         tm.assert_numpy_array_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "dtype, target_value, target_dtype",
+        [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")],
+    )
+    def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype):
+        left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype)
+        tree = IntervalTree(left, right)
+        target = np.array([target_value], dtype=target_dtype)
+
+        result_indexer, result_missing = tree.get_indexer_non_unique(target)
+        expected_indexer = np.array([-1], dtype="intp")
+        tm.assert_numpy_array_equal(result_indexer, expected_indexer)
+
+        expected_missing = np.array([0], dtype="intp")
+        tm.assert_numpy_array_equal(result_missing, expected_missing)
+
     def test_duplicates(self, dtype):
         left = np.array([0, 0, 0], dtype=dtype)
         tree = IntervalTree(left, left + 1)
@@ -120,10 +143,10 @@ def test_get_indexer_closed(self, closed, leaf_size):
     @pytest.mark.parametrize(
         "left, right, expected",
         [
-            (np.array([0, 1, 4]), np.array([2, 3, 5]), True),
-            (np.array([0, 1, 2]), np.array([5, 4, 3]), True),
+            (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True),
+            (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True),
             (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True),
-            (np.array([0, 2, 4]), np.array([1, 3, 5]), False),
+            (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False),
             (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False),
         ],
     )
@@ -138,7 +161,7 @@ def test_is_overlapping(self, closed, order, left, right, expected):
     def test_is_overlapping_endpoints(self, closed, order):
         """shared endpoints are marked as overlapping"""
         # GH 23309
-        left, right = np.arange(3), np.arange(1, 4)
+        left, right = np.arange(3, dtype="int64"), np.arange(1, 4)
         tree = IntervalTree(left[order], right[order], closed=closed)
         result = tree.is_overlapping
         expected = closed == "both"
@@ -161,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right):
     @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440")
     def test_construction_overflow(self):
         # GH 25485
-        left, right = np.arange(101), [np.iinfo(np.int64).max] * 101
+        left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101
         tree = IntervalTree(left, right)
 
         # pivot should be average of left/right medians
diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py
index 89e733c30b1e3..3246ac6bafde9 100644
--- a/pandas/tests/indexes/interval/test_setops.py
+++ b/pandas/tests/indexes/interval/test_setops.py
@@ -2,7 +2,7 @@
 import pytest
 
 from pandas import Index, IntervalIndex, Timestamp, interval_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 @pytest.fixture(scope="class", params=[None, "foo"])
diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py
index a6d08c845d941..ac1e0893683d1 100644
--- a/pandas/tests/indexes/multi/test_analytics.py
+++ b/pandas/tests/indexes/multi/test_analytics.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex, date_range, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_shift(idx):
@@ -348,9 +348,9 @@ def test_numpy_ufuncs(idx, func):
 )
 def test_numpy_type_funcs(idx, func):
     msg = (
-        f"ufunc '{func.__name__}' not supported for the input types, and the inputs"
-        " could not be safely coerced to any supported types according to"
-        " the casting rule ''safe''"
+        f"ufunc '{func.__name__}' not supported for the input types, and the inputs "
+        "could not be safely coerced to any supported types according to "
+        "the casting rule ''safe''"
     )
     with pytest.raises(TypeError, match=msg):
         func(idx)
diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py
index 93fdeb10b849a..29908537fbe59 100644
--- a/pandas/tests/indexes/multi/test_astype.py
+++ b/pandas/tests/indexes/multi/test_astype.py
@@ -3,7 +3,7 @@
 
 from pandas.core.dtypes.dtypes import CategoricalDtype
 
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_astype(idx):
diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py
index b02f87dc4aacb..d92cff1e10496 100644
--- a/pandas/tests/indexes/multi/test_compat.py
+++ b/pandas/tests/indexes/multi/test_compat.py
@@ -2,7 +2,7 @@
 import pytest
 
 from pandas import MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_numeric_compat(idx):
diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py
index 0e4d144c0fd34..2c4b3ce04f96d 100644
--- a/pandas/tests/indexes/multi/test_constructors.py
+++ b/pandas/tests/indexes/multi/test_constructors.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex, date_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_constructor_single_level():
@@ -65,8 +65,8 @@ def test_constructor_mismatched_codes_levels(idx):
         MultiIndex(levels=levels, codes=codes)
 
     length_error = (
-        r"On level 0, code max \(3\) >= length of level \(1\)\."
-        " NOTE: this index is in an inconsistent state"
+        r"On level 0, code max \(3\) >= length of level \(1\)\. "
+        "NOTE: this index is in an inconsistent state"
    )
     label_error = r"Unequal code lengths: \[4, 2\]"
     code_value_error = r"On level 0, code value \(-2\) < -1"
diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py
index 64d2859cd13db..49aa63210cd5e 100644
--- a/pandas/tests/indexes/multi/test_contains.py
+++ b/pandas/tests/indexes/multi/test_contains.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas import MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_contains_top_level():
@@ -98,3 +98,27 @@ def test_isin_level_kwarg():
 
     with pytest.raises(KeyError, match="'Level C not found'"):
         idx.isin(vals_1, level="C")
+
+
+def test_contains_with_missing_value():
+    # issue 19132
+    idx = MultiIndex.from_arrays([[1, np.nan, 2]])
+    assert np.nan in idx
+
+    idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
+    assert np.nan not in idx
+    assert (1, np.nan) in idx
+
+
+@pytest.mark.parametrize(
+    "labels,expected,level",
+    [
+        ([("b", np.nan)], np.array([False, False, True]), None,),
+        ([np.nan, "a"], np.array([True, True, False]), 0),
+        (["d", np.nan], np.array([False, True, True]), 1),
+    ],
+)
+def test_isin_multi_index_with_missing_value(labels, expected, level):
+    # GH 19132
+    midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]])
+    tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected)
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
index fab4f72dc153b..8956e6ed4996f 100644
--- a/pandas/tests/indexes/multi/test_conversion.py
+++ b/pandas/tests/indexes/multi/test_conversion.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import DataFrame, MultiIndex, date_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_tolist(idx):
diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py
index 12cd0db6936f5..1acc65aef8b8a 100644
--- a/pandas/tests/indexes/multi/test_copy.py
+++ b/pandas/tests/indexes/multi/test_copy.py
@@ -3,7 +3,7 @@
 import pytest
 
 from pandas import MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def assert_multiindex_copied(copy, original):
diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py
index 364420a292ed5..b909025b3f2f9 100644
--- a/pandas/tests/indexes/multi/test_drop.py
+++ b/pandas/tests/indexes/multi/test_drop.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_drop(idx):
@@ -108,8 +108,8 @@ def test_droplevel_list():
     assert dropped.equals(expected)
 
     msg = (
-        "Cannot remove 3 levels from an index with 3 levels: at least one"
-        " level must be left"
+        "Cannot remove 3 levels from an index with 3 levels: "
+        "at least one level must be left"
     )
     with pytest.raises(ValueError, match=msg):
         index[:2].droplevel(["one", "two", "three"])
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index ee1f068b92df1..93e1de535835f 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -6,7 +6,7 @@
 from pandas._libs import hashtable
 
 from pandas import DatetimeIndex, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 @pytest.mark.parametrize("names", [None, ["first", "second"]])
diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py
index c81af5a0c6c49..063ede028add7 100644
--- a/pandas/tests/indexes/multi/test_equivalence.py
+++ b/pandas/tests/indexes/multi/test_equivalence.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex, Series
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_equals(idx):
diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py
index 3a8063aed8d20..75f23fb2f32ba 100644
--- a/pandas/tests/indexes/multi/test_format.py
+++ b/pandas/tests/indexes/multi/test_format.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from pandas import MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_format(idx):
diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py
index ec3c654ecb1ed..074072ae581b2 100644
--- a/pandas/tests/indexes/multi/test_get_set.py
+++ b/pandas/tests/indexes/multi/test_get_set.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import CategoricalIndex, Index, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def assert_matching(actual, expected, check_dtype=False):
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
index 9ef2a77205acc..ad6f06d065150 100644
--- a/pandas/tests/indexes/multi/test_indexing.py
+++ b/pandas/tests/indexes/multi/test_indexing.py
@@ -12,8 +12,8 @@
     MultiIndex,
     date_range,
 )
+import pandas._testing as tm
 from pandas.core.indexes.base import InvalidIndexError
-import pandas.util.testing as tm
 
 
 def test_slice_locs_partial(idx):
@@ -437,3 +437,91 @@ def test_timestamp_multiindex_indexer():
     )
     should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo")
     tm.assert_series_equal(result, should_be)
+
+
+def test_get_loc_with_values_including_missing_values():
+    # issue 19132
+    idx = MultiIndex.from_product([[np.nan, 1]] * 2)
+    expected = slice(0, 2, None)
+    assert idx.get_loc(np.nan) == expected
+
+    idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
+    expected = np.array([True, False, False, True])
+    tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected)
+
+    idx = MultiIndex.from_product([[np.nan, 1]] * 3)
+    expected = slice(2, 4, None)
+    assert idx.get_loc((np.nan, 1)) == expected
+
+
+@pytest.mark.parametrize(
+    "index_arr,labels,expected",
+    [
+        (
+            [[1, np.nan, 2], [3, 4, 5]],
+            [1, np.nan, 2],
+            np.array([-1, -1, -1], dtype=np.intp),
+        ),
+        ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)),
+        ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)),
+        (
+            [[1, 2, 3], [np.nan, 4, 5]],
+            [np.nan, 4, 5],
+            np.array([-1, -1, -1], dtype=np.intp),
+        ),
+    ],
+)
+def test_get_indexer_with_missing_value(index_arr, labels, expected):
+    # issue 19132
+    idx = MultiIndex.from_arrays(index_arr)
+    result = idx.get_indexer(labels)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index_arr,expected,target,algo",
+    [
+        ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
+        ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
+        ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
+    ],
+)
+def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
+    # issue 19132
+    idx = MultiIndex.from_arrays(index_arr)
+    result = idx.get_slice_bound(target, side=algo, kind="loc")
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "index_arr,expected,start_idx,end_idx",
+    [
+        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1),
+        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)),
+        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3),
+        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)),
+    ],
+)
+def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
+    # issue 19132
+    idx = MultiIndex.from_arrays(index_arr)
+    result = idx.slice_indexer(start=start_idx, end=end_idx)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "index_arr,expected,start_idx,end_idx",
+    [
+        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
+        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
+        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
+        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
+        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
+        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
+    ],
+)
+def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx):
+    # issue 19132
+    idx = MultiIndex.from_arrays(index_arr)
+    result = idx.slice_locs(start=start_idx, end=end_idx)
+    assert result == expected
diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py
index a8711533e806c..f2ec15e0af88c 100644
--- a/pandas/tests/indexes/multi/test_integrity.py
+++ b/pandas/tests/indexes/multi/test_integrity.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas import IntervalIndex, MultiIndex, RangeIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_labels_dtypes():
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
index 42d8cf761842e..062fb92c44552 100644
--- a/pandas/tests/indexes/multi/test_join.py
+++ b/pandas/tests/indexes/multi/test_join.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 @pytest.mark.parametrize(
@@ -87,3 +87,19 @@ def test_join_self_unique(idx, join_type):
     if idx.is_unique:
         joined = idx.join(idx, how=join_type)
         assert (idx == joined).all()
+
+
+def test_join_multi_wrong_order():
+    # GH 25760
+    # GH 28956
+
+    midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
+    midx2 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])
+
+    join_idx, lidx, ridx = midx1.join(midx2, return_indexers=False)
+
+    exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)
+
+    tm.assert_index_equal(midx1, join_idx)
+    assert lidx is None
+    tm.assert_numpy_array_equal(ridx, exp_ridx)
diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py
index f053f690e1018..a17e1e9928bff 100644
--- a/pandas/tests/indexes/multi/test_missing.py
+++ b/pandas/tests/indexes/multi/test_missing.py
@@ -5,8 +5,8 @@
 import pandas as pd
 from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index
+import pandas._testing as tm
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
-import pandas.util.testing as tm
 
 
 def test_fillna(idx):
diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py
index 5c3a48c9dd481..479b5ef0211a0 100644
--- a/pandas/tests/indexes/multi/test_names.py
+++ b/pandas/tests/indexes/multi/test_names.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from pandas import MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def check_level_names(index, names):
@@ -124,3 +124,20 @@ def test_get_names_from_levels():
 
     assert idx.levels[0].name == "a"
     assert idx.levels[1].name == "b"
+
+
+def test_setting_names_from_levels_raises():
+    idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"])
+    with pytest.raises(RuntimeError, match="set_names"):
+        idx.levels[0].name = "foo"
+
+    with pytest.raises(RuntimeError, match="set_names"):
+        idx.levels[1].name = "foo"
+
+    new = pd.Series(1, index=idx.levels[0])
+    with pytest.raises(RuntimeError, match="set_names"):
+        new.index.name = "bar"
+
+    assert pd.Index._no_setting_name is False
+    assert pd.Int64Index._no_setting_name is False
+    assert pd.RangeIndex._no_setting_name is False
diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py
index 5db1296d828ca..b00018d2ceb69 100644
--- a/pandas/tests/indexes/multi/test_partial_indexing.py
+++ b/pandas/tests/indexes/multi/test_partial_indexing.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import DataFrame, MultiIndex, date_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_partial_string_timestamp_multiindex():
diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py
index 513efa8941de8..ceb14aa82a76c 100644
--- a/pandas/tests/indexes/multi/test_reindex.py
+++ b/pandas/tests/indexes/multi/test_reindex.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_reindex(idx):
diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py
index 37df420e9ea2e..2e39c714ca7af 100644
--- a/pandas/tests/indexes/multi/test_reshape.py
+++ b/pandas/tests/indexes/multi/test_reshape.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Index, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_insert(idx):
diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_setops.py
similarity index 99%
rename from pandas/tests/indexes/multi/test_set_ops.py
rename to pandas/tests/indexes/multi/test_setops.py
index 835784054261e..841e3b3f17b38 100644
--- a/pandas/tests/indexes/multi/test_set_ops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import MultiIndex, Series
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 @pytest.mark.parametrize("case", [0.5, "xxx"])
diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py
index 3dee1dbecf3ba..277bd79cfe953 100644
--- a/pandas/tests/indexes/multi/test_sorting.py
+++ b/pandas/tests/indexes/multi/test_sorting.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_sortlevel(idx):
@@ -120,7 +120,7 @@ def test_unsortedindex():
 
 def test_unsortedindex_doc_examples():
-    # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex  # noqa
+    # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex  # noqa
     dfm = DataFrame(
         {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)}
     )
diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py
index fd6013ab5ae08..88e800d66f3ad 100644
--- a/pandas/tests/indexes/period/test_asfreq.py
+++ b/pandas/tests/indexes/period/test_asfreq.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import DataFrame, PeriodIndex, Series, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodIndex:
diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py
index fa57ec2b1f7ca..ec386dd9dd11c 100644
--- a/pandas/tests/indexes/period/test_astype.py
+++ b/pandas/tests/indexes/period/test_astype.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Index, Int64Index, NaT, Period, PeriodIndex, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodIndexAsType:
diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py
index 2adce0b7f8b44..27ee915e48e5c 100644
--- a/pandas/tests/indexes/period/test_constructors.py
+++ b/pandas/tests/indexes/period/test_constructors.py
@@ -7,14 +7,11 @@
 import pandas as pd
 from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range
-import pandas.core.indexes.period as period
-import pandas.util.testing as tm
+import pandas._testing as tm
+from pandas.core.arrays import PeriodArray
 
 
 class TestPeriodIndex:
-    def setup_method(self, method):
-        pass
-
     def test_construction_base_constructor(self):
         # GH 13664
         arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")]
@@ -32,6 +29,30 @@ def test_construction_base_constructor(self):
             pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object)
         )
 
+    def test_base_constructor_with_period_dtype(self):
+        dtype = PeriodDtype("D")
+        values = ["2011-01-01", "2012-03-04", "2014-05-01"]
+        result = pd.Index(values, dtype=dtype)
+
+        expected = pd.PeriodIndex(values, dtype=dtype)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "values_constructor", [list, np.array, PeriodIndex, PeriodArray._from_sequence]
+    )
+    def test_index_object_dtype(self, values_constructor):
+        # Index(periods, dtype=object) is an Index (not an PeriodIndex)
+        periods = [
+            pd.Period("2011-01", freq="M"),
+            pd.NaT,
+            pd.Period("2011-03", freq="M"),
+        ]
+        values = values_constructor(periods)
+        result = Index(values, dtype=object)
+
+        assert type(result) is Index
+        tm.assert_numpy_array_equal(result.values, np.array(values))
+
     def test_constructor_use_start_freq(self):
         # GH #1118
         p = Period("4/2/2012", freq="B")
@@ -201,7 +222,7 @@ def test_constructor_dtype(self):
         assert res.dtype == "period[M]"
 
         msg = "specified freq and dtype are different"
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        with pytest.raises(IncompatibleFrequency, match=msg):
             PeriodIndex(["2011-01"], freq="M", dtype="period[D]")
 
     def test_constructor_empty(self):
@@ -261,12 +282,12 @@ def test_constructor_pi_nat(self):
     def test_constructor_incompat_freq(self):
         msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)"
 
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        with pytest.raises(IncompatibleFrequency, match=msg):
             PeriodIndex(
                 [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")]
             )
 
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        with pytest.raises(IncompatibleFrequency, match=msg):
             PeriodIndex(
                 np.array(
                     [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")]
@@ -274,12 +295,12 @@ def test_constructor_incompat_freq(self):
         )
 
         # first element is pd.NaT
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        with pytest.raises(IncompatibleFrequency, match=msg):
             PeriodIndex(
                 [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")]
             )
 
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        with pytest.raises(IncompatibleFrequency, match=msg):
             PeriodIndex(
                 np.array(
                     [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")]
diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py
index 2a88b79f381c4..5db373a9f07ae 100644
--- a/pandas/tests/indexes/period/test_formats.py
+++ b/pandas/tests/indexes/period/test_formats.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import PeriodIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def test_to_native_types():
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index e17f0af24760c..7dbefbdaff98e 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas import DatetimeIndex, Period, PeriodIndex, Series, notna, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestGetItem:
@@ -235,6 +235,21 @@ def test_where_other(self):
             result = i.where(notna(i2), i2.values)
             tm.assert_index_equal(result, i2)
 
+    def test_where_invalid_dtypes(self):
+        pi = period_range("20130101", periods=5, freq="D")
+
+        i2 = pi.copy()
+        i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + pi[2:].tolist(), freq="D")
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            pi.where(notna(i2), i2.asi8)
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            pi.where(notna(i2), i2.asi8.view("timedelta64[ns]"))
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            pi.where(notna(i2), i2.to_timestamp("S"))
+
 
 class TestTake:
     def test_take(self):
@@ -550,6 +565,35 @@ def test_get_indexer(self):
         res = idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 day"))
         tm.assert_numpy_array_equal(res, np.array([0, 0, 1, -1], dtype=np.intp))
 
+    def test_get_indexer_mismatched_dtype(self):
+        # Check that we return all -1s and do not raise or cast incorrectly
+
+        dti = pd.date_range("2016-01-01", periods=3)
+        pi = dti.to_period("D")
+        pi2 = dti.to_period("W")
+
+        expected = np.array([-1, -1, -1], dtype=np.intp)
+
+        result = pi.get_indexer(dti)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # This should work in both directions
+        result = dti.get_indexer(pi)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = pi.get_indexer(pi2)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # We expect the same from get_indexer_non_unique
+        result = pi.get_indexer_non_unique(dti)[0]
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dti.get_indexer_non_unique(pi)[0]
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = pi.get_indexer_non_unique(pi2)[0]
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_get_indexer_non_unique(self):
         # GH 17717
         p1 = pd.Period("2017-09-02")
diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py
index 962e674fa607f..427d9ab712320 100644
--- a/pandas/tests/indexes/period/test_ops.py
+++ b/pandas/tests/indexes/period/test_ops.py
@@ -3,9 +3,9 @@
 import pandas as pd
 from pandas import DatetimeIndex, Index, NaT, PeriodIndex, Series
+import pandas._testing as tm
 from pandas.core.arrays import PeriodArray
 from pandas.tests.base.test_ops import Ops
-import pandas.util.testing as tm
 
 
 class TestPeriodIndexOps(Ops):
diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py
index 501c2a4d8edcc..9ca2dd169416f 100644
--- a/pandas/tests/indexes/period/test_partial_slicing.py
+++ b/pandas/tests/indexes/period/test_partial_slicing.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import DataFrame, Period, Series, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodIndex:
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
index c11dda8f67620..16fa0b0c25925 100644
--- a/pandas/tests/indexes/period/test_period.py
+++ b/pandas/tests/indexes/period/test_period.py
@@ -17,7 +17,7 @@
     offsets,
     period_range,
 )
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 from ..datetimelike import DatetimeLike
 
@@ -105,25 +105,6 @@ def test_no_millisecond_field(self):
         with pytest.raises(AttributeError, match=msg):
             DatetimeIndex([]).millisecond
 
-    @pytest.mark.parametrize("sort", [None, False])
-    def test_difference_freq(self, sort):
-        # GH14323: difference of Period MUST preserve frequency
-        # but the ability to union results must be preserved
-
-        index = period_range("20160920", "20160925", freq="D")
-
-        other = period_range("20160921", "20160924", freq="D")
-        expected = PeriodIndex(["20160920", "20160925"], freq="D")
-        idx_diff = index.difference(other, sort)
-        tm.assert_index_equal(idx_diff, expected)
-        tm.assert_attr_equal("freq", idx_diff, expected)
-
-        other = period_range("20160922", "20160925", freq="D")
-        idx_diff = index.difference(other, sort)
-        expected = PeriodIndex(["20160920", "20160921"], freq="D")
-        tm.assert_index_equal(idx_diff, expected)
-        tm.assert_attr_equal("freq", idx_diff, expected)
-
     def test_hash_error(self):
         index = period_range("20010101", periods=10)
         msg = f"unhashable type: '{type(index).__name__}'"
@@ -242,8 +223,8 @@ def test_period_index_length(self):
         i1 = period_range(start=start, end=end_intv)
 
         msg = (
-            "Of the three parameters: start, end, and periods, exactly two"
-            " must be specified"
+            "Of the three parameters: start, end, and periods, exactly two "
+            "must be specified"
         )
         with pytest.raises(ValueError, match=msg):
             period_range(start=start)
@@ -446,8 +427,8 @@ def test_contains_nat(self):
     def test_periods_number_check(self):
         msg = (
-            "Of the three parameters: start, end, and periods, exactly two"
-            " must be specified"
+            "Of the three parameters: start, end, and periods, exactly two "
+            "must be specified"
         )
         with pytest.raises(ValueError, match=msg):
             period_range("2011-1-1", "2012-1-1", "B")
@@ -470,7 +451,7 @@ def test_index_duplicate_periods(self):
         idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN")
         ts = Series(np.random.randn(len(idx)), index=idx)
 
-        result = ts[2007]
+        result = ts["2007"]
         expected = ts[1:3]
         tm.assert_series_equal(result, expected)
         result[:] = 1
@@ -480,8 +461,8 @@ def test_index_duplicate_periods(self):
         idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN")
         ts = Series(np.random.randn(len(idx)), index=idx)
 
-        result = ts[2007]
-        expected = ts[idx == 2007]
+        result = ts["2007"]
+        expected = ts[idx == "2007"]
         tm.assert_series_equal(result, expected)
 
     def test_index_unique(self):
diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py
index 828fab08daceb..2c3d22198df9f 100644
--- a/pandas/tests/indexes/period/test_period_range.py
+++ b/pandas/tests/indexes/period/test_period_range.py
@@ -1,7 +1,7 @@
 import pytest
 
 from pandas import NaT, Period, PeriodIndex, date_range, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodRange:
diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py
index 7956b9f26e6ef..d9809f0f75611 100644
--- a/pandas/tests/indexes/period/test_scalar_compat.py
+++ b/pandas/tests/indexes/period/test_scalar_compat.py
@@ -1,7 +1,7 @@
 """Tests for PeriodIndex behaving like a vectorized Period scalar"""
 
 from pandas import Timedelta, date_range, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodIndexOps:
diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py
index 03e4bd5834166..dc7805880784f 100644
--- a/pandas/tests/indexes/period/test_setops.py
+++ b/pandas/tests/indexes/period/test_setops.py
@@ -1,10 +1,11 @@
 import numpy as np
 import pytest
 
+from pandas._libs.tslibs import IncompatibleFrequency
+
 import pandas as pd
 from pandas import Index, PeriodIndex, date_range, period_range
-import pandas.core.indexes.period as period
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 def _permute(obj):
@@ -177,11 +178,11 @@ def test_union_misc(self, sort):
         # raise if different frequencies
         index = period_range("1/1/2000", "1/20/2000", freq="D")
         index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED")
-        with pytest.raises(period.IncompatibleFrequency):
+        with pytest.raises(IncompatibleFrequency):
             index.union(index2, sort=sort)
 
         index3 = period_range("1/1/2000", "1/20/2000", freq="2D")
-        with pytest.raises(period.IncompatibleFrequency):
+        with pytest.raises(IncompatibleFrequency):
             index.join(index3)
 
     def test_union_dataframe_index(self):
@@ -213,11 +214,11 @@ def test_intersection(self, sort):
         # raise if different frequencies
         index = period_range("1/1/2000", "1/20/2000", freq="D")
         index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED")
-        with pytest.raises(period.IncompatibleFrequency):
+        with pytest.raises(IncompatibleFrequency):
             index.intersection(index2, sort=sort)
 
         index3 = period_range("1/1/2000", "1/20/2000", freq="2D")
-        with pytest.raises(period.IncompatibleFrequency):
+        with pytest.raises(IncompatibleFrequency):
             index.intersection(index3, sort=sort)
 
     @pytest.mark.parametrize("sort", [None, False])
@@ -353,3 +354,22 @@ def test_difference(self, sort):
         if sort is None:
             expected = expected.sort_values()
         tm.assert_index_equal(result_difference, expected)
+
+    @pytest.mark.parametrize("sort", [None, False])
+    def test_difference_freq(self, sort):
+        # GH14323: difference of Period MUST preserve frequency
+        # but the ability to union results must be preserved
+
+        index = period_range("20160920", "20160925", freq="D")
+
+        other = period_range("20160921", "20160924", freq="D")
+        expected = PeriodIndex(["20160920", "20160925"], freq="D")
+        idx_diff = index.difference(other, sort)
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal("freq", idx_diff, expected)
+
+        other = period_range("20160922", "20160925", freq="D")
+        idx_diff = index.difference(other, sort)
+        expected = PeriodIndex(["20160920", "20160921"], freq="D")
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal("freq", idx_diff, expected)
diff --git a/pandas/tests/indexes/period/test_shift.py b/pandas/tests/indexes/period/test_shift.py
index 7543f85c6d138..5689e98c33455 100644
--- a/pandas/tests/indexes/period/test_shift.py
+++ b/pandas/tests/indexes/period/test_shift.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import PeriodIndex, period_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodIndexShift:
diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py
index a9c0ecd1a3041..28ab14af71362 100644
--- a/pandas/tests/indexes/period/test_tools.py
+++ b/pandas/tests/indexes/period/test_tools.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs.tslibs import IncompatibleFrequency
 from pandas._libs.tslibs.ccalendar import MONTHS
 
 import pandas as pd
@@ -17,8 +18,7 @@
     period_range,
     to_datetime,
 )
-import pandas.core.indexes.period as period
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestPeriodRepresentation:
@@ -231,14 +231,43 @@ def test_searchsorted(self, freq):
         p2 = pd.Period("2014-01-04", freq=freq)
         assert pidx.searchsorted(p2) == 3
 
-        msg = "Input has different freq=H from PeriodIndex"
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        assert pidx.searchsorted(pd.NaT) == 0
+
+        msg = "Input has different freq=H from PeriodArray"
+        with pytest.raises(IncompatibleFrequency, match=msg):
             pidx.searchsorted(pd.Period("2014-01-01", freq="H"))
 
-        msg = "Input has different freq=5D from PeriodIndex"
-        with pytest.raises(period.IncompatibleFrequency, match=msg):
+        msg = "Input has different freq=5D from PeriodArray"
+        with pytest.raises(IncompatibleFrequency, match=msg):
             pidx.searchsorted(pd.Period("2014-01-01", freq="5D"))
 
+    def test_searchsorted_invalid(self):
+        pidx = pd.PeriodIndex(
+            ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"],
+            freq="D",
+        )
+
+        other = np.array([0, 1], dtype=np.int64)
+
+        msg = "requires either a Period or PeriodArray"
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(other)
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(other.astype("timedelta64[ns]"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.timedelta64(4))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.timedelta64("NaT", "ms"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.datetime64(4, "ns"))
+
+        with pytest.raises(TypeError, match=msg):
+            pidx.searchsorted(np.datetime64("NaT", "ns"))
+
 
 class TestPeriodIndexConversion:
     def test_tolist(self):
diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py
index 5067b6c74871b..ba1de6d551d6b 100644
--- a/pandas/tests/indexes/ranges/test_constructors.py
+++ b/pandas/tests/indexes/ranges/test_constructors.py
@@ -4,7 +4,7 @@
 import pytest
 
 from pandas import Index, RangeIndex, Series
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestRangeIndexConstructors:
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index db0cc9828e9e9..8d98ab18963b6 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -1,5 +1,3 @@
-from datetime import datetime, timedelta
-
 import numpy as np
 import pytest
 
@@ -7,7 +5,7 @@
 import pandas as pd
 from pandas import Float64Index, Index, Int64Index, RangeIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 from ..test_numeric import Numeric
 
@@ -464,176 +462,6 @@ def test_join_self(self, join_type):
         joined = index.join(index, how=join_type)
         assert index is joined
 
-    @pytest.mark.parametrize("sort", [None, False])
-    def test_intersection(self, sort):
-        # intersect with Int64Index
-        index = self.create_index()
-        other = Index(np.arange(1, 6))
-        result = index.intersection(other, sort=sort)
-        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
-        tm.assert_index_equal(result, expected)
-
-        result = other.intersection(index, sort=sort)
-        expected = Index(
-            np.sort(np.asarray(np.intersect1d(index.values, other.values)))
-        )
-        tm.assert_index_equal(result, expected)
-
-        # intersect with increasing RangeIndex
-        other = RangeIndex(1, 6)
-        result = index.intersection(other, sort=sort)
-        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
-        tm.assert_index_equal(result, expected)
-
-        # intersect with decreasing RangeIndex
-        other = RangeIndex(5, 0, -1)
-        result = index.intersection(other, sort=sort)
-        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
-        tm.assert_index_equal(result, expected)
-
-        # reversed (GH 17296)
-        result = other.intersection(index, sort=sort)
-        tm.assert_index_equal(result, expected)
-
-        # GH 17296: intersect two decreasing RangeIndexes
-        first = RangeIndex(10, -2, -2)
-        other = RangeIndex(5, -4, -1)
-        expected = first.astype(int).intersection(other.astype(int), sort=sort)
-        result = first.intersection(other, sort=sort).astype(int)
-        tm.assert_index_equal(result, expected)
-
-        # reversed
-        result = other.intersection(first, sort=sort).astype(int)
-        tm.assert_index_equal(result, expected)
-
-        index = RangeIndex(5)
-
-        # intersect of non-overlapping indices
-        other = RangeIndex(5, 10, 1)
-        result = index.intersection(other, sort=sort)
-        expected = RangeIndex(0, 0, 1)
-        tm.assert_index_equal(result, expected)
-
-        other = RangeIndex(-1, -5, -1)
-        result = index.intersection(other, sort=sort)
-        expected = RangeIndex(0, 0, 1)
-        tm.assert_index_equal(result, expected)
-
-        # intersection of empty indices
-        other = RangeIndex(0, 0, 1)
-        result = index.intersection(other, sort=sort)
-        expected = RangeIndex(0, 0, 1)
-        tm.assert_index_equal(result, expected)
-
-        result = other.intersection(index, sort=sort)
-        tm.assert_index_equal(result, expected)
-
-        # intersection of non-overlapping values based on start value and gcd
-        index = RangeIndex(1, 10, 2)
-        other = RangeIndex(0, 10, 4)
-        result = index.intersection(other, sort=sort)
-        expected = RangeIndex(0, 0, 1)
-        tm.assert_index_equal(result, expected)
-
-    @pytest.mark.parametrize("sort", [False, None])
-    def test_union_noncomparable(self, sort):
-        # corner case, non-Int64Index
-        index = self.create_index()
-        other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object)
-        result = index.union(other, sort=sort)
-        expected = Index(np.concatenate((index, other)))
-        tm.assert_index_equal(result, expected)
-
-        result = other.union(index, sort=sort)
-        expected = Index(np.concatenate((other, index)))
-        tm.assert_index_equal(result, expected)
-
-    @pytest.fixture(
-        params=[
-            (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)),
-            (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))),
-            (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))),
-            (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)),
-            (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), I64(range(0, -20, -1))),
-            (
-                RI(0, 10, 2),
-                RI(1, 10, 2),
-                RI(0, 10, 1),
-                I64(list(range(0, 10, 2)) + list(range(1, 10, 2))),
-            ),
-            (
-                RI(0, 11, 2),
-                RI(1, 12, 2),
-                RI(0, 12, 1),
-                I64(list(range(0, 11, 2)) + list(range(1, 12, 2))),
-            ),
-            (
-                RI(0, 21, 4),
-                RI(-2, 24, 4),
-                RI(-2, 24, 2),
-                I64(list(range(0, 21, 4)) + list(range(-2, 24, 4))),
-            ),
-            (
-                RI(0, -20, -2),
-                RI(-1, -21, -2),
-                RI(-19, 1, 1),
-                I64(list(range(0, -20, -2)) + list(range(-1, -21, -2))),
-            ),
-            (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))),
-            (
-                RI(0, -100, -5),
-                RI(5, -100, -20),
-                RI(-95, 10, 5),
-                I64(list(range(0, -100, -5)) + [5]),
-            ),
-            (
-                RI(0, -11, -1),
-                RI(1, -12, -4),
-                RI(-11, 2, 1),
-                I64(list(range(0, -11, -1)) + [1, -11]),
-            ),
-            (RI(0), RI(0), RI(0), RI(0)),
-            (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)),
-            (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), I64(range(0, 102, 2))),
-            (
-                RI(0, -100, -2),
-                RI(-100, 50, 102),
-                RI(-100, 4, 2),
-                I64(list(range(0, -100, -2)) + [-100, 2]),
-            ),
-            (
-                RI(0, -100, -1),
-                RI(0, -50, -3),
-                RI(-99, 1, 1),
-                I64(list(range(0, -100, -1))),
-            ),
-            (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])),
-            (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])),
-            (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])),
-            (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)),
-            (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])),
-        ]
-    )
-    def unions(self, request):
-        """Inputs and expected outputs for RangeIndex.union tests"""
-
-        return request.param
-
-    def test_union_sorted(self, unions):
-
-        idx1, idx2, expected_sorted, expected_notsorted = unions
-
-        res1 = idx1.union(idx2, sort=None)
-        tm.assert_index_equal(res1, expected_sorted, exact=True)
-
-        res1 = idx1.union(idx2, sort=False)
-        tm.assert_index_equal(res1, expected_notsorted, exact=True)
-
-        res2 = idx2.union(idx1, sort=None)
-        res3 = idx1._int64index.union(idx2, sort=None)
-        tm.assert_index_equal(res2, expected_sorted, exact=True)
-        tm.assert_index_equal(res3, expected_sorted)
-
     def test_nbytes(self):
 
         # memory savings vs int index
diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py
new file mode 100644
index 0000000000000..5bedc4089feba
--- /dev/null
+++ b/pandas/tests/indexes/ranges/test_setops.py
@@ -0,0 +1,244 @@
+from datetime import datetime, timedelta
+
+import numpy as np
+import pytest
+
+from pandas import Index, Int64Index, RangeIndex
+import pandas._testing as tm
+
+
+class TestRangeIndexSetOps:
+    @pytest.mark.parametrize("sort", [None, False])
+    def test_intersection(self, sort):
+        # intersect with Int64Index
+        index = RangeIndex(start=0, stop=20, step=2)
+        other = Index(np.arange(1, 6))
+        result = index.intersection(other, sort=sort)
+        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
+        tm.assert_index_equal(result, expected)
+
+        result = other.intersection(index, sort=sort)
+        expected = Index(
+            np.sort(np.asarray(np.intersect1d(index.values, other.values)))
+        )
+        tm.assert_index_equal(result, expected)
+
+        # intersect with increasing RangeIndex
+        other = RangeIndex(1, 6)
+        result = index.intersection(other, sort=sort)
+        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
+        tm.assert_index_equal(result, expected)
+
+        # intersect with decreasing RangeIndex
+        other = RangeIndex(5, 0, -1)
+        result = index.intersection(other, sort=sort)
+        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
+        tm.assert_index_equal(result, expected)
+
+        # reversed (GH 17296)
+        result = other.intersection(index, sort=sort)
+        tm.assert_index_equal(result, expected)
+
+        # GH 17296: intersect two decreasing RangeIndexes
+        first = RangeIndex(10, -2, -2)
+        other = RangeIndex(5, -4, -1)
+        expected = first.astype(int).intersection(other.astype(int), sort=sort)
+        result = first.intersection(other, sort=sort).astype(int)
+        tm.assert_index_equal(result, expected)
+
+        # reversed
+        result = other.intersection(first, sort=sort).astype(int)
+        tm.assert_index_equal(result, expected)
+
+        index = RangeIndex(5)
+
+        # intersect of non-overlapping indices
+        other = RangeIndex(5, 10, 1)
+        result = index.intersection(other, sort=sort)
+        expected = RangeIndex(0, 0, 1)
+        tm.assert_index_equal(result, expected)
+
+        other = RangeIndex(-1, -5, -1)
+        result = index.intersection(other, sort=sort)
+        expected = RangeIndex(0, 0, 1)
+        tm.assert_index_equal(result, expected)
+
+        # intersection of empty indices
+        other = RangeIndex(0, 0, 1)
+        result = index.intersection(other, sort=sort)
+        expected = RangeIndex(0, 0, 1)
+        tm.assert_index_equal(result, expected)
+
+        result = other.intersection(index, sort=sort)
+        tm.assert_index_equal(result, expected)
+
+        # intersection of non-overlapping values based on start value and gcd
+        index = RangeIndex(1, 10, 2)
+        other = RangeIndex(0, 10, 4)
+        result = index.intersection(other, sort=sort)
+        expected = RangeIndex(0, 0, 1)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("sort", [False, None])
+    def test_union_noncomparable(self, sort):
+        # corner case, non-Int64Index
+        index = RangeIndex(start=0, stop=20, step=2)
+        other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object)
+        result = index.union(other, sort=sort)
+        expected = Index(np.concatenate((index, other)))
+        tm.assert_index_equal(result, expected)
+
+        result = other.union(index, sort=sort)
+        expected = Index(np.concatenate((other, index)))
+        tm.assert_index_equal(result, expected)
+
+    @pytest.fixture(
+        params=[
+            (
+                RangeIndex(0, 10, 1),
+                RangeIndex(0, 10, 1),
+                RangeIndex(0, 10, 1),
+                RangeIndex(0, 10, 1),
+            ),
+            (
+                RangeIndex(0, 10, 1),
+                RangeIndex(5, 20, 1),
+                RangeIndex(0, 20, 1),
+                Int64Index(range(20)),
+            ),
+            (
+                RangeIndex(0, 10, 1),
+                RangeIndex(10, 20, 1),
+                RangeIndex(0, 20, 1),
+                Int64Index(range(20)),
+            ),
+            (
+                RangeIndex(0, -10, -1),
+                RangeIndex(0, -10, -1),
+                RangeIndex(0, -10, -1),
+                RangeIndex(0, -10, -1),
+            ),
+            (
+                RangeIndex(0, -10, -1),
+                RangeIndex(-10, -20, -1),
+                RangeIndex(-19, 1, 1),
+                Int64Index(range(0, -20, -1)),
+            ),
+            (
+                RangeIndex(0, 10, 2),
+                RangeIndex(1, 10, 2),
+                RangeIndex(0, 10, 1),
+                Int64Index(list(range(0, 10, 2)) + list(range(1, 10, 2))),
+            ),
+            (
+                RangeIndex(0, 11, 2),
+                RangeIndex(1, 12, 2),
+                RangeIndex(0, 12, 1),
+                Int64Index(list(range(0, 11, 2)) + list(range(1, 12, 2))),
+            ),
+            (
+                RangeIndex(0, 21, 4),
+                RangeIndex(-2, 24, 4),
+                RangeIndex(-2, 24, 2),
+                Int64Index(list(range(0, 21, 4)) + list(range(-2, 24, 4))),
+            ),
+            (
+                RangeIndex(0, -20, -2),
+                RangeIndex(-1, -21, -2),
+                RangeIndex(-19, 1, 1),
+                Int64Index(list(range(0, -20, -2)) + list(range(-1, -21, -2))),
+            ),
+            (
+                RangeIndex(0, 100, 5),
+                RangeIndex(0, 100, 20),
+                RangeIndex(0, 100, 5),
+                Int64Index(range(0, 100, 5)),
+            ),
+            (
+                RangeIndex(0, -100, -5),
+                RangeIndex(5, -100, -20),
+                RangeIndex(-95, 10, 5),
+                Int64Index(list(range(0, -100, -5)) + [5]),
+            ),
+            (
+                RangeIndex(0, -11, -1),
+                RangeIndex(1, -12, -4),
+                RangeIndex(-11, 2, 1),
+                Int64Index(list(range(0, -11, -1)) + [1, -11]),
+            ),
+            (RangeIndex(0), RangeIndex(0), RangeIndex(0), RangeIndex(0)),
+            (
+                RangeIndex(0, -10, -2),
+                RangeIndex(0),
+                RangeIndex(0, -10, -2),
+                RangeIndex(0, -10, -2),
+            ),
+            (
+                RangeIndex(0, 100, 2),
+                RangeIndex(100, 150, 200),
+                RangeIndex(0, 102, 2),
+                Int64Index(range(0, 102, 2)),
+            ),
+            (
+                RangeIndex(0, -100, -2),
+                RangeIndex(-100, 50, 102),
+                RangeIndex(-100, 4, 2),
+                Int64Index(list(range(0, -100, -2)) + [-100, 2]),
+            ),
+            (
+                RangeIndex(0, -100, -1),
+                RangeIndex(0, -50, -3),
+                RangeIndex(-99, 1, 1),
+                Int64Index(list(range(0, -100, -1))),
+            ),
+            (
+                RangeIndex(0, 1, 1),
+                RangeIndex(5, 6, 10),
+                RangeIndex(0, 6, 5),
+                Int64Index([0, 5]),
+            ),
+            (
+                RangeIndex(0, 10, 5),
+                RangeIndex(-5, -6, -20),
+                RangeIndex(-5, 10, 5),
+                Int64Index([0, 5, -5]),
+            ),
+            (
+                RangeIndex(0, 3, 1),
+                RangeIndex(4, 5, 1),
+                Int64Index([0, 1, 2, 4]),
+                Int64Index([0, 1, 2, 4]),
+            ),
+            (
+                RangeIndex(0, 10, 1),
+                Int64Index([]),
+                RangeIndex(0, 10, 1),
+                RangeIndex(0, 10, 1),
+            ),
+            (
+                RangeIndex(0),
+                Int64Index([1, 5, 6]),
+                Int64Index([1, 5, 6]),
+                Int64Index([1, 5, 6]),
+            ),
+        ]
+    )
+    def unions(self, request):
+        """Inputs and expected outputs for RangeIndex.union tests"""
+
+        return request.param
+
+    def test_union_sorted(self, unions):
+
+        idx1, idx2, expected_sorted, expected_notsorted = unions
+
+        res1 = idx1.union(idx2, sort=None)
+        tm.assert_index_equal(res1, expected_sorted, exact=True)
+
+        res1 = idx1.union(idx2, sort=False)
+        tm.assert_index_equal(res1, expected_notsorted, exact=True)
+
+        res2 = idx2.union(idx1, sort=None)
+        res3 = idx1._int64index.union(idx2, sort=None)
+        tm.assert_index_equal(res2, expected_sorted, exact=True)
+        tm.assert_index_equal(res3, expected_sorted)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 6ec35a32d74ce..1047c457d6b82 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -12,6 +12,7 @@
 from pandas._libs.tslib import Timestamp
 from pandas.compat.numpy import np_datetime64_compat
+from pandas.util._test_decorators import async_mark
 
 from pandas.core.dtypes.common import is_unsigned_integer_dtype
 from pandas.core.dtypes.generic import ABCIndex
@@ -32,6 +33,7 @@
     isna,
     period_range,
 )
+import pandas._testing as tm
 from pandas.core.algorithms import safe_sort
 from pandas.core.indexes.api import (
     Index,
@@ -42,7 +44,6 @@
 )
 from pandas.tests.indexes.common import Base
 from pandas.tests.indexes.conftest import indices_dict
-import pandas.util.testing as tm
 
 
 class TestIndex(Base):
@@ -70,7 +71,9 @@ def test_can_hold_identifiers(self):
 
     @pytest.mark.parametrize("index", ["datetime"], indirect=True)
     def test_new_axis(self, index):
-        new_index = index[None, :]
+        with tm.assert_produces_warning(DeprecationWarning):
+            # GH#30588 multi-dimensional indexing deprecated
+            new_index = index[None, :]
         assert new_index.ndim == 2
         assert isinstance(new_index, np.ndarray)
 
@@ -108,8 +111,8 @@ def test_constructor_copy(self, index):
     def test_constructor_corner(self):
         # corner case
         msg = (
-            r"Index\(\.\.\.\) must be called with a collection of some"
-            " kind, 0 was passed"
+            r"Index\(\.\.\.\) must be called with a collection of some "
+            "kind, 0 was passed"
         )
         with pytest.raises(TypeError, match=msg):
             Index(0)
@@ -244,7 +247,7 @@ class ArrayLike:
             def __init__(self, array):
                 self.array = array
 
-            def __array__(self, dtype=None):
+            def __array__(self, dtype=None) -> np.ndarray:
                 return self.array
 
         expected = pd.Index(array)
@@ -2397,13 +2400,14 @@ def test_cached_properties_not_settable(self):
         with pytest.raises(AttributeError, match="Can't set attribute"):
             index.is_unique = False
 
-    def test_tab_complete_warning(self, ip):
+    @async_mark()
+    async def test_tab_complete_warning(self, ip):
         # https://github.com/pandas-dev/pandas/issues/16409
         pytest.importorskip("IPython", minversion="6.0.0")
         from IPython.core.completer import provisionalcompleter
 
         code = "import pandas as pd; idx = pd.Index([1, 2])"
-        ip.run_code(code)
+        await ip.run_code(code)
         with tm.assert_produces_warning(None):
             with provisionalcompleter("ignore"):
                 list(ip.Completer.completions("idx.", 4))
@@ -2782,9 +2786,35 @@ def test_shape_of_invalid_index():
     # about this). However, as long as this is not solved in general,this test ensures
    # that the returned shape is consistent with this underlying array for
    # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775)
-    a = np.arange(8).reshape(2, 2, 2)
-    idx = pd.Index(a)
-    assert idx.shape == a.shape
-
     idx = pd.Index([0, 1, 2, 3])
-    assert idx[:, None].shape == (4, 1)
+    with tm.assert_produces_warning(DeprecationWarning):
+        # GH#30588 multi-dimensional indexing deprecated
+        assert idx[:, None].shape == (4, 1)
+
+
+def test_validate_1d_input():
+    # GH#27125 check that we do not have >1-dimensional input
+    msg = "Index data must be 1-dimensional"
+
+    arr = np.arange(8).reshape(2, 2, 2)
+    with pytest.raises(ValueError, match=msg):
+        pd.Index(arr)
+
+    with pytest.raises(ValueError, match=msg):
+        pd.Float64Index(arr.astype(np.float64))
+
+    with pytest.raises(ValueError, match=msg):
+        pd.Int64Index(arr.astype(np.int64))
+
+    with pytest.raises(ValueError, match=msg):
+        pd.UInt64Index(arr.astype(np.uint64))
+
+    df = pd.DataFrame(arr.reshape(4, 2))
+    with pytest.raises(ValueError, match=msg):
+        pd.Index(df)
+
+    # GH#13601 trying to assign a multi-dimensional array to an index is not
+    # allowed
+    ser = pd.Series(0, range(4))
+    with pytest.raises(ValueError, match=msg):
+        ser.index = np.array([[2, 3]] * 4)
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 82ef71efa70d0..7e30233353553 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -14,7 +14,7 @@
 import pandas as pd
 from pandas import CategoricalIndex, MultiIndex, RangeIndex
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestCommon:
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 37976d89ecba4..f025168643ab9 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -8,9 +8,9 @@
 import pandas as pd
 from pandas import Float64Index, Index, Int64Index, Series, UInt64Index
+import pandas._testing as tm
 from pandas.api.types import pandas_dtype
 from pandas.tests.indexes.common import Base
-import pandas.util.testing as tm
 
 
 class Numeric(Base):
@@ -194,8 +194,8 @@ def test_constructor_invalid(self):
         with pytest.raises(TypeError, match=msg):
             Float64Index(0.0)
         msg = (
-            "String dtype not supported, you may need to explicitly cast to"
-            " a numeric type"
+            "String dtype not supported, "
+            "you may need to explicitly cast to a numeric type"
        )
         with pytest.raises(TypeError, match=msg):
             Float64Index(["a", "b", 0.0])
@@ -570,8 +570,8 @@ def test_union_noncomparable(self):
 
     def test_cant_or_shouldnt_cast(self):
         msg = (
-            "String dtype not supported, you may need to explicitly cast to"
-            " a numeric type"
+            "String dtype not supported, "
+            "you may need to explicitly cast to a numeric type"
        )
         # can't
         data = ["foo", "bar", "baz"]
@@ -655,8 +655,8 @@ def test_constructor(self):
 
         # scalar raise Exception
         msg = (
-            r"Int64Index\(\.\.\.\) must be called with a collection of some"
-            " kind, 5 was passed"
+            r"Int64Index\(\.\.\.\) must be called with a collection of some "
+            "kind, 5 was passed"
         )
         with pytest.raises(TypeError, match=msg):
             Int64Index(5)
@@ -736,6 +736,12 @@ def test_get_indexer(self):
         expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected)
 
+    def test_get_indexer_nan(self):
+        # GH 7820
+        result = Index([1, 2, np.nan]).get_indexer([np.nan])
+        expected = np.array([2], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_intersection(self):
         index = self.create_index()
         other = Index([1, 2, 3, 4, 5])
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 3d24c70afdda2..583556656ac87 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -12,8 +12,8 @@
     _np_version_under1p17,
     _np_version_under1p18,
 )
+import pandas._testing as tm
 from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
-import pandas.util.testing as tm
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index a7e2363ec422e..abfa413d56655 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -11,9 +11,9 @@
 import pandas as pd
 from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index
+import pandas._testing as tm
 from pandas.api.types import pandas_dtype
 from pandas.tests.indexes.conftest import indices_dict
-import pandas.util.testing as tm
 
 COMPATIBLE_INCONSISTENT_PAIRS = {
     (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex),
diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py
index e479d93af2902..82c9d995c9c7c 100644
--- a/pandas/tests/indexes/timedeltas/test_astype.py
+++ b/pandas/tests/indexes/timedeltas/test_astype.py
@@ -13,7 +13,7 @@
     TimedeltaIndex,
     timedelta_range,
 )
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestTimedeltaIndex:
diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py
index ff6ee051755bb..39abbf59d1e56 100644
--- a/pandas/tests/indexes/timedeltas/test_constructors.py
+++ b/pandas/tests/indexes/timedeltas/test_constructors.py
@@ -5,8 +5,8 @@
 import pandas as pd
 from pandas import Timedelta, TimedeltaIndex, timedelta_range, to_timedelta
+import pandas._testing as tm
 from pandas.core.arrays import TimedeltaArray
-import pandas.util.testing as tm
 
 
 class TestTimedeltaIndex:
@@ -176,15 +176,15 @@ def test_constructor_coverage(self):
 
         # non-conforming freq
         msg = (
-            "Inferred frequency None from passed values does not conform to"
-            " passed frequency D"
+            "Inferred frequency None from passed values does not conform to "
+            "passed frequency D"
         )
         with pytest.raises(ValueError, match=msg):
             TimedeltaIndex(["1 days", "2 days", "4 days"], freq="D")
 
         msg = (
-            "Of the four parameters: start, end, periods, and freq, exactly"
-            " three must be specified"
+            "Of the four parameters: start, end, periods, and freq, exactly "
+            "three must be specified"
         )
         with pytest.raises(ValueError, match=msg):
             timedelta_range(periods=10, freq="D")
diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py
index 17ab85033acfb..e8665ee1a3555 100644
--- a/pandas/tests/indexes/timedeltas/test_indexing.py
+++ b/pandas/tests/indexes/timedeltas/test_indexing.py
@@ -4,8 +4,8 @@
 import pytest
 
 import pandas as pd
-from pandas import Index, Timedelta, TimedeltaIndex, timedelta_range
-import pandas.util.testing as tm
+from pandas import Index, Timedelta, TimedeltaIndex, notna, timedelta_range
+import pandas._testing as tm
 
 
 class TestGetItem:
@@ -58,8 +58,20 @@ def test_timestamp_invalid_key(self, key):
 
 
 class TestWhere:
-    # placeholder for symmetry with DatetimeIndex and PeriodIndex tests
-    pass
+    def test_where_invalid_dtypes(self):
+        tdi = timedelta_range("1 day", periods=3, freq="D", name="idx")
+
+        i2 = tdi.copy()
+        i2 = Index([pd.NaT, pd.NaT] + tdi[2:].tolist())
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            tdi.where(notna(i2), i2.asi8)
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            tdi.where(notna(i2), i2 + pd.Timestamp.now())
+
+        with pytest.raises(TypeError, match="Where requires matching dtype"):
+            tdi.where(notna(i2), (i2 + pd.Timestamp.now()).to_period("D"))
 
 
 class TestTake:
@@ -161,6 +173,15 @@ def test_take_fill_value(self):
 
 
 class TestTimedeltaIndex:
+    def test_insert_empty(self):
+        # Corner case inserting with length zero doesnt raise IndexError
+        idx = timedelta_range("1 Day", periods=3)
+        td = idx[0]
+
+        idx[:0].insert(0, td)
+        idx[:0].insert(1, td)
+        idx[:0].insert(-1, td)
+
     def test_insert(self):
         idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx")
 
@@ -219,11 +240,29 @@ def test_insert(self):
             assert result.name == expected.name
             assert result.freq == expected.freq
 
+    @pytest.mark.parametrize(
+        "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA]
+    )
+    def test_insert_nat(self, null):
         # GH 18295 (test missing)
+        idx = timedelta_range("1day", "3day")
+        result = idx.insert(1, null)
         expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"])
-        for na in (np.nan, pd.NaT, None):
-            result = timedelta_range("1day", "3day").insert(1, na)
-            tm.assert_index_equal(result, expected)
+        tm.assert_index_equal(result, expected)
+
+    def test_insert_invalid_na(self):
+        idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx")
+        with pytest.raises(TypeError, match="incompatible label"):
+            idx.insert(0, np.datetime64("NaT"))
+
+    def test_insert_dont_cast_strings(self):
+        # To match DatetimeIndex and PeriodIndex behavior, dont try to
+        # parse strings to Timedelta
+        idx = timedelta_range("1day", "3day")
+
+        result = idx.insert(0, "1 Day")
+        assert result.dtype == object
+        assert result[0] == "1 Day"
 
     def test_delete(self):
         idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx")
diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py
index 56043cf3edb2d..25f27da758ad8 100644
--- a/pandas/tests/indexes/timedeltas/test_ops.py
+++ b/pandas/tests/indexes/timedeltas/test_ops.py
@@ -7,8 +7,8 @@
 import pandas as pd
 from pandas import Series, TimedeltaIndex, timedelta_range
+import pandas._testing as tm
 from pandas.tests.base.test_ops import Ops
-import pandas.util.testing as tm
 
 from pandas.tseries.offsets import Day, Hour
 
diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py
index 4448b5e39684b..29e2c7dd20be0 100644
--- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py
+++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Series, Timedelta, timedelta_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestSlicing:
diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py
index 38f1d2c7d4a1b..44f4a2adedaad 100644
--- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py
+++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 
 class TestVectorizedTimedelta:
diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py
index bbdd6c8c7c017..0aa784cbb7710 100644
--- a/pandas/tests/indexes/timedeltas/test_setops.py
+++ b/pandas/tests/indexes/timedeltas/test_setops.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from pandas import Int64Index, TimedeltaIndex, timedelta_range
-import pandas.util.testing as tm
+import pandas._testing as tm
 
 from pandas.tseries.offsets import Hour
 
@@ -22,6 +22,22 @@ def test_union(self):
         i1.union(i2)  # Works
         i2.union(i1)  # Fails with "AttributeError: can't set attribute"
 
+    def test_union_sort_false(self):
+        tdi = timedelta_range("1day", periods=5)
+
+        left = tdi[3:]
+        right = tdi[:3]
+
+        # Check that we are testing the desired code path
+        assert left._can_fast_union(right)
+
+        result = left.union(right)
+        tm.assert_index_equal(result, tdi)
+
+        result = left.union(right, sort=False)
+        expected = pd.TimedeltaIndex(["4 Days", "5 Days", "1 Days", "2 Day", "3 Days"])
+        tm.assert_index_equal(result, expected)
+
     def test_union_coverage(self):
         idx = TimedeltaIndex(["3d", "1d", "2d"])
 
@@ -62,6 +78,21 @@ def test_union_bug_4564(self):
         exp = TimedeltaIndex(sorted(set(left) | set(right)))
         tm.assert_index_equal(result, exp)
 
+    def test_union_freq_infer(self):
+        # When taking the union of two TimedeltaIndexes, we infer
+        # a freq even if the arguments don't have freq. This matches
+        # DatetimeIndex behavior.
+        tdi = pd.timedelta_range("1 Day", periods=5)
+        left = tdi[[0, 1, 3, 4]]
+        right = tdi[[2, 3, 1]]
+
+        assert left.freq is None
+        assert right.freq is None
+
+        result = left.union(right)
+        tm.assert_index_equal(result, tdi)
+        assert result.freq == "D"
+
     def test_intersection_bug_1708(self):
         index_1 = timedelta_range("1 day", periods=4, freq="h")
         index_2 = index_1 + pd.offsets.Hour(5)
@@ -179,3 +210,51 @@ def test_intersection_non_monotonic(self, rng, expected, sort):
             assert isinstance(result.freq, Hour)
         else:
             assert result.freq is None
+
+
+class TestTimedeltaIndexDifference:
+    @pytest.mark.parametrize("sort", [None, False])
+    def test_difference_freq(self, sort):
+        # GH14323: Difference of TimedeltaIndex should not preserve frequency
+
+        index = timedelta_range("0 days", "5 days", freq="D")
+
+        other = timedelta_range("1 days", "4 days", freq="D")
+        expected = TimedeltaIndex(["0 days", "5 days"], freq=None)
+        idx_diff = index.difference(other, sort)
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal("freq", idx_diff, expected)
+
+        other = timedelta_range("2 days", "5 days", freq="D")
+        idx_diff = index.difference(other, sort)
+        expected = TimedeltaIndex(["0 days", "1 days"], freq=None)
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal("freq", idx_diff, expected)
+
+    @pytest.mark.parametrize("sort", [None, False])
+    def test_difference_sort(self, sort):
+
+        index = pd.TimedeltaIndex(
+            ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"]
+        )
+
+        other = timedelta_range("1 days", "4 days", freq="D")
+        idx_diff = index.difference(other, sort)
+
+        expected = TimedeltaIndex(["5 days", "0 days"], freq=None)
+
+        if sort is None:
+            expected = expected.sort_values()
+
+        tm.assert_index_equal(idx_diff, expected)
+        tm.assert_attr_equal("freq", idx_diff, expected)
+
+        other = timedelta_range("2 days", "5 days", freq="D")
+        idx_diff = index.difference(other, sort)
+        expected = TimedeltaIndex(["1 days", "0 days"], freq=None)
if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_shift.py b/pandas/tests/indexes/timedeltas/test_shift.py index 048b29c0da501..98933ff0423ab 100644 --- a/pandas/tests/indexes/timedeltas/test_shift.py +++ b/pandas/tests/indexes/timedeltas/test_shift.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import TimedeltaIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndexShift: diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 35575f3349f83..3b52b93fa6369 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -14,7 +14,7 @@ date_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike @@ -57,52 +57,6 @@ def test_fillna_timedelta(self): ) tm.assert_index_equal(idx.fillna("x"), exp) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: Difference of TimedeltaIndex should not preserve frequency - - index = timedelta_range("0 days", "5 days", freq="D") - - other = timedelta_range("1 days", "4 days", freq="D") - expected = TimedeltaIndex(["0 days", "5 days"], freq=None) - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, sort): - - index = pd.TimedeltaIndex( - ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] - ) - - other = timedelta_range("1 days", "4 days", freq="D") - idx_diff = index.difference(other, sort) - - expected = TimedeltaIndex(["5 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["1 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_isin(self): index = tm.makeTimedeltaIndex(4) @@ -247,6 +201,13 @@ def test_append_numpy_bug_1681(self): result = a.append(c) assert (result["B"] == td).all() + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None + def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1c1d0f1a735cf..1cef9de6a3a77 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import 
timedelta_range, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, Second diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 5bd7a2a583b84..477fc092a4e16 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Series, TimedeltaIndex, isna, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltas: @@ -73,8 +73,7 @@ def test_to_timedelta_invalid(self): # time not supported ATM msg = ( - "Value must be Timedelta, string, integer, float, timedelta or" - " convertible" + "Value must be Timedelta, string, integer, float, timedelta or convertible" ) with pytest.raises(ValueError, match=msg): to_timedelta(time(second=1)) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 08e8dbad4e102..3c027b035c2b8 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -7,7 +7,7 @@ from pandas.core.dtypes.common import is_scalar from pandas import DataFrame, Float64Index, MultiIndex, Series, UInt64Index, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _mklbl(prefix, n): diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 5d23236207f94..634020982b1c2 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index a86a9d16d3f9f..43036fbbd9844 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -4,7 +4,7 @@ import pytest from pandas import Interval, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index e58e6ed0d5d83..e6d5a9eb84410 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index e0206c8e7f6aa..8bfba8c12e934 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -2,13 +2,13 @@ import pytest from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm def test_detect_chained_assignment(): # Inplace ops, originally from: - # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] b = [123, None] c = [1234, 2345] diff --git a/pandas/tests/indexing/multiindex/test_getitem.py 
b/pandas/tests/indexing/multiindex/test_getitem.py index 519a1eb5b16d8..8ea825da8f94f 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # test indexing of Series with multi-level Index diff --git a/pandas/tests/indexing/multiindex/test_iloc.py b/pandas/tests/indexing/multiindex/test_iloc.py index 2c2e4d06f1ae3..9859c7235c380 100644 --- a/pandas/tests/indexing/multiindex/test_iloc.py +++ b/pandas/tests/indexing/multiindex/test_iloc.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index aab44daf8d17f..8ea1cebd7bf7b 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.slow diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 35f3137dac059..01b0b392d52a3 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndex: diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index da7d89a15125b..3b8aa963ac698 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm @pytest.fixture @@ -437,3 +437,34 @@ def test_loc_nan_multiindex(): columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), ) tm.assert_frame_equal(result, expected) + + +def test_loc_period_string_indexing(): + # GH 9892 + a = pd.period_range("2013Q1", "2013Q4", freq="Q") + i = (1111, 2222, 3333) + idx = pd.MultiIndex.from_product((a, i), names=("Periode", "CVR")) + df = pd.DataFrame( + index=idx, + columns=( + "OMS", + "OMK", + "RES", + "DRIFT_IND", + "OEVRIG_IND", + "FIN_IND", + "VARE_UD", + "LOEN_UD", + "FIN_UD", + ), + ) + result = df.loc[("2013Q1", 1111), "OMS"] + expected = pd.Series( + [np.nan], + dtype=object, + name="OMS", + index=pd.MultiIndex.from_tuples( + [(pd.Period("2013Q1"), 1111)], names=["Periode", "CVR"] + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 8c6afef1234da..8163de8588232 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,12 +1,11 @@ import numpy as np -import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series 
-import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexBasic: @@ -47,17 +46,6 @@ def test_multiindex_contains_dropped(self): assert "a" in idx.levels[0] assert "a" not in idx - @pytest.mark.parametrize( - "data, expected", - [ - (MultiIndex.from_product([(), ()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([("a", "b"), (1, 2)]), False), - ], - ) - def test_multiindex_is_homogeneous_type(self, data, expected): - assert data._is_homogeneous_type is expected - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 05ea949721b65..9d181bdcb9491 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexPartial: diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_set_ops.py index 66cb0d0d46380..f2cbfadb3cfa5 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_set_ops.py @@ -1,7 +1,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSetOps: diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 7fc95ba62a888..aebd1ad2573ed 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm class TestMultiIndexSetItem: @@ -141,7 +141,7 @@ def test_multiindex_setitem(self): df.loc["bar"] *= 2 # from SO - # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + # https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation df_orig = DataFrame.from_dict( { "price": { diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index ee0f160b33cf1..6fa9d3bd2cdbb 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +import pandas._testing as tm from pandas.core.indexing import _non_reducing_slice from pandas.tests.indexing.common import _mklbl -import pandas.util.testing as tm class TestMultiIndexSlicers: diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 5b8300827609a..4bec0f429a34e 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -2,7 +2,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSorted: diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index ffbe1bb785cda..db8c0c643a623 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ 
b/pandas/tests/indexing/multiindex/test_xs.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 81dedfdc74409..621417eb38d94 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingCallable: diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 40fd6575abf44..8c8dece53277e 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -16,8 +16,8 @@ Timestamp, conftest, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -import pandas.util.testing as tm class TestCategoricalIndex: @@ -74,8 +74,8 @@ def test_loc_scalar(self): df.loc["d"] = 10 msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): df.loc["d", "A"] = 10 @@ -365,8 +365,9 @@ def test_loc_listlike(self): # not all labels in the categories with pytest.raises( KeyError, - match="'a list-indexer must only include values that are in the" - " categories'", + match=( + "'a list-indexer must only include values that are in the categories'" + ), ): self.df2.loc[["a", "d"]] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 760bb655534b2..e845487ffca9a 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range, option_context +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm class TestCaching: @@ -273,7 +273,7 @@ def random_text(nobs=100): str(df) # from SO: - # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 256aaef8eb5a7..b904755b099d0 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -7,7 +7,7 @@ import pandas.compat as compat import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm ############################################################### # Index / Series common tests which may trigger dtype coercions @@ -432,13 +432,19 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): ) self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) - msg = "Passed item and index have different timezone" if fill_val.tz: - with pytest.raises(ValueError, match=msg): + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): obj.insert(1, pd.Timestamp("2012-01-01")) - with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + msg = 
"Timezones don't match" + with pytest.raises(ValueError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + + else: + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) msg = "cannot insert DatetimeIndex with incompatible label" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index f2e3f7f6b3723..42f992339f036 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 52d0e30f0bcad..2cc8232566aa9 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestFloatIndexers: @@ -90,11 +90,11 @@ def test_scalar_non_numeric(self): else: error = TypeError msg = ( - r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}|" - "Cannot index by location index with a" - " non-integer key".format(klass=type(i), kind=str(float)) + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}|" + "Cannot index by location index with a " + "non-integer key".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): idxr(s)[3.0] @@ -111,9 +111,9 @@ def test_scalar_non_numeric(self): else: error = TypeError msg = ( - r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): s.loc[3.0] @@ -344,9 +344,9 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[l] @@ -354,10 +354,10 @@ def test_slice_non_numeric(self): for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})".format( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( klass=type(index), kind_float=str(float), kind_int=str(int), @@ -370,9 +370,9 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): 
s.iloc[l] = 0 @@ -424,9 +424,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -448,9 +448,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-6\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-6\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-6.0, 6.0)] @@ -474,9 +474,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|3)\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|3)\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -492,9 +492,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -515,9 +515,9 @@ def test_integer_positional_indexing(self): klass = RangeIndex msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|4)\.0\] of" - " {kind}".format(klass=str(klass), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|4)\.0\] of " + "{kind}".format(klass=str(klass), kind=str(float)) ) with pytest.raises(TypeError, match=msg): idxr(s)[l] @@ -540,9 +540,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(0|1)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -555,9 +555,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-10\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-10\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-10.0, 10.0)] @@ -574,9 +574,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[0\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[0\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -591,9 +591,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), 
kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2f27757d6a754..26dedf02e7333 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1,5 +1,6 @@ """ test positional based indexing with iloc """ +from datetime import datetime from warnings import catch_warnings, simplefilter import numpy as np @@ -7,10 +8,10 @@ import pandas as pd from pandas import DataFrame, Series, concat, date_range, isna +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.core.indexing import IndexingError from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestiLoc(Base): @@ -122,7 +123,7 @@ def check(result, expected): [ ([slice(None), ["A", "D"]]), (["1", "2"], slice(None)), - ([pd.datetime(2019, 1, 1)], slice(None)), + ([datetime(2019, 1, 1)], slice(None)), ], ) def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d75afd1540f22..448a06070c45c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -13,11 +13,11 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series +import pandas._testing as tm from pandas.core.generic import NDFrame from pandas.core.indexers import validate_indices from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice from pandas.tests.indexing.common import Base, _mklbl -import pandas.util.testing as tm # ------------------------------------------------------------------------ # Indexing test cases @@ -83,12 +83,9 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = ( r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" - "No matching signature found|" # TypeError - "unhashable type: 'numpy.ndarray'" # TypeError + "Index data must be 1-dimensional" ) if ( @@ -104,21 +101,12 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): "categorical", ] ): - idxr[nd3] - else: - if ( - isinstance(obj, DataFrame) - and idxr_id == "getitem" - and index.inferred_type == "boolean" - ): - error = TypeError - elif idxr_id == "getitem" and index.inferred_type == "interval": - error = TypeError - else: - error = ValueError - - with pytest.raises(error, match=msg): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] + else: + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(DeprecationWarning): + idxr[nd3] @pytest.mark.parametrize( "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ @@ -148,14 +136,12 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = ( r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" - "'pandas._libs.interval.IntervalTree' object has no attribute" - " 'set_value'|" # AttributeError + "'pandas._libs.interval.IntervalTree' object has no attribute " + "'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError "No matching signature found|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError + r"^\[\[\[|" # pandas.core.indexing.IndexingError + "Index data must be 1-dimensional" ) if (idxr_id == "iloc") or ( @@ -176,10 +162,8 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): ): idxr[nd3] = 0 else: - with pytest.raises( - (ValueError, AttributeError, TypeError, pd.core.indexing.IndexingError), - match=msg, - ): + err = (ValueError, AttributeError) + with pytest.raises(err, match=msg): idxr[nd3] = 0 def test_inf_upcast(self): @@ -1190,3 +1174,13 @@ def test_duplicate_index_mistyped_key_raises_keyerror(): with pytest.raises(KeyError): ser.index._engine.get_loc(None) + + +def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): + # GH 30567 + ser = pd.Series([None] * 10) + mask = [False] * 3 + [True] * 5 + [False] * 2 + ser[mask] = range(5) + result = ser + expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py index 7303c1ff3d111..edb5d7d7f3a57 100644 --- a/pandas/tests/indexing/test_indexing_engines.py +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -2,7 +2,7 @@ from pandas._libs import algos as libalgos, index as libindex -import pandas.util.testing as tm +import pandas._testing as tm class TestNumericEngine: diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index bf8c6afd00561..2ffa44bec14a6 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingSlow: diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 8b3620e8cd843..a36078b11c663 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,9 +7,9 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, 
date_range +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestLoc(Base): @@ -371,6 +371,9 @@ def test_loc_index(self): result = df.loc[mask.values] tm.assert_frame_equal(result, expected) + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( @@ -966,3 +969,36 @@ def test_loc_getitem_label_list_integer_labels( expected = df.iloc[:, expected_columns] result = df.loc[["A", "B", "C"], column_key] tm.assert_frame_equal(result, expected, check_column_type=check_column_type) + + +def test_loc_setitem_float_intindex(): + # GH 8720 + rand_data = np.random.randn(8, 4) + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) + expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + tm.assert_frame_equal(result, expected) + + +def test_loc_axis_1_slice(): + # GH 10586 + cols = [(yr, m) for yr in [2014, 2015] for m in [7, 8, 9, 10]] + df = pd.DataFrame( + np.ones((10, 8)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples(cols), + ) + result = df.loc(axis=1)[(2014, 9):(2015, 8)] + expected = pd.DataFrame( + np.ones((10, 4)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples( + [(2014, 9), (2014, 10), (2015, 7), (2015, 8)] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..befe4fee8ecf8 --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = 
pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 15c65be37e0d9..5fda759020f1a 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPartialSetting: diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index ddaea5b597d6d..a567fb9b8ccc7 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series, Timedelta, Timestamp, date_range +import pandas._testing as tm from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestScalar(Base): @@ -132,8 +132,8 @@ def test_at_to_fail(self): result = s.at["a"] assert result == 1 msg = ( - "At based indexing on an non-integer index can only have" - " non-integer indexers" + "At based indexing on an non-integer index can only have " + "non-integer indexers" ) with pytest.raises(ValueError, match=msg): s.at[0] diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 5c9865ddc7090..dd4750123c0b5 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndexing: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 551782d0b363a..15b1434f8629f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -10,19 +10,11 @@ from pandas._libs.internals import BlockPlacement import pandas as pd -from pandas import ( - Categorical, - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - SparseArray, -) +from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray, TimedeltaArray +from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray from pandas.core.internals import BlockManager, SingleBlockManager, make_block -import pandas.util.testing as tm @pytest.fixture diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 3f034107ef24f..7810778602e12 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -2,7 +2,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index 4495ba9b80b67..a257735dc1ec5 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -2,7 +2,7 @@ import pandas.util._test_decorators as td -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 
6e5610f4f5838..b9a3e8b59b133 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm pytest.importorskip("odf") diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index e8c60870e2a85..10ed192062d9c 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter, _OpenpyxlWriter diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 6add99858da68..629d3d02028bd 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -13,7 +13,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @contextlib.contextmanager @@ -988,3 +988,13 @@ def test_conflicting_excel_engines(self, read_ext): with pd.ExcelFile("test1" + read_ext) as xl: with pytest.raises(ValueError, match=msg): pd.read_excel(xl, engine="foo") + + def test_excel_read_binary(self, engine, read_ext): + # GH 15914 + expected = pd.read_excel("test1" + read_ext, engine=engine) + + with open("test1" + read_ext, "rb") as f: + data = f.read() + + actual = pd.read_excel(data, engine=engine) + tm.assert_frame_equal(expected, actual) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 41363bf13ed4e..88f4c3736bc0d 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter from pandas.io.formats.excel import ExcelFormatter diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e0cb75b0a6c99..55b987a599670 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, get_option, set_option -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ( ExcelFile, @@ -252,7 +252,7 @@ def test_read_excel_parse_dates(self, ext): res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") res = pd.read_excel( pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 ) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index e04dfc97d4968..d1f900a2dc58b 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelFile diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index deb72cc230669..b6f791434a92b 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -3,7 +3,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter 
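The new `test_excel_read_binary` above exercises GH 15914: `pd.read_excel` accepts the raw bytes of a workbook, not only a path or an open file handle. A minimal sketch of that usage (assuming pandas >= 1.0, an installed engine such as xlrd or openpyxl, and a hypothetical workbook.xlsx on disk):

import pandas as pd

# Load the workbook into memory first, e.g. bytes received from a network
# response or a message queue, rather than pointing read_excel at a path.
with open("workbook.xlsx", "rb") as f:  # hypothetical file
    payload = f.read()

# read_excel consumes the in-memory bytes directly (GH 15914).
df = pd.read_excel(payload, engine="openpyxl")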
diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py
index c6af78c2704d8..01feab08eb5e3 100644
--- a/pandas/tests/io/excel/test_xlwt.py
+++ b/pandas/tests/io/excel/test_xlwt.py
@@ -3,7 +3,7 @@

 import pandas as pd
 from pandas import DataFrame, MultiIndex
-import pandas.util.testing as tm
+import pandas._testing as tm

 from pandas.io.excel import ExcelWriter, _XlwtWriter
diff --git a/pandas/tests/io/formats/data/html/render_links_false.html b/pandas/tests/io/formats/data/html/render_links_false.html
index 6509a0e985597..6feb403d63051 100644
--- a/pandas/tests/io/formats/data/html/render_links_false.html
+++ b/pandas/tests/io/formats/data/html/render_links_false.html
@@ -11,7 +11,7 @@
-      <td>http://pandas.pydata.org/?q1=a&amp;q2=b</td>
+      <td>https://pandas.pydata.org/?q1=a&amp;q2=b</td>
diff --git a/pandas/tests/io/formats/data/html/render_links_true.html b/pandas/tests/io/formats/data/html/render_links_true.html
index e9cb5632aad1d..3eb53f3160a77 100644
--- a/pandas/tests/io/formats/data/html/render_links_true.html
+++ b/pandas/tests/io/formats/data/html/render_links_true.html
@@ -11,7 +11,7 @@
-      <td><a href="http://pandas.pydata.org/?q1=a&amp;q2=b" target="_blank">http://pandas.pydata.org/?q1=a&amp;q2=b</a></td>
+      <td><a href="https://pandas.pydata.org/?q1=a&amp;q2=b" target="_blank">https://pandas.pydata.org/?q1=a&amp;q2=b</a></td>
diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py
index a6ad5d5edbf5f..7008cef7b28fa 100644
--- a/pandas/tests/io/formats/test_css.py
+++ b/pandas/tests/io/formats/test_css.py
@@ -1,6 +1,6 @@
 import pytest

-import pandas.util.testing as tm
+import pandas._testing as tm

 from pandas.io.formats.css import CSSResolver, CSSWarning

@@ -101,29 +101,25 @@ def test_css_side_shorthands(shorthand, expansions):
     top, right, bottom, left = expansions

     assert_resolves(
-        "{shorthand}: 1pt".format(shorthand=shorthand),
-        {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"},
+        f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"},
     )

     assert_resolves(
-        "{shorthand}: 1pt 4pt".format(shorthand=shorthand),
-        {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"},
+        f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"},
     )

     assert_resolves(
-        "{shorthand}: 1pt 4pt 2pt".format(shorthand=shorthand),
+        f"{shorthand}: 1pt 4pt 2pt",
         {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"},
     )
     assert_resolves(
-        "{shorthand}: 1pt 4pt 2pt 0pt".format(shorthand=shorthand),
+        f"{shorthand}: 1pt 4pt 2pt 0pt",
         {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"},
     )

     with tm.assert_produces_warning(CSSWarning):
-        assert_resolves(
-            "{shorthand}: 1pt 1pt 1pt 1pt 1pt".format(shorthand=shorthand), {}
-        )
+        assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {})


 @pytest.mark.parametrize(
@@ -174,10 +170,10 @@ def test_css_none_absent(style, equiv):
     "size,resolved",
     [
         ("xx-small", "6pt"),
-        ("x-small", "{pt:f}pt".format(pt=7.5)),
-        ("small", "{pt:f}pt".format(pt=9.6)),
+        ("x-small", f"{7.5:f}pt"),
+        ("small", f"{9.6:f}pt"),
         ("medium", "12pt"),
-        ("large", "{pt:f}pt".format(pt=13.5)),
+        ("large", f"{13.5:f}pt"),
         ("x-large", "18pt"),
         ("xx-large", "24pt"),
         ("8px", "6pt"),
@@ -196,9 +192,7 @@ def test_css_absolute_font_size(size, relative_to, resolved):
     else:
         inherited = {"font-size": relative_to}
     assert_resolves(
-        "font-size: {size}".format(size=size),
-        {"font-size": resolved},
-        inherited=inherited,
+        f"font-size: {size}", {"font-size": resolved}, inherited=inherited,
     )

@@ -224,7 +218,7 @@ def test_css_absolute_font_size(size, relative_to, resolved):
         ("inherit", "16pt", "16pt"),
         ("smaller", None, "10pt"),
         ("smaller", "18pt", "15pt"),
-        ("larger", None, "{pt:f}pt".format(pt=14.4)),
+        ("larger", None, f"{14.4:f}pt"),
         ("larger", "15pt", "18pt"),
     ],
 )
@@ -234,7 +228,5 @@ def test_css_relative_font_size(size, relative_to,
resolved): else: inherited = {"font-size": relative_to} assert_resolves( - "font-size: {size}".format(size=size), - {"font-size": resolved}, - inherited=inherited, + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, ) diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index 2edbff3766c9d..6801316ada8a3 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f51dd2918efff..97956489e7da6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -35,7 +35,7 @@ reset_option, set_option, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing @@ -421,12 +421,10 @@ def test_repr_truncation_column_size(self): def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: - pytest.skip( - "terminal size too small, {0} x {1}".format(term_width, term_height) - ) + pytest.skip(f"terminal size too small, {term_width} x {term_height}") def mkframe(n): - index = ["{i:05d}".format(i=i) for i in range(n)] + index = [f"{i:05d}" for i in range(n)] return DataFrame(0, index, index) df6 = mkframe(6) @@ -667,9 +665,9 @@ def test_to_string_with_formatters(self): ) formatters = [ - ("int", lambda x: "0x{x:x}".format(x=x)), - ("float", lambda x: "[{x: 4.1f}]".format(x=x)), - ("object", lambda x: "-{x!s}-".format(x=x)), + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), ] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) @@ -711,7 +709,7 @@ def format_func(x): def test_to_string_with_formatters_unicode(self): df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = df.to_string(formatters={"c/\u03c3": lambda x: "{x}".format(x=x)}) + result = df.to_string(formatters={"c/\u03c3": str}) assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" def test_east_asian_unicode_false(self): @@ -1240,7 +1238,7 @@ def test_wide_repr(self): set_option("display.expand_frame_repr", False) rep_str = repr(df) - assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str + assert f"10 rows x {max_cols - 1} columns" in rep_str set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr @@ -1351,7 +1349,7 @@ def test_long_series(self): n = 1000 s = Series( np.random.randint(-50, 50, n), - index=["s{x:04d}".format(x=x) for x in range(n)], + index=[f"s{x:04d}" for x in range(n)], dtype="int64", ) @@ -1477,9 +1475,7 @@ def test_to_string(self): expected = ["A"] assert header == expected - biggie.to_string( - columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)} - ) + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) biggie.to_string(columns=["B", "A"], float_format=str) biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) @@ -1610,7 +1606,7 @@ def test_to_string_small_float_values(self): result = df.to_string() # sadness per above - if "{x:.4g}".format(x=1.7e8) == "1.7e+008": + if _three_digit_exp(): expected = ( " a\n" "0 1.500000e+000\n" @@ 
-1922,7 +1918,7 @@ def test_repr_html_long(self):
         long_repr = df._repr_html_()
         assert ".." in long_repr
         assert str(41 + max_rows // 2) not in long_repr
-        assert "{h} rows ".format(h=h) in long_repr
+        assert f"{h} rows " in long_repr
         assert "2 columns" in long_repr

     def test_repr_html_float(self):
@@ -1939,7 +1935,7 @@ def test_repr_html_float(self):
         ).set_index("idx")
         reg_repr = df._repr_html_()
         assert ".." not in reg_repr
-        assert "<td>{val}</td>".format(val=str(40 + h)) in reg_repr
+        assert f"<td>{40 + h}</td>" in reg_repr

         h = max_rows + 1
         df = DataFrame(
@@ -1951,8 +1947,8 @@ def test_repr_html_float(self):
         ).set_index("idx")
         long_repr = df._repr_html_()
         assert ".." in long_repr
-        assert "<td>{val}</td>".format(val="31") not in long_repr
-        assert "{h} rows ".format(h=h) in long_repr
+        assert "<td>31</td>" not in long_repr
+        assert f"{h} rows " in long_repr
         assert "2 columns" in long_repr

     def test_repr_html_long_multiindex(self):
@@ -2181,9 +2177,7 @@ def test_to_string(self):
         cp.name = "foo"
         result = cp.to_string(length=True, name=True, dtype=True)
         last_line = result.split("\n")[-1].strip()
-        assert last_line == (
-            "Freq: B, Name: foo, Length: {cp}, dtype: float64".format(cp=len(cp))
-        )
+        assert last_line == (f"Freq: B, Name: foo, Length: {len(cp)}, dtype: float64")

     def test_freq_name_separation(self):
         s = Series(
@@ -2665,14 +2659,14 @@ def test_format_explicit(self):
         assert exp == res
         res = repr(test_sers["asc"])
         exp = (
-            "0 a\n1 ab\n ... \n4 abcde\n5"
-            " abcdef\ndtype: object"
+            "0 a\n1 ab\n ... \n4 abcde\n5 "
+            "abcdef\ndtype: object"
         )
         assert exp == res

         res = repr(test_sers["desc"])
         exp = (
-            "5 abcdef\n4 abcde\n ... \n1 ab\n0"
-            " a\ndtype: object"
+            "5 abcdef\n4 abcde\n ... \n1 ab\n0 "
+            "a\ndtype: object"
         )
         assert exp == res

@@ -2782,7 +2776,7 @@ def test_to_string_na_rep(self):

     def test_to_string_float_format(self):
         s = pd.Series(range(10), dtype="float64")
-        res = s.to_string(float_format=lambda x: "{0:2.1f}".format(x), max_rows=2)
+        res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2)
         exp = "0 0.0\n ..\n9 9.0"
         assert res == exp

@@ -2807,7 +2801,7 @@ def test_to_string_multindex_header(self):

 def _three_digit_exp():
-    return "{x:.4g}".format(x=1.7e8) == "1.7e+008"
+    return f"{1.7e8:.4g}" == "1.7e+008"


 class TestFloatArrayFormatter:
diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py
index 5a3afb5025e51..e5dac18acedf6 100644
--- a/pandas/tests/io/formats/test_style.py
+++ b/pandas/tests/io/formats/test_style.py
@@ -9,7 +9,7 @@

 import pandas as pd
 from pandas import DataFrame
-import pandas.util.testing as tm
+import pandas._testing as tm

 jinja2 = pytest.importorskip("jinja2")
 from pandas.io.formats.style import Styler, _get_level_lengths  # noqa  # isort:skip
@@ -24,7 +24,7 @@ def setup_method(self, method):
         self.g = lambda x: x

         def h(x, foo="bar"):
-            return pd.Series("color: {foo}".format(foo=foo), index=x.index, name=x.name)
+            return pd.Series(f"color: {foo}", index=x.index, name=x.name)

         self.h = h
         self.styler = Styler(self.df)
@@ -278,7 +278,7 @@ def test_numeric_columns(self):
     def test_apply_axis(self):
         df = pd.DataFrame({"A": [0, 0], "B": [1, 1]})
-        f = lambda x: ["val: {max}".format(max=x.max()) for v in x]
+        f = lambda x: [f"val: {x.max()}" for v in x]
         result = df.style.apply(f, axis=1)
         assert len(result._todo) == 1
         assert len(result.ctx) == 0
@@ -362,7 +362,7 @@ def color_negative_red(val):
         strings, black otherwise. 
""" color = "red" if val < 0 else "black" - return "color: {color}".format(color=color) + return f"color: {color}" dic = { ("a", "d"): [-1.12, 2.11], @@ -530,20 +530,17 @@ def test_bar_align_left_0points(self): (1, 0): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 2): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (2, 0): [ "width: 10em", @@ -572,8 +569,7 @@ def test_bar_align_left_0points(self): (0, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (0, 2): [ "width: 10em", @@ -1215,13 +1211,9 @@ def test_highlight_max(self): def test_export(self): f = lambda x: "color: red" if x > 0 else "color: blue" - g = ( - lambda x, y, z: "color: {z}".format(z=z) - if x > 0 - else "color: {z}".format(z=z) - ) + g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" style1 = self.styler - style1.applymap(f).applymap(g, y="a", z="b").highlight_max() + style1.applymap(f).applymap(g, z="b").highlight_max() result = style1.export() style2 = self.df.style style2.use(result) @@ -1645,9 +1637,7 @@ def test_hide_columns_mult_levels(self): def test_pipe(self): def set_caption_from_template(styler, a, b): - return styler.set_caption( - "Dataframe with a = {a} and b = {b}".format(a=a, b=b) - ) + return styler.set_caption(f"Dataframe with a = {a} and b = {b}") styler = self.df.style.pipe(set_caption_from_template, "A", b="B") assert "Dataframe with a = A and b = B" in styler.render() diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 24233a0ec84b1..a211ac11cf725 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import io import os import sys @@ -6,7 +7,7 @@ import pandas as pd from pandas import DataFrame, compat -import pandas.util.testing as tm +import pandas._testing as tm class TestToCSV: @@ -204,6 +205,14 @@ def test_to_csv_na_rep(self): assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ") + assert expected == csv + def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) @@ -486,10 +495,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): compression = compression_only if compression == "zip": - pytest.skip( - "{compression} is not supported " - "for to_csv".format(compression=compression) - ) + pytest.skip(f"{compression} is not supported for to_csv") # We'll complete file extension subsequently. filename = "test." 
@@ -563,3 +569,17 @@ def test_to_csv_na_rep_long_string(self, df_new_type): result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 4d8edec7c7f14..883240b74c32c 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -5,7 +5,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.formats.css import CSSWarning from pandas.io.formats.excel import CSSToExcelConverter @@ -270,13 +270,13 @@ def test_css_to_excel_inherited(css, inherited, expected): def test_css_to_excel_good_colors(input_color, output_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() @@ -297,13 +297,13 @@ def test_css_to_excel_good_colors(input_color, output_color): def test_css_to_excel_bad_colors(input_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a2a577a0753f7..d3f044a42eb28 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -7,18 +7,18 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, option_context -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt lorem_ipsum = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex" - " ea commodo consequat. Duis aute irure dolor in reprehenderit in" - " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" - " sint occaecat cupidatat non proident, sunt in culpa qui officia" - " deserunt mollit anim id est laborum." + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex " + "ea commodo consequat. Duis aute irure dolor in reprehenderit in " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur " + "sint occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum." ) @@ -688,7 +688,7 @@ def test_to_html_float_format_no_fixed_width(value, float_format, expected, data def test_to_html_render_links(render_links, expected, datapath): # GH 2679 data = [ - [0, "http://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], [0, "www.pydata.org", "pydata.org"], ] df = DataFrame(data, columns=["foo", "bar", None]) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index ea8688517bd93..bd681032f155d 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestToLatex: diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5c5c04c35d6b7..182c21ed1d416 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_compression_roundtrip(compression): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index fba74d8ebcf97..2ac2acc6748d1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._table_schema import ( as_json_table_type, diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 038dd2df4d632..efb95a0cb2a42 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, json_normalize -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._normalize import nested_to_record @@ -462,6 +462,30 @@ def test_nested_flattening_consistent(self): # They should be the same. tm.assert_frame_equal(df1, df2) + def test_nonetype_record_path(self, nulls_fixture): + # see gh-30148 + # should not raise TypeError + result = json_normalize( + [ + {"state": "Texas", "info": nulls_fixture}, + {"state": "Florida", "info": [{"i": 2}]}, + ], + record_path=["info"], + ) + expected = DataFrame({"i": 2}, index=[0]) + tm.assert_equal(result, expected) + + def test_non_interable_record_path_errors(self): + # see gh-30148 + test_input = {"state": "Texas", "info": 1} + test_path = "info" + msg = ( + f"{test_input} has non iterable value 1 for path {test_path}. " + "Must be iterable or null." 
+ ) + with pytest.raises(TypeError, match=msg): + json_normalize([test_input], record_path=[test_path]) + class TestNestedToRecord: def test_flat_stays_flat(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6e27b79458faf..e909a4952948c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json -import pandas.util.testing as tm +import pandas._testing as tm _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -39,6 +39,7 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(scope="function", autouse=True) def setup(self, datapath): @@ -854,7 +855,7 @@ def test_date_format_frame(self, date, date_unit): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = expected.index.tz_localize("UTC") expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) @@ -884,7 +885,7 @@ def test_date_format_series(self, date, date_unit): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = expected.index.tz_localize("UTC") expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -1597,3 +1598,19 @@ def test_json_indent_all_orients(self, orient, expected): def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) + + def test_emca_262_nan_inf_support(self): + # GH 12213 + data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' + result = pd.read_json(data) + expected = pd.DataFrame( + ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ) + tm.assert_frame_equal(result, expected) + + def test_deprecate_numpy_argument_read_json(self): + # GH 28512 + expected = DataFrame([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 90da175855c34..e531457627342 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import DataFrame, read_json -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._json import JsonReader diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index dab2882499634..bedd60084124c 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -17,7 +17,7 @@ import pandas.compat as compat from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _clean_dict(d): @@ -559,11 +559,6 @@ def test_loads_non_str_bytes_raises(self): with pytest.raises(TypeError, match=msg): ujson.loads(None) - def test_version(self): - assert re.match( - r"^\d+\.\d+(\.\d+)?$", ujson.__version__ - ), "ujson.__version__ must be a string like '1.4.0'" - def test_encode_numeric_overflow(self): 
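Annotation (not part of the patch): ujson moves integers through 64-bit machine words, so the 26-digit literal encoded in this test cannot be represented and the encoder raises OverflowError. A hedged sketch of the boundary, assuming pandas' bundled ujson:

    import pandas._libs.json as ujson

    ujson.encode(2 ** 63 - 1)  # int64 max encodes fine -> "9223372036854775807"
    # ujson.encode(12839128391289382193812939)  # exceeds 64 bits: OverflowError
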
with pytest.raises(OverflowError): ujson.encode(12839128391289382193812939) diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index a87e1e796c194..15967e3be176a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -46,11 +46,17 @@ class PythonParser(BaseParser): @pytest.fixture def csv_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. + """ return datapath("io", "parser", "data") @pytest.fixture def csv1(csv_dir_path): + """ + The path to the data file "test1.csv" needed for parser tests. + """ return os.path.join(csv_dir_path, "test1.csv") @@ -69,14 +75,49 @@ def csv1(csv_dir_path): @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) def all_parsers(request): + """ + Fixture all of the CSV parsers. + """ return request.param @pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) def c_parser_only(request): + """ + Fixture all of the CSV parsers using the C engine. + """ return request.param @pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): + """ + Fixture all of the CSV parsers using the Python engine. + """ + return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. + """ return request.param diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip new file mode 100644 index 0000000000000..9a6d5c08da9db Binary files /dev/null and b/pandas/tests/io/parser/data/utf32_ex_small.zip differ diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip new file mode 100644 index 0000000000000..a4c5440bdffa7 Binary files /dev/null and b/pandas/tests/io/parser/data/utf8_ex_small.zip differ diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 77b52eb90d61f..1737f14e7adf9 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -597,3 +597,14 @@ def test_file_binary_mode(c_parser_only): with open(path, "rb") as f: result = parser.read_csv(f, header=None) tm.assert_frame_equal(result, expected) + + +def test_unix_style_breaks(c_parser_only): + # GH 11020 + parser = c_parser_only + with tm.ensure_clean() as path: + with open(path, "w", newline="\n") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + expected = DataFrame(columns=["col_1", "col_2", "col_3"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index e1d422142ab0b..60e32d7c27200 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing 
as tm +import pandas._testing as tm @pytest.mark.parametrize("na_values", [None, ["NaN"]]) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 42a4a55988b0f..4c02a37b66455 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -3,10 +3,9 @@ specific classification into the other test modules. """ import codecs -from collections import OrderedDict import csv from datetime import datetime -from io import BytesIO, StringIO +from io import StringIO import os import platform from tempfile import TemporaryFile @@ -19,7 +18,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError from pandas import DataFrame, Index, MultiIndex, Series, compat, concat -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -70,17 +69,6 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_bytes_io_input(all_parsers): - encoding = "cp1255" - parser = all_parsers - - data = BytesIO("שלום:1234\n562:123".encode(encoding)) - result = parser.read_csv(data, sep=":", encoding=encoding) - - expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - def test_empty_decimal_marker(all_parsers): data = """A|B|C 1|2,334|5 @@ -317,15 +305,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_unicode(all_parsers): - parser = all_parsers - data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) - - result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) - expected = DataFrame([["\u0141aski, Jan", 1]]) - tm.assert_frame_equal(result, expected) - - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. 
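Annotation (not part of the patch): a data row whose field count exceeds the established width makes read_csv raise ParserError on both engines, which is what test_read_csv_wrong_num_columns exercises. A minimal reproduction with illustrative data:

    from io import StringIO
    import pandas as pd

    # Line 3 carries 3 fields against a 2-field header:
    # pandas.errors.ParserError: Expected 2 fields in line 3, saw 3
    pd.read_csv(StringIO("A,B\n1,2\n3,4,5"))
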
data = """A,B,C,D,E,F @@ -1065,59 +1044,6 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep", [",", "\t"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) -def test_utf16_bom_skiprows(all_parsers, sep, encoding): - # see gh-2298 - parser = all_parsers - data = """skip this -skip this too -A,B,C -1,2,3 -4,5,6""".replace( - ",", sep - ) - path = "__{}__.csv".format(tm.rands(10)) - kwargs = dict(sep=sep, skiprows=2) - utf8 = "utf-8" - - with tm.ensure_clean(path) as path: - from io import TextIOWrapper - - bytes_data = data.encode(encoding) - - with open(path, "wb") as f: - f.write(bytes_data) - - bytes_buffer = BytesIO(data.encode(utf8)) - bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) - - result = parser.read_csv(path, encoding=encoding, **kwargs) - expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) - - bytes_buffer.close() - tm.assert_frame_equal(result, expected) - - -def test_utf16_example(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - result = parser.read_csv(path, encoding="utf-16", sep="\t") - assert len(result) == 50 - - -def test_unicode_encoding(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - - result = parser.read_csv(path, header=None, encoding="latin-1") - result = result.set_index(0) - got = result[1][1632] - - expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" - assert got == expected - - def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1132,7 +1058,7 @@ def test_trailing_delimiters(all_parsers): def test_escapechar(all_parsers): - # http://stackoverflow.com/questions/13824840/feature-request-for- + # https://stackoverflow.com/questions/13824840/feature-request-for- # pandas-read-csv data = '''SEARCH_TERM,ACTUAL_URL "bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" @@ -1316,9 +1242,7 @@ def test_float_parser(all_parsers): def test_scientific_no_exponent(all_parsers): # see gh-12215 - df = DataFrame.from_dict( - OrderedDict([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])]) - ) + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) parser = all_parsers @@ -1918,39 +1842,6 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # Basic test - ("a\n1", dict(), DataFrame({"a": [1]})), - # "Regular" quoting - ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), - # Test in a data row instead of header - ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), - # Test in empty data row with skipping - ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), - # Test in empty data row without skipping - ( - "\n1", - dict(names=["a"], skip_blank_lines=False), - DataFrame({"a": [np.nan, 1]}), - ), - ], -) -def test_utf8_bom(all_parsers, data, kwargs, expected): - # see gh-4793 - parser = all_parsers - bom = "\ufeff" - utf8 = "utf-8" - - def _encode_data_with_bom(_data): - bom_data = (bom + _data).encode(utf8) - return BytesIO(bom_data) - - result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) - tm.assert_frame_equal(result, expected) - - def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1968,20 
+1859,6 @@ def test_temporary_file(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("byte", [8, 16]) -@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) -def test_read_csv_utf_aliases(all_parsers, byte, fmt): - # see gh-13549 - expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) - parser = all_parsers - - encoding = fmt.format(byte) - data = "mb_num,multibyte\n4.8,test".encode(encoding) - - result = parser.read_csv(BytesIO(data), encoding=encoding) - tm.assert_frame_equal(result, expected) - - def test_internal_eof_byte(all_parsers): # see gh-5500 parser = all_parsers @@ -2041,30 +1918,6 @@ def test_file_handles_with_open(all_parsers, csv1): assert not f.closed -@pytest.mark.parametrize( - "fname,encoding", - [ - ("test1.csv", "utf-8"), - ("unicode_series.csv", "latin-1"), - ("sauron.SHIFT_JIS.csv", "shiftjis"), - ], -) -def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): - # gh-23779: Python csv engine shouldn't error on files opened in binary. - parser = all_parsers - - fpath = os.path.join(csv_dir_path, fname) - expected = parser.read_csv(fpath, encoding=encoding) - - with open(fpath, mode="r", encoding=encoding) as fa: - result = parser.read_csv(fa) - tm.assert_frame_equal(expected, result) - - with open(fpath, mode="rb") as fb: - result = parser.read_csv(fb, encoding=encoding) - tm.assert_frame_equal(expected, result) - - def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -2207,3 +2060,13 @@ def test_first_row_bom(all_parsers): result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 9d0eab0b9a907..dc03370daa1e2 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -9,7 +9,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf16_encoding(all_parsers, csv_dir_path): - # see gh-18071 +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 parser = all_parsers - path = os.path.join(csv_dir_path, "utf16_ex_small.zip") + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") expected = pd.DataFrame( { "Country": ["Venezuela", "Venezuela"], diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 2a3b1dc82fc59..88b400d9a11df 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -10,7 +10,7 @@ import 
pandas as pd from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm def test_converters_type_must_be_dict(all_parsers): diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index dc10352bc6460..cc65def0fd096 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -11,7 +11,7 @@ from pandas.errors import ParserWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index a68d46e8a6c15..d08c86bf2ae75 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("dtype", [str, object]) @@ -79,7 +79,7 @@ def test_invalid_dtype_per_column(all_parsers): 3,4.5 4,5.5""" - with pytest.raises(TypeError, match='data type "foo" not understood'): + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py new file mode 100644 index 0000000000000..33abf4bb7d9ee --- /dev/null +++ b/pandas/tests/io/parser/test_encoding.py @@ -0,0 +1,172 @@ +""" +Tests encoding functionality during parsing +for all of the parsers defined in parsers.py +""" + +from io import BytesIO +import os +import tempfile + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_bytes_io_input(all_parsers): + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) + expected = DataFrame([["\u0141aski, Jan", 1]]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = """skip this +skip this too +A,B,C +1,2,3 +4,5,6""".replace( + ",", sep + ) + path = "__{}__.csv".format(tm.rands(10)) + kwargs = dict(sep=sep, skiprows=2) + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + from io import TextIOWrapper + + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + bytes_buffer = BytesIO(data.encode(utf8)) + bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) + + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + + bytes_buffer.close() + tm.assert_frame_equal(result, expected) + + +def test_utf16_example(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + result = parser.read_csv(path, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + 
path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" + assert got == expected + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) +def test_utf8_bom(all_parsers, data, kwargs, expected): + # see gh-4793 + parser = all_parsers + bom = "\ufeff" + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = encoding_fmt.format(utf_value) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. 
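Annotation (not part of the patch): the contract this test pins down, sketched standalone — a handle opened in text mode is already decoded by Python, while a binary handle is decoded by read_csv itself via encoding=, and both routes must produce the same frame. The file name and encoding below are illustrative:

    import pandas as pd

    with open("data.csv", "r", encoding="utf-8") as fa:  # text mode: Python decodes
        df_text = pd.read_csv(fa)
    with open("data.csv", "rb") as fb:                   # binary mode: read_csv decodes
        df_binary = pd.read_csv(fb, encoding="utf-8")
    assert df_text.equals(df_binary)
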
+ parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + expected = DataFrame({"foo": ["bar"]}) + + with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 214b93b6f0628..7dc106ef0c186 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_read_with_bad_header(all_parsers): diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 66e00f4eb6c1c..f67a658cadfa2 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -5,10 +5,11 @@ """ from io import StringIO +import numpy as np import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("with_header", [True, False]) @@ -172,3 +173,14 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): ), ) tm.assert_frame_equal(result, expected) + + +def test_no_multi_index_level_names_empty(all_parsers): + # GH 10984 + parser = all_parsers + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) + with tm.ensure_clean() as path: + expected.to_csv(path) + result = parser.read_csv(path, index_col=[0, 1, 2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d144421090274..5c4e642115798 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index c94adf9da0bf3..64ccaf60ec230 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm def _construct_dataframe(num_rows): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 353d309a84823..f9a083d7f5d22 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -10,7 +10,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing 
as tm +import pandas._testing as tm def test_string_nas(all_parsers): @@ -89,6 +89,7 @@ def test_default_na_values(all_parsers): "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 57e2950b06ce8..b8d66874bc660 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -11,7 +11,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 36391e19a102e..b01b22e811ee3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -20,8 +20,8 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5b381e43e3e19..7367b19b40dc3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -13,7 +13,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_default_separator(python_parser_only): diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 94858226d0b44..14773dfbea20e 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -11,7 +11,7 @@ from pandas.errors import ParserError from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 9ddaccc4d38b7..27aef2376e87d 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import EmptyDataError, read_csv, read_fwf diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index d4f219d13ac53..fdccef1127c7e 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -12,7 +12,7 @@ from pandas.errors import EmptyDataError from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e34f1010d690e..8d5af85c20d33 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,7 +12,7 @@ from pandas._libs.parsers import TextReader from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import TextFileReader, read_csv diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07ab41b47bf27..267fae760398a 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ 
b/pandas/tests/io/parser/test_unsupported.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.parsers as parsers from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 539fdf2470c51..979eb4702cc84 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -10,7 +10,7 @@ from pandas._libs.tslib import Timestamp from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm _msg_validate_usecols_arg = ( "'usecols' must either be list-like " diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py index 6164f5d0722cc..214f95c6fb441 100644 --- a/pandas/tests/io/pytables/conftest.py +++ b/pandas/tests/io/pytables/conftest.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index a82e21532eddb..c7200385aa998 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,8 +1,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path -import pandas.util.testing as tm tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 91ee1061a5ef1..543940e674dba 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store -import pandas.util.testing as tm from pandas.io.pytables import read_hdf diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py index 4ceb80889c989..9adb0a6d227da 100644 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @td.skip_if_installed("tables") diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c8a8e738faa9c..64c4ad800f49d 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -33,6 +33,7 @@ isna, timedelta_range, ) +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, create_tempfile, @@ -42,7 +43,6 @@ safe_remove, tables, ) -import pandas.util.testing as tm from pandas.io.pytables import ( ClosedFileError, @@ -66,8 +66,11 @@ class TestHDFStore: def test_format_kwarg_in_constructor(self, setup_path): # GH 13291 + + msg = "format is not a defined argument for HDFStore" + with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): HDFStore(path, format="table") def test_context(self, setup_path): @@ -203,21 +206,27 @@ def test_api(self, setup_path): # Invalid. 
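Annotation (not part of the patch): the match= arguments being added throughout these pytables hunks are regular expressions applied with re.search, so literal brackets and plus signs in the expected messages must be escaped — hence r"invalid HDFStore format specified \[foo\]" and r"Allowed modes are r, r\+ and a." below. When no pattern semantics are wanted, re.escape does the quoting:

    import re
    import pytest

    with pytest.raises(ValueError, match=re.escape("bad format [foo]")):
        raise ValueError("bad format [foo]")
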
df = tm.makeDataFrame() - with pytest.raises(ValueError): + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="f") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="fixed") - with pytest.raises(TypeError): + msg = r"invalid HDFStore format specified \[foo\]" + + with pytest.raises(TypeError, match=msg): df.to_hdf(path, "df", append=True, format="foo") - with pytest.raises(TypeError): - df.to_hdf(path, "df", append=False, format="bar") + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=False, format="foo") # File path doesn't exist path = "" - with pytest.raises(FileNotFoundError): + msg = f"File {path} does not exist" + + with pytest.raises(FileNotFoundError, match=msg): read_hdf(path, "df") def test_api_default_format(self, setup_path): @@ -230,7 +239,10 @@ def test_api_default_format(self, setup_path): _maybe_remove(store, "df") store.put("df", df) assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): store.append("df2", df) pd.set_option("io.hdf.default_format", "table") @@ -251,7 +263,7 @@ def test_api_default_format(self, setup_path): df.to_hdf(path, "df") with HDFStore(path) as store: assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df2", append=True) pd.set_option("io.hdf.default_format", "table") @@ -384,7 +396,10 @@ def test_versioning(self, setup_path): # this is an error because its table_type is appendable, but no # version info store.get_node("df2")._v_attrs.pandas_version = None - with pytest.raises(Exception): + + msg = "'NoneType' object has no attribute 'startswith'" + + with pytest.raises(Exception, match=msg): store.select("df2") def test_mode(self, setup_path): @@ -428,7 +443,11 @@ def check(mode): # conv read if mode in ["w"]: - with pytest.raises(ValueError): + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." 
+ ) + with pytest.raises(ValueError, match=msg): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1acb0ac6e06d2..2bf22d982e5fe 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -7,12 +7,12 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_path, ensure_clean_store, ) -import pandas.util.testing as tm def _compare_with_tz(a, b): diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index fcd2e0e35ad9e..5d2643c20ceb2 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -3,7 +3,7 @@ import pytest from pandas import read_sas -import pandas.util.testing as tm +import pandas._testing as tm class TestSas: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index d3480b246b91f..62e9ac6929c8e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,3 +1,4 @@ +from datetime import datetime import io import os from pathlib import Path @@ -9,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # https://github.com/cython/cython/issues/1720 @@ -23,7 +24,7 @@ def setup_method(self, datapath): for j in 1, 2: fname = os.path.join(self.dirpath, f"test_sas7bdat_{j}.csv") df = pd.read_csv(fname) - epoch = pd.datetime(1960, 1, 1) + epoch = datetime(1960, 1, 1) t1 = pd.to_timedelta(df["Column4"], unit="d") df["Column4"] = epoch + t1 t2 = pd.to_timedelta(df["Column12"], unit="d") diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index a52b22122ba81..ee97f08ef9400 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.sas.sasreader import read_sas diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 666dfd245acaa..a69e5556f3e85 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, get_option, read_clipboard -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.clipboard import PyperclipException, clipboard_get, clipboard_set diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index cfcd2c9f2df95..a126f83164ce5 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -12,7 +12,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom @@ -146,11 +146,15 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory 
non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) @@ -177,17 +181,21 @@ def test_read_expands_user_home_dir( path = os.path.join("~", "does_not_exist." + fn_ext) monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index e17a32cbc8b68..fb81e57912dac 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -6,7 +6,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index 2fa5e3b30d6af..cdb8eca02a3e5 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -2,7 +2,7 @@ import numpy as np -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index e06f2c31a2870..0038df78dd866 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip @@ -136,7 +136,7 @@ def test_write_with_index(self): # column multi-index df.index = [0, 1, 2] - df.columns = (pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),) + df.columns = pd.MultiIndex.from_tuples([("a", 1)]) self.check_error_on_write(df, ValueError) def test_path_pathlib(self): diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 48c8923dab7cd..7a5eba5264421 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -68,6 +68,10 @@ def _get_client(): return bigquery.Client(project=project_id, credentials=credentials) +def generate_rand_str(length: int = 10) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) + + def make_mixed_dataframe_v2(test_size): # create df to test for all BQ datatypes except RECORD bools = np.random.randint(2, size=(1, test_size)).astype(bool) @@ -153,19 +157,15 @@ def gbq_dataset(self): _skip_if_no_project_id() _skip_if_no_private_key_path() - dataset_id = "pydata_pandas_bq_testing_py31" + dataset_id = "pydata_pandas_bq_testing_" + generate_rand_str() 
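Annotation (not part of the patch): randomizing the dataset name through the generate_rand_str helper added above gives every test session a private BigQuery dataset, so concurrent CI runs no longer contend for the fixed pydata_pandas_bq_testing_py31 name and the delete-if-exists preamble becomes unnecessary. The helper's behavior, restated standalone:

    import random
    import string

    def generate_rand_str(length: int = 10) -> str:
        return "".join(random.choices(string.ascii_lowercase, k=length))

    name = "pydata_pandas_bq_testing_" + generate_rand_str()
    assert len(name) == len("pydata_pandas_bq_testing_") + 10
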
self.client = _get_client() self.dataset = self.client.dataset(dataset_id) - try: - # Clean-up previous test runs. - self.client.delete_dataset(self.dataset, delete_contents=True) - except api_exceptions.NotFound: - pass # It's OK if the dataset doesn't already exist. + # Create the dataset self.client.create_dataset(bigquery.Dataset(self.dataset)) - table_name = "".join(random.choices(string.ascii_lowercase, k=10)) + table_name = generate_rand_str() destination_table = f"{dataset_id}.{table_name}" yield destination_table diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 85ac56c8193a6..557a9d5c13987 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -5,8 +5,8 @@ import pytest from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm from pandas.util import _test_decorators as td -import pandas.util.testing as tm from pandas.io.common import is_gcs_url diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bc26615d1aad5..626df839363cb 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -15,7 +15,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.common import file_path_to_url import pandas.io.html @@ -383,7 +383,15 @@ def test_thousands_macau_stats(self, datapath): assert not any(s.isna().any() for _, s in df.items()) @pytest.mark.slow - def test_thousands_macau_index_col(self, datapath): + def test_thousands_macau_index_col(self, datapath, request): + # https://github.com/pandas-dev/pandas/issues/29622 + # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly + if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( + "bs4", "4.8.0" + ): + reason = "fails for bs4 version >= 4.8.0" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + all_non_nan_table_index = -2 macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 9f3ec274007d0..a1f9c6f6af51a 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import read_orc -import pandas.util.testing as tm +import pandas._testing as tm pytest.importorskip("pyarrow", minversion="0.13.0") pytest.importorskip("pyarrow.orc") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fc3d55e110d69..d51c712ed5abd 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -10,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parquet import ( FastParquetImpl, @@ -443,11 +443,12 @@ def test_duplicate_columns(self, pa): self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) - # pyarrow 0.11 raises ArrowTypeError - # older pyarrows raise ArrowInvalid - self.check_error_on_write(df, pa, Exception) + if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"): + # period - will be supported using an extension type with pyarrow 1.0 + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + 
self.check_error_on_write(df, pa, Exception) # timedelta df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) @@ -550,6 +551,19 @@ def test_additional_extension_arrays(self, pa): expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType + df = pd.DataFrame( + { + # Arrow does not yet support struct in writing to Parquet (ARROW-1644) + # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), + "d": pd.period_range("2012-01-01", periods=3, freq="D"), + } + ) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.3.2") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3be966edef080..3d427dde573af 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -22,10 +22,11 @@ import pytest from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +import pandas.util._test_decorators as td import pandas as pd from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, MonthEnd @@ -381,12 +382,108 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -def test_unicode_decode_error(): +def test_unicode_decode_error(datapath): # pickle file written with py27, should be readable without raising # UnicodeDecodeError, see GH#28645 - path = os.path.join(os.path.dirname(__file__), "data", "pickle", "test_py27.pkl") + path = datapath("io", "data", "pickle", "test_py27.pkl") df = pd.read_pickle(path) # just test the columns are correct since the values are random excols = pd.Index(["a", "b", "c"]) tm.assert_index_equal(df.columns, excols) + + +# --------------------- +# tests for buffer I/O +# --------------------- + + +def test_pickle_buffer_roundtrip(): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + with open(path, "wb") as fh: + df.to_pickle(fh) + with open(path, "rb") as fh: + result = pd.read_pickle(fh) + tm.assert_frame_equal(df, result) + + +# --------------------- +# tests for URL I/O +# --------------------- + + +@pytest.mark.parametrize( + "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] +) +def test_pickle_generalurl_read(monkeypatch, mockurl): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + class MockReadResponse: + def __init__(self, path): + self.file = open(path, "rb") + if "gzip" in path: + self.headers = {"Content-Encoding": "gzip"} + else: + self.headers = {"Content-Encoding": None} + + def read(self): + return self.file.read() + + def close(self): + return self.file.close() + + with tm.ensure_clean() as path: + + def mock_urlopen_read(*args, **kwargs): + return MockReadResponse(path) + + df = tm.makeDataFrame() + python_pickler(df, path) + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) +def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockGCSFileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or 
None + f = open(path, mode) + return f + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("s3fs") +@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) +def test_pickle_s3url_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockS3FileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index ccf3167d49371..013f56f83c5ec 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm pyreadstat = pytest.importorskip("pyreadstat") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d5c2b368a3c6a..45b3e839a08d1 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -41,7 +41,7 @@ to_datetime, to_timedelta, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.sql as sql from pandas.io.sql import read_sql_query, read_sql_table diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index cbc5ebd986c15..1d3cddbf01738 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -12,8 +12,8 @@ from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd +import pandas._testing as tm from pandas.core.frame import DataFrame, Series -import pandas.util.testing as tm from pandas.io.parsers import read_csv from pandas.io.stata import ( @@ -21,6 +21,7 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, + StataWriter118, read_stata, ) @@ -1271,11 +1272,9 @@ def test_invalid_variable_labels(self, version): variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: - msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises( + ValueError, match="Variable labels must contain only characters" + ): original.to_stata( path, variable_labels=variable_labels, version=version ) @@ -1297,8 +1296,8 @@ def test_write_variable_label_errors(self): } msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" + "Variable labels must contain only characters that can be " + "encoded in Latin-1" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1425,8 +1424,8 @@ def test_out_of_range_double(self): } ) msg = ( - r"Column ColumnTooBig has a maximum value \(.+\)" - r" outside the range supported by Stata \(.+\)" + r"Column ColumnTooBig has a maximum value \(.+\) outside the range " + r"supported by Stata \(.+\)" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1434,8 +1433,8 @@ def test_out_of_range_double(self): df.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which is outside " + "the range supported by Stata" ) with pytest.raises(ValueError, 
match=msg): with tm.ensure_clean() as path: @@ -1468,8 +1467,8 @@ def test_out_of_range_float(self): original.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which " + "is outside the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1706,15 +1705,7 @@ def test_all_none_exception(self, version): output = pd.DataFrame(output) output.loc[:, "none"] = None with tm.ensure_clean() as path: - msg = ( - r"Column `none` cannot be exported\.\n\n" - "Only string-like object arrays containing all strings or a" - r" mix of strings and None can be exported\. Object arrays" - r" containing only null values are prohibited\. Other" - " object typescannot be exported and must first be" - r" converted to one of the supported types\." - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) @pytest.mark.parametrize("version", [114, 117]) @@ -1778,3 +1769,41 @@ def test_stata_119(self): assert df.iloc[0, 7] == 3.14 assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + + def test_118_writer(self): + cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) + data = pd.DataFrame( + [ + [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"], + [2.0, 2, "ᴮ", ""], + [3.0, 3, "ᴰ", None], + ], + columns=["a", "β", "ĉ", "strls"], + ) + data["ᴐᴬᵀ"] = cat + variable_labels = { + "a": "apple", + "β": "ᵈᵉᵊ", + "ĉ": "ᴎტჄႲႳႴႶႺ", + "strls": "Long Strings", + "ᴐᴬᵀ": "", + } + data_label = "ᴅaᵀa-label" + data["β"] = data["β"].astype(np.int32) + with tm.ensure_clean() as path: + writer = StataWriter118( + path, + data, + data_label=data_label, + convert_strl=["strls"], + variable_labels=variable_labels, + write_index=False, + ) + writer.write_file() + reread_encoded = read_stata(path) + # Missing is intentionally converted to empty strl + data["strls"] = data["strls"].fillna("") + tm.assert_frame_equal(data, reread_encoded) + reader = StataReader(path) + assert reader.data_label == data_label + assert reader.variable_labels() == variable_labels diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 86cb7fc57b225..9f43027836eb4 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm """ diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 116d924f5a596..8ee279f0e1f38 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, MultiIndex, Series, date_range, timedelta_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 71a186dc2f3b0..9cd3ccbf9214e 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import np_datetime64_compat from pandas import Index, Period, Series, Timestamp, date_range 
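# Note on the recurring import change (annotation, not part of the patch):
# pandas 1.0 moved the testing helpers from the public pandas.util.testing
# module to the private pandas._testing, which is why nearly every test
# module in this diff swaps
#     import pandas.util.testing as tm
# for
#     import pandas._testing as tm
# The `tm` alias and the helper names (assert_frame_equal, ensure_clean,
# makeTimeSeries, ...) are unchanged, e.g.:
import pandas._testing as tm

df = tm.makeDataFrame()               # small random DataFrame helper
tm.assert_frame_equal(df, df.copy())  # raises AssertionError on mismatch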
-import pandas.util.testing as tm +import pandas._testing as tm from pandas.plotting import ( deregister_matplotlib_converters, @@ -66,11 +66,10 @@ def test_registering_no_warning(self): # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 DeprecationWarning from 2D indexing ax.plot(s.index, s.values) - assert len(w) == 0 - def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) @@ -101,19 +100,16 @@ def test_option_no_warning(self): # Test without registering first, no warning with ctx: - with tm.assert_produces_warning(None) as w: + # GH#30588 DeprecationWarning from 2D indexing on Index + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - # Now test with registering register_matplotlib_converters() with ctx: - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - def test_registry_resets(self): units = pytest.importorskip("matplotlib.units") dates = pytest.importorskip("matplotlib.dates") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 8456f095e5868..fb86b600d3d3c 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -9,12 +9,12 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, NaT, Series, isna +import pandas._testing as tm from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import DatetimeIndex from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm from pandas.tseries.offsets import DateOffset @@ -121,8 +121,8 @@ def test_both_style_and_color(self): ts = tm.makeTimeSeries() msg = ( "Cannot pass 'style' string with a color symbol and 'color' " - "keyword argument. Please use one or the other or pass 'style'" - " without a color symbol" + "keyword argument. 
Please use one or the other or pass 'style' " + "without a color symbol" ) with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4fcdc350bc90a..1c429bafa9a19 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -17,9 +17,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting @@ -1162,6 +1162,36 @@ def test_plot_scatter(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + def test_raise_error_on_datetime_time_data(self): + # GH 8113, datetime.time type is not supported by matplotlib in scatter + df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a number, not 'datetime.time'" + + with pytest.raises(TypeError, match=msg): + df.plot(kind="scatter", x="dtime", y="a") + + def test_scatterplot_datetime_data(self): + # GH 30391 + dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") + vals = np.random.normal(0, 1, len(dates)) + df = pd.DataFrame({"dates": dates, "vals": vals}) + + _check_plot_works(df.plot.scatter, x="dates", y="vals") + _check_plot_works(df.plot.scatter, x=0, y=1) + + def test_scatterplot_object_data(self): + # GH 18755 + df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + @pytest.mark.slow def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # addressing issue #10611, to ensure colobar does not @@ -1216,24 +1246,15 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) @pytest.mark.slow - def test_plot_scatter_with_categorical_data(self): - # GH 16199 + def test_plot_scatter_with_categorical_data(self, x, y): + # after fixing GH 18755, should be able to plot categorical data df = pd.DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) - with pytest.raises(ValueError) as ve: - df.plot(x="x", y="y", kind="scatter") - ve.match("requires y column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="x", kind="scatter") - ve.match("requires x column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="y", kind="scatter") - ve.match("requires x column to be numeric") + _check_plot_works(df.plot.scatter, x=x, y=y) @pytest.mark.slow def test_plot_scatter_with_c(self): @@ -3250,6 +3271,34 @@ def test_plot_no_numeric_data(self): with pytest.raises(TypeError): df.plot() + def test_missing_markers_legend(self): + # 14958 + df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) + ax = df.plot(y=["A"], marker="x", linestyle="solid") + 
df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) + df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=["x", "o", "<"]) + + def test_missing_markers_legend_using_style(self): + # 14563 + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6], + "B": [2, 4, 1, 3, 2, 4], + "C": [3, 3, 2, 6, 4, 2], + "X": [1, 2, 3, 4, 5, 6], + } + ) + + fig, ax = self.plt.subplots() + for kind in "ABC": + df.plot("X", kind, label=kind, ax=ax, style=".") + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index bb1747710fe18..8fec4bb134cb4 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm @td.skip_if_no_mpl diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 74d48c10ad9a0..50ebbc22f2739 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -9,8 +9,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm @td.skip_if_no_mpl diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index eadcc12d8428c..228c84528e882 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting @@ -319,8 +319,8 @@ def test_subplot_titles(self, iris): # Case len(title) > len(df) msg = ( - "The length of `title` must equal the number of columns if" - " using `title` of type `list` and `subplots=True`" + "The length of `title` must equal the number of columns if " + "using `title` of type `list` and `subplots=True`" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=True, title=title + ["kittens > puppies"]) @@ -331,8 +331,8 @@ def test_subplot_titles(self, iris): # Case subplots=False and title is of type list msg = ( - "Using `title` of type `list` is not supported unless" - " `subplots=True` is passed" + "Using `title` of type `list` is not supported unless " + "`subplots=True` is passed" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=False, title=title) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 61722d726b28b..8463f30bee8f0 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -14,8 +14,8 @@ import pandas as pd from pandas import DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 
d66472b1c2054..7400b049961d5 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -21,8 +21,8 @@ timedelta_range, to_timedelta, ) +import pandas._testing as tm from pandas.core import nanops -import pandas.util.testing as tm def get_objs(): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 432811b5a8264..59dbcb9ab9fa0 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray -import pandas.util.testing as tm class TestDatetimeLikeStatReductions: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 02203f476af8e..f8a1810e66219 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -5,12 +5,12 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -import pandas.util.testing as tm # a fixture value can be overridden by the test parameter value. Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -84,8 +84,8 @@ def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Index'" ) with pytest.raises(TypeError, match=msg): xp.resample("A").mean() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index bcd7081d5b1a5..4860329718f54 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets from pandas.tseries.offsets import BDay, Minute diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 0ed0bf18a82ee..955f8c7482937 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.resample import _get_period_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets @@ -82,9 +82,9 @@ def test_selection(self, index, freq, kind, kwargs): index=pd.MultiIndex.from_arrays([rng, index], names=["v", 
"d"]), ) msg = ( - "Resampling from level= or on= selection with a PeriodIndex is" - r" not currently supported, use \.set_index\(\.\.\.\) to" - " explicitly set index" + "Resampling from level= or on= selection with a PeriodIndex is " + r"not currently supported, use \.set_index\(\.\.\.\) to " + "explicitly set index" ) with pytest.raises(NotImplementedError, match=msg): df.resample(freq, kind=kind, **kwargs) @@ -130,8 +130,8 @@ def test_not_subperiod(self, simple_period_range_series, rule, expected_error_ms # These are incompatible period rules for resampling ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") msg = ( - "Frequency cannot be resampled to {}, as they" - " are not sub or super periods" + "Frequency cannot be resampled to {}, as they " + "are not sub or super periods" ).format(expected_error_msg) with pytest.raises(IncompatibleFrequency, match=msg): ts.resample(rule).mean() @@ -236,8 +236,8 @@ def test_resample_same_freq(self, resample_method): def test_resample_incompat_freq(self): msg = ( - "Frequency cannot be resampled to ," - " as they are not sub or super periods" + "Frequency cannot be resampled to , " + "as they are not sub or super periods" ) with pytest.raises(IncompatibleFrequency, match=msg): Series( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7852afcdbfea9..170201b4f8e5c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") @@ -519,8 +519,8 @@ def test_selection_api_validation(): # non DatetimeIndex msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Int64Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Int64Index'" ) with pytest.raises(TypeError, match=msg): df.resample("2D", level="v") @@ -539,8 +539,8 @@ def test_selection_api_validation(): # upsampling not allowed msg = ( - "Upsampling from level= or on= selection is not supported, use" - r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like" + "Upsampling from level= or on= selection is not supported, use " + r"\.set_index\(\.\.\.\) to explicitly set index to datetime-like" ) with pytest.raises(ValueError, match=msg): df.resample("2D", level="d").asfreq() diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 7efc6b0d466b9..4e3585c0be884 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -2,10 +2,12 @@ import numpy as np +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_frame = DataFrame( {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, @@ -13,17 +15,18 @@ ) -def test_tab_complete_ipython6_warning(ip): +@async_mark() +async def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( """\ - import pandas.util.testing as tm + import pandas._testing as tm s = tm.makeTimeSeries() rs = s.resample("D") """ ) - 
ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 4c27d48cff6fd..3aa7765954634 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 7a6ebf826ca4d..d1bcdc55cb509 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.timedeltas import timedelta_range -import pandas.util.testing as tm def test_asfreq_bug(): diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 091bdd8de2995..7020d373caf82 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge +import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data -import pandas.util.testing as tm a_ = np.array @@ -212,8 +212,8 @@ def test_join_on(self): source_copy = source.copy() source_copy["A"] = 0 msg = ( - "You are trying to merge on float64 and object columns. If" - " you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object columns. 
If " + "you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e191bf67c51ca..30c440035d48e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -25,10 +25,10 @@ TimedeltaIndex, UInt64Index, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge -import pandas.util.testing as tm N = 50 NGROUPS = 8 @@ -201,8 +201,8 @@ def test_merge_misspecified(self): merge(self.left, self.right, right_index=True) msg = ( - 'Can only pass argument "on" OR "left_on" and "right_on", not' - " a combination of both" + 'Can only pass argument "on" OR "left_on" and "right_on", not ' + "a combination of both" ) with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.left, left_on="key", on="key") @@ -1013,10 +1013,9 @@ def test_indicator(self): df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) msg = ( - "Cannot use `indicator=True` option when data contains a" - " column named {}|" - "Cannot use name of an existing column for indicator" - " column" + "Cannot use `indicator=True` option when data contains a " + "column named {}|" + "Cannot use name of an existing column for indicator column" ).format(i) with pytest.raises(ValueError, match=msg): merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) @@ -1235,8 +1234,8 @@ def test_validation(self): ) msg = ( - "Merge keys are not unique in either left or right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in either left or right dataset; " + "not a one-to-one merge" ) with pytest.raises(MergeError, match=msg): merge(left, right, on="a", validate="1:1") diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b2e764c5463fa..8037095aff0b9 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Timedelta, merge_asof, read_csv, to_datetime +import pandas._testing as tm from pandas.core.reshape.merge import MergeError -import pandas.util.testing as tm class TestAsOfMerge: @@ -1185,6 +1185,13 @@ def test_merge_datatype_categorical_error_raises(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on="a") + def test_merge_groupby_multiple_column_with_categorical_column(self): + # GH 16454 + df = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + result = merge_asof(df, df, on="x", by=["y", "z"]) + expected = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] ) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 4e0f570567c07..691f2549c0ece 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 
6d6429fb4e6b5..e0063925a03e1 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, merge_ordered -import pandas.util.testing as tm +import pandas._testing as tm class TestMergeOrdered: diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index bce62571d55ec..1f78c1900d237 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,14 +1,12 @@ -from collections import OrderedDict - import numpy as np from numpy.random import randn import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import merge -import pandas.util.testing as tm @pytest.fixture @@ -474,17 +472,13 @@ def test_merge_datetime_index(self, klass): if klass is not None: on_vector = klass(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -788,17 +782,13 @@ def test_merge_datetime_index(self, box): if box is not None: on_vector = box(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -828,3 +818,22 @@ def test_single_common_level(self): ).set_index(["key", "X", "Y"]) tm.assert_frame_equal(result, expected) + + def test_join_multi_wrong_order(self): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"]) + + left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) + right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) + + result = left.join(right) + + expected = pd.DataFrame( + index=midx1, + data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]}, + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 05193c00f0649..b3b2c5a05c6ad 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,9 +27,10 @@ isna, read_csv, ) +import pandas._testing as tm +from pandas.core.arrays import SparseArray from pandas.core.construction import create_series_with_explicit_dtype from pandas.tests.extension.decimal import to_decimal -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -197,8 +198,8 @@ def test_concatlike_same_dtypes(self): # cannot append non-index msg = ( - r"cannot concatenate object of type '.+';" - " only Series and DataFrame objs are valid" + r"cannot concatenate 
object of type '.+'; " + "only Series and DataFrame objs are valid" ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -1865,8 +1866,8 @@ def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = tm.makeCustomDataframe(10, 2) msg = ( - "cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid" + "cannot concatenate object of type '{}'; " + "only Series and DataFrame objs are valid" ) for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): @@ -2730,3 +2731,22 @@ def test_concat_datetimeindex_freq(): expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) expected.index._data.freq = None tm.assert_frame_equal(result, expected) + + +def test_concat_empty_df_object_dtype(): + # GH 9149 + df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) + df_2 = pd.DataFrame(columns=df_1.columns) + result = pd.concat([df_1, df_2], axis=0) + expected = df_1.astype(object) + tm.assert_frame_equal(result, expected) + + +def test_concat_sparse(): + # GH 23557 + a = pd.Series(SparseArray([0, 1, 2])) + expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( + pd.SparseDtype(np.int64, 0) + ) + result = pd.concat([a, a], axis=1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 611c3272c123f..13b6f05ed304a 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -19,9 +19,9 @@ timedelta_range, to_datetime, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT import pandas.core.reshape.tile as tmod -import pandas.util.testing as tm def test_simple(): @@ -603,3 +603,12 @@ def test_cut_bool_coercion_to_int(bins, box, compare): expected = cut(data_expected, bins, duplicates="drop") result = cut(data_result, bins, duplicates="drop") compare(result, expected) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_cut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + cut(values, 4, labels=labels) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 2c03c48209fea..814325844cb4c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, lreshape, melt, wide_to_long -import pandas.util.testing as tm +import pandas._testing as tm class TestMelt: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index bd1d3d2d5bb63..743fc50c87e96 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import date, datetime, timedelta from itertools import product @@ -16,9 +15,9 @@ concat, date_range, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import crosstab, pivot_table -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -1044,7 +1043,7 @@ def test_pivot_columns_lexsorted(self): assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): - f = OrderedDict([("D", ["std"]), ("E", ["sum"])]) + f = {"D": ["std"], "E": ["sum"]} expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") result = 
self.data.pivot_table(index="A", columns="B", aggfunc=f) @@ -1966,6 +1965,31 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_table_empty_aggfunc(self): + # GH 9186 + df = pd.DataFrame( + { + "A": [2, 2, 3, 3, 2], + "id": [5, 6, 7, 8, 9], + "C": ["p", "q", "q", "p", "q"], + "D": [None, None, None, None, None], + } + ) + result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_pivot_table_no_column_raises(self): + # GH 10326 + def agg(l): + return np.mean(l) + + foo = pd.DataFrame( + {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} + ) + with pytest.raises(KeyError, match="notpresent"): + foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + class TestCrosstab: def setup_method(self, method): diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index eca9b11bd4364..95406a5ebf4f7 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -18,9 +18,9 @@ qcut, timedelta_range, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.algorithms import quantile -import pandas.util.testing as tm from pandas.tseries.offsets import Day, Nano @@ -130,6 +130,38 @@ def test_qcut_return_intervals(): tm.assert_series_equal(res, exp) +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_qcut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(labels): + # GH 13318 + values = range(10) + msg = "Bin labels must be one fewer than the number of bin edges" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize( + "labels, expected", + [ + (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), + (list(range(3)), Categorical([0, 1, 2], ordered=True)), + ], +) +def test_qcut_list_like_labels(labels, expected): + # GH 13318 + values = range(3) + result = qcut(values, 3, labels=labels) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize( "kwargs,msg", [ diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index b695b05c7c7db..f25291f4aef12 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, get_dummies +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm class TestGetDummies: @@ -45,7 +45,7 @@ def test_basic(self, sparse, dtype): dtype=self.effective_dtype(dtype), ) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0.0) + expected = expected.apply(SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) @@ -132,7 +132,7 @@ def test_include_na(self, sparse, dtype): {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0.0) + exp = exp.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see 
#GH8822 @@ -145,7 +145,7 @@ def test_include_na(self, sparse, dtype): # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) + exp_na = exp_na.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) @@ -167,7 +167,7 @@ def test_unicode(self, sparse): dtype=np.uint8, ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): @@ -180,10 +180,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): if sparse: expected = pd.DataFrame( { - "A_a": pd.SparseArray([1, 0, 1], dtype="uint8"), - "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"), - "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"), - "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"), + "A_a": SparseArray([1, 0, 1], dtype="uint8"), + "A_b": SparseArray([0, 1, 0], dtype="uint8"), + "B_b": SparseArray([1, 1, 0], dtype="uint8"), + "B_c": SparseArray([0, 0, 1], dtype="uint8"), } ) @@ -226,7 +226,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected = expected[["C"] + cols] - typ = pd.SparseArray if sparse else pd.Series + typ = SparseArray if sparse else pd.Series expected[cols] = expected[cols].apply(lambda x: typ(x)) tm.assert_frame_equal(result, expected) @@ -423,7 +423,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) result = get_dummies(s_series, drop_first=True, sparse=sparse) @@ -457,7 +457,7 @@ def test_basic_drop_first_NA(self, sparse): res = get_dummies(s_NA, drop_first=True, sparse=sparse) exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) @@ -466,7 +466,7 @@ def test_basic_drop_first_NA(self, sparse): ["b", np.nan], axis=1 ) if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0) + exp_na = exp_na.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( @@ -480,7 +480,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): @@ -494,7 +494,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self, df, sparse): @@ -516,7 +516,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected.sort_index(axis=1) if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) 
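The test_reshape.py hunks above replace the deprecated top-level pd.SparseArray alias with SparseArray imported from pandas.core.arrays.sparse. A minimal sketch of the pattern those tests rely on (assuming pandas 1.0; the input values here are illustrative): a dense expected frame is converted column by column with DataFrame.apply, so each column becomes a SparseArray matching the output of get_dummies(sparse=True).

import numpy as np
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray

result = pd.get_dummies(["a", "b", "a", "c"], sparse=True)

# Dense expected values; apply() turns each column into a SparseArray
# whose fill value is the implicit zero of the dummy encoding.
expected = pd.DataFrame(
    {"a": [1, 0, 1, 0], "b": [0, 1, 0, 0], "c": [0, 0, 0, 1]}, dtype=np.uint8
).apply(SparseArray, fill_value=0)

tm.assert_frame_equal(result, expected)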
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 9b56c4df6d7de..a503173bd74b1 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestUnionCategoricals: diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 60c6d7ec3017b..cd518dda4edbf 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -2,8 +2,8 @@ import pytest from pandas import Index, date_range +import pandas._testing as tm from pandas.core.reshape.util import cartesian_product -import pandas.util.testing as tm class TestCartesianProduct: diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 73371c48f9370..6af9c9884589c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -16,7 +16,7 @@ import pandas as pd from pandas import NaT, Period, Timedelta, Timestamp, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodConstruction: diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 82d5b097733f1..7d05511239ebc 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_singleton(): @@ -16,8 +16,8 @@ def test_singleton(): def test_repr(): - assert repr(NA) == "NA" - assert str(NA) == "NA" + assert repr(NA) == "" + assert str(NA) == "" def test_truthiness(): @@ -58,12 +58,6 @@ def test_comparison_ops(): assert (NA >= other) is NA assert (NA < other) is NA assert (NA <= other) is NA - - if isinstance(other, (np.int64, np.bool_)): - # for numpy scalars we get a deprecation warning and False as result - # for equality or error for larger/lesser than - continue - assert (other == NA) is NA assert (other != NA) is NA assert (other > NA) is NA @@ -87,9 +81,17 @@ def test_comparison_ops(): np.float_(-0), ], ) -def test_pow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_pow_special(value, asarray): + if asarray: + value = np.array([value]) result = pd.NA ** value - assert isinstance(result, type(value)) + + if asarray: + result = result[0] + else: + # this assertion isn't possible for ndarray. 
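# Sketch of the rule the pow/rpow parametrizations above encode (annotation,
# not part of the patch): pd.NA normally propagates through arithmetic, but
# power has special cases whose result is defined regardless of the missing
# value, and the new `asarray` parameter checks that the same values come
# back elementwise when the operand is wrapped in a one-element ndarray.
import numpy as np
import pandas as pd

assert pd.NA ** 0 == 1                   # anything ** 0 is 1
assert 1 ** pd.NA == 1                   # 1 ** anything is 1
assert pd.NA ** 1 is pd.NA               # no special case: NA propagates
assert (pd.NA ** np.array([0]))[0] == 1  # same rule, via ndarray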
+ assert isinstance(result, type(value)) assert result == 1 @@ -108,12 +110,20 @@ def test_pow_special(value): np.float_(-1), ], ) -def test_rpow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_special(value, asarray): + if asarray: + value = np.array([value]) result = value ** pd.NA - assert result == value - if not isinstance(value, (np.float_, np.bool_, np.int_)): + + if asarray: + result = result[0] + elif not isinstance(value, (np.float_, np.bool_, np.int_)): + # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) + assert result == value + def test_unary_ops(): assert +NA is NA @@ -162,6 +172,19 @@ def test_logical_not(): assert ~NA is NA +@pytest.mark.parametrize( + "shape", [(3,), (3, 3), (1, 2, 3)], +) +def test_arithmetic_ndarray(shape, all_arithmetic_functions): + op = all_arithmetic_functions + a = np.zeros(shape) + if op.__name__ == "pow": + a += 5 + result = op(pd.NA, a) + expected = np.full(a.shape, pd.NA, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_is_scalar(): assert is_scalar(NA) is True @@ -177,6 +200,55 @@ def test_series_isna(): tm.assert_series_equal(s.isna(), expected) +def test_ufunc(): + assert np.log(pd.NA) is pd.NA + assert np.add(pd.NA, 1) is pd.NA + result = np.divmod(pd.NA, 1) + assert result[0] is pd.NA and result[1] is pd.NA + + result = np.frexp(pd.NA) + assert result[0] is pd.NA and result[1] is pd.NA + + +def test_ufunc_raises(): + with pytest.raises(ValueError, match="ufunc method 'at'"): + np.log.at(pd.NA, 0) + + +def test_binary_input_not_dunder(): + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + result = np.logaddexp(a, pd.NA) + tm.assert_numpy_array_equal(result, expected) + + result = np.logaddexp(pd.NA, a) + tm.assert_numpy_array_equal(result, expected) + + # all NA, multiple inputs + assert np.logaddexp(pd.NA, pd.NA) is pd.NA + + result = np.modf(pd.NA, pd.NA) + assert len(result) == 2 + assert all(x is pd.NA for x in result) + + +def test_divmod_ufunc(): + # binary in, binary out. 
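# Sketch of the multi-output case exercised below (annotation, not part of
# the patch): np.divmod is a ufunc with two outputs, so pd.NA has to
# propagate into both halves of the returned tuple -- as object-dtype arrays
# when the other operand is an ndarray, and as plain NA scalars otherwise.
import numpy as np
import pandas as pd
import pandas._testing as tm

quotient, remainder = np.divmod(np.array([1, 2, 3]), pd.NA)
expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object)
tm.assert_numpy_array_equal(quotient, expected)
tm.assert_numpy_array_equal(remainder, expected)

assert all(x is pd.NA for x in np.divmod(pd.NA, 1))  # scalar path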
+ a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + + result = np.divmod(a, pd.NA) + assert isinstance(result, tuple) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + result = np.divmod(pd.NA, a) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + def test_integer_hash_collision_dict(): # GH 30013 result = {NA: "foo", hash(NA): "bar"} diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e709db980b721..a537f000959e3 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -21,9 +21,9 @@ Timestamp, isna, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.ops import roperator -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -123,6 +123,13 @@ def test_round_nat(klass, method, freq): "dst", "fromordinal", "fromtimestamp", + pytest.param( + "fromisocalendar", + marks=pytest.mark.skipif( + not compat.PY38, + reason="'fromisocalendar' was added in stdlib datetime in python 3.8", + ), + ), "isocalendar", "strftime", "strptime", @@ -297,6 +304,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + + # "fromisocalendar" was introduced in 3.8 if klass is Timestamp and not compat.PY38: expected.remove("fromisocalendar") diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index fed613b910c55..3764d9b7548fc 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import NaT, Timedelta, Timestamp, offsets +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm class TestTimedeltaAdditionSubtraction: diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index b988a72fd2684..e1d965bbb14e9 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaArithmetic: diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 25609cb852ed4..f1fcf46a936fd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -14,12 +14,13 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone +import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat from pandas.errors import OutOfBoundsDatetime import pandas.util._test_decorators as td from pandas import NaT, Period, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries import offsets @@ -700,6 +701,19 @@ class SubDatetime(datetime): expected = Timestamp(2000, 1, 1) assert result == expected + @pytest.mark.skipif( + not compat.PY38, + reason="datetime.fromisocalendar was added in Python version 3.8", + ) + def test_constructor_fromisocalendar(self): + # GH 30395 + expected_timestamp 
= Timestamp("2000-01-03 00:00:00") + expected_stdlib = datetime.fromisocalendar(2000, 1, 1) + result = Timestamp.fromisocalendar(2000, 1, 1) + assert result == expected_timestamp + assert result == expected_stdlib + assert isinstance(result, Timestamp) + class TestTimestamp: def test_tz(self): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index db63e0bf9cd30..65066fd0099ba 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -10,7 +10,7 @@ import pandas.util._test_decorators as td from pandas import NaT, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import to_offset diff --git a/pandas/tests/series/conftest.py b/pandas/tests/series/conftest.py index 18d3c87a01f87..ff0b0c71f88b0 100644 --- a/pandas/tests/series/conftest.py +++ b/pandas/tests/series/conftest.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index c47b99fa38989..47f40e24e1637 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, Series, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index c2912cf3ce53f..d75efcf52c271 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Index, Series, Timestamp, date_range, isna +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -75,7 +75,7 @@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot index with vector containing NA / NaN values" + msg = "cannot mask with array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg): @@ -285,8 +285,8 @@ def test_where_error(): with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] msg = ( - "NumPy boolean array indexing assignment cannot assign 0 input" - " values to the 1 output values where the mask is true" + "NumPy boolean array indexing assignment cannot assign 0 input " + "values to the 1 output values where the mask is true" ) with pytest.raises(ValueError, match=msg): s[[True, False]] = [] diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py index 2d879eed967e5..fe575cf146641 100644 --- a/pandas/tests/series/indexing/test_callable.py +++ b/pandas/tests/series/indexing/test_callable.py @@ -1,5 +1,5 @@ import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_getitem_callable(): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 83c1c0ff16f4c..15ff5f6b343d1 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range -import 
pandas.util.testing as tm +import pandas._testing as tm """ diff --git a/pandas/tests/series/indexing/test_iloc.py b/pandas/tests/series/indexing/test_iloc.py index eef4d89af3832..f276eb5b0b23d 100644 --- a/pandas/tests/series/indexing/test_iloc.py +++ b/pandas/tests/series/indexing/test_iloc.py @@ -1,7 +1,7 @@ import numpy as np from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm def test_iloc(): diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5bebd480ce8d4..4601cabf69b52 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay @@ -294,8 +294,8 @@ def test_getitem_dataframe(): s = pd.Series(10, index=rng) df = pd.DataFrame(rng, index=rng) msg = ( - "Indexing a Series with DataFrame is not supported," - " use the appropriate DataFrame column" + "Indexing a Series with DataFrame is not supported, " + "use the appropriate DataFrame column" ) with pytest.raises(TypeError, match=msg): s[df > 5] diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index e6b5b5df2b000..7d6b6c78cc492 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index a641b47f2e690..ce0d04ff99077 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_get(): diff --git a/pandas/tests/series/methods/test_append.py b/pandas/tests/series/methods/test_append.py index ec357786f18fb..dc0fca4bba067 100644 --- a/pandas/tests/series/methods/test_append.py +++ b/pandas/tests/series/methods/test_append.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAppend: diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 9dd3045ad86d9..1fc98ded0d3d2 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, Timestamp, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesArgsort: diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 8bc9e9c38d83a..b121efd202744 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, Timestamp, date_range, isna, notna, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAsof: diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index c2bec2744583a..37764d3b82c2d 100644 --- 
a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timestamp, isna, notna -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesClip: diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 9cf776c0d9f1a..1ca48eeb7c441 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesCount: diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index f7dae87018419..1f6033d435323 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesCov: diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index ed412e7da3d43..b147a04b11090 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,7 +1,7 @@ import numpy as np from pandas import Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDescribe: diff --git a/pandas/tests/series/methods/test_diff.py b/pandas/tests/series/methods/test_diff.py index 9cb4ec827a271..033f75e95f11b 100644 --- a/pandas/tests/series/methods/test_diff.py +++ b/pandas/tests/series/methods/test_diff.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, TimedeltaIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDiff: diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 2c5dcd2c45171..2d052505d5ecc 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 36b3b559477a6..5cc297913e851 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -2,7 +2,7 @@ import pytest from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index e79d3c0556cf1..979199e1efc62 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_basic(): diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 69b2f896aec52..ca93e989ba6b5 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesIsIn: diff --git 
a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 423b4ad78a78a..a029965c7394f 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm main_dtypes = [ "datetime", diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index abc5c498813ef..1efb57894f986 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesPctChange: @@ -68,3 +68,12 @@ def test_pct_change_periods_freq( rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) tm.assert_series_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) + result = s.pct_change(fill_method=fill_method) + expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 4eb275d63e878..79f50afca658f 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Index, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import Timestamp -import pandas.util.testing as tm class TestSeriesQuantile: diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 793e8b7da4965..3d4688c8274f9 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import NaT, Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import CategoricalDtype -import pandas.util.testing as tm class TestSeriesRank: @@ -203,8 +203,7 @@ def test_rank_signature(self): s = Series([0, 1]) s.rank(method="average") msg = ( - "No axis named average for object type" - " <class 'pandas.core.series.Series'>" + "No axis named average for object type <class 'pandas.core.series.Series'>" ) with pytest.raises(ValueError, match=msg): s.rank("average") diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4125b5816422a..b20baa2836363 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesReplace: diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 1776468ef5a83..7f0711a0f30d7 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -2,7 +2,7 @@ import pytest from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesRound: diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py index 0d6e9635579f0..fd6c6f74a9136 --- 
a/pandas/tests/series/methods/test_searchsorted.py +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -1,8 +1,8 @@ import numpy as np from pandas import Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import is_scalar -import pandas.util.testing as tm class TestSeriesSearchSorted: diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 2cf847c928862..8256e2f33b936 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -13,7 +13,7 @@ date_range, offsets, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index ab15b8c814029..6fa4eeaee34c0 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -4,7 +4,7 @@ import pytest from pandas import IntervalIndex, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSortIndex: @@ -135,3 +135,34 @@ def test_sort_index_intervals(self): [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ascending, ignore_index, output_index", + [ + ([2, 3, 6, 1], [2, 3, 6, 1], True, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [2, 3, 6, 1], True, False, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, False, [3, 2, 1, 0]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_list, sorted_list, ascending, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_ser = ser.copy() + result_ser.sort_index(**kwargs) + else: + result_ser = ser.sort_index(**kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index 2cea6f061de76..caa2abd61af6a 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSortValues: @@ -77,8 +77,8 @@ def test_sort_values(self, datetime_series): s = df.iloc[:, 0] msg = ( - "This Series is a view of some other array, to sort in-place" - " you must create a copy" + "This Series is a view of some other array, to sort in-place " + "you must create a copy" ) with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) @@ -157,6 +157,7 @@ def test_sort_values_categorical(self): expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "original_list, sorted_list, ignore_index, output_index", [ @@ -165,21 +166,18 @@ def test_sort_values_categorical(self): ], ) def test_sort_values_ignore_index( - self, original_list, sorted_list, ignore_index, output_index + self, inplace, original_list, sorted_list, ignore_index, output_index ): # 
GH 30114 - sr = Series(original_list) + ser = Series(original_list) expected = Series(sorted_list, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} - # Test when inplace is False - sorted_sr = sr.sort_values(ascending=False, ignore_index=ignore_index) - tm.assert_series_equal(sorted_sr, expected) + if inplace: + result_ser = ser.copy() + result_ser.sort_values(ascending=False, **kwargs) + else: + result_ser = ser.sort_values(ascending=False, **kwargs) - tm.assert_series_equal(sr, Series(original_list)) - - # Test when inplace is True - copied_sr = sr.copy() - copied_sr.sort_values(ascending=False, ignore_index=ignore_index, inplace=True) - tm.assert_series_equal(copied_sr, expected) - - tm.assert_series_equal(sr, Series(original_list)) + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/methods/test_to_dict.py b/pandas/tests/series/methods/test_to_dict.py index 0f1359f99e594..2fbf3e8d39cf3 100644 --- a/pandas/tests/series/methods/test_to_dict.py +++ b/pandas/tests/series/methods/test_to_dict.py @@ -3,7 +3,7 @@ import pytest from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesToDict: diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index b2bf5e854fbcc..d4e2890ed8bf0 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 15d895f44c7b2..fdb35befeb0c2 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesValueCounts: diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 9e1bae8469138..628c66583535d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAlterAxes: @@ -19,8 +19,8 @@ def test_setindex(self, string_series): # wrong length msg = ( - "Length mismatch: Expected axis has 30 elements, new" - " values have 29 elements" + "Length mismatch: Expected axis has 30 elements, " + "new values have 29 elements" ) with pytest.raises(ValueError, match=msg): string_series.index = np.arange(len(string_series) - 1) @@ -83,8 +83,9 @@ def test_rename_axis_supported(self): s = Series(range(5)) s.rename({}, axis=0) s.rename({}, axis="index") - with pytest.raises(ValueError, match="No axis named 5"): - s.rename({}, axis=5) + # TODO: clean up shared index validation + # with pytest.raises(ValueError, match="No axis named 5"): + # s.rename({}, axis=5) def test_set_name_attribute(self): s = Series([1, 2, 3]) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index a88043c7777c4..c29bd3ea0cb7d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -7,7 +7,7 @@ import pandas as pd from pandas 
import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAnalytics: diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index a187a1362297c..f96d6ddfc357e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import ( Categorical, @@ -19,8 +21,8 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.util.testing as tm import pandas.io.formats.printing as printing @@ -126,8 +128,8 @@ def test_constructor_dict(self): expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) tm.assert_series_equal(result, expected) - def test_constructor_subclass_dict(self): - data = tm.TestSubDict((x, 10.0 * x) for x in range(10)) + def test_constructor_subclass_dict(self, dict_subclass): + data = dict_subclass((x, 10.0 * x) for x in range(10)) series = Series(data) expected = Series(dict(data.items())) tm.assert_series_equal(series, expected) @@ -491,13 +493,14 @@ def test_empty_method(self): for full_series in [pd.Series([1]), s2]: assert not full_series.empty - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; s = pd.Series()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("s.", 1)) @@ -509,6 +512,13 @@ def test_integer_series_size(self): s = Series(range(9), dtype="Int64") assert s.size == 9 + def test_attrs(self): + s = pd.Series([0, 1], name="abc") + assert s.attrs == {} + s.attrs["version"] = 1 + result = s + 1 + assert result.attrs == {"version": 1} + class TestCategoricalSeries: @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 30b8b5c7c8545..a4c55a80a9f0f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params from pandas.core.base import SpecificationError -import pandas.util.testing as tm class TestSeriesApply: @@ -627,6 +627,30 @@ class DictWithoutMissing(dict): expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) + def test_map_abc_mapping(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + s = Series([1, 2, 3]) + not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + result = s.map(not_a_dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + class NonDictMappingWithMissing(non_mapping_dict_subclass): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + not_a_dictionary = NonDictMappingWithMissing({3: "three"}) + result = s.map(not_a_dictionary) + # __missing__ is 
a dict concept, not a Mapping concept, + # so it should not change the result! + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) @@ -756,3 +780,10 @@ def test_apply_scaler_on_date_time_index_aware_series(self): series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) + + def test_map_float_to_string_precision(self): + # GH 13228 + ser = pd.Series(1 / 3) + result = ser.map(lambda val: str(val)).to_dict() + expected = {0: "0.3333333333333333"} + assert result == expected diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 68d6169fa4f34..f3ffdc373e178 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency + import pandas as pd from pandas import Series -from pandas.core.indexes.period import IncompatibleFrequency -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): @@ -46,6 +47,22 @@ def test_flex_method_equivalence(self, opname, ts): expected = alt(other, series) tm.assert_almost_equal(result, expected) + def test_flex_method_subclass_metadata_preservation(self, all_arithmetic_operators): + # GH 13208 + class MySeries(Series): + _metadata = ["x"] + + @property + def _constructor(self): + return MySeries + + opname = all_arithmetic_operators + op = getattr(Series, opname) + m = MySeries([1, 2, 3], name="test") + m.x = 42 + result = op(m, 1) + assert result.x == 42 + class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted @@ -171,6 +188,14 @@ def test_ser_cmp_result_names(self, names, op): result = op(ser, tdi) assert result.name == names[2] + # interval dtype + if op in [operator.eq, operator.ne]: + # interval dtype comparisons not yet implemented + ii = pd.interval_range(start=0, periods=5, name=names[0]) + ser = Series(ii).rename(names[1]) + result = op(ser, ii) + assert result.name == names[2] + # categorical if op in [operator.eq, operator.ne]: # categorical dtype comparisons raise for inequalities diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 238a413af7a31..239353d3955b4 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesCombine: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 20a83ec4cd162..c38e5708be09b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -26,8 +26,8 @@ period_range, timedelta_range, ) -from pandas.core.arrays import period_array -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import IntervalArray, period_array class TestSeriesConstructors: @@ -967,16 +967,34 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") tm.assert_series_equal(result, expected) - def test_construction_interval(self): + @pytest.mark.parametrize("interval_constructor", 
[IntervalIndex, IntervalArray]) + def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals - index = IntervalIndex.from_breaks(np.arange(3), closed="right") - result = Series(index) - repr(result) - str(result) - tm.assert_index_equal(Index(result.values), index) + intervals = interval_constructor.from_breaks(np.arange(3), closed="right") + result = Series(intervals) + assert result.dtype == "interval[int64]" + tm.assert_index_equal(Index(result.values), Index(intervals)) - result = Series(index.values) - tm.assert_index_equal(Index(result.values), index) + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_interval(self, data_constructor): + # GH 23563: consistent closed results in interval dtype + data = [pd.Interval(0, 1), pd.Interval(0, 2), None] + result = pd.Series(data_constructor(data)) + expected = pd.Series(IntervalArray(data)) + assert result.dtype == "interval[float64]" + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_interval_mixed_closed(self, data_constructor): + # GH 23563: mixed closed results in object dtype (not interval dtype) + data = [pd.Interval(0, 1, closed="both"), pd.Interval(0, 2, closed="neither")] + result = Series(data_constructor(data)) + assert result.dtype == object + assert result.tolist() == data def test_construction_consistency(self): @@ -993,17 +1011,16 @@ def test_construction_consistency(self): result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) - def test_constructor_infer_period(self): + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_period(self, data_constructor): data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] - result = pd.Series(data) + result = pd.Series(data_constructor(data)) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" - data = np.asarray(data, dtype=object) - tm.assert_series_equal(result, expected) - assert result.dtype == "Period[D]" - def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) @@ -1089,6 +1106,14 @@ def create_data(constructor): tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) + def test_constructor_mapping(self, non_mapping_dict_subclass): + # GH 29788 + ndm = non_mapping_dict_subclass({3: "three"}) + result = Series(ndm) + expected = Series(["three"], index=[3]) + + tm.assert_series_equal(result, expected) + def test_constructor_list_of_tuples(self): data = [(1, 1), (2, 2), (2, 3)] s = Series(data) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index f72206e42403c..885b5bf0476f2 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -11,7 +11,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def _check_accum_op(name, series, check_dtype=True): diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index aa56131f05570..b8be4ea137e3d 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -24,9 +24,9 @@ 
period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray import pandas.core.common as com -import pandas.util.testing as tm class TestSeriesDatetimeValues: diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 69e34a4d97006..a57ec2ba05d54 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -20,7 +20,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDtypes: @@ -465,13 +465,6 @@ def test_infer_objects_series(self): assert actual.dtype == "object" tm.assert_series_equal(actual, expected) - def test_is_homogeneous_type(self): - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - empty = Series() - assert empty._is_homogeneous_type - assert Series([1, 2])._is_homogeneous_type - assert Series(pd.Categorical([1, 2]))._is_homogeneous_type - @pytest.mark.parametrize( "data", [ diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 57d919ccb89ec..3513db6177951 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -2,8 +2,8 @@ import pytest from pandas import Categorical, Series +import pandas._testing as tm from pandas.core.construction import create_series_with_explicit_dtype -import pandas.util.testing as tm def test_nunique(): diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index efcb500a0b79f..4c817ed2e2d59 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm class TestSeriesInternals: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f53081ac53b01..510c11a51ca38 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.common import get_handle diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index c49cd6930781e..7b6d9210ed3d9 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -21,7 +21,7 @@ date_range, isna, ) -import pandas.util.testing as tm +import pandas._testing as tm def _simple_ts(start, end, freq="D"): @@ -457,9 +457,9 @@ def test_fillna_consistency(self): def test_where_sparse(self): # GH#17198 make sure we dont get an AttributeError for sp_index - ser = pd.Series(pd.SparseArray([1, 2])) + ser = pd.Series(pd.arrays.SparseArray([1, 2])) result = ser.where(ser >= 2, 0) - expected = pd.Series(pd.SparseArray([0, 2])) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) tm.assert_series_equal(result, expected) def test_datetime64tz_fillna_round_issue(self): @@ -1180,8 +1180,8 @@ def test_interpolate_index_values(self): def test_interpolate_non_ts(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) msg = ( - "time-weighted interpolation only works on Series or DataFrames" - " with a DatetimeIndex" + "time-weighted interpolation only works on Series or DataFrames " + "with a DatetimeIndex" ) with pytest.raises(ValueError, match=msg): s.interpolate(method="time") diff --git a/pandas/tests/series/test_operators.py 
b/pandas/tests/series/test_operators.py index 06fe64d69fb6b..bdd9f92d92d3f 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +import pandas._testing as tm from pandas.core import ops import pandas.core.nanops as nanops -import pandas.util.testing as tm class TestSeriesLogicalOps: diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 4aeb211170d8f..03fee389542e3 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Period, Series, period_range +import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.util.testing as tm class TestSeriesPeriod: diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 008ae50e4cde5..64a8c4569406e 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -15,7 +15,7 @@ period_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesRepr: diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 5e2d23a70e5be..73247bbf8b3d6 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSubclassing: diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index c3e5e8b975cda..a2d14f27d7b7a 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -21,7 +21,7 @@ timedelta_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, BMonthEnd @@ -137,7 +137,9 @@ def test_first_last_valid(self, datetime_series): assert ts.last_valid_index().freq == ts.index.freq def test_mpl_compat_hack(self, datetime_series): - result = datetime_series[:, np.newaxis] + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 multi-dimensional indexing deprecated + result = datetime_series[:, np.newaxis] expected = datetime_series.values[:, np.newaxis] tm.assert_almost_equal(result, expected) @@ -501,10 +503,7 @@ def test_between_time_raises(self): def test_between_time_types(self): # GH11818 rng = date_range("1/1/2000", "1/5/2000", freq="5min") - msg = ( - r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" - " to a time" - ) + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" with pytest.raises(ValueError, match=msg): rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) @@ -731,14 +730,12 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
ser = pd.Series(pd.date_range("2000", periods=2)) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - with tm.assert_produces_warning(None): - result = np.asarray(ser) + result = np.asarray(ser) tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -747,15 +744,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(ser) + result = np.asarray(ser, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype="M8[ns]") + result = np.asarray(ser, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -763,7 +757,6 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index 5e255e7cd5dcd..a363f927d10a9 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -11,8 +11,8 @@ from pandas._libs.tslibs import conversion, timezones from pandas import DatetimeIndex, Index, NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestSeriesTimezones: diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 120eaeaf785b0..ece7f1f21ab23 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,7 +5,8 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.arrays import SparseArray UNARY_UFUNCS = [np.positive, np.floor, np.exp] BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op @@ -33,7 +34,7 @@ def test_unary_ufunc(ufunc, sparse): array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype("int64", 0)) + array = SparseArray(array, dtype=pd.SparseDtype("int64", 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -51,8 +52,8 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) @@ -79,8 +80,8 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -110,8 +111,8 @@ def test_binary_ufunc_with_series( # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -149,7 +150,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): # * ufunc(Series, scalar) == ufunc(scalar, Series) array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) other = 2 series = pd.Series(array, name="name") @@ -183,8 +184,8 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -209,7 +210,7 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) series = pd.Series(array, name="name") result = np.modf(series) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 82f647c9385b2..2b46f86d49c5e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -30,11 +30,11 @@ Timestamp, compat, ) +import pandas._testing as tm from pandas.conftest import BYTES_DTYPES, STRING_DTYPES import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com -import pandas.util.testing as tm class TestFactorize: diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 12d834131f71b..ee006233c4c1b 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -9,7 +9,7 @@ import pytest from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm def import_module(name): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 9808c3d78b436..fadab5d821470 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -5,9 +5,9 @@ from numpy.random import randn import pytest +import pandas._testing as tm from pandas.core.api import DataFrame from pandas.core.computation import expressions as expr -import pandas.util.testing as tm _frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") _frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64") diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 8940a82b33777..129dc275c4d5a 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ 
-4,7 +4,7 @@ from pandas._libs import join as _join from pandas import Categorical, DataFrame, Index, merge -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexer: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 77841f0bb9f0d..f839aa198d03f 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -4,7 +4,7 @@ from pandas._libs import lib, writers as libwriters from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm class TestMisc: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 39c122addd8b1..5382ad84bcca2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm +import pandas._testing as tm AGG_FUNCTIONS = [ "sum", @@ -1359,6 +1359,30 @@ def test_mixed_depth_drop(self): ) tm.assert_frame_equal(expected, result) + def test_drop_multiindex_other_level_nan(self): + # GH 12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + def test_drop_nonunique(self): df = DataFrame( [ @@ -2286,6 +2310,14 @@ def test_sort_index_and_reconstruction_doc_example(self): tm.assert_frame_equal(result, expected) + def test_sort_index_non_existent_label_multiindex(self): + # GH 12261 + df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []])) + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic + assert result is True + def test_sort_index_reorder_on_ops(self): # 15687 df = DataFrame( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index c207c803510ca..2c5d028ebe42e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -11,9 +11,9 @@ import pandas as pd from pandas import Series, isna +import pandas._testing as tm from pandas.core.arrays import DatetimeArray import pandas.core.nanops as nanops -import pandas.util.testing as tm use_bn = nanops._USE_BOTTLENECK has_c16 = hasattr(np, "complex128") @@ -598,6 +598,14 @@ def test_nancorr_spearman(self): targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") + @td.skip_if_no_scipy + def test_invalid_method(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + with pytest.raises(ValueError, match=msg): + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") + def test_nancov(self): targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index cd154ed5fe570..ce527214e55e7 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -5,7 +5,7 @@ from pandas.compat._optional import VERSIONS, 
import_optional_dependency -import pandas.util.testing as tm +import pandas._testing as tm def test_import_optional(): diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 6b40ff8b3fa1e..08a5581886522 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @contextlib.contextmanager diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 90cd9cc3e006d..98297474243e4 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -6,6 +6,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series, array, concat, merge +import pandas._testing as tm from pandas.core.algorithms import safe_sort import pandas.core.common as com from pandas.core.sorting import ( @@ -15,7 +16,6 @@ lexsort_indexer, nargsort, ) -import pandas.util.testing as tm class TestSorting: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ae7ab6addc3fb..a92f917820bd0 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -8,8 +8,8 @@ from pandas._libs import lib from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna +import pandas._testing as tm import pandas.core.strings as strings -import pandas.util.testing as tm def assert_series_or_index_equal(left, right): @@ -3392,8 +3392,8 @@ def test_encode_decode_errors(self): encodeBase = Series(["a", "b", "a\x9d"]) msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to <undefined>" + r"'charmap' codec can't encode character '\\x9d' in position 1: " + "character maps to <undefined>" ) with pytest.raises(UnicodeEncodeError, match=msg): encodeBase.str.encode("cp1252") @@ -3406,8 +3406,8 @@ decodeBase = Series([b"a", b"b", b"a\x9d"]) msg = ( - "'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to <undefined>" + "'charmap' codec can't decode byte 0x9d in position 1: " + "character maps to <undefined>" ) with pytest.raises(UnicodeDecodeError, match=msg): decodeBase.str.decode("cp1252") diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index d2a9e1dc94bb5..465296a6f9e51 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -6,8 +6,8 @@ from pandas._libs.tslib import iNaT +import pandas._testing as tm import pandas.core.algorithms as algos -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 082277796e602..2fd39d5a7b703 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, to_numeric -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[None, "ignore", "raise", "coerce"]) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 250c37cdadbe4..c4660417599a8 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -8,8 +8,8 @@ from pandas.compat import is_platform_windows from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range +import pandas._testing as tm from pandas.core.tools.datetimes import to_datetime -import pandas.util.testing as tm import 
pandas.tseries.frequencies as frequencies import pandas.tseries.offsets as offsets diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index c122f92ed228c..5b4a7c74b1af1 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -3,7 +3,7 @@ import pytest from pandas import DatetimeIndex, offsets, to_datetime -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( AbstractHolidayCalendar, diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 7748b965f8962..a2c146dbd65e8 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -3,7 +3,7 @@ import pytest from pytz import utc -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( MO, diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 9ab722b866c76..5686119593e18 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import get_offset from pandas.tseries.offsets import FY5253, FY5253Quarter diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index fcabc0bee85b6..2f00a58fe80be 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -22,9 +22,9 @@ from pandas.compat.numpy import np_datetime64_compat from pandas.errors import PerformanceWarning -from pandas.core.indexes.datetimes import DatetimeIndex, _to_M8, date_range +import pandas._testing as tm +from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.series import Series -import pandas.util.testing as tm from pandas.io.pickle import read_pickle from pandas.tseries.frequencies import _get_offset, _offset_map @@ -81,17 +81,6 @@ class WeekDay: SUN = 6 -#### -# Misc function tests -#### - - -def test_to_M8(): - valb = datetime(2007, 10, 1) - valu = _to_M8(valb) - assert isinstance(valu, np.datetime64) - - ##### # DateOffset Tests ##### @@ -2792,8 +2781,8 @@ def test_apply_large_n(self): def test_apply_corner(self): msg = ( - "Only know how to combine trading day with datetime, datetime64" - " or timedelta" + "Only know how to combine trading day " + "with datetime, datetime64 or timedelta" ) with pytest.raises(ApplyTypeError, match=msg): CDay().apply(BMonthEnd()) diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 2914d4ddf0da0..297e5c3178379 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -8,7 +8,7 @@ import pytest from pandas import Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries import offsets from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 5cf2165993cd7..a40fcd725d604 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -9,7 +9,7 @@ from pandas.compat.numpy import 
np_array_datetime64_compat from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6c30e2b6c7a1c..2beeae85de683 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -8,7 +8,7 @@ from pandas._libs.tslibs import conversion, timezones, tzconversion from pandas import Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _compare_utc_to_local(tz_didx): diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index cd729956a027c..943f4207df543 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -2,7 +2,7 @@ from pandas._libs.tslibs import fields -import pandas.util.testing as tm +import pandas._testing as tm def test_fields_readonly(): diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 0bc30347b3fa9..36f7ada7326bf 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs.parsing import parse_time_string import pandas.util._test_decorators as td -import pandas.util.testing as tm +import pandas._testing as tm def test_parse_time_string(): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index f430e2893ca33..b8048891e4876 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def _assert_almost_equal_both(a, b, **kwargs): @@ -39,9 +39,7 @@ def _assert_not_almost_equal(a, b, **kwargs): """ try: tm.assert_almost_equal(a, b, **kwargs) - msg = ( - "{a} and {b} were approximately equal when they shouldn't have been" - ).format(a=a, b=b) + msg = f"{a} and {b} were approximately equal when they shouldn't have been" pytest.fail(msg=msg) except AssertionError: pass @@ -248,13 +246,12 @@ def test_assert_almost_equal_value_mismatch(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_almost_equal(a, b) diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 44400498ddc64..8957e7a172666 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -77,13 +77,11 @@ def test_categorical_equal_ordered_mismatch(): @pytest.mark.parametrize("obj", ["index", "foo", "pandas"]) def test_categorical_equal_object_override(obj): data = [1, 2, 3, 4] - msg = """{obj} are different + msg = f"""{obj} are different Attribute "ordered" are different \\[left\\]: False -\\[right\\]: True""".format( - obj=obj - ) +\\[right\\]: True""" c1 = Categorical(data, ordered=False) c2 = Categorical(data, 
ordered=True) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index cecf9273004d7..0547323b882f6 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,8 +1,8 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -96,7 +96,7 @@ def test_assert_extension_array_equal_non_extension_array(side): numpy_array = np.arange(5) extension_array = SparseArray(numpy_array) - msg = "{side} is not an ExtensionArray".format(side=side) + msg = f"{side} is not an ExtensionArray" args = ( (numpy_array, extension_array) if side == "left" diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index b46a8460a28b2..23c845f2b2795 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -80,7 +80,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. - msg = "{obj}.index are different".format(obj=obj_fixture) + msg = f"{obj_fixture}.index are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) else: @@ -95,7 +95,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): ], ) def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): - msg = "{obj} are different".format(obj=obj_fixture) + msg = f"{obj_fixture} are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, obj=obj_fixture) @@ -149,13 +149,11 @@ def test_empty_dtypes(check_dtype): def test_frame_equal_index_mismatch(obj_fixture): - msg = """{obj}\\.index are different + msg = f"""{obj_fixture}\\.index are different -{obj}\\.index values are different \\(33\\.33333 %\\) +{obj_fixture}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) @@ -165,13 +163,11 @@ def test_frame_equal_index_mismatch(obj_fixture): def test_frame_equal_columns_mismatch(obj_fixture): - msg = """{obj}\\.columns are different + msg = f"""{obj_fixture}\\.columns are different -{obj}\\.columns values are different \\(50\\.0 %\\) +{obj_fixture}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) @@ -181,13 +177,12 @@ def test_frame_equal_columns_mismatch(obj_fixture): def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): - msg = """{obj}\\.iloc\\[:, 1\\] are different + obj = 
obj_fixture + msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""".format( - obj=obj_fixture - ) +\\[right\\]: \\[4, 5, 7\\]""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) @@ -202,18 +197,18 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), - """{obj}\\.iloc\\[:, 1\\] are different + """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]""", ), ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """{obj}\\.iloc\\[:, 0\\] are different + """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different -{obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]""", ), diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 8c3f242f0c96b..bbbeebcec2569 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index, MultiIndex, NaT -import pandas.util.testing as tm +import pandas._testing as tm def test_index_equal_levels_mismatch(): @@ -135,11 +135,6 @@ def test_index_equal_level_values_mismatch(check_exact, check_less_precise): [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], ) def test_index_equal_names(name1, name2): - msg = """Index are different - -Attribute "names" are different -\\[left\\]: \\[{name1}\\] -\\[right\\]: \\[{name2}\\]""" idx1 = Index([1, 2, 3], name=name1) idx2 = Index([1, 2, 3], name=name2) @@ -149,7 +144,11 @@ def test_index_equal_names(name1, name2): else: name1 = "'x'" if name1 == "x" else name1 name2 = "'x'" if name2 == "x" else name2 - msg = msg.format(name1=name1, name2=name2) + msg = f"""Index are different + +Attribute "names" are different +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_index_equal(idx1, idx2) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index b264b484a04ab..96f2973a1528c 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import interval_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index 53bcedf3a16f1..c8ae9ebdd8651 100644 --- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def test_assert_numpy_array_equal_shape_mismatch(): @@ -28,13 
+28,11 @@ def test_assert_numpy_array_equal_bad_type(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(a, b) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index c681817896903..87765c909938d 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -2,7 +2,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm def f(): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0a6047c4662ba..eaf0824f52927 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical, DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm def _assert_series_equal_both(a, b, **kwargs): diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index 8fbc8037ed7c5..ee4f7e3f34f2e 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -4,7 +4,7 @@ from pandas.util._decorators import deprecate -import pandas.util.testing as tm +import pandas._testing as tm def new_func(): diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py index c17c48197ccf7..b165e9fba0e4f 100644 --- a/pandas/tests/util/test_deprecate_kwarg.py +++ b/pandas/tests/util/test_deprecate_kwarg.py @@ -2,7 +2,7 @@ from pandas.util._decorators import deprecate_kwarg -import pandas.util.testing as tm +import pandas._testing as tm @deprecate_kwarg("old", "new") diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index ebbdbd6c29842..c915edad4bb8e 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples from pandas.util import hash_array, hash_pandas_object -import pandas.util.testing as tm @pytest.fixture( @@ -353,3 +353,24 @@ def test_hash_collisions(): result = hash_array(np.asarray(hashes, dtype=object), "utf8") tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with 
pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3) diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 60124c8e943ad..6a19adef728e4 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -4,7 +4,7 @@ import pandas.compat as compat -import pandas.util.testing as tm +import pandas._testing as tm def test_rands(): diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index 1f1365d62c64e..746d859b3322e 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args @@ -22,10 +20,8 @@ def test_bad_arg_length_max_value_single(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -40,10 +36,8 @@ def test_bad_arg_length_max_value_multiple(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -54,15 +48,11 @@ def test_bad_arg_length_max_value_multiple(): def test_not_all_defaults(i): bad_arg = "foo" msg = ( - "the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + f"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 2 - compat_args["bar"] = -1 - compat_args["baz"] = 3 - + compat_args = {"foo": 2, "bar": -1, "baz": 3} arg_vals = (1, -1, 3) with pytest.raises(ValueError, match=msg): @@ -73,8 +63,5 @@ def test_validation(): # No exceptions should be raised. 
validate_args(_fname, (None,), 2, dict(out=None)) - compat_args = OrderedDict() - compat_args["axis"] = 1 - compat_args["out"] = None - + compat_args = {"axis": 1, "out": None} validate_args(_fname, (1, None), 2, compat_args) diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 6aa2088c07b5d..941ba86c61319 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args_and_kwargs @@ -17,10 +15,8 @@ def test_invalid_total_length_max_length_one(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -37,10 +33,8 @@ def test_invalid_total_length_max_length_multiple(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -52,13 +46,11 @@ def test_missing_args_or_kwargs(args, kwargs): bad_arg = "bar" min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = -5 - compat_args[bad_arg] = 1 + compat_args = {"foo": -5, bad_arg: 1} msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) with pytest.raises(ValueError, match=msg): @@ -68,11 +60,7 @@ def test_missing_args_or_kwargs(args, kwargs): def test_duplicate_argument(): min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = None - compat_args["bar"] = None - compat_args["baz"] = None - + compat_args = {"foo": None, "bar": None, "baz": None} kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" @@ -84,10 +72,7 @@ def test_duplicate_argument(): def test_validation(): # No exceptions should be raised. 
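Context for the OrderedDict removals in these validator tests: on Python 3.7+ insertion order is part of the plain-dict contract, so the dict literals feed the validate_* helpers the same key order the OrderedDict construction did. A standalone sketch of that equivalence (illustrative only, not part of the patch):

    from collections import OrderedDict

    compat_args = {"foo": 2, "bar": -1, "baz": 3}
    # insertion order is preserved by plain dicts on Python 3.7+
    assert list(compat_args) == ["foo", "bar", "baz"]
    # equality with the OrderedDict it replaces also holds
    assert compat_args == OrderedDict([("foo", 2), ("bar", -1), ("baz", 3)])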
- compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = None - compat_args["baz"] = -2 + compat_args = {"foo": 1, "bar": None, "baz": -2} kwargs = {"baz": -2} args = (1, None) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index 54b5c6ed034a2..a7b6d8f98cc60 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_bool_kwarg, validate_kwargs @@ -11,9 +9,7 @@ def test_bad_kwarg(): good_arg = "f" bad_arg = good_arg + "o" - compat_args = OrderedDict() - compat_args[good_arg] = "foo" - compat_args[bad_arg + "o"] = "bar" + compat_args = {good_arg: "foo", bad_arg + "o": "bar"} kwargs = {good_arg: "foo", bad_arg: "bar"} msg = fr"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'" @@ -26,14 +22,11 @@ def test_bad_kwarg(): def test_not_all_none(i): bad_arg = "foo" msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = "s" - compat_args["baz"] = None + compat_args = {"foo": 1, "bar": "s", "baz": None} kwarg_keys = ("foo", "bar", "baz") kwarg_vals = (2, "s", None) @@ -46,10 +39,7 @@ def test_not_all_none(i): def test_validation(): # No exceptions should be raised. - compat_args = OrderedDict() - compat_args["f"] = None - compat_args["b"] = 1 - compat_args["ba"] = "s" + compat_args = {"f": None, "b": 1, "ba": "s"} kwargs = dict(f=None, b=1) validate_kwargs(_fname, kwargs, compat_args) diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 1dfc0f34b2b8d..6aeada3152dbb 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -3,7 +3,8 @@ import numpy as np from numpy.random import randn -from pandas import DataFrame, Series, bdate_range +from pandas import DataFrame, Series, bdate_range, notna +import pandas._testing as tm N, K = 100, 10 @@ -21,3 +22,365 @@ def _create_data(self): self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(dtype=object), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 
9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # data is a tuple(object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +class ConsistencyBase(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def _test_moments_consistency_mock_mean(self, mean, mock_mean): + for (x, is_constant, no_nans) in self.data: + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr): + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else x.max().max() + + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + def _test_moments_consistency_var_debiasing_factors( + self, var_biased=None, var_unbiased=None, var_debiasing_factors=None + ): + for (x, is_constant, no_nans) in self.data: + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * 
var_debiasing_factors_x) + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + ): + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) + + +def ew_func(A, B, com, name, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + +def check_binary_ew(name, A, B): + + result = ew_func(A=A, B=B, com=20, name=name, min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + +def check_binary_ew_min_periods(name, min_periods, A, B): + # GH 7898 + result = ew_func(A, B, 20, name=name, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = ew_func(empty, empty, 50, name=name, min_periods=min_periods) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = ew_func( + Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods + ) 
+ tm.assert_series_equal(result, Series([np.NaN])) diff --git a/pandas/tests/window/moments/conftest.py b/pandas/tests/window/moments/conftest.py new file mode 100644 index 0000000000000..2002f4d0bff43 --- /dev/null +++ b/pandas/tests/window/moments/conftest.py @@ -0,0 +1,20 @@ +import numpy as np +from numpy.random import randn +import pytest + +from pandas import Series + + +@pytest.fixture +def binary_ew_data(): + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + return A, B + + +@pytest.fixture(params=[0, 1, 2]) +def min_periods(request): + return request.param diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py new file mode 100644 index 0000000000000..599761259e041 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -0,0 +1,439 @@ +import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat +import pandas._testing as tm +from pandas.tests.window.common import ( + Base, + ConsistencyBase, + check_binary_ew, + check_binary_ew_min_periods, + ew_func, +) + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestMoments(Base): + def setup_method(self, method): + self._create_data() + + def test_ewma(self): + self._check_ew(name="mean") + + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = vals.ewm(span=100, adjust=False).mean().sum() + assert np.abs(result - 1) < 1e-2 + + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + + s = Series([1.0, 2.0, 4.0, 8.0]) + + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) + + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + + def test_ewma_nan_handling(self): + s = Series([1.0] + [np.nan] * 5 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([1.0] * len(s))) + + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) + + # GH 7603 + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) + + def simple_wma(s, w): + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") + + for (s, adjust, ignore_na, w) in [ + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, 
True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: + expected = simple_wma(s, Series(w)) + result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=com, adjust=adjust).mean() + tm.assert_series_equal(result, expected) + + def test_ewmvar(self): + self._check_ew(name="var") + + def test_ewmvol(self): + self._check_ew(name="vol") + + def test_ewma_span_com_args(self): + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() + + def test_ewma_halflife_arg(self): + A = self.series.ewm(com=13.932726172912965).mean() + B = self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() + + def test_ewm_alpha(self): + # GH 10789 + s = Series(self.arr) + a = s.ewm(alpha=0.61722699889169674).mean() + b = s.ewm(com=0.62014947789973052).mean() + c = s.ewm(span=2.240298955799461).mean() + d = s.ewm(halflife=0.721792864318).mean() + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) + + def test_ewm_alpha_arg(self): + # GH 10789 + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) + + def test_ewm_domain_checks(self): + # GH 12492 + s = Series(self.arr) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) + s.ewm(com=0.0) + s.ewm(com=0.1) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) + s.ewm(span=1.0) + s.ewm(span=1.1) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) + s.ewm(halflife=0.1) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) + s.ewm(alpha=0.1) + s.ewm(alpha=1.0) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) + + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) + def test_ew_empty_series(self, method): + vals = pd.Series([], dtype=np.float64) + + ewm = vals.ewm(3) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) + + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == 
DataFrame
+
+        result = getattr(self.series.ewm(com=10), name)()
+        if preserve_nan:
+            assert result[self._nan_locs].isna().all()
+
+    @pytest.mark.parametrize("min_periods", [0, 1])
+    @pytest.mark.parametrize("name", ["mean", "var", "vol"])
+    def test_ew_min_periods(self, min_periods, name):
+        # excluding NaNs correctly
+        arr = randn(50)
+        arr[:10] = np.NaN
+        arr[-10:] = np.NaN
+        s = Series(arr)
+
+        # check min_periods
+        # GH 7898
+        result = getattr(s.ewm(com=50, min_periods=2), name)()
+        assert result[:11].isna().all()
+        assert not result[11:].isna().any()
+
+        result = getattr(s.ewm(com=50, min_periods=min_periods), name)()
+        if name == "mean":
+            assert result[:10].isna().all()
+            assert not result[10:].isna().any()
+        else:
+            # ewm.std, ewm.vol, ewm.var (with bias=False) require at least
+            # two values
+            assert result[:11].isna().all()
+            assert not result[11:].isna().any()
+
+        # check series of length 0
+        result = getattr(
+            Series(dtype=object).ewm(com=50, min_periods=min_periods), name
+        )()
+        tm.assert_series_equal(result, Series(dtype="float64"))
+
+        # check series of length 1
+        result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)()
+        if name == "mean":
+            tm.assert_series_equal(result, Series([1.0]))
+        else:
+            # ewm.std, ewm.vol, ewm.var with bias=False require at least
+            # two values
+            tm.assert_series_equal(result, Series([np.NaN]))
+
+        # pass in ints
+        result2 = getattr(Series(np.arange(50)).ewm(span=10), name)()
+        assert result2.dtype == np.float_
+
+
+class TestEwmMomentsConsistency(ConsistencyBase):
+    def setup_method(self, method):
+        self._create_data()
+
+    def test_ewmcov_pairwise(self):
+        self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5)
+
+    @pytest.mark.parametrize("name", ["cov", "corr"])
+    def test_ewm_corr_cov(self, name, min_periods, binary_ew_data):
+        A, B = binary_ew_data
+
+        # exercise both parametrized names ("cov" and "corr")
+        check_binary_ew(name=name, A=A, B=B)
+        check_binary_ew_min_periods(name, min_periods, A, B)
+
+    def test_ewmcorr_pairwise(self):
+        self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5)
+
+    @pytest.mark.parametrize("name", ["cov", "corr"])
+    def test_different_input_array_raise_exception(self, name, binary_ew_data):
+        A, _ = binary_ew_data
+        msg = "Input arrays must be of the same type!"
+        # pandas raises a bare Exception here, so match on the message
+        with pytest.raises(Exception, match=msg):
+            ew_func(A, randn(50), 20, name=name, min_periods=5)
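The test_ewm_consistency case below rebuilds ewm().mean() from explicit decay weights. For NaN-free input with adjust=True, that reference computation reduces to the usual weighted average y_t = sum_i (1 - alpha)**i * x_{t-i} / sum_i (1 - alpha)**i, with alpha = 1 / (1 + com). A minimal standalone sketch of that identity (ewma_adjusted is an illustrative name of ours, not part of the patch):

    import numpy as np
    import pandas as pd

    def ewma_adjusted(x, com):
        # weight (1 - alpha)**i on the i-th most recent observation,
        # normalised by the running sum of the weights
        alpha = 1.0 / (1.0 + com)
        w = (1.0 - alpha) ** np.arange(len(x))
        num = np.array([np.dot(x[: t + 1][::-1], w[: t + 1]) for t in range(len(x))])
        return num / w.cumsum()

    s = pd.Series([1.0, 2.0, 4.0, 8.0])
    np.testing.assert_allclose(
        ewma_adjusted(s.to_numpy(), com=3.0),
        s.ewm(com=3.0, adjust=True).mean().to_numpy(),
    )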
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4])
+    @pytest.mark.parametrize("adjust", [True, False])
+    @pytest.mark.parametrize("ignore_na", [True, False])
+    def test_ewm_consistency(self, min_periods, adjust, ignore_na):
+        def _weights(s, com, adjust, ignore_na):
+            if isinstance(s, DataFrame):
+                if not len(s.columns):
+                    return DataFrame(index=s.index, columns=s.columns)
+                w = concat(
+                    [
+                        _weights(
+                            s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na
+                        )
+                        for i, _ in enumerate(s.columns)
+                    ],
+                    axis=1,
+                )
+                w.index = s.index
+                w.columns = s.columns
+                return w
+
+            w = Series(np.nan, index=s.index)
+            alpha = 1.0 / (1.0 + com)
+            if ignore_na:
+                w[s.notna()] = _weights(
+                    s[s.notna()], com=com, adjust=adjust, ignore_na=False
+                )
+            elif adjust:
+                for i in range(len(s)):
+                    # s.iat[i] == s.iat[i] is False only for NaN
+                    if s.iat[i] == s.iat[i]:
+                        w.iat[i] = pow(1.0 / (1.0 - alpha), i)
+            else:
+                sum_wts = 0.0
+                prev_i = -1
+                for i in range(len(s)):
+                    # s.iat[i] == s.iat[i] is False only for NaN
+                    if s.iat[i] == s.iat[i]:
+                        if prev_i == -1:
+                            w.iat[i] = 1.0
+                        else:
+                            w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i)
+                        sum_wts += w.iat[i]
+                        prev_i = i
+            return w
+
+        def _variance_debiasing_factors(s, com, adjust, ignore_na):
+            weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
+            cum_sum = weights.cumsum().fillna(method="ffill")
+            cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill")
+            numerator = cum_sum * cum_sum
+            denominator = numerator - cum_sum_sq
+            denominator[denominator <= 0.0] = np.nan
+            return numerator / denominator
+
+        def _ewma(s, com, min_periods, adjust, ignore_na):
+            weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
+            result = (
+                s.multiply(weights)
+                .cumsum()
+                .divide(weights.cumsum())
+                .fillna(method="ffill")
+            )
+            result[
+                s.expanding().count() < (max(min_periods, 1) if min_periods else 1)
+            ] = np.nan
+            return result
+
+        com = 3.0
+        self._test_moments_consistency_mock_mean(
+            mean=lambda x: x.ewm(
+                com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na
+            ).mean(),
+            mock_mean=lambda x: _ewma(
+                x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na
+            ),
+        )
+
+        self._test_moments_consistency_is_constant(
+            min_periods=min_periods,
+            count=lambda x: x.expanding().count(),
+            mean=lambda x: x.ewm(
+                com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na
+            ).mean(),
+            corr=lambda x, y: x.ewm(
+                com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na
+            ).corr(y),
+        )
+
+        self._test_moments_consistency_var_debiasing_factors(
+            var_unbiased=lambda x: (
+                x.ewm(
+                    com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na
).var(bias=False) + ), + std_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), + cov_unbiased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), + ) diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py new file mode 100644 index 0000000000000..4596552d8f255 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -0,0 +1,409 @@ +import warnings + +import numpy as np +from numpy.random import randn +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series, isna, notna +import pandas._testing as tm +from pandas.tests.window.common import ConsistencyBase + + +class TestExpandingMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def test_expanding_apply_args_kwargs(self, raw): + def mean_w_arg(x, const): + return np.mean(x) + const + + df = DataFrame(np.random.rand(20, 3)) + + expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = A.expanding().corr(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = self.series.expanding().count() + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) + + def test_expanding_quantile(self): + result = self.series.expanding().quantile(0.5) + + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) + + tm.assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = A.expanding().cov(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_cov_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_corr_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().cov(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], 
index=[0, 2, 3]) + result = s1.expanding().cov(s2) + expected = Series([None, None, None, 4.5]) + tm.assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().corr(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().corr(s2) + expected = Series([None, None, None, 1.0]) + tm.assert_series_equal(result, expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) + def test_expanding_func(self, func, static_comp, has_min_periods): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + return getattr(exp, func)() + + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + self._check_expanding_has_min_periods( + expanding_func, static_comp, has_min_periods + ) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + def test_expanding_apply(self, raw, has_min_periods): + def 
expanding_mean(x, min_periods=1): + + exp = x.expanding(min_periods=min_periods) + result = exp.apply(lambda x: x.mean(), raw=raw) + return result + + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) + self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) + + def test_expanding_apply_empty_series(self, raw): + ser = Series([], dtype=np.float64) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + + def test_expanding_apply_min_periods_0(self, raw): + # GH 8080 + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def _check_expanding(self, func, static_comp, preserve_nan=True): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) + + if preserve_nan: + assert result.iloc[self._nan_locs].isna().all() + + def _check_expanding_has_min_periods(self, func, static_comp, has_min_periods): + ser = Series(randn(50)) + + if has_min_periods: + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + # min_periods is working correctly + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) + + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) + assert isna(result[3]) + assert notna(result[4]) + + # min_periods=0 + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + ], + ) + def test_moment_functions_zero_length(self, f): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.parametrize( + "f", + [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + ], + ) + def test_moment_functions_zero_length_pairwise(self, f): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), 
index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) + ) + df2_expected = DataFrame( + index=MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different expanding_* moments + self._test_moments_consistency_mock_mean( + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + var_debiasing_factors=lambda x: ( + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), + ) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + expanding_f = getattr(x.expanding(min_periods=min_periods), name) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) + else: + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) + else: + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(expanding_f_result, expanding_apply_f_result) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/moments/test_moments_rolling.py 
similarity index 55% rename from pandas/tests/window/test_moments.py rename to pandas/tests/window/moments/test_moments_rolling.py index b1c5fc429cc03..9acb4ffcb40b8 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -9,10 +9,10 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, concat, isna, notna +from pandas import DataFrame, Index, Series, isna, notna +import pandas._testing as tm from pandas.core.window.common import _flex_binary_moment -from pandas.tests.window.common import Base -import pandas.util.testing as tm +from pandas.tests.window.common import Base, ConsistencyBase import pandas.tseries.offsets as offsets @@ -915,400 +915,6 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_series_equal(series_xp, series_rs) tm.assert_frame_equal(frame_xp, frame_rs) - def test_ewma(self): - self._check_ew(name="mean") - - vals = pd.Series(np.zeros(1000)) - vals[5] = 1 - result = vals.ewm(span=100, adjust=False).mean().sum() - assert np.abs(result - 1) < 1e-2 - - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewma_cases(self, adjust, ignore_na): - # try adjust/ignore_na args matrix - - s = Series([1.0, 2.0, 4.0, 8.0]) - - if adjust: - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - else: - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - - result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() - tm.assert_series_equal(result, expected) - - def test_ewma_nan_handling(self): - s = Series([1.0] + [np.nan] * 5 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.0] * len(s))) - - s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha = 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - ( - s2, - True, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], - ), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), - ( - s2, - False, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], - ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), - ( - s3, - False, - False, - [ - (1.0 - alpha) ** 3, - np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), - ], - ), - ( - s3, - False, - True, - [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], - ), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - - 
tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - def test_ewmvar(self): - self._check_ew(name="var") - - def test_ewmvol(self): - self._check_ew(name="vol") - - def test_ewma_span_com_args(self): - A = self.series.ewm(com=9.5).mean() - B = self.series.ewm(span=20).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): - self.series.ewm().mean() - - def test_ewma_halflife_arg(self): - A = self.series.ewm(com=13.932726172912965).mean() - B = self.series.ewm(halflife=10.0).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm() - - def test_ewm_alpha(self): - # GH 10789 - s = Series(self.arr) - a = s.ewm(alpha=0.61722699889169674).mean() - b = s.ewm(com=0.62014947789973052).mean() - c = s.ewm(span=2.240298955799461).mean() - d = s.ewm(halflife=0.721792864318).mean() - tm.assert_series_equal(a, b) - tm.assert_series_equal(a, c) - tm.assert_series_equal(a, d) - - def test_ewm_alpha_arg(self): - # GH 10789 - s = self.series - with pytest.raises(ValueError): - s.ewm() - with pytest.raises(ValueError): - s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(halflife=10.0, alpha=0.5) - - def test_ewm_domain_checks(self): - # GH 12492 - s = Series(self.arr) - msg = "comass must satisfy: comass >= 0" - with pytest.raises(ValueError, match=msg): - s.ewm(com=-0.1) - s.ewm(com=0.0) - s.ewm(com=0.1) - - msg = "span must satisfy: span >= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(span=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.0) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.9) - s.ewm(span=1.0) - s.ewm(span=1.1) - - msg = "halflife must satisfy: halflife > 0" - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=0.0) - s.ewm(halflife=0.1) - - msg = "alpha must satisfy: 0 < alpha <= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=0.0) - s.ewm(alpha=0.1) - s.ewm(alpha=1.0) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=1.1) - - @pytest.mark.parametrize("method", ["mean", "vol", "var"]) - def test_ew_empty_series(self, method): - vals = pd.Series([], dtype=np.float64) - - ewm = vals.ewm(3) - result = getattr(ewm, method)() - tm.assert_almost_equal(result, vals) - - def _check_ew(self, name=None, preserve_nan=False): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(self.series.ewm(com=10), name)() - if preserve_nan: - assert result[self._nan_locs].isna().all() - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - s = Series(arr) - - # check min_periods - # GH 7898 - result = getattr(s.ewm(com=50, min_periods=2), name)() - assert result[:11].isna().all() - assert not result[11:].isna().any() - - for 
min_periods in (0, 1): - result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == "mean": - assert result[:10].isna().all() - assert not result[10:].isna().any() - else: - # ewm.std, ewm.vol, ewm.var (with bias=False) require at least - # two values - assert result[:11].isna().all() - assert not result[11:].isna().any() - - # check series of length 0 - result = getattr( - Series(dtype=object).ewm(com=50, min_periods=min_periods), name - )() - tm.assert_series_equal(result, Series(dtype="float64")) - - # check series of length 1 - result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() - if name == "mean": - tm.assert_series_equal(result, Series([1.0])) - else: - # ewm.std, ewm.vol, ewm.var with bias=False require at least - # two values - tm.assert_series_equal(result, Series([np.NaN])) - - # pass in ints - result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ - - -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(dtype=object), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.0]), - Series([np.nan, 3.0]), - Series([3.0, np.nan]), - Series([1.0, 3.0]), - Series([2.0, 2.0]), - Series([3.0, 1.0]), - Series( - [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] - ), - Series( - [ - np.nan, - 5.0, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - np.nan, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series(range(10)), - Series(range(20, 0, -2)), - ] - - def create_dataframes(): - return [ - DataFrame(), - DataFrame(columns=["a"]), - DataFrame(columns=["a", "a"]), - DataFrame(columns=["a", "b"]), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel() - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - - # data is a tuple(object, is_constant, no_nans) - data = create_series() + create_dataframes() - - return [(x, is_constant(x), no_nans(x)) for x in data] - - -_consistency_data = _create_consistency_data() - def _rolling_consistency_cases(): for window in [1, 2, 3, 10, 20]: @@ -1319,363 +925,10 @@ def _rolling_consistency_cases(): yield window, min_periods, center -class TestMomentsConsistency(Base): - base_functions = [ - (lambda v: Series(v).count(), None, "count"), - (lambda v: Series(v).max(), None, "max"), - (lambda v: Series(v).min(), None, "min"), - (lambda v: Series(v).sum(), None, "sum"), - (lambda v: Series(v).mean(), None, "mean"), - (lambda v: Series(v).std(), 1, "std"), - (lambda v: Series(v).cov(Series(v)), None, "cov"), - (lambda v: Series(v).corr(Series(v)), None, "corr"), - (lambda v: Series(v).var(), 1, 
"var"), - # restore once GH 8086 is fixed - # lambda v: Series(v).skew(), 3, 'skew'), - # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed - # lambda v: Series(v).quantile(0.3), None, 'quantile'), - (lambda v: Series(v).median(), None, "median"), - (np.nanmax, 1, "max"), - (np.nanmin, 1, "min"), - (np.nansum, 1, "sum"), - (np.nanmean, 1, "mean"), - (lambda v: np.nanstd(v, ddof=1), 1, "std"), - (lambda v: np.nanvar(v, ddof=1), 1, "var"), - (np.nanmedian, 1, "median"), - ] - no_nan_functions = [ - (np.max, None, "max"), - (np.min, None, "min"), - (np.sum, None, "sum"), - (np.mean, None, "mean"), - (lambda v: np.std(v, ddof=1), 1, "std"), - (lambda v: np.var(v, ddof=1), 1, "var"), - (np.median, None, "median"), - ] - - def _create_data(self): - super()._create_data() - self.data = _consistency_data - +class TestRollingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def _test_moments_consistency( - self, - min_periods, - count, - mean, - mock_mean, - corr, - var_unbiased=None, - std_unbiased=None, - cov_unbiased=None, - var_biased=None, - std_biased=None, - cov_biased=None, - var_debiasing_factors=None, - ): - def _non_null_values(x): - values = x.values.ravel() - return set(values[notna(values)].tolist()) - - for (x, is_constant, no_nans) in self.data: - count_x = count(x) - mean_x = mean(x) - - if mock_mean: - # check that mean equals mock_mean - expected = mock_mean(x) - tm.assert_equal(mean_x, expected.astype("float64")) - - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = corr(x, x) - - # assert _non_null_values(corr_x_x).issubset(set([1.])) - # restore once rolling_cov(x, x) is identically equal to var(x) - - if is_constant: - exp = x.max() if isinstance(x, Series) else x.max().max() - - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) - - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) - - if var_unbiased and var_biased and var_debiasing_factors: - # check variance debiasing factors - var_unbiased_x = var_unbiased(x) - var_biased_x = var_biased(x) - var_debiasing_factors_x = var_debiasing_factors(x) - tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - - for (std, var, cov) in [ - (std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased), - ]: - - # check that var(x), std(x), and cov(x) are all >= 0 - var_x = var(x) - std_x = std(x) - assert not (var_x < 0).any().any() - assert not (std_x < 0).any().any() - if cov: - cov_x_x = cov(x, x) - assert not (cov_x_x < 0).any().any() - - # check that var(x) == cov(x, x) - tm.assert_equal(var_x, cov_x_x) - - # check that var(x) == std(x)^2 - tm.assert_equal(var_x, std_x * std_x) - - if var is var_biased: - # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = mean(x * x) - tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) - - if is_constant: - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if var is var_unbiased: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) - - if isinstance(x, Series): - for (y, is_constant, no_nans) in self.data: - if not x.isna().equals(y.isna()): - # can only easily test two Series with similar - # structure - continue - - # check that cor(x, y) is symmetric - 
corr_x_y = corr(x, y) - corr_y_x = corr(y, x) - tm.assert_equal(corr_x_y, corr_y_x) - - if cov: - # check that cov(x, y) is symmetric - cov_x_y = cov(x, y) - cov_y_x = cov(y, x) - tm.assert_equal(cov_x_y, cov_y_x) - - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - var_x_plus_y = var(x + y) - var_y = var(y) - tm.assert_equal( - cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) - ) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - std_y = std(y) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if cov is cov_biased: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_y = mean(y) - mean_x_times_y = mean(x * y) - tm.assert_equal( - cov_x_y, mean_x_times_y - (mean_x * mean_y) - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewm_consistency(self, min_periods, adjust, ignore_na): - def _weights(s, com, adjust, ignore_na): - if isinstance(s, DataFrame): - if not len(s.columns): - return DataFrame(index=s.index, columns=s.columns) - w = concat( - [ - _weights( - s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na - ) - for i, _ in enumerate(s.columns) - ], - axis=1, - ) - w.index = s.index - w.columns = s.columns - return w - - w = Series(np.nan, index=s.index) - alpha = 1.0 / (1.0 + com) - if ignore_na: - w[s.notna()] = _weights( - s[s.notna()], com=com, adjust=adjust, ignore_na=False - ) - elif adjust: - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1.0 / (1.0 - alpha), i) - else: - sum_wts = 0.0 - prev_i = -1 - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - if prev_i == -1: - w.iat[i] = 1.0 - else: - w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) - sum_wts += w.iat[i] - prev_i = i - return w - - def _variance_debiasing_factors(s, com, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method="ffill") - cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") - numerator = cum_sum * cum_sum - denominator = numerator - cum_sum_sq - denominator[denominator <= 0.0] = np.nan - return numerator / denominator - - def _ewma(s, com, min_periods, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = ( - s.multiply(weights) - .cumsum() - .divide(weights.cumsum()) - .fillna(method="ffill") - ) - result[ - s.expanding().count() < (max(min_periods, 1) if min_periods else 1) - ] = np.nan - return result - - com = 3.0 - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean(), - mock_mean=lambda x: _ewma( - x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ), - corr=lambda x, y: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(y), - var_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=False) - ), - std_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=False) - ), - cov_unbiased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=False) - ), - var_biased=lambda x: ( - x.ewm( - com=com, 
min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=True) - ), - std_biased=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=True) - ), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors( - x, com=com, adjust=adjust, ignore_na=ignore_na - ) - ), - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - def test_expanding_consistency(self, min_periods): - - # suppress warnings about empty slices, as we are deliberately testing - # with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding(min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() - / x.expanding().count(), - corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( - y, ddof=0 - ), - var_debiasing_factors=lambda x: ( - x.expanding().count() - / (x.expanding().count() - 1.0).replace(0.0, np.nan) - ), - ) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - expanding_f = getattr(x.expanding(min_periods=min_periods), name) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding(min_periods=0).apply( - func=f, raw=True - ) - else: - if name in ["cov", "corr"]: - expanding_f_result = expanding_f(pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(expanding_f_result, expanding_apply_f_result) - @pytest.mark.slow @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) @@ -1692,9 +945,7 @@ def test_rolling_consistency(self, window, min_periods, center): ) # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + self._test_moments_consistency_mock_mean( mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1709,6 +960,53 @@ def test_rolling_consistency(self, window, min_periods, center): ).count() ) ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda 
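The mock_mean used for the expanding case above is just expanding sum divided by expanding count. A short sketch of that identity on a series with a missing value (illustrative, assuming pandas 1.0 NaN-skipping semantics):

import numpy as np
import pandas as pd

x = pd.Series([1.0, np.nan, 3.0, 4.0])
mean_x = x.expanding(min_periods=1).mean()
mock = x.expanding(min_periods=1).sum() / x.expanding().count()
pd.testing.assert_series_equal(mean_x, mock)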
x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), + corr=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), + var_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), + var_debiasing_factors=lambda x: ( + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) + + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), corr=lambda x, y: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1744,15 +1042,6 @@ def test_rolling_consistency(self, window, min_periods, center): window=window, min_periods=min_periods, center=center ).cov(y, ddof=0) ), - var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center) - .count() - .divide( - (x.rolling(window=window, center=center).count() - 1.0).replace( - 0.0, np.nan - ) - ) - ), ) # test consistency between rolling_xyz() and either (a) @@ -1835,22 +1124,12 @@ def test_rolling_corr_with_zero_variance(self, window): assert s.rolling(window=window).corr(other=other).isna().all() - def _check_pairwise_moment(self, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(self.frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(result, expected, check_names=False) - def test_flex_binary_moment(self): # GH3155 # don't blow the stack msg = ( - "arguments to moment function must be of type" - " np.ndarray/Series/DataFrame" + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" ) with pytest.raises(TypeError, match=msg): _flex_binary_moment(5, 6, None) @@ -1905,156 +1184,6 @@ def test_flex_binary_frame(self, method): ) tm.assert_frame_equal(res3, exp) - def test_ewmcov(self): - self._check_binary_ew("cov") - - def test_ewmcov_pairwise(self): - self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) - - def test_ewmcorr(self): - self._check_binary_ew("corr") - - def test_ewmcorr_pairwise(self): - self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) - - def _check_binary_ew(self, name): - def func(A, B, com, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - A = Series(randn(50), index=np.arange(50)) - B = A[2:] + randn(48) - - A[:10] = np.NaN - B[-10:] = np.NaN - - result = func(A, B, 20, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not np.isnan(result.values[14:]).any() - - # GH 7898 - for min_periods in (0, 1, 2): - result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - empty = Series([], dtype=np.float64) - result = func(empty, empty, 50, min_periods=min_periods) - 
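The var_debiasing_factors lambda above encodes the usual n/(n-1) correction, with n the per-window count of non-NaN values. A minimal sketch of the two identities the suite asserts, unbiased var == biased var * factor and var == std**2 (illustrative only):

import numpy as np
import pandas as pd

x = pd.Series([2.0, 4.0, np.nan, 8.0, 16.0])
n = x.rolling(window=3).count()
factors = n / (n - 1.0).replace(0.0, np.nan)

unbiased = x.rolling(window=3, min_periods=2).var()        # ddof=1
biased = x.rolling(window=3, min_periods=2).var(ddof=0)
pd.testing.assert_series_equal(unbiased, biased * factors)
pd.testing.assert_series_equal(unbiased, x.rolling(window=3, min_periods=2).std() ** 2)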
tm.assert_series_equal(result, empty) - - # check series of length 1 - result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([np.NaN])) - - msg = "Input arrays must be of the same type!" - # exception raised is Exception - with pytest.raises(Exception, match=msg): - func(A, randn(50), 20, min_periods=5) - - def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): - return np.mean(x) + const - - df = DataFrame(np.random.rand(20, 3)) - - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) - tm.assert_frame_equal(result, expected) - - result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr(self): - A = self.series.dropna() - B = (A + randn(len(A)))[:-5] - - result = A.expanding().corr(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_count(self): - result = self.series.expanding().count() - tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() - ) - - def test_expanding_quantile(self): - result = self.series.expanding().quantile(0.5) - - rolling_result = self.series.rolling( - window=len(self.series), min_periods=1 - ).quantile(0.5) - - tm.assert_almost_equal(result, rolling_result) - - def test_expanding_cov(self): - A = self.series - B = (A + randn(len(A)))[:-5] - - result = A.expanding().cov(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_cov_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_corr_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_cov_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().cov(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) - tm.assert_series_equal(result, expected) - - def test_expanding_corr_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().corr(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) - tm.assert_series_equal(result, expected) - def test_rolling_cov_diff_length(self): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) @@ -2082,8 +1211,8 @@ def test_rolling_corr_diff_length(self): @pytest.mark.parametrize( "f", [ - lambda x: (x.rolling(window=10, 
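The removed test_expanding_count/quantile/cov/corr tests all rely on the same equivalence: an expanding window is a rolling window whose length equals the whole series. A compact sketch of that pattern (illustrative):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype="float64"))
result = s.expanding().quantile(0.5)
expected = s.rolling(window=len(s), min_periods=1).quantile(0.5)
pd.testing.assert_series_equal(result, expected)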
min_periods=5).cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2136,154 +1265,6 @@ def test_rolling_functions_window_non_shrinkage_binary(self): df_result = f(df) tm.assert_frame_equal(df_result, df_expected) - def test_moment_functions_zero_length(self): - # GH 8056 - s = Series(dtype=np.float64) - s_expected = s - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=["a"]) - df2["a"] = df2["a"].astype("float64") - df2_expected = df2 - - functions = [ - lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum, raw=False), - lambda x: x.expanding(min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - except (ImportError): - - # scipy needed for rolling_window - continue - - def test_moment_functions_zero_length_pairwise(self): - - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) - df2["a"] = df2["a"].astype("float64") - - df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([]), - ) - df2_expected = DataFrame( - index=pd.MultiIndex.from_product( - [df2.index, df2.columns], names=["bar", "foo"] - ), - columns=Index(["a"], name="foo"), - dtype="float64", - ) - - functions = [ - lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), - lambda x: 
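The zero-length tests being moved here pin down a simple contract (GH 8056): window operations on an empty float Series return an equally empty result rather than raising. A one-case sketch:

import pandas as pd

s = pd.Series(dtype="float64")
result = s.rolling(window=10, min_periods=5).mean()
pd.testing.assert_series_equal(result, s)   # empty in, empty out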
(x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - - def test_expanding_cov_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) - df1a = DataFrame( - [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") - ) - # TODO: xref gh-15826 - # .loc is not preserving the names - result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] - result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(["A", "B"], name="foo"), - index=Index(["X", "Y"], name="foo"), - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_expanding_corr_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], - columns=["A", "B"], - index=Index(range(3), name="bar"), - ) - df1a = DataFrame( - [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], - columns=["X", "Y"], - index=Index(range(3), name="bar"), - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] - ) - result1 = df1.expanding().corr(df2, pairwise=True).loc[2] - result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] - result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) @@ -2334,83 +1315,6 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize( - "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], - ids=["sum", "mean", "max", "min"], - ) - def test_expanding_func(self, func, static_comp): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, center=center, axis=axis) - return getattr(exp, func)() - - self._check_expanding(expanding_func, static_comp, preserve_nan=False) - - def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): - - exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) - return result - - # TODO(jreback), needed to add preserve_nan=False - # here to make this pass - self._check_expanding(expanding_mean, np.mean, preserve_nan=False) - - ser = Series([], dtype=np.float64) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) - - # GH 8080 - s = Series([None, None, None]) - result 
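The pairwise expectations above assume the MultiIndexed layout pandas uses for pairwise results: one (row, column) block per original row. A sketch of that shape on a non-empty frame (illustrative, assuming pandas >= 0.25 pairwise output):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(6, 2), columns=["A", "B"])
result = df.expanding(min_periods=2).cov(pairwise=True)

assert isinstance(result.index, pd.MultiIndex)
assert result.shape == (len(df) * len(df.columns), len(df.columns))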
= s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(result, expected) - - def _check_expanding( - self, - func, - static_comp, - has_min_periods=True, - has_time_rule=True, - preserve_nan=True, - ): - - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert isinstance(frame_result, DataFrame) - - result = func(self.series) - tm.assert_almost_equal(result[10], static_comp(self.series[:11])) - - if preserve_nan: - assert result.iloc[self._nan_locs].isna().all() - - ser = Series(randn(50)) - - if has_min_periods: - result = func(ser, min_periods=30) - assert result[:29].isna().all() - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - # min_periods is working correctly - result = func(ser, min_periods=15) - assert isna(result.iloc[13]) - assert notna(result.iloc[14]) - - ser2 = Series(randn(20)) - result = func(ser2, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - # min_periods=0 - result0 = func(ser, min_periods=0) - result1 = func(ser, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = func(ser) - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - def test_rolling_max_gh6297(self): """Replicate result expected in GH #6297""" @@ -2532,3 +1436,76 @@ def test_rolling_min_max_numeric_types(self): assert result.dtypes[0] == np.dtype("f8") result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") + + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + functions = [ + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] + for f in functions: + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([]), + ) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], 
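The GH 8080 case above checks that with min_periods=0 the applied function sees every window, even when the data is all-NaN; with raw=True the raw ndarray (NaNs included) is passed, so len reports the growing window size. A standalone sketch:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, np.nan, np.nan])
result = s.expanding(min_periods=0).apply(len, raw=True)
pd.testing.assert_series_equal(result, pd.Series([1.0, 2.0, 3.0]))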
names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + + for f in functions: + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5085576cc96f0..5e70e13209de5 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -7,9 +7,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestApi(Base): diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 4b56cbd48c388..7132e64c1191c 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -4,7 +4,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("bad_raw", [None, 1, 0]) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 6a3f2c19babdc..606520c6d68ca 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -2,9 +2,9 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.api.indexers import BaseIndexer from pandas.core.window.indexers import ExpandingIndexer -import pandas.util.testing as tm def test_bad_get_window_bounds_signature(): diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 9d023034c570a..b1c9b66ab09d3 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.base import DataError -import pandas.util.testing as tm # gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 098acdff93ac6..fc4bd50f25c73 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.window import Expanding from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestExpanding(Base): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 189942bc07d2a..355ef3a90d424 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby -import pandas.util.testing as tm class TestGrouperGrouping: diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 2fbf05f678431..cc8aef1779b46 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -4,7 +4,7 @@ import pandas.util._test_decorators as td from pandas import Series -import pandas.util.testing as tm 
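The expected frames in the pairwise zero-length test are built with MultiIndex.from_product, which propagates level names even when one level is empty. A small sketch of that property:

import pandas as pd

df = pd.DataFrame(columns=pd.Index(["a"], name="foo"), index=pd.Index([], name="bar"))
idx = pd.MultiIndex.from_product([df.index, df.columns], names=["bar", "foo"])

assert len(idx) == 0                       # empty level -> empty product
assert list(idx.names) == ["bar", "foo"]   # but the names survive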
+import pandas._testing as tm @td.skip_if_no("numba", "0.46.0") diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6f6d4c09526ff..717273cff64ea 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -3,8 +3,8 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.algorithms import safe_sort -import pandas.util.testing as tm class TestPairwise: diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 227055eb222f8..04fab93b71c4a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import DataFrame, Index, Series +import pandas._testing as tm from pandas.core.window import Rolling from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestRolling(Base): diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index c0d47fc2ca624..5f5e10b5dd497 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -10,7 +10,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 39ab3ffd9319e..cc29ab4f2cd62 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -65,7 +65,7 @@ def test_agg_function_support(self, arg): df = pd.DataFrame({"A": np.arange(5)}) roll = df.rolling(2, win_type="triang") - msg = "'{arg}' is not a valid function for 'Window' object".format(arg=arg) + msg = f"'{arg}' is not a valid function for 'Window' object" with pytest.raises(AttributeError, match=msg): roll.agg(arg) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index ac64a875ca0aa..e2d007cd2d7f8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -244,7 +244,7 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: Parameters ---------- index : DatetimeIndex or TimedeltaIndex - if passed a Series will use the values of the series (NOT THE INDEX). + If passed a Series will use the values of the series (NOT THE INDEX). 
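The infer_freq docstring fixed below documents a subtle point: given a Series, the frequency is inferred from the values, not the index. A quick sketch of both call forms:

import pandas as pd

idx = pd.date_range("2020-01-01", periods=5, freq="D")
assert pd.infer_freq(idx) == "D"

# a Series is inspected through its *values*, not its index
ser = pd.Series(idx)
assert pd.infer_freq(ser) == "D"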
warn : bool, default True Returns diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 2e5477ea00e39..62d7c26b590cc 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -186,16 +186,16 @@ class from pandas.tseries.offsets def __repr__(self) -> str: info = "" if self.year is not None: - info += "year={year}, ".format(year=self.year) - info += "month={mon}, day={day}, ".format(mon=self.month, day=self.day) + info += f"year={self.year}, " + info += f"month={self.month}, day={self.day}, " if self.offset is not None: - info += "offset={offset}".format(offset=self.offset) + info += f"offset={self.offset}" if self.observance is not None: - info += "observance={obs}".format(obs=self.observance) + info += f"observance={self.observance}" - repr = "Holiday: {name} ({info})".format(name=self.name, info=info) + repr = f"Holiday: {self.name} ({info})" return repr def dates(self, start_date, end_date, return_name=False): @@ -394,8 +394,7 @@ def holidays(self, start=None, end=None, return_name=False): """ if self.rules is None: raise Exception( - "Holiday Calendar {name} does not have any " - "rules specified".format(name=self.name) + f"Holiday Calendar {self.name} does not have any rules specified" ) if start is None: diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f20d385ffbbce..d31c23c7ccf1d 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -365,7 +365,7 @@ def apply_index(self, i): "applied vectorized" ) - def is_anchored(self): + def is_anchored(self) -> bool: # TODO: Does this make sense for the general case? It would help # if there were a canonical docstring for what is_anchored means. return self.n == 1 @@ -378,7 +378,7 @@ def onOffset(self, dt): ) return self.is_on_offset(dt) - def isAnchored(self): + def isAnchored(self) -> bool: warnings.warn( "isAnchored is a deprecated, use is_anchored instead", FutureWarning, @@ -389,7 +389,7 @@ def isAnchored(self): # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` - def _repr_attrs(self): + def _repr_attrs(self) -> str: exclude = {"n", "inc", "normalize"} attrs = [] for attr in sorted(self.__dict__): @@ -405,7 +405,7 @@ def _repr_attrs(self): return out @property - def name(self): + def name(self) -> str: return self.rule_code def rollback(self, dt): @@ -452,15 +452,15 @@ def is_on_offset(self, dt): # way to get around weirdness with rule_code @property - def _prefix(self): + def _prefix(self) -> str: raise NotImplementedError("Prefix not defined") @property - def rule_code(self): + def rule_code(self) -> str: return self._prefix @cache_readonly - def freqstr(self): + def freqstr(self) -> str: try: code = self.rule_code except NotImplementedError: @@ -480,7 +480,7 @@ def freqstr(self): return fstr - def _offset_str(self): + def _offset_str(self) -> str: return "" @property @@ -529,11 +529,11 @@ def offset(self): # Alias for backward compat return self._offset - def _repr_attrs(self): + def _repr_attrs(self) -> str: if self.offset: attrs = [f"offset={repr(self.offset)}"] else: - attrs = None + attrs = [] out = "" if attrs: out += ": " + ", ".join(attrs) @@ -553,7 +553,7 @@ def __init__(self, n=1, normalize=False, offset=timedelta(0)): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) - def _offset_str(self): + def _offset_str(self) -> str: def get_str(td): off_str = "" if td.days > 0: @@ -649,7 +649,7 
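The annotations added below make the is_anchored contract explicit: for the base offset it is simply n == 1, and the camelCase isAnchored spelling warns before delegating. A sketch of both behaviors (illustrative):

import warnings
import pandas as pd

off = pd.DateOffset(n=1)
assert off.is_anchored()          # base implementation is just n == 1

with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    off.isAnchored()              # deprecated alias
assert issubclass(record[-1].category, FutureWarning)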
@@ def apply_index(self, i): result = shifted.to_timestamp() + time return result - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.weekday() < 5 @@ -896,7 +896,15 @@ def apply(self, other): # adjust by business days first if bd != 0: - skip_bd = BusinessDay(n=bd) + if isinstance(self, _CustomMixin): # GH 30593 + skip_bd = CustomBusinessDay( + n=bd, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) + else: + skip_bd = BusinessDay(n=bd) # midnight business hour may not on BusinessDay if not self.next_bday.is_on_offset(other): prev_open = self._prev_opening_time(other) @@ -1079,7 +1087,7 @@ def apply(self, other): def apply_index(self, i): raise NotImplementedError - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False day64 = _to_dt64(dt, "datetime64[D]") @@ -1126,14 +1134,14 @@ class MonthOffset(SingleConstructorOffset): __init__ = BaseOffset.__init__ @property - def name(self): + def name(self) -> str: if self.is_anchored: return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] return f"{self.code_rule}-{month}" - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1325,7 +1333,7 @@ def _from_name(cls, suffix=None): return cls(day_of_month=suffix) @property - def rule_code(self): + def rule_code(self) -> str: suffix = f"-{self.day_of_month}" return self._prefix + suffix @@ -1421,7 +1429,7 @@ class SemiMonthEnd(SemiMonthOffset): _prefix = "SM" _min_day_of_month = 1 - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) @@ -1479,7 +1487,7 @@ class SemiMonthBegin(SemiMonthOffset): _prefix = "SMS" - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day in (1, self.day_of_month) @@ -1548,7 +1556,7 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.weekday is not None @apply_wraps @@ -1624,7 +1632,7 @@ def _end_apply_index(self, dtindex): return base + off + Timedelta(1, "ns") - Timedelta(1, "D") - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False elif self.weekday is None: @@ -1632,7 +1640,7 @@ def is_on_offset(self, dt): return dt.weekday() == self.weekday @property - def rule_code(self): + def rule_code(self) -> str: suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] @@ -1709,7 +1717,7 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): if self.week < 0 or self.week > 3: raise ValueError(f"Week must be 0<=week<=3, got {self.week}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the self.week'th such day in the month. 
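The GH 30593 change above makes the day-skipping step of CustomBusinessHour.apply build a CustomBusinessDay from the instance's weekmask/holidays/calendar instead of a plain BusinessDay, so declared holidays are honored when an addition spans whole days. A hedged sketch (exact landing time depends on business-hour rollover rules, so only the holiday-avoidance is asserted):

import pandas as pd

cbh = pd.offsets.CustomBusinessHour(start="09:00", end="17:00",
                                    holidays=["2020-01-02"])
# 2020-01-01 is a Wednesday; 20 business hours cross at least one whole day
result = pd.Timestamp("2020-01-01 10:00") + 20 * cbh

# wherever the hours land exactly, they never land on the declared holiday
assert result.date() != pd.Timestamp("2020-01-02").date()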
@@ -1728,7 +1736,7 @@ def _get_offset_day(self, other): return 1 + shift_days + self.week * 7 @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{self.week + 1}{weekday}" @@ -1777,7 +1785,7 @@ def __init__(self, n=1, normalize=False, weekday=0): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the last such day in the month. @@ -1797,7 +1805,7 @@ def _get_offset_day(self, other): return dim - shift_days @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{weekday}" @@ -1834,7 +1842,7 @@ def __init__(self, n=1, normalize=False, startingMonth=None): startingMonth = self._default_startingMonth object.__setattr__(self, "startingMonth", startingMonth) - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.startingMonth is not None @classmethod @@ -1848,7 +1856,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.startingMonth] return f"{self._prefix}-{month}" @@ -1866,7 +1874,7 @@ def apply(self, other): months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False mod_month = (dt.month - self.startingMonth) % 3 @@ -1945,7 +1953,7 @@ class YearOffset(DateOffset): _adjust_dst = True _attributes = frozenset(["n", "normalize", "month"]) - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: # override BaseOffset method to use self.month instead of other.month # TODO: there may be a more performant way to do this return liboffsets.get_day_of_month( @@ -1969,7 +1977,7 @@ def apply_index(self, dtindex): shifted, freq=dtindex.freq, dtype=dtindex.dtype ) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.month == self.month and dt.day == self._get_offset_day(dt) @@ -1991,7 +1999,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2109,12 +2117,12 @@ def __init__( if self.variation not in ["nearest", "last"]: raise ValueError(f"{self.variation} is not a valid variation") - def is_anchored(self): + def is_anchored(self) -> bool: return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False dt = datetime(dt.year, dt.month, dt.day) @@ -2209,18 +2217,18 @@ def get_year_end(self, dt): return target_date + timedelta(days_forward - 7) @property - def rule_code(self): + def rule_code(self) -> str: prefix = self._prefix suffix = self.get_rule_code_suffix() return f"{prefix}-{suffix}" - def _get_suffix_prefix(self): + def _get_suffix_prefix(self) -> str: if self.variation == "nearest": return "N" else: return "L" - def get_rule_code_suffix(self): + def 
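The rule_code property above shows how WeekOfMonth spells its frequency alias: zero-based week plus one, then the weekday abbreviation. A short usage sketch:

import pandas as pd

wom = pd.offsets.WeekOfMonth(week=1, weekday=2)   # 2nd Wednesday of the month
assert wom.rule_code == "WOM-2WED"                # week is zero-based, hence +1

idx = pd.date_range("2020-01-01", periods=3, freq=wom)
# DatetimeIndex(['2020-01-08', '2020-02-12', '2020-03-11'], freq='WOM-2WED')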
get_rule_code_suffix(self) -> str: prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] @@ -2338,7 +2346,7 @@ def _offset(self): variation=self.variation, ) - def is_anchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self._offset.is_anchored() def _rollback_to_year(self, other): @@ -2426,7 +2434,7 @@ def get_weeks(self, dt): return ret - def year_has_extra_week(self, dt): + def year_has_extra_week(self, dt: datetime) -> bool: # Avoid round-down errors --> normalize to get # e.g. '370D' instead of '360D23H' norm = Timestamp(dt).normalize().tz_localize(None) @@ -2437,7 +2445,7 @@ def year_has_extra_week(self, dt): assert weeks_in_year in [52, 53], weeks_in_year return weeks_in_year == 53 - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False if self._offset.is_on_offset(dt): @@ -2455,7 +2463,7 @@ def is_on_offset(self, dt): return False @property - def rule_code(self): + def rule_code(self) -> str: suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week return f"{self._prefix}-{suffix}-{qtr}" @@ -2508,7 +2516,7 @@ def apply(self, other): ) return new - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) @@ -2588,7 +2596,7 @@ def __eq__(self, other: Any) -> bool: # This is identical to DateOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. - def __hash__(self): + def __hash__(self) -> int: return hash(self._params) def __ne__(self, other): @@ -2609,7 +2617,7 @@ def __ne__(self, other): return True @property - def delta(self): + def delta(self) -> Timedelta: return self.n * self._inc @property @@ -2640,11 +2648,11 @@ def apply(self, other): raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") - def is_anchored(self): + def is_anchored(self) -> bool: return False -def _delta_to_tick(delta): +def _delta_to_tick(delta: timedelta) -> Tick: if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: # nanoseconds only for pd.Timedelta if delta.seconds == 0: diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 5733663dd7ab3..5694ca24aab57 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -46,7 +46,7 @@ def __repr__(self) -> str: __str__ = __repr__ - def __getattr__(self, name): + def __getattr__(self, name: str): if name in self.self_dir: return object.__getattribute__(self, name) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index a280da6e239b2..d8804994af426 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -32,6 +32,7 @@ def test_foo(): import pytest from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR @@ -251,3 +252,13 @@ def new_func(*args, **kwargs): assert flist2 == flist return new_func + + +def async_mark(): + try: + import_optional_dependency("pytest_asyncio") + async_mark = pytest.mark.asyncio + except ImportError: + async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") + + return async_mark diff --git a/pandas/util/_validators.py 
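Two of the offsets typed below are easy to exercise directly: Easter anchors to the Western computus date supplied by dateutil, and Tick.delta (now annotated as returning Timedelta) is just n times the unit increment. A quick sketch:

import pandas as pd

easter_2020 = pd.Timestamp("2020-01-01") + pd.offsets.Easter()
assert easter_2020 == pd.Timestamp("2020-04-12")
assert pd.offsets.Easter().is_on_offset(easter_2020)

assert pd.offsets.Hour(2).delta == pd.Timedelta(hours=2)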
b/pandas/util/_validators.py index 6cc14c7804b4a..a715094e65e98 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -84,15 +84,13 @@ def validate_args(fname, args, max_fname_arg_count, compat_args): The maximum number of arguments that the function `fname` can accept, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - compat_args : Dict - An ordered dictionary of keys and their associated default values. + compat_args : dict + A dictionary of keys and their associated default values. In order to accommodate buggy behaviour in some versions of `numpy`, where a signature displayed keyword arguments but then passed those arguments **positionally** internally when calling downstream - implementations, an ordered dictionary ensures that the original - order of the keyword arguments is enforced. Note that if there is - only one key, a generic dict can be passed in as well. - + implementations, a dict ensures that the original + order of the keyword arguments is enforced. Raises ------ TypeError @@ -168,10 +166,9 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar The minimum number of arguments that the function `fname` requires, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - compat_args: OrderedDict - A ordered dictionary of keys that `kwargs` is allowed to - have and their associated default values. Note that if there - is only one key, a generic dict can be passed in as well. + compat_args: dict + A dictionary of keys that `kwargs` is allowed to + have and their associated default values. Raises ------ @@ -300,7 +297,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " "arguments to remove any ambiguity. In the future, using " "positional arguments for 'index' or 'columns' will raise " - " a 'TypeError'." + "a 'TypeError'." ) warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) out[data._AXIS_NAMES[0]] = args[0] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c31cddc102afb..af9fe4846b27d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,2745 +1,12 @@ -import bz2 -from collections import Counter -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -import gzip -import os -from shutil import rmtree -import string -import tempfile -from typing import List, Optional, Union, cast import warnings -import zipfile -import numpy as np -from numpy.random import rand, randn +from pandas._testing import * # noqa -from pandas._config.localization import ( # noqa:F401 - can_set_locale, - get_locales, - set_locale, +warnings.warn( + ( + "pandas.util.testing is deprecated. Use the functions in the " + "public API at pandas.testing instead." 
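The compat_args docstring change below relies on the fact that plain dicts now suffice where OrderedDict was once required: insertion order is a language guarantee from Python 3.7 (and a CPython detail in 3.6), so positional validation still sees keyword defaults in their declared order. A one-liner illustrating the guarantee:

# hypothetical compat_args for a numpy-compat signature
compat_args = {"axis": None, "out": None, "keepdims": False}
assert list(compat_args) == ["axis", "out", "keepdims"]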
+ ), + FutureWarning, + stacklevel=2, ) - -import pandas._libs.testing as _testing -from pandas._typing import FrameOrSeries -from pandas.compat import _get_lzma_file, _import_lzma - -from pandas.core.dtypes.common import ( - is_bool, - is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_extension_array_dtype, - is_interval_dtype, - is_list_like, - is_number, - is_period_dtype, - is_sequence, - is_timedelta64_dtype, - needs_i8_conversion, -) -from pandas.core.dtypes.missing import array_equivalent - -import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - DatetimeIndex, - Index, - IntervalIndex, - MultiIndex, - RangeIndex, - Series, - bdate_range, -) -from pandas.core.algorithms import take_1d -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - IntervalArray, - PeriodArray, - TimedeltaArray, - period_array, -) - -from pandas.io.common import urlopen -from pandas.io.formats.printing import pprint_thing - -lzma = _import_lzma() - -N = 30 -K = 4 -_RAISE_NETWORK_ERROR_DEFAULT = False - -# set testing_mode -_testing_mode_warnings = (DeprecationWarning, ResourceWarning) - - -def set_testing_mode(): - # set the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("always", _testing_mode_warnings) - - -def reset_testing_mode(): - # reset the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("ignore", _testing_mode_warnings) - - -set_testing_mode() - - -def reset_display_options(): - """ - Reset the display options for printing and representing objects. - """ - pd.reset_option("^display.", silent=True) - - -def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOrSeries: - """ - Pickle an object and then read it again. - - Parameters - ---------- - obj : pandas object - The object to pickle and then re-read. - path : str, default None - The path where the pickled object is written and then read. - - Returns - ------- - pandas object - The original object that was pickled and then re-read. - """ - if path is None: - path = f"__{rands(10)}__.pickle" - with ensure_clean(path) as path: - pd.to_pickle(obj, path) - return pd.read_pickle(path) - - -def round_trip_pathlib(writer, reader, path: Optional[str] = None): - """ - Write an object to file specified by a pathlib.Path and read it back - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - pandas object - The original object that was serialized and then re-read. - """ - import pytest - - Path = pytest.importorskip("pathlib").Path - if path is None: - path = "___pathlib___" - with ensure_clean(path) as path: - writer(Path(path)) - obj = reader(Path(path)) - return obj - - -def round_trip_localpath(writer, reader, path: Optional[str] = None): - """ - Write an object to file specified by a py.path LocalPath and read it back. - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. 
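The shim above fires its FutureWarning at import time, which means it only triggers once per interpreter session. A sketch of observing it (forcing re-execution by evicting the cached module; illustrative only):

import sys
import warnings

sys.modules.pop("pandas.util.testing", None)   # force the module-level warning
with warnings.catch_warnings(record=True) as record:
    warnings.simplefilter("always")
    import pandas.util.testing  # noqa: F401
assert any(issubclass(w.category, FutureWarning) for w in record)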
- - Returns - ------- - pandas object - The original object that was serialized and then re-read. - """ - import pytest - - LocalPath = pytest.importorskip("py.path").local - if path is None: - path = "___localpath___" - with ensure_clean(path) as path: - writer(LocalPath(path)) - obj = reader(LocalPath(path)) - return obj - - -@contextmanager -def decompress_file(path, compression): - """ - Open a compressed file and return a file object. - - Parameters - ---------- - path : str - The path where the file is read from. - - compression : {'gzip', 'bz2', 'zip', 'xz', None} - Name of the decompression to use - - Returns - ------- - file object - """ - if compression is None: - f = open(path, "rb") - elif compression == "gzip": - f = gzip.open(path, "rb") - elif compression == "bz2": - f = bz2.BZ2File(path, "rb") - elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") - elif compression == "zip": - zip_file = zipfile.ZipFile(path) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - else: - raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") - else: - raise ValueError(f"Unrecognized compression type: {compression}") - - try: - yield f - finally: - f.close() - if compression == "zip": - zip_file.close() - - -def write_to_compressed(compression, path, data, dest="test"): - """ - Write data to a compressed file. - - Parameters - ---------- - compression : {'gzip', 'bz2', 'zip', 'xz'} - The compression type to use. - path : str - The file path to write the data. - data : str - The data to write. - dest : str, default "test" - The destination file (for ZIP only) - - Raises - ------ - ValueError : An invalid compression value was passed in. - """ - if compression == "zip": - import zipfile - - compress_method = zipfile.ZipFile - elif compression == "gzip": - import gzip - - compress_method = gzip.GzipFile - elif compression == "bz2": - import bz2 - - compress_method = bz2.BZ2File - elif compression == "xz": - compress_method = _get_lzma_file(lzma) - else: - raise ValueError(f"Unrecognized compression type: {compression}") - - if compression == "zip": - mode = "w" - args = (dest, data) - method = "writestr" - else: - mode = "wb" - args = (data,) - method = "write" - - with compress_method(path, mode=mode) as f: - getattr(f, method)(*args) - - -def assert_almost_equal( - left, - right, - check_dtype: Union[bool, str] = "equiv", - check_less_precise: Union[bool, int] = False, - **kwargs, -): - """ - Check that the left and right objects are approximately equal. - - By approximately equal, we refer to objects that are numbers or that - contain numbers which may be equivalent to specific levels of precision. - - Parameters - ---------- - left : object - right : object - check_dtype : bool or {'equiv'}, default 'equiv' - Check dtype if both a and b are the same type. If 'equiv' is passed in, - then `RangeIndex` and `Int64Index` are also considered equivalent - when doing type checking. - check_less_precise : bool or int, default False - Specify comparison precision. 5 digits (False) or 3 digits (True) - after decimal points are compared. If int, then specify the number - of digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. 
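The compression helpers being removed here wrap what the public I/O API already does end to end. A minimal round-trip sketch using only public calls:

import os
import tempfile
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "roundtrip.csv.gz")
    df.to_csv(path, index=False, compression="gzip")
    result = pd.read_csv(path, compression="gzip")
pd.testing.assert_frame_equal(result, df)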
- """ - if isinstance(left, pd.Index): - assert_index_equal( - left, - right, - check_exact=False, - exact=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - elif isinstance(left, pd.Series): - assert_series_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - elif isinstance(left, pd.DataFrame): - assert_frame_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - else: - # Other sequences. - if check_dtype: - if is_number(left) and is_number(right): - # Do not compare numeric classes, like np.float64 and float. - pass - elif is_bool(left) and is_bool(right): - # Do not compare bool classes, like np.bool_ and bool. - pass - else: - if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): - obj = "numpy array" - else: - obj = "Input" - assert_class_equal(left, right, obj=obj) - _testing.assert_almost_equal( - left, - right, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - -def _check_isinstance(left, right, cls): - """ - Helper method for our assert_* methods that ensures that - the two objects being compared have the right type before - proceeding with the comparison. - - Parameters - ---------- - left : The first object being compared. - right : The second object being compared. - cls : The class type to check against. - - Raises - ------ - AssertionError : Either `left` or `right` is not an instance of `cls`. - """ - cls_name = cls.__name__ - - if not isinstance(left, cls): - raise AssertionError( - f"{cls_name} Expected type {cls}, found {type(left)} instead" - ) - if not isinstance(right, cls): - raise AssertionError( - f"{cls_name} Expected type {cls}, found {type(right)} instead" - ) - - -def assert_dict_equal(left, right, compare_keys: bool = True): - - _check_isinstance(left, right, dict) - _testing.assert_dict_equal(left, right, compare_keys=compare_keys) - - -def randbool(size=(), p: float = 0.5): - return rand(*size) <= p - - -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) -RANDU_CHARS = np.array( - list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), - dtype=(np.unicode_, 1), -) - - -def rands_array(nchars, size, dtype="O"): - """ - Generate an array of byte strings. - """ - retval = ( - np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) - .view((np.str_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def randu_array(nchars, size, dtype="O"): - """ - Generate an array of unicode strings. - """ - retval = ( - np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def rands(nchars): - """ - Generate one random byte string. - - See `rands_array` if you want to create an array of random strings. - - """ - return "".join(np.random.choice(RANDS_CHARS, nchars)) - - -def randu(nchars): - """ - Generate one random unicode string. - - See `randu_array` if you want to create an array of random unicode strings. 
- - """ - return "".join(np.random.choice(RANDU_CHARS, nchars)) - - -def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close - - if fignum is None: - for fignum in get_fignums(): - _close(fignum) - else: - _close(fignum) - - -# ----------------------------------------------------------------------------- -# contextmanager to ensure the file cleanup - - -@contextmanager -def ensure_clean(filename=None, return_filelike=False): - """ - Gets a temporary path and agrees to remove on close. - - Parameters - ---------- - filename : str (optional) - if None, creates a temporary file which is then removed when out of - scope. if passed, creates temporary file with filename as ending. - return_filelike : bool (default False) - if True, returns a file-like which is *always* cleaned. Necessary for - savefig and other functions which want to append extensions. - """ - filename = filename or "" - fd = None - - if return_filelike: - f = tempfile.TemporaryFile(suffix=filename) - try: - yield f - finally: - f.close() - else: - # don't generate tempfile if using a path with directory specified - if len(os.path.dirname(filename)): - raise ValueError("Can't pass a qualified name to ensure_clean()") - - try: - fd, filename = tempfile.mkstemp(suffix=filename) - except UnicodeEncodeError: - import pytest - - pytest.skip("no unicode file names on this system") - - try: - yield filename - finally: - try: - os.close(fd) - except OSError: - print(f"Couldn't close file descriptor: {fd} (file: {filename})") - try: - if os.path.exists(filename): - os.remove(filename) - except OSError as e: - print(f"Exception on removing file: {e}") - - -@contextmanager -def ensure_clean_dir(): - """ - Get a temporary directory path and agrees to remove on close. - - Yields - ------ - Temporary directory path - """ - directory_name = tempfile.mkdtemp(suffix="") - try: - yield directory_name - finally: - try: - rmtree(directory_name) - except OSError: - pass - - -@contextmanager -def ensure_safe_environment_variables(): - """ - Get a context manager to safely set environment variables - - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. - """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) - - -# ----------------------------------------------------------------------------- -# Comparators - - -def equalContents(arr1, arr2) -> bool: - """ - Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - -def assert_index_equal( - left: Index, - right: Index, - exact: Union[bool, str] = "equiv", - check_names: bool = True, - check_less_precise: Union[bool, int] = False, - check_exact: bool = True, - check_categorical: bool = True, - obj: str = "Index", -) -> None: - """ - Check that left and right Index are equal. - - Parameters - ---------- - left : Index - right : Index - exact : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. If 'equiv', then RangeIndex can be substituted for - Int64Index as well. - check_names : bool, default True - Whether to check the names attribute. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. 
- check_exact : bool, default True - Whether to compare number exactly. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - obj : str, default 'Index' - Specify object name being compared, internally used to show appropriate - assertion message. - """ - __tracebackhide__ = True - - def _check_types(l, r, obj="Index"): - if exact: - assert_class_equal(l, r, exact=exact, obj=obj) - - # Skip exact dtype checking when `check_categorical` is False - if check_categorical: - assert_attr_equal("dtype", l, r, obj=obj) - - # allow string-like to have different inferred_types - if l.inferred_type in ("string", "unicode"): - assert r.inferred_type in ("string", "unicode") - else: - assert_attr_equal("inferred_type", l, r, obj=obj) - - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - level_codes = index.codes[level] - filled = take_1d(unique.values, level_codes, fill_value=unique._na_value) - values = unique._shallow_copy(filled, name=index.names[level]) - return values - - # instance validation - _check_isinstance(left, right, Index) - - # class / dtype comparison - _check_types(left, right, obj=obj) - - # level comparison - if left.nlevels != right.nlevels: - msg1 = f"{obj} levels are different" - msg2 = f"{left.nlevels}, {left}" - msg3 = f"{right.nlevels}, {right}" - raise_assert_detail(obj, msg1, msg2, msg3) - - # length comparison - if len(left) != len(right): - msg1 = f"{obj} length are different" - msg2 = f"{len(left)}, {left}" - msg3 = f"{len(right)}, {right}" - raise_assert_detail(obj, msg1, msg2, msg3) - - # MultiIndex special comparison for little-friendly error messages - if left.nlevels > 1: - left = cast(MultiIndex, left) - right = cast(MultiIndex, right) - - for level in range(left.nlevels): - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) - - lobj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=exact, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - obj=lobj, - ) - # get_level_values may change dtype - _check_types(left.levels[level], right.levels[level], obj=obj) - - # skip exact index checking when `check_categorical` is False - if check_exact and check_categorical: - if not left.equals(right): - diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) - msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) - else: - _testing.assert_almost_equal( - left.values, - right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, - lobj=left, - robj=right, - ) - - # metadata comparison - if check_names: - assert_attr_equal("names", left, right, obj=obj) - if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): - assert_attr_equal("freq", left, right, obj=obj) - if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): - assert_interval_array_equal(left.values, right.values) - - if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, obj=f"{obj} category") - - -def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): - """ - Checks classes are equal. 
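The 'equiv' mode that `assert_class_equal` implements is most visible through `assert_index_equal`; a small sketch (assuming this version's `pandas.util.testing` import path):

import pandas as pd
from pandas.util.testing import assert_index_equal

# passes: a RangeIndex and the materialised Int64Index are 'equivalent'
assert_index_equal(pd.RangeIndex(3), pd.Index([0, 1, 2]), exact="equiv")
# exact=True would instead raise, since the classes differ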
- """ - __tracebackhide__ = True - - def repr_class(x): - if isinstance(x, Index): - # return Index as it is to include values in the error message - return x - - try: - return type(x).__name__ - except AttributeError: - return repr(type(x)) - - if exact == "equiv": - if type(left) != type(right): - # allow equivalence of Int64Index/RangeIndex - types = {type(left).__name__, type(right).__name__} - if len(types - {"Int64Index", "RangeIndex"}): - msg = f"{obj} classes are not equivalent" - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - elif exact: - if type(left) != type(right): - msg = f"{obj} classes are different" - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - - -def assert_attr_equal(attr, left, right, obj="Attributes"): - """checks attributes are equal. Both objects must have attribute. - - Parameters - ---------- - attr : str - Attribute name being compared. - left : object - right : object - obj : str, default 'Attributes' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - left_attr = getattr(left, attr) - right_attr = getattr(right, attr) - - if left_attr is right_attr: - return True - elif ( - is_number(left_attr) - and np.isnan(left_attr) - and is_number(right_attr) - and np.isnan(right_attr) - ): - # np.nan - return True - - try: - result = left_attr == right_attr - except TypeError: - # datetimetz on rhs may raise TypeError - result = False - if not isinstance(result, bool): - result = result.all() - - if result: - return True - else: - msg = f'Attribute "{attr}" are different' - raise_assert_detail(obj, msg, left_attr, right_attr) - - -def assert_is_valid_plot_return_object(objs): - import matplotlib.pyplot as plt - - if isinstance(objs, (pd.Series, np.ndarray)): - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {repr(type(el).__name__)}" - ) - assert isinstance(el, (plt.Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{repr(type(objs).__name__)}" - ) - assert isinstance(objs, (plt.Artist, tuple, dict)), msg - - -def isiterable(obj): - return hasattr(obj, "__iter__") - - -def assert_is_sorted(seq): - """Assert that the sequence is sorted.""" - if isinstance(seq, (Index, Series)): - seq = seq.values - # sorting does not change precisions - assert_numpy_array_equal(seq, np.sort(np.array(seq))) - - -def assert_categorical_equal( - left, right, check_dtype=True, check_category_order=True, obj="Categorical" -): - """Test that Categoricals are equivalent. - - Parameters - ---------- - left : Categorical - right : Categorical - check_dtype : bool, default True - Check that integer dtype of the codes are the same - check_category_order : bool, default True - Whether the order of the categories should be compared, which - implies identical integer codes. If False, only the resulting - values are compared. The ordered attribute is - checked regardless. 
-    obj : str, default 'Categorical'
-        Specify object name being compared, internally used to show appropriate
-        assertion message
-    """
-    _check_isinstance(left, right, Categorical)
-
-    if check_category_order:
-        assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories")
-        assert_numpy_array_equal(
-            left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes",
-        )
-    else:
-        assert_index_equal(
-            left.categories.sort_values(),
-            right.categories.sort_values(),
-            obj=f"{obj}.categories",
-        )
-        assert_index_equal(
-            left.categories.take(left.codes),
-            right.categories.take(right.codes),
-            obj=f"{obj}.values",
-        )
-
-    assert_attr_equal("ordered", left, right, obj=obj)
-
-
-def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"):
-    """Test that two IntervalArrays are equivalent.
-
-    Parameters
-    ----------
-    left, right : IntervalArray
-        The IntervalArrays to compare.
-    exact : bool or {'equiv'}, default 'equiv'
-        Whether to check the Index class, dtype and inferred_type
-        are identical. If 'equiv', then RangeIndex can be substituted for
-        Int64Index as well.
-    obj : str, default 'IntervalArray'
-        Specify object name being compared, internally used to show appropriate
-        assertion message
-    """
-    _check_isinstance(left, right, IntervalArray)
-
-    assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left")
-    assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.right")
-    assert_attr_equal("closed", left, right, obj=obj)
-
-
-def assert_period_array_equal(left, right, obj="PeriodArray"):
-    _check_isinstance(left, right, PeriodArray)
-
-    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values")
-    assert_attr_equal("freq", left, right, obj=obj)
-
-
-def assert_datetime_array_equal(left, right, obj="DatetimeArray"):
-    __tracebackhide__ = True
-    _check_isinstance(left, right, DatetimeArray)
-
-    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data")
-    assert_attr_equal("freq", left, right, obj=obj)
-    assert_attr_equal("tz", left, right, obj=obj)
-
-
-def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"):
-    __tracebackhide__ = True
-    _check_isinstance(left, right, TimedeltaArray)
-    assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data")
-    assert_attr_equal("freq", left, right, obj=obj)
-
-
-def raise_assert_detail(obj, message, left, right, diff=None):
-    __tracebackhide__ = True
-
-    if isinstance(left, np.ndarray):
-        left = pprint_thing(left)
-    elif is_categorical_dtype(left):
-        left = repr(left)
-
-    if isinstance(right, np.ndarray):
-        right = pprint_thing(right)
-    elif is_categorical_dtype(right):
-        right = repr(right)
-
-    msg = f"""{obj} are different
-
-{message}
-[left]: {left}
-[right]: {right}"""
-
-    if diff is not None:
-        msg += f"\n[diff]: {diff}"
-
-    raise AssertionError(msg)
-
-
-def assert_numpy_array_equal(
-    left,
-    right,
-    strict_nan=False,
-    check_dtype=True,
-    err_msg=None,
-    check_same=None,
-    obj="numpy array",
-):
-    """ Checks that 'np.ndarray' is equivalent
-
-    Parameters
-    ----------
-    left : np.ndarray or iterable
-    right : np.ndarray or iterable
-    strict_nan : bool, default False
-        If True, consider NaN and None to be different.
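To make `check_category_order=False` (documented above for assert_categorical_equal) concrete, a short sketch: reordering the categories changes the codes but not the reconstructed values, so the comparison still passes.

import pandas as pd
from pandas.util.testing import assert_categorical_equal

left = pd.Categorical(["a", "b"], categories=["a", "b"])
right = pd.Categorical(["a", "b"], categories=["b", "a"])
assert_categorical_equal(left, right, check_category_order=False)  # passes
# with check_category_order=True the differing codes would fail the assertion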
- check_dtype: bool, default True - check dtype if both a and b are np.ndarray - err_msg : str, default None - If provided, used as assertion message - check_same : None|'copy'|'same', default None - Ensure left and right refer/do not refer to the same memory area - obj : str, default 'numpy array' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - # instance validation - # Show a detailed error message when classes are different - assert_class_equal(left, right, obj=obj) - # both classes must be an np.ndarray - _check_isinstance(left, right, np.ndarray) - - def _get_base(obj): - return obj.base if getattr(obj, "base", None) is not None else obj - - left_base = _get_base(left) - right_base = _get_base(right) - - if check_same == "same": - if left_base is not right_base: - raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") - elif check_same == "copy": - if left_base is right_base: - raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - - def _raise(left, right, err_msg): - if err_msg is None: - if left.shape != right.shape: - raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, - ) - - diff = 0 - for l, r in zip(left, right): - # count up differences - if not array_equivalent(l, r, strict_nan=strict_nan): - diff += 1 - - diff = diff * 100.0 / left.size - msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) - - raise AssertionError(err_msg) - - # compare shape and values - if not array_equivalent(left, right, strict_nan=strict_nan): - _raise(left, right, err_msg) - - if check_dtype: - if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - assert_attr_equal("dtype", left, right, obj=obj) - - -def assert_extension_array_equal( - left, right, check_dtype=True, check_less_precise=False, check_exact=False -): - """Check that left and right ExtensionArrays are equal. - - Parameters - ---------- - left, right : ExtensionArray - The two arrays to compare - check_dtype : bool, default True - Whether to check if the ExtensionArray dtypes are identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - check_exact : bool, default False - Whether to compare number exactly. - - Notes - ----- - Missing values are checked separately from valid values. - A mask of missing values is computed for each and checked to match. - The remaining all-valid values are cast to object dtype and checked. 
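A hedged illustration of that mask-then-values comparison, using a nullable integer array (the only assumption beyond this module is the "Int64" extension dtype):

import pandas as pd
from pandas.util.testing import assert_extension_array_equal

left = pd.array([1, 2, None], dtype="Int64")
right = pd.array([1, 2, None], dtype="Int64")
# NA masks are compared first, then the remaining valid values as objects
assert_extension_array_equal(left, right)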
- """ - assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" - assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" - if check_dtype: - assert_attr_equal("dtype", left, right, obj="ExtensionArray") - - if hasattr(left, "asi8") and type(right) == type(left): - # Avoid slow object-dtype comparisons - assert_numpy_array_equal(left.asi8, right.asi8) - return - - left_na = np.asarray(left.isna()) - right_na = np.asarray(right.isna()) - assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") - - left_valid = np.asarray(left[~left_na].astype(object)) - right_valid = np.asarray(right[~right_na].astype(object)) - if check_exact: - assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") - else: - _testing.assert_almost_equal( - left_valid, - right_valid, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - obj="ExtensionArray", - ) - - -# This could be refactored to use the NDFrame.equals method -def assert_series_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_series_type=True, - check_less_precise=False, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - obj="Series", -): - """ - Check that left and right Series are equal. - - Parameters - ---------- - left : Series - right : Series - check_dtype : bool, default True - Whether to check the Series dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_series_type : bool, default True - Whether to check the Series class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check the Series and Index names attribute. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - obj : str, default 'Series' - Specify object name being compared, internally used to show appropriate - assertion message. - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, Series) - - if check_series_type: - # ToDo: There are some tests using rhs is sparse - # lhs is dense. 
Should use assert_class_equal in future
-        assert isinstance(left, type(right))
-        # assert_class_equal(left, right, obj=obj)
-
-    # length comparison
-    if len(left) != len(right):
-        msg1 = f"{len(left)}, {left.index}"
-        msg2 = f"{len(right)}, {right.index}"
-        raise_assert_detail(obj, "Series length are different", msg1, msg2)
-
-    # index comparison
-    assert_index_equal(
-        left.index,
-        right.index,
-        exact=check_index_type,
-        check_names=check_names,
-        check_less_precise=check_less_precise,
-        check_exact=check_exact,
-        check_categorical=check_categorical,
-        obj=f"{obj}.index",
-    )
-
-    if check_dtype:
-        # We want to skip exact dtype checking when `check_categorical`
-        # is False. We'll still raise if only one is a `Categorical`,
-        # regardless of `check_categorical`
-        if (
-            is_categorical_dtype(left)
-            and is_categorical_dtype(right)
-            and not check_categorical
-        ):
-            pass
-        else:
-            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
-
-    if check_exact:
-        assert_numpy_array_equal(
-            left._internal_get_values(),
-            right._internal_get_values(),
-            check_dtype=check_dtype,
-            obj=str(obj),
-        )
-    elif check_datetimelike_compat:
-        # we want to check only if we have compat dtypes
-        # e.g. integer and M|m are NOT compat, but we can simply check
-        # the values in that case
-        if needs_i8_conversion(left) or needs_i8_conversion(right):
-
-            # datetimelike may have different objects (e.g. datetime.datetime
-            # vs Timestamp) but will compare equal
-            if not Index(left.values).equals(Index(right.values)):
-                msg = (
-                    f"[datetimelike_compat=True] {left.values} "
-                    f"is not equal to {right.values}."
-                )
-                raise AssertionError(msg)
-        else:
-            assert_numpy_array_equal(
-                left._internal_get_values(),
-                right._internal_get_values(),
-                check_dtype=check_dtype,
-            )
-    elif is_interval_dtype(left) or is_interval_dtype(right):
-        assert_interval_array_equal(left.array, right.array)
-    elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype):
-        # .values is an ndarray, but ._values is the ExtensionArray.
-        # TODO: Use .array
-        assert is_extension_array_dtype(right.dtype)
-        assert_extension_array_equal(left._values, right._values)
-    elif (
-        is_extension_array_dtype(left)
-        and not is_categorical_dtype(left)
-        and is_extension_array_dtype(right)
-        and not is_categorical_dtype(right)
-    ):
-        assert_extension_array_equal(left.array, right.array)
-    else:
-        _testing.assert_almost_equal(
-            left._internal_get_values(),
-            right._internal_get_values(),
-            check_less_precise=check_less_precise,
-            check_dtype=check_dtype,
-            obj=str(obj),
-        )
-
-    # metadata comparison
-    if check_names:
-        assert_attr_equal("name", left, right, obj=obj)
-
-    if check_categorical:
-        if is_categorical_dtype(left) or is_categorical_dtype(right):
-            assert_categorical_equal(left.values, right.values, obj=f"{obj} category")
-
-
-# This could be refactored to use the NDFrame.equals method
-def assert_frame_equal(
-    left,
-    right,
-    check_dtype=True,
-    check_index_type="equiv",
-    check_column_type="equiv",
-    check_frame_type=True,
-    check_less_precise=False,
-    check_names=True,
-    by_blocks=False,
-    check_exact=False,
-    check_datetimelike_compat=False,
-    check_categorical=True,
-    check_like=False,
-    obj="DataFrame",
-):
-    """
-    Check that left and right DataFrame are equal.
-
-    This function is intended to compare two DataFrames and output any
-    differences. It is mostly intended for use in unit tests.
-    Additional parameters allow varying the strictness of the
-    equality checks performed.
- - Parameters - ---------- - left : DataFrame - First DataFrame to compare. - right : DataFrame - Second DataFrame to compare. - check_dtype : bool, default True - Whether to check the DataFrame dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_column_type : bool or {'equiv'}, default 'equiv' - Whether to check the columns class, dtype and inferred_type - are identical. Is passed as the ``exact`` argument of - :func:`assert_index_equal`. - check_frame_type : bool, default True - Whether to check the DataFrame class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check that the `names` attribute for both the `index` - and `column` attributes of the DataFrame is identical. - by_blocks : bool, default False - Specify how to compare internal data. If False, compare by columns. - If True, compare by blocks. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_like : bool, default False - If True, ignore the order of index & columns. - Note: index labels must match their respective rows - (same as in columns) - same labels must be with the same data. - obj : str, default 'DataFrame' - Specify object name being compared, internally used to show appropriate - assertion message. - - See Also - -------- - assert_series_equal : Equivalent method for asserting Series equality. - DataFrame.equals : Check DataFrame equality. - - Examples - -------- - This example shows comparing two DataFrames that are equal - but with columns of differing dtypes. - - >>> from pandas.util.testing import assert_frame_equal - >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) - - df1 equals itself. - - >>> assert_frame_equal(df1, df1) - - df1 differs from df2 as column 'b' is of a different type. - - >>> assert_frame_equal(df1, df2) - Traceback (most recent call last): - ... - AssertionError: Attributes of DataFrame.iloc[:, 1] are different - - Attribute "dtype" are different - [left]: int64 - [right]: float64 - - Ignore differing dtypes in columns with check_dtype. 
- - >>> assert_frame_equal(df1, df2, check_dtype=False) - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, DataFrame) - - if check_frame_type: - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) - - # shape comparison - if left.shape != right.shape: - raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", - ) - - if check_like: - left, right = left.reindex_like(right), right - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) - - # column comparison - assert_index_equal( - left.columns, - right.columns, - exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.columns", - ) - - # compare by blocks - if by_blocks: - rblocks = right._to_dict_of_blocks() - lblocks = left._to_dict_of_blocks() - for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): - assert dtype in lblocks - assert dtype in rblocks - assert_frame_equal( - lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj - ) - - # compare by columns - else: - for i, col in enumerate(left.columns): - assert col in right - lcol = left.iloc[:, i] - rcol = right.iloc[:, i] - assert_series_equal( - lcol, - rcol, - check_dtype=check_dtype, - check_index_type=check_index_type, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_names=check_names, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - obj=f"{obj}.iloc[:, {i}]", - ) - - -def assert_equal(left, right, **kwargs): - """ - Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. - - Parameters - ---------- - left : Index, Series, DataFrame, ExtensionArray, or np.ndarray - right : Index, Series, DataFrame, ExtensionArray, or np.ndarray - **kwargs - """ - __tracebackhide__ = True - - if isinstance(left, pd.Index): - assert_index_equal(left, right, **kwargs) - elif isinstance(left, pd.Series): - assert_series_equal(left, right, **kwargs) - elif isinstance(left, pd.DataFrame): - assert_frame_equal(left, right, **kwargs) - elif isinstance(left, IntervalArray): - assert_interval_array_equal(left, right, **kwargs) - elif isinstance(left, PeriodArray): - assert_period_array_equal(left, right, **kwargs) - elif isinstance(left, DatetimeArray): - assert_datetime_array_equal(left, right, **kwargs) - elif isinstance(left, TimedeltaArray): - assert_timedelta_array_equal(left, right, **kwargs) - elif isinstance(left, ExtensionArray): - assert_extension_array_equal(left, right, **kwargs) - elif isinstance(left, np.ndarray): - assert_numpy_array_equal(left, right, **kwargs) - elif isinstance(left, str): - assert kwargs == {} - return left == right - else: - raise NotImplementedError(type(left)) - - -def box_expected(expected, box_cls, transpose=True): - """ - Helper function to wrap the expected output of a test in a given box_class. 
-
-    Parameters
-    ----------
-    expected : np.ndarray, Index, Series
-    box_cls : {Index, Series, DataFrame}
-
-    Returns
-    -------
-    subclass of box_cls
-    """
-    if box_cls is pd.Index:
-        expected = pd.Index(expected)
-    elif box_cls is pd.Series:
-        expected = pd.Series(expected)
-    elif box_cls is pd.DataFrame:
-        expected = pd.Series(expected).to_frame()
-        if transpose:
-            # for vector operations, we need a DataFrame to be a single-row,
-            # not a single-column, in order to operate against non-DataFrame
-            # vectors of the same length.
-            expected = expected.T
-    elif box_cls is PeriodArray:
-        # the PeriodArray constructor is not as flexible as period_array
-        expected = period_array(expected)
-    elif box_cls is DatetimeArray:
-        expected = DatetimeArray(expected)
-    elif box_cls is TimedeltaArray:
-        expected = TimedeltaArray(expected)
-    elif box_cls is np.ndarray:
-        expected = np.array(expected)
-    elif box_cls is to_array:
-        expected = to_array(expected)
-    else:
-        raise NotImplementedError(box_cls)
-    return expected
-
-
-def to_array(obj):
-    # temporary implementation until we get pd.array in place
-    if is_period_dtype(obj):
-        return period_array(obj)
-    elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj):
-        return DatetimeArray._from_sequence(obj)
-    elif is_timedelta64_dtype(obj):
-        return TimedeltaArray._from_sequence(obj)
-    else:
-        return np.array(obj)
-
-
-# -----------------------------------------------------------------------------
-# Sparse
-
-
-def assert_sp_array_equal(
-    left,
-    right,
-    check_dtype=True,
-    check_kind=True,
-    check_fill_value=True,
-    consolidate_block_indices=False,
-):
-    """Check that the left and right SparseArray are equal.
-
-    Parameters
-    ----------
-    left : SparseArray
-    right : SparseArray
-    check_dtype : bool, default True
-        Whether to check the data dtype is identical.
-    check_kind : bool, default True
-        Whether to check just the kind of the sparse index for each column.
-    check_fill_value : bool, default True
-        Whether to check that left.fill_value matches right.fill_value
-    consolidate_block_indices : bool, default False
-        Whether to consolidate contiguous blocks for sparse arrays with
-        a BlockIndex. Some operations, e.g. concat, will end up with
-        block indices that could be consolidated. Setting this to true will
-        create a new BlockIndex for that array, with consolidated
-        block indices.
-    """
-
-    _check_isinstance(left, right, pd.SparseArray)
-
-    assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype)
-
-    # SparseIndex comparison
-    assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex)
-    assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex)
-
-    if not check_kind:
-        left_index = left.sp_index.to_block_index()
-        right_index = right.sp_index.to_block_index()
-    else:
-        left_index = left.sp_index
-        right_index = right.sp_index
-
-    if consolidate_block_indices and left.kind == "block":
-        # we'll probably remove this hack...
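-        # (round-tripping through an integer index yields canonical,
-        # consolidated block indices, so the comparison below reflects
-        # content rather than how the blocks happen to be chunked)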
- left_index = left_index.to_int_index().to_block_index() - right_index = right_index.to_int_index().to_block_index() - - if not left_index.equals(right_index): - raise_assert_detail( - "SparseArray.index", "index are not equal", left_index, right_index - ) - else: - # Just ensure a - pass - - if check_fill_value: - assert_attr_equal("fill_value", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) - - -# ----------------------------------------------------------------------------- -# Others - - -def assert_contains_all(iterable, dic): - for k in iterable: - assert k in dic, f"Did not contain item: {repr(k)}" - - -def assert_copy(iter1, iter2, **eql_kwargs): - """ - iter1, iter2: iterables that produce elements - comparable with assert_almost_equal - - Checks that the elements are equal, but not - the same object. (Does not check that items - in sequences are also not the same object) - """ - for elem1, elem2 in zip(iter1, iter2): - assert_almost_equal(elem1, elem2, **eql_kwargs) - msg = ( - f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " - "different objects, but they were the same object." - ) - assert elem1 is not elem2, msg - - -def getCols(k): - return string.ascii_uppercase[:k] - - -# make index -def makeStringIndex(k=10, name=None): - return Index(rands_array(nchars=10, size=k), name=name) - - -def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k), name=name) - - -def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): - """ make a length k index or n categories """ - x = rands_array(nchars=4, size=n) - return CategoricalIndex( - Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs - ) - - -def makeIntervalIndex(k=10, name=None, **kwargs): - """ make a length k IntervalIndex """ - x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name, **kwargs) - - -def makeBoolIndex(k=10, name=None): - if k == 1: - return Index([True], name=name) - elif k == 2: - return Index([False, True], name=name) - return Index([False, True] + [False] * (k - 2), name=name) - - -def makeIntIndex(k=10, name=None): - return Index(list(range(k)), name=name) - - -def makeUIntIndex(k=10, name=None): - return Index([2 ** 63 + i for i in range(k)], name=name) - - -def makeRangeIndex(k=10, name=None, **kwargs): - return RangeIndex(0, k, 1, name=name, **kwargs) - - -def makeFloatIndex(k=10, name=None): - values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) - return Index(values * (10 ** np.random.randint(0, 9)), name=name) - - -def makeDateIndex(k=10, freq="B", name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k, freq=freq, name=name) - return DatetimeIndex(dr, name=name, **kwargs) - - -def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): - return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) - - -def makePeriodIndex(k=10, name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) - return dr - - -def makeMultiIndex(k=10, names=None, **kwargs): - return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) - - -_names = [ - "Alice", - "Bob", - "Charlie", - "Dan", - "Edith", - "Frank", - "George", - "Hannah", - "Ingrid", - "Jerry", - "Kevin", - "Laura", - "Michael", - "Norbert", - "Oliver", - "Patricia", - "Quinn", - "Ray", - 
"Sarah", - "Tim", - "Ursula", - "Victor", - "Wendy", - "Xavier", - "Yvonne", - "Zelda", -] - - -def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): - """ - Make a DataFrame with a DatetimeIndex - - Parameters - ---------- - start : str or Timestamp, default "2000-01-01" - The start of the index. Passed to date_range with `freq`. - end : str or Timestamp, default "2000-12-31" - The end of the index. Passed to date_range with `freq`. - freq : str or Freq - The frequency to use for the DatetimeIndex - seed : int, optional - The random state seed. - - * name : object dtype with string names - * id : int dtype with - * x, y : float dtype - - Examples - -------- - >>> _make_timeseries() - id name x y - timestamp - 2000-01-01 982 Frank 0.031261 0.986727 - 2000-01-02 1025 Edith -0.086358 -0.032920 - 2000-01-03 982 Edith 0.473177 0.298654 - 2000-01-04 1009 Sarah 0.534344 -0.750377 - 2000-01-05 963 Zelda -0.271573 0.054424 - ... ... ... ... ... - 2000-12-27 980 Ingrid -0.132333 -0.422195 - 2000-12-28 972 Frank -0.376007 -0.298687 - 2000-12-29 1009 Ursula -0.865047 -0.503133 - 2000-12-30 1000 Hannah -0.063757 -0.507336 - 2000-12-31 972 Tim -0.869120 0.531685 - """ - index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") - n = len(index) - state = np.random.RandomState(seed) - columns = { - "name": state.choice(_names, size=n), - "id": state.poisson(1000, size=n), - "x": state.rand(n) * 2 - 1, - "y": state.rand(n) * 2 - 1, - } - df = pd.DataFrame(columns, index=index, columns=sorted(columns)) - if df.index[-1] == end: - df = df.iloc[:-1] - return df - - -def all_index_generator(k=10): - """Generator which can be iterated over to get instances of all the various - index classes. - - Parameters - ---------- - k: length of each of the index instances - """ - all_make_index_funcs = [ - makeIntIndex, - makeFloatIndex, - makeStringIndex, - makeUnicodeIndex, - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeBoolIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - ] - for make_index_func in all_make_index_funcs: - yield make_index_func(k=k) - - -def index_subclass_makers_generator(): - make_index_funcs = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - makeMultiIndex, - ] - for make_index_func in make_index_funcs: - yield make_index_func - - -def all_timeseries_index_generator(k=10): - """Generator which can be iterated over to get instances of all the classes - which represent time-series. 
-
-    Parameters
-    ----------
-    k: length of each of the index instances
-    """
-    make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex]
-    for make_index_func in make_index_funcs:
-        yield make_index_func(k=k)
-
-
-# make series
-def makeFloatSeries(name=None):
-    index = makeStringIndex(N)
-    return Series(randn(N), index=index, name=name)
-
-
-def makeStringSeries(name=None):
-    index = makeStringIndex(N)
-    return Series(randn(N), index=index, name=name)
-
-
-def makeObjectSeries(name=None):
-    data = makeStringIndex(N)
-    data = Index(data, dtype=object)
-    index = makeStringIndex(N)
-    return Series(data, index=index, name=name)
-
-
-def getSeriesData():
-    index = makeStringIndex(N)
-    return {c: Series(randn(N), index=index) for c in getCols(K)}
-
-
-def makeTimeSeries(nper=None, freq="B", name=None):
-    if nper is None:
-        nper = N
-    return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name)
-
-
-def makePeriodSeries(nper=None, name=None):
-    if nper is None:
-        nper = N
-    return Series(randn(nper), index=makePeriodIndex(nper), name=name)
-
-
-def getTimeSeriesData(nper=None, freq="B"):
-    return {c: makeTimeSeries(nper, freq) for c in getCols(K)}
-
-
-def getPeriodData(nper=None):
-    return {c: makePeriodSeries(nper) for c in getCols(K)}
-
-
-# make frame
-def makeTimeDataFrame(nper=None, freq="B"):
-    data = getTimeSeriesData(nper, freq)
-    return DataFrame(data)
-
-
-def makeDataFrame():
-    data = getSeriesData()
-    return DataFrame(data)
-
-
-def getMixedTypeDict():
-    index = Index(["a", "b", "c", "d", "e"])
-
-    data = {
-        "A": [0.0, 1.0, 2.0, 3.0, 4.0],
-        "B": [0.0, 1.0, 0.0, 1.0, 0.0],
-        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
-        "D": bdate_range("1/1/2009", periods=5),
-    }
-
-    return index, data
-
-
-def makeMixedDataFrame():
-    return DataFrame(getMixedTypeDict()[1])
-
-
-def makePeriodFrame(nper=None):
-    data = getPeriodData(nper)
-    return DataFrame(data)
-
-
-def makeCustomIndex(
-    nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None
-):
-    """Create an index/multiindex with given dimensions, levels, names, etc.
-
-    nentries - number of entries in index
-    nlevels - number of levels (> 1 produces multiindex)
-    prefix - a string prefix for labels
-    names - (Optional), bool or list of strings. if True will use default
-        names, if false will use no names, if a list is given, the name of
-        each level in the index will be taken from the list.
-    ndupe_l - (Optional), list of ints, the number of rows for which the
-        label will be repeated at the corresponding level, you can specify just
-        the first few, the rest will use the default ndupe_l of 1.
-        len(ndupe_l) <= nlevels.
-    idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
-        If idx_type is not None, `nlevels` must be 1.
-        "i"/"f" creates an integer/float index,
-        "s"/"u" creates a string/unicode index
-        "dt" creates a datetime index.
-        "td" creates a timedelta index.
-
-        if unspecified, string labels will be generated.
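For instance (a sketch; the output shown is representative, not verbatim):

from pandas.util.testing import makeCustomIndex

idx = makeCustomIndex(nentries=4, nlevels=2, ndupe_l=[2])
# a MultiIndex whose first-level labels repeat twice, e.g.
# [('#_l0_g0', '#_l1_g0'), ('#_l0_g0', '#_l1_g1'), ('#_l0_g1', '#_l1_g2'), ...]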
- """ - - if ndupe_l is None: - ndupe_l = [1] * nlevels - assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels - assert names is None or names is False or names is True or len(names) is nlevels - assert idx_type is None or ( - idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 - ) - - if names is True: - # build default names - names = [prefix + str(i) for i in range(nlevels)] - if names is False: - # pass None to index constructor for no name - names = None - - # make singleton case uniform - if isinstance(names, str) and nlevels == 1: - names = [names] - - # specific 1D index type requested? - idx_func = dict( - i=makeIntIndex, - f=makeFloatIndex, - s=makeStringIndex, - u=makeUnicodeIndex, - dt=makeDateIndex, - td=makeTimedeltaIndex, - p=makePeriodIndex, - ).get(idx_type) - if idx_func: - idx = idx_func(nentries) - # but we need to fill in the name - if names: - idx.name = names[0] - return idx - elif idx_type is not None: - raise ValueError( - f"{repr(idx_type)} is not a legal value for `idx_type`, " - "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." - ) - - if len(ndupe_l) < nlevels: - ndupe_l.extend([1] * (nlevels - len(ndupe_l))) - assert len(ndupe_l) == nlevels - - assert all(x > 0 for x in ndupe_l) - - tuples = [] - for i in range(nlevels): - - def keyfunc(x): - import re - - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") - return [int(num) for num in numeric_tuple] - - # build a list of lists to create the index from - div_factor = nentries // ndupe_l[i] + 1 - cnt = Counter() - for j in range(div_factor): - label = f"{prefix}_l{i}_g{j}" - cnt[label] = ndupe_l[i] - # cute Counter trick - result = sorted(cnt.elements(), key=keyfunc)[:nentries] - tuples.append(result) - - tuples = list(zip(*tuples)) - - # convert tuples to index - if nentries == 1: - # we have a single level of tuples, i.e. a regular Index - index = Index(tuples[0], name=names[0]) - elif nlevels == 1: - name = None if names is None else names[0] - index = Index((x[0] for x in tuples), name=name) - else: - index = MultiIndex.from_tuples(tuples, names=names) - return index - - -def makeCustomDataframe( - nrows, - ncols, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - nrows, ncols - number of data rows/cols - c_idx_names, idx_names - False/True/list of strings, yields No names , - default names or uses the provided names for the levels of the - corresponding index. You can provide a single string when - c_idx_nlevels ==1. - c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex - r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex - data_gen_f - a function f(row,col) which return the data value - at that position, the default generator used yields values of the form - "RxCy" based on position. - c_ndupe_l, r_ndupe_l - list of integers, determines the number - of duplicates for each label at a given level of the corresponding - index. The default `None` value produces a multiplicity of 1 across - all levels, i.e. a unique index. Will accept a partial list of length - N < idx_nlevels, for just the first N levels. If ndupe doesn't divide - nrows/ncol, the last label might have lower multiplicity. - dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjunction with a custom `data_gen_f` - r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". 
-        If idx_type is not None, `idx_nlevels` must be 1.
-        "i"/"f" creates an integer/float index,
-        "s"/"u" creates a string/unicode index
-        "dt" creates a datetime index.
-        "td" creates a timedelta index.
-
-        if unspecified, string labels will be generated.
-
-    Examples:
-
-    # 5 rows, 3 columns, default names on both, single index on both axes
-    >> makeCustomDataframe(5,3)
-
-    # make the data a random int between 1 and 100
-    >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100))
-
-    # 2-level multiindex on rows with each label duplicated
-    # twice on first level, default names on both axes, single
-    # index on both axes
-    >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2])
-
-    # DatetimeIndex on row, index with unicode labels on columns
-    # no names on either axis
-    >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False,
-                             r_idx_type="dt",c_idx_type="u")
-
-    # 4-level multiindex on rows with names provided, 2-level multiindex
-    # on columns with default labels and default names.
-    >> a=makeCustomDataframe(5,3,r_idx_nlevels=4,
-                             r_idx_names=["FEE","FI","FO","FAM"],
-                             c_idx_nlevels=2)
-
-    >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4)
-    """
-
-    assert c_idx_nlevels > 0
-    assert r_idx_nlevels > 0
-    assert r_idx_type is None or (
-        r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1
-    )
-    assert c_idx_type is None or (
-        c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1
-    )
-
-    columns = makeCustomIndex(
-        ncols,
-        nlevels=c_idx_nlevels,
-        prefix="C",
-        names=c_idx_names,
-        ndupe_l=c_ndupe_l,
-        idx_type=c_idx_type,
-    )
-    index = makeCustomIndex(
-        nrows,
-        nlevels=r_idx_nlevels,
-        prefix="R",
-        names=r_idx_names,
-        ndupe_l=r_ndupe_l,
-        idx_type=r_idx_type,
-    )
-
-    # by default, generate data based on location
-    if data_gen_f is None:
-        data_gen_f = lambda r, c: f"R{r}C{c}"
-
-    data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)]
-
-    return DataFrame(data, index, columns, dtype=dtype)
-
-
-def _create_missing_idx(nrows, ncols, density, random_state=None):
-    if random_state is None:
-        random_state = np.random
-    else:
-        random_state = np.random.RandomState(random_state)
-
-    # below is cribbed from scipy.sparse
-    size = int(np.round((1 - density) * nrows * ncols))
-    # generate a few more to ensure unique values
-    min_rows = 5
-    fac = 1.02
-    extra_size = min(size + min_rows, fac * size)
-
-    def _gen_unique_rand(rng, _extra_size):
-        ind = rng.rand(int(_extra_size))
-        return np.unique(np.floor(ind * nrows * ncols))[:size]
-
-    ind = _gen_unique_rand(random_state, extra_size)
-    while ind.size < size:
-        extra_size *= 1.05
-        ind = _gen_unique_rand(random_state, extra_size)
-
-    j = np.floor(ind * 1.0 / nrows).astype(int)
-    i = (ind - j * nrows).astype(int)
-    return i.tolist(), j.tolist()
-
-
-def makeMissingCustomDataframe(
-    nrows,
-    ncols,
-    density=0.9,
-    random_state=None,
-    c_idx_names=True,
-    r_idx_names=True,
-    c_idx_nlevels=1,
-    r_idx_nlevels=1,
-    data_gen_f=None,
-    c_ndupe_l=None,
-    r_ndupe_l=None,
-    dtype=None,
-    c_idx_type=None,
-    r_idx_type=None,
-):
-    """
-    Parameters
-    ----------
-    density : float, optional
-        Float in (0, 1) that gives the percentage of non-missing numbers in
-        the DataFrame.
-    random_state : {np.random.RandomState, int}, optional
-        Random number generator or random seed.
-
-    See makeCustomDataframe for descriptions of the rest of the parameters.
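As an illustrative use (hedged; the exact NaN positions depend on the seed):

from pandas.util.testing import makeMissingCustomDataframe

df = makeMissingCustomDataframe(nrows=10, ncols=4, density=0.9, random_state=42)
assert df.isna().sum().sum() > 0  # roughly 10% of cells are missing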
- """ - df = makeCustomDataframe( - nrows, - ncols, - c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, - r_ndupe_l=r_ndupe_l, - dtype=dtype, - c_idx_type=c_idx_type, - r_idx_type=r_idx_type, - ) - - i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.values[i, j] = np.nan - return df - - -def makeMissingDataframe(density=0.9, random_state=None): - df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) - df.values[i, j] = np.nan - return df - - -class TestSubDict(dict): - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - - -def optional_args(decorator): - """allows a decorator to take optional positional and keyword arguments. - Assumes that taking a single, callable, positional argument means that - it is decorating a function, i.e. something like this:: - - @my_decorator - def function(): pass - - Calls decorator with decorator(f, *args, **kwargs)""" - - @wraps(decorator) - def wrapper(*args, **kwargs): - def dec(f): - return decorator(f, *args, **kwargs) - - is_decorating = not kwargs and len(args) == 1 and callable(args[0]) - if is_decorating: - f = args[0] - args = [] - return dec(f) - else: - return dec - - return wrapper - - -# skip tests on exceptions with this message -_network_error_messages = ( - # 'urlopen error timed out', - # 'timeout: timed out', - # 'socket.timeout: timed out', - "timed out", - "Server Hangup", - "HTTP Error 503: Service Unavailable", - "502: Proxy Error", - "HTTP Error 502: internal error", - "HTTP Error 502", - "HTTP Error 503", - "HTTP Error 403", - "HTTP Error 400", - "Temporary failure in name resolution", - "Name or service not known", - "Connection refused", - "certificate verify", -) - -# or this e.errno/e.reason.errno -_network_errno_vals = ( - 101, # Network is unreachable - 111, # Connection refused - 110, # Connection timed out - 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out -) - -# Both of the above shouldn't mask real issues such as 404's -# or refused connections (changed DNS). -# But some tests (test_data yahoo) contact incredibly flakey -# servers. - -# and conditionally raise on exception types in _get_default_network_errors - - -def _get_default_network_errors(): - # Lazy import for http.client because it imports many things from the stdlib - import http.client - - return (IOError, http.client.HTTPException, TimeoutError) - - -def can_connect(url, error_classes=None): - """Try to connect to the given url. True if succeeds, False if IOError - raised - - Parameters - ---------- - url : basestring - The URL to try to connect to - - Returns - ------- - connectable : bool - Return True if no IOError (unable to connect) or URLError (bad url) was - raised - """ - - if error_classes is None: - error_classes = _get_default_network_errors() - - try: - with urlopen(url): - pass - except error_classes: - return False - else: - return True - - -@optional_args -def network( - t, - url="http://www.google.com", - raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, - check_before_test=False, - error_classes=None, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, -): - """ - Label a test as requiring network connection and, if an error is - encountered, only raise if it does not find a network connection. 
-
-    In comparison to ``network``, this assumes an added contract to your test:
-    you must assert that, under normal conditions, your test will ONLY fail if
-    it does not have network connectivity.
-
-    You can call this in 3 ways: as a standard decorator, with keyword
-    arguments, or with a positional argument that is the url to check.
-
-    Parameters
-    ----------
-    t : callable
-        The test requiring network connectivity.
-    url : path
-        The url to test via ``pandas.io.common.urlopen`` to check
-        for connectivity. Defaults to 'http://www.google.com'.
-    raise_on_error : bool
-        If True, never catches errors.
-    check_before_test : bool
-        If True, checks connectivity before running the test case.
-    error_classes : tuple or Exception
-        error classes to ignore. If not in ``error_classes``, raises the error.
-        Defaults to IOError. Be careful about changing the error classes here.
-    skip_errnos : iterable of int
-        Any exception that has .errno or .reason.errno set to one
-        of these values will be skipped with an appropriate
-        message.
-    _skip_on_messages : iterable of string
-        any exception e for which one of the strings is
-        a substring of str(e) will be skipped with an appropriate
-        message. Intended to suppress errors where an errno isn't available.
-
-    Notes
-    -----
-    * ``raise_on_error`` supersedes ``check_before_test``
-
-    Returns
-    -------
-    t : callable
-        The decorated test ``t``, with checks for connectivity errors.
-
-    Examples
-    --------
-
-    Tests decorated with @network will fail if it's possible to make a network
-    connection to another URL (defaults to google.com)::
-
-      >>> from pandas.util.testing import network
-      >>> from pandas.io.common import urlopen
-      >>> @network
-      ... def test_network():
-      ...     with urlopen("rabbit://bonanza.com"):
-      ...         pass
-      Traceback
-         ...
-      URLError:
-
-    You can specify alternative URLs::
-
-      >>> @network("http://www.yahoo.com")
-      ... def test_something_with_yahoo():
-      ...     raise IOError("Failure Message")
-      >>> test_something_with_yahoo()
-      Traceback (most recent call last):
-          ...
-      IOError: Failure Message
-
-    If you set check_before_test, it will check the url first and not run the
-    test on failure::
-
-      >>> @network("failing://url.blaher", check_before_test=True)
-      ... def test_something():
-      ...     print("I ran!")
-      ...     raise ValueError("Failure")
-      >>> test_something()
-      Traceback (most recent call last):
-          ...
-
-    Errors not related to networking will always be raised.
- """ - from pytest import skip - - if error_classes is None: - error_classes = _get_default_network_errors() - - t.network = True - - @wraps(t) - def wrapper(*args, **kwargs): - if check_before_test and not raise_on_error: - if not can_connect(url, error_classes): - skip() - try: - return t(*args, **kwargs) - except Exception as err: - errno = getattr(err, "errno", None) - if not errno and hasattr(errno, "reason"): - errno = getattr(err.reason, "errno", None) - - if errno in skip_errnos: - skip(f"Skipping test due to known errno and error {err}") - - e_str = str(err) - - if any(m.lower() in e_str.lower() for m in _skip_on_messages): - skip( - f"Skipping test because exception message is known and error {err}" - ) - - if not isinstance(err, error_classes): - raise - - if raise_on_error or can_connect(url, error_classes): - raise - else: - skip(f"Skipping test due to lack of connectivity and error {err}") - - return wrapper - - -with_connectivity_check = network - - -@contextmanager -def assert_produces_warning( - expected_warning=Warning, - filter_level="always", - clear=None, - check_stacklevel=True, - raise_on_extra_warnings=True, -): - """ - Context manager for running code expected to either raise a specific - warning, or not raise any warnings. Verifies that the code raises the - expected warning, and that it does not raise any other unexpected - warnings. It is basically a wrapper around ``warnings.catch_warnings``. - - Parameters - ---------- - expected_warning : {Warning, False, None}, default Warning - The type of Exception raised. ``exception.Warning`` is the base - class for all warnings. To check that no warning is returned, - specify ``False`` or ``None``. - filter_level : str or None, default "always" - Specifies whether warnings are ignored, displayed, or turned - into errors. - Valid values are: - - * "error" - turns matching warnings into exceptions - * "ignore" - discard the warning - * "always" - always emit a warning - * "default" - print the warning the first time it is generated - from each location - * "module" - print the warning the first time it is generated - from each module - * "once" - print the warning the first time it is generated - - clear : str, default None - If not ``None`` then remove any previously raised warnings from - the ``__warningsregistry__`` to ensure that no warning messages are - suppressed by this context manager. If ``None`` is specified, - the ``__warningsregistry__`` keeps track of which warnings have been - shown, and does not show them again. - check_stacklevel : bool, default True - If True, displays the line that called the function containing - the warning to show were the function is called. Otherwise, the - line that implements the function is displayed. - raise_on_extra_warnings : bool, default True - Whether extra warnings not of the type `expected_warning` should - cause the test to fail. - - Examples - -------- - >>> import warnings - >>> with assert_produces_warning(): - ... warnings.warn(UserWarning()) - ... - >>> with assert_produces_warning(False): - ... warnings.warn(RuntimeWarning()) - ... - Traceback (most recent call last): - ... - AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. - >>> with assert_produces_warning(UserWarning): - ... warnings.warn(RuntimeWarning()) - Traceback (most recent call last): - ... - AssertionError: Did not see expected warning of class 'UserWarning'. - - ..warn:: This is *not* thread-safe. 
- """ - __tracebackhide__ = True - - with warnings.catch_warnings(record=True) as w: - - if clear is not None: - # make sure that we are clearing these warnings - # if they have happened before - # to guarantee that we will catch them - if not is_list_like(clear): - clear = [clear] - for m in clear: - try: - m.__warningregistry__.clear() - except AttributeError: - # module may not have __warningregistry__ - pass - - saw_warning = False - warnings.simplefilter(filter_level) - yield w - extra_warnings = [] - - for actual_warning in w: - if expected_warning and issubclass( - actual_warning.category, expected_warning - ): - saw_warning = True - - if check_stacklevel and issubclass( - actual_warning.category, (FutureWarning, DeprecationWarning) - ): - from inspect import getframeinfo, stack - - caller = getframeinfo(stack()[2][0]) - msg = ( - "Warning not set with correct stacklevel. " - f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" - ) - assert actual_warning.filename == caller.filename, msg - else: - extra_warnings.append( - ( - actual_warning.category.__name__, - actual_warning.message, - actual_warning.filename, - actual_warning.lineno, - ) - ) - if expected_warning: - msg = ( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) - assert saw_warning, msg - if raise_on_extra_warnings and extra_warnings: - raise AssertionError( - f"Caused unexpected warning(s): {repr(extra_warnings)}" - ) - - -class RNGContext: - """ - Context manager to set the numpy random number generator speed. Returns - to the original value upon exiting the context manager. - - Parameters - ---------- - seed : int - Seed for numpy.random.seed - - Examples - -------- - - with RNGContext(42): - np.random.randn() - """ - - def __init__(self, seed): - self.seed = seed - - def __enter__(self): - - self.start_state = np.random.get_state() - np.random.seed(self.seed) - - def __exit__(self, exc_type, exc_value, traceback): - - np.random.set_state(self.start_state) - - -@contextmanager -def with_csv_dialect(name, **kwargs): - """ - Context manager to temporarily register a CSV dialect for parsing CSV. - - Parameters - ---------- - name : str - The name of the dialect. - kwargs : mapping - The parameters for the dialect. - - Raises - ------ - ValueError : the name of the dialect conflicts with a builtin one. - - See Also - -------- - csv : Python's CSV library. - """ - import csv - - _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} - - if name in _BUILTIN_DIALECTS: - raise ValueError("Cannot override builtin dialect.") - - csv.register_dialect(name, **kwargs) - yield - csv.unregister_dialect(name) - - -@contextmanager -def use_numexpr(use, min_elements=None): - from pandas.core.computation import expressions as expr - - if min_elements is None: - min_elements = expr._MIN_ELEMENTS - - olduse = expr._USE_NUMEXPR - oldmin = expr._MIN_ELEMENTS - expr.set_use_numexpr(use) - expr._MIN_ELEMENTS = min_elements - yield - expr._MIN_ELEMENTS = oldmin - expr.set_use_numexpr(olduse) - - -def test_parallel(num_threads=2, kwargs_list=None): - """Decorator to run the same function multiple times in parallel. - - Parameters - ---------- - num_threads : int, optional - The number of times the function is run in parallel. - kwargs_list : list of dicts, optional - The list of kwargs to update original - function kwargs on different threads. - Notes - ----- - This decorator does not pass the return value of the decorated function. 
- - Original from scikit-image: - - https://github.com/scikit-image/scikit-image/pull/1519 - - """ - - assert num_threads > 0 - has_kwargs_list = kwargs_list is not None - if has_kwargs_list: - assert len(kwargs_list) == num_threads - import threading - - def wrapper(func): - @wraps(func) - def inner(*args, **kwargs): - if has_kwargs_list: - update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) - else: - update_kwargs = lambda i: kwargs - threads = [] - for i in range(num_threads): - updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) - threads.append(thread) - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - return inner - - return wrapper - - -class SubclassedSeries(Series): - _metadata = ["testattr", "name"] - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - -class SubclassedDataFrame(DataFrame): - _metadata = ["testattr"] - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - - -class SubclassedCategorical(Categorical): - @property - def _constructor(self): - return SubclassedCategorical - - -@contextmanager -def set_timezone(tz: str): - """ - Context manager for temporarily setting a timezone. - - Parameters - ---------- - tz : str - A string representing a valid timezone. - - Examples - -------- - - >>> from datetime import datetime - >>> from dateutil.tz import tzlocal - >>> tzlocal().tzname(datetime.now()) - 'IST' - - >>> with set_timezone('US/Eastern'): - ... tzlocal().tzname(datetime.now()) - ... - 'EDT' - """ - - import os - import time - - def setTZ(tz): - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() - - orig_tz = os.environ.get("TZ") - setTZ(tz) - try: - yield - finally: - setTZ(orig_tz) - - -def _make_skipna_wrapper(alternative, skipna_alternative=None): - """ - Create a function for calling on an array. - - Parameters - ---------- - alternative : function - The function to be called on the array with no NaNs. - Only used when 'skipna_alternative' is None. - skipna_alternative : function - The function to be called on the original array - - Returns - ------- - function - """ - if skipna_alternative: - - def skipna_wrapper(x): - return skipna_alternative(x.values) - - else: - - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - - return skipna_wrapper - - -def convert_rows_list_to_csv_str(rows_list: List[str]): - """ - Convert list of CSV rows to single CSV-formatted string for current OS. - - This method is used for creating expected value of to_csv() method. - - Parameters - ---------- - rows_list : List[str] - Each element represents the row of csv. - - Returns - ------- - str - Expected output of to_csv() in current OS. - """ - sep = os.linesep - expected = sep.join(rows_list) + sep - return expected diff --git a/requirements-dev.txt b/requirements-dev.txt index 9f18bf767ae56..017e6258d9941 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ # This file is auto-generated from environment.yml, do not modify. -# See that file for comments about the need/usage of each depdendency. +# See that file for comments about the need/usage of each dependency. 
 numpy>=1.15
 python-dateutil>=2.6.1
@@ -36,6 +36,7 @@ moto
 pytest>=5.0.1
 pytest-cov
 pytest-xdist>=1.21
+pytest-asyncio
 seaborn
 statsmodels
 ipywidgets
@@ -45,7 +46,7 @@ pip
 blosc
 bottleneck>=1.2.1
 ipykernel
-ipython>=5.6.0,<=7.10.1
+ipython>=7.11.1
 jinja2
 matplotlib>=2.2.2
 numexpr>=2.6.8
diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py
index 4ca1354321134..3d36eed2d888a 100644
--- a/scripts/download_wheels.py
+++ b/scripts/download_wheels.py
@@ -26,7 +26,7 @@ def fetch(version):
    files = [
        x
        for x in root.xpath("//a/text()")
-        if x.startswith("pandas-{}".format(version)) and not dest.joinpath(x).exists()
+        if x.startswith(f"pandas-{version}") and not dest.joinpath(x).exists()
    ]

    N = len(files)
@@ -35,9 +35,7 @@ def fetch(version):
        out = str(dest.joinpath(filename))
        link = urllib.request.urljoin(base, filename)
        urllib.request.urlretrieve(link, out)
-        print(
-            "Downloaded {link} to {out} [{i}/{N}]".format(link=link, out=out, i=i, N=N)
-        )
+        print(f"Downloaded {link} to {out} [{i}/{N}]")


def main(args=None):
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
index 1d2c33aeee384..53a27e8782ad7 100755
--- a/scripts/generate_pip_deps_from_conda.py
+++ b/scripts/generate_pip_deps_from_conda.py
@@ -92,7 +92,7 @@ def main(conda_fname, pip_fname, compare=False):
    fname = os.path.split(conda_fname)[1]
    header = (
        f"# This file is auto-generated from {fname}, do not modify.\n"
-        "# See that file for comments about the need/usage of each depdendency.\n\n"
+        "# See that file for comments about the need/usage of each dependency.\n\n"
    )
    pip_content = header + "\n".join(pip_deps)

@@ -127,13 +127,13 @@ def main(conda_fname, pip_fname, compare=False):
    )
    if res:
        msg = (
-            "`requirements-dev.txt` has to be generated with `{}` after "
-            "`environment.yml` is modified.\n".format(sys.argv[0])
+            f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after "
+            "`environment.yml` is modified.\n"
        )
        if args.azure:
            msg = (
                "##vso[task.logissue type=error;"
-                "sourcepath=requirements-dev.txt]{}".format(msg)
+                f"sourcepath=requirements-dev.txt]{msg}"
            )
        sys.stderr.write(msg)
        sys.exit(res)
diff --git a/scripts/list_future_warnings.sh b/scripts/list_future_warnings.sh
index 0c4046bbb5f49..121f4f5a92abb 100755
--- a/scripts/list_future_warnings.sh
+++ b/scripts/list_future_warnings.sh
@@ -25,7 +25,7 @@
 EXCLUDE="^pandas/tests/|"  # tests validate that FutureWarnings are raised
 EXCLUDE+="^pandas/util/_decorators.py$|"  # generic deprecate function that raises warning
 EXCLUDE+="^pandas/util/_depr_module.py$|"  # generic deprecate module that raises warnings
-EXCLUDE+="^pandas/util/testing.py$|"  # contains function to evaluate if warning is raised
+EXCLUDE+="^pandas/_testing.py$|"  # contains function to evaluate if warning is raised
 EXCLUDE+="^pandas/io/parsers.py$"  # implements generic deprecation system in io reading

 BASE_DIR="$(dirname $0)/.."
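(Editorial aside, not part of the patch: the EXCLUDE variable above builds a single extended-regex alternation that grep -Ev uses to drop known-noisy paths before reporting FutureWarnings. A minimal, illustrative Python sketch of the same filtering idea; the sample path list is invented and the dots are escaped for strictness:)

    import re

    # Same alternation as EXCLUDE above: any path matching a branch is excluded.
    exclude = re.compile(
        r"^pandas/tests/"
        r"|^pandas/util/_decorators\.py$"
        r"|^pandas/util/_depr_module\.py$"
        r"|^pandas/_testing\.py$"
        r"|^pandas/io/parsers\.py$"
    )

    paths = [
        "pandas/core/frame.py",
        "pandas/tests/frame/test_api.py",
        "pandas/_testing.py",
    ]
    # grep -Ev keeps only the lines that do NOT match:
    print([p for p in paths if not exclude.search(p)])  # ['pandas/core/frame.py']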
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
index 674e3b72884fa..a1bccb1dd1629 100644
--- a/scripts/tests/test_validate_docstrings.py
+++ b/scripts/tests/test_validate_docstrings.py
@@ -1300,7 +1300,7 @@ def test_resolves_class_name(self, name, expected_obj):

    @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"])
    def test_raises_for_invalid_module_name(self, invalid_name):
-        msg = 'No module can be imported from "{}"'.format(invalid_name)
+        msg = f'No module can be imported from "{invalid_name}"'
        with pytest.raises(ImportError, match=msg):
            validate_docstrings.Docstring(invalid_name)

@@ -1310,7 +1310,7 @@ def test_raises_for_invalid_module_name(self, invalid_name):
    def test_raises_for_invalid_attribute_name(self, invalid_name):
        name_components = invalid_name.split(".")
        obj_name, invalid_attr_name = name_components[-2], name_components[-1]
-        msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name)
+        msg = f"'{obj_name}' has no attribute '{invalid_attr_name}'"
        with pytest.raises(AttributeError, match=msg):
            validate_docstrings.Docstring(invalid_name)

diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index b0eeb7b96e0eb..bcf3fd5d276f5 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -15,7 +15,6 @@
 """
 import argparse
 import ast
-import collections
 import doctest
 import functools
 import glob
@@ -286,7 +285,7 @@ def _load_obj(name):
            continue

    if "obj" not in locals():
-        raise ImportError("No module can be imported " 'from "{}"'.format(name))
+        raise ImportError(f'No module can be imported from "{name}"')

    for part in func_parts:
        obj = getattr(obj, part)
@@ -358,7 +357,7 @@ def source_file_def_line(self):
    @property
    def github_url(self):
        url = "https://github.com/pandas-dev/pandas/blob/master/"
-        url += "{}#L{}".format(self.source_file_name, self.source_file_def_line)
+        url += f"{self.source_file_name}#L{self.source_file_def_line}"
        return url

    @property
@@ -422,7 +421,7 @@ def needs_summary(self):

    @property
    def doc_parameters(self):
-        parameters = collections.OrderedDict()
+        parameters = {}
        for names, type_, desc in self.doc["Parameters"]:
            for name in names.split(", "):
                parameters[name] = (type_, "".join(desc))
@@ -502,7 +501,7 @@ def parameter_desc(self, param):
        desc = self.doc_parameters[param][1]
        # Find and strip out any sphinx directives
        for directive in DIRECTIVES:
-            full_directive = ".. {}".format(directive)
+            full_directive = f".. {directive}"
            if full_directive in desc:
                # Only retain any description before the directive
                desc = desc[: desc.index(full_directive)]
@@ -510,7 +509,7 @@

    @property
    def see_also(self):
-        result = collections.OrderedDict()
+        result = {}
        for funcs, desc in self.doc["See Also"]:
            for func, _ in funcs:
                result[func] = "".join(desc)
@@ -826,14 +825,12 @@ def get_validation_data(doc):
                    "EX03",
                    error_code=err.error_code,
                    error_message=err.message,
-                    times_happening=" ({} times)".format(err.count)
-                    if err.count > 1
-                    else "",
+                    times_happening=f" ({err.count} times)" if err.count > 1 else "",
                )
            )
    examples_source_code = "".join(doc.examples_source_code)
    for wrong_import in ("numpy", "pandas"):
-        if "import {}".format(wrong_import) in examples_source_code:
+        if f"import {wrong_import}" in examples_source_code:
            errs.append(error("EX04", imported_library=wrong_import))

    return errs, wrns, examples_errs
@@ -921,7 +918,7 @@ def validate_all(prefix, ignore_deprecated=False):
    api_item_names = set(list(zip(*api_items))[0])
    for class_ in (pandas.Series, pandas.DataFrame):
        for member in inspect.getmembers(class_):
-            func_name = "pandas.{}.{}".format(class_.__name__, member[0])
+            func_name = f"pandas.{class_.__name__}.{member[0]}"
            if not member[0].startswith("_") and func_name not in api_item_names:
                if prefix and not func_name.startswith(prefix):
                    continue
@@ -939,13 +936,9 @@ def header(title, width=80, char="#"):
        full_line = char * width
        side_len = (width - len(title) - 2) // 2
        adj = "" if len(title) % 2 == 0 else " "
-        title_line = "{side} {title}{adj} {side}".format(
-            side=char * side_len, title=title, adj=adj
-        )
+        title_line = f"{char * side_len} {title}{adj} {char * side_len}"

-        return "\n{full_line}\n{title_line}\n{full_line}\n\n".format(
-            full_line=full_line, title_line=title_line
-        )
+        return f"\n{full_line}\n{title_line}\n{full_line}\n\n"

    exit_status = 0
    if func_name is None:
@@ -987,24 +980,24 @@ def header(title, width=80, char="#"):
    else:
        result = validate_one(func_name)

-        sys.stderr.write(header("Docstring ({})".format(func_name)))
-        sys.stderr.write("{}\n".format(result["docstring"]))
+        sys.stderr.write(header(f"Docstring ({func_name})"))
+        sys.stderr.write(f"{result['docstring']}\n")

        sys.stderr.write(header("Validation"))
        if result["errors"]:
-            sys.stderr.write("{} Errors found:\n".format(len(result["errors"])))
+            sys.stderr.write(f"{len(result['errors'])} Errors found:\n")
            for err_code, err_desc in result["errors"]:
                # Failing examples are printed at the end
                if err_code == "EX02":
                    sys.stderr.write("\tExamples do not pass tests\n")
                    continue
-                sys.stderr.write("\t{}\n".format(err_desc))
+                sys.stderr.write(f"\t{err_desc}\n")
        if result["warnings"]:
-            sys.stderr.write("{} Warnings found:\n".format(len(result["warnings"])))
+            sys.stderr.write(f"{len(result['warnings'])} Warnings found:\n")
            for wrn_code, wrn_desc in result["warnings"]:
-                sys.stderr.write("\t{}\n".format(wrn_desc))
+                sys.stderr.write(f"\t{wrn_desc}\n")

        if not result["errors"]:
-            sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name))
+            sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n')

    if result["examples_errors"]:
        sys.stderr.write(header("Doctests"))
@@ -1028,7 +1021,7 @@ def header(title, width=80, char="#"):
        choices=format_opts,
        help="format of the output when validating "
        "multiple docstrings (ignored when validating one)."
- "It can be {}".format(str(format_opts)[1:-1]), + f"It can be {str(format_opts)[1:-1]}", ) argparser.add_argument( "--prefix", diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py new file mode 100755 index 0000000000000..3feeddaabe8d2 --- /dev/null +++ b/scripts/validate_string_concatenation.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" +GH #30454 + +Check where there is a string that needs to be concatenated. + +This is necessary after black formating, +where for example black transforms this: + +>>> foo = ( +... "bar " +... "baz" +... ) + +into this: + +>>> foo = ("bar " "baz") + +Black is not considering this as an +issue (see issue https://github.com/psf/black/issues/1051), +so we are checking it here. +""" + +import argparse +import os +import sys +import token +import tokenize +from typing import Generator, List, Tuple + +FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") + + +def main(source_path: str, output_format: str) -> bool: + """ + Main entry point of the script. + + Parameters + ---------- + source_path : str + Source path representing path to a file/directory. + output_format : str + Output format of the script. + + Returns + ------- + bool + True if found any strings that needs to be concatenated. + + Raises + ------ + ValueError + If the `source_path` is not pointing to existing file/directory. + """ + if not os.path.exists(source_path): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + + is_failed: bool = False + + msg = "String unnecessarily split in two by black. Please merge them manually." + + if os.path.isfile(source_path): + for source_path, line_number in strings_to_concatenate(source_path): + is_failed = True + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) + + for subdir, _, files in os.walk(source_path): + for file_name in files: + if any( + file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK + ): + for source_path, line_number in strings_to_concatenate( + os.path.join(subdir, file_name) + ): + is_failed = True + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) + return is_failed + + +def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]: + """ + Yielding the strings that needs to be concatenated in a given file. + + Parameters + ---------- + source_path : str + File path pointing to a single file. + + Yields + ------ + source_path : str + Source file path. + line_number : int + Line number of unconcatenated string. + """ + with open(source_path, "r") as file_name: + tokens: List = list(tokenize.generate_tokens(file_name.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token[0] == next_token[0] == token.STRING: + yield source_path, current_token[2][0] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Validate concatenated strings") + + parser.add_argument( + "path", nargs="?", default=".", help="Source path of file/directory to check." 
+    )
+    parser.add_argument(
+        "--format",
+        "-f",
+        default="{source_path}:{line_number}:{msg}",
+        help="Output format of the unconcatenated strings.",
+    )
+
+    args = parser.parse_args()
+
+    sys.exit(main(source_path=args.path, output_format=args.format))
diff --git a/setup.cfg b/setup.cfg
index 8fb602188dad5..d0570cee6fe10 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -66,6 +66,7 @@ xfail_strict = True
 filterwarnings =
     error:Sparse:FutureWarning
     error:The SparseArray:FutureWarning
+junit_family=xunit2

 [coverage:run]
 branch = False
@@ -109,7 +110,7 @@ known_dtypes = pandas.core.dtypes
 known_post_core = pandas.tseries,pandas.io,pandas.plotting
 sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER
 known_first_party = pandas
-known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml
+known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,odf
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -123,6 +124,7 @@ skip = pandas/__init__.py,pandas/core/api.py
 ignore_missing_imports=True
 no_implicit_optional=True
 check_untyped_defs=True
+strict_equality=True

 [mypy-pandas.tests.*]
 check_untyped_defs=False
@@ -151,15 +153,9 @@ ignore_errors=True
 [mypy-pandas._version]
 check_untyped_defs=False

-[mypy-pandas.core.arrays.boolean]
-check_untyped_defs=False
-
 [mypy-pandas.core.arrays.categorical]
 check_untyped_defs=False

-[mypy-pandas.core.arrays.integer]
-check_untyped_defs=False
-
 [mypy-pandas.core.arrays.interval]
 check_untyped_defs=False

@@ -169,12 +165,6 @@ check_untyped_defs=False
 [mypy-pandas.core.base]
 check_untyped_defs=False

-[mypy-pandas.core.computation.align]
-check_untyped_defs=False
-
-[mypy-pandas.core.computation.eval]
-check_untyped_defs=False
-
 [mypy-pandas.core.computation.expr]
 check_untyped_defs=False

@@ -190,15 +180,9 @@ check_untyped_defs=False
 [mypy-pandas.core.computation.scope]
 check_untyped_defs=False

-[mypy-pandas.core.config_init]
-check_untyped_defs=False
-
 [mypy-pandas.core.dtypes.cast]
 check_untyped_defs=False

-[mypy-pandas.core.dtypes.generic]
-check_untyped_defs=False
-
 [mypy-pandas.core.frame]
 check_untyped_defs=False

@@ -217,9 +201,6 @@ check_untyped_defs=False
 [mypy-pandas.core.indexes.base]
 check_untyped_defs=False

-[mypy-pandas.core.indexes.category]
-check_untyped_defs=False
-
 [mypy-pandas.core.indexes.datetimelike]
 check_untyped_defs=False

@@ -232,9 +213,6 @@ check_untyped_defs=False
 [mypy-pandas.core.indexes.multi]
 check_untyped_defs=False

-[mypy-pandas.core.indexes.timedeltas]
-check_untyped_defs=False
-
 [mypy-pandas.core.indexing]
 check_untyped_defs=False

@@ -268,9 +246,6 @@ check_untyped_defs=False
 [mypy-pandas.core.reshape.reshape]
 check_untyped_defs=False

-[mypy-pandas.core.series]
-check_untyped_defs=False
-
 [mypy-pandas.core.strings]
 check_untyped_defs=False

@@ -325,9 +300,6 @@ check_untyped_defs=False
 [mypy-pandas.io.json._json]
 check_untyped_defs=False

-[mypy-pandas.io.json._normalize]
-check_untyped_defs=False
-
 [mypy-pandas.io.json._table_schema]
 check_untyped_defs=False

@@ -346,9 +318,6 @@ check_untyped_defs=False
 [mypy-pandas.io.sas.sasreader]
 check_untyped_defs=False

-[mypy-pandas.io.sql]
-check_untyped_defs=False
-
 [mypy-pandas.io.stata]
 check_untyped_defs=False

@@ -361,14 +330,11 @@ check_untyped_defs=False
 [mypy-pandas.plotting._matplotlib.misc]
 check_untyped_defs=False

-[mypy-pandas.plotting._matplotlib.timeseries]
-check_untyped_defs=False
-
 [mypy-pandas.tseries.holiday]
 check_untyped_defs=False

 [mypy-pandas.tseries.offsets]
 check_untyped_defs=False

-[mypy-pandas.util.testing]
+[mypy-pandas._testing]
 check_untyped_defs=False
diff --git a/setup.py b/setup.py
index af70ee3b30095..c33ce063cb4d9 100755
--- a/setup.py
+++ b/setup.py
@@ -49,11 +49,12 @@ def is_platform_mac():
 try:
     import Cython

-    ver = Cython.__version__
+    _CYTHON_VERSION = Cython.__version__
     from Cython.Build import cythonize

-    _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver)
+    _CYTHON_INSTALLED = _CYTHON_VERSION >= LooseVersion(min_cython_ver)
 except ImportError:
+    _CYTHON_VERSION = None
     _CYTHON_INSTALLED = False
     cythonize = lambda x, *args, **kwargs: x  # dummy func

@@ -506,6 +507,11 @@ def maybe_cythonize(extensions, *args, **kwargs):
     elif not cython:
         # GH#28836 raise a helpful error message
+        if _CYTHON_VERSION:
+            raise RuntimeError(
+                f"Cannot cythonize with old Cython version ({_CYTHON_VERSION} "
+                f"installed, needs {min_cython_ver})"
+            )
         raise RuntimeError("Cannot cythonize without Cython installed.")

     numpy_incl = pkg_resources.resource_filename("numpy", "core/include")
@@ -526,6 +532,11 @@ def maybe_cythonize(extensions, *args, **kwargs):
     elif parsed.j:
         nthreads = parsed.j

+    # GH#30356 Cythonize doesn't support parallel on Windows
+    if is_platform_windows() and nthreads > 0:
+        print("Parallel build for cythonize ignored on Windows")
+        nthreads = 0
+
     kwargs["nthreads"] = nthreads
     build_ext.render_templates(_pxifiles)
     return cythonize(extensions, *args, **kwargs)
@@ -591,6 +602,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
     },
     "_libs.reduction": {"pyxfile": "_libs/reduction"},
     "_libs.ops": {"pyxfile": "_libs/ops"},
+    "_libs.ops_dispatch": {"pyxfile": "_libs/ops_dispatch"},
     "_libs.properties": {"pyxfile": "_libs/properties"},
     "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []},
     "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]},
diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html
index 120058afd1190..92126a7b5a2f2 100644
--- a/web/pandas/_templates/layout.html
+++ b/web/pandas/_templates/layout.html
@@ -84,11 +84,6 @@
[unrecoverable HTML residue: this layout.html hunk removes a five-line footer list item; the surviving context line is the footer entry "pandas is a fiscally sponsored project of NumFOCUS"]
diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md
index 8a5c2735b3d93..35a6b3361f32e 100644
--- a/web/pandas/about/roadmap.md
+++ b/web/pandas/about/roadmap.md
@@ -134,19 +134,6 @@ pandas documentation. Some specific goals include
 subsections of the documentation to make navigation and finding
 content easier.

-## Package docstring validation
-
-To improve the quality and consistency of pandas docstrings, we've
-developed tooling to check docstrings in a variety of ways.
-[scripts/validate_docstrings.py](https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py)
-contains the checks.
-
-Like many other projects, pandas uses the
-[numpydoc](https://numpydoc.readthedocs.io/en/latest/) style for writing
-docstrings. With the collaboration of the numpydoc maintainers, we'd
-like to move the checks to a package other than pandas so that other
-projects can easily use them as well.
-
 ## Performance monitoring

 Pandas uses [airspeed velocity](https://asv.readthedocs.io/en/stable/)
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index e2a95a5039884..d1fb7ba0f7b86 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -50,8 +50,6 @@ navbar:
     target: /community/blog.html
   - name: "Ask a question (StackOverflow)"
     target: https://stackoverflow.com/questions/tagged/pandas
-  - name: "Discuss"
-    target: https://pandas.discourse.group
   - name: "Code of conduct"
     target: /community/coc.html
   - name: "Ecosystem"
diff --git a/web/pandas/index.html b/web/pandas/index.html
index df6e5ab9a330b..5aac5da16295b 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -35,7 +35,7 @@
[unrecoverable HTML residue: this index.html hunk updates the "Documentation" link from http://pandas.pydata.org to https://pandas.pydata.org]
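(Editorial aside, not part of the patch: the new scripts/validate_string_concatenation.py above detects implicit concatenation by scanning for two consecutive STRING tokens. A self-contained sketch of the same stdlib tokenize technique, run on an invented in-memory snippet rather than a file:)

    import io
    import token
    import tokenize

    # A snippet of the kind black produces: two adjacent string literals.
    snippet = 'foo = ("bar "  "baz")\n'

    tokens = list(tokenize.generate_tokens(io.StringIO(snippet).readline))
    for current, following in zip(tokens, tokens[1:]):
        if current.type == token.STRING and following.type == token.STRING:
            # current.start is a (row, col) pair; row corresponds to
            # current_token[2][0] in the script above.
            print(f"line {current.start[0]}: adjacent string literals should be merged")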