Merge remote-tracking branch 'upstream/master' into bug/categorical-i…

…ndexing-1row-df
keechongtan · Nov 26, 2019 · 2b71592 · 2b71592
2 parents 39c95f4 + db60ab6
commit 2b71592
Show file tree

Hide file tree

Showing 65 changed files with 626 additions and 876 deletions.
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -14,21 +14,6 @@
         pass
 
 
-class Concat:
-    def setup(self):
-        N = 10 ** 5
-        self.s = pd.Series(list("aabbcd") * N).astype("category")
-
-        self.a = pd.Categorical(list("aabbcd") * N)
-        self.b = pd.Categorical(list("bbcdjk") * N)
-
-    def time_concat(self):
-        pd.concat([self.s, self.s])
-
-    def time_union(self):
-        union_categoricals([self.a, self.b])
-
-
 class Constructor:
     def setup(self):
         N = 10 ** 5
@@ -77,6 +62,33 @@ def time_existing_series(self):
         pd.Categorical(self.series)
 
 
+class CategoricalOps:
+    params = ["__lt__", "__le__", "__eq__", "__ne__", "__ge__", "__gt__"]
+    param_names = ["op"]
+
+    def setup(self, op):
+        N = 10 ** 5
+        self.cat = pd.Categorical(list("aabbcd") * N, ordered=True)
+
+    def time_categorical_op(self, op):
+        getattr(self.cat, op)("b")
+
+
+class Concat:
+    def setup(self):
+        N = 10 ** 5
+        self.s = pd.Series(list("aabbcd") * N).astype("category")
+
+        self.a = pd.Categorical(list("aabbcd") * N)
+        self.b = pd.Categorical(list("bbcdjk") * N)
+
+    def time_concat(self):
+        pd.concat([self.s, self.s])
+
+    def time_union(self):
+        union_categoricals([self.a, self.b])
+
+
 class ValueCounts:
 
     params = [True, False]

diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml
@@ -44,16 +44,13 @@ jobs:
           PATTERN: "not slow and not network"
           LOCALE_OVERRIDE: "zh_CN.UTF-8"
 
-        # https://github.com/pandas-dev/pandas/issues/29432
-        # py37_np_dev:
-        #   ENV_FILE: ci/deps/azure-37-numpydev.yaml
-        #   CONDA_PY: "37"
-        #   PATTERN: "not slow and not network"
-        #   TEST_ARGS: "-W error"
-        #   PANDAS_TESTING_MODE: "deprecate"
-        #   EXTRA_APT: "xsel"
-        #   # TODO:
-        #   continueOnError: true
+        py37_np_dev:
+          ENV_FILE: ci/deps/azure-37-numpydev.yaml
+          CONDA_PY: "37"
+          PATTERN: "not slow and not network"
+          TEST_ARGS: "-W error"
+          PANDAS_TESTING_MODE: "deprecate"
+          EXTRA_APT: "xsel"
 
   steps:
     - script: |

diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml
@@ -20,9 +20,9 @@ dependencies:
   - matplotlib=2.2.3
   - nomkl
   - numexpr
-  - numpy=1.13.3
+  - numpy=1.14
   - openpyxl
-  - pyarrow
+  - pyarrow>=0.12.0
   - pytables
   - python-dateutil==2.6.1
   - pytz

diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml
@@ -20,7 +20,7 @@ dependencies:
   - numexpr
   - numpy=1.15.*
   - openpyxl
-  - pyarrow
+  - pyarrow>=0.12.0
   - pytables
   - python-dateutil
   - pytz

diff --git a/doc/redirects.csv b/doc/redirects.csv
@@ -828,7 +828,6 @@ generated/pandas.MultiIndex.sortlevel,../reference/api/pandas.MultiIndex.sortlev
 generated/pandas.MultiIndex.swaplevel,../reference/api/pandas.MultiIndex.swaplevel
 generated/pandas.MultiIndex.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index
 generated/pandas.MultiIndex.to_frame,../reference/api/pandas.MultiIndex.to_frame
-generated/pandas.MultiIndex.to_hierarchical,../reference/api/pandas.MultiIndex.to_hierarchical
 generated/pandas.notna,../reference/api/pandas.notna
 generated/pandas.notnull,../reference/api/pandas.notnull
 generated/pandas.option_context,../reference/api/pandas.option_context

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -258,7 +258,7 @@ matplotlib                2.2.2              Visualization
 openpyxl                  2.4.8              Reading / writing for xlsx files
 pandas-gbq                0.8.0              Google Big Query access
 psycopg2                                     PostgreSQL engine for sqlalchemy
-pyarrow                   0.9.0              Parquet and feather reading / writing
+pyarrow                   0.12.0             Parquet and feather reading / writing
 pymysql                   0.7.11             MySQL engine for sqlalchemy
 pyreadstat                                   SPSS files (.sav) reading
 pytables                  3.4.2              HDF5 reading / writing

diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
@@ -305,7 +305,6 @@ MultiIndex components
 
    MultiIndex.set_levels
    MultiIndex.set_codes
-   MultiIndex.to_hierarchical
    MultiIndex.to_flat_index
    MultiIndex.to_frame
    MultiIndex.is_lexsorted

diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
@@ -93,9 +93,9 @@ Use efficient datatypes
 -----------------------
 
 The default pandas data types are not the most memory efficient. This is
-especially true for high-cardinality text data (columns with relatively few
-unique values). By using more efficient data types you can store larger datasets
-in memory.
+especially true for text data columns with relatively few unique values (commonly
+referred to as "low-cardinality" data). By using more efficient data types you
+can store larger datasets in memory.
 
 .. ipython:: python
 

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -265,62 +265,62 @@ The following methods now also correctly output values for unobserved categories
 Increased minimum versions for dependencies
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Some minimum supported versions of dependencies were updated (:issue:`29723`).
+Some minimum supported versions of dependencies were updated (:issue:`29766`, :issue:`29723`).
 If installed, we now require:
 
-+-----------------+-----------------+----------+
-| Package         | Minimum Version | Required |
-+=================+=================+==========+
-| numpy           | 1.13.3          |    X     |
-+-----------------+-----------------+----------+
-| pytz            | 2015.4          |    X     |
-+-----------------+-----------------+----------+
-| python-dateutil | 2.6.1           |    X     |
-+-----------------+-----------------+----------+
-| bottleneck      | 1.2.1           |          |
-+-----------------+-----------------+----------+
-| numexpr         | 2.6.2           |          |
-+-----------------+-----------------+----------+
-| pytest (dev)    | 4.0.2           |          |
-+-----------------+-----------------+----------+
++-----------------+-----------------+----------+---------+
+| Package         | Minimum Version | Required | Changed |
++=================+=================+==========+=========+
+| numpy           | 1.13.3          |    X     |         |
++-----------------+-----------------+----------+---------+
+| pytz            | 2015.4          |    X     |         |
++-----------------+-----------------+----------+---------+
+| python-dateutil | 2.6.1           |    X     |         |
++-----------------+-----------------+----------+---------+
+| bottleneck      | 1.2.1           |          |         |
++-----------------+-----------------+----------+---------+
+| numexpr         | 2.6.2           |          |         |
++-----------------+-----------------+----------+---------+
+| pytest (dev)    | 4.0.2           |          |         |
++-----------------+-----------------+----------+---------+
 
 For `optional libraries <https://dev.pandas.io/docs/install.html#dependencies>`_ the general recommendation is to use the latest version.
 The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
 Optional libraries below the lowest tested version may still work, but are not considered supported.
 
-+-----------------+-----------------+
-| Package         | Minimum Version |
-+=================+=================+
-| beautifulsoup4  | 4.6.0           |
-+-----------------+-----------------+
-| fastparquet     | 0.3.2           |
-+-----------------+-----------------+
-| gcsfs           | 0.2.2           |
-+-----------------+-----------------+
-| lxml            | 3.8.0           |
-+-----------------+-----------------+
-| matplotlib      | 2.2.2           |
-+-----------------+-----------------+
-| openpyxl        | 2.4.8           |
-+-----------------+-----------------+
-| pyarrow         | 0.9.0           |
-+-----------------+-----------------+
-| pymysql         | 0.7.1           |
-+-----------------+-----------------+
-| pytables        | 3.4.2           |
-+-----------------+-----------------+
-| scipy           | 0.19.0          |
-+-----------------+-----------------+
-| sqlalchemy      | 1.1.4           |
-+-----------------+-----------------+
-| xarray          | 0.8.2           |
-+-----------------+-----------------+
-| xlrd            | 1.1.0           |
-+-----------------+-----------------+
-| xlsxwriter      | 0.9.8           |
-+-----------------+-----------------+
-| xlwt            | 1.2.0           |
-+-----------------+-----------------+
++-----------------+-----------------+---------+
+| Package         | Minimum Version | Changed |
++=================+=================+=========+
+| beautifulsoup4  | 4.6.0           |         |
++-----------------+-----------------+---------+
+| fastparquet     | 0.3.2           |    X    |
++-----------------+-----------------+---------+
+| gcsfs           | 0.2.2           |         |
++-----------------+-----------------+---------+
+| lxml            | 3.8.0           |         |
++-----------------+-----------------+---------+
+| matplotlib      | 2.2.2           |         |
++-----------------+-----------------+---------+
+| openpyxl        | 2.4.8           |         |
++-----------------+-----------------+---------+
+| pyarrow         | 0.12.0          |    X    |
++-----------------+-----------------+---------+
+| pymysql         | 0.7.1           |         |
++-----------------+-----------------+---------+
+| pytables        | 3.4.2           |         |
++-----------------+-----------------+---------+
+| scipy           | 0.19.0          |         |
++-----------------+-----------------+---------+
+| sqlalchemy      | 1.1.4           |         |
++-----------------+-----------------+---------+
+| xarray          | 0.8.2           |         |
++-----------------+-----------------+---------+
+| xlrd            | 1.1.0           |         |
++-----------------+-----------------+---------+
+| xlsxwriter      | 0.9.8           |         |
++-----------------+-----------------+---------+
+| xlwt            | 1.2.0           |         |
++-----------------+-----------------+---------+
 
 See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
 
@@ -364,7 +364,7 @@ Deprecations
   value in ``idx`` of ``idx_val`` and a new value of ``val``, ``idx.set_value(arr, idx_val, val)``
   is equivalent to ``arr[idx.get_loc(idx_val)] = val``, which should be used instead (:issue:`28621`).
 - :func:`is_extension_type` is deprecated, :func:`is_extension_array_dtype` should be used instead (:issue:`29457`)
-
+- :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`)
 
 .. _whatsnew_1000.prior_deprecations:
 
@@ -401,10 +401,12 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 
 **Other removals**
 
+- Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`)
 - Removed the previously deprecated :meth:`Index.summary` (:issue:`18217`)
 - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`)
 - Changed the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`)
 - Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`)
+- A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`)
 - Removed :meth:`Series.from_array` (:issue:`18258`)
 - Removed :meth:`DataFrame.from_items` (:issue:`18458`)
 - Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`)
@@ -415,6 +417,11 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 - :func:`core.internals.blocks.make_block` no longer accepts the "fastpath" keyword (:issue:`19265`)
 - :meth:`Block.make_block_same_class` no longer accepts the "dtype" keyword (:issue:`19434`)
 - Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`)
+- Removed the previously deprecated :meth:`MultiIndex.to_hierarchical` (:issue:`21613`)
+- Removed the previously deprecated :attr:`MultiIndex.labels`, use :attr:`MultiIndex.codes` instead (:issue:`23752`)
+- Removed the previously deprecated "labels" keyword from the :class:`MultiIndex` constructor, use "codes" instead (:issue:`23752`)
+- Removed the previously deprecated :meth:`MultiIndex.set_labels`, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`)
+- Removed the previously deprecated "labels" keyword from :meth:`MultiIndex.set_codes`, :meth:`MultiIndex.copy`, :meth:`MultiIndex.drop`, use "codes" instead (:issue:`23752`)
 - Removed support for legacy HDF5 formats (:issue:`29787`)
 - :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`)
 - :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`)
@@ -434,11 +441,17 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
 - Removed the previously deprecated :meth:`DataFrame.get_ftype_counts`, :meth:`Series.get_ftype_counts` (:issue:`18243`)
 - Removed the previously deprecated :meth:`Index.get_duplicated`, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`)
 - Removed the previously deprecated :meth:`Series.clip_upper`, :meth:`Series.clip_lower`, :meth:`DataFrame.clip_upper`, :meth:`DataFrame.clip_lower` (:issue:`24203`)
+- Removed the ability to alter :attr:`DatetimeIndex.freq`, :attr:`TimedeltaIndex.freq`, or :attr:`PeriodIndex.freq` (:issue:`20772`)
+- Removed the previously deprecated :attr:`DatetimeIndex.offset` (:issue:`20730`)
+- Removed the previously deprecated :meth:`DatetimeIndex.asobject`, :meth:`TimedeltaIndex.asobject`, :meth:`PeriodIndex.asobject`, use ``astype(object)`` instead (:issue:`29801`)
 - Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`)
 - Removed previously deprecated "v" argument from :meth:`FrozenNDarray.searchsorted`, use "value" instead (:issue:`22672`)
 - :func:`read_stata` and :meth:`DataFrame.to_stata` no longer supports the "encoding" argument (:issue:`21400`)
+- In :func:`concat` the default value for ``sort`` has been changed from ``None`` to ``False`` (:issue:`20613`)
 - Removed previously deprecated "raise_conflict" argument from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`)
 - Removed previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`)
+- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() <pandas.core.window.Rolling.apply>`, :func:`DataFrame.rolling().apply() <pandas.core.window.Rolling.apply>`,
+  :func:`Series.expanding().apply() <pandas.core.window.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <pandas.core.window.Expanding.apply>` to ``False`` (:issue:`20584`)
 -
 
 .. _whatsnew_1000.performance:
@@ -453,7 +466,9 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`)
 - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`)
 - Performance improvement in :meth:`Categorical.searchsorted` and  :meth:`CategoricalIndex.searchsorted` (:issue:`28795`)
-- Performance improvement when comparing a :meth:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`)
+- Performance improvement when comparing a :class:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`)
+- Performance improvement when checking if values in a :class:`Categorical` are equal to, greater than or equal to, or greater than a given scalar.
+  The improvement is not present when checking if the :class:`Categorical` is less than, or less than or equal to, the scalar (:issue:`29820`)
 
 .. _whatsnew_1000.bug_fixes:
 
@@ -549,6 +564,7 @@ Indexing
 - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
 - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`)
 - :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
+- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
 - Bug in :meth:`DataFrame.loc` with incorrect dtype when setting Categorical value in 1-row DataFrame (:issue:`25495`)
 
 Missing
@@ -664,4 +680,4 @@ Other
 .. _whatsnew_1000.contributors:
 
 Contributors
-~~~~~~~~~~~~
+~~~~~~~~~~~~
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -141,8 +141,12 @@ cdef class IndexEngine:
 
         if self.is_monotonic_increasing:
             values = self._get_index_values()
-            left = values.searchsorted(val, side='left')
-            right = values.searchsorted(val, side='right')
+            try:
+                left = values.searchsorted(val, side='left')
+                right = values.searchsorted(val, side='right')
+            except TypeError:
+                # e.g. GH#29189 get_loc(None) with a Float64Index
+                raise KeyError(val)
 
             diff = right - left
             if diff == 0:

diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
@@ -2201,7 +2201,7 @@ cdef class _Period:
         return self.days_in_month
 
     @property
-    def is_leap_year(self):
+    def is_leap_year(self) -> bool:
         return bool(is_leapyear(self.year))
 
     @classmethod

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1509,18 +1509,8 @@ class Timedelta(_Timedelta):
             if other.dtype.kind == 'm':
                 # also timedelta-like
                 return _broadcast_floordiv_td64(self.value, other, _rfloordiv)
-            elif other.dtype.kind == 'i':
-                # Backwards compatibility
-                # GH-19761
-                msg = textwrap.dedent("""\
-                Floor division between integer array and Timedelta is
-                deprecated. Use 'array // timedelta.value' instead.
-                If you want to obtain epochs from an array of timestamps,
-                you can rather use
-                '(array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'.
-                """)
-                warnings.warn(msg, FutureWarning)
-                return other // self.value
+
+            # Includes integer array // Timedelta, deprecated in GH#19761
             raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__')
 
         elif is_float_object(other) and util.is_nan(other):