From 9185af206b1e9e07d3b6f5a556b32862624fb100 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Mar 2019 09:03:13 +0100 Subject: [PATCH 01/22] DOC: clean-up of 0.24.2 whatsnew file (#25660) --- doc/source/whatsnew/v0.24.2.rst | 44 +-------------------------------- 1 file changed, 1 insertion(+), 43 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 8da33a46e79c6..0af2427ead512 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -18,7 +18,7 @@ including other versions of pandas. .. _whatsnew_0242.regressions: Fixed Regressions -^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.all` and :meth:`DataFrame.any` where ``bool_only=True`` was ignored (:issue:`25101`) - Fixed issue in ``DataFrame`` construction with passing a mixed list of mixed types could segfault. (:issue:`25075`) @@ -34,68 +34,26 @@ Fixed Regressions - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) -.. _whatsnew_0242.enhancements: - -Enhancements -^^^^^^^^^^^^ - -- -- - .. 
_whatsnew_0242.bug_fixes: Bug Fixes ~~~~~~~~~ -**Conversion** - -- -- -- - -**Indexing** - -- -- -- - **I/O** - Better handling of terminal printing when the terminal dimensions are not known (:issue:`25080`) - Bug in reading a HDF5 table-format ``DataFrame`` created in Python 2, in Python 3 (:issue:`24925`) - Bug in reading a JSON with ``orient='table'`` generated by :meth:`DataFrame.to_json` with ``index=False`` (:issue:`25170`) - Bug where float indexes could have misaligned values when printing (:issue:`25061`) -- - -**Categorical** - -- -- -- - -**Timezones** - -- -- -- - -**Timedelta** - -- -- -- **Reshaping** - Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) -- **Visualization** - Bug in :meth:`Series.plot` where a secondary y axis could not be set to log scale (:issue:`25545`) -- -- **Other** From bace4d04fe98bd20df77ee4192518c90fc610810 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Mar 2019 13:43:01 +0100 Subject: [PATCH 02/22] REGR: to_timedelta precision issues with floating data (#25651) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/_libs/tslibs/timedeltas.pyx | 19 ++++++++++++++++--- pandas/core/arrays/timedeltas.py | 15 +++++++++------ pandas/tests/indexes/timedeltas/test_tools.py | 7 +++++++ 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 0af2427ead512..09deb69d3b39e 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -31,6 +31,7 @@ Fixed Regressions - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. 
(:issue:`25338`) - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) +- Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 37aa05659b70f..5918c7963acf7 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -246,9 +246,11 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): return iresult.base # .base to access underlying np.ndarray -cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: - """ return a casting of the unit represented to nanoseconds - round the fractional part of a float to our precision, p """ +cpdef inline object precision_from_unit(object unit): + """ + Return a casting of the unit represented to nanoseconds + the precision + to round the fractional part. + """ cdef: int64_t m int p @@ -285,6 +287,17 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: p = 0 else: raise ValueError("cannot cast unit {unit}".format(unit=unit)) + return m, p + + +cdef inline int64_t cast_from_unit(object ts, object unit) except? 
-1: + """ return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p """ + cdef: + int64_t m + int p + + m, p = precision_from_unit(unit) # just give me the unit back if ts is None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 74fe8072e6924..1badb476085bf 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( - array_to_timedelta64, parse_timedelta_unit) + array_to_timedelta64, parse_timedelta_unit, precision_from_unit) import pandas.compat as compat from pandas.util._decorators import Appender @@ -918,12 +918,15 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): copy = copy and not copy_made elif is_float_dtype(data.dtype): - # treat as multiples of the given unit. If after converting to nanos, - # there are fractional components left, these are truncated - # (i.e. 
NOT rounded) + # cast the unit, multiply base/frace separately + # to avoid precision issues from float -> int mask = np.isnan(data) - coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns') - data = (coeff * data).astype(np.int64).view('timedelta64[ns]') + m, p = precision_from_unit(unit) + base = data.astype(np.int64) + frac = data - base + if p: + frac = np.round(frac, p) + data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') data[mask] = iNaT copy = False diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 58482a174dfd1..819184d4b14f3 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -181,3 +181,10 @@ def test_to_timedelta_on_missing_values(self): actual = pd.to_timedelta(pd.NaT) assert actual.value == timedelta_NaT.astype('int64') + + def test_to_timedelta_float(self): + # https://github.com/pandas-dev/pandas/issues/25077 + arr = np.arange(0, 1, 1e-6)[-10:] + result = pd.to_timedelta(arr, unit='s') + expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype='int64') + tm.assert_numpy_array_equal(result.asi8, expected_asi8) From 848e262823f4ea3c28ee4abf493437a6b87f16e7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 12 Mar 2019 12:44:37 +0000 Subject: [PATCH 03/22] CLN: remove Panel from concat error message (#25676) --- pandas/core/reshape/concat.py | 8 ++--- pandas/tests/reshape/test_concat.py | 45 +++++------------------------ 2 files changed, 11 insertions(+), 42 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index a6c945ac2e464..6cc355fb62f23 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -282,10 +282,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: - if not isinstance(obj, NDFrame): - msg = ('cannot concatenate 
object of type "{0}";' - ' only pd.Series, pd.DataFrame, and pd.Panel' - ' (deprecated) objs are valid'.format(type(obj))) + if not isinstance(obj, (Series, DataFrame)): + msg = ("cannot concatenate object of type '{}';" + ' only Series and DataFrame objs are valid' + .format(type(obj))) raise TypeError(msg) # consolidate diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index a186d32ed8800..9dbc14c23f3f4 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -16,7 +16,7 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Panel, Series, + Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, concat, date_range, isna, read_csv) from pandas.tests.extension.decimal import to_decimal from pandas.util import testing as tm @@ -196,9 +196,8 @@ def test_concatlike_same_dtypes(self): tm.assert_series_equal(res, exp, check_index_type=True) # cannot append non-index - msg = (r'cannot concatenate object of type \"(.+?)\";' - ' only pd.Series, pd.DataFrame, and pd.Panel' - r' \(deprecated\) objs are valid') + msg = (r"cannot concatenate object of type '.+';" + " only Series and DataFrame objs are valid") with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -1534,33 +1533,6 @@ def test_dtype_coerceion(self): result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) - @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") - # Panel.rename warning we don't care about - @pytest.mark.filterwarnings("ignore:Using:FutureWarning") - def test_panel_concat_buglet(self, sort): - # #2257 - def make_panel(): - index = 5 - cols = 3 - - def df(): - return DataFrame(np.random.randn(index, cols), - index=["I%s" % i for i in range(index)], - columns=["C%s" % i for i in range(cols)]) - return Panel({"Item%s" % x: df() for x in ['A', 'B', 'C']}) - - panel1 = make_panel() - panel2 = 
make_panel() - - panel2 = panel2.rename(major_axis={x: "%s_1" % x - for x in panel2.major_axis}) - - panel3 = panel2.rename(major_axis=lambda x: '%s_1' % x) - panel3 = panel3.rename(minor_axis=lambda x: '%s_1' % x) - - # it works! - concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort) - def test_concat_series(self): ts = tm.makeTimeSeries() @@ -1781,9 +1753,8 @@ def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = mkdf(10, 2) - msg = ('cannot concatenate object of type "{}";' - ' only pd.Series, pd.DataFrame, and pd.Panel' - r' \(deprecated\) objs are valid') + msg = ("cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid") for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): concat([df1, obj]) @@ -2400,9 +2371,8 @@ def test_concat_different_extension_dtypes_upcasts(self): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) +@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) @pytest.mark.parametrize('dt', np.sctypes['float']) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") def test_concat_no_unnecessary_upcast(dt, pdt): # GH 13247 dims = pdt().ndim @@ -2413,9 +2383,8 @@ def test_concat_no_unnecessary_upcast(dt, pdt): assert x.values.dtype == dt -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) +@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) @pytest.mark.parametrize('dt', np.sctypes['int']) -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") def test_concat_will_upcast(dt, pdt): with catch_warnings(record=True): dims = pdt().ndim From 26d991fc4e568432c3264151265c0a3ea9a6f133 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 12 Mar 2019 12:45:59 +0000 Subject: [PATCH 04/22] CLN: Remove unused test code (#25670) --- pandas/tests/computation/test_eval.py | 62 +++++++-------------------- 1 file changed, 16 insertions(+), 
46 deletions(-) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 062d1876141f8..e52db86c7fd5e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -23,16 +23,13 @@ from pandas.core.computation.expressions import ( _NUMEXPR_INSTALLED, _USE_NUMEXPR) from pandas.core.computation.ops import ( - _arith_ops_syms, _binary_math_ops, _binary_ops_dict, _bool_ops_syms, + _arith_ops_syms, _binary_math_ops, _binary_ops_dict, _special_case_arith_ops_syms, _unary_math_ops) import pandas.util.testing as tm from pandas.util.testing import ( assert_frame_equal, assert_numpy_array_equal, assert_produces_warning, assert_series_equal, makeCustomDataframe as mkdf, randbool) -_series_frame_incompatible = _bool_ops_syms -_scalar_skip = 'in', 'not in' - @pytest.fixture(params=( pytest.param(engine, @@ -162,13 +159,21 @@ def teardown_method(self, method): del self.pandas_rhses, self.pandas_lhses, self.current_engines @pytest.mark.slow - def test_complex_cmp_ops(self): - cmp_ops = ('!=', '==', '<=', '>=', '<', '>') - cmp2_ops = ('>', '<') - for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, cmp_ops, - self.rhses, self.bin_ops, - cmp2_ops): - self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2) + @pytest.mark.parametrize('cmp1', ['!=', '==', '<=', '>=', '<', '>'], + ids=['ne', 'eq', 'le', 'ge', 'lt', 'gt']) + @pytest.mark.parametrize('cmp2', ['>', '<'], ids=['gt', 'lt']) + def test_complex_cmp_ops(self, cmp1, cmp2): + for lhs, rhs, binop in product( + self.lhses, self.rhses, self.bin_ops): + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + expected = _eval_single_bin( + lhs_new, binop, rhs_new, self.engine) + + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format( + cmp1=cmp1, binop=binop, cmp2=cmp2) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + self.check_equal(result, expected) def 
test_simple_cmp_ops(self): bool_lhses = (DataFrame(randbool(size=(10, 5))), @@ -225,41 +230,6 @@ def check_equal(self, result, expected): else: assert result == expected - def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): - skip_these = _scalar_skip - ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, - binop=binop, - cmp2=cmp2) - scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or - cmp2 in skip_these)) - if scalar_with_in_notin: - with pytest.raises(TypeError): - pd.eval(ex, engine=self.engine, parser=self.parser) - with pytest.raises(TypeError): - pd.eval(ex, engine=self.engine, parser=self.parser, - local_dict={'lhs': lhs, 'rhs': rhs}) - else: - lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) - rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) - if (isinstance(lhs_new, Series) and - isinstance(rhs_new, DataFrame) and - binop in _series_frame_incompatible): - pass - # TODO: the code below should be added back when left and right - # hand side bool ops are fixed. 
- # - # try: - # pytest.raises(Exception, pd.eval, ex, - # local_dict={'lhs': lhs, 'rhs': rhs}, - # engine=self.engine, parser=self.parser) - # except AssertionError: - # raise - else: - expected = _eval_single_bin( - lhs_new, binop, rhs_new, self.engine) - result = pd.eval(ex, engine=self.engine, parser=self.parser) - self.check_equal(result, expected) - def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): def check_operands(left, right, cmp_op): From 5c341dc136cae12bf23c1a4b8e34c9951828c0cf Mon Sep 17 00:00:00 2001 From: Tom Neep Date: Tue, 12 Mar 2019 18:03:15 +0100 Subject: [PATCH 05/22] BUG: Fix to_string output when using header (#16718) (#25602) Also affects to_latex midrule position Tests added for both to_string and to_latex Whatsnew added for v0.25.0 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/format.py | 9 ++++----- pandas/tests/io/formats/test_format.py | 8 ++++++++ pandas/tests/io/formats/test_to_latex.py | 16 ++++++++++++++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 284943cf49070..ab2f8d66c37da 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -215,6 +215,7 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) +- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used 
(:issue:`16718`) - - diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f68ef2cc39006..91e5edc8de9f4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -528,6 +528,10 @@ def _to_str_columns(self): else: str_columns = self._get_formatted_column_labels(frame) + if self.show_row_idx_names: + for x in str_columns: + x.append('') + stringified = [] for i, c in enumerate(frame): cheader = str_columns[i] @@ -770,11 +774,6 @@ def space_format(x, y): need_leadsp[x] else x] for i, (col, x) in enumerate(zip(columns, fmt_columns))] - - if self.show_row_idx_names: - for x in str_columns: - x.append('') - # self.str_columns = str_columns return str_columns diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b0cf5a2f17609..43bb382ea3f20 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2380,6 +2380,14 @@ def test_to_string_header(self): exp = '0 0\n ..\n9 9' assert res == exp + def test_to_string_multindex_header(self): + # GH 16718 + df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) + .set_index(['a', 'b'])) + res = df.to_string(header=['r1', 'r2']) + exp = ' r1 r2\na b \n0 1 2 3' + assert res == exp + def _three_digit_exp(): return '{x:.4g}'.format(x=1.7e8) == '1.7e+008' diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 1653e474aa7b0..4bec3bca1820b 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -735,3 +735,19 @@ def test_to_latex_float_format_no_fixed_width(self): \end{tabular} """ assert df.to_latex(float_format='%.0f') == expected + + def test_to_latex_multindex_header(self): + # GH 16718 + df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) + .set_index(['a', 'b'])) + observed = df.to_latex(header=['r1', 'r2']) + expected = r"""\begin{tabular}{llrr} +\toprule + & & r1 & r2 \\ +a & b & & \\ +\midrule 
+0 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + assert observed == expected From a8fad16899bedfc3dabb6f196217d0c1004b231a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Mar 2019 21:25:25 +0100 Subject: [PATCH 06/22] BUG: fix usage of na_sentinel with sort=True in factorize() (#25592) --- doc/source/whatsnew/v0.24.2.rst | 1 + pandas/core/algorithms.py | 20 +++++++++++++------- pandas/tests/test_algos.py | 15 +++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 09deb69d3b39e..c07959c758780 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -33,6 +33,7 @@ Fixed Regressions - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`). - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) .. 
_whatsnew_0242.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4a71951e2435e..5ed2e3efe26a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -619,13 +619,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - try: - order = uniques.argsort() - order2 = order.argsort() - labels = take_1d(order2, labels, fill_value=na_sentinel) - uniques = uniques.take(order) - except TypeError: - # Mixed types, where uniques.argsort fails. + if na_sentinel == -1: + # GH-25409 take_1d only works for na_sentinels of -1 + try: + order = uniques.argsort() + order2 = order.argsort() + labels = take_1d(order2, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + except TypeError: + # Mixed types, where uniques.argsort fails. + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) + else: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3f75c508d22f9..083307371b699 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,6 +326,21 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) + @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) + def test_factorize_na_sentinel(self, sort, na_sentinel): + data = np.array(['b', 'a', None, 'b'], dtype=object) + labels, uniques = algos.factorize(data, sort=sort, + na_sentinel=na_sentinel) + if sort: + expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_uniques = np.array(['a', 'b'], dtype=object) + else: + expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_uniques = np.array(['b', 'a'], 
dtype=object) + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(uniques, expected_uniques) + class TestUnique(object): From 12fd316de829b994d6e3d1fc14c59d8e8bf34500 Mon Sep 17 00:00:00 2001 From: Kendall Masse Date: Tue, 12 Mar 2019 16:45:23 -0400 Subject: [PATCH 07/22] BUG: Fix error in replace with strings that are large numbers (#25616) (#25644) --- doc/source/whatsnew/v0.24.2.rst | 2 ++ pandas/core/internals/blocks.py | 4 ++-- pandas/tests/series/test_replace.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index c07959c758780..5b5c9c78d10da 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -33,6 +33,7 @@ Fixed Regressions - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed regression in :func:`to_timedelta` losing precision when converting floating data to ``Timedelta`` data (:issue:`25077`). - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :meth:`DataFrame.replace` where large strings of numbers would be coerced into ``int64``, causing an ``OverflowError`` (:issue:`25616`) - Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`). - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) @@ -90,6 +91,7 @@ A total of 25 people contributed patches to this release. 
People with a "+" by t * Joris Van den Bossche * Josh * Justin Zheng +* Kendall Masse * Matthew Roeschke * Max Bolingbroke + * rbenes + diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ada663556899b..0375f782badcc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1079,7 +1079,7 @@ def coerce_to_target_dtype(self, other): try: return self.astype(dtype) - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): pass return self.astype(object) @@ -3210,7 +3210,7 @@ def _putmask_smart(v, m, n): nv = v.copy() nv[m] = nn_at return nv - except (ValueError, IndexError, TypeError): + except (ValueError, IndexError, TypeError, OverflowError): pass n = np.asarray(n) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 40b28047080da..2e7b746f6c9f2 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -280,3 +280,17 @@ def test_replace_mixed_types_with_string(self): result = s.replace([2, '4'], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + + def test_replace_with_no_overflowerror(self): + # GH 25616 + # casts to object without Exception from OverflowError + s = pd.Series([0, 1, 2, 3, 4]) + result = s.replace([3], ['100000000000000000000']) + expected = pd.Series([0, 1, 2, '100000000000000000000', 4]) + tm.assert_series_equal(result, expected) + + s = pd.Series([0, '100000000000000000000', + '100000000000000000001']) + result = s.replace(['100000000000000000000'], [1]) + expected = pd.Series([0, 1, '100000000000000000001']) + tm.assert_series_equal(result, expected) From 479f821ef9881c545a73878571024d6c0671f382 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 12 Mar 2019 22:06:58 +0100 Subject: [PATCH 08/22] DOC: update date of 0.24.2 release notes (#25699) --- doc/source/whatsnew/v0.24.2.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 5b5c9c78d10da..6ad299de45e2a 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -2,8 +2,8 @@ .. _whatsnew_0242: -Whats New in 0.24.2 (February XX, 2019) ---------------------------------------- +Whats New in 0.24.2 (March 12, 2019) +------------------------------------ .. warning:: From c57d162139c629a5e90bec6c1f336606755a0f7e Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 13 Mar 2019 04:52:34 -0700 Subject: [PATCH 09/22] Pinned pycodestyle (#25701) --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index c1669c9f49017..ff7c5d56052d2 100644 --- a/environment.yml +++ b/environment.yml @@ -19,6 +19,7 @@ dependencies: - hypothesis>=3.82 - isort - moto + - pycodestyle=2.4 - pytest>=4.0.2 - pytest-mock - sphinx diff --git a/requirements-dev.txt b/requirements-dev.txt index be84c6f29fdeb..02d8b0a70aab6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,6 +10,7 @@ gitpython hypothesis>=3.82 isort moto +pycodestyle==2.4 pytest>=4.0.2 pytest-mock sphinx From 69551aa6d815b198091502a6e0e3de08ca0133c0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 13 Mar 2019 12:49:50 +0000 Subject: [PATCH 10/22] TST/CLN: Remove more Panel tests (#25675) --- pandas/tests/api/test_api.py | 4 +- pandas/tests/computation/test_eval.py | 10 +-- pandas/tests/dtypes/test_inference.py | 9 +-- pandas/tests/generic/test_generic.py | 76 ++++--------------- .../generic/test_label_or_level_utils.py | 64 ---------------- pandas/tests/indexing/test_iloc.py | 1 - pandas/tests/reshape/merge/test_join.py | 1 - pandas/tests/test_base.py | 4 +- 8 files changed, 21 insertions(+), 148 deletions(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 599ab9a3c5f7c..51bf52c40ad3c 100644 --- a/pandas/tests/api/test_api.py +++ 
b/pandas/tests/api/test_api.py @@ -53,10 +53,10 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_classes = ['TimeGrouper'] + deprecated_classes = ['TimeGrouper', 'Panel'] # these should be deprecated in the future - deprecated_classes_in_future = ['Panel'] + deprecated_classes_in_future = [] # external modules exposed in pandas namespace modules = ['np', 'datetime'] diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index e52db86c7fd5e..49d263feab664 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar import pandas as pd -from pandas import DataFrame, Panel, Series, date_range +from pandas import DataFrame, Series, date_range from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -1112,14 +1112,6 @@ def test_bool_ops_with_constants(self): exp = eval(ex) assert res == exp - @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_panel_fails(self): - x = Panel(randn(3, 4, 5)) - y = Series(randn(10)) - with pytest.raises(NotImplementedError): - self.eval('x + y', - local_dict={'x': x, 'y': y}) - def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 187b37d4f788e..467f0a8eb0fc4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -11,7 +11,6 @@ from fractions import Fraction from numbers import Number import re -from warnings import catch_warnings, simplefilter import numpy as np import pytest @@ -30,8 +29,8 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DateOffset, DatetimeIndex, Index, Interval, Panel, - Period, Series, Timedelta, 
TimedeltaIndex, Timestamp, compat, isna) + Categorical, DataFrame, DateOffset, DatetimeIndex, Index, Interval, Period, + Series, Timedelta, TimedeltaIndex, Timestamp, compat, isna) from pandas.util import testing as tm @@ -1305,10 +1304,6 @@ def test_is_scalar_pandas_containers(self): assert not is_scalar(Series([1])) assert not is_scalar(DataFrame()) assert not is_scalar(DataFrame([[1]])) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - assert not is_scalar(Panel()) - assert not is_scalar(Panel([[[1]]])) assert not is_scalar(Index([])) assert not is_scalar(Index([1])) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 6f2707f764920..c40544d6e3f86 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -2,7 +2,6 @@ # pylint: disable-msg=E1101,W0612 from copy import copy, deepcopy -from warnings import catch_warnings, simplefilter import numpy as np import pytest @@ -12,7 +11,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -from pandas import DataFrame, MultiIndex, Panel, Series, date_range +from pandas import DataFrame, MultiIndex, Series, date_range import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -238,12 +237,6 @@ def test_metadata_propagation(self): o2 = self._construct(shape=3) o2.name = 'bar' - # TODO - # Once panel can do non-trivial combine operations - # (currently there is an a raise in the Panel arith_ops to prevent - # this, though it actually does work) - # can remove all of these try: except: blocks on the actual operations - # ---------- # preserving # ---------- @@ -255,63 +248,37 @@ def test_metadata_propagation(self): # ops with like for op in ['__add__', '__sub__', '__truediv__', '__mul__']: - try: - result = getattr(o, op)(o) - self.check_metadata(o, result) - except (ValueError, AttributeError): - pass + result = getattr(o, op)(o) + 
self.check_metadata(o, result) # simple boolean for op in ['__eq__', '__le__', '__ge__']: v1 = getattr(o, op)(o) self.check_metadata(o, v1) - - try: - self.check_metadata(o, v1 & v1) - except (ValueError): - pass - - try: - self.check_metadata(o, v1 | v1) - except (ValueError): - pass + self.check_metadata(o, v1 & v1) + self.check_metadata(o, v1 | v1) # combine_first - try: - result = o.combine_first(o2) - self.check_metadata(o, result) - except (AttributeError): - pass + result = o.combine_first(o2) + self.check_metadata(o, result) # --------------------------- # non-preserving (by default) # --------------------------- # add non-like - try: - result = o + o2 - self.check_metadata(result) - except (ValueError, AttributeError): - pass + result = o + o2 + self.check_metadata(result) # simple boolean for op in ['__eq__', '__le__', '__ge__']: # this is a name matching op v1 = getattr(o, op)(o) - v2 = getattr(o, op)(o2) self.check_metadata(v2) - - try: - self.check_metadata(v1 & v2) - except (ValueError): - pass - - try: - self.check_metadata(v1 | v2) - except (ValueError): - pass + self.check_metadata(v1 & v2) + self.check_metadata(v1 | v2) def test_head_tail(self): # GH5370 @@ -325,12 +292,7 @@ def test_head_tail(self): axis = o._get_axis_name(0) setattr(o, axis, index(len(getattr(o, axis)))) - # Panel + dims - try: - o.head() - except (NotImplementedError): - pytest.skip('not implemented on {0}'.format( - o.__class__.__name__)) + o.head() self._compare(o.head(), o.iloc[:5]) self._compare(o.tail(), o.iloc[-5:]) @@ -639,19 +601,12 @@ def test_sample(sel): sample1 = df.sample(n=1, weights='easyweights') assert_frame_equal(sample1, df.iloc[5:6]) - # Ensure proper error if string given as weight for Series, panel, or + # Ensure proper error if string given as weight for Series or # DataFrame with axis = 1. 
s = Series(range(10)) with pytest.raises(ValueError): s.sample(n=3, weights='weight_column') - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4], - minor_axis=[3, 4, 5]) - with pytest.raises(ValueError): - panel.sample(n=1, weights='weight_column') - with pytest.raises(ValueError): df.sample(n=1, weights='weight_column', axis=1) @@ -754,12 +709,9 @@ def test_squeeze(self): # don't fail with 0 length dimensions GH11229 & GH8999 empty_series = Series([], name='five') empty_frame = DataFrame([empty_series]) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - empty_panel = Panel({'six': empty_frame}) [tm.assert_series_equal(empty_series, higher_dim.squeeze()) - for higher_dim in [empty_series, empty_frame, empty_panel]] + for higher_dim in [empty_series, empty_frame]] # axis argument df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 91c58e01f0c45..1341837c46669 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -3,7 +3,6 @@ from pandas.core.dtypes.missing import array_equivalent import pandas as pd -import pandas.util.testing as tm # Fixtures @@ -46,13 +45,6 @@ def df_duplabels(df): return df -@pytest.fixture -def panel(): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - return pd.Panel() - - # Test is label/level reference # ============================= def get_labels_levels(df_levels): @@ -134,32 +126,6 @@ def test_is_level_reference_series_axis1_error(df): s._is_level_reference('L1', axis=1) -# Panel -# ----- -def test_is_level_reference_panel_error(panel): - msg = ("_is_level_reference is not implemented for {type}" - .format(type=type(panel))) - - with pytest.raises(NotImplementedError, match=msg): - panel._is_level_reference('L1', axis=0) - - -def 
test_is_label_reference_panel_error(panel): - msg = ("_is_label_reference is not implemented for {type}" - .format(type=type(panel))) - - with pytest.raises(NotImplementedError, match=msg): - panel._is_label_reference('L1', axis=0) - - -def test_is_label_or_level_reference_panel_error(panel): - msg = ("_is_label_or_level_reference is not implemented for {type}" - .format(type=type(panel))) - - with pytest.raises(NotImplementedError, match=msg): - panel._is_label_or_level_reference('L1', axis=0) - - # Test _check_label_or_level_ambiguity_df # ======================================= @@ -215,16 +181,6 @@ def test_check_label_or_level_ambiguity_series_axis1_error(df): s._check_label_or_level_ambiguity('L1', axis=1) -# Panel -# ----- -def test_check_label_or_level_ambiguity_panel_error(panel): - msg = ("_check_label_or_level_ambiguity is not implemented for {type}" - .format(type=type(panel))) - - with pytest.raises(NotImplementedError, match=msg): - panel._check_label_or_level_ambiguity("L1", axis=0) - - # Test _get_label_or_level_values # =============================== def assert_label_values(frame, labels, axis): @@ -322,16 +278,6 @@ def test_get_label_or_level_values_series_axis1_error(df): s._get_label_or_level_values('L1', axis=1) -# Panel -# ----- -def test_get_label_or_level_values_panel_error(panel): - msg = ("_get_label_or_level_values is not implemented for {type}" - .format(type=type(panel))) - - with pytest.raises(NotImplementedError, match=msg): - panel._get_label_or_level_values('L1', axis=0) - - # Test _drop_labels_or_levels # =========================== def assert_labels_dropped(frame, labels, axis): @@ -394,13 +340,3 @@ def test_drop_labels_or_levels_series(df): with pytest.raises(ValueError, match="not valid labels or levels"): s._drop_labels_or_levels('L4', axis=0) - - -# Panel -# ----- -def test_drop_labels_or_levels_panel_error(panel): - msg = ("_drop_labels_or_levels is not implemented for {type}" - .format(type=type(panel))) - - with 
pytest.raises(NotImplementedError, match=msg): - panel._drop_labels_or_levels('L1', axis=0) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 69ec6454e952a..7147757953b01 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -217,7 +217,6 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): def test_iloc_getitem_dups(self): - # no dups in panel (bug?) self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, objs=['series', 'frame'], typs=['ints', 'uints']) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 62c9047b17f3d..63ee899944e92 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -17,7 +17,6 @@ a_ = np.array -@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") class TestJoin(object): def setup_method(self, method): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index ac365eb87d1bc..d6c6fdd312d3e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -21,7 +21,7 @@ import pandas as pd from pandas import ( CategoricalIndex, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex, - Panel, PeriodIndex, Series, Timedelta, TimedeltaIndex, Timestamp) + PeriodIndex, Series, Timedelta, TimedeltaIndex, Timestamp) from pandas.core.accessor import PandasDelegate from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray from pandas.core.base import NoNewAttributesMixin, PandasObject @@ -239,7 +239,7 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): with pytest.raises(err): getattr(o, op) - @pytest.mark.parametrize('klass', [Series, DataFrame, Panel]) + @pytest.mark.parametrize('klass', [Series, DataFrame]) def test_binary_ops_docs(self, klass): op_map = {'add': '+', 'sub': '-', From 486472a97a2d27172640d8055bf815384d9310ce Mon Sep 17 00:00:00 2001 
From: Simon Hawkins Date: Wed, 13 Mar 2019 12:59:05 +0000 Subject: [PATCH 11/22] CLN: remove pandas.core.categorical (#25655) --- pandas/core/categorical.py | 9 --------- pandas/tests/api/test_api.py | 17 ----------------- 2 files changed, 26 deletions(-) delete mode 100644 pandas/core/categorical.py diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py deleted file mode 100644 index 43c35c4000bb6..0000000000000 --- a/pandas/core/categorical.py +++ /dev/null @@ -1,9 +0,0 @@ -import warnings - -from pandas.core.dtypes.dtypes import CategoricalDtype # noqa - -from pandas.core.arrays import Categorical # noqa - -# TODO: Remove after 0.23.x -warnings.warn("'pandas.core' is private. Use 'pandas.Categorical'", - FutureWarning, stacklevel=2) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 51bf52c40ad3c..8a0a27a71784c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -import sys - import pandas as pd from pandas import api from pandas.util import testing as tm @@ -148,18 +146,3 @@ def test_deprecation_cdaterange(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): cdate_range('2017-01-01', '2017-12-31') - - -class TestCategoricalMove(object): - - def test_categorical_move(self): - # May have been cached by another import, e.g. pickle tests. 
- sys.modules.pop("pandas.core.categorical", None) - - with tm.assert_produces_warning(FutureWarning): - from pandas.core.categorical import Categorical # noqa - - sys.modules.pop("pandas.core.categorical", None) - - with tm.assert_produces_warning(FutureWarning): - from pandas.core.categorical import CategoricalDtype # noqa From 86879ac9af524adc684bec77c621d27b19a36ca0 Mon Sep 17 00:00:00 2001 From: Alexander Ponomaroff <33966871+alexander-ponomaroff@users.noreply.github.com> Date: Wed, 13 Mar 2019 09:11:28 -0400 Subject: [PATCH 12/22] Fix concat not respecting order of OrderedDict (#25224) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/generic.py | 2 +- pandas/core/reshape/concat.py | 2 +- pandas/tests/groupby/test_groupby.py | 6 +++--- pandas/tests/reshape/test_concat.py | 13 +++++++++++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ab2f8d66c37da..1840c47b4054f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -242,6 +242,7 @@ Reshaping - Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`). 
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) +- Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) Sparse diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 683c21f7bd47a..c5f9e52e07ecf 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -822,7 +822,7 @@ def _aggregate_multiple_funcs(self, arg, _level): columns.append(com.get_callable_name(f)) arg = lzip(columns, arg) - results = {} + results = collections.OrderedDict() for name, func in arg: obj = self if name in results: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 6cc355fb62f23..4ad05f2b52ec5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -253,7 +253,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if isinstance(objs, dict): if keys is None: - keys = sorted(objs) + keys = com.dict_keys_to_ordered_list(objs) objs = [objs[k] for k in keys] else: objs = list(objs) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c062fb90ca43b..f80a7300334e4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1690,9 +1690,9 @@ def test_groupby_agg_ohlc_non_first(): [1, 1, 1, 1, 1], [1, 1, 1, 1, 1] ], columns=pd.MultiIndex.from_tuples(( - ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'), - ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'), - ('foo', 'sum', 'foo'))), index=pd.date_range( + ('foo', 'sum', 'foo'), ('foo', 'ohlc', 'open'), + ('foo', 'ohlc', 'high'), ('foo', 'ohlc', 'low'), + ('foo', 'ohlc', 'close'))), index=pd.date_range( '2018-01-01', periods=2, freq='D')) result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 
'ohlc']) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 9dbc14c23f3f4..ccd50998e39b1 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,4 +1,4 @@ -from collections import deque +from collections import OrderedDict, deque import datetime as dt from datetime import datetime from decimal import Decimal @@ -18,6 +18,7 @@ from pandas import ( Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, concat, date_range, isna, read_csv) +import pandas.core.common as com from pandas.tests.extension.decimal import to_decimal from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, makeCustomDataframe as mkdf @@ -1162,7 +1163,7 @@ def test_concat_dict(self): 'baz': DataFrame(np.random.randn(4, 3)), 'qux': DataFrame(np.random.randn(4, 3))} - sorted_keys = sorted(frames) + sorted_keys = com.dict_keys_to_ordered_list(frames) result = concat(frames) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) @@ -2370,6 +2371,14 @@ def test_concat_different_extension_dtypes_upcasts(self): ], dtype=object) tm.assert_series_equal(result, expected) + def test_concat_odered_dict(self): + # GH 21510 + expected = pd.concat([pd.Series(range(3)), pd.Series(range(4))], + keys=['First', 'Another']) + result = pd.concat(OrderedDict([('First', pd.Series(range(3))), + ('Another', pd.Series(range(4)))])) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) @pytest.mark.parametrize('dt', np.sctypes['float']) From 203481faf2af3b43243ed2d8d56447e2381e51d2 Mon Sep 17 00:00:00 2001 From: Tao He Date: Wed, 13 Mar 2019 23:28:31 +0800 Subject: [PATCH 13/22] BUG: Preserve name in DatetimeIndex.snap (#25585) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/indexes/datetimes.py | 4 +-- pandas/tests/series/indexing/test_datetime.py | 28 ++++++++++++------- 3 files changed, 21 insertions(+), 13 
deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1840c47b4054f..3f877cf862a4d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -86,7 +86,7 @@ Other API Changes - :class:`DatetimeTZDtype` will now standardize pytz timezones to a common timezone instance (:issue:`24713`) - ``Timestamp`` and ``Timedelta`` scalars now implement the :meth:`to_numpy` method as aliases to :meth:`Timestamp.to_datetime64` and :meth:`Timedelta.to_timedelta64`, respectively. (:issue:`24653`) - :meth:`Timestamp.strptime` will now rise a ``NotImplementedError`` (:issue:`25016`) -- +- Bug in :meth:`DatetimeIndex.snap` which didn't preserve the ``name`` of the input :class:`Index` (:issue:`25575`) .. _whatsnew_0250.deprecations: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index b8d052ce7be04..b65e59a3d58b7 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -787,8 +787,8 @@ def snap(self, freq='S'): snapped[i] = s # we know it conforms; skip check - return DatetimeIndex._simple_new(snapped, freq=freq) - # TODO: what about self.name? tz? if so, use shallow_copy? 
+ return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, + freq=freq) def join(self, other, how='left', level=None, return_indexers=False, sort=False): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 0efc9feb0dbd4..8e4c7d9b17efc 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -52,20 +52,28 @@ def test_fancy_setitem(): assert (s[48:54] == -3).all() -def test_dti_snap(): +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.parametrize('tz', [None, 'Asia/Shanghai', 'Europe/Berlin']) +@pytest.mark.parametrize('name', [None, 'my_dti']) +def test_dti_snap(name, tz): dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + '1/5/2002', '1/6/2002', '1/7/2002'], + name=name, tz=tz, freq='D') - res = dti.snap(freq='W-MON') - exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') - exp = exp.repeat([3, 4]) - assert (res == exp).all() + result = dti.snap(freq='W-MON') + expected = date_range('12/31/2001', '1/7/2002', + name=name, tz=tz, freq='w-mon') + expected = expected.repeat([3, 4]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz - res = dti.snap(freq='B') + result = dti.snap(freq='B') - exp = date_range('1/1/2002', '1/7/2002', freq='b') - exp = exp.repeat([1, 1, 1, 2, 2]) - assert (res == exp).all() + expected = date_range('1/1/2002', '1/7/2002', + name=name, tz=tz, freq='b') + expected = expected.repeat([1, 1, 1, 2, 2]) + tm.assert_index_equal(result, expected) + assert result.tz == expected.tz def test_dti_reset_index_round_trip(): From 58a4151c0a7909ea20191ce11b8eebe08b6f2af0 Mon Sep 17 00:00:00 2001 From: Chris Bertinato Date: Wed, 13 Mar 2019 11:42:07 -0400 Subject: [PATCH 14/22] DEPR: Deprecate box kwarg for to_timedelta and to_datetime (#24486) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/dtypes/cast.py 
| 4 +- pandas/core/indexes/datetimelike.py | 3 +- pandas/core/tools/datetimes.py | 8 ++ pandas/core/tools/timedeltas.py | 8 ++ pandas/io/parsers.py | 4 +- pandas/tests/indexes/datetimes/test_tools.py | 95 ++++++++++--------- pandas/tests/indexes/timedeltas/test_tools.py | 89 ++++++++++------- .../tests/scalar/timedelta/test_timedelta.py | 8 +- 9 files changed, 131 insertions(+), 89 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3f877cf862a4d..d05669e862695 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -94,6 +94,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated the `M (months)` and `Y (year)` `units` parameter of :func: `pandas.to_timedelta`, :func: `pandas.Timedelta` and :func: `pandas.TimedeltaIndex` (:issue:`16344`) +- The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64`/:meth:`Timedelta.to_timedelta64`. (:issue:`24416`) .. 
_whatsnew_0250.prior_deprecations: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f6561948df99a..1823a8e8654fd 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -794,10 +794,10 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, # Immediate return if coerce if datetime: from pandas import to_datetime - return to_datetime(values, errors='coerce', box=False) + return to_datetime(values, errors='coerce').to_numpy() elif timedelta: from pandas import to_timedelta - return to_timedelta(values, errors='coerce', box=False) + return to_timedelta(values, errors='coerce').to_numpy() elif numeric: from pandas import to_numeric return to_numeric(values, errors='coerce') diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index aa7332472fc07..830f234b85757 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -300,7 +300,8 @@ def asobject(self): return self.astype(object) def _convert_tolerance(self, tolerance, target): - tolerance = np.asarray(to_timedelta(tolerance, box=False)) + tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) + if target.size != tolerance.size and tolerance.size > 1: raise ValueError('list-like tolerance size must match ' 'target index size') diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0c76ac6cd75ac..64e06787db6fe 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -9,6 +9,7 @@ DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string) from pandas._libs.tslibs.strptime import array_strptime from pandas.compat import zip +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ( ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, @@ -398,6 +399,7 @@ def _adjust_to_origin(arg, origin, unit): return arg +@deprecate_kwarg(old_arg_name='box', new_arg_name=None) def 
to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', @@ -444,6 +446,12 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If True returns a DatetimeIndex or Index-like object - If False returns ndarray of values. + + .. deprecated:: 0.25.0 + Use :meth:`.to_numpy` or :meth:`Timestamp.to_datetime64` + instead to get an ndarray of values or numpy.datetime64, + respectively. + format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 7ebaf3056e79e..41dca3bfe7500 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -8,6 +8,7 @@ from pandas._libs.tslibs import NaT from pandas._libs.tslibs.timedeltas import Timedelta, parse_timedelta_unit +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -15,6 +16,7 @@ from pandas.core.arrays.timedeltas import sequence_to_td64ns +@deprecate_kwarg(old_arg_name='box', new_arg_name=None) def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ Convert argument to timedelta. @@ -40,6 +42,12 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): - If True returns a Timedelta/TimedeltaIndex of the results. - If False returns a numpy.timedelta64 or numpy.darray of values of dtype timedelta64[ns]. + + .. deprecated:: 0.25.0 + Use :meth:`.to_numpy` or :meth:`Timedelta.to_timedelta64` + instead to get an ndarray of values or numpy.timedelta64, + respectively. + errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaT. 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4163a571df800..5f33c387769ee 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3164,11 +3164,11 @@ def converter(*date_cols): return tools.to_datetime( ensure_object(strs), utc=None, - box=False, dayfirst=dayfirst, errors='ignore', infer_datetime_format=infer_datetime_format - ) + ).to_numpy() + except ValueError: return tools.to_datetime( parsing.try_parse_dates(strs, dayfirst=dayfirst)) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index dd914d8a79837..1a1e33bd508fc 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -184,9 +184,6 @@ def test_to_datetime_format_weeks(self, cache): for s, format, dt in data: assert to_datetime(s, format=format, cache=cache) == dt - @pytest.mark.parametrize("box,const", [ - [True, pd.Index], - [False, np.array]]) @pytest.mark.parametrize("fmt,dates,expected_dates", [ ['%Y-%m-%d %H:%M:%S %Z', ['2010-01-01 12:00:00 UTC'] * 2, @@ -218,15 +215,15 @@ def test_to_datetime_format_weeks(self, cache): tzinfo=pytz.FixedOffset(0)), # pytz coerces to UTC pd.Timestamp('2010-01-01 12:00:00', tzinfo=pytz.FixedOffset(0))]]]) - def test_to_datetime_parse_tzname_or_tzoffset(self, box, const, - fmt, dates, expected_dates): + def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, + expected_dates): # GH 13486 - result = pd.to_datetime(dates, format=fmt, box=box) - expected = const(expected_dates) + result = pd.to_datetime(dates, format=fmt) + expected = pd.Index(expected_dates) tm.assert_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(dates, format=fmt, box=box, utc=True) + pd.to_datetime(dates, format=fmt, utc=True) @pytest.mark.parametrize('offset', [ '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', '']) @@ -256,7 +253,7 @@ def test_to_datetime_dtarr(self, tz): result = to_datetime(arr) assert result is arr - result = 
to_datetime(arr, box=True) + result = to_datetime(arr) assert result is arr def test_to_datetime_pydatetime(self): @@ -363,9 +360,9 @@ def test_to_datetime_array_of_dt64s(self, cache): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing - tm.assert_numpy_array_equal( - pd.to_datetime(dts, box=False, cache=cache), - np.array([Timestamp(x).asm8 for x in dts]) + tm.assert_index_equal( + pd.to_datetime(dts, cache=cache), + pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]) ) # A list of datetimes where the last one is out of bounds @@ -375,28 +372,26 @@ def test_to_datetime_array_of_dt64s(self, cache): with pytest.raises(OutOfBoundsDatetime, match=msg): pd.to_datetime(dts_with_oob, errors='raise') - tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='coerce', + tm.assert_index_equal( + pd.to_datetime(dts_with_oob, errors='coerce', cache=cache), - np.array( + pd.DatetimeIndex( [ Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8, - tslib.iNaT, - ], - dtype='M8' + pd.NaT + ] ) ) # With errors='ignore', out of bounds datetime64s # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date - tm.assert_numpy_array_equal( - pd.to_datetime(dts_with_oob, box=False, errors='ignore', + tm.assert_index_equal( + pd.to_datetime(dts_with_oob, errors='ignore', cache=cache), - np.array( - [dt.item() for dt in dts_with_oob], - dtype='O' + pd.Index( + [dt.item() for dt in dts_with_oob] ) ) @@ -622,20 +617,16 @@ def test_datetime_invalid_index(self, values, format, infer): @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) - @pytest.mark.parametrize("box", [True, False]) @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) - def test_to_datetime_cache(self, utc, format, box, constructor): + def test_to_datetime_cache(self, utc, format, 
constructor): date = '20130101 00:00:00' test_dates = [date] * 10**5 data = constructor(test_dates) - result = pd.to_datetime(data, utc=utc, format=format, box=box, - cache=True) - expected = pd.to_datetime(data, utc=utc, format=format, box=box, - cache=False) - if box: - tm.assert_index_equal(result, expected) - else: - tm.assert_numpy_array_equal(result, expected) + + result = pd.to_datetime(data, utc=utc, format=format, cache=True) + expected = pd.to_datetime(data, utc=utc, format=format, cache=False) + + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) @@ -684,7 +675,10 @@ def test_iso_8601_strings_with_same_offset(self): def test_iso_8601_strings_same_offset_no_box(self): # GH 22446 data = ['2018-01-04 09:01:00+09:00', '2018-01-04 09:02:00+09:00'] - result = pd.to_datetime(data, box=False) + + with tm.assert_produces_warning(FutureWarning): + result = pd.to_datetime(data, box=False) + expected = np.array([ datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)) @@ -753,6 +747,16 @@ def test_timestamp_utc_true(self, ts, expected): result = to_datetime(ts, utc=True) assert result == expected + def test_to_datetime_box_deprecated(self): + expected = np.datetime64('2018-09-09') + + # Deprecated - see GH24416 + with tm.assert_produces_warning(FutureWarning): + pd.to_datetime(expected, box=False) + + result = pd.to_datetime(expected).to_datetime64() + assert result == expected + class TestToDatetimeUnit(object): @pytest.mark.parametrize('cache', [True, False]) @@ -891,7 +895,7 @@ def test_unit_rounding(self, cache): def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = pd.Index([15e9] * 2, name='name') - result = pd.to_datetime(expected, errors='ignore', box=True, unit='s', + result = pd.to_datetime(expected, errors='ignore', unit='s', cache=cache) tm.assert_index_equal(result, expected) @@ 
-1052,7 +1056,10 @@ def test_dataframe_box_false(self): df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) - result = pd.to_datetime(df, box=False) + + with tm.assert_produces_warning(FutureWarning): + result = pd.to_datetime(df, box=False) + expected = np.array(['2015-02-04', '2016-03-05'], dtype='datetime64[ns]') tm.assert_numpy_array_equal(result, expected) @@ -1069,8 +1076,7 @@ def test_dataframe_utc_true(self): def test_to_datetime_errors_ignore_utc_true(self): # GH 23758 - result = pd.to_datetime([1], unit='s', box=True, utc=True, - errors='ignore') + result = pd.to_datetime([1], unit='s', utc=True, errors='ignore') expected = DatetimeIndex(['1970-01-01 00:00:01'], tz='UTC') tm.assert_index_equal(result, expected) @@ -1188,19 +1194,16 @@ def test_to_datetime_types(self, cache): # assert result == expected @pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize('box, klass', [ - [True, Index], - [False, np.array] - ]) - def test_to_datetime_unprocessable_input(self, cache, box, klass): + def test_to_datetime_unprocessable_input(self, cache): # GH 4928 # GH 21864 - result = to_datetime([1, '1'], errors='ignore', cache=cache, box=box) - expected = klass(np.array([1, '1'], dtype='O')) + result = to_datetime([1, '1'], errors='ignore', cache=cache) + + expected = Index(np.array([1, '1'], dtype='O')) tm.assert_equal(result, expected) msg = "invalid string coercion to datetime" with pytest.raises(TypeError, match=msg): - to_datetime([1, '1'], errors='raise', cache=cache, box=box) + to_datetime([1, '1'], errors='raise', cache=cache) def test_to_datetime_other_datetime64_units(self): # 5/25/2012 diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 819184d4b14f3..55664e6ca4323 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -19,15 +19,18 @@ def conv(v): d1 = np.timedelta64(1, 'D') - assert 
(to_timedelta('1 days 06:05:01.00003', box=False) == - conv(d1 + np.timedelta64(6 * 3600 + 5 * 60 + 1, 's') + - np.timedelta64(30, 'us'))) - assert (to_timedelta('15.5us', box=False) == - conv(np.timedelta64(15500, 'ns'))) + with tm.assert_produces_warning(FutureWarning): + assert (to_timedelta('1 days 06:05:01.00003', box=False) == + conv(d1 + np.timedelta64(6 * 3600 + 5 * 60 + 1, 's') + + np.timedelta64(30, 'us'))) - # empty string - result = to_timedelta('', box=False) - assert result.astype('int64') == iNaT + with tm.assert_produces_warning(FutureWarning): + assert (to_timedelta('15.5us', box=False) == + conv(np.timedelta64(15500, 'ns'))) + + # empty string + result = to_timedelta('', box=False) + assert result.astype('int64') == iNaT result = to_timedelta(['', '']) assert isna(result).all() @@ -37,10 +40,11 @@ def conv(v): expected = pd.Index(np.array([np.timedelta64(1, 's')])) tm.assert_index_equal(result, expected) - # ints - result = np.timedelta64(0, 'ns') - expected = to_timedelta(0, box=False) - assert result == expected + with tm.assert_produces_warning(FutureWarning): + # ints + result = np.timedelta64(0, 'ns') + expected = to_timedelta(0, box=False) + assert result == expected # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) @@ -53,16 +57,18 @@ def conv(v): expected = to_timedelta([0, 10], unit='s') tm.assert_index_equal(result, expected) - # single element conversion - v = timedelta(seconds=1) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - assert result == expected + with tm.assert_produces_warning(FutureWarning): + # single element conversion + v = timedelta(seconds=1) + result = to_timedelta(v, box=False) + expected = np.timedelta64(timedelta(seconds=1)) + assert result == expected - v = np.timedelta64(timedelta(seconds=1)) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - assert result == expected + with 
tm.assert_produces_warning(FutureWarning): + v = np.timedelta64(timedelta(seconds=1)) + result = to_timedelta(v, box=False) + expected = np.timedelta64(timedelta(seconds=1)) + assert result == expected # arrays of various dtypes arr = np.array([1] * 5, dtype='int64') @@ -90,22 +96,27 @@ def conv(v): expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5) tm.assert_index_equal(result, expected) - # Test with lists as input when box=false - expected = np.array(np.arange(3) * 1000000000, dtype='timedelta64[ns]') - result = to_timedelta(range(3), unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) + with tm.assert_produces_warning(FutureWarning): + # Test with lists as input when box=false + expected = np.array(np.arange(3) * 1000000000, + dtype='timedelta64[ns]') + result = to_timedelta(range(3), unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) - result = to_timedelta(np.arange(3), unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) + with tm.assert_produces_warning(FutureWarning): + result = to_timedelta(np.arange(3), unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) - result = to_timedelta([0, 1, 2], unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) + with tm.assert_produces_warning(FutureWarning): + result = to_timedelta([0, 1, 2], unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) - # Tests with fractional seconds as input: - expected = np.array( - [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') - result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False) - tm.assert_numpy_array_equal(expected, result) + with tm.assert_produces_warning(FutureWarning): + # Tests with fractional seconds as input: + expected = np.array( + [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') + result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False) + tm.assert_numpy_array_equal(expected, result) def test_to_timedelta_invalid(self): @@ 
-188,3 +199,13 @@ def test_to_timedelta_float(self): result = pd.to_timedelta(arr, unit='s') expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype='int64') tm.assert_numpy_array_equal(result.asi8, expected_asi8) + + def test_to_timedelta_box_deprecated(self): + result = np.timedelta64(0, 'ns') + + # Deprecated - see GH24416 + with tm.assert_produces_warning(FutureWarning): + to_timedelta(0, box=False) + + expected = to_timedelta(0).to_timedelta64() + assert result == expected diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index ee2c2e9e1959c..42ba9bbd87e52 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -318,12 +318,12 @@ def test_iso_conversion(self): assert to_timedelta('P0DT0H0M1S') == expected def test_nat_converters(self): - result = to_timedelta('nat', box=False) - assert result.dtype.kind == 'm' + result = to_timedelta('nat').to_numpy() + assert result.dtype.kind == 'M' assert result.astype('int64') == iNaT - result = to_timedelta('nan', box=False) - assert result.dtype.kind == 'm' + result = to_timedelta('nan').to_numpy() + assert result.dtype.kind == 'M' assert result.astype('int64') == iNaT @pytest.mark.filterwarnings("ignore:M and Y units are deprecated") From 69ae24bf5de3e7c45fe32f5f17728bacc0bceaef Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 13 Mar 2019 15:44:40 +0000 Subject: [PATCH 15/22] TST: resolve issues with test_constructor_dtype_datetime64 (#24868) --- pandas/tests/frame/test_dtypes.py | 8 +++- pandas/tests/series/test_constructors.py | 49 ++++++++++++++++++------ pandas/tests/series/test_dtypes.py | 4 +- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index b37bf02a6b8e7..ca54993712439 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -808,11 +808,15 @@ def 
test_astype_to_incorrect_datetimelike(self, unit): other = "m8[{}]".format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - with pytest.raises(TypeError): + msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" + r" \[timedelta64\[{}\]\]").format(unit) + with pytest.raises(TypeError, match=msg): df.astype(other) + msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" + r" \[datetime64\[{}\]\]").format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): df.astype(dtype) def test_timedeltas(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8525b877618c9..96e18c6a60cac 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -683,17 +683,44 @@ def test_constructor_dtype_datetime64(self): assert s.dtype == 'M8[ns]' # GH3414 related - # msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - # r" \[int32\]") - # with pytest.raises(TypeError, match=msg): - # Series(Series(dates).astype('int') / 1000000, dtype='M8[ms]') - pytest.raises(TypeError, lambda x: Series( - Series(dates).astype('int') / 1000000, dtype='M8[ms]')) - - msg = (r"The 'datetime64' dtype has no unit\. 
Please pass in" - r" 'datetime64\[ns\]' instead\.") - with pytest.raises(ValueError, match=msg): - Series(dates, dtype='datetime64') + expected = Series([ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + ], dtype='datetime64[ns]') + + result = Series( + Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]') + tm.assert_series_equal(result, expected) + + result = Series(dates, dtype='datetime64[ns]') + tm.assert_series_equal(result, expected) + + expected = Series([ + pd.NaT, + datetime(2013, 1, 2), + datetime(2013, 1, 3), + ], dtype='datetime64[ns]') + result = Series([np.nan] + dates[1:], dtype='datetime64[ns]') + tm.assert_series_equal(result, expected) + + dts = Series(dates, dtype='datetime64[ns]') + + # valid astype + dts.astype('int64') + + # invalid casting + msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" + r" \[int32\]") + with pytest.raises(TypeError, match=msg): + dts.astype('int32') + + # ints are ok + # we test with np.int64 to get similar results on + # windows / 32-bit platforms + result = Series(dts, dtype=np.int64) + expected = Series(dts.astype(np.int64)) + tm.assert_series_equal(result, expected) # invalid dates can be help as object result = Series([datetime(2, 1, 1)]) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index d8046c4944afc..735b8553b14d3 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -415,7 +415,9 @@ def test_astype_generic_timestamp_no_frequency(self, dtype): data = [1] s = Series(data) - msg = "dtype has no unit. Please pass in" + msg = ((r"The '{dtype}' dtype has no unit\. 
" + r"Please pass in '{dtype}\[ns\]' instead.") + .format(dtype=dtype.__name__)) with pytest.raises(ValueError, match=msg): s.astype(dtype) From dcf7fce19a662a86a2892396472e052ab9f69708 Mon Sep 17 00:00:00 2001 From: Antoine Viscardi <30598967+antoineviscardi@users.noreply.github.com> Date: Wed, 13 Mar 2019 13:10:36 -0400 Subject: [PATCH 16/22] Json normalize nan support (#25619) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/io/json/normalize.py | 3 +- pandas/tests/io/json/test_normalize.py | 114 +++++++++++++------------ 3 files changed, 62 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d05669e862695..e045f0dc0ee39 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -215,10 +215,10 @@ I/O - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) - Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`) +- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`) - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`) - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`) - -- Plotting diff --git 
a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 279630ccd107c..7a8188dd07b6b 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0): raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) - result[k] = np.array(v).repeat(lengths) + # forcing dtype to object to avoid the metadata being casted to string + result[k] = np.array(v, dtype=object).repeat(lengths) return result diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3bf699cc8a1f0..5362274274d72 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -66,6 +66,25 @@ def author_missing_data(): }] +@pytest.fixture +def missing_metadata(): + return [ + {'name': 'Alice', + 'addresses': [{'number': 9562, + 'street': 'Morris St.', + 'city': 'Massillon', + 'state': 'OH', + 'zip': 44646}] + }, + {'addresses': [{'number': 8449, + 'street': 'Spring St.', + 'city': 'Elizabethton', + 'state': 'TN', + 'zip': 37643}] + } + ] + + class TestJSONNormalize(object): def test_simple_records(self): @@ -318,66 +337,51 @@ def test_nested_flattens(self): assert result == expected - def test_json_normalize_errors(self): - # GH14583: If meta keys are not always present - # a new option to set errors='ignore' has been implemented - i = { - "Trades": [{ - "general": { - "tradeid": 100, - "trade_version": 1, - "stocks": [{ - - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - }, { - "general": { - "tradeid": 100, - "stocks": [{ - "symbol": "AAPL", - "name": "Apple", - "price": "0" - }, { - "symbol": "GOOG", - "name": "Google", - "price": "0" - } - ] - } - } - ] - } - j = json_normalize(data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], - 
errors='ignore') - expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, - 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, - 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, - 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, - 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} - - assert j.fillna('').to_dict() == expected - - msg = ("Try running with errors='ignore' as key 'trade_version'" + def test_json_normalize_errors(self, missing_metadata): + # GH14583: + # If meta keys are not always present a new option to set + # errors='ignore' has been implemented + + msg = ("Try running with errors='ignore' as key 'name'" " is not always present") with pytest.raises(KeyError, match=msg): json_normalize( - data=i['Trades'], - record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], - ['general', 'trade_version']], + data=missing_metadata, + record_path='addresses', + meta='name', errors='raise') + def test_missing_meta(self, missing_metadata): + # GH25468 + # If metadata is nullable with errors set to ignore, the null values + # should be numpy.nan values + result = json_normalize( + data=missing_metadata, + record_path='addresses', + meta='name', + errors='ignore') + ex_data = [ + {'city': 'Massillon', + 'number': 9562, + 'state': 'OH', + 'street': 'Morris St.', + 'zip': 44646, + 'name': 'Alice'}, + {'city': 'Elizabethton', + 'number': 8449, + 'state': 'TN', + 'street': 'Spring St.', + 'zip': 37643, + 'name': np.nan} + ] + ex_data = [ + ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'], + ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan] + ] + columns = ['city', 'number', 'state', 'street', 'zip', 'name'] + expected = DataFrame(ex_data, columns=columns) + tm.assert_frame_equal(result, expected) + def test_donot_drop_nonevalues(self): # GH21356 data = [ From 10173821965fa65654b172891b2102a8426132ca Mon Sep 17 00:00:00 2001 From: gwrome Date: Wed, 13 Mar 2019 15:10:48 -0500 Subject: [PATCH 17/22] #25707 - Fixed 
flakiness in stata write test (#25714) --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 3354bca63be92..4051adc7ee4cb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -251,7 +251,7 @@ def test_read_fspath_all_read_table(self, datapath): ('to_latex', {}, 'os'), ('to_msgpack', {}, 'os'), ('to_pickle', {}, 'os'), - ('to_stata', {}, 'os'), + ('to_stata', {'time_stamp': pd.to_datetime('2019-01-01 00:00')}, 'os'), ]) def test_write_fspath_all(self, writer_name, writer_kwargs, module): p1 = tm.ensure_clean('string') From 79205ea8a6aac4c82a1572276ede7510f5a38e8e Mon Sep 17 00:00:00 2001 From: Bharat Raghunathan Date: Thu, 14 Mar 2019 07:23:41 +0530 Subject: [PATCH 18/22] Make Rolling.apply documentation clearer (#25712) --- pandas/core/window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index 9e29fdb94c1e0..b073a7f379db6 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -953,7 +953,7 @@ def count(self): ---------- func : function Must produce a single value from an ndarray input if ``raw=True`` - or a Series if ``raw=False``. + or a single value from a Series if ``raw=False``. raw : bool, default None * ``False`` : passes each row or column as a Series to the function. 
From 873e22ef4ac1de06a3a033567daa3cc8be39ac26 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Thu, 14 Mar 2019 06:38:52 -0600 Subject: [PATCH 19/22] ENH: Add public start, stop, and step attributes to RangeIndex (#25720) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/indexes/range.py | 27 ++++++++++- pandas/tests/indexes/test_range.py | 77 ++++++++++++++---------------- 3 files changed, 63 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e045f0dc0ee39..72c40b04a1195 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -26,6 +26,7 @@ Other Enhancements - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`) - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) +- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) .. 
_whatsnew_0250.api_breaking: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 5aafe9734b6a0..886a48e2acfa9 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -48,7 +48,9 @@ class RangeIndex(Int64Index): Attributes ---------- - None + start + stop + step Methods ------- @@ -209,6 +211,29 @@ def _format_data(self, name=None): return None # -------------------------------------------------------------------- + @property + def start(self): + """ + The value of the `start` parameter (or ``0`` if this was not supplied) + """ + # GH 25710 + return self._start + + @property + def stop(self): + """ + The value of the `stop` parameter + """ + # GH 25710 + return self._stop + + @property + def step(self): + """ + The value of the `step` parameter (or ``1`` if this was not supplied) + """ + # GH 25710 + return self._step @cache_readonly def nbytes(self): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 96cf83d477376..583e6bd81bb99 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -35,47 +35,25 @@ def test_too_many_names(self): with pytest.raises(ValueError, match="^Length"): self.index.names = ["roger", "harold"] - def test_constructor(self): - index = RangeIndex(5) - expected = np.arange(5, dtype=np.int64) - assert isinstance(index, RangeIndex) - assert index._start == 0 - assert index._stop == 5 - assert index._step == 1 - assert index.name is None - tm.assert_index_equal(Index(expected), index) - - index = RangeIndex(1, 5) - expected = np.arange(1, 5, dtype=np.int64) - assert isinstance(index, RangeIndex) - assert index._start == 1 - tm.assert_index_equal(Index(expected), index) - - index = RangeIndex(1, 5, 2) - expected = np.arange(1, 5, 2, dtype=np.int64) - assert isinstance(index, RangeIndex) - assert index._step == 2 - tm.assert_index_equal(Index(expected), index) - - for index in [RangeIndex(0), RangeIndex(start=0), 
RangeIndex(stop=0), - RangeIndex(0, 0)]: - expected = np.empty(0, dtype=np.int64) - assert isinstance(index, RangeIndex) - assert index._start == 0 - assert index._stop == 0 - assert index._step == 1 - tm.assert_index_equal(Index(expected), index) - - for index in [RangeIndex(0, name='Foo'), - RangeIndex(start=0, name='Foo'), - RangeIndex(stop=0, name='Foo'), - RangeIndex(0, 0, name='Foo')]: - assert isinstance(index, RangeIndex) - assert index.name == 'Foo' - - # we don't allow on a bare Index - with pytest.raises(TypeError): - Index(0, 1000) + @pytest.mark.parametrize('name', [None, 'foo']) + @pytest.mark.parametrize('args, kwargs, start, stop, step', [ + ((5,), dict(), 0, 5, 1), + ((1, 5), dict(), 1, 5, 1), + ((1, 5, 2), dict(), 1, 5, 2), + ((0,), dict(), 0, 0, 1), + ((0, 0), dict(), 0, 0, 1), + (tuple(), dict(start=0), 0, 0, 1), + (tuple(), dict(stop=0), 0, 0, 1)]) + def test_constructor(self, args, kwargs, start, stop, step, name): + result = RangeIndex(*args, name=name, **kwargs) + expected = Index(np.arange(start, stop, step, dtype=np.int64), + name=name) + assert isinstance(result, RangeIndex) + assert result._start == start + assert result._stop == stop + assert result._step == step + assert result.name is name + tm.assert_index_equal(result, expected) def test_constructor_invalid_args(self): msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers" @@ -92,6 +70,12 @@ def test_constructor_invalid_args(self): with pytest.raises(TypeError): RangeIndex(i) + # we don't allow on a bare Index + msg = (r'Index\(\.\.\.\) must be called with a collection of some ' + r'kind, 0 was passed') + with pytest.raises(TypeError, match=msg): + Index(0, 1000) + def test_constructor_same(self): # pass thru w and w/o copy @@ -172,6 +156,17 @@ def test_constructor_corner(self): with pytest.raises(TypeError): RangeIndex(1, 5, dtype='float64') + @pytest.mark.parametrize('index, start, stop, step', [ + (RangeIndex(5), 0, 5, 1), + (RangeIndex(0, 5), 0, 5, 1), + (RangeIndex(5, 
step=2), 0, 5, 2), + (RangeIndex(1, 5, 2), 1, 5, 2)]) + def test_start_stop_step_attrs(self, index, start, stop, step): + # GH 25710 + assert index.start == start + assert index.stop == stop + assert index.step == step + def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() From a5d251de3af3cf07dfec39baa343633a9989c1d5 Mon Sep 17 00:00:00 2001 From: Katherine Surta Date: Thu, 14 Mar 2019 16:02:02 +0300 Subject: [PATCH 20/22] DOC: fix some grammar and inconsistency issues in the User Guide (#25728) --- doc/source/user_guide/text.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 6f21a7d9beb36..f7fdfcf8bf882 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -46,8 +46,8 @@ Since ``df.columns`` is an Index object, we can use the ``.str`` accessor df.columns.str.lower() These string methods can then be used to clean up the columns as needed. -Here we are removing leading and trailing white spaces, lower casing all names, -and replacing any remaining white spaces with underscores: +Here we are removing leading and trailing whitespaces, lower casing all names, +and replacing any remaining whitespaces with underscores: .. ipython:: python @@ -65,7 +65,7 @@ and replacing any remaining white spaces with underscores: ``Series``. Please note that a ``Series`` of type ``category`` with string ``.categories`` has - some limitations in comparison of ``Series`` of type string (e.g. you can't add strings to + some limitations in comparison to ``Series`` of type string (e.g. you can't add strings to each other: ``s + " " + s`` won't work if ``s`` is a ``Series`` of type ``category``). Also, ``.str`` methods which operate on elements of type ``list`` are not available on such a ``Series``. 
From 64f5961798438f09f2c6b6c474065909f5ebc336 Mon Sep 17 00:00:00 2001 From: Fabian Rost Date: Thu, 14 Mar 2019 16:26:14 +0100 Subject: [PATCH 21/22] Update ValueError message in corr (#25729) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- pandas/tests/frame/test_analytics.py | 4 ++-- pandas/tests/series/test_analytics.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 72c40b04a1195..d186fdfe0f322 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -124,7 +124,7 @@ Bug Fixes ~~~~~~~~~ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) - Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- +- Bug in error messages in :meth:`DataFrame.corr` and :meth:`Series.corr`. Added the possibility of using a callable. 
(:issue:`25729`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3996728a1cc90..7317682d95256 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7088,8 +7088,8 @@ def corr(self, method='pearson', min_periods=1): correl[j, i] = c else: raise ValueError("method must be either 'pearson', " - "'spearman', or 'kendall', '{method}' " - "was supplied".format(method=method)) + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method)) return self._constructor(correl, index=idx, columns=cols) diff --git a/pandas/core/series.py b/pandas/core/series.py index 03fc26efa4516..04fe3b4407149 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2159,8 +2159,8 @@ def corr(self, other, method='pearson', min_periods=None): min_periods=min_periods) raise ValueError("method must be either 'pearson', " - "'spearman', or 'kendall', '{method}' " - "was supplied".format(method=method)) + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method)) def cov(self, other, min_periods=None): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 2969e8be2db03..88c8d89ec4b63 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -332,8 +332,8 @@ def test_corr_cov_independent_index_column(self): def test_corr_invalid_method(self): # GH 22298 df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = ("method must be either 'pearson', 'spearman', " - "or 'kendall'") + msg = ("method must be either 'pearson', " + "'spearman', 'kendall', or a callable, ") with pytest.raises(ValueError, match=msg): df.corr(method="____") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d7d9c526503cb..13195a0d81d9c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -387,8 +387,8 @@ def 
test_corr_invalid_method(self): # GH PR #22298 s1 = pd.Series(np.random.randn(10)) s2 = pd.Series(np.random.randn(10)) - msg = ("method must be either 'pearson', 'spearman', " - "or 'kendall'") + msg = ("method must be either 'pearson', " + "'spearman', 'kendall', or a callable, ") with pytest.raises(ValueError, match=msg): s1.corr(s2, method="____") From 998e1de593d6bbf11c985eae9243b83f4710c1c6 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Thu, 14 Mar 2019 16:52:00 +0100 Subject: [PATCH 22/22] Fixturize tests/frame/test_operators.py (#25641) --- pandas/tests/frame/test_operators.py | 130 +++++++++++++-------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index e9521fa1506af..9707ae80e6812 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -13,7 +13,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, compat import pandas.core.common as com -from pandas.tests.frame.common import TestData, _check_mixed_float +from pandas.tests.frame.common import _check_mixed_float import pandas.util.testing as tm from pandas.util.testing import ( assert_frame_equal, assert_numpy_array_equal, assert_series_equal) @@ -207,7 +207,7 @@ def test_logical_with_nas(self): assert_series_equal(result, expected) -class TestDataFrameOperators(TestData): +class TestDataFrameOperators(object): @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul, operator.truediv]) @@ -238,9 +238,9 @@ def test_operators_none_as_na(self, op): ('__ne__', True)]) # TODO: not sure what's correct here. 
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") - def test_logical_typeerror_with_non_valid(self, op, res): + def test_logical_typeerror_with_non_valid(self, op, res, float_frame): # we are comparing floats vs a string - result = getattr(self.frame, op)('foo') + result = getattr(float_frame, op)('foo') assert bool(result.all().all()) is res def test_binary_ops_align(self): @@ -318,16 +318,17 @@ def test_dti_tz_convert_to_utc(self): exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base) assert_frame_equal(df1 + df2, exp) - def test_combineFrame(self): - frame_copy = self.frame.reindex(self.frame.index[::2]) + def test_combineFrame(self, float_frame, mixed_float_frame, + mixed_int_frame): + frame_copy = float_frame.reindex(float_frame.index[::2]) del frame_copy['D'] frame_copy['C'][:5] = np.nan - added = self.frame + frame_copy + added = float_frame + frame_copy indexer = added['A'].dropna().index - exp = (self.frame['A'] * 2).copy() + exp = (float_frame['A'] * 2).copy() tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer]) @@ -340,95 +341,94 @@ def test_combineFrame(self): assert np.isnan(added['D']).all() - self_added = self.frame + self.frame - tm.assert_index_equal(self_added.index, self.frame.index) + self_added = float_frame + float_frame + tm.assert_index_equal(self_added.index, float_frame.index) - added_rev = frame_copy + self.frame + added_rev = frame_copy + float_frame assert np.isnan(added['D']).all() assert np.isnan(added_rev['D']).all() # corner cases # empty - plus_empty = self.frame + self.empty + plus_empty = float_frame + DataFrame() assert np.isnan(plus_empty.values).all() - empty_plus = self.empty + self.frame + empty_plus = DataFrame() + float_frame assert np.isnan(empty_plus.values).all() - empty_empty = self.empty + self.empty + empty_empty = DataFrame() + DataFrame() assert empty_empty.empty # out of order - reverse = self.frame.reindex(columns=self.frame.columns[::-1]) + reverse = 
float_frame.reindex(columns=float_frame.columns[::-1]) - assert_frame_equal(reverse + self.frame, self.frame * 2) + assert_frame_equal(reverse + float_frame, float_frame * 2) # mix vs float64, upcast - added = self.frame + self.mixed_float + added = float_frame + mixed_float_frame _check_mixed_float(added, dtype='float64') - added = self.mixed_float + self.frame + added = mixed_float_frame + float_frame _check_mixed_float(added, dtype='float64') # mix vs mix - added = self.mixed_float + self.mixed_float2 - _check_mixed_float(added, dtype=dict(C=None)) - added = self.mixed_float2 + self.mixed_float + added = mixed_float_frame + mixed_float_frame _check_mixed_float(added, dtype=dict(C=None)) # with int - added = self.frame + self.mixed_int + added = float_frame + mixed_int_frame _check_mixed_float(added, dtype='float64') - def test_combineSeries(self): + def test_combineSeries(self, float_frame, mixed_float_frame, + mixed_int_frame, datetime_frame): # Series - series = self.frame.xs(self.frame.index[0]) + series = float_frame.xs(float_frame.index[0]) - added = self.frame + series + added = float_frame + series for key, s in compat.iteritems(added): - assert_series_equal(s, self.frame[key] + series[key]) + assert_series_equal(s, float_frame[key] + series[key]) larger_series = series.to_dict() larger_series['E'] = 1 larger_series = Series(larger_series) - larger_added = self.frame + larger_series + larger_added = float_frame + larger_series - for key, s in compat.iteritems(self.frame): + for key, s in compat.iteritems(float_frame): assert_series_equal(larger_added[key], s + series[key]) assert 'E' in larger_added assert np.isnan(larger_added['E']).all() # no upcast needed - added = self.mixed_float + series + added = mixed_float_frame + series _check_mixed_float(added) # vs mix (upcast) as needed - added = self.mixed_float + series.astype('float32') + added = mixed_float_frame + series.astype('float32') _check_mixed_float(added, dtype=dict(C=None)) - added = 
self.mixed_float + series.astype('float16') + added = mixed_float_frame + series.astype('float16') _check_mixed_float(added, dtype=dict(C=None)) # these raise with numexpr.....as we are adding an int64 to an # uint64....weird vs int - # added = self.mixed_int + (100*series).astype('int64') + # added = mixed_int_frame + (100*series).astype('int64') # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = # 'int64', D = 'int64')) - # added = self.mixed_int + (100*series).astype('int32') + # added = mixed_int_frame + (100*series).astype('int32') # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = # 'int32', D = 'int64')) # TimeSeries - ts = self.tsframe['A'] + ts = datetime_frame['A'] # 10890 # we no longer allow auto timeseries broadcasting # and require explicit broadcasting - added = self.tsframe.add(ts, axis='index') + added = datetime_frame.add(ts, axis='index') - for key, col in compat.iteritems(self.tsframe): + for key, col in compat.iteritems(datetime_frame): result = col + ts assert_series_equal(added[key], result, check_names=False) assert added[key].name == key @@ -437,52 +437,52 @@ def test_combineSeries(self): else: assert result.name is None - smaller_frame = self.tsframe[:-5] + smaller_frame = datetime_frame[:-5] smaller_added = smaller_frame.add(ts, axis='index') - tm.assert_index_equal(smaller_added.index, self.tsframe.index) + tm.assert_index_equal(smaller_added.index, datetime_frame.index) smaller_ts = ts[:-5] - smaller_added2 = self.tsframe.add(smaller_ts, axis='index') + smaller_added2 = datetime_frame.add(smaller_ts, axis='index') assert_frame_equal(smaller_added, smaller_added2) # length 0, result is all-nan - result = self.tsframe.add(ts[:0], axis='index') - expected = DataFrame(np.nan, index=self.tsframe.index, - columns=self.tsframe.columns) + result = datetime_frame.add(ts[:0], axis='index') + expected = DataFrame(np.nan, index=datetime_frame.index, + columns=datetime_frame.columns) assert_frame_equal(result, 
expected) # Frame is all-nan - result = self.tsframe[:0].add(ts, axis='index') - expected = DataFrame(np.nan, index=self.tsframe.index, - columns=self.tsframe.columns) + result = datetime_frame[:0].add(ts, axis='index') + expected = DataFrame(np.nan, index=datetime_frame.index, + columns=datetime_frame.columns) assert_frame_equal(result, expected) # empty but with non-empty index - frame = self.tsframe[:1].reindex(columns=[]) + frame = datetime_frame[:1].reindex(columns=[]) result = frame.mul(ts, axis='index') assert len(result) == len(ts) - def test_combineFunc(self): - result = self.frame * 2 - tm.assert_numpy_array_equal(result.values, self.frame.values * 2) + def test_combineFunc(self, float_frame, mixed_float_frame): + result = float_frame * 2 + tm.assert_numpy_array_equal(result.values, float_frame.values * 2) # vs mix - result = self.mixed_float * 2 + result = mixed_float_frame * 2 for c, s in compat.iteritems(result): tm.assert_numpy_array_equal( - s.values, self.mixed_float[c].values * 2) + s.values, mixed_float_frame[c].values * 2) _check_mixed_float(result, dtype=dict(C=None)) - result = self.empty * 2 - assert result.index is self.empty.index + result = DataFrame() * 2 + assert result.index.equals(DataFrame().index) assert len(result.columns) == 0 - def test_comparisons(self): + def test_comparisons(self, simple_frame, float_frame): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() - row = self.simple.xs('a') + row = simple_frame.xs('a') ndim_5 = np.ones(df1.shape + (1, 1, 1)) def test_comp(func): @@ -493,17 +493,17 @@ def test_comp(func): with pytest.raises(ValueError, match='dim must be <= 2'): func(df1, ndim_5) - result2 = func(self.simple, row) + result2 = func(simple_frame, row) tm.assert_numpy_array_equal(result2.values, - func(self.simple.values, row.values)) + func(simple_frame.values, row.values)) - result3 = func(self.frame, 0) + result3 = func(float_frame, 0) tm.assert_numpy_array_equal(result3.values, - func(self.frame.values, 0)) + 
func(float_frame.values, 0)) msg = 'Can only compare identically-labeled DataFrame' with pytest.raises(ValueError, match=msg): - func(self.simple, self.simple[:2]) + func(simple_frame, simple_frame[:2]) test_comp(operator.eq) test_comp(operator.ne) @@ -599,9 +599,9 @@ def test_boolean_comparison(self): with pytest.raises(ValueError, match=msg1d): result = df == tup - def test_combine_generic(self): - df1 = self.frame - df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']] + def test_combine_generic(self, float_frame): + df1 = float_frame + df2 = float_frame.loc[float_frame.index[:-5], ['A', 'B', 'C']] combined = df1.combine(df2, np.add) combined2 = df2.combine(df1, np.add) @@ -611,8 +611,8 @@ def test_combine_generic(self): chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']] chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']] - exp = self.frame.loc[self.frame.index[:-5], - ['A', 'B', 'C']].reindex_like(chunk) * 2 + exp = float_frame.loc[float_frame.index[:-5], + ['A', 'B', 'C']].reindex_like(chunk) * 2 assert_frame_equal(chunk, exp) assert_frame_equal(chunk2, exp)