From 0a3706780feb77f241715ffcdebb14ad7d678d3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Apr 2017 12:59:17 +0200 Subject: [PATCH 01/22] DEPR: correct locations to access public tslib objects (#15897) --- pandas/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 1bc85899fb89f..83ad85e3e292b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -64,7 +64,11 @@ parser = _DeprecatedModule(deprmod='pandas.parser', deprmodto='pandas.io.libparsers') lib = _DeprecatedModule(deprmod='pandas.lib', deprmodto='pandas._libs.lib', moved={'infer_dtype': 'pandas.api.lib.infer_dtype'}) -tslib = _DeprecatedModule(deprmod='pandas.tslib', deprmodto='pandas._libs.tslib') +tslib = _DeprecatedModule(deprmod='pandas.tslib', deprmodto='pandas._libs.tslib', + moved={'Timestamp': 'pandas.Timestamp', + 'Timedelta': 'pandas.Timedelta', + 'NaT': 'pandas.NaT', + 'OutOfBoundsDatetime': 'pandas.errors.OutOfBoundsDatetime'}) # use the closest tagged version if possible from ._version import get_versions From dbc1654fb1604b99c1b4fe31a26b5548ea623565 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 5 Apr 2017 15:15:37 -0400 Subject: [PATCH 02/22] TST: better testing of Series.nlargest/nsmallest xref #15299 Author: Jeff Reback Closes #15902 from jreback/series_n and squashes the following commits: 657eac8 [Jeff Reback] TST: better testing of Series.nlargest/nsmallest --- pandas/core/algorithms.py | 56 ++++++-- pandas/tests/series/test_analytics.py | 180 +++++++++++++++----------- 2 files changed, 151 insertions(+), 85 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a62d290277443..99ef76e0f4812 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -12,6 +12,7 @@ from pandas.types.common import (is_unsigned_integer_dtype, is_signed_integer_dtype, is_integer_dtype, + is_complex_dtype, is_categorical_dtype, is_extension_type, is_datetimetz, @@ -40,6 +41,44 @@ from pandas._libs.tslib import iNaT +# --------------- # +# dtype access # +# --------------- # + +def _ensure_data_view(values): + """ + helper routine to ensure that our data is of the correct + input dtype for lower-level routines + + Parameters + ---------- + values : array-like + """ + + if needs_i8_conversion(values): + values = values.view(np.int64) + elif is_period_arraylike(values): + from pandas.tseries.period import PeriodIndex + values = PeriodIndex(values).asi8 + elif is_categorical_dtype(values): + values = values.values.codes + elif isinstance(values, (ABCSeries, ABCIndex)): + values = values.values + + if is_signed_integer_dtype(values): + values = _ensure_int64(values) + elif is_unsigned_integer_dtype(values): + values = _ensure_uint64(values) + elif is_complex_dtype(values): + values = _ensure_float64(values) + elif is_float_dtype(values): + values = _ensure_float64(values) + else: + values = _ensure_object(values) + + return values + + # --------------- # # top-level algos # # --------------- # @@ -867,9 +906,7 @@ def nsmallest(arr, n, keep='first'): narr = len(arr) n = min(n, narr) - sdtype = str(arr.dtype) - arr = arr.view(_dtype_map.get(sdtype, sdtype)) - + arr = _ensure_data_view(arr) kth_val = algos.kth_smallest(arr.copy(), n - 1) return _finalize_nsmallest(arr, kth_val, n, keep, narr) @@ -880,8 +917,7 @@ def nlargest(arr, n, keep='first'): Note: Fails silently with NaN. 
""" - sdtype = str(arr.dtype) - arr = arr.view(_dtype_map.get(sdtype, sdtype)) + arr = _ensure_data_view(arr) return nsmallest(-arr, n, keep=keep) @@ -910,9 +946,10 @@ def select_n_series(series, n, keep, method): nordered : Series """ dtype = series.dtype - if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64, - np.timedelta64)): - raise TypeError("Cannot use method %r with dtype %s" % (method, dtype)) + if not ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or + needs_i8_conversion(dtype)): + raise TypeError("Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, dtype=dtype)) if keep not in ('first', 'last'): raise ValueError('keep must be either "first", "last"') @@ -964,9 +1001,6 @@ def _finalize_nsmallest(arr, kth_val, n, keep, narr): return inds -_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} - - # ------- # # helpers # # ------- # diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b747a680c17dd..732142f1bce9a 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1381,80 +1381,6 @@ def test_is_monotonic(self): self.assertFalse(s.is_monotonic) self.assertTrue(s.is_monotonic_decreasing) - def test_nsmallest_nlargest(self): - # float, int, datetime64 (use i8), timedelts64 (same), - # object that are numbers, object that are strings - - base = [3, 2, 1, 2, 5] - - s_list = [ - Series(base, dtype='int8'), - Series(base, dtype='int16'), - Series(base, dtype='int32'), - Series(base, dtype='int64'), - Series(base, dtype='float32'), - Series(base, dtype='float64'), - Series(base, dtype='uint8'), - Series(base, dtype='uint16'), - Series(base, dtype='uint32'), - Series(base, dtype='uint64'), - Series(base).astype('timedelta64[ns]'), - Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005'])), - ] - - raising = [ - Series([3., 2, 1, 2, '5'], dtype='object'), - Series([3., 2, 1, 2, 5], dtype='object'), - # not supported on some archs - # Series([3., 2, 1, 2, 5], dtype='complex256'), - Series([3., 2, 1, 2, 5], dtype='complex128'), - ] - - for r in raising: - dt = r.dtype - msg = "Cannot use method 'n(larg|small)est' with dtype %s" % dt - args = 2, len(r), 0, -1 - methods = r.nlargest, r.nsmallest - for method, arg in product(methods, args): - with tm.assertRaisesRegexp(TypeError, msg): - method(arg) - - for s in s_list: - - assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) - - empty = s.iloc[0:0] - assert_series_equal(s.nsmallest(0), empty) - assert_series_equal(s.nsmallest(-1), empty) - assert_series_equal(s.nlargest(0), empty) - assert_series_equal(s.nlargest(-1), empty) - - assert_series_equal(s.nsmallest(len(s)), s.sort_values()) - assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) - assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - assert_series_equal(s.nlargest(len(s) + 1), - s.iloc[[4, 0, 1, 3, 2]]) - - s = Series([3., np.nan, 1, 2, 5]) - assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) - assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) - - msg = 'keep must be either "first", "last"' - with tm.assertRaisesRegexp(ValueError, msg): - s.nsmallest(keep='invalid') - with tm.assertRaisesRegexp(ValueError, msg): - s.nlargest(keep='invalid') - - # GH 13412 - s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) - result = s.nlargest(3) - expected = s.sort_values(ascending=False).head(3) - assert_series_equal(result, expected) - 
result = s.nsmallest(3) - expected = s.sort_values().head(3) - assert_series_equal(result, expected) - def test_sort_index_level(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) @@ -1729,3 +1655,109 @@ def test_value_counts_categorical_not_ordered(self): index=exp_idx, name='xxx') tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + +@pytest.fixture +def s_main_dtypes(): + df = pd.DataFrame( + {'datetime': pd.to_datetime(['2003', '2002', + '2001', '2002', + '2005']), + 'datetimetz': pd.to_datetime( + ['2003', '2002', + '2001', '2002', + '2005']).tz_localize('US/Eastern'), + 'timedelta': pd.to_timedelta(['3d', '2d', '1d', + '2d', '5d'])}) + + for dtype in ['int8', 'int16', 'int32', 'int64', + 'float32', 'float64', + 'uint8', 'uint16', 'uint32', 'uint64']: + df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) + + return df + + +class TestNLargestNSmallest(object): + + @pytest.mark.parametrize( + "r", [Series([3., 2, 1, 2, '5'], dtype='object'), + Series([3., 2, 1, 2, 5], dtype='object'), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3., 2, 1, 2, 5], dtype='complex128'), + Series(list('abcde'), dtype='category'), + Series(list('abcde'))]) + def test_error(self, r): + dt = r.dtype + msg = ("Cannot use method 'n(larg|small)est' with " + "dtype {dt}".format(dt=dt)) + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with tm.assertRaisesRegexp(TypeError, msg): + method(arg) + + @pytest.mark.parametrize( + "s", + [v for k, v in s_main_dtypes().iteritems()]) + def test_nsmallest_nlargest(self, s): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + + assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) + assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) + + empty = s.iloc[0:0] + assert_series_equal(s.nsmallest(0), empty) + assert_series_equal(s.nsmallest(-1), empty) + assert_series_equal(s.nlargest(0), empty) + assert_series_equal(s.nlargest(-1), empty) + + assert_series_equal(s.nsmallest(len(s)), s.sort_values()) + assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) + assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), + s.iloc[[4, 0, 1, 3, 2]]) + + def test_misc(self): + + s = Series([3., np.nan, 1, 2, 5]) + assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) + assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) + + msg = 'keep must be either "first", "last"' + with tm.assertRaisesRegexp(ValueError, msg): + s.nsmallest(keep='invalid') + with tm.assertRaisesRegexp(ValueError, msg): + s.nlargest(keep='invalid') + + # GH 15297 + s = Series([1] * 5, index=[1, 2, 3, 4, 5]) + expected_first = Series([1] * 3, index=[1, 2, 3]) + expected_last = Series([1] * 3, index=[5, 4, 3]) + + result = s.nsmallest(3) + assert_series_equal(result, expected_first) + + result = s.nsmallest(3, keep='last') + assert_series_equal(result, expected_last) + + result = s.nlargest(3) + assert_series_equal(result, expected_first) + + result = s.nlargest(3, keep='last') + assert_series_equal(result, expected_last) + + @pytest.mark.parametrize('n', range(1, 5)) + def test_n(self, n): + + # GH 13412 + s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) + result = s.nlargest(n) + expected = s.sort_values(ascending=False).head(n) + assert_series_equal(result, expected) + 
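# editor's note: an illustrative, standalone sketch (not part of the patch) of
# the invariant test_n pins down -- for any n, nlargest/nsmallest must agree
# with a full sort followed by head(n), even when index labels repeat:
import pandas as pd
s = pd.Series([1, 4, 3, 2], index=[0, 0, 1, 1])
assert s.nlargest(2).equals(s.sort_values(ascending=False).head(2))
assert s.nsmallest(2).equals(s.sort_values().head(2))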
+ result = s.nsmallest(n) + expected = s.sort_values().head(n) + assert_series_equal(result, expected) From e4e87ec55765d31e59e97d89c71ed5a3fa2f3d38 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 5 Apr 2017 15:16:40 -0400 Subject: [PATCH 03/22] ENH: Add file buffer validation to I/O ops 1) Allows for more uniform handling of invalid file buffers to our `read_*` functions. 2) Adds a ton of new documentation to `inference.py` Closes #15337. xref #15895. Author: gfyoung Closes #15894 from gfyoung/validate-file-like and squashes the following commits: 5a8f8da [gfyoung] DOC: Document all of inference.py 81103f7 [gfyoung] ENH: Add file buffer validation to I/O ops --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/common.py | 23 +- pandas/io/excel.py | 5 +- pandas/tests/api/test_types.py | 2 +- pandas/tests/io/parser/common.py | 17 ++ pandas/tests/types/test_inference.py | 16 +- pandas/types/api.py | 1 + pandas/types/inference.py | 328 +++++++++++++++++++++++++-- 8 files changed, 361 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2e1cc396287ce..cbb4d32cc5edb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1033,6 +1033,7 @@ I/O - Bug in ``pd.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`) +- Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`) - Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. 
(:issue:`15143`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 8bc7217db87f9..8ee6ded67f790 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,7 +10,7 @@ from pandas import compat from pandas.formats.printing import pprint_thing from pandas.core.common import AbstractMethodError -from pandas.types.common import is_number +from pandas.types.common import is_number, is_file_like # compat from pandas.errors import (ParserError, DtypeWarning, # noqa @@ -197,9 +197,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, encoding=encoding, compression=compression) - # It is a pathlib.Path/py.path.local or string + # Convert pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) - return _expand_user(filepath_or_buffer), None, compression + + if isinstance(filepath_or_buffer, (compat.string_types, + compat.binary_type, + mmap.mmap)): + return _expand_user(filepath_or_buffer), None, compression + + if not is_file_like(filepath_or_buffer): + msg = "Invalid file path or buffer object type: {_type}" + raise ValueError(msg.format(_type=type(filepath_or_buffer))) + + return filepath_or_buffer, None, compression def file_path_to_url(path): @@ -416,6 +426,9 @@ def __init__(self, f): def __getattr__(self, name): return getattr(self.mmap, name) + def __iter__(self): + return self + def __next__(self): newline = self.mmap.readline() @@ -433,6 +446,10 @@ def __next__(self): return newline +if not compat.PY3: + MMapWrapper.next = lambda self: self.__next__() + + class UTF8Recoder(BaseIterator): """ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 6d136869fc73f..737141f11d7d1 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -243,9 +243,8 @@ def __init__(self, io, **kwds): # to get_filepath_or_buffer() if _is_url(io): io = _urlopen(io) - # Deal with S3 urls, path objects, etc. 
Will convert them to - # buffer or path string - io, _, _ = get_filepath_or_buffer(io) + elif not isinstance(io, (ExcelFile, xlrd.Book)): + io, _, _ = get_filepath_or_buffer(io) if engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 686de4a196034..f3fd6332417a1 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -24,7 +24,7 @@ class TestTypes(Base, tm.TestCase): 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', 'is_unsigned_integer_dtype', 'is_period', 'is_period_dtype', 'is_re', 'is_re_compilable', - 'is_dict_like', 'is_iterator', + 'is_dict_like', 'is_iterator', 'is_file_like', 'is_list_like', 'is_hashable', 'is_named_tuple', 'is_sequence', 'pandas_dtype'] diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 7faf485b65d10..36d5f2dd5274b 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1678,3 +1678,20 @@ def test_file_handles(self): if PY3: self.assertFalse(m.closed) m.close() + + def test_invalid_file_buffer(self): + # see gh-15337 + + class InvalidBuffer(object): + pass + + msg = "Invalid file path or buffer object type" + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(InvalidBuffer()) + + if PY3: + from unittest import mock + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(mock.Mock()) diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index b41df0da45234..de3a2ca35a7f5 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -17,7 +17,7 @@ from pandas import (Series, Index, DataFrame, Timedelta, DatetimeIndex, TimedeltaIndex, Timestamp, Panel, Period, Categorical) -from pandas.compat import u, PY2, lrange +from pandas.compat import u, PY2, PY3, StringIO, lrange from pandas.types import inference from pandas.types.common import (is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -78,6 +78,20 @@ def test_is_dict_like(): assert not inference.is_dict_like(f) +def test_is_file_like(): + is_file = inference.is_file_like + + data = StringIO("data") + assert is_file(data) + + data = [1, 2, 3] + assert not is_file(data) + + if PY3: + from unittest import mock + assert not is_file(mock.Mock()) + + def test_is_named_tuple(): passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) diff --git a/pandas/types/api.py b/pandas/types/api.py index c809cb3614a8c..e78514ce77822 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -52,6 +52,7 @@ is_re_compilable, is_dict_like, is_iterator, + is_file_like, is_list_like, is_hashable, is_named_tuple, diff --git a/pandas/types/inference.py b/pandas/types/inference.py index d8e3b3ee7329b..91418677c6b19 100644 --- a/pandas/types/inference.py +++ b/pandas/types/inference.py @@ -4,7 +4,7 @@ import re import numpy as np from numbers import Number -from pandas.compat import (string_types, text_type, +from pandas.compat import (PY2, string_types, text_type, string_and_binary_types) from pandas._libs import lib @@ -22,28 +22,211 @@ def is_number(obj): + """ + Check if the object is a number. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_number : bool + Whether `obj` is a number or not. 
+ + Examples + -------- + >>> is_number(1) + True + >>> is_number("foo") + False + """ + return isinstance(obj, (Number, np.number)) def is_string_like(obj): + """ + Check if the object is a string. + + Parameters + ---------- + obj : The object to check. + + Examples + -------- + >>> is_string_like("foo") + True + >>> is_string_like(1) + False + + Returns + ------- + is_str_like : bool + Whether `obj` is a string or not. + """ + return isinstance(obj, (text_type, string_types)) -def _iterable_not_string(x): - return (isinstance(x, collections.Iterable) and - not isinstance(x, string_types)) +def _iterable_not_string(obj): + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> _iterable_not_string([1, 2, 3]) + True + >>> _iterable_not_string("foo") + False + >>> _iterable_not_string(1) + False + """ + + return (isinstance(obj, collections.Iterable) and + not isinstance(obj, string_types)) def is_iterator(obj): - # python 3 generators have __next__ instead of next - return hasattr(obj, 'next') or hasattr(obj, '__next__') + """ + Check if the object is an iterator. + + For example, lists are considered iterators + but not strings or datetime objects. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter : bool + Whether `obj` is an iterator. + + Examples + -------- + >>> is_iterator([1, 2, 3]) + True + >>> is_iterator(datetime(2017, 1, 1)) + False + >>> is_iterator("foo") + False + >>> is_iterator(1) + False + """ + + if not hasattr(obj, '__iter__'): + return False + + if PY2: + return hasattr(obj, 'next') + else: + # Python 3 generators have + # __next__ instead of next + return hasattr(obj, '__next__') + + +def is_file_like(obj): + """ + Check if the object is a file-like object. + + For objects to be considered file-like, they must + be an iterator AND have the following four methods: + + 1) read + 2) write + 3) seek + 4) tell + + Note: file-like objects must be iterable, but + iterable objects need not be file-like. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_file_like : bool + Whether `obj` has file-like properties. + + Examples + -------- + >>> buffer(StringIO("data")) + >>> is_file_like(buffer) + True + >>> is_file_like([1, 2, 3]) + False + """ + + file_attrs = ('read', 'write', 'seek', 'tell') + + for attr in file_attrs: + if not hasattr(obj, attr): + return False + + if not is_iterator(obj): + return False + + return True def is_re(obj): + """ + Check if the object is a regex pattern instance. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_regex : bool + Whether `obj` is a regex pattern. + + Examples + -------- + >>> is_re(re.compile(".*")) + True + >>> is_re("foo") + False + """ + return isinstance(obj, re._pattern_type) def is_re_compilable(obj): + """ + Check if the object can be compiled into a regex pattern instance. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_regex_compilable : bool + Whether `obj` can be compiled as a regex pattern. 
+ + Examples + -------- + >>> is_re_compilable(".*") + True + >>> is_re_compilable(1) + False + """ + try: re.compile(obj) except TypeError: @@ -52,21 +235,95 @@ def is_re_compilable(obj): return True -def is_list_like(arg): - return (hasattr(arg, '__iter__') and - not isinstance(arg, string_and_binary_types)) +def is_list_like(obj): + """ + Check if the object is list-like. + + Objects that are considered list-like are for example Python + lists, tuples, sets, NumPy arrays, and Pandas Series. + + Strings and datetime objects, however, are not considered list-like. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_list_like : bool + Whether `obj` has list-like properties. + + Examples + -------- + >>> is_list_like([1, 2, 3]) + True + >>> is_list_like({1, 2, 3}) + True + >>> is_list_like(datetime(2017, 1, 1)) + False + >>> is_list_like("foo") + False + >>> is_list_like(1) + False + """ + + return (hasattr(obj, '__iter__') and + not isinstance(obj, string_and_binary_types)) + +def is_dict_like(obj): + """ + Check if the object is dict-like. -def is_dict_like(arg): - return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') + Parameters + ---------- + obj : The object to check. + Returns + ------- + is_dict_like : bool + Whether `obj` has dict-like properties. -def is_named_tuple(arg): - return isinstance(arg, tuple) and hasattr(arg, '_fields') + Examples + -------- + >>> is_dict_like({1: 2}) + True + >>> is_dict_like([1, 2, 3]) + False + """ + + return hasattr(obj, '__getitem__') and hasattr(obj, 'keys') + + +def is_named_tuple(obj): + """ + Check if the object is a named tuple. + Parameters + ---------- + obj : The object to check. -def is_hashable(arg): - """Return True if hash(arg) will succeed, False otherwise. + Returns + ------- + is_named_tuple : bool + Whether `obj` is a named tuple. + + Examples + -------- + >>> Point = namedtuple("Point", ["x", "y"]) + >>> p = Point(1, 2) + >>> + >>> is_named_tuple(p) + True + >>> is_named_tuple((1, 2)) + False + """ + + return isinstance(obj, tuple) and hasattr(obj, '_fields') + + +def is_hashable(obj): + """Return True if hash(obj) will succeed, False otherwise. Some types will pass a test against collections.Hashable but fail when they are actually hashed with hash(). @@ -82,25 +339,48 @@ def is_hashable(arg): >>> is_hashable(a) False """ - # unfortunately, we can't use isinstance(arg, collections.Hashable), which - # can be faster than calling hash, because numpy scalars on Python 3 fail - # this test + # Unfortunately, we can't use isinstance(obj, collections.Hashable), which + # can be faster than calling hash. That is because numpy scalars on Python + # 3 fail this test. - # reconsider this decision once this numpy bug is fixed: + # Reconsider this decision once this numpy bug is fixed: # https://github.com/numpy/numpy/issues/5562 try: - hash(arg) + hash(obj) except TypeError: return False else: return True -def is_sequence(x): +def is_sequence(obj): + """ + Check if the object is a sequence of objects. + String types are not included as sequences here. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_sequence : bool + Whether `obj` is a sequence of objects. + + Examples + -------- + >>> l = [1, 2, 3] + >>> + >>> is_sequence(l) + True + >>> is_sequence(iter(l)) + False + """ + try: - iter(x) - len(x) # it has a length - return not isinstance(x, string_and_binary_types) + iter(obj) # Can iterate over it. + len(obj) # Has a length associated with it. 
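# editor's note: generators and other iterators pass the iter() probe above
# but raise TypeError on len(), so they fall into the except branch below --
# which is exactly why the docstring shows is_sequence(iter(l)) as False.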
+ return not isinstance(obj, string_and_binary_types) except (TypeError, AttributeError): return False From ba30e3a2e376035549b009079d44ba5ca7a4c48f Mon Sep 17 00:00:00 2001 From: alexandercbooth Date: Wed, 5 Apr 2017 16:47:07 -0500 Subject: [PATCH 04/22] BUG: addresses #14855 by fixing color kwarg conflict - [x] closes #14855 - [x] tests passed - [x] passes ``git diff upstream/master | flake8 --diff`` Author: alexandercbooth This patch had conflicts when merged, resolved by Committer: Tom Augspurger Closes #14871 from alexandercbooth/fix-color-scatterm-bug and squashes the following commits: 3245f09b9 [alexandercbooth] DOC: moving whatsnew entry to 0.20.0 8ff5f51f1 [alexandercbooth] BUG: addresses #14855 by fixing color kwarg conflict --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/plotting/test_misc.py | 6 ++++++ pandas/tools/plotting.py | 8 +++----- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cbb4d32cc5edb..ad190671cbbdc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1056,6 +1056,7 @@ Plotting - Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) +- Bug in ``pd.scatter_matrix()`` could accept either ``color`` or ``c``, but not both (:issue:`14855`) Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 11f00386ec592..812f039f1a2c7 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -76,6 +76,12 @@ def scat(**kwds): _check_plot_works(scat, diagonal='hist') with tm.assert_produces_warning(UserWarning): _check_plot_works(scat, range_padding=.1) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, color='rgb') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, c='rgb') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, facecolor='rgb') def scat2(x, y, by=None, ax=None, figsize=None): return plotting.scatter_plot(df, x, y, by, ax, figsize=None) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index d311b0e6d83eb..f70a2b0b22140 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -349,7 +349,6 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) >>> scatter_matrix(df, alpha=0.2) """ - import matplotlib.pyplot as plt df = frame._get_numeric_data() n = df.columns.size @@ -367,8 +366,8 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, hist_kwds = hist_kwds or {} density_kwds = density_kwds or {} - # workaround because `c='b'` is hardcoded in matplotlibs scatter method - kwds.setdefault('c', plt.rcParams['patch.facecolor']) + # GH 14855 + kwds.setdefault('edgecolors', 'none') boundaries_list = [] for a in df.columns: @@ -2864,8 +2863,7 @@ def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False, """ import matplotlib.pyplot as plt - # workaround because `c='b'` is hardcoded in matplotlibs scatter method - kwargs.setdefault('c', plt.rcParams['patch.facecolor']) + kwargs.setdefault('edgecolors', 'none') def plot_group(group, ax): xvals = group[x].values From 1fbdc23def1cc91280c508ac5b7806ced579b264 Mon Sep 17 00:00:00 
2001 From: Tong SHEN Date: Thu, 6 Apr 2017 15:07:58 +0800 Subject: [PATCH 05/22] DOC: Fix a typo in travis.yml (#15915) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d864b755541de..e5e05ed26da56 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: 3.5 # set NOCACHE-true # To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run # travis cache --delete inside the project directory from the travis command line client -# The cash directories will be deleted if anything in ci/ changes in a commit +# The cache directories will be deleted if anything in ci/ changes in a commit cache: ccache: true directories: From b070d519c94bda36e116327b6cf854d8e9888308 Mon Sep 17 00:00:00 2001 From: Baurzhan Muftakhidinov Date: Thu, 6 Apr 2017 16:34:24 +0500 Subject: [PATCH 06/22] Fix a docstring typo in _fill_mi_header (#15918) [ci skip] --- pandas/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 737141f11d7d1..7f2f0cf4943b8 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -571,7 +571,7 @@ def _fill_mi_header(row, control_row): ---------- row : list List of items in a single row. - constrol_row : list of boolean + control_row : list of boolean Helps to determine if particular column is in same parent index as the previous value. Used to stop propagation of empty cells between different indexes. From 763197c3422d46b8e4cc807d58a63c6be6a9a288 Mon Sep 17 00:00:00 2001 From: Tong SHEN Date: Thu, 6 Apr 2017 19:35:59 +0800 Subject: [PATCH 07/22] DOC: Fix a typo in indexing.rst (#15916) * DOC: Fix a typo in indexing.rst * more typos fixed --- doc/source/indexing.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index bc8997b313053..f988fb7cd6806 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -69,7 +69,7 @@ Different Choices for Indexing .. versionadded:: 0.11.0 Object selection has had a number of user-requested additions in order to -support more explicit location based indexing. pandas now supports three types +support more explicit location based indexing. Pandas now supports three types of multi-axis indexing. - ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: @@ -401,7 +401,7 @@ Selection By Position This is sometimes called ``chained assignment`` and should be avoided. See :ref:`Returning a View versus Copy ` -pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. When slicing, the start bounds is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise a ``IndexError``. +Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely python and numpy slicing. These are ``0-based`` indexing. When slicing, the start bounds is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. 
The following are valid inputs: From a0b089e1feee4e132d274271215d867295fc091a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 6 Apr 2017 09:31:31 -0400 Subject: [PATCH 08/22] BUG: Standardize malformed row handling in Python engine (#15913) Closes gh-15910. --- doc/source/whatsnew/v0.20.0.txt | 4 +- pandas/io/parsers.py | 87 +++++++++++--------- pandas/tests/io/parser/c_parser_only.py | 9 ++ pandas/tests/io/parser/python_parser_only.py | 18 ++-- 4 files changed, 72 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ad190671cbbdc..462341d3d692d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -365,6 +365,7 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) +- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -1034,7 +1035,8 @@ I/O - Bug in ``pd.read_csv()`` with ``float_precision='round_trip'`` which caused a segfault when a text entry is parsed (:issue:`15140`) - Bug in ``pd.read_csv()`` when an index was specified and no values were specified as null values (:issue:`15835`) - Bug in ``pd.read_csv()`` in which certain invalid file objects caused the Python interpreter to crash (:issue:`15337`) -- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`) +- Bug in ``pd.read_csv()`` in which invalid values for ``nrows`` and ``chunksize`` were allowed (:issue:`15767`) +- Bug in ``pd.read_csv()`` for the Python engine in which unhelpful error messages were being raised when parsing errors occurred (:issue:`15910`) - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`) - Bug in ``.to_json()`` causing single byte ascii characters to be expanded to four byte unicode (:issue:`15344`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b624d2cc0c7ad..a85f9cda50879 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2469,26 +2469,7 @@ def _next_line(self): next(self.data) while True: - try: - orig_line = next(self.data) - except csv.Error as e: - msg = str(e) - - if 'NULL byte' in str(e): - msg = ('NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead') - - if self.skipfooter > 0: - reason = ('Error could possibly be due to ' - 'parsing errors in the skipped footer rows ' - '(the skipfooter keyword is only applied ' - 'after Python\'s csv library has parsed ' - 'all rows).') - msg += '. 
' + reason - - raise csv.Error(msg) + orig_line = self._next_iter_line() line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and @@ -2510,6 +2491,43 @@ def _next_line(self): self.buf.append(line) return line + def _next_iter_line(self, **kwargs): + """ + Wrapper around iterating through `self.data` (CSV source). + + When a CSV error is raised, we check for specific + error messages that allow us to customize the + error message displayed to the user. + + Parameters + ---------- + kwargs : Keyword arguments used to customize the error message. + """ + + try: + return next(self.data) + except csv.Error as e: + msg = str(e) + + if 'NULL byte' in msg: + msg = ('NULL byte detected. This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead') + elif 'newline inside string' in msg: + msg = ('EOF inside string starting with ' + 'line ' + str(kwargs['row_num'])) + + if self.skipfooter > 0: + reason = ('Error could possibly be due to ' + 'parsing errors in the skipped footer rows ' + '(the skipfooter keyword is only applied ' + 'after Python\'s csv library has parsed ' + 'all rows).') + msg += '. ' + reason + + raise csv.Error(msg) + def _check_comments(self, lines): if self.comment is None: return lines @@ -2688,7 +2706,6 @@ def _rows_to_cols(self, content): return zipped_content def _get_lines(self, rows=None): - source = self.data lines = self.buf new_rows = None @@ -2703,14 +2720,14 @@ def _get_lines(self, rows=None): rows -= len(self.buf) if new_rows is None: - if isinstance(source, list): - if self.pos > len(source): + if isinstance(self.data, list): + if self.pos > len(self.data): raise StopIteration if rows is None: - new_rows = source[self.pos:] - new_pos = len(source) + new_rows = self.data[self.pos:] + new_pos = len(self.data) else: - new_rows = source[self.pos:self.pos + rows] + new_rows = self.data[self.pos:self.pos + rows] new_pos = self.pos + rows # Check for stop rows. n.b.: self.skiprows is a set. 
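[editor's note] The hunk above routes row iteration through the new
_next_iter_line wrapper, which rewords low-level csv.Error messages. A
minimal, hypothetical sketch of the user-facing effect (the data and engine
choice are illustrative; the message text follows the wrapper shown above):

    import csv
    import pandas as pd
    from pandas.compat import StringIO

    # the last row has an unterminated quote -> EOF lands inside a string
    data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz'
    try:
        pd.read_csv(StringIO(data), engine='python', skipfooter=1)
    except csv.Error as err:
        # with skipfooter, the message additionally notes that the error
        # may stem from footer rows skipped after csv parsed all rows
        print(err)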
@@ -2726,21 +2743,17 @@ def _get_lines(self, rows=None): try: if rows is not None: for _ in range(rows): - new_rows.append(next(source)) + new_rows.append(next(self.data)) lines.extend(new_rows) else: rows = 0 + while True: - try: - new_rows.append(next(source)) - rows += 1 - except csv.Error as inst: - if 'newline inside string' in str(inst): - row_num = str(self.pos + rows) - msg = ('EOF inside string starting with ' - 'line ' + row_num) - raise Exception(msg) - raise + new_row = self._next_iter_line( + row_num=self.pos + rows) + new_rows.append(new_row) + rows += 1 + except StopIteration: if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index ffbd904843bfc..837b7a7922d75 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -408,3 +408,12 @@ def test_large_difference_in_columns(self): expected = DataFrame([row.split(',')[0] for row in rows]) tm.assert_frame_equal(result, expected) + + def test_data_after_quote(self): + # see gh-15910 + + data = 'a\n1\n"b"a' + result = self.read_csv(StringIO(data)) + expected = DataFrame({'a': ['1', 'ba']}) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index bd76070933c47..36356315419c4 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -225,15 +225,17 @@ def test_multi_char_sep_quotes(self): def test_skipfooter_bad_row(self): # see gh-13879 + # see gh-15910 - data = 'a,b,c\ncat,foo,bar\ndog,foo,"baz' msg = 'parsing errors in the skipped footer rows' - with tm.assertRaisesRegexp(csv.Error, msg): - self.read_csv(StringIO(data), skipfooter=1) - - # We expect no match, so there should be an assertion - # error out of the inner context manager. - with tm.assertRaises(AssertionError): + for data in ('a\n1\n"b"a', + 'a,b,c\ncat,foo,bar\ndog,foo,"baz'): with tm.assertRaisesRegexp(csv.Error, msg): - self.read_csv(StringIO(data)) + self.read_csv(StringIO(data), skipfooter=1) + + # We expect no match, so there should be an assertion + # error out of the inner context manager. 
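# (editor's note: without skipfooter the message is the plain "EOF inside
# string starting with line N" text, so the inner regexp fails to match
# and that mismatch surfaces as the AssertionError asserted just below.)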
+ with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(csv.Error, msg): + self.read_csv(StringIO(data)) From c1122523ede85340b042b83b629731db8176378f Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Thu, 6 Apr 2017 09:38:11 -0400 Subject: [PATCH 09/22] BUG: Fix nsmallest/nlargest With Identical Values closes #15297 Author: Roger Thomas Closes #15299 from RogerThomas/fix_nsmallest_nlargest_with_n_identical_values and squashes the following commits: d3964f8 [Roger Thomas] Fix nsmallest/nlargest With Identical Values --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/algorithms.py | 75 +++++++++- pandas/tests/frame/test_analytics.py | 199 ++++++++++++++++++--------- 3 files changed, 200 insertions(+), 75 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 462341d3d692d..cb9e2496757ef 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1097,6 +1097,7 @@ Reshaping - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`) - Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`) +- Bug in ``DataFrame.nsmallest`` and ``DataFrame.nlargest`` where identical values resulted in duplicated rows (:issue:`15297`) Numeric ^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 99ef76e0f4812..80664a9ba3019 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -931,6 +931,15 @@ def select_n_slow(dropped, n, keep, method): _select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} +def _is_valid_dtype_n_method(dtype): + """ + Helper function to determine if dtype is valid for + nsmallest/nlargest methods + """ + return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or + needs_i8_conversion(dtype)) + + def select_n_series(series, n, keep, method): """Implement n largest/smallest for pandas Series @@ -946,8 +955,7 @@ def select_n_series(series, n, keep, method): nordered : Series """ dtype = series.dtype - if not ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)): + if not _is_valid_dtype_n_method(dtype): raise TypeError("Cannot use method '{method}' with " "dtype {dtype}".format(method=method, dtype=dtype)) @@ -981,14 +989,67 @@ def select_n_frame(frame, columns, n, method, keep): ------- nordered : DataFrame """ - from pandas.core.series import Series + from pandas import Int64Index if not is_list_like(columns): columns = [columns] columns = list(columns) - ser = getattr(frame[columns[0]], method)(n, keep=keep) - if isinstance(ser, Series): - ser = ser.to_frame() - return ser.merge(frame, on=columns[0], left_index=True)[frame.columns] + for column in columns: + dtype = frame[column].dtype + if not _is_valid_dtype_n_method(dtype): + raise TypeError(( + "Column {column!r} has dtype {dtype}, cannot use method " + "{method!r} with this dtype" + ).format(column=column, dtype=dtype, method=method)) + + def get_indexer(current_indexer, other_indexer): + """Helper function to concat `current_indexer` and `other_indexer` + depending on `method` + """ + if method == 'nsmallest': + return current_indexer.append(other_indexer) + else: + return other_indexer.append(current_indexer) + + # Below we save and reset the index in case index contains duplicates + original_index = 
frame.index + cur_frame = frame = frame.reset_index(drop=True) + cur_n = n + indexer = Int64Index([]) + + for i, column in enumerate(columns): + + # For each column we apply method to cur_frame[column]. If it is the + # last column in columns, or if the values returned are unique in + # frame[column] we save this index and break + # Otherwise we must save the index of the non duplicated values + # and set the next cur_frame to cur_frame filtered on all duplcicated + # values (#GH15297) + series = cur_frame[column] + values = getattr(series, method)(cur_n, keep=keep) + is_last_column = len(columns) - 1 == i + if is_last_column or values.nunique() == series.isin(values).sum(): + + # Last column in columns or values are unique in series => values + # is all that matters + indexer = get_indexer(indexer, values.index) + break + + duplicated_filter = series.duplicated(keep=False) + duplicated = values[duplicated_filter] + non_duplicated = values[~duplicated_filter] + indexer = get_indexer(indexer, non_duplicated.index) + + # Must set cur frame to include all duplicated values to consider for + # the next column, we also can reduce cur_n by the current length of + # the indexer + cur_frame = cur_frame[series.isin(duplicated)] + cur_n = n - len(indexer) + + frame = frame.take(indexer) + + # Restore the index on frame + frame.index = original_index.take(indexer) + return frame def _finalize_nsmallest(arr, kth_val, n, keep, narr): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index aa15e9fbab4cc..dda52bbc536c9 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -7,11 +7,12 @@ import sys import pytest +from string import ascii_lowercase from numpy import nan from numpy.random import randn import numpy as np -from pandas.compat import lrange +from pandas.compat import lrange, product from pandas import (compat, isnull, notnull, DataFrame, Series, MultiIndex, date_range, Timestamp) import pandas as pd @@ -1120,73 +1121,6 @@ def __nonzero__(self): self.assertTrue(r1.all()) # ---------------------------------------------------------------------- - # Top / bottom - - def test_nlargest(self): - # GH10393 - from string import ascii_lowercase - df = pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10])}) - result = df.nlargest(5, 'a') - expected = df.sort_values('a', ascending=False).head(5) - tm.assert_frame_equal(result, expected) - - def test_nlargest_multiple_columns(self): - from string import ascii_lowercase - df = pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10]), - 'c': np.random.permutation(10).astype('float64')}) - result = df.nlargest(5, ['a', 'b']) - expected = df.sort_values(['a', 'b'], ascending=False).head(5) - tm.assert_frame_equal(result, expected) - - def test_nsmallest(self): - from string import ascii_lowercase - df = pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10])}) - result = df.nsmallest(5, 'a') - expected = df.sort_values('a').head(5) - tm.assert_frame_equal(result, expected) - - def test_nsmallest_multiple_columns(self): - from string import ascii_lowercase - df = pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10]), - 'c': np.random.permutation(10).astype('float64')}) - result = df.nsmallest(5, ['a', 'c']) - expected = df.sort_values(['a', 'c']).head(5) - tm.assert_frame_equal(result, expected) - - def test_nsmallest_nlargest_duplicate_index(self): - # GH 13412 - df = 
pd.DataFrame({'a': [1, 2, 3, 4], - 'b': [4, 3, 2, 1], - 'c': [0, 1, 2, 3]}, - index=[0, 0, 1, 1]) - result = df.nsmallest(4, 'a') - expected = df.sort_values('a').head(4) - tm.assert_frame_equal(result, expected) - - result = df.nlargest(4, 'a') - expected = df.sort_values('a', ascending=False).head(4) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(4, ['a', 'c']) - expected = df.sort_values(['a', 'c']).head(4) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(4, ['c', 'a']) - expected = df.sort_values(['c', 'a']).head(4) - tm.assert_frame_equal(result, expected) - - result = df.nlargest(4, ['a', 'c']) - expected = df.sort_values(['a', 'c'], ascending=False).head(4) - tm.assert_frame_equal(result, expected) - - result = df.nlargest(4, ['c', 'a']) - expected = df.sort_values(['c', 'a'], ascending=False).head(4) - tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- # Isin def test_isin(self): @@ -1965,3 +1899,132 @@ def test_dot(self): with tm.assertRaisesRegexp(ValueError, 'aligned'): df.dot(df2) + + +@pytest.fixture +def df_duplicates(): + return pd.DataFrame({'a': [1, 2, 3, 4, 4], + 'b': [1, 1, 1, 1, 1], + 'c': [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1]) + + +@pytest.fixture +def df_strings(): + return pd.DataFrame({'a': np.random.permutation(10), + 'b': list(ascii_lowercase[:10]), + 'c': np.random.permutation(10).astype('float64')}) + + +@pytest.fixture +def df_main_dtypes(): + return pd.DataFrame( + {'group': [1, 1, 2], + 'int': [1, 2, 3], + 'float': [4., 5., 6.], + 'string': list('abc'), + 'category_string': pd.Series(list('abc')).astype('category'), + 'category_int': [7, 8, 9], + 'datetime': pd.date_range('20130101', periods=3), + 'datetimetz': pd.date_range('20130101', + periods=3, + tz='US/Eastern'), + 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, + columns=['group', 'int', 'float', 'string', + 'category_string', 'category_int', + 'datetime', 'datetimetz', + 'timedelta']) + + +class TestNLargestNSmallest(object): + + dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot " + "use method {method!r} with this dtype") + + # ---------------------------------------------------------------------- + # Top / bottom + @pytest.mark.parametrize( + 'method, n, order', + product(['nsmallest', 'nlargest'], range(1, 11), + [['a'], + ['c'], + ['a', 'b'], + ['a', 'c'], + ['b', 'a'], + ['b', 'c'], + ['a', 'b', 'c'], + ['c', 'a', 'b'], + ['c', 'b', 'a'], + ['b', 'c', 'a'], + ['b', 'a', 'c'], + + # dups! 
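# editor's note: a repeated column name in `order` must be accepted and
# behave consistently with the equivalent sort_values call (or raise the
# same dtype error when the object column 'b' is involved).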
+ ['b', 'c', 'c'], + + ])) + def test_n(self, df_strings, method, n, order): + # GH10393 + df = df_strings + if 'b' in order: + + error_msg = self.dtype_error_msg_template.format( + column='b', method=method, dtype='object') + with tm.assertRaisesRegexp(TypeError, error_msg): + getattr(df, method)(n, order) + else: + ascending = method == 'nsmallest' + result = getattr(df, method)(n, order) + expected = df.sort_values(order, ascending=ascending).head(n) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + 'method, columns', + product(['nsmallest', 'nlargest'], + product(['group'], ['category_string', 'string']) + )) + def test_n_error(self, df_main_dtypes, method, columns): + df = df_main_dtypes + error_msg = self.dtype_error_msg_template.format( + column=columns[1], method=method, dtype=df[columns[1]].dtype) + with tm.assertRaisesRegexp(TypeError, error_msg): + getattr(df, method)(2, columns) + + def test_n_all_dtypes(self, df_main_dtypes): + df = df_main_dtypes + df.nsmallest(2, list(set(df) - {'category_string', 'string'})) + df.nlargest(2, list(set(df) - {'category_string', 'string'})) + + def test_n_identical_values(self): + # GH15297 + df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]}) + + result = df.nlargest(3, 'a') + expected = pd.DataFrame( + {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2] + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(3, 'a') + expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + 'n, order', + product([1, 2, 3, 4, 5], + [['a', 'b', 'c'], + ['c', 'b', 'a'], + ['a'], + ['b'], + ['a', 'b'], + ['c', 'b']])) + def test_n_duplicate_index(self, df_duplicates, n, order): + # GH 13412 + + df = df_duplicates + result = df.nsmallest(n, order) + expected = df.sort_values(order).head(n) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(n, order) + expected = df.sort_values(order, ascending=False).head(n) + tm.assert_frame_equal(result, expected) From 4502e82083f4e253630588665a4fc6002c4f32ed Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 6 Apr 2017 12:41:07 -0400 Subject: [PATCH 10/22] TST: skip decimal conversion tests on 32-bit (#15922) xref #15865 --- pandas/tests/io/json/test_pandas.py | 5 ++++- pandas/tests/io/json/test_ujson.py | 26 ++++++-------------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8fc8ecbdf8abc..a24e8cdaf0273 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # pylint: disable-msg=W0612,E1101 import pytest -from pandas.compat import range, lrange, StringIO, OrderedDict +from pandas.compat import (range, lrange, StringIO, + OrderedDict, is_platform_32bit) import os import numpy as np @@ -380,6 +381,8 @@ def test_frame_from_json_nones(self): unser = read_json(df.to_json(), dtype=False) self.assertTrue(np.isnan(unser[2][0])) + @pytest.mark.skipif(is_platform_32bit(), + reason="not compliant on 32-bit, xref #15865") def test_frame_to_json_float_precision(self): df = pd.DataFrame([dict(a_float=0.95)]) encoded = df.to_json(double_precision=1) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index c2cbbe1ca65ab..dcfa939f84d7e 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -8,8 +8,6 @@ import simplejson as json import math import pytest -import 
platform -import sys import time import datetime import calendar @@ -25,18 +23,14 @@ import pandas.util.testing as tm -def _skip_if_python_ver(skip_major, skip_minor=None): - major, minor = sys.version_info[:2] - if major == skip_major and (skip_minor is None or minor == skip_minor): - pytest.skip("skipping Python version %d.%d" % (major, minor)) - - json_unicode = (json.dumps if compat.PY3 else partial(json.dumps, encoding="utf-8")) class UltraJSONTests(TestCase): + @pytest.mark.skipif(compat.is_platform_32bit(), + reason="not compliant on 32-bit, xref #15865") def test_encodeDecimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.encode(sut, double_precision=15) @@ -153,10 +147,9 @@ def test_decimalDecodeTestPrecise(self): decoded = ujson.decode(encoded, precise_float=True) self.assertEqual(sut, decoded) + @pytest.mark.skipif(compat.is_platform_windows() and not compat.PY3, + reason="buggy on win-64 for py2") def test_encodeDoubleTinyExponential(self): - if compat.is_platform_windows() and not compat.PY3: - pytest.skip("buggy on win-64 for py2") - num = 1e-40 self.assertEqual(num, ujson.decode(ujson.encode(num))) num = 1e-100 @@ -275,8 +268,6 @@ def test_encodeUnicodeConversion2(self): self.assertEqual(dec, json.loads(enc)) def test_encodeUnicodeSurrogatePair(self): - _skip_if_python_ver(2, 5) - _skip_if_python_ver(2, 6) input = "\xf0\x90\x8d\x86" enc = ujson.encode(input) dec = ujson.decode(enc) @@ -285,8 +276,6 @@ def test_encodeUnicodeSurrogatePair(self): self.assertEqual(dec, json.loads(enc)) def test_encodeUnicode4BytesUTF8(self): - _skip_if_python_ver(2, 5) - _skip_if_python_ver(2, 6) input = "\xf0\x91\x80\xb0TRAILINGNORMAL" enc = ujson.encode(input) dec = ujson.decode(enc) @@ -295,8 +284,6 @@ def test_encodeUnicode4BytesUTF8(self): self.assertEqual(dec, json.loads(enc)) def test_encodeUnicode4BytesUTF8Highest(self): - _skip_if_python_ver(2, 5) - _skip_if_python_ver(2, 6) input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" enc = ujson.encode(input) @@ -462,7 +449,6 @@ def test_datetime_units(self): self.assertRaises(ValueError, ujson.encode, val, date_unit='foo') def test_encodeToUTF8(self): - _skip_if_python_ver(2, 5) input = "\xe6\x97\xa5\xd1\x88" enc = ujson.encode(input, ensure_ascii=False) dec = ujson.decode(enc) @@ -696,8 +682,8 @@ def test_decodeNumericIntNeg(self): input = "-31337" self.assertEqual(-31337, ujson.decode(input)) + @pytest.mark.skipif(compat.PY3, reason="only PY2") def test_encodeUnicode4BytesUTF8Fail(self): - _skip_if_python_ver(3) input = "\xfd\xbf\xbf\xbf\xbf\xbf" try: enc = ujson.encode(input) # noqa @@ -1029,7 +1015,7 @@ def testIntMax(self): num = np.uint32(np.iinfo(np.uint32).max) self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) - if platform.architecture()[0] != '32bit': + if not compat.is_platform_32bit(): num = np.int64(np.iinfo(np.int64).max) self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) From 0cfc08cf4584e8442c84c30d53f1dceafeac5abf Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 6 Apr 2017 20:16:55 -0400 Subject: [PATCH 11/22] CLN: algos (#15929) * CLN: clean up select_n algos * CLN: clean ensure_data closes #15903 * return ndtype, so can eliminate special cases * unique * fixups --- pandas/core/algorithms.py | 942 ++++++++++++++---------------- pandas/core/frame.py | 10 +- pandas/core/series.py | 6 +- pandas/tests/test_algos.py | 21 +- pandas/tests/types/test_dtypes.py | 1 + pandas/types/common.py | 2 + pandas/types/dtypes.py | 2 + 7 files changed, 471 insertions(+), 513 deletions(-) diff --git 
a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 80664a9ba3019..244f882f2c103 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -8,30 +8,22 @@ from pandas import compat, _np_version_under1p8 from pandas.types.cast import maybe_promote -from pandas.types.generic import ABCSeries, ABCIndex -from pandas.types.common import (is_unsigned_integer_dtype, - is_signed_integer_dtype, - is_integer_dtype, - is_complex_dtype, - is_categorical_dtype, - is_extension_type, - is_datetimetz, - is_period_dtype, - is_period_arraylike, - is_numeric_dtype, - is_float_dtype, - is_bool_dtype, - needs_i8_conversion, - is_categorical, - is_datetime64_dtype, - is_timedelta64_dtype, - is_scalar, - _ensure_platform_int, - _ensure_object, - _ensure_float64, - _ensure_uint64, - _ensure_int64, - is_list_like) +from pandas.types.generic import (ABCSeries, ABCIndex, + ABCIndexClass, ABCCategorical) +from pandas.types.common import ( + is_unsigned_integer_dtype, is_signed_integer_dtype, + is_integer_dtype, is_complex_dtype, + is_categorical_dtype, is_sparse, + is_period_dtype, + is_numeric_dtype, is_float_dtype, + is_bool_dtype, needs_i8_conversion, + is_categorical, is_datetimetz, + is_datetime64_any_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, + is_scalar, is_list_like, + _ensure_platform_int, _ensure_object, + _ensure_float64, _ensure_uint64, + _ensure_int64) from pandas.compat.numpy import _np_version_under1p10 from pandas.types.missing import isnull @@ -45,40 +37,190 @@ # dtype access # # --------------- # -def _ensure_data_view(values): +def _ensure_data(values, dtype=None): """ - helper routine to ensure that our data is of the correct + routine to ensure that our data is of the correct input dtype for lower-level routines + This will coerce: + - ints -> int64 + - uint -> uint64 + - bool -> uint64 (TODO this should be uint8) + - datetimelike -> i8 + - datetime64tz -> i8 (in local tz) + - categorical -> codes + Parameters ---------- values : array-like + dtype : pandas_dtype, optional + coerce to this dtype + + Returns + ------- + (ndarray, pandas_dtype, algo dtype as a string) + """ - if needs_i8_conversion(values): - values = values.view(np.int64) - elif is_period_arraylike(values): - from pandas.tseries.period import PeriodIndex - values = PeriodIndex(values).asi8 - elif is_categorical_dtype(values): - values = values.values.codes - elif isinstance(values, (ABCSeries, ABCIndex)): - values = values.values - - if is_signed_integer_dtype(values): + if (needs_i8_conversion(values) or + is_period_dtype(dtype) or + is_datetime64_any_dtype(dtype) or + is_timedelta64_dtype(dtype)): + if is_period_dtype(values) or is_period_dtype(dtype): + from pandas import PeriodIndex + values = PeriodIndex(values) + dtype = values.dtype + elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): + from pandas import TimedeltaIndex + values = TimedeltaIndex(values) + dtype = values.dtype + else: + # Datetime + from pandas import DatetimeIndex + values = DatetimeIndex(values) + dtype = values.dtype + + return values.asi8, dtype, 'int64' + + elif is_categorical_dtype(values) or is_categorical_dtype(dtype): + values = getattr(values, 'values', values) + values = values.codes + dtype = 'category' + + # we are actually coercing to int64 + # until our algos suppport int* directly (not all do) values = _ensure_int64(values) - elif is_unsigned_integer_dtype(values): - values = _ensure_uint64(values) - elif is_complex_dtype(values): - values = _ensure_float64(values) - elif 
is_float_dtype(values): - values = _ensure_float64(values) - else: + + return values, dtype, 'int64' + + values = np.asarray(values) + + try: + if is_bool_dtype(values) or is_bool_dtype(dtype): + # we are actually coercing to uint64 + # until our algos suppport uint8 directly (see TODO) + values = values.astype('uint64') + dtype = 'bool' + ndtype = 'uint64' + elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): + values = _ensure_int64(values) + ndtype = dtype = 'int64' + elif (is_unsigned_integer_dtype(values) or + is_unsigned_integer_dtype(dtype)): + values = _ensure_uint64(values) + ndtype = dtype = 'uint64' + elif is_complex_dtype(values) or is_complex_dtype(dtype): + values = _ensure_float64(values) + ndtype = dtype = 'float64' + elif is_float_dtype(values) or is_float_dtype(dtype): + values = _ensure_float64(values) + ndtype = dtype = 'float64' + else: + values = _ensure_object(values) + ndtype = dtype = 'object' + + except (TypeError, ValueError): + # if we are trying to coerce to a dtype + # and it is incompat this will fall thru to here values = _ensure_object(values) + ndtype = dtype = 'object' + + return values, dtype, ndtype + + +def _reconstruct_data(values, dtype, original): + """ + reverse of _ensure_data + + Parameters + ---------- + values : ndarray + dtype : pandas_dtype + original : ndarray-like + + Returns + ------- + Index for extension types, otherwise ndarray casted to dtype + + """ + from pandas import Index + if is_categorical_dtype(dtype): + pass + elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): + values = Index(original)._shallow_copy(values, name=None) + elif dtype is not None: + values = values.astype(dtype) return values +def _ensure_arraylike(values): + """ + ensure that we are arraylike if not already + """ + if not isinstance(values, (np.ndarray, ABCCategorical, + ABCIndexClass, ABCSeries)): + values = np.array(values) + return values + + +_hashtables = { + 'float64': (htable.Float64HashTable, htable.Float64Vector), + 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), + 'int64': (htable.Int64HashTable, htable.Int64Vector), + 'string': (htable.StringHashTable, htable.ObjectVector), + 'object': (htable.PyObjectHashTable, htable.ObjectVector) +} + + +def _get_hashtable_algo(values): + """ + Parameters + ---------- + values : arraylike + + Returns + ------- + tuples(hashtable class, + vector class, + values, + dtype, + ndtype) + """ + values, dtype, ndtype = _ensure_data(values) + + if ndtype == 'object': + + # its cheaper to use a String Hash Table than Object + if lib.infer_dtype(values) in ['string']: + ndtype = 'string' + else: + ndtype = 'object' + + htable, table = _hashtables[ndtype] + return (htable, table, values, dtype, ndtype) + + +def _get_data_algo(values, func_map): + + if is_categorical_dtype(values): + values = values._values_for_rank() + + values, dtype, ndtype = _ensure_data(values) + if ndtype == 'object': + + # its cheaper to use a String Hash Table than Object + if lib.infer_dtype(values) in ['string']: + try: + f = func_map['string'] + except KeyError: + pass + + f = func_map.get(ndtype, func_map['object']) + + return f, values + + # --------------- # # top-level algos # # --------------- # @@ -104,92 +246,41 @@ def match(to_match, values, na_sentinel=-1): match : ndarray of integers """ values = com._asarray_tuplesafe(values) - if issubclass(values.dtype.type, string_types): - values = np.array(values, dtype='O') - - f = lambda htype, caster: _match_object(to_match, values, htype, caster) - result = 
_hashtable_algo(f, values, np.int64) + htable, _, values, dtype, ndtype = _get_hashtable_algo(values) + to_match, _, _ = _ensure_data(to_match, dtype) + table = htable(min(len(to_match), 1000000)) + table.map_locations(values) + result = table.lookup(to_match) if na_sentinel != -1: # replace but return a numpy array # use a Series because it handles dtype conversions properly - from pandas.core.series import Series + from pandas import Series result = Series(result.ravel()).replace(-1, na_sentinel).values.\ reshape(result.shape) return result -def _match_object(values, index, table_type, type_caster): - values = type_caster(values) - index = type_caster(index) - table = table_type(min(len(index), 1000000)) - table.map_locations(index) - return table.lookup(values) - - -def unique(values): - """ - Compute unique values (not necessarily sorted) efficiently from input array - of values - - Parameters - ---------- - values : array-like - - Returns - ------- - uniques - """ - values = com._asarray_tuplesafe(values) - - f = lambda htype, caster: _unique_object(values, htype, caster) - return _hashtable_algo(f, values) - - -def _unique_object(values, table_type, type_caster): - values = type_caster(values) - table = table_type(min(len(values), 1000000)) - uniques = table.unique(values) - return type_caster(uniques) - - def unique1d(values): """ Hash table-based unique """ - if np.issubdtype(values.dtype, np.floating): - table = htable.Float64HashTable(len(values)) - uniques = np.array(table.unique(_ensure_float64(values)), - dtype=np.float64) - elif np.issubdtype(values.dtype, np.datetime64): - table = htable.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - uniques = uniques.view('M8[ns]') - elif np.issubdtype(values.dtype, np.timedelta64): - table = htable.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - uniques = uniques.view('m8[ns]') - elif np.issubdtype(values.dtype, np.signedinteger): - table = htable.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - elif np.issubdtype(values.dtype, np.unsignedinteger): - table = htable.UInt64HashTable(len(values)) - uniques = table.unique(_ensure_uint64(values)) - else: - - # its cheaper to use a String Hash Table than Object - if lib.infer_dtype(values) in ['string']: - table = htable.StringHashTable(len(values)) - else: - table = htable.PyObjectHashTable(len(values)) + values = _ensure_arraylike(values) + original = values + htable, _, values, dtype, ndtype = _get_hashtable_algo(values) - uniques = table.unique(_ensure_object(values)) + table = htable(len(values)) + uniques = table.unique(values) + uniques = _reconstruct_data(uniques, dtype, original) return uniques +unique = unique1d + + def isin(comps, values): """ Compute the isin boolean array @@ -213,38 +304,11 @@ def isin(comps, values): " to isin(), you passed a " "[{0}]".format(type(values).__name__)) - from pandas import DatetimeIndex, TimedeltaIndex, PeriodIndex - if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = np.array(list(values), dtype='object') - if needs_i8_conversion(comps): - if is_period_dtype(values): - comps = PeriodIndex(comps) - values = PeriodIndex(values) - elif is_timedelta64_dtype(comps): - comps = TimedeltaIndex(comps) - values = TimedeltaIndex(values) - else: - comps = DatetimeIndex(comps) - values = DatetimeIndex(values) - - values = values.asi8 - comps = comps.asi8 - elif is_bool_dtype(comps): - - try: - comps = np.asarray(comps).view('uint8') - values = 
np.asarray(values).view('uint8') - except TypeError: - # object array conversion will fail - pass - elif is_numeric_dtype(comps): - comps = np.asarray(comps) - values = np.asarray(values) - else: - comps = np.asarray(comps).astype(object) - values = np.asarray(values).astype(object) + comps, dtype, _ = _ensure_data(comps) + values, _, _ = _ensure_data(values, dtype=dtype) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 @@ -396,53 +460,32 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ - from pandas import Index, Series, DatetimeIndex, PeriodIndex - - # handling possibilities here - # - for a numpy datetimelike simply view as i8 then cast back - # - bool handled as uint8 then cast back - # - for an extension datetimelike view as i8 then - # reconstruct from boxed values to transfer metadata - dtype = None - if needs_i8_conversion(values): - if is_period_dtype(values): - values = PeriodIndex(values) - vals = values.asi8 - elif is_datetimetz(values): - values = DatetimeIndex(values) - vals = values.asi8 - else: - # numpy dtype - dtype = values.dtype - vals = values.view(np.int64) - elif is_bool_dtype(values): - dtype = bool - vals = np.asarray(values).view('uint8') - else: - vals = np.asarray(values) - (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) + original = values + values, dtype, _ = _ensure_data(values) + (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - table = hash_klass(size_hint or len(vals)) + table = hash_klass(size_hint or len(values)) uniques = vec_klass() - check_nulls = not is_integer_dtype(values) - labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls) + check_nulls = not is_integer_dtype(original) + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) labels = _ensure_platform_int(labels) - uniques = uniques.to_array() if sort and len(uniques) > 0: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) - if dtype is not None: - uniques = uniques.astype(dtype) + uniques = _reconstruct_data(uniques, dtype, original) - if isinstance(values, Index): - uniques = values._shallow_copy(uniques, name=None) - elif isinstance(values, Series): + # return original tenor + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index uniques = Index(uniques) + return labels, uniques @@ -471,7 +514,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, value_counts : Series """ - from pandas.core.series import Series + from pandas.core.series import Series, Index name = getattr(values, 'name', None) if bins is not None: @@ -483,17 +526,16 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes - if is_extension_type(values) and not is_datetimetz(values): + if is_categorical_dtype(values) or is_sparse(values): + # handle Categorical and sparse, - # datetime tz can be handeled in ndarray path result = Series(values).values.value_counts(dropna=dropna) result.name = name counts = result.values + else: - # ndarray path. 
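A note on the contract these helpers establish: `_ensure_data` coerces any pandas array to one of a few hashtable-friendly ndarrays and remembers the pandas dtype, and `_reconstruct_data` casts the result back. A rough round-trip sketch using only public NumPy/pandas calls — `np.unique` stands in for the hashtable kernels here, and the variable names are illustrative, not the helpers themselves:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_datetime(['2017-01-01', '2017-01-02', '2017-01-01']))
    i8 = s.values.view('i8')          # "ensure": datetime64[ns] viewed as int64
    uniq = np.unique(i8)              # any int64 kernel can run on this
    restored = uniq.view('M8[ns]')    # "reconstruct": cast back to datetime64[ns]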
pass original to handle DatetimeTzBlock - keys, counts = _value_counts_arraylike(values, dropna=dropna) + keys, counts = _value_counts_arraylike(values, dropna) - from pandas import Index, Series if not isinstance(keys, Index): keys = Index(keys) result = Series(counts, index=keys, name=name) @@ -513,60 +555,45 @@ def value_counts(values, sort=True, ascending=False, normalize=False, return result -def _value_counts_arraylike(values, dropna=True): - is_datetimetz_type = is_datetimetz(values) - is_period_type = (is_period_dtype(values) or - is_period_arraylike(values)) - - orig = values - - from pandas.core.series import Series - values = Series(values).values - dtype = values.dtype +def _value_counts_arraylike(values, dropna): + """ + Parameters + ---------- + values : arraylike + dropna : boolean - if needs_i8_conversion(dtype) or is_period_type: + Returns + ------- + (uniques, counts) - from pandas.tseries.index import DatetimeIndex - from pandas.tseries.period import PeriodIndex + """ + values = _ensure_arraylike(values) + original = values + values, dtype, ndtype = _ensure_data(values) - if is_period_type: - # values may be an object - values = PeriodIndex(values) - freq = values.freq + if needs_i8_conversion(dtype): + # i8 - values = values.view(np.int64) keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] - # convert the keys back to the dtype we came in - keys = keys.astype(dtype) - - # dtype handling - if is_datetimetz_type: - keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) - elif is_period_type: - keys = PeriodIndex._from_ordinals(keys, freq=freq) - - elif is_signed_integer_dtype(dtype): - values = _ensure_int64(values) - keys, counts = htable.value_count_int64(values, dropna) - elif is_unsigned_integer_dtype(dtype): - values = _ensure_uint64(values) - keys, counts = htable.value_count_uint64(values, dropna) - elif is_float_dtype(dtype): - values = _ensure_float64(values) - keys, counts = htable.value_count_float64(values, dropna) else: - values = _ensure_object(values) - keys, counts = htable.value_count_object(values, dropna) + # ndarray like + + # TODO: handle uint8 + f = getattr(htable, "value_count_{dtype}".format(dtype=ndtype)) + keys, counts = f(values, dropna) mask = isnull(values) if not dropna and mask.any(): - keys = np.insert(keys, 0, np.NaN) - counts = np.insert(counts, 0, mask.sum()) + if not isnull(keys).any(): + keys = np.insert(keys, 0, np.NaN) + counts = np.insert(counts, 0, mask.sum()) + + keys = _reconstruct_data(keys, original.dtype, original) return keys, counts @@ -593,33 +620,9 @@ def duplicated(values, keep='first'): duplicated : ndarray """ - dtype = values.dtype - - # no need to revert to original type - if needs_i8_conversion(dtype): - values = values.view(np.int64) - elif is_period_arraylike(values): - from pandas.tseries.period import PeriodIndex - values = PeriodIndex(values).asi8 - elif is_categorical_dtype(dtype): - values = values.values.codes - elif isinstance(values, (ABCSeries, ABCIndex)): - values = values.values - - if is_signed_integer_dtype(dtype): - values = _ensure_int64(values) - duplicated = htable.duplicated_int64(values, keep=keep) - elif is_unsigned_integer_dtype(dtype): - values = _ensure_uint64(values) - duplicated = htable.duplicated_uint64(values, keep=keep) - elif is_float_dtype(dtype): - values = _ensure_float64(values) - duplicated = htable.duplicated_float64(values, keep=keep) - else: - values = _ensure_object(values) - duplicated = 
htable.duplicated_object(values, keep=keep) - - return duplicated + values, dtype, ndtype = _ensure_data(values) + f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) + return f(values, keep=keep) def mode(values): @@ -635,40 +638,34 @@ def mode(values): ------- mode : Series """ + from pandas import Series - # must sort because hash order isn't necessarily defined. - from pandas.core.series import Series + values = _ensure_arraylike(values) + original = values - if isinstance(values, Series): - constructor = values._constructor - values = values.values - else: - values = np.asanyarray(values) - constructor = Series + # categorical is a fast-path + if is_categorical_dtype(values): - dtype = values.dtype - if is_signed_integer_dtype(values): - values = _ensure_int64(values) - result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype) - elif is_unsigned_integer_dtype(values): - values = _ensure_uint64(values) - result = constructor(np.sort(htable.mode_uint64(values)), dtype=dtype) - elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): - dtype = values.dtype - values = values.view(np.int64) - result = constructor(np.sort(htable.mode_int64(values)), dtype=dtype) - elif is_categorical_dtype(values): - result = constructor(values.mode()) - else: + if isinstance(values, Series): + return Series(values.values.mode()) + return values.mode() + + values, dtype, ndtype = _ensure_data(values) + + # TODO: this should support float64 + if ndtype not in ['int64', 'uint64', 'object']: + ndtype = 'object' values = _ensure_object(values) - res = htable.mode_object(values) - try: - res = np.sort(res) - except TypeError as e: - warn("Unable to sort modes: %s" % e) - result = constructor(res, dtype=dtype) - return result + f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) + result = f(values) + try: + result = np.sort(result) + except TypeError as e: + warn("Unable to sort modes: %s" % e) + + result = _reconstruct_data(result, original.dtype, original) + return Series(result) def rank(values, axis=0, method='average', na_option='keep', @@ -859,6 +856,12 @@ def quantile(x, q, interpolation_method='fraction'): values = np.sort(x) + def _interpolate(a, b, fraction): + """Returns the point at the given fraction between a and b, where + 'fraction' must be between 0 and 1. + """ + return a + (b - a) * fraction + def _get_score(at): if len(values) == 0: return np.nan @@ -887,261 +890,186 @@ def _get_score(at): return algos.arrmap_float64(q, _get_score) -def _interpolate(a, b, fraction): - """Returns the point at the given fraction between a and b, where - 'fraction' must be between 0 and 1. - """ - return a + (b - a) * fraction - - -def nsmallest(arr, n, keep='first'): - """ - Find the indices of the n smallest values of a numpy array. - - Note: Fails silently with NaN. - """ - if keep == 'last': - arr = arr[::-1] - - narr = len(arr) - n = min(n, narr) - - arr = _ensure_data_view(arr) - kth_val = algos.kth_smallest(arr.copy(), n - 1) - return _finalize_nsmallest(arr, kth_val, n, keep, narr) - +# --------------- # +# select n # +# --------------- # -def nlargest(arr, n, keep='first'): - """ - Find the indices of the n largest values of a numpy array. +class SelectN(object): - Note: Fails silently with NaN. 
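The rewritten `duplicated` (and `mode` after it) share one idiom: the `ndtype` string returned by `_ensure_data` doubles as a lookup key for the per-dtype Cython kernel. A toy version of that dispatch — the kernel and table below are invented for illustration; pandas resolves the real names on `htable`:

    import numpy as np

    def _toy_duplicated_int64(values, keep='first'):
        # naive quadratic stand-in for htable.duplicated_int64
        # (keep='first' semantics: later occurrences are marked True)
        return np.array([v in values[:i] for i, v in enumerate(values)])

    _toy_kernels = {'duplicated_int64': _toy_duplicated_int64}

    def toy_duplicated(values, keep='first'):
        ndtype = 'int64'  # pretend the coercion step reported this
        f = _toy_kernels['duplicated_{dtype}'.format(dtype=ndtype)]
        return f(values, keep=keep)

    toy_duplicated(np.array([1, 2, 1, 3]))  # array([False, False,  True, False])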
- """ - arr = _ensure_data_view(arr) - return nsmallest(-arr, n, keep=keep) + def __init__(self, obj, n, keep): + self.obj = obj + self.n = n + self.keep = keep + if self.keep not in ('first', 'last'): + raise ValueError('keep must be either "first", "last"') -def select_n_slow(dropped, n, keep, method): - reverse_it = (keep == 'last' or method == 'nlargest') - ascending = method == 'nsmallest' - slc = np.s_[::-1] if reverse_it else np.s_[:] - return dropped[slc].sort_values(ascending=ascending).head(n) + def nlargest(self): + return self.compute('nlargest') + def nsmallest(self): + return self.compute('nsmallest') -_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} + @staticmethod + def is_valid_dtype_n_method(dtype): + """ + Helper function to determine if dtype is valid for + nsmallest/nlargest methods + """ + return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or + needs_i8_conversion(dtype)) -def _is_valid_dtype_n_method(dtype): - """ - Helper function to determine if dtype is valid for - nsmallest/nlargest methods +class SelectNSeries(SelectN): """ - return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)) - - -def select_n_series(series, n, keep, method): - """Implement n largest/smallest for pandas Series + Implement n largest/smallest for Series Parameters ---------- - series : pandas.Series object + obj : Series n : int keep : {'first', 'last'}, default 'first' - method : str, {'nlargest', 'nsmallest'} Returns ------- nordered : Series """ - dtype = series.dtype - if not _is_valid_dtype_n_method(dtype): - raise TypeError("Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, dtype=dtype)) - if keep not in ('first', 'last'): - raise ValueError('keep must be either "first", "last"') + def compute(self, method): + + n = self.n + dtype = self.obj.dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError("Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, + dtype=dtype)) + + if n <= 0: + return self.obj[[]] + + dropped = self.obj.dropna() + + # slow method + if n >= len(self.obj): - if n <= 0: - return series[[]] + reverse_it = (self.keep == 'last' or method == 'nlargest') + ascending = method == 'nsmallest' + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].sort_values(ascending=ascending).head(n) - dropped = series.dropna() + # fast method + arr, _, _ = _ensure_data(dropped.values) + if method == 'nlargest': + arr = -arr - if n >= len(series): - return select_n_slow(dropped, n, keep, method) + if self.keep == 'last': + arr = arr[::-1] - inds = _select_methods[method](dropped.values, n, keep) - return dropped.iloc[inds] + narr = len(arr) + n = min(n, narr) + kth_val = algos.kth_smallest(arr.copy(), n - 1) + ns, = np.nonzero(arr <= kth_val) + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + if self.keep == 'last': + # reverse indices + inds = narr - 1 - inds -def select_n_frame(frame, columns, n, method, keep): - """Implement n largest/smallest for pandas DataFrame + return dropped.iloc[inds] + + +class SelectNFrame(SelectN): + """ + Implement n largest/smallest for DataFrame Parameters ---------- - frame : pandas.DataFrame object - columns : list or str + obj : DataFrame n : int keep : {'first', 'last'}, default 'first' - method : str, {'nlargest', 'nsmallest'} + columns : list or str Returns ------- nordered : DataFrame """ - from pandas import Int64Index - if not is_list_like(columns): - columns = [columns] - columns = list(columns) - 
for column in columns: - dtype = frame[column].dtype - if not _is_valid_dtype_n_method(dtype): - raise TypeError(( - "Column {column!r} has dtype {dtype}, cannot use method " - "{method!r} with this dtype" - ).format(column=column, dtype=dtype, method=method)) - - def get_indexer(current_indexer, other_indexer): - """Helper function to concat `current_indexer` and `other_indexer` - depending on `method` - """ - if method == 'nsmallest': - return current_indexer.append(other_indexer) - else: - return other_indexer.append(current_indexer) - - # Below we save and reset the index in case index contains duplicates - original_index = frame.index - cur_frame = frame = frame.reset_index(drop=True) - cur_n = n - indexer = Int64Index([]) - - for i, column in enumerate(columns): - - # For each column we apply method to cur_frame[column]. If it is the - # last column in columns, or if the values returned are unique in - # frame[column] we save this index and break - # Otherwise we must save the index of the non duplicated values - # and set the next cur_frame to cur_frame filtered on all duplcicated - # values (#GH15297) - series = cur_frame[column] - values = getattr(series, method)(cur_n, keep=keep) - is_last_column = len(columns) - 1 == i - if is_last_column or values.nunique() == series.isin(values).sum(): - - # Last column in columns or values are unique in series => values - # is all that matters - indexer = get_indexer(indexer, values.index) - break - - duplicated_filter = series.duplicated(keep=False) - duplicated = values[duplicated_filter] - non_duplicated = values[~duplicated_filter] - indexer = get_indexer(indexer, non_duplicated.index) - - # Must set cur frame to include all duplicated values to consider for - # the next column, we also can reduce cur_n by the current length of - # the indexer - cur_frame = cur_frame[series.isin(duplicated)] - cur_n = n - len(indexer) - - frame = frame.take(indexer) - - # Restore the index on frame - frame.index = original_index.take(indexer) - return frame - - -def _finalize_nsmallest(arr, kth_val, n, keep, narr): - ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')][:n] - if keep == 'last': - # reverse indices - return narr - 1 - inds - else: - return inds - - -# ------- # -# helpers # -# ------- # - -def _hashtable_algo(f, values, return_dtype=None): - """ - f(HashTable, type_caster) -> result - """ - - dtype = values.dtype - if is_float_dtype(dtype): - return f(htable.Float64HashTable, _ensure_float64) - elif is_signed_integer_dtype(dtype): - return f(htable.Int64HashTable, _ensure_int64) - elif is_unsigned_integer_dtype(dtype): - return f(htable.UInt64HashTable, _ensure_uint64) - elif is_datetime64_dtype(dtype): - return_dtype = return_dtype or 'M8[ns]' - return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) - elif is_timedelta64_dtype(dtype): - return_dtype = return_dtype or 'm8[ns]' - return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) - - # its cheaper to use a String Hash Table than Object - if lib.infer_dtype(values) in ['string']: - return f(htable.StringHashTable, _ensure_object) - - # use Object - return f(htable.PyObjectHashTable, _ensure_object) - - -_hashtables = { - 'float64': (htable.Float64HashTable, htable.Float64Vector), - 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), - 'int64': (htable.Int64HashTable, htable.Int64Vector), - 'string': (htable.StringHashTable, htable.ObjectVector), - 'object': (htable.PyObjectHashTable, htable.ObjectVector) -} - - -def 
_get_data_algo(values, func_map): - - f = None - - if is_categorical_dtype(values): - values = values._values_for_rank() - - if is_float_dtype(values): - f = func_map['float64'] - values = _ensure_float64(values) - - elif needs_i8_conversion(values): - f = func_map['int64'] - values = values.view('i8') - - elif is_signed_integer_dtype(values): - f = func_map['int64'] - values = _ensure_int64(values) - - elif is_unsigned_integer_dtype(values): - f = func_map['uint64'] - values = _ensure_uint64(values) - - else: - values = _ensure_object(values) - - # its cheaper to use a String Hash Table than Object - if lib.infer_dtype(values) in ['string']: - try: - f = func_map['string'] - except KeyError: - pass - - if f is None: - f = func_map['object'] - - return f, values - -# ---- # + def __init__(self, obj, n, keep, columns): + super(SelectNFrame, self).__init__(obj, n, keep) + if not is_list_like(columns): + columns = [columns] + columns = list(columns) + self.columns = columns + + def compute(self, method): + + from pandas import Int64Index + n = self.n + frame = self.obj + columns = self.columns + + for column in columns: + dtype = frame[column].dtype + if not self.is_valid_dtype_n_method(dtype): + raise TypeError(( + "Column {column!r} has dtype {dtype}, cannot use method " + "{method!r} with this dtype" + ).format(column=column, dtype=dtype, method=method)) + + def get_indexer(current_indexer, other_indexer): + """Helper function to concat `current_indexer` and `other_indexer` + depending on `method` + """ + if method == 'nsmallest': + return current_indexer.append(other_indexer) + else: + return other_indexer.append(current_indexer) + + # Below we save and reset the index in case index contains duplicates + original_index = frame.index + cur_frame = frame = frame.reset_index(drop=True) + cur_n = n + indexer = Int64Index([]) + + for i, column in enumerate(columns): + + # For each column we apply method to cur_frame[column]. 
+ # If it is the last column in columns, or if the values
+ # returned are unique in frame[column] we save this index
+ # and break
+ # Otherwise we must save the index of the non duplicated values
+ # and set the next cur_frame to cur_frame filtered on all
+ # duplicated values (#GH15297)
+ series = cur_frame[column]
+ values = getattr(series, method)(cur_n, keep=self.keep)
+ is_last_column = len(columns) - 1 == i
+ if is_last_column or values.nunique() == series.isin(values).sum():
+
+ # Last column in columns or values are unique in
+ # series => values
+ # is all that matters
+ indexer = get_indexer(indexer, values.index)
+ break
+
+ duplicated_filter = series.duplicated(keep=False)
+ duplicated = values[duplicated_filter]
+ non_duplicated = values[~duplicated_filter]
+ indexer = get_indexer(indexer, non_duplicated.index)
+
+ # Must set cur frame to include all duplicated values
+ # to consider for the next column, we can also reduce
+ # cur_n by the current length of the indexer
+ cur_frame = cur_frame[series.isin(duplicated)]
+ cur_n = n - len(indexer)
+
+ frame = frame.take(indexer)
+
+ # Restore the index on frame
+ frame.index = original_index.take(indexer)
+ return frame
+
+
+# ---- #
 # take #
 # ---- #
@@ -1534,23 +1462,41 @@ def func(arr, indexer, out, fill_value=np.nan):
 def diff(arr, n, axis=0):
- """ difference of n between self,
- analagoust to s-s.shift(n) """
+ """
+ difference of n between self,
+ analogous to s - s.shift(n)
+
+ Parameters
+ ----------
+ arr : ndarray
+ n : int
+ number of periods
+ axis : int
+ axis to shift on
+
+ Returns
+ -------
+ shifted
+
+ """
 n = int(n)
 na = np.nan
 dtype = arr.dtype
+ is_timedelta = False
 if needs_i8_conversion(arr):
 dtype = np.float64
 arr = arr.view('i8')
 na = iNaT
 is_timedelta = True
- elif issubclass(dtype.type, np.integer):
- dtype = np.float64
- elif issubclass(dtype.type, np.bool_):
+
+ elif is_bool_dtype(dtype):
 dtype = np.object_
+ elif is_integer_dtype(dtype):
+ dtype = np.float64
+
+ dtype = np.dtype(dtype)
 out_arr = np.empty(arr.shape, dtype=dtype)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3980bf6cdbc09..f6199be2d1fc9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3441,7 +3441,10 @@ def nlargest(self, n, columns, keep='first'):
 1 10 b 2
 2 8 d NaN
 """
- return algorithms.select_n_frame(self, columns, n, 'nlargest', keep)
+ return algorithms.SelectNFrame(self,
+ n=n,
+ keep=keep,
+ columns=columns).nlargest()
 def nsmallest(self, n, columns, keep='first'):
 """Get the rows of a DataFrame sorted by the `n` smallest
@@ -3475,7 +3478,10 @@ def nsmallest(self, n, columns, keep='first'):
 0 1 a 1
 2 8 d NaN
 """
- return algorithms.select_n_frame(self, columns, n, 'nsmallest', keep)
+ return algorithms.SelectNFrame(self,
+ n=n,
+ keep=keep,
+ columns=columns).nsmallest()
 def swaplevel(self, i=-2, j=-1, axis=0):
 """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 1aaa106d2c68f..d6a1a9d98faf4 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1856,8 +1856,7 @@ def nlargest(self, n=5, keep='first'):
 121637 4.240952
 dtype: float64
 """
- return algorithms.select_n_series(self, n=n, keep=keep,
- method='nlargest')
+ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
 def nsmallest(self, n=5, keep='first'):
 """Return the smallest `n` elements.
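Between the two hunks above, it is worth spelling out what the Series fast path boils down to: a linear-time selection of the k-th smallest value, a filter to the candidates at or below it, and a stable sort of just those candidates. A rough pure-NumPy sketch of the same idea — `np.partition` stands in for the Cython `algos.kth_smallest`, and the function name is illustrative, not pandas API:

    import numpy as np

    def toy_nsmallest_indexer(arr, n, keep='first'):
        # mirrors the SelectNSeries.compute() fast path, minus dtype coercion
        if keep == 'last':
            arr = arr[::-1]
        narr = len(arr)
        n = min(n, narr)
        kth_val = np.partition(arr, n - 1)[n - 1]
        ns, = np.nonzero(arr <= kth_val)
        # stable mergesort keeps earlier positions first on ties
        inds = ns[arr[ns].argsort(kind='mergesort')][:n]
        if keep == 'last':
            inds = narr - 1 - inds   # map back to original positions
        return inds

    toy_nsmallest_indexer(np.array([3, 2, 1, 2, 5]), 2)  # -> array([2, 1])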
@@ -1903,8 +1902,7 @@ def nsmallest(self, n=5, keep='first'): 359919 -4.331927 dtype: float64 """ - return algorithms.select_n_series(self, n=n, keep=keep, - method='nsmallest') + return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ac3a42c3cf122..d893183dae0ed 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -620,9 +620,9 @@ def test_dropna(self): # 32-bit linux has a different ordering if not compat.is_platform_32bit(): - tm.assert_series_equal( - pd.Series([10.3, 5., 5., None]).value_counts(dropna=False), - pd.Series([2, 1, 1], index=[5., 10.3, np.nan])) + result = pd.Series([10.3, 5., 5., None]).value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[5., 10.3, np.nan]) + tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): # GH12558 @@ -1356,16 +1356,19 @@ def test_uint64_overflow(self): def test_categorical(self): c = Categorical([1, 2]) - exp = Series([1, 2], dtype=np.int64) - tm.assert_series_equal(algos.mode(c), exp) + exp = c + tm.assert_categorical_equal(algos.mode(c), exp) + tm.assert_categorical_equal(c.mode(), exp) c = Categorical([1, 'a', 'a']) - exp = Series(['a'], dtype=object) - tm.assert_series_equal(algos.mode(c), exp) + exp = Categorical(['a'], categories=[1, 'a']) + tm.assert_categorical_equal(algos.mode(c), exp) + tm.assert_categorical_equal(c.mode(), exp) c = Categorical([1, 1, 2, 3, 3]) - exp = Series([1, 3], dtype=np.int64) - tm.assert_series_equal(algos.mode(c), exp) + exp = Categorical([1, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(algos.mode(c), exp) + tm.assert_categorical_equal(c.mode(), exp) def test_index(self): idx = Index([1, 2, 3]) diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index 8ef2868ae324f..e7b2edeb57714 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -149,6 +149,7 @@ def test_construction_from_string(self): lambda: DatetimeTZDtype.construct_from_string('foo')) def test_is_dtype(self): + self.assertFalse(DatetimeTZDtype.is_dtype(None)) self.assertTrue(DatetimeTZDtype.is_dtype(self.dtype)) self.assertTrue(DatetimeTZDtype.is_dtype('datetime64[ns, US/Eastern]')) self.assertFalse(DatetimeTZDtype.is_dtype('foo')) diff --git a/pandas/types/common.py b/pandas/types/common.py index a1f03e59a5e6e..017805673defe 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -359,6 +359,8 @@ def _coerce_to_dtype(dtype): def _get_dtype(arr_or_dtype): + if arr_or_dtype is None: + raise TypeError if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype elif isinstance(arr_or_dtype, type): diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 43135ba94ab46..c3494df93476b 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -82,6 +82,8 @@ def is_dtype(cls, dtype): return True elif isinstance(dtype, np.dtype): return False + elif dtype is None: + return False try: return cls.construct_from_string(dtype) is not None except: From 2f160f033f37d539e6799da61733ce38f9234119 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 08:03:55 -0400 Subject: [PATCH 12/22] TST: suppress some warnings (#15932) --- pandas/core/algorithms.py | 8 ++++++-- pandas/tests/test_errors.py | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 
244f882f2c103..9b88ea23483bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from __future__ import division -from warnings import warn +from warnings import warn, catch_warnings import numpy as np from pandas import compat, _np_version_under1p8 @@ -110,7 +110,11 @@ def _ensure_data(values, dtype=None): values = _ensure_uint64(values) ndtype = dtype = 'uint64' elif is_complex_dtype(values) or is_complex_dtype(dtype): - values = _ensure_float64(values) + + # ignore the fact that we are casting to float + # which discards complex parts + with catch_warnings(record=True): + values = _ensure_float64(values) ndtype = dtype = 'float64' elif is_float_dtype(values) or is_float_dtype(dtype): values = _ensure_float64(values) diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index aabce7ecb7066..4a0850734e134 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import pytest +from warnings import catch_warnings import pandas # noqa import pandas as pd @@ -44,7 +45,8 @@ def test_error_rename(): except CParserError: pass - try: - raise ParserError() - except pd.parser.CParserError: - pass + with catch_warnings(record=True): + try: + raise ParserError() + except pd.parser.CParserError: + pass From 3b53202d90e86a1bc0f5db7a9c2e66c164f909a9 Mon Sep 17 00:00:00 2001 From: Tong SHEN Date: Fri, 7 Apr 2017 20:08:57 +0800 Subject: [PATCH 13/22] DOC: Fix a typo in advanced.rst (#15933) --- doc/source/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index f380070ddac79..0b81bc6d934e1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -46,7 +46,7 @@ data with an arbitrary number of dimensions in lower dimensional data structures like Series (1d) and DataFrame (2d). In this section, we will show what exactly we mean by "hierarchical" indexing -and how it integrates with the all of the pandas indexing functionality +and how it integrates with all of the pandas indexing functionality described above and in prior sections. 
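As a concrete illustration of that integration (made-up data, standard pandas API only): a two-level index works through the same `.loc` machinery, with either a partial or a full key.

    import numpy as np
    import pandas as pd

    idx = pd.MultiIndex.from_product([['foo', 'qux'], ['one', 'two']])
    df = pd.DataFrame(np.arange(8).reshape(4, 2), index=idx, columns=['A', 'B'])
    df.loc['foo']            # partial key: selects everything under 'foo'
    df.loc[('qux', 'two')]   # full key: selects a single row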
Later, when discussing :ref:`group by ` and :ref:`pivoting and reshaping data `, we'll show non-trivial applications to illustrate how it aids in structuring data for From f478e4f4b0a353fa48ddb19e70cb9abe5b36e1b5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 11:17:38 -0400 Subject: [PATCH 14/22] BUG: DataFrame.sort_index broken if not both lexsorted and monotonic in levels closes #15622 closes #15687 closes #14015 closes #13431 Author: Jeff Reback Closes #15694 from jreback/sort3 and squashes the following commits: bd17d2b [Jeff Reback] rename sort_index_montonic -> _sort_index_monotonic 31097fc [Jeff Reback] add doc-strings, rename sort_monotonic -> sort_levels_monotonic 48249ab [Jeff Reback] add doc example 527c3a6 [Jeff Reback] simpler algo for remove_used_levels 520c9c1 [Jeff Reback] versionadded tags f2ddc9c [Jeff Reback] replace _reconstruct with: sort_monotonic, and remove_unused_levels (public) 3c4ca22 [Jeff Reback] add degenerate test case 269cb3b [Jeff Reback] small doc updates b234bdb [Jeff Reback] support for removing unused levels (internally) 7be8941 [Jeff Reback] incorrectly raising KeyError rather than UnsortedIndexError, caught by doc-example 47c67d6 [Jeff Reback] BUG: construct MultiIndex identically from levels/labels when concatting --- asv_bench/benchmarks/timeseries.py | 5 +- doc/source/advanced.rst | 63 +++++----- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 70 +++++++++++- pandas/core/frame.py | 19 +-- pandas/core/groupby.py | 9 +- pandas/core/reshape.py | 9 +- pandas/core/series.py | 18 ++- pandas/core/sorting.py | 21 ++++ pandas/indexes/multi.py | 144 ++++++++++++++++++++++- pandas/tests/indexes/test_multi.py | 98 ++++++++++++++++ pandas/tests/series/test_analytics.py | 2 +- pandas/tests/test_multilevel.py | 159 ++++++++++++++++++++++++++ pandas/tests/tools/test_hashing.py | 29 +++++ pandas/tests/tools/test_pivot.py | 3 +- 15 files changed, 593 insertions(+), 57 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 6e9ef4b10273c..dfe3f0ef87c11 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -292,7 +292,10 @@ def setup(self): self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') self.ts3 = Series(1, index=self.rng3) - def time_sort_index(self): + def time_sort_index_monotonic(self): + self.ts2.sort_index() + + def time_sort_index_non_monotonic(self): self.ts.sort_index() def time_timeseries_slice_minutely(self): diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 0b81bc6d934e1..43373fc86c4d1 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -136,7 +136,7 @@ can find yourself working with hierarchically-indexed data without creating a may wish to generate your own ``MultiIndex`` when preparing the data set. Note that how the index is displayed by be controlled using the -``multi_sparse`` option in ``pandas.set_printoptions``: +``multi_sparse`` option in ``pandas.set_options()``: .. ipython:: python @@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame: See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. -.. note:: +.. _advanced.shown_levels: + +Defined Levels +~~~~~~~~~~~~~~ + +The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even +if the they are not actually used. When slicing an index, you may notice this. 
+For example: - The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even - if the they are not actually used. When slicing an index, you may notice this. - For example: +.. ipython:: python - .. ipython:: python + # original multi-index + df.columns - # original multi-index - df.columns + # sliced + df[['foo','qux']].columns - # sliced - df[['foo','qux']].columns +This is done to avoid a recomputation of the levels in order to make slicing +highly performant. If you want to see the actual used levels. - This is done to avoid a recomputation of the levels in order to make slicing - highly performant. If you want to see the actual used levels. +.. ipython:: python - .. ipython:: python + df[['foo','qux']].columns.values - df[['foo','qux']].columns.values + # for a specific level + df[['foo','qux']].columns.get_level_values(0) - # for a specific level - df[['foo','qux']].columns.get_level_values(0) +To reconstruct the multiindex with only the used levels - To reconstruct the multiindex with only the used levels +.. versionadded:: 0.20.0 - .. ipython:: python +.. ipython:: python - pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values) + df[['foo','qux']].columns.remove_unused_levels() Data alignment and using ``reindex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -288,7 +293,7 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1','A3'),.....),:] + df.loc[(slice('A1','A3'),.....), :] rather than this: @@ -317,43 +322,43 @@ Basic multi-index slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :] You can use a ``pd.IndexSlice`` to have a more natural syntax using ``:`` rather than using ``slice(None)`` .. ipython:: python idx = pd.IndexSlice - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] It is possible to perform quite complicated selections using this method on multiple axes at the same time. .. ipython:: python - dfmi.loc['A1',(slice(None),'foo')] - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + dfmi.loc['A1', (slice(None), 'foo')] + dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] Using a boolean indexer you can provide selection related to the *values*. .. ipython:: python - mask = dfmi[('a','foo')]>200 - dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + mask = dfmi[('a', 'foo')] > 200 + dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']] You can also specify the ``axis`` argument to ``.loc`` to interpret the passed slicers on a single axis. .. ipython:: python - dfmi.loc(axis=0)[:,:,['C1','C3']] + dfmi.loc(axis=0)[:, :, ['C1', 'C3']] Furthermore you can *set* the values using these methods .. ipython:: python df2 = dfmi.copy() - df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10 df2 You can use a right-hand-side of an alignable object as well. @@ -361,7 +366,7 @@ You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000 df2 .. _advanced.xs: diff --git a/doc/source/api.rst b/doc/source/api.rst index 24bad7d515305..336b0b9b14c6c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1432,6 +1432,7 @@ MultiIndex Components MultiIndex.droplevel MultiIndex.swaplevel MultiIndex.reorder_levels + MultiIndex.remove_unused_levels .. 
_api.datetimeindex: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cb9e2496757ef..21b259e7663ba 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -366,6 +366,8 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) - ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) +- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels `. (:issue:`15694`) +- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -714,6 +716,72 @@ If indicated, a deprecation warning will be issued if you reference that module. "pandas._hash", "pandas.tools.libhash", "" "pandas._window", "pandas.core.libwindow", "" +.. _whatsnew_0200.api_breaking.sort_index: + +DataFrame.sort_index changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort. +This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`) + +This is UNCHANGED between versions, but showing for illustration purposes: + +.. ipython:: python + + df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)])) + df + +.. ipython:: python + + df.index.is_lexsorted() + df.index.is_monotonic + +Sorting works as expected + +.. ipython:: python + + df.sort_index() + +.. ipython:: python + + df.sort_index().index.is_lexsorted() + df.sort_index().index.is_monotonic + +However, this example, which has a non-monotonic 2nd level, +doesn't behave as desired. + +.. ipython:: python + df = pd.DataFrame( + {'value': [1, 2, 3, 4]}, + index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + +Previous Behavior: + +.. ipython:: python + + In [11]: df.sort_index() + Out[11]: + value + a bb 1 + aa 2 + b bb 3 + aa 4 + + In [14]: df.sort_index().index.is_lexsorted() + Out[14]: True + + In [15]: df.sort_index().index.is_monotonic + Out[15]: False + +New Behavior: + +.. ipython:: python + + df.sort_index() + df.sort_index().index.is_lexsorted() + df.sort_index().index.is_monotonic + .. _whatsnew_0200.api_breaking.groupby_describe: @@ -965,7 +1033,7 @@ Performance Improvements - Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). - +- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) .. 
_whatsnew_0200.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f6199be2d1fc9..c8c21b0c5fd7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3322,6 +3322,10 @@ def trans(v): def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None): + + # TODO: this can be combined with Series.sort_index impl as + # almost identical + inplace = validate_bool_kwarg(inplace, 'inplace') # 10726 if by is not None: @@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) labels = self._get_axis(axis) - # sort by the index - if level is not None: + if level: new_axis, indexer = labels.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) @@ -3346,17 +3349,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - if not labels.is_lexsorted(): - labels = MultiIndex.from_tuples(labels.values) - + labels = labels._sort_levels_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) else: from pandas.core.sorting import nargsort - # GH11080 - Check monotonic-ness before sort an index - # if monotonic (already sorted), return None or copy() according - # to 'inplace' + # Check monotonic-ness before sort an index + # GH11080 if ((ascending and labels.is_monotonic_increasing) or (not ascending and labels.is_monotonic_decreasing)): if inplace: @@ -3367,8 +3367,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, indexer = nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) + baxis = self._get_block_manager_axis(axis) new_data = self._data.take(indexer, - axis=self._get_block_manager_axis(axis), + axis=baxis, convert=False, verify=False) if inplace: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index fe764a099bb63..add2987b8f452 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1882,6 +1882,13 @@ def get_group_levels(self): 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] } + def _is_builtin_func(self, arg): + """ + if we define an builtin function for this argument, return it, + otherwise return the arg + """ + return SelectionMixin._builtin_table.get(arg, arg) + def _get_cython_function(self, kind, how, values, is_numeric): dtype_str = values.dtype.name @@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() indexer = get_group_index_sorter(group_index, ngroups) - obj = obj.take(indexer, convert=False) + obj = obj.take(indexer, convert=False).to_dense() group_index = algorithms.take_nd( group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index c7e06d63fbda9..b03c3d77928c7 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -22,8 +22,8 @@ from pandas.sparse.libsparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable -from pandas.core.sorting import (get_group_index, compress_group_index, - decons_obs_group_ids) +from pandas.core.sorting import (get_group_index, get_compressed_ids, + compress_group_index, decons_obs_group_ids) import pandas.core.algorithms as algos from pandas._libs import algos as 
_algos, reshape as _reshape @@ -496,11 +496,6 @@ def _unstack_frame(obj, level, fill_value=None): return unstacker.get_result() -def get_compressed_ids(labels, sizes): - ids = get_group_index(labels, sizes, sort=True, xnull=False) - return compress_group_index(ids, sort=True) - - def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the diff --git a/pandas/core/series.py b/pandas/core/series.py index d6a1a9d98faf4..760abc20351cf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1751,17 +1751,31 @@ def _try_kind_sort(arr): def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True): + # TODO: this can be combined with DataFrame.sort_index impl as + # almost identical inplace = validate_bool_kwarg(inplace, 'inplace') axis = self._get_axis_number(axis) index = self.index - if level is not None: + + if level: new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(index.labels, orders=ascending) + labels = index._sort_levels_monotonic() + indexer = lexsort_indexer(labels.labels, orders=ascending) else: from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if ((ascending and index.is_monotonic_increasing) or + (not ascending and index.is_monotonic_decreasing)): + if inplace: + return + else: + return self.copy() + indexer = nargsort(index, kind=kind, ascending=ascending, na_position=na_position) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 205d0d94d2ec3..e56a4f50de134 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -93,6 +93,27 @@ def maybe_lift(lab, size): # pormote nan values return loop(list(labels), list(shape)) +def get_compressed_ids(labels, sizes): + """ + + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + + Parameters + ---------- + labels : list of label arrays + sizes : list of size of the levels + + Returns + ------- + tuple of (comp_ids, obs_group_ids) + + """ + ids = get_group_index(labels, sizes, sort=True, xnull=False) + return compress_group_index(ids, sort=True) + + def is_int64_overflow_possible(shape): the_prod = long(1) for x in shape: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f12b10ae682fa..96e0effbd7608 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1171,9 +1171,142 @@ def from_product(cls, iterables, sortorder=None, names=None): labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names) + def _sort_levels_monotonic(self): + """ + .. versionadded:: 0.20.0 + + This is an *internal* function. + + create a new MultiIndex from the current to monotonically sorted + items IN the levels. This does not actually make the entire MultiIndex + monotonic, JUST the levels. + + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. 
+
+ Returns
+ -------
+ MultiIndex
+
+ Examples
+ --------
+
+ >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+ >>> i
+ MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+ labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+
+ >>> i._sort_levels_monotonic()
+ MultiIndex(levels=[['a', 'b'], ['aa', 'bb']],
+ labels=[[0, 0, 1, 1], [1, 0, 1, 0]])
+
+ """
+
+ if self.is_lexsorted() and self.is_monotonic:
+ return self
+
+ new_levels = []
+ new_labels = []
+
+ for lev, lab in zip(self.levels, self.labels):
+
+ if lev.is_monotonic:
+ new_levels.append(lev)
+ new_labels.append(lab)
+ continue
+
+ # indexer to reorder the levels
+ indexer = lev.argsort()
+ lev = lev.take(indexer)
+
+ # indexer to reorder the labels
+ ri = lib.get_reverse_indexer(indexer, len(indexer))
+ lab = algos.take_1d(ri, lab)
+
+ new_levels.append(lev)
+ new_labels.append(lab)
+
+ return MultiIndex(new_levels, new_labels,
+ names=self.names, sortorder=self.sortorder,
+ verify_integrity=False)
+
+ def remove_unused_levels(self):
+ """
+ create a new MultiIndex from the current one, removing
+ unused levels, meaning that they are not expressed in the labels
+
+ The resulting MultiIndex will have the same outward
+ appearance, meaning the same .values and ordering. It will also
+ be .equals() to the original.
+
+ .. versionadded:: 0.20.0
+
+ Returns
+ -------
+ MultiIndex
+
+ Examples
+ --------
+ >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
+ MultiIndex(levels=[[0, 1], ['a', 'b']],
+ labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
+
+
+ >>> i[2:]
+ MultiIndex(levels=[[0, 1], ['a', 'b']],
+ labels=[[1, 1], [0, 1]])
+
+ # the 0 from the first level is not represented
+ # and can be removed
+ >>> i[2:].remove_unused_levels()
+ MultiIndex(levels=[[1], ['a', 'b']],
+ labels=[[0, 0], [0, 1]])
+
+ """
+
+ new_levels = []
+ new_labels = []
+
+ changed = np.ones(self.nlevels, dtype=bool)
+ for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
+
+ uniques = algos.unique(lab)
+
+ # nothing unused
+ if len(uniques) == len(lev):
+ new_levels.append(lev)
+ new_labels.append(lab)
+ changed[i] = False
+ continue
+
+ # set difference, then reverse sort
+ diff = Index(np.arange(len(lev))).difference(uniques)
+ unused = diff.sort_values(ascending=False)
+
+ # new levels are simple
+ lev = lev.take(uniques)
+
+ # new labels, we remove the unused
+ # by decrementing the labels for that value
+ # there is probably a better way
+ for u in unused:
+
+ lab = np.where(lab > u, lab - 1, lab)
+
+ new_levels.append(lev)
+ new_labels.append(lab)
+
+ # nothing changed
+ if not changed.any():
+ return self
+
+ return MultiIndex(new_levels, new_labels,
+ names=self.names, sortorder=self.sortorder,
+ verify_integrity=False)
 @property
 def nlevels(self):
@@ -1744,9 +1877,10 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
 def _partial_tup_index(self, tup, side='left'):
 if len(tup) > self.lexsort_depth:
- raise KeyError('Key length (%d) was greater than MultiIndex'
- ' lexsort depth (%d)' %
- (len(tup), self.lexsort_depth))
+ raise UnsortedIndexError(
+ 'Key length (%d) was greater than MultiIndex'
+ ' lexsort depth (%d)' %
+ (len(tup), self.lexsort_depth))
 n = len(tup)
 start, end = 0, len(self)
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 470526043234f..e93319a30d5d8 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -2411,6 +2411,80 @@ def test_is_monotonic(self):
 self.assertFalse(i.is_monotonic)
+ def test_reconstruct_sort(self):
+
+
# starts off lexsorted & monotonic + mi = MultiIndex.from_arrays([ + ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] + ]) + assert mi.is_lexsorted() + assert mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert recons.is_lexsorted() + assert recons.is_monotonic + assert mi is recons + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), + ('x', 'b'), ('y', 'a'), ('z', 'b')], + names=['one', 'two']) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=['col1', 'col2']) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._sort_levels_monotonic() + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + def test_reconstruct_remove_unused(self): + # xref to GH 2770 + df = DataFrame([['deleteMe', 1, 9], + ['keepMe', 2, 9], + ['keepMeToo', 3, 9]], + columns=['first', 'second', 'third']) + df2 = df.set_index(['first', 'second'], drop=False) + df2 = df2[df2['first'] != 'deleteMe'] + + # removed levels are there + expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'], + [1, 2, 3]], + labels=[[1, 2], [1, 2]], + names=['first', 'second']) + result = df2.index + tm.assert_index_equal(result, expected) + + expected = MultiIndex(levels=[['keepMe', 'keepMeToo'], + [2, 3]], + labels=[[0, 1], [0, 1]], + names=['first', 'second']) + result = df2.index.remove_unused_levels() + tm.assert_index_equal(result, expected) + + # idempotent + result2 = result.remove_unused_levels() + tm.assert_index_equal(result2, expected) + assert result2 is result + def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] @@ -2699,6 +2773,30 @@ def test_unsortedindex(self): with assertRaises(KeyError): df.loc(axis=0)['q', :] + def test_unsortedindex_doc_examples(self): + # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + dfm = DataFrame({'jim': [0, 0, 1, 1], + 'joe': ['x', 'x', 'z', 'y'], + 'jolie': np.random.rand(4)}) + + dfm = dfm.set_index(['jim', 'joe']) + with tm.assert_produces_warning(PerformanceWarning): + dfm.loc[(1, 'z')] + + with pytest.raises(UnsortedIndexError): + dfm.loc[(0, 'y'):(1, 'z')] + + assert not dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 1 + + # sort it + dfm = dfm.sort_index() + dfm.loc[(1, 'z')] + dfm.loc[(0, 'y'):(1, 'z')] + + assert dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 2 + def test_tuples_with_name_string(self): # GH 15110 and GH 14848 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 732142f1bce9a..a682e8643d251 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1526,7 +1526,7 @@ def test_unstack(self): labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) expected = DataFrame({'bar': s.values}, index=exp_index).sort_index(level=0) - unstacked = s.unstack(0) + unstacked = s.unstack(0).sort_index() assert_frame_equal(unstacked, expected) # GH5873 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py 
index 5584c1ac6a239..914d26fcafb4a 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -11,6 +11,7 @@

 from pandas.core.index import Index, MultiIndex
 from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp
+from pandas.core.common import UnsortedIndexError

 from pandas.types.common import is_float_dtype, is_integer_dtype
 import pandas.core.common as com
 import pandas.util.testing as tm
@@ -2438,6 +2439,30 @@ def test_getitem_slice_not_sorted(self):
 expected = df.reindex(columns=df.columns[:3])
 tm.assert_frame_equal(result, expected)

+ def test_frame_getitem_not_sorted2(self):
+ # 13431
+ df = DataFrame({'col1': ['b', 'd', 'b', 'a'],
+ 'col2': [3, 1, 1, 2],
+ 'data': ['one', 'two', 'three', 'four']})
+
+ df2 = df.set_index(['col1', 'col2'])
+ df2_original = df2.copy()
+
+ df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
+ df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True)
+ assert not df2.index.is_lexsorted()
+ assert not df2.index.is_monotonic
+
+ assert df2_original.index.equals(df2.index)
+ expected = df2.sort_index()
+ assert not expected.index.is_lexsorted()
+ assert expected.index.is_monotonic
+
+ result = df2.sort_index(level=0)
+ assert not result.index.is_lexsorted()
+ assert result.index.is_monotonic
+ tm.assert_frame_equal(result, expected)
+
 def test_frame_getitem_not_sorted(self):
 df = self.frame.T
 df['foo', 'four'] = 'foo'
@@ -2474,3 +2499,137 @@ def test_series_getitem_not_sorted(self):
 expected.index = expected.index.droplevel(0)
 tm.assert_series_equal(result, expected)
 tm.assert_series_equal(result2, expected)
+
+ def test_sort_index_and_reconstruction(self):
+
+ # 15622
+ # lexsortedness should be identical
+ # across MultiIndex construction methods
+
+ df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
+ expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex.from_tuples([(0.5, 'a'),
+ (0.5, 'b'),
+ (0.8, 'a'),
+ (0.8, 'b')]))
+ assert expected.index.is_lexsorted()
+
+ result = DataFrame(
+ [[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
+ result = result.sort_index()
+ assert result.index.is_lexsorted()
+ assert result.index.is_monotonic
+
+ tm.assert_frame_equal(result, expected)
+
+ result = DataFrame(
+ [[1, 1], [2, 2], [1, 1], [2, 2]],
+ index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
+ labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+ result = result.sort_index()
+ assert result.index.is_lexsorted()
+
+ tm.assert_frame_equal(result, expected)
+
+ concatted = pd.concat([df, df], keys=[0.8, 0.5])
+ result = concatted.sort_index()
+
+ # this will be monotonic, but not lexsorted!
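+ # (the levels keep the original key order, [0.8, 0.5], so the
+ # level-0 labels after sorting are [1, 1, 0, 0]: ordered by
+ # value, but not in label order)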
+ assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # 14015 + df = DataFrame([[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, '20160811 12:00:00'), + (0, '20160809 12:00:00')], + names=['l1', 'Date'])) + + df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), + level=1, + inplace=True) + assert not df.columns.is_lexsorted() + assert not df.columns.is_monotonic + result = df.sort_index(axis=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + result = df.sort_index(axis=1, level=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + + def test_sort_index_and_reconstruction_doc_example(self): + # doc example + df = DataFrame({'value': [1, 2, 3, 4]}, + index=MultiIndex( + levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + assert df.index.is_lexsorted() + assert not df.index.is_monotonic + + # sort it + expected = DataFrame({'value': [2, 1, 4, 3]}, + index=MultiIndex( + levels=[['a', 'b'], ['aa', 'bb']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + result = df.sort_index() + assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # reconstruct + result = df.sort_index().copy() + result.index = result.index._sort_levels_monotonic() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = pd.DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['red', 'blu']], + names=['letter', 'size', 'color']), + columns=['near', 'far']) + df = df.sort_index() + + def my_func(group): + group.index = ['newz', 'newa'] + return group + + result = df.groupby(level=['letter', 'size']).apply( + my_func).sort_index() + expected = MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['newa', 'newz']], + names=['letter', 'size', None]) + + tm.assert_index_equal(result.index, expected) + + def test_sort_non_lexsorted(self): + # degenerate case where we sort but don't + # have a satisfying result :< + + idx = MultiIndex([['A', 'B', 'C'], + ['c', 'b', 'a']], + [[0, 1, 2, 0, 1, 2], + [0, 2, 1, 1, 0, 2]]) + + df = DataFrame({'col': range(len(idx))}, index=idx) + assert df.index.is_lexsorted() is False + assert df.index.is_monotonic is False + + result = df.sort_index() + assert result.index.is_lexsorted() is False + assert result.index.is_monotonic is True + + with pytest.raises(UnsortedIndexError): + result.loc[pd.IndexSlice['B':'C', 'a':'c'], :] diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 9bed0d428bc41..864b5018abc75 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -87,6 +87,35 @@ def test_multiindex_unique(self): result = hash_pandas_object(mi) self.assertTrue(result.is_unique) + def test_multiindex_objects(self): + mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=['col1', 'col2']) + recons = mi._sort_levels_monotonic() + + # these are equal + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # _hashed_values and hash_pandas_object(..., index=False) + # equivalency + expected = hash_pandas_object( + mi, index=False).values + result = mi._hashed_values + tm.assert_numpy_array_equal(result, expected) + + expected = hash_pandas_object( + 
recons, index=False).values
+ result = recons._hashed_values
+ tm.assert_numpy_array_equal(result, expected)
+
+ expected = mi._hashed_values
+ result = recons._hashed_values
+
+ # values should match, but in different order
+ tm.assert_numpy_array_equal(np.sort(result),
+ np.sort(expected))
+
 def test_hash_pandas_object(self):

 for obj in [Series([1, 2, 3]),
diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/tools/test_pivot.py
index 4502f232c6d9c..c8dfaf5e29bc6 100644
--- a/pandas/tests/tools/test_pivot.py
+++ b/pandas/tests/tools/test_pivot.py
@@ -2,6 +2,7 @@

 import numpy as np

+from collections import OrderedDict
 import pandas as pd
 from pandas import (DataFrame, Series, Index, MultiIndex,
 Grouper, date_range, concat)
@@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self):

 self.assertTrue(pivoted.columns.is_monotonic)

 def test_pivot_complex_aggfunc(self):
- f = {'D': ['std'], 'E': ['sum']}
+ f = OrderedDict([('D', ['std']), ('E', ['sum'])])
 expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
 result = self.data.pivot_table(index='A', columns='B', aggfunc=f)

From d9e00d2ac0803c561c9f7af1131f586aecfb8113 Mon Sep 17 00:00:00 2001
From: Jeff Reback 
Date: Fri, 7 Apr 2017 11:21:04 -0400
Subject: [PATCH 15/22] DEPR: deprecate pd.get_store as not api consistent and
 cluttering (#15940)

---
 doc/source/whatsnew/v0.20.0.txt |  3 ++-
 pandas/io/pytables.py | 7 +++++++
 pandas/tests/api/test_api.py | 16 ++++++++++++++--
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 21b259e7663ba..31b0efa14a44d 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -981,13 +981,14 @@ Deprecations
- importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
- ``Series/DataFrame/Panel.consolidate()`` has been deprecated as a public method. (:issue:`15483`)
- The ``as_indexer`` keyword of ``Series.str.match()`` has been deprecated (ignored keyword) (:issue:`15257`).
-- The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`)
+- The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`, :issue:`15940`)

 * ``pd.pnow()``, replaced by ``Period.now()``
 * ``pd.Term``, is removed, as it is not applicable to user code. Instead use in-line string expressions in the where clause when searching in HDFStore
 * ``pd.Expr``, is removed, as it is not applicable to user code.
 * ``pd.match()``, is removed.
 * ``pd.groupby()``, replaced by using the ``.groupby()`` method directly on a ``Series/DataFrame``
+ * ``pd.get_store()``, replaced by a direct call to ``pd.HDFStore(...)``

 .. 
_whatsnew_0200.prior_deprecations:

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 9b525b76b0f17..802f460ecba07 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -1323,6 +1323,13 @@ def _read_group(self, group, **kwargs):

 def get_store(path, **kwargs):
 """ Backwards compatible alias for ``HDFStore`` """
+ warnings.warn(
+ "get_store is deprecated and will be "
+ "removed in a future version\n"
+ "HDFStore(path, **kwargs) is the replacement",
+ FutureWarning,
+ stacklevel=6)
+
 return HDFStore(path, **kwargs)

diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 7d1308d67668e..7301c87026114 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -2,6 +2,7 @@

 from warnings import catch_warnings
+import pytest

 import pandas as pd
 from pandas import api
 from pandas.util import testing as tm
@@ -63,7 +64,7 @@ class TestPDApi(Base, tm.TestCase):

 # top-level functions
 funcs = ['bdate_range', 'concat', 'crosstab', 'cut',
 'date_range', 'eval',
- 'factorize', 'get_dummies', 'get_store',
+ 'factorize', 'get_dummies',
 'infer_freq', 'isnull', 'lreshape', 'melt', 'notnull',
 'offsets', 'merge', 'merge_ordered', 'merge_asof',
@@ -102,7 +103,7 @@ class TestPDApi(Base, tm.TestCase):
 'rolling_median', 'rolling_min', 'rolling_quantile',
 'rolling_skew', 'rolling_std', 'rolling_sum',
 'rolling_var', 'rolling_window', 'ordered_merge',
- 'pnow', 'match', 'groupby']
+ 'pnow', 'match', 'groupby', 'get_store']

 def test_api(self):

@@ -140,6 +141,7 @@ def test_deprecation_access_obj(self):

 class TestTopLevelDeprecations(tm.TestCase):
+
 # top-level API deprecations
 # GH 13790

@@ -168,6 +170,16 @@ def test_groupby(self):
 check_stacklevel=False):
 pd.groupby(pd.Series([1, 2, 3]), [1, 1, 1])

+ # GH 15940
+
+ def test_get_store(self):
+ pytest.importorskip('tables')
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(FutureWarning,
+ check_stacklevel=False):
+ s = pd.get_store(path)
+ s.close()
+

 class TestJson(tm.TestCase):

From 88bed545067fe2b466eb3f2d04a1802b493c4909 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche 
Date: Fri, 7 Apr 2017 20:58:20 +0200
Subject: [PATCH 16/22] BLD: update merge script to update on github (#15917)

* BLD: update merge script to update on github

* adapt question
---
 scripts/{merge-py.py => merge-pr.py} | 55 +++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 2 deletions(-)
 rename scripts/{merge-py.py => merge-pr.py} (83%)

diff --git a/scripts/merge-py.py b/scripts/merge-pr.py
similarity index 83%
rename from scripts/merge-py.py
rename to scripts/merge-pr.py
index b9350f8feceb8..1fc4eef3d0583 100755
--- a/scripts/merge-py.py
+++ b/scripts/merge-pr.py
@@ -99,6 +99,14 @@ def continue_maybe(prompt):
 fail("Okay, exiting")

+def continue_maybe2(prompt):
+ result = input("\n%s (y/n): " % prompt)
+ if result.lower() != "y":
+ return False
+ else:
+ return True
+
+
 original_head = run_cmd("git rev-parse HEAD")[:8]

@@ -193,6 +201,40 @@ def merge_pr(pr_num, target_ref):

 return merge_hash

+def update_pr(pr_num, user_login, base_ref):
+
+ pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num)
+
+ run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num,
+ pr_branch_name))
+ run_cmd("git checkout %s" % pr_branch_name)
+
+ continue_maybe("Update ready (local ref %s)? Push to %s/%s?"
% ( + pr_branch_name, user_login, base_ref)) + + push_user_remote = "https://github.com/%s/pandas.git" % user_login + + try: + run_cmd('git push %s %s:%s' % (push_user_remote, pr_branch_name, + base_ref)) + except Exception as e: + + if continue_maybe2("Force push?"): + try: + run_cmd( + 'git push -f %s %s:%s' % (push_user_remote, pr_branch_name, + base_ref)) + except Exception as e: + fail("Exception while pushing: %s" % e) + clean_up() + else: + fail("Exception while pushing: %s" % e) + clean_up() + + clean_up() + print("Pull request #%s updated!" % pr_num) + + def cherry_pick(pr_num, merge_hash, default_branch): pick_ref = input("Enter a branch name [%s]: " % default_branch) if pick_ref == "": @@ -257,8 +299,17 @@ def fix_version_from_branch(branch, versions): print("\n=== Pull Request #%s ===" % pr_num) print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % (title, pr_repo_desc, target_ref, url)) -continue_maybe("Proceed with merging pull request #%s?" % pr_num) + + merged_refs = [target_ref] -merge_hash = merge_pr(pr_num, target_ref) +print("\nProceed with updating or merging pull request #%s?" % pr_num) +update = input("Update PR and push to remote (r), merge locally (l), " + "or do nothing (n) ?") +update = update.lower() + +if update == 'r': + merge_hash = update_pr(pr_num, user_login, base_ref) +elif update == 'l': + merge_hash = merge_pr(pr_num, target_ref) From c4dca36801a58d7ae5cdc395404d4e35f03d110d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 7 Apr 2017 21:04:31 +0200 Subject: [PATCH 17/22] DEPR: correct locations to access public json/parser objects in depr message (#15909) * DEPR: correct locations to access public json objects in depr message * Additional corrections --- pandas/__init__.py | 10 +++++++--- pandas/tslib.py | 3 +-- pandas/util/depr_module.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 83ad85e3e292b..529750cd97076 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -60,11 +60,15 @@ # extension module deprecations from pandas.util.depr_module import _DeprecatedModule -json = _DeprecatedModule(deprmod='pandas.json', deprmodto='pandas.io.json.libjson') -parser = _DeprecatedModule(deprmod='pandas.parser', deprmodto='pandas.io.libparsers') +json = _DeprecatedModule(deprmod='pandas.json', + moved={'dumps': 'pandas.io.json.dumps', + 'loads': 'pandas.io.json.loads'}) +parser = _DeprecatedModule(deprmod='pandas.parser', + removals=['na_values'], + moved={'CParserError': 'pandas.errors.ParserError'}) lib = _DeprecatedModule(deprmod='pandas.lib', deprmodto='pandas._libs.lib', moved={'infer_dtype': 'pandas.api.lib.infer_dtype'}) -tslib = _DeprecatedModule(deprmod='pandas.tslib', deprmodto='pandas._libs.tslib', +tslib = _DeprecatedModule(deprmod='pandas.tslib', moved={'Timestamp': 'pandas.Timestamp', 'Timedelta': 'pandas.Timedelta', 'NaT': 'pandas.NaT', diff --git a/pandas/tslib.py b/pandas/tslib.py index 3d96dc496c0de..f7d99538c2ea2 100644 --- a/pandas/tslib.py +++ b/pandas/tslib.py @@ -2,7 +2,6 @@ import warnings warnings.warn("The pandas.tslib module is deprecated and will be " - "removed in a future version. 
Please import from " - "the pandas or pandas.errors instead", FutureWarning, stacklevel=2) + "removed in a future version.", FutureWarning, stacklevel=2) from pandas._libs.tslib import (Timestamp, Timedelta, NaT, OutOfBoundsDatetime) diff --git a/pandas/util/depr_module.py b/pandas/util/depr_module.py index 0885c81ce2757..1f428198c19f3 100644 --- a/pandas/util/depr_module.py +++ b/pandas/util/depr_module.py @@ -68,7 +68,7 @@ def __getattr__(self, name): elif self.moved is not None and name in self.moved: warnings.warn( "{deprmod} is deprecated and will be removed in " - "a future version.\nYou can access {name} in {moved}".format( + "a future version.\nYou can access {name} as {moved}".format( deprmod=self.deprmod, name=name, moved=self.moved[name]), From c25fbde09272f369f280212e5216441d5975687c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 15:09:09 -0400 Subject: [PATCH 18/22] DEPR: Panel deprecated closes #13563 on top of #15677 Author: Jeff Reback Closes #15601 from jreback/panel and squashes the following commits: 04104a7 [Jeff Reback] fine grained catching warnings in tests f8800dc [Jeff Reback] add numpy reference for searchsorted fa136dd [Jeff Reback] doc correction c39453a [Jeff Reback] add perf optimization in searchsorted for FrozenNDArray 0e9c4a4 [Jeff Reback] fix docs as per review & column name changes 3df0abe [Jeff Reback] remove Panel from doc-strings, catch internal warning on Panel construction 755606d [Jeff Reback] more docs d04db2e [Jeff Reback] add deprecate_panel section to docs 538b8e8 [Jeff Reback] pep fix 912d523 [Jeff Reback] TST: separate out test_append_to_multiple_dropna to two tests; when drop=False this is sometimes failing a2625ba [Jeff Reback] remove most Term references in test_pytables.py cd5b6b8 [Jeff Reback] DEPR: Panel deprecated 6b20ddc [Jeff Reback] fix names on return structure f41d3df [Jeff Reback] API: df.rolling(..).corr()/cov() when pairwise=True to return MI DataFrame 84e788b [Jeff Reback] BUG/PERF: handle a slice correctly in get_level_indexer --- doc/source/computation.rst | 20 +- doc/source/dsintro.rst | 55 + doc/source/whatsnew/v0.20.0.txt | 75 + pandas/core/panel.py | 16 +- pandas/core/window.py | 68 +- pandas/indexes/frozen.py | 24 + pandas/indexes/multi.py | 20 +- pandas/tests/io/test_pytables.py | 1462 ++++++------ pandas/tests/test_expressions.py | 32 +- pandas/tests/test_generic.py | 171 +- pandas/tests/test_panel.py | 3422 +++++++++++++++------------- pandas/tests/test_window.py | 441 ++-- pandas/tests/tools/test_concat.py | 91 +- pandas/tests/types/test_missing.py | 14 +- pandas/util/testing.py | 6 +- 15 files changed, 3193 insertions(+), 2724 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 57480a244f308..2423f1a342994 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -505,13 +505,18 @@ two ``Series`` or any combination of ``DataFrame/Series`` or - ``DataFrame/DataFrame``: by default compute the statistic for matching column names, returning a DataFrame. If the keyword argument ``pairwise=True`` is passed then computes the statistic for each pair of columns, returning a - ``Panel`` whose ``items`` are the dates in question (see :ref:`the next section + ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section `). For example: .. 
ipython:: python + df = pd.DataFrame(np.random.randn(1000, 4), + index=pd.date_range('1/1/2000', periods=1000), + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + df2 = df[:20] df2.rolling(window=5).corr(df2['B']) @@ -520,11 +525,16 @@ For example: Computing rolling pairwise covariances and correlations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + Prior to version 0.20.0 if ``pairwise=True`` was passed, a ``Panel`` would be returned. + This will now return a 2-level MultiIndexed DataFrame, see the whatsnew :ref:`here ` + In financial data analysis and other fields it's common to compute covariance and correlation matrices for a collection of time series. Often one is also interested in moving-window covariance and correlation matrices. This can be done by passing the ``pairwise`` keyword argument, which in the case of -``DataFrame`` inputs will yield a ``Panel`` whose ``items`` are the dates in +``DataFrame`` inputs will yield a MultiIndexed ``DataFrame`` whose ``index`` are the dates in question. In the case of a single DataFrame argument the ``pairwise`` argument can even be omitted: @@ -539,12 +549,12 @@ can even be omitted: .. ipython:: python covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) - covs[df.index[-50]] + covs.loc['2002-09-22':] .. ipython:: python correls = df.rolling(window=50).corr() - correls[df.index[-50]] + correls.loc['2002-09-22':] You can efficiently retrieve the time series of correlations between two columns using ``.loc`` indexing: @@ -557,7 +567,7 @@ columns using ``.loc`` indexing: .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.loc[:, 'A', 'C'].plot() + correls.loc[:, ('A', 'C')].plot() .. _stats.aggregate: diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 4fcb63c18757a..2b11b23b1d1c2 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -763,6 +763,11 @@ completion mechanism so they can be tab-completed: Panel ----- +.. warning:: + + In 0.20.0, ``Panel`` is deprecated and will be removed in + a future version. See the section :ref:`Deprecate Panel `. + Panel is a somewhat less-used, but still important container for 3-dimensional data. The term `panel data `__ is derived from econometrics and is partially responsible for the name pandas: @@ -783,6 +788,7 @@ From 3D ndarray with optional axis labels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. ipython:: python + :okwarning: wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=pd.date_range('1/1/2000', periods=5), @@ -794,6 +800,7 @@ From dict of DataFrame objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. ipython:: python + :okwarning: data = {'Item1' : pd.DataFrame(np.random.randn(4, 3)), 'Item2' : pd.DataFrame(np.random.randn(4, 2))} @@ -816,6 +823,7 @@ dictionary of DataFrames as above, and the following named parameters: For example, compare to the construction above: .. ipython:: python + :okwarning: pd.Panel.from_dict(data, orient='minor') @@ -824,6 +832,7 @@ DataFrame objects with mixed-type columns, all of the data will get upcasted to ``dtype=object`` unless you pass ``orient='minor'``: .. ipython:: python + :okwarning: df = pd.DataFrame({'a': ['foo', 'bar', 'baz'], 'b': np.random.randn(3)}) @@ -851,6 +860,7 @@ This method was introduced in v0.7 to replace ``LongPanel.to_long``, and convert a DataFrame with a two-level index to a Panel. .. 
ipython:: python
+ :okwarning:

 midx = pd.MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]])
 df = pd.DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx)
@@ -880,6 +890,7 @@ A Panel can be rearranged using its ``transpose`` method (which does not make a
 copy by default unless the data are heterogeneous):

 .. ipython:: python
+ :okwarning:

 wp.transpose(2, 0, 1)

@@ -909,6 +920,7 @@ Squeezing

 Another way to change the dimensionality of an object is to ``squeeze`` a 1-len object, similar to ``wp['Item1']``

 .. ipython:: python
+ :okwarning:

 wp.reindex(items=['Item1']).squeeze()
 wp.reindex(items=['Item1'], minor=['B']).squeeze()

@@ -923,12 +935,55 @@ for more on this. To convert a Panel to a DataFrame, use the ``to_frame`` method:

 .. ipython:: python
+ :okwarning:

 panel = pd.Panel(np.random.randn(3, 5, 4), items=['one', 'two', 'three'],
 major_axis=pd.date_range('1/1/2000', periods=5),
 minor_axis=['a', 'b', 'c', 'd'])
 panel.to_frame()
+
+.. _dsintro.deprecate_panel:
+
+Deprecate Panel
+---------------
+
+Over the last few years, pandas has increased in both breadth and depth, with new features,
+datatype support, and manipulation routines. As a result, supporting efficient indexing and functional
+routines for ``Series``, ``DataFrame`` and ``Panel`` has contributed to an increasingly fragmented and
+difficult-to-understand codebase.
+
+The 3-D structure of a ``Panel`` is much less common for many types of data analysis
+than the 1-D of a ``Series`` or the 2-D of a ``DataFrame``. Going forward it makes sense for
+pandas to focus on these areas exclusively.
+
+Oftentimes, one can simply use a MultiIndex ``DataFrame`` for easily working with higher dimensional data.
+
+In addition, the ``xarray`` package was built from the ground up, specifically in order to
+support the multi-dimensional analysis that is one of ``Panel``'s main use cases.
+`Here is a link to the xarray panel-transition documentation `__.
+
+.. ipython:: python
+ :okwarning:
+
+ p = tm.makePanel()
+ p
+
+Convert to a MultiIndex DataFrame
+
+.. ipython:: python
+ :okwarning:
+
+ p.to_frame()
+
+Alternatively, one can convert to an xarray ``DataArray``.
+
+.. ipython:: python
+
+ p.to_xarray()
+
+You can see the full documentation for the `xarray package `__.
+
 .. _dsintro.panelnd:
 .. _dsintro.panel4d:

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 31b0efa14a44d..132f20cb73142 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -10,13 +10,16 @@ users upgrade to this version.

 Highlights include:

 - The ``.ix`` indexer has been deprecated, see :ref:`here `
+- ``Panel`` has been deprecated, see :ref:`here `
 - Improved user API when accessing levels in ``.groupby()``, see :ref:`here `
 - Improved support for UInt64 dtypes, see :ref:`here `
 - A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here `
+- Window Binary Corr/Cov operations return a MultiIndexed ``DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here `
 - Support for S3 handling now uses ``s3fs``, see :ref:`here `
 - Google BigQuery support now uses the ``pandas-gbq`` library, see :ref:`here `
 - Switched the test framework to use `pytest `__ (:issue:`13097`)
+
 Check the :ref:`API Changes ` and :ref:`deprecations ` before updating.

 .. contents:: What's new in v0.20.0
@@ -425,6 +428,33 @@ Using ``.iloc``. 
Here we will get the location of the 'A' column, then use *posi

 df.iloc[[0, 2], df.columns.get_loc('A')]

+.. _whatsnew_0200.api_breaking.deprecate_panel:
+
+Deprecate Panel
+^^^^^^^^^^^^^^^
+
+``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data is
+with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` method, or with the `xarray package `__. Pandas
+provides a :meth:`~Panel.to_xarray` method to automate this conversion. See the documentation :ref:`Deprecate Panel ` (:issue:`13563`).
+
+.. ipython:: python
+ :okwarning:
+
+ p = tm.makePanel()
+ p
+
+Convert to a MultiIndex DataFrame
+
+.. ipython:: python
+
+ p.to_frame()
+
+Convert to an xarray DataArray
+
+.. ipython:: python
+
+ p.to_xarray()
+
 .. _whatsnew.api_breaking.io_compat:

 Possible incompat for HDF5 formats for pandas < 0.13.0
@@ -836,6 +866,51 @@ New Behavior:

 df.groupby('A').agg([np.mean, np.std, np.min, np.max])

+.. _whatsnew_0200.api_breaking.rolling_pairwise:
+
+Window Binary Corr/Cov operations return a MultiIndex DataFrame
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object,
+will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated,
+see :ref:`here <whatsnew_0200.api_breaking.deprecate_panel>`. These are equivalent in function,
+but MultiIndexed ``DataFrame`` objects enjoy more support in pandas.
+See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`)
+
+.. ipython:: python
+
+ np.random.seed(1234)
+ df = pd.DataFrame(np.random.rand(100, 2),
+ columns=pd.Index(['A', 'B'], name='bar'),
+ index=pd.date_range('20160101',
+ periods=100, freq='D', name='foo'))
+ df.tail()
+
+Old Behavior:
+
+.. code-block:: ipython
+
+ In [2]: df.rolling(12).corr()
+ Out[2]:
+ <class 'pandas.core.panel.Panel'>
+ Dimensions: 100 (items) x 2 (major_axis) x 2 (minor_axis)
+ Items axis: 2016-01-01 00:00:00 to 2016-04-09 00:00:00
+ Major_axis axis: A to B
+ Minor_axis axis: A to B
+
+New Behavior:
+
+.. ipython:: python
+
+ res = df.rolling(12).corr()
+ res.tail()
+
+Retrieving a correlation matrix for a cross-section
+
+.. ipython:: python
+
+ df.rolling(12).corr().loc['2016-04-07']
+
 .. 
_whatsnew_0200.api_breaking.hdfstore_where:

 HDFStore where string comparison
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
index 9e95023ccb359..24f4d219fb9ca 100644
--- a/pandas/core/panel.py
+++ b/pandas/core/panel.py
@@ -4,10 +4,8 @@
 # pylint: disable=E1103,W0231,W0212,W0621
 from __future__ import division

-import warnings
-
 import numpy as np
-
+import warnings
 from pandas.types.cast import (infer_dtype_from_scalar,
 maybe_cast_item)
 from pandas.types.common import (is_integer, is_list_like,
@@ -132,6 +130,18 @@ def _constructor(self):

 def __init__(self, data=None, items=None, major_axis=None, minor_axis=None,
 copy=False, dtype=None):
+ # deprecation GH13563
+ warnings.warn("\nPanel is deprecated and will be removed in a "
+ "future version.\nThe recommended way to represent "
+ "these types of 3-dimensional data is with a "
+ "MultiIndex on a DataFrame, via the "
+ "Panel.to_frame() method\n"
+ "Alternatively, you can use the xarray package "
+ "http://xarray.pydata.org/en/stable/.\n"
+ "Pandas provides a `.to_xarray()` method to help "
+ "automate this conversion.\n",
+ DeprecationWarning, stacklevel=3)
+
 self._init_data(data=data, items=items, major_axis=major_axis,
 minor_axis=minor_axis, copy=copy, dtype=dtype)

diff --git a/pandas/core/window.py b/pandas/core/window.py
index 9c9f861451309..89d2f5b24d77e 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -927,8 +927,9 @@ def f(arg, *args, **kwargs):
 If False then only matching columns between self and other will be used and
 the output will be a DataFrame.
 If True then all pairwise combinations will be calculated and the
- output will be a Panel in the case of DataFrame inputs. In the case of
- missing elements, only complete pairwise observations will be used.
+ output will be a MultiIndexed DataFrame in the case of DataFrame
+ inputs. In the case of missing elements, only complete pairwise
+ observations will be used.
 ddof : int, default 1
 Delta Degrees of Freedom. The divisor used in calculations
 is ``N - ddof``, where ``N`` represents the number of elements.""")
@@ -964,11 +965,12 @@ def _get_cov(X, Y):
 other : Series, DataFrame, or ndarray, optional
 if not supplied then will default to self and produce pairwise output
 pairwise : bool, default None
- If False then only matching columns between self and other will be used
- and the output will be a DataFrame.
+ If False then only matching columns between self and other will be
+ used and the output will be a DataFrame.
 If True then all pairwise combinations will be calculated and the
- output will be a Panel in the case of DataFrame inputs. In the case of
- missing elements, only complete pairwise observations will be used.""")
+ output will be a MultiIndex DataFrame in the case of DataFrame inputs.
+ In the case of missing elements, only complete pairwise observations
+ will be used.""")

 def corr(self, other=None, pairwise=None, **kwargs):
 if other is None:
@@ -1397,8 +1399,9 @@ def _constructor(self):
 If False then only matching columns between self and other will be used and
 the output will be a DataFrame.
 If True then all pairwise combinations will be calculated and the output
- will be a Panel in the case of DataFrame inputs. In the case of missing
- elements, only complete pairwise observations will be used.
+ will be a MultiIndex DataFrame in the case of DataFrame inputs.
+ In the case of missing elements, only complete pairwise observations will
+ be used. 
bias : boolean, default False Use a standard estimation bias correction """ @@ -1652,7 +1655,8 @@ def _cov(x, y): def _flex_binary_moment(arg1, arg2, f, pairwise=False): - from pandas import Series, DataFrame, Panel + from pandas import Series, DataFrame + if not (isinstance(arg1, (np.ndarray, Series, DataFrame)) and isinstance(arg2, (np.ndarray, Series, DataFrame))): raise TypeError("arguments to moment function must be of type " @@ -1684,10 +1688,13 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") - X, Y = arg1.align(arg2, join='outer') + with warnings.catch_warnings(record=True): + X, Y = arg1.align(arg2, join='outer') X = X + 0 * Y Y = Y + 0 * X - res_columns = arg1.columns.union(arg2.columns) + + with warnings.catch_warnings(record=True): + res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) @@ -1703,12 +1710,39 @@ def dataframe_from_int_dict(data, frame_template): else: results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) - p = Panel.from_dict(results).swapaxes('items', 'major') - if len(p.major_axis) > 0: - p.major_axis = arg1.columns[p.major_axis] - if len(p.minor_axis) > 0: - p.minor_axis = arg2.columns[p.minor_axis] - return p + + # TODO: not the most efficient (perf-wise) + # though not bad code-wise + from pandas import Panel, MultiIndex, Index + with warnings.catch_warnings(record=True): + p = Panel.from_dict(results).swapaxes('items', 'major') + if len(p.major_axis) > 0: + p.major_axis = arg1.columns[p.major_axis] + if len(p.minor_axis) > 0: + p.minor_axis = arg2.columns[p.minor_axis] + + if len(p.items): + result = pd.concat( + [p.iloc[i].T for i in range(len(p.items))], + keys=p.items) + else: + + result = DataFrame( + index=MultiIndex(levels=[arg1.index, arg1.columns], + labels=[[], []]), + columns=arg2.columns, + dtype='float64') + + # reset our index names to arg1 names + # reset our column names to arg2 names + # careful not to mutate the original names + result.columns = Index(result.columns).set_names( + arg2.columns.name) + result.index = result.index.set_names( + [arg1.index.name, arg1.columns.name]) + + return result + else: raise ValueError("'pairwise' is not True/False") else: diff --git a/pandas/indexes/frozen.py b/pandas/indexes/frozen.py index 97a1a3ea99e65..ab1228c008ca8 100644 --- a/pandas/indexes/frozen.py +++ b/pandas/indexes/frozen.py @@ -117,6 +117,30 @@ def __unicode__(self): quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + def searchsorted(self, v, side='left', sorter=None): + """ + Find indices where elements of v should be inserted + in a to maintain order. 
+ + For full documentation, see `numpy.searchsorted` + + See Also + -------- + numpy.searchsorted : equivalent function + """ + + # we are much more performant if the searched + # indexer is the same type as the array + # this doesn't matter for int64, but DOES + # matter for smaller int dtypes + # https://github.com/numpy/numpy/issues/5370 + try: + v = self.dtype.type(v) + except: + pass + return super(FrozenNDArray, self).searchsorted( + v, side=side, sorter=sorter) + def _ensure_frozen(array_like, categories, copy=False): array_like = coerce_indexer_dtype(array_like, categories) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 96e0effbd7608..77774f3284fef 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2203,20 +2203,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: loc = level_index.get_loc(key) - if level > 0 or self.lexsort_depth == 0: + if isinstance(loc, slice): + return loc + elif level > 0 or self.lexsort_depth == 0: return np.array(labels == loc, dtype=bool) - else: - # sorted, so can return slice object -> view - try: - loc = labels.dtype.type(loc) - except TypeError: - # this occurs when loc is a slice (partial string indexing) - # but the TypeError raised by searchsorted in this case - # is catched in Index._has_valid_type() - pass - i = labels.searchsorted(loc, side='left') - j = labels.searchsorted(loc, side='right') - return slice(i, j) + + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + return slice(i, j) def get_locs(self, tup): """ diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 82a98f5d08488..9908a320a6646 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1,9 +1,9 @@ import pytest import sys import os -from warnings import catch_warnings import tempfile from contextlib import contextmanager +from warnings import catch_warnings import datetime from datetime import timedelta @@ -11,7 +11,7 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, +from pandas import (Series, DataFrame, Panel, Panel4D, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, isnull) @@ -22,8 +22,6 @@ tables = pytest.importorskip('tables') from pandas.io.pytables import TableIterator from pandas.io.pytables import (HDFStore, get_store, Term, read_hdf, - IncompatibilityWarning, PerformanceWarning, - AttributeConflictWarning, PossibleDataLossError, ClosedFileError) from pandas.io import pytables as pytables @@ -205,8 +203,10 @@ def roundtrip(key, obj, **kwargs): o = tm.makeDataFrame() assert_frame_equal(o, roundtrip('frame', o)) - o = tm.makePanel() - assert_panel_equal(o, roundtrip('panel', o)) + with catch_warnings(record=True): + + o = tm.makePanel() + assert_panel_equal(o, roundtrip('panel', o)) # table df = DataFrame(dict(A=lrange(5), B=lrange(5))) @@ -368,8 +368,9 @@ def test_keys(self): store['a'] = tm.makeTimeSeries() store['b'] = tm.makeStringSeries() store['c'] = tm.makeDataFrame() - store['d'] = tm.makePanel() - store['foo/bar'] = tm.makePanel() + with catch_warnings(record=True): + store['d'] = tm.makePanel() + store['foo/bar'] = tm.makePanel() self.assertEqual(len(store), 5) expected = set(['/a', '/b', '/c', '/d', '/foo/bar']) self.assertTrue(set(store.keys()) == expected) @@ -388,9 +389,11 @@ def test_repr(self): store['a'] = tm.makeTimeSeries() store['b'] = tm.makeStringSeries() 
store['c'] = tm.makeDataFrame()
- store['d'] = tm.makePanel()
- store['foo/bar'] = tm.makePanel()
- store.append('e', tm.makePanel())
+
+ with catch_warnings(record=True):
+ store['d'] = tm.makePanel()
+ store['foo/bar'] = tm.makePanel()
+ store.append('e', tm.makePanel())

 df = tm.makeDataFrame()
 df['obj1'] = 'foo'
@@ -755,6 +758,7 @@ def test_put_mixed_type(self):
 with ensure_clean_store(self.path) as store:
 _maybe_remove(store, 'df')

+ # PerformanceWarning
 with catch_warnings(record=True):
 store.put('df', df)

@@ -764,39 +768,42 @@ def test_append(self):

 with ensure_clean_store(self.path) as store:
- df = tm.makeTimeDataFrame()
- _maybe_remove(store, 'df1')
- store.append('df1', df[:10])
- store.append('df1', df[10:])
- tm.assert_frame_equal(store['df1'], df)
-
- _maybe_remove(store, 'df2')
- store.put('df2', df[:10], format='table')
- store.append('df2', df[10:])
- tm.assert_frame_equal(store['df2'], df)
-
- _maybe_remove(store, 'df3')
- store.append('/df3', df[:10])
- store.append('/df3', df[10:])
- tm.assert_frame_equal(store['df3'], df)

 # this is allowed but you almost always don't want to do it
 # tables.NaturalNameWarning):
 with catch_warnings(record=True):
+
+ df = tm.makeTimeDataFrame()
+ _maybe_remove(store, 'df1')
+ store.append('df1', df[:10])
+ store.append('df1', df[10:])
+ tm.assert_frame_equal(store['df1'], df)
+
+ _maybe_remove(store, 'df2')
+ store.put('df2', df[:10], format='table')
+ store.append('df2', df[10:])
+ tm.assert_frame_equal(store['df2'], df)
+
+ _maybe_remove(store, 'df3')
+ store.append('/df3', df[:10])
+ store.append('/df3', df[10:])
+ tm.assert_frame_equal(store['df3'], df)
+
+ # this is allowed but you almost always don't want to do it
+ # tables.NaturalNameWarning
 _maybe_remove(store, '/df3 foo')
 store.append('/df3 foo', df[:10])
 store.append('/df3 foo', df[10:])
 tm.assert_frame_equal(store['df3 foo'], df)

- # panel
- wp = tm.makePanel()
- _maybe_remove(store, 'wp1')
- store.append('wp1', wp.iloc[:, :10, :])
- store.append('wp1', wp.iloc[:, 10:, :])
- assert_panel_equal(store['wp1'], wp)
+ # panel
+ wp = tm.makePanel()
+ _maybe_remove(store, 'wp1')
+ store.append('wp1', wp.iloc[:, :10, :])
+ store.append('wp1', wp.iloc[:, 10:, :])
+ assert_panel_equal(store['wp1'], wp)

- # ndim
- with catch_warnings(record=True):
+ # ndim
 p4d = tm.makePanel4D()
 _maybe_remove(store, 'p4d')
 store.append('p4d', p4d.iloc[:, :, :10, :])
@@ -820,42 +827,42 @@ def test_append(self):
 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis'])
 assert_panel4d_equal(store['p4d2'], p4d2)

- # test using different order of items on the non-index axes
- _maybe_remove(store, 'wp1')
- wp_append1 = wp.iloc[:, :10, :]
- store.append('wp1', wp_append1)
- wp_append2 = wp.iloc[:, 10:, :].reindex(items=wp.items[::-1])
- store.append('wp1', wp_append2)
- assert_panel_equal(store['wp1'], wp)
-
- # dtype issues - mixed type in a single object column
- df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
- df['mixed_column'] = 'testing'
- df.loc[2, 'mixed_column'] = np.nan
- _maybe_remove(store, 'df')
- store.append('df', df)
- tm.assert_frame_equal(store['df'], df)
-
- # uints - test storage of uints
- uint_data = DataFrame({
- 'u08': Series(np.random.randint(0, high=255, size=5),
- dtype=np.uint8),
- 'u16': Series(np.random.randint(0, high=65535, size=5),
- dtype=np.uint16),
- 'u32': Series(np.random.randint(0, high=2**30, size=5),
- dtype=np.uint32),
- 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
- dtype=np.uint64)}, index=np.arange(5))
- _maybe_remove(store, 'uints')
- 
store.append('uints', uint_data)
- tm.assert_frame_equal(store['uints'], uint_data)
-
- # uints - test storage of uints in indexable columns
- _maybe_remove(store, 'uints')
- # 64-bit indices not yet supported
- store.append('uints', uint_data, data_columns=[
- 'u08', 'u16', 'u32'])
- tm.assert_frame_equal(store['uints'], uint_data)
+ # test using different order of items on the non-index axes
+ _maybe_remove(store, 'wp1')
+ wp_append1 = wp.iloc[:, :10, :]
+ store.append('wp1', wp_append1)
+ wp_append2 = wp.iloc[:, 10:, :].reindex(items=wp.items[::-1])
+ store.append('wp1', wp_append2)
+ assert_panel_equal(store['wp1'], wp)
+
+ # dtype issues - mixed type in a single object column
+ df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
+ df['mixed_column'] = 'testing'
+ df.loc[2, 'mixed_column'] = np.nan
+ _maybe_remove(store, 'df')
+ store.append('df', df)
+ tm.assert_frame_equal(store['df'], df)
+
+ # uints - test storage of uints
+ uint_data = DataFrame({
+ 'u08': Series(np.random.randint(0, high=255, size=5),
+ dtype=np.uint8),
+ 'u16': Series(np.random.randint(0, high=65535, size=5),
+ dtype=np.uint16),
+ 'u32': Series(np.random.randint(0, high=2**30, size=5),
+ dtype=np.uint32),
+ 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62],
+ dtype=np.uint64)}, index=np.arange(5))
+ _maybe_remove(store, 'uints')
+ store.append('uints', uint_data)
+ tm.assert_frame_equal(store['uints'], uint_data)
+
+ # uints - test storage of uints in indexable columns
+ _maybe_remove(store, 'uints')
+ # 64-bit indices not yet supported
+ store.append('uints', uint_data, data_columns=[
+ 'u08', 'u16', 'u32'])
+ tm.assert_frame_equal(store['uints'], uint_data)

 def test_append_series(self):
@@ -937,8 +944,9 @@ def check(format, index):

 # only support for fixed types (and they have a perf warning)
 self.assertRaises(TypeError, check, 'table', index)
-
- with tm.assert_produces_warning(
- expected_warning=PerformanceWarning):
+
+ # PerformanceWarning
+ with catch_warnings(record=True):
 check('fixed', index)

 def test_encoding(self):
@@ -1131,15 +1139,17 @@ def test_append_all_nans(self):
 [[np.nan, np.nan, np.nan], [np.nan, 5, 6]],
 [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]]

- panel_with_missing = Panel(matrix, items=['Item1', 'Item2', 'Item3'],
- major_axis=[1, 2],
- minor_axis=['A', 'B', 'C'])
+ with catch_warnings(record=True):
+ panel_with_missing = Panel(matrix,
+ items=['Item1', 'Item2', 'Item3'],
+ major_axis=[1, 2],
+ minor_axis=['A', 'B', 'C'])

- with ensure_clean_path(self.path) as path:
- panel_with_missing.to_hdf(
- path, 'panel_with_missing', format='table')
- reloaded_panel = read_hdf(path, 'panel_with_missing')
- tm.assert_panel_equal(panel_with_missing, reloaded_panel)
+ with ensure_clean_path(self.path) as path:
+ panel_with_missing.to_hdf(
+ path, 'panel_with_missing', format='table')
+ reloaded_panel = read_hdf(path, 'panel_with_missing')
+ tm.assert_panel_equal(panel_with_missing, reloaded_panel)

 def test_append_frame_column_oriented(self):
@@ -1158,13 +1168,14 @@ def test_append_frame_column_oriented(self):

 # selection on the non-indexable
 result = store.select(
- 'df1', ('columns=A', Term('index=df.index[0:4]')))
+ 'df1', ('columns=A', 'index=df.index[0:4]'))
 expected = df.reindex(columns=['A'], index=df.index[0:4])
 tm.assert_frame_equal(expected, result)

 # this isn't supported
- self.assertRaises(TypeError, store.select, 'df1', (
- 'columns=A', Term('index>df.index[4]')))
+ with pytest.raises(TypeError):
+ store.select('df1',
+ 'columns=A and index>df.index[4]')

 def 
test_append_with_different_block_ordering(self): @@ -1265,15 +1276,15 @@ def check_indexers(key, indexers): assert_panel4d_equal(result, expected) # partial selection2 - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + result = store.select( + 'p4d', "labels='l1' and items='ItemA' and minor_axis='B'") expected = p4d.reindex( labels=['l1'], items=['ItemA'], minor_axis=['B']) assert_panel4d_equal(result, expected) # non-existant partial selection - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + result = store.select( + 'p4d', "labels='l1' and items='Item1' and minor_axis='B'") expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) assert_panel4d_equal(result, expected) @@ -1281,100 +1292,103 @@ def check_indexers(key, indexers): def test_append_with_strings(self): with ensure_clean_store(self.path) as store: - wp = tm.makePanel() - wp2 = wp.rename_axis( - dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) - - def check_col(key, name, size): - self.assertEqual(getattr(store.get_storer( - key).table.description, name).itemsize, size) - - store.append('s1', wp, min_itemsize=20) - store.append('s1', wp2) - expected = concat([wp, wp2], axis=2) - expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) - assert_panel_equal(store['s1'], expected) - check_col('s1', 'minor_axis', 20) - - # test dict format - store.append('s2', wp, min_itemsize={'minor_axis': 20}) - store.append('s2', wp2) - expected = concat([wp, wp2], axis=2) - expected = expected.reindex(minor_axis=sorted(expected.minor_axis)) - assert_panel_equal(store['s2'], expected) - check_col('s2', 'minor_axis', 20) - - # apply the wrong field (similar to #1) - store.append('s3', wp, min_itemsize={'major_axis': 20}) - self.assertRaises(ValueError, store.append, 's3', wp2) - - # test truncation of bigger strings - store.append('s4', wp) - self.assertRaises(ValueError, store.append, 's4', wp2) - - # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - store.append('df_big', df) - tm.assert_frame_equal(store.select('df_big'), df) - check_col('df_big', 'values_block_1', 15) - - # appending smaller string ok - df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) - store.append('df_big', df2) - expected = concat([df, df2]) - tm.assert_frame_equal(store.select('df_big'), expected) - check_col('df_big', 'values_block_1', 15) - - # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - store.append('df_big2', df, min_itemsize={'values': 50}) - tm.assert_frame_equal(store.select('df_big2'), df) - check_col('df_big2', 'values_block_1', 50) - - # bigger string on next append - store.append('df_new', df) - df_new = DataFrame( - [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) - self.assertRaises(ValueError, store.append, 'df_new', df_new) - - # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index('C') - store.append('ss', df['B'], min_itemsize={'index': 4}) - tm.assert_series_equal(store.select('ss'), df['B']) - - # same as above, with data_columns=True - store.append('ss2', df['B'], data_columns=True, - min_itemsize={'index': 4}) - tm.assert_series_equal(store.select('ss2'), df['B']) - - # min_itemsize in index without appending (GH 10381) - store.put('ss3', df, format='table', - min_itemsize={'index': 6}) - # just make sure there is a longer string: - df2 = 
df.copy().reset_index().assign(C='longer').set_index('C') - store.append('ss3', df2) - tm.assert_frame_equal(store.select('ss3'), - pd.concat([df, df2])) - - # same as above, with a Series - store.put('ss4', df['B'], format='table', - min_itemsize={'index': 6}) - store.append('ss4', df2['B']) - tm.assert_series_equal(store.select('ss4'), - pd.concat([df['B'], df2['B']])) - - # with nans - _maybe_remove(store, 'df') - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.loc[1:4, 'string'] = np.nan - df['string2'] = 'bar' - df.loc[4:8, 'string2'] = np.nan - df['string3'] = 'bah' - df.loc[1:, 'string3'] = np.nan - store.append('df', df) - result = store.select('df') - tm.assert_frame_equal(result, df) + with catch_warnings(record=True): + wp = tm.makePanel() + wp2 = wp.rename_axis( + dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) + + def check_col(key, name, size): + self.assertEqual(getattr(store.get_storer( + key).table.description, name).itemsize, size) + + store.append('s1', wp, min_itemsize=20) + store.append('s1', wp2) + expected = concat([wp, wp2], axis=2) + expected = expected.reindex( + minor_axis=sorted(expected.minor_axis)) + assert_panel_equal(store['s1'], expected) + check_col('s1', 'minor_axis', 20) + + # test dict format + store.append('s2', wp, min_itemsize={'minor_axis': 20}) + store.append('s2', wp2) + expected = concat([wp, wp2], axis=2) + expected = expected.reindex( + minor_axis=sorted(expected.minor_axis)) + assert_panel_equal(store['s2'], expected) + check_col('s2', 'minor_axis', 20) + + # apply the wrong field (similar to #1) + store.append('s3', wp, min_itemsize={'major_axis': 20}) + self.assertRaises(ValueError, store.append, 's3', wp2) + + # test truncation of bigger strings + store.append('s4', wp) + self.assertRaises(ValueError, store.append, 's4', wp2) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big', df) + tm.assert_frame_equal(store.select('df_big'), df) + check_col('df_big', 'values_block_1', 15) + + # appending smaller string ok + df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) + store.append('df_big', df2) + expected = concat([df, df2]) + tm.assert_frame_equal(store.select('df_big'), expected) + check_col('df_big', 'values_block_1', 15) + + # avoid truncation on elements + df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) + store.append('df_big2', df, min_itemsize={'values': 50}) + tm.assert_frame_equal(store.select('df_big2'), df) + check_col('df_big2', 'values_block_1', 50) + + # bigger string on next append + store.append('df_new', df) + df_new = DataFrame( + [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) + self.assertRaises(ValueError, store.append, 'df_new', df_new) + + # min_itemsize on Series index (GH 11412) + df = tm.makeMixedDataFrame().set_index('C') + store.append('ss', df['B'], min_itemsize={'index': 4}) + tm.assert_series_equal(store.select('ss'), df['B']) + + # same as above, with data_columns=True + store.append('ss2', df['B'], data_columns=True, + min_itemsize={'index': 4}) + tm.assert_series_equal(store.select('ss2'), df['B']) + + # min_itemsize in index without appending (GH 10381) + store.put('ss3', df, format='table', + min_itemsize={'index': 6}) + # just make sure there is a longer string: + df2 = df.copy().reset_index().assign(C='longer').set_index('C') + store.append('ss3', df2) + tm.assert_frame_equal(store.select('ss3'), + pd.concat([df, df2])) + + # same as above, with a Series + store.put('ss4', df['B'], 
format='table', + min_itemsize={'index': 6}) + store.append('ss4', df2['B']) + tm.assert_series_equal(store.select('ss4'), + pd.concat([df['B'], df2['B']])) + + # with nans + _maybe_remove(store, 'df') + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.loc[1:4, 'string'] = np.nan + df['string2'] = 'bar' + df.loc[4:8, 'string2'] = np.nan + df['string3'] = 'bah' + df.loc[1:, 'string3'] = np.nan + store.append('df', df) + result = store.select('df') + tm.assert_frame_equal(result, df) with ensure_clean_store(self.path) as store: @@ -1452,13 +1466,13 @@ def test_append_with_data_columns(self): assert(store._handle.root.df.table.cols.B.is_indexed is True) # data column searching - result = store.select('df', [Term('B>0')]) + result = store.select('df', 'B>0') expected = df[df.B > 0] tm.assert_frame_equal(result, expected) # data column searching (with an indexable and a data_columns) result = store.select( - 'df', [Term('B>0'), Term('index>df.index[3]')]) + 'df', 'B>0 and index>df.index[3]') df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) @@ -1470,7 +1484,7 @@ def test_append_with_data_columns(self): df_new.loc[5:6, 'string'] = 'bar' _maybe_remove(store, 'df') store.append('df', df_new, data_columns=['string']) - result = store.select('df', [Term('string=foo')]) + result = store.select('df', "string='foo'") expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) @@ -1523,15 +1537,15 @@ def check_col(key, name, size): _maybe_remove(store, 'df') store.append( 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', [Term('string=foo'), Term( - 'string2=foo'), Term('A>0'), Term('B<0')]) + result = store.select('df', + "string='foo' and string2='foo'" + " and A>0 and B<0") expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected, check_index_type=False) # yield an empty frame - result = store.select('df', [Term('string=foo'), Term( - 'string2=cool')]) + result = store.select('df', "string='foo' and string2='cool'") expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'cool')] tm.assert_frame_equal(result, expected, check_index_type=False) @@ -1551,7 +1565,7 @@ def check_col(key, name, size): store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2', 'datetime']) - result = store.select('df_dc', [Term('B>0')]) + result = store.select('df_dc', 'B>0') expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected, check_index_type=False) @@ -1578,7 +1592,7 @@ def check_col(key, name, size): store.append('df_dc', df_dc, data_columns=[ 'B', 'C', 'string', 'string2']) - result = store.select('df_dc', [Term('B>0')]) + result = store.select('df_dc', 'B>0') expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) @@ -1589,99 +1603,104 @@ def check_col(key, name, size): tm.assert_frame_equal(result, expected) with ensure_clean_store(self.path) as store: - # panel - # GH5717 not handling data_columns - np.random.seed(1234) - p = tm.makePanel() - - store.append('p1', p) - tm.assert_panel_equal(store.select('p1'), p) - - store.append('p2', p, data_columns=True) - tm.assert_panel_equal(store.select('p2'), p) - - result = store.select('p2', where='ItemA>0') - expected = p.to_frame() - expected = expected[expected['ItemA'] > 0] - tm.assert_frame_equal(result.to_frame(), expected) - - result = store.select('p2', where='ItemA>0 & minor_axis=["A","B"]') - 
expected = p.to_frame() - expected = expected[expected['ItemA'] > 0] - expected = expected[expected.reset_index( - level=['major']).index.isin(['A', 'B'])] - tm.assert_frame_equal(result.to_frame(), expected) + with catch_warnings(record=True): + # panel + # GH5717 not handling data_columns + np.random.seed(1234) + p = tm.makePanel() + + store.append('p1', p) + tm.assert_panel_equal(store.select('p1'), p) + + store.append('p2', p, data_columns=True) + tm.assert_panel_equal(store.select('p2'), p) + + result = store.select('p2', where='ItemA>0') + expected = p.to_frame() + expected = expected[expected['ItemA'] > 0] + tm.assert_frame_equal(result.to_frame(), expected) + + result = store.select( + 'p2', where='ItemA>0 & minor_axis=["A","B"]') + expected = p.to_frame() + expected = expected[expected['ItemA'] > 0] + expected = expected[expected.reset_index( + level=['major']).index.isin(['A', 'B'])] + tm.assert_frame_equal(result.to_frame(), expected) def test_create_table_index(self): with ensure_clean_store(self.path) as store: - def col(t, column): - return getattr(store.get_storer(t).table.cols, column) + with catch_warnings(record=True): + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) - # index=False - wp = tm.makePanel() - store.append('p5', wp, index=False) - store.create_table_index('p5', columns=['major_axis']) - assert(col('p5', 'major_axis').is_indexed is True) - assert(col('p5', 'minor_axis').is_indexed is False) - - # index=True - store.append('p5i', wp, index=True) - assert(col('p5i', 'major_axis').is_indexed is True) - assert(col('p5i', 'minor_axis').is_indexed is True) - - # default optlevels - store.get_storer('p5').create_index() - assert(col('p5', 'major_axis').index.optlevel == 6) - assert(col('p5', 'minor_axis').index.kind == 'medium') - - # let's change the indexing scheme - store.create_table_index('p5') - assert(col('p5', 'major_axis').index.optlevel == 6) - assert(col('p5', 'minor_axis').index.kind == 'medium') - store.create_table_index('p5', optlevel=9) - assert(col('p5', 'major_axis').index.optlevel == 9) - assert(col('p5', 'minor_axis').index.kind == 'medium') - store.create_table_index('p5', kind='full') - assert(col('p5', 'major_axis').index.optlevel == 9) - assert(col('p5', 'minor_axis').index.kind == 'full') - store.create_table_index('p5', optlevel=1, kind='light') - assert(col('p5', 'major_axis').index.optlevel == 1) - assert(col('p5', 'minor_axis').index.kind == 'light') - - # data columns - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df['string2'] = 'bar' - store.append('f', df, data_columns=['string', 'string2']) - assert(col('f', 'index').is_indexed is True) - assert(col('f', 'string').is_indexed is True) - assert(col('f', 'string2').is_indexed is True) - - # specify index=columns - store.append( - 'f2', df, index=['string'], data_columns=['string', 'string2']) - assert(col('f2', 'index').is_indexed is False) - assert(col('f2', 'string').is_indexed is True) - assert(col('f2', 'string2').is_indexed is False) + # index=False + wp = tm.makePanel() + store.append('p5', wp, index=False) + store.create_table_index('p5', columns=['major_axis']) + assert(col('p5', 'major_axis').is_indexed is True) + assert(col('p5', 'minor_axis').is_indexed is False) + + # index=True + store.append('p5i', wp, index=True) + assert(col('p5i', 'major_axis').is_indexed is True) + assert(col('p5i', 'minor_axis').is_indexed is True) + + # default optlevels + store.get_storer('p5').create_index() + assert(col('p5', 'major_axis').index.optlevel == 6) 
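+ # (the default index kind created by PyTables is 'medium')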
+ assert(col('p5', 'minor_axis').index.kind == 'medium') + + # let's change the indexing scheme + store.create_table_index('p5') + assert(col('p5', 'major_axis').index.optlevel == 6) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', optlevel=9) + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'medium') + store.create_table_index('p5', kind='full') + assert(col('p5', 'major_axis').index.optlevel == 9) + assert(col('p5', 'minor_axis').index.kind == 'full') + store.create_table_index('p5', optlevel=1, kind='light') + assert(col('p5', 'major_axis').index.optlevel == 1) + assert(col('p5', 'minor_axis').index.kind == 'light') + + # data columns + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df['string2'] = 'bar' + store.append('f', df, data_columns=['string', 'string2']) + assert(col('f', 'index').is_indexed is True) + assert(col('f', 'string').is_indexed is True) + assert(col('f', 'string2').is_indexed is True) + + # specify index=columns + store.append( + 'f2', df, index=['string'], + data_columns=['string', 'string2']) + assert(col('f2', 'index').is_indexed is False) + assert(col('f2', 'string').is_indexed is True) + assert(col('f2', 'string2').is_indexed is False) - # try to index a non-table - _maybe_remove(store, 'f2') - store.put('f2', df) - self.assertRaises(TypeError, store.create_table_index, 'f2') + # try to index a non-table + _maybe_remove(store, 'f2') + store.put('f2', df) + self.assertRaises(TypeError, store.create_table_index, 'f2') def test_append_diff_item_order(self): - wp = tm.makePanel() - wp1 = wp.iloc[:, :10, :] - wp2 = wp.iloc[wp.items.get_indexer(['ItemC', 'ItemB', 'ItemA']), - 10:, :] + with catch_warnings(record=True): + wp = tm.makePanel() + wp1 = wp.iloc[:, :10, :] + wp2 = wp.iloc[wp.items.get_indexer(['ItemC', 'ItemB', 'ItemA']), + 10:, :] - with ensure_clean_store(self.path) as store: - store.put('panel', wp1, format='table') - self.assertRaises(ValueError, store.put, 'panel', wp2, - append=True) + with ensure_clean_store(self.path) as store: + store.put('panel', wp1, format='table') + self.assertRaises(ValueError, store.put, 'panel', wp2, + append=True) def test_append_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], @@ -1909,8 +1928,9 @@ def check(obj, comparator): df['time2'] = Timestamp('20130102') check(df, tm.assert_frame_equal) - p = tm.makePanel() - check(p, assert_panel_equal) + with catch_warnings(record=True): + p = tm.makePanel() + check(p, assert_panel_equal) with catch_warnings(record=True): p4d = tm.makePanel4D() @@ -1936,21 +1956,23 @@ def check(obj, comparator): store.put('df2', df) assert_frame_equal(store.select('df2'), df) - # 0 len - p_empty = Panel(items=list('ABC')) - store.append('p', p_empty) - self.assertRaises(KeyError, store.select, 'p') + with catch_warnings(record=True): - # repeated append of 0/non-zero frames - p = Panel(np.random.randn(3, 4, 5), items=list('ABC')) - store.append('p', p) - assert_panel_equal(store.select('p'), p) - store.append('p', p_empty) - assert_panel_equal(store.select('p'), p) + # 0 len + p_empty = Panel(items=list('ABC')) + store.append('p', p_empty) + self.assertRaises(KeyError, store.select, 'p') - # store - store.put('p2', p_empty) - assert_panel_equal(store.select('p2'), p_empty) + # repeated append of 0/non-zero frames + p = Panel(np.random.randn(3, 4, 5), items=list('ABC')) + store.append('p', p) + assert_panel_equal(store.select('p'), p) + store.append('p', p_empty) + 
assert_panel_equal(store.select('p'), p)
+
+                # store
+                store.put('p2', p_empty)
+                assert_panel_equal(store.select('p2'), p_empty)
 
     def test_append_raise(self):
 
@@ -2066,22 +2088,25 @@ def test_table_mixed_dtypes(self):
             store.append('df1_mixed', df)
             tm.assert_frame_equal(store.select('df1_mixed'), df)
 
-        # panel
-        wp = tm.makePanel()
-        wp['obj1'] = 'foo'
-        wp['obj2'] = 'bar'
-        wp['bool1'] = wp['ItemA'] > 0
-        wp['bool2'] = wp['ItemB'] > 0
-        wp['int1'] = 1
-        wp['int2'] = 2
-        wp = wp._consolidate()
+        with catch_warnings(record=True):
 
-        with ensure_clean_store(self.path) as store:
-            store.append('p1_mixed', wp)
-            assert_panel_equal(store.select('p1_mixed'), wp)
+            # panel
+            wp = tm.makePanel()
+            wp['obj1'] = 'foo'
+            wp['obj2'] = 'bar'
+            wp['bool1'] = wp['ItemA'] > 0
+            wp['bool2'] = wp['ItemB'] > 0
+            wp['int1'] = 1
+            wp['int2'] = 2
+            wp = wp._consolidate()
 
         with catch_warnings(record=True):
+            with ensure_clean_store(self.path) as store:
+                store.append('p1_mixed', wp)
+                assert_panel_equal(store.select('p1_mixed'), wp)
+
+        with catch_warnings(record=True):
             # ndim
             wp = tm.makePanel4D()
             wp['obj1'] = 'foo'
@@ -2166,9 +2191,12 @@ def test_append_with_timedelta(self):
             result = store.select('df')
             assert_frame_equal(result, df)
 
-            result = store.select('df', "C<100000")
+            result = store.select('df', where="C<100000")
             assert_frame_equal(result, df)
 
+            result = store.select('df', where="C<pd.Timedelta('-3D')")
+            assert_frame_equal(result, df.iloc[3:])
 
     def test_remove(self):
 
         with ensure_clean_store(self.path) as store:
 
-            # non-existance
-            crit1 = Term('index>foo')
-            self.assertRaises(KeyError, store.remove, 'a', [crit1])
+            with catch_warnings(record=True):
 
-            # try to remove non-table (with crit)
-            # non-table ok (where = None)
-            wp = tm.makePanel(30)
-            store.put('wp', wp, format='table')
-            store.remove('wp', ["minor_axis=['A', 'D']"])
-            rs = store.select('wp')
-            expected = wp.reindex(minor_axis=['B', 'C'])
-            assert_panel_equal(rs, expected)
+                # non-existence
+                crit1 = 'index>foo'
+                self.assertRaises(KeyError, store.remove, 'a', [crit1])
 
-            # empty where
-            _maybe_remove(store, 'wp')
-            store.put('wp', wp, format='table')
+                # try to remove non-table (with crit)
+                # non-table ok (where = None)
+                wp = tm.makePanel(30)
+                store.put('wp', wp, format='table')
+                store.remove('wp', ["minor_axis=['A', 'D']"])
+                rs = store.select('wp')
+                expected = wp.reindex(minor_axis=['B', 'C'])
+                assert_panel_equal(rs, expected)
 
-            # deleted number (entire table)
-            n = store.remove('wp', [])
-            self.assertTrue(n == 120)
+                # empty where
+                _maybe_remove(store, 'wp')
+                store.put('wp', wp, format='table')
 
-            # non - empty where
-            _maybe_remove(store, 'wp')
-            store.put('wp', wp, format='table')
-            self.assertRaises(ValueError, store.remove,
-                              'wp', ['foo'])
+                # deleted number (entire table)
+                n = store.remove('wp', [])
+                self.assertTrue(n == 120)
 
-            # selectin non-table with a where
-            # store.put('wp2', wp, format='f')
-            # self.assertRaises(ValueError, store.remove,
-            #                   'wp2', [('column', ['A', 'D'])])
+                # non - empty where
+                _maybe_remove(store, 'wp')
+                store.put('wp', wp, format='table')
+                self.assertRaises(ValueError, store.remove,
+                                  'wp', ['foo'])
 
     def test_remove_startstop(self):
         # GH #4835 and #6177
 
         with ensure_clean_store(self.path) as store:
 
-            wp = tm.makePanel(30)
-
-            # start
-            _maybe_remove(store, 'wp1')
-            store.put('wp1', wp, format='t')
-            n = store.remove('wp1', start=32)
-            self.assertTrue(n == 120 - 32)
-            result = store.select('wp1')
-            expected = wp.reindex(major_axis=wp.major_axis[:32 // 4])
-            assert_panel_equal(result, expected)
-
-            _maybe_remove(store, 'wp2')
-            store.put('wp2', wp, format='t')
-            n = store.remove('wp2', start=-32)
-            self.assertTrue(n == 32)
-            result = store.select('wp2')
-            expected = 
wp.reindex(major_axis=wp.major_axis[:-32 // 4]) - assert_panel_equal(result, expected) - - # stop - _maybe_remove(store, 'wp3') - store.put('wp3', wp, format='t') - n = store.remove('wp3', stop=32) - self.assertTrue(n == 32) - result = store.select('wp3') - expected = wp.reindex(major_axis=wp.major_axis[32 // 4:]) - assert_panel_equal(result, expected) - - _maybe_remove(store, 'wp4') - store.put('wp4', wp, format='t') - n = store.remove('wp4', stop=-32) - self.assertTrue(n == 120 - 32) - result = store.select('wp4') - expected = wp.reindex(major_axis=wp.major_axis[-32 // 4:]) - assert_panel_equal(result, expected) - - # start n stop - _maybe_remove(store, 'wp5') - store.put('wp5', wp, format='t') - n = store.remove('wp5', start=16, stop=-16) - self.assertTrue(n == 120 - 32) - result = store.select('wp5') - expected = wp.reindex(major_axis=wp.major_axis[ - :16 // 4].union(wp.major_axis[-16 // 4:])) - assert_panel_equal(result, expected) - - _maybe_remove(store, 'wp6') - store.put('wp6', wp, format='t') - n = store.remove('wp6', start=16, stop=16) - self.assertTrue(n == 0) - result = store.select('wp6') - expected = wp.reindex(major_axis=wp.major_axis) - assert_panel_equal(result, expected) - - # with where - _maybe_remove(store, 'wp7') - - # TODO: unused? - date = wp.major_axis.take(np.arange(0, 30, 3)) # noqa - - crit = Term('major_axis=date') - store.put('wp7', wp, format='t') - n = store.remove('wp7', where=[crit], stop=80) - self.assertTrue(n == 28) - result = store.select('wp7') - expected = wp.reindex(major_axis=wp.major_axis.difference( - wp.major_axis[np.arange(0, 20, 3)])) - assert_panel_equal(result, expected) + with catch_warnings(record=True): + wp = tm.makePanel(30) + + # start + _maybe_remove(store, 'wp1') + store.put('wp1', wp, format='t') + n = store.remove('wp1', start=32) + self.assertTrue(n == 120 - 32) + result = store.select('wp1') + expected = wp.reindex(major_axis=wp.major_axis[:32 // 4]) + assert_panel_equal(result, expected) + + _maybe_remove(store, 'wp2') + store.put('wp2', wp, format='t') + n = store.remove('wp2', start=-32) + self.assertTrue(n == 32) + result = store.select('wp2') + expected = wp.reindex(major_axis=wp.major_axis[:-32 // 4]) + assert_panel_equal(result, expected) + + # stop + _maybe_remove(store, 'wp3') + store.put('wp3', wp, format='t') + n = store.remove('wp3', stop=32) + self.assertTrue(n == 32) + result = store.select('wp3') + expected = wp.reindex(major_axis=wp.major_axis[32 // 4:]) + assert_panel_equal(result, expected) + + _maybe_remove(store, 'wp4') + store.put('wp4', wp, format='t') + n = store.remove('wp4', stop=-32) + self.assertTrue(n == 120 - 32) + result = store.select('wp4') + expected = wp.reindex(major_axis=wp.major_axis[-32 // 4:]) + assert_panel_equal(result, expected) + + # start n stop + _maybe_remove(store, 'wp5') + store.put('wp5', wp, format='t') + n = store.remove('wp5', start=16, stop=-16) + self.assertTrue(n == 120 - 32) + result = store.select('wp5') + expected = wp.reindex( + major_axis=(wp.major_axis[:16 // 4] + .union(wp.major_axis[-16 // 4:]))) + assert_panel_equal(result, expected) + + _maybe_remove(store, 'wp6') + store.put('wp6', wp, format='t') + n = store.remove('wp6', start=16, stop=16) + self.assertTrue(n == 0) + result = store.select('wp6') + expected = wp.reindex(major_axis=wp.major_axis) + assert_panel_equal(result, expected) + + # with where + _maybe_remove(store, 'wp7') + + # TODO: unused? 
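A note on the arithmetic these start/stop assertions rely on: tm.makePanel(30) round-trips through a table of 30 major_axis rows x 4 minor_axis labels = 120 stored rows, and HDFStore.remove returns the number of table rows it deleted, so start=32 deletes 120 - 32 rows and leaves the first 32 // 4 = 8 major_axis entries. A minimal sketch of the same semantics on a plain DataFrame (file and key names hypothetical, chosen only for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(120, 2), columns=['A', 'B'])
    with pd.HDFStore('tmp.h5', mode='w') as store:
        store.put('df', df, format='table')
        n = store.remove('df', start=32)  # delete rows 32 through the end
        assert n == 120 - 32              # remove() reports deleted row count
        assert len(store.select('df')) == 32
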
+ date = wp.major_axis.take(np.arange(0, 30, 3)) # noqa + + crit = 'major_axis=date' + store.put('wp7', wp, format='t') + n = store.remove('wp7', where=[crit], stop=80) + self.assertTrue(n == 28) + result = store.select('wp7') + expected = wp.reindex(major_axis=wp.major_axis.difference( + wp.major_axis[np.arange(0, 20, 3)])) + assert_panel_equal(result, expected) def test_remove_crit(self): with ensure_clean_store(self.path) as store: - wp = tm.makePanel(30) - - # group row removal - _maybe_remove(store, 'wp3') - date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) - crit4 = Term('major_axis=date4') - store.put('wp3', wp, format='t') - n = store.remove('wp3', where=[crit4]) - self.assertTrue(n == 36) - - result = store.select('wp3') - expected = wp.reindex(major_axis=wp.major_axis.difference(date4)) - assert_panel_equal(result, expected) - - # upper half - _maybe_remove(store, 'wp') - store.put('wp', wp, format='table') - date = wp.major_axis[len(wp.major_axis) // 2] - - crit1 = Term('major_axis>date') - crit2 = Term("minor_axis=['A', 'D']") - n = store.remove('wp', where=[crit1]) - self.assertTrue(n == 56) - - n = store.remove('wp', where=[crit2]) - self.assertTrue(n == 32) - - result = store['wp'] - expected = wp.truncate(after=date).reindex(minor=['B', 'C']) - assert_panel_equal(result, expected) - - # individual row elements - _maybe_remove(store, 'wp2') - store.put('wp2', wp, format='table') - - date1 = wp.major_axis[1:3] - crit1 = Term('major_axis=date1') - store.remove('wp2', where=[crit1]) - result = store.select('wp2') - expected = wp.reindex(major_axis=wp.major_axis.difference(date1)) - assert_panel_equal(result, expected) - - date2 = wp.major_axis[5] - crit2 = Term('major_axis=date2') - store.remove('wp2', where=[crit2]) - result = store['wp2'] - expected = wp.reindex(major_axis=wp.major_axis.difference(date1) - .difference(Index([date2]))) - assert_panel_equal(result, expected) - - date3 = [wp.major_axis[7], wp.major_axis[9]] - crit3 = Term('major_axis=date3') - store.remove('wp2', where=[crit3]) - result = store['wp2'] - expected = wp.reindex(major_axis=wp.major_axis - .difference(date1) - .difference(Index([date2])) - .difference(Index(date3))) - assert_panel_equal(result, expected) - - # corners - _maybe_remove(store, 'wp4') - store.put('wp4', wp, format='table') - n = store.remove( - 'wp4', where=[Term('major_axis>wp.major_axis[-1]')]) - result = store.select('wp4') - assert_panel_equal(result, wp) + with catch_warnings(record=True): + wp = tm.makePanel(30) + + # group row removal + _maybe_remove(store, 'wp3') + date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) + crit4 = 'major_axis=date4' + store.put('wp3', wp, format='t') + n = store.remove('wp3', where=[crit4]) + self.assertTrue(n == 36) + + result = store.select('wp3') + expected = wp.reindex( + major_axis=wp.major_axis.difference(date4)) + assert_panel_equal(result, expected) + + # upper half + _maybe_remove(store, 'wp') + store.put('wp', wp, format='table') + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = 'major_axis>date' + crit2 = "minor_axis=['A', 'D']" + n = store.remove('wp', where=[crit1]) + self.assertTrue(n == 56) + + n = store.remove('wp', where=[crit2]) + self.assertTrue(n == 32) + + result = store['wp'] + expected = wp.truncate(after=date).reindex(minor=['B', 'C']) + assert_panel_equal(result, expected) + + # individual row elements + _maybe_remove(store, 'wp2') + store.put('wp2', wp, format='table') + + date1 = wp.major_axis[1:3] + crit1 = 'major_axis=date1' + store.remove('wp2', 
where=[crit1]) + result = store.select('wp2') + expected = wp.reindex( + major_axis=wp.major_axis.difference(date1)) + assert_panel_equal(result, expected) + + date2 = wp.major_axis[5] + crit2 = 'major_axis=date2' + store.remove('wp2', where=[crit2]) + result = store['wp2'] + expected = wp.reindex( + major_axis=(wp.major_axis + .difference(date1) + .difference(Index([date2])) + )) + assert_panel_equal(result, expected) + + date3 = [wp.major_axis[7], wp.major_axis[9]] + crit3 = 'major_axis=date3' + store.remove('wp2', where=[crit3]) + result = store['wp2'] + expected = wp.reindex(major_axis=wp.major_axis + .difference(date1) + .difference(Index([date2])) + .difference(Index(date3))) + assert_panel_equal(result, expected) + + # corners + _maybe_remove(store, 'wp4') + store.put('wp4', wp, format='table') + n = store.remove( + 'wp4', where="major_axis>wp.major_axis[-1]") + result = store.select('wp4') + assert_panel_equal(result, wp) def test_invalid_terms(self): @@ -2464,24 +2497,32 @@ def test_terms(self): with ensure_clean_store(self.path) as store: - wp = tm.makePanel() - wpneg = Panel.fromDict({-1: tm.makeDataFrame(), - 0: tm.makeDataFrame(), - 1: tm.makeDataFrame()}) - with catch_warnings(record=True): + wp = tm.makePanel() + wpneg = Panel.fromDict({-1: tm.makeDataFrame(), + 0: tm.makeDataFrame(), + 1: tm.makeDataFrame()}) p4d = tm.makePanel4D() store.put('p4d', p4d, format='table') - - store.put('wp', wp, format='table') - store.put('wpneg', wpneg, format='table') - - # panel - result = store.select( - 'wp', "major_axis<'20000108' and minor_axis=['A', 'B']") - expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) - assert_panel_equal(result, expected) + store.put('wp', wp, format='table') + store.put('wpneg', wpneg, format='table') + + # panel + result = store.select( + 'wp', + "major_axis<'20000108' and minor_axis=['A', 'B']") + expected = wp.truncate( + after='20000108').reindex(minor=['A', 'B']) + assert_panel_equal(result, expected) + + # with deprecation + result = store.select( + 'wp', where=("major_axis<'20000108' " + "and minor_axis=['A', 'B']")) + expected = wp.truncate( + after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) # p4d with catch_warnings(record=True): @@ -2516,74 +2557,79 @@ def test_terms(self): store.select('p4d', t) # valid for p4d only - terms = [(("labels=['l1', 'l2']"),), - Term("labels=['l1', 'l2']"), - ] - + terms = ["labels=['l1', 'l2']"] for t in terms: store.select('p4d', t) - with tm.assertRaisesRegexp(TypeError, - 'Only named functions are supported'): - store.select('wp', Term( - 'major_axis == (lambda x: x)("20130101")')) + with tm.assertRaisesRegexp( + TypeError, 'Only named functions are supported'): + store.select( + 'wp', + 'major_axis == (lambda x: x)("20130101")') - # check USub node parsing - res = store.select('wpneg', Term('items == -1')) - expected = Panel({-1: wpneg[-1]}) - tm.assert_panel_equal(res, expected) + with catch_warnings(record=True): + # check USub node parsing + res = store.select('wpneg', 'items == -1') + expected = Panel({-1: wpneg[-1]}) + tm.assert_panel_equal(res, expected) - with tm.assertRaisesRegexp(NotImplementedError, - 'Unary addition not supported'): - store.select('wpneg', Term('items == +1')) + with tm.assertRaisesRegexp(NotImplementedError, + 'Unary addition not supported'): + store.select('wpneg', 'items == +1') def test_term_compat(self): with ensure_clean_store(self.path) as store: - wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], - 
major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - store.append('wp', wp) + with catch_warnings(record=True): + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp', wp) - result = store.select( - 'wp', "major_axis>20000102 and minor_axis=['A', 'B']") - expected = wp.loc[:, wp.major_axis > - Timestamp('20000102'), ['A', 'B']] - assert_panel_equal(result, expected) + result = store.select( + 'wp', where=("major_axis>20000102 " + "and minor_axis=['A', 'B']")) + expected = wp.loc[:, wp.major_axis > + Timestamp('20000102'), ['A', 'B']] + assert_panel_equal(result, expected) - store.remove('wp', 'major_axis>20000103') - result = store.select('wp') - expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :] - assert_panel_equal(result, expected) + store.remove('wp', 'major_axis>20000103') + result = store.select('wp') + expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :] + assert_panel_equal(result, expected) with ensure_clean_store(self.path) as store: - wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - store.append('wp', wp) - - # stringified datetimes - result = store.select( - 'wp', "major_axis>datetime.datetime(2000, 1, 2)") - expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] - assert_panel_equal(result, expected) - - result = store.select( - 'wp', "major_axis>datetime.datetime(2000, 1, 2, 0, 0)") - expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] - assert_panel_equal(result, expected) - - result = store.select( - 'wp', ("major_axis=[datetime.datetime(2000, 1, 2, 0, 0), " - "datetime.datetime(2000, 1, 3, 0, 0)]")) - expected = wp.loc[:, [Timestamp('20000102'), - Timestamp('20000103')]] - assert_panel_equal(result, expected) - - result = store.select('wp', "minor_axis=['A', 'B']") - expected = wp.loc[:, :, ['A', 'B']] - assert_panel_equal(result, expected) + with catch_warnings(record=True): + wp = Panel(np.random.randn(2, 5, 4), + items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp', wp) + + # stringified datetimes + result = store.select( + 'wp', 'major_axis>datetime.datetime(2000, 1, 2)') + expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] + assert_panel_equal(result, expected) + + result = store.select( + 'wp', 'major_axis>datetime.datetime(2000, 1, 2)') + expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] + assert_panel_equal(result, expected) + + result = store.select( + 'wp', + "major_axis=[datetime.datetime(2000, 1, 2, 0, 0), " + "datetime.datetime(2000, 1, 3, 0, 0)]") + expected = wp.loc[:, [Timestamp('20000102'), + Timestamp('20000103')]] + assert_panel_equal(result, expected) + + result = store.select( + 'wp', "minor_axis=['A', 'B']") + expected = wp.loc[:, :, ['A', 'B']] + assert_panel_equal(result, expected) def test_same_name_scoping(self): @@ -2678,12 +2724,13 @@ def test_tuple_index(self): def test_index_types(self): - values = np.random.randn(2) + with catch_warnings(record=True): + values = np.random.randn(2) - func = lambda l, r: tm.assert_series_equal(l, r, - check_dtype=True, - check_index_type=True, - check_series_type=True) + func = lambda l, r: tm.assert_series_equal(l, r, + check_dtype=True, + check_index_type=True, + check_series_type=True) with catch_warnings(record=True): ser = Series(values, 
[0, 'y']) @@ -2702,18 +2749,31 @@ def test_index_types(self): self._check_roundtrip(ser, func) with catch_warnings(record=True): + + ser = Series(values, [0, 'y']) + self._check_roundtrip(ser, func) + + ser = Series(values, [datetime.datetime.today(), 0]) + self._check_roundtrip(ser, func) + + ser = Series(values, ['y', 0]) + self._check_roundtrip(ser, func) + + ser = Series(values, [datetime.date.today(), 'a']) + self._check_roundtrip(ser, func) + ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) - ser = Series(values, [1, 1.53]) - self._check_roundtrip(ser, func) + ser = Series(values, [1, 1.53]) + self._check_roundtrip(ser, func) - ser = Series(values, [1, 5]) - self._check_roundtrip(ser, func) + ser = Series(values, [1, 5]) + self._check_roundtrip(ser, func) - ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) - self._check_roundtrip(ser, func) + ser = Series(values, [datetime.datetime( + 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): @@ -2876,13 +2936,9 @@ def _make_one(): def test_wide(self): - wp = tm.makePanel() - self._check_roundtrip(wp, assert_panel_equal) - - def test_wide_table(self): - - wp = tm.makePanel() - self._check_roundtrip_table(wp, assert_panel_equal) + with catch_warnings(record=True): + wp = tm.makePanel() + self._check_roundtrip(wp, assert_panel_equal) def test_select_with_dups(self): @@ -2944,25 +3000,24 @@ def test_select_with_dups(self): assert_frame_equal(result, expected, by_blocks=True) def test_wide_table_dups(self): - wp = tm.makePanel() with ensure_clean_store(self.path) as store: - store.put('panel', wp, format='table') - store.put('panel', wp, format='table', append=True) - with catch_warnings(record=True): + + wp = tm.makePanel() + store.put('panel', wp, format='table') + store.put('panel', wp, format='table', append=True) + recons = store['panel'] - assert_panel_equal(recons, wp) + assert_panel_equal(recons, wp) def test_long(self): def _check(left, right): assert_panel_equal(left.to_panel(), right.to_panel()) - wp = tm.makePanel() - self._check_roundtrip(wp.to_frame(), _check) - - # empty - # self._check_roundtrip(wp.to_frame()[:0], _check) + with catch_warnings(record=True): + wp = tm.makePanel() + self._check_roundtrip(wp.to_frame(), _check) def test_longpanel(self): pass @@ -3009,70 +3064,72 @@ def test_sparse_with_compression(self): check_frame_type=True) def test_select(self): - wp = tm.makePanel() with ensure_clean_store(self.path) as store: - # put/select ok - _maybe_remove(store, 'wp') - store.put('wp', wp, format='table') - store.select('wp') - - # non-table ok (where = None) - _maybe_remove(store, 'wp') - store.put('wp2', wp) - store.select('wp2') - - # selection on the non-indexable with a large number of columns - wp = Panel(np.random.randn(100, 100, 100), - items=['Item%03d' % i for i in range(100)], - major_axis=date_range('1/1/2000', periods=100), - minor_axis=['E%03d' % i for i in range(100)]) - - _maybe_remove(store, 'wp') - store.append('wp', wp) - items = ['Item%03d' % i for i in range(80)] - result = store.select('wp', Term('items=items')) - expected = wp.reindex(items=items) - assert_panel_equal(expected, result) - - # selectin non-table with a where - # self.assertRaises(ValueError, store.select, - # 'wp2', ('column', ['A', 'D'])) + with catch_warnings(record=True): + wp = tm.makePanel() - # select with columns= - df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df') - store.append('df', df) - result = 
store.select('df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) + # put/select ok + _maybe_remove(store, 'wp') + store.put('wp', wp, format='table') + store.select('wp') + + # non-table ok (where = None) + _maybe_remove(store, 'wp') + store.put('wp2', wp) + store.select('wp2') + + # selection on the non-indexable with a large number of columns + wp = Panel(np.random.randn(100, 100, 100), + items=['Item%03d' % i for i in range(100)], + major_axis=date_range('1/1/2000', periods=100), + minor_axis=['E%03d' % i for i in range(100)]) + + _maybe_remove(store, 'wp') + store.append('wp', wp) + items = ['Item%03d' % i for i in range(80)] + result = store.select('wp', 'items=items') + expected = wp.reindex(items=items) + assert_panel_equal(expected, result) + + # selectin non-table with a where + # self.assertRaises(ValueError, store.select, + # 'wp2', ('column', ['A', 'D'])) + + # select with columns= + df = tm.makeTimeDataFrame() + _maybe_remove(store, 'df') + store.append('df', df) + result = store.select('df', columns=['A', 'B']) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) - # equivalentsly - result = store.select('df', [("columns=['A', 'B']")]) - expected = df.reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) + # equivalentsly + result = store.select('df', [("columns=['A', 'B']")]) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) - # with a data column - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) + # with a data column + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['A']) + result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) - # all a data columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) - tm.assert_frame_equal(expected, result) + # all a data columns + _maybe_remove(store, 'df') + store.append('df', df, data_columns=True) + result = store.select('df', ['A > 0'], columns=['A', 'B']) + expected = df[df.A > 0].reindex(columns=['A', 'B']) + tm.assert_frame_equal(expected, result) - # with a data column, but different columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['C', 'D']) - expected = df[df.A > 0].reindex(columns=['C', 'D']) - tm.assert_frame_equal(expected, result) + # with a data column, but different columns + _maybe_remove(store, 'df') + store.append('df', df, data_columns=['A']) + result = store.select('df', ['A > 0'], columns=['C', 'D']) + expected = df[df.A > 0].reindex(columns=['C', 'D']) + tm.assert_frame_equal(expected, result) def test_select_dtypes(self): @@ -3084,7 +3141,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) - result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) + result = store.select('df', "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) @@ -3099,15 +3156,15 @@ def test_select_dtypes(self): expected = (df[df.boolv == True] # noqa 
.reindex(columns=['A', 'boolv'])) for v in [True, 'true', 1]: - result = store.select('df', Term( - 'boolv == %s' % str(v)), columns=['A', 'boolv']) + result = store.select('df', 'boolv == %s' % str(v), + columns=['A', 'boolv']) tm.assert_frame_equal(expected, result) expected = (df[df.boolv == False] # noqa .reindex(columns=['A', 'boolv'])) for v in [False, 'false', 0]: - result = store.select('df', Term( - 'boolv == %s' % str(v)), columns=['A', 'boolv']) + result = store.select( + 'df', 'boolv == %s' % str(v), columns=['A', 'boolv']) tm.assert_frame_equal(expected, result) # integer index @@ -3115,7 +3172,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_int') store.append('df_int', df) result = store.select( - 'df_int', [Term("index<10"), Term("columns=['A']")]) + 'df_int', "index<10 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=['A']) tm.assert_frame_equal(expected, result) @@ -3125,7 +3182,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_float') store.append('df_float', df) result = store.select( - 'df_float', [Term("index<10.0"), Term("columns=['A']")]) + 'df_float', "index<10.0 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=['A']) tm.assert_frame_equal(expected, result) @@ -3196,14 +3253,14 @@ def test_select_with_many_inputs(self): store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) # regular select - result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) + result = store.select('df', "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) # small selector result = store.select( - 'df', [Term("ts>=Timestamp('2012-02-01') & " - "users=['a','b','c']")]) + 'df', + "ts>=Timestamp('2012-02-01') & users=['a','b','c']") expected = df[(df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a', 'b', 'c'])] tm.assert_frame_equal(expected, result) @@ -3211,21 +3268,21 @@ def test_select_with_many_inputs(self): # big selector along the columns selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)] result = store.select( - 'df', [Term("ts>=Timestamp('2012-02-01')"), - Term('users=selector')]) + 'df', + "ts>=Timestamp('2012-02-01') and users=selector") expected = df[(df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector)] tm.assert_frame_equal(expected, result) selector = range(100, 200) - result = store.select('df', [Term('B=selector')]) + result = store.select('df', 'B=selector') expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) self.assertEqual(len(result), 100) # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', [Term('ts=selector')]) + result = store.select('df', 'ts=selector') expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) self.assertEqual(len(result), 100) @@ -3296,17 +3353,6 @@ def test_select_iterator(self): result = concat(results) tm.assert_frame_equal(expected, result) - # where selection - # expected = store.select_as_multiple( - # ['df1', 'df2'], where= Term('A>0'), selector='df1') - # results = [] - # for s in store.select_as_multiple( - # ['df1', 'df2'], where= Term('A>0'), selector='df1', - # chunksize=25): - # results.append(s) - # result = concat(results) - # tm.assert_frame_equal(expected, result) - def test_select_iterator_complete_8014(self): # GH 8014 @@ -3518,8 +3564,7 @@ def test_retain_index_attributes(self): getattr(getattr(result, idx), attr, None)) # try to append a table with a 
different frequency - with tm.assert_produces_warning( - expected_warning=AttributeConflictWarning): + with catch_warnings(record=True): df2 = DataFrame(dict( A=Series(lrange(3), index=date_range('2002-1-1', @@ -3544,9 +3589,7 @@ def test_retain_index_attributes(self): def test_retain_index_attributes2(self): with ensure_clean_path(self.path) as path: - expected_warning = Warning if PY35 else AttributeConflictWarning - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): df = DataFrame(dict( A=Series(lrange(3), @@ -3566,8 +3609,7 @@ def test_retain_index_attributes2(self): self.assertEqual(read_hdf(path, 'data').index.name, 'foo') - with tm.assert_produces_warning(expected_warning=expected_warning, - check_stacklevel=False): + with catch_warnings(record=True): idx2 = date_range('2001-1-1', periods=3, freq='H') idx2.name = 'bar' @@ -3578,23 +3620,28 @@ def test_retain_index_attributes2(self): def test_panel_select(self): - wp = tm.makePanel() - with ensure_clean_store(self.path) as store: - store.put('wp', wp, format='table') - date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = ('major_axis>=date') - crit2 = ("minor_axis=['A', 'D']") + with catch_warnings(record=True): - result = store.select('wp', [crit1, crit2]) - expected = wp.truncate(before=date).reindex(minor=['A', 'D']) - assert_panel_equal(result, expected) + wp = tm.makePanel() - result = store.select( - 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")]) - expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) - assert_panel_equal(result, expected) + store.put('wp', wp, format='table') + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = ('major_axis>=date') + crit2 = ("minor_axis=['A', 'D']") + + result = store.select('wp', [crit1, crit2]) + expected = wp.truncate(before=date).reindex(minor=['A', 'D']) + assert_panel_equal(result, expected) + + result = store.select( + 'wp', ['major_axis>="20000124"', + ("minor_axis=['A', 'B']")]) + expected = wp.truncate( + before='20000124').reindex(minor=['A', 'B']) + assert_panel_equal(result, expected) def test_frame_select(self): @@ -3622,7 +3669,7 @@ def test_frame_select(self): df = tm.makeTimeDataFrame() store.append('df_time', df) self.assertRaises( - ValueError, store.select, 'df_time', [Term("index>0")]) + ValueError, store.select, 'df_time', "index>0") # can't select if not written as table # store['frame'] = df @@ -3701,7 +3748,7 @@ def test_frame_select_complex2(self): hist.to_hdf(hh, 'df', mode='w', format='table') - expected = read_hdf(hh, 'df', where="l1=[2, 3, 4]") + expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]') # sccope with list like l = selection.index.tolist() # noqa @@ -4005,6 +4052,7 @@ def test_append_to_multiple_dropna(self): df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: + # dropna=True should guarantee rows are synchronized store.append_to_multiple( {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', @@ -4015,14 +4063,27 @@ def test_append_to_multiple_dropna(self): tm.assert_index_equal(store.select('df1').index, store.select('df2').index) + @pytest.mark.xfail(run=False, + reason="append_to_multiple_dropna_false " + "is not raising as failed") + def test_append_to_multiple_dropna_false(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) + df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan + df = concat([df1, df2], axis=1) + + with ensure_clean_store(self.path) 
as store: + # dropna=False shouldn't synchronize row indexes store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', + {'df1a': ['A', 'B'], 'df2a': None}, df, selector='df1a', dropna=False) - self.assertRaises( - ValueError, store.select_as_multiple, ['df1', 'df2']) - assert not store.select('df1').index.equals( - store.select('df2').index) + + with pytest.raises(ValueError): + store.select_as_multiple(['df1a', 'df2a']) + + assert not store.select('df1a').index.equals( + store.select('df2a').index) def test_select_as_multiple(self): @@ -4220,7 +4281,7 @@ def _check_roundtrip_table(self, obj, comparator, compression=False): with ensure_clean_store(self.path, 'w', **options) as store: store.put('obj', obj, format='table') retrieved = store['obj'] - # sorted_obj = _test_sort(obj) + comparator(retrieved, obj) def test_multiple_open_close(self): @@ -4351,16 +4412,16 @@ def test_legacy_table_read(self): with ensure_clean_store( tm.get_data_path('legacy_hdf/legacy_table.h5'), mode='r') as store: - store.select('df1') - store.select('df2') - store.select('wp1') - # force the frame - store.select('df2', typ='legacy_frame') + with catch_warnings(record=True): + store.select('df1') + store.select('df2') + store.select('wp1') + + # force the frame + store.select('df2', typ='legacy_frame') - # old version warning - with tm.assert_produces_warning( - expected_warning=IncompatibilityWarning): + # old version warning self.assertRaises( Exception, store.select, 'wp1', 'minor_axis=B') @@ -4466,7 +4527,8 @@ def test_legacy_table_write(self): 'legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a') df = tm.makeDataFrame() - wp = tm.makePanel() + with catch_warnings(record=True): + wp = tm.makePanel() index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], @@ -4803,12 +4865,11 @@ def test_to_hdf_with_object_column_names(self): for index in types_should_fail: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: - with self.assertRaises( + with catch_warnings(record=True): + with self.assertRaises( ValueError, msg=("cannot have non-object label " "DataIndexableCol")): - with catch_warnings(record=True): - df.to_hdf(path, 'df', - format='table', + df.to_hdf(path, 'df', format='table', data_columns=True) for index in types_should_run: @@ -4979,7 +5040,7 @@ def test_query_compare_column_type(self): with ensure_clean_store(self.path) as store: store.append('test', df, format='table', data_columns=True) - ts = pd.Timestamp('2014-01-01') # noqa + ts = pd.Timestamp('2014-01-01') # noqa result = store.select('test', where='real_date > ts') expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) @@ -5092,28 +5153,30 @@ def test_complex_mixed_table(self): assert_frame_equal(df, reread) def test_complex_across_dimensions_fixed(self): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) - df = DataFrame({'A': s, 'B': s}) - p = Panel({'One': df, 'Two': df}) + with catch_warnings(record=True): + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list('abcd')) + df = DataFrame({'A': s, 'B': s}) + p = Panel({'One': df, 'Two': df}) - objs = [s, df, p] - comps = [tm.assert_series_equal, tm.assert_frame_equal, - tm.assert_panel_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='fixed') - reread = read_hdf(path, 'obj') - comp(obj, 
reread) + objs = [s, df, p] + comps = [tm.assert_series_equal, tm.assert_frame_equal, + tm.assert_panel_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(self.path) as path: + obj.to_hdf(path, 'obj', format='fixed') + reread = read_hdf(path, 'obj') + comp(obj, reread) def test_complex_across_dimensions(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) s = Series(complex128, index=list('abcd')) df = DataFrame({'A': s, 'B': s}) - p = Panel({'One': df, 'Two': df}) with catch_warnings(record=True): - p4d = pd.Panel4D({'i': p, 'ii': p}) + p = Panel({'One': df, 'Two': df}) + p4d = Panel4D({'i': p, 'ii': p}) objs = [df, p, p4d] comps = [tm.assert_frame_equal, tm.assert_panel_equal, @@ -5430,12 +5493,3 @@ def test_dst_transitions(self): store.append('df', df) result = store.select('df') assert_frame_equal(result, df) - - -def _test_sort(obj): - if isinstance(obj, DataFrame): - return obj.reindex(sorted(obj.index)) - elif isinstance(obj, Panel): - return obj.reindex(major=sorted(obj.major_axis)) - else: - raise ValueError('type not supported here') diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index f669ebe371f9d..dc4787176a0b5 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -2,6 +2,7 @@ from __future__ import print_function # pylint: disable-msg=W0612,E1101 +from warnings import catch_warnings import re import operator import pytest @@ -32,19 +33,26 @@ 'D': _frame2['D'].astype('int32')}) _integer = DataFrame( np.random.randint(1, 100, - size=(10001, 4)), columns=list('ABCD'), dtype='int64') + size=(10001, 4)), + columns=list('ABCD'), dtype='int64') _integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), columns=list('ABCD'), dtype='int64') -_frame_panel = Panel(dict(ItemA=_frame.copy(), ItemB=( - _frame.copy() + 3), ItemC=_frame.copy(), ItemD=_frame.copy())) -_frame2_panel = Panel(dict(ItemA=_frame2.copy(), ItemB=(_frame2.copy() + 3), - ItemC=_frame2.copy(), ItemD=_frame2.copy())) -_integer_panel = Panel(dict(ItemA=_integer, ItemB=(_integer + 34).astype( - 'int64'))) -_integer2_panel = Panel(dict(ItemA=_integer2, ItemB=(_integer2 + 34).astype( - 'int64'))) -_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) -_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) + +with catch_warnings(record=True): + _frame_panel = Panel(dict(ItemA=_frame.copy(), + ItemB=(_frame.copy() + 3), + ItemC=_frame.copy(), + ItemD=_frame.copy())) + _frame2_panel = Panel(dict(ItemA=_frame2.copy(), + ItemB=(_frame2.copy() + 3), + ItemC=_frame2.copy(), + ItemD=_frame2.copy())) + _integer_panel = Panel(dict(ItemA=_integer, + ItemB=(_integer + 34).astype('int64'))) + _integer2_panel = Panel(dict(ItemA=_integer2, + ItemB=(_integer2 + 34).astype('int64'))) + _mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3))) + _mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3))) @pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr') @@ -204,7 +212,7 @@ def test_float_panel(self): @slow def test_panel4d(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, assert_func=assert_panel4d_equal, binary_comp=3) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 0e8e8dc43ff03..118039d1f354c 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -3,6 +3,8 @@ from operator import methodcaller from copy 
import copy, deepcopy +from warnings import catch_warnings + import pytest import numpy as np from numpy import nan @@ -1570,17 +1572,18 @@ def test_to_xarray(self): tm._skip_if_no_xarray() from xarray import DataArray - p = tm.makePanel() + with catch_warnings(record=True): + p = tm.makePanel() - result = p.to_xarray() - self.assertIsInstance(result, DataArray) - self.assertEqual(len(result.coords), 3) - assert_almost_equal(list(result.coords.keys()), - ['items', 'major_axis', 'minor_axis']) - self.assertEqual(len(result.dims), 3) + result = p.to_xarray() + self.assertIsInstance(result, DataArray) + self.assertEqual(len(result.coords), 3) + assert_almost_equal(list(result.coords.keys()), + ['items', 'major_axis', 'minor_axis']) + self.assertEqual(len(result.dims), 3) - # idempotency - assert_panel_equal(result.to_pandas(), p) + # idempotency + assert_panel_equal(result.to_pandas(), p) class TestPanel4D(tm.TestCase, Generic): @@ -1590,15 +1593,12 @@ class TestPanel4D(tm.TestCase, Generic): def test_sample(self): pytest.skip("sample on Panel4D") - def test_copy_and_deepcopy(self): - pytest.skip("copy_and_deepcopy on Panel4D") - def test_to_xarray(self): tm._skip_if_no_xarray() from xarray import DataArray - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p = tm.makePanel4D() result = p.to_xarray() @@ -1624,12 +1624,20 @@ def test_to_xarray(self): 'test_stat_unexpected_keyword', 'test_api_compat', 'test_stat_non_defaults_args', 'test_clip', 'test_truncate_out_of_bounds', 'test_numpy_clip', - 'test_metadata_propagation']: + 'test_metadata_propagation', 'test_copy_and_deepcopy', + 'test_sample']: + + def f(): + def tester(self): + with catch_warnings(record=True): + return getattr(super(TestPanel, self), t)() + return tester + + setattr(TestPanel, t, f()) def f(): def tester(self): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with catch_warnings(record=True): return getattr(super(TestPanel4D, self), t)() return tester @@ -1660,10 +1668,11 @@ def test_sample(sel): with tm.assertRaises(ValueError): s.sample(n=3, weights='weight_column') - panel = pd.Panel(items=[0, 1, 2], major_axis=[2, 3, 4], - minor_axis=[3, 4, 5]) - with tm.assertRaises(ValueError): - panel.sample(n=1, weights='weight_column') + with catch_warnings(record=True): + panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4], + minor_axis=[3, 4, 5]) + with tm.assertRaises(ValueError): + panel.sample(n=1, weights='weight_column') with tm.assertRaises(ValueError): df.sample(n=1, weights='weight_column', axis=1) @@ -1726,14 +1735,15 @@ def test_sample(sel): assert_frame_equal(sample1, df[['colString']]) # Test default axes - p = pd.Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6], - minor_axis=[1, 3, 5]) - assert_panel_equal( - p.sample(n=3, random_state=42), p.sample(n=3, axis=1, - random_state=42)) - assert_frame_equal( - df.sample(n=3, random_state=42), df.sample(n=3, axis=0, - random_state=42)) + with catch_warnings(record=True): + p = Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6], + minor_axis=[1, 3, 5]) + assert_panel_equal( + p.sample(n=3, random_state=42), p.sample(n=3, axis=1, + random_state=42)) + assert_frame_equal( + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, + random_state=42)) # Test that function aligns weights with frame df = DataFrame( @@ -1763,9 +1773,10 @@ def test_squeeze(self): tm.assert_series_equal(s.squeeze(), s) for df in [tm.makeTimeDataFrame()]: tm.assert_frame_equal(df.squeeze(), df) - for p in 
[tm.makePanel()]: - tm.assert_panel_equal(p.squeeze(), p) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): + for p in [tm.makePanel()]: + tm.assert_panel_equal(p.squeeze(), p) + with catch_warnings(record=True): for p4d in [tm.makePanel4D()]: tm.assert_panel4d_equal(p4d.squeeze(), p4d) @@ -1773,24 +1784,26 @@ def test_squeeze(self): df = tm.makeTimeDataFrame().reindex(columns=['A']) tm.assert_series_equal(df.squeeze(), df['A']) - p = tm.makePanel().reindex(items=['ItemA']) - tm.assert_frame_equal(p.squeeze(), p['ItemA']) + with catch_warnings(record=True): + p = tm.makePanel().reindex(items=['ItemA']) + tm.assert_frame_equal(p.squeeze(), p['ItemA']) - p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A']) - tm.assert_series_equal(p.squeeze(), p.loc['ItemA', :, 'A']) + p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A']) + tm.assert_series_equal(p.squeeze(), p.loc['ItemA', :, 'A']) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D().reindex(labels=['label1']) tm.assert_panel_equal(p4d.squeeze(), p4d['label1']) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) tm.assert_frame_equal(p4d.squeeze(), p4d.loc['label1', 'ItemA']) # don't fail with 0 length dimensions GH11229 & GH8999 - empty_series = pd.Series([], name='five') - empty_frame = pd.DataFrame([empty_series]) - empty_panel = pd.Panel({'six': empty_frame}) + empty_series = Series([], name='five') + empty_frame = DataFrame([empty_series]) + with catch_warnings(record=True): + empty_panel = Panel({'six': empty_frame}) [tm.assert_series_equal(empty_series, higher_dim.squeeze()) for higher_dim in [empty_series, empty_frame, empty_panel]] @@ -1825,13 +1838,15 @@ def test_transpose(self): tm.assert_series_equal(s.transpose(), s) for df in [tm.makeTimeDataFrame()]: tm.assert_frame_equal(df.transpose().transpose(), df) - for p in [tm.makePanel()]: - tm.assert_panel_equal(p.transpose(2, 0, 1) - .transpose(1, 2, 0), p) - tm.assertRaisesRegexp(TypeError, msg, p.transpose, - 2, 0, 1, axes=(2, 0, 1)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): + for p in [tm.makePanel()]: + tm.assert_panel_equal(p.transpose(2, 0, 1) + .transpose(1, 2, 0), p) + tm.assertRaisesRegexp(TypeError, msg, p.transpose, + 2, 0, 1, axes=(2, 0, 1)) + + with catch_warnings(record=True): for p4d in [tm.makePanel4D()]: tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1) .transpose(1, 3, 0, 2), p4d) @@ -1853,12 +1868,13 @@ def test_numpy_transpose(self): tm.assertRaisesRegexp(ValueError, msg, np.transpose, df, axes=1) - p = tm.makePanel() - tm.assert_panel_equal(np.transpose( - np.transpose(p, axes=(2, 0, 1)), - axes=(1, 2, 0)), p) + with catch_warnings(record=True): + p = tm.makePanel() + tm.assert_panel_equal(np.transpose( + np.transpose(p, axes=(2, 0, 1)), + axes=(1, 2, 0)), p) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): p4d = tm.makePanel4D() tm.assert_panel4d_equal(np.transpose( np.transpose(p4d, axes=(2, 0, 3, 1)), @@ -1880,15 +1896,16 @@ def test_take(self): tm.assert_frame_equal(out, expected) indices = [-3, 2, 0, 1] - for p in [tm.makePanel()]: - out = p.take(indices) - expected = Panel(data=p.values.take(indices, axis=0), - items=p.items.take(indices), - 
major_axis=p.major_axis, - minor_axis=p.minor_axis) - tm.assert_panel_equal(out, expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): + for p in [tm.makePanel()]: + out = p.take(indices) + expected = Panel(data=p.values.take(indices, axis=0), + items=p.items.take(indices), + major_axis=p.major_axis, + minor_axis=p.minor_axis) + tm.assert_panel_equal(out, expected) + + with catch_warnings(record=True): for p4d in [tm.makePanel4D()]: out = p4d.take(indices) expected = Panel4D(data=p4d.values.take(indices, axis=0), @@ -1902,9 +1919,9 @@ def test_take_invalid_kwargs(self): indices = [-3, 2, 0, 1] s = tm.makeFloatSeries() df = tm.makeTimeDataFrame() - p = tm.makePanel() - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): + p = tm.makePanel() p4d = tm.makePanel4D() for obj in (s, df, p, p4d): @@ -2011,8 +2028,9 @@ def test_equals(self): self.assertTrue(e.equals(f)) def test_describe_raises(self): - with tm.assertRaises(NotImplementedError): - tm.makePanel().describe() + with catch_warnings(record=True): + with tm.assertRaises(NotImplementedError): + tm.makePanel().describe() def test_pipe(self): df = DataFrame({'A': [1, 2, 3]}) @@ -2043,15 +2061,16 @@ def test_pipe_tuple_error(self): df.A.pipe((f, 'y'), x=1, y=0) def test_pipe_panel(self): - wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})}) - f = lambda x, y: x + y - result = wp.pipe(f, 2) - expected = wp + 2 - assert_panel_equal(result, expected) - - result = wp.pipe((f, 'y'), x=1) - expected = wp + 1 - assert_panel_equal(result, expected) - - with tm.assertRaises(ValueError): - result = wp.pipe((f, 'y'), x=1, y=1) + with catch_warnings(record=True): + wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})}) + f = lambda x, y: x + y + result = wp.pipe(f, 2) + expected = wp + 2 + assert_panel_equal(result, expected) + + result = wp.pipe((f, 'y'), x=1) + expected = wp + 1 + assert_panel_equal(result, expected) + + with tm.assertRaises(ValueError): + result = wp.pipe((f, 'y'), x=1, y=1) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index ab0322abbcf06..bc7bb8a4dfec1 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -3,7 +3,6 @@ from warnings import catch_warnings from datetime import datetime - import operator import pytest @@ -31,25 +30,37 @@ import pandas.util.testing as tm +def make_test_panel(): + with catch_warnings(record=True): + _panel = tm.makePanel() + tm.add_nans(_panel) + _panel = _panel.copy() + return _panel + + class PanelTests(object): panel = None def test_pickle(self): - unpickled = self.round_trip_pickle(self.panel) - assert_frame_equal(unpickled['ItemA'], self.panel['ItemA']) + with catch_warnings(record=True): + unpickled = self.round_trip_pickle(self.panel) + assert_frame_equal(unpickled['ItemA'], self.panel['ItemA']) def test_rank(self): - self.assertRaises(NotImplementedError, lambda: self.panel.rank()) + with catch_warnings(record=True): + self.assertRaises(NotImplementedError, lambda: self.panel.rank()) def test_cumsum(self): - cumsum = self.panel.cumsum() - assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum()) + with catch_warnings(record=True): + cumsum = self.panel.cumsum() + assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum()) def not_hashable(self): - c_empty = Panel() - c = Panel(Panel([[[1]]])) - self.assertRaises(TypeError, hash, c_empty) - self.assertRaises(TypeError, hash, c) + with catch_warnings(record=True): + c_empty = 
Panel() + c = Panel(Panel([[[1]]])) + self.assertRaises(TypeError, hash, c_empty) + self.assertRaises(TypeError, hash, c) class SafeForLongAndSparse(object): @@ -58,11 +69,12 @@ def test_repr(self): repr(self.panel) def test_copy_names(self): - for attr in ('major_axis', 'minor_axis'): - getattr(self.panel, attr).name = None - cp = self.panel.copy() - getattr(cp, attr).name = 'foo' - self.assertIsNone(getattr(self.panel, attr).name) + with catch_warnings(record=True): + for attr in ('major_axis', 'minor_axis'): + getattr(self.panel, attr).name = None + cp = self.panel.copy() + getattr(cp, attr).name = 'foo' + self.assertIsNone(getattr(self.panel, attr).name) def test_iter(self): tm.equalContents(list(self.panel), self.panel.items) @@ -107,10 +119,6 @@ def this_skew(x): self._check_stat_op('skew', this_skew) - # def test_mad(self): - # f = lambda x: np.abs(x - x.mean()).mean() - # self._check_stat_op('mad', f) - def test_var(self): def alt(x): if len(x) < 2: @@ -239,47 +247,48 @@ def test_get_plane_axes(self): index, columns = self.panel._get_plane_axes(0) def test_truncate(self): - dates = self.panel.major_axis - start, end = dates[1], dates[5] - - trunced = self.panel.truncate(start, end, axis='major') - expected = self.panel['ItemA'].truncate(start, end) + with catch_warnings(record=True): + dates = self.panel.major_axis + start, end = dates[1], dates[5] - assert_frame_equal(trunced['ItemA'], expected) + trunced = self.panel.truncate(start, end, axis='major') + expected = self.panel['ItemA'].truncate(start, end) - trunced = self.panel.truncate(before=start, axis='major') - expected = self.panel['ItemA'].truncate(before=start) + assert_frame_equal(trunced['ItemA'], expected) - assert_frame_equal(trunced['ItemA'], expected) + trunced = self.panel.truncate(before=start, axis='major') + expected = self.panel['ItemA'].truncate(before=start) - trunced = self.panel.truncate(after=end, axis='major') - expected = self.panel['ItemA'].truncate(after=end) + assert_frame_equal(trunced['ItemA'], expected) - assert_frame_equal(trunced['ItemA'], expected) + trunced = self.panel.truncate(after=end, axis='major') + expected = self.panel['ItemA'].truncate(after=end) - # XXX test other axes + assert_frame_equal(trunced['ItemA'], expected) def test_arith(self): - self._test_op(self.panel, operator.add) - self._test_op(self.panel, operator.sub) - self._test_op(self.panel, operator.mul) - self._test_op(self.panel, operator.truediv) - self._test_op(self.panel, operator.floordiv) - self._test_op(self.panel, operator.pow) - - self._test_op(self.panel, lambda x, y: y + x) - self._test_op(self.panel, lambda x, y: y - x) - self._test_op(self.panel, lambda x, y: y * x) - self._test_op(self.panel, lambda x, y: y / x) - self._test_op(self.panel, lambda x, y: y ** x) - - self._test_op(self.panel, lambda x, y: x + y) # panel + 1 - self._test_op(self.panel, lambda x, y: x - y) # panel - 1 - self._test_op(self.panel, lambda x, y: x * y) # panel * 1 - self._test_op(self.panel, lambda x, y: x / y) # panel / 1 - self._test_op(self.panel, lambda x, y: x ** y) # panel ** 1 - - self.assertRaises(Exception, self.panel.__add__, self.panel['ItemA']) + with catch_warnings(record=True): + self._test_op(self.panel, operator.add) + self._test_op(self.panel, operator.sub) + self._test_op(self.panel, operator.mul) + self._test_op(self.panel, operator.truediv) + self._test_op(self.panel, operator.floordiv) + self._test_op(self.panel, operator.pow) + + self._test_op(self.panel, lambda x, y: y + x) + self._test_op(self.panel, lambda x, y: y 
- x) + self._test_op(self.panel, lambda x, y: y * x) + self._test_op(self.panel, lambda x, y: y / x) + self._test_op(self.panel, lambda x, y: y ** x) + + self._test_op(self.panel, lambda x, y: x + y) # panel + 1 + self._test_op(self.panel, lambda x, y: x - y) # panel - 1 + self._test_op(self.panel, lambda x, y: x * y) # panel * 1 + self._test_op(self.panel, lambda x, y: x / y) # panel / 1 + self._test_op(self.panel, lambda x, y: x ** y) # panel ** 1 + + self.assertRaises(Exception, self.panel.__add__, + self.panel['ItemA']) @staticmethod def _test_op(panel, op): @@ -299,92 +308,100 @@ def test_iteritems(self): len(self.panel.items)) def test_combineFrame(self): - def check_op(op, name): - # items - df = self.panel['ItemA'] + with catch_warnings(record=True): + def check_op(op, name): + # items + df = self.panel['ItemA'] - func = getattr(self.panel, name) + func = getattr(self.panel, name) - result = func(df, axis='items') + result = func(df, axis='items') - assert_frame_equal(result['ItemB'], op(self.panel['ItemB'], df)) + assert_frame_equal( + result['ItemB'], op(self.panel['ItemB'], df)) - # major - xs = self.panel.major_xs(self.panel.major_axis[0]) - result = func(xs, axis='major') + # major + xs = self.panel.major_xs(self.panel.major_axis[0]) + result = func(xs, axis='major') - idx = self.panel.major_axis[1] + idx = self.panel.major_axis[1] - assert_frame_equal(result.major_xs(idx), - op(self.panel.major_xs(idx), xs)) + assert_frame_equal(result.major_xs(idx), + op(self.panel.major_xs(idx), xs)) - # minor - xs = self.panel.minor_xs(self.panel.minor_axis[0]) - result = func(xs, axis='minor') + # minor + xs = self.panel.minor_xs(self.panel.minor_axis[0]) + result = func(xs, axis='minor') - idx = self.panel.minor_axis[1] + idx = self.panel.minor_axis[1] - assert_frame_equal(result.minor_xs(idx), - op(self.panel.minor_xs(idx), xs)) + assert_frame_equal(result.minor_xs(idx), + op(self.panel.minor_xs(idx), xs)) - ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow', 'mod'] - if not compat.PY3: - ops.append('div') + ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow', 'mod'] + if not compat.PY3: + ops.append('div') - for op in ops: - try: - check_op(getattr(operator, op), op) - except: - pprint_thing("Failing operation: %r" % op) - raise - if compat.PY3: - try: - check_op(operator.truediv, 'div') - except: - pprint_thing("Failing operation: %r" % 'div') - raise + for op in ops: + try: + check_op(getattr(operator, op), op) + except: + pprint_thing("Failing operation: %r" % op) + raise + if compat.PY3: + try: + check_op(operator.truediv, 'div') + except: + pprint_thing("Failing operation: %r" % 'div') + raise def test_combinePanel(self): - result = self.panel.add(self.panel) - self.assert_panel_equal(result, self.panel * 2) + with catch_warnings(record=True): + result = self.panel.add(self.panel) + assert_panel_equal(result, self.panel * 2) def test_neg(self): - self.assert_panel_equal(-self.panel, self.panel * -1) + with catch_warnings(record=True): + assert_panel_equal(-self.panel, self.panel * -1) # issue 7692 def test_raise_when_not_implemented(self): - p = Panel(np.arange(3 * 4 * 5).reshape(3, 4, 5), - items=['ItemA', 'ItemB', 'ItemC'], - major_axis=pd.date_range('20130101', periods=4), - minor_axis=list('ABCDE')) - d = p.sum(axis=1).iloc[0] - ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'div', 'mod', 'pow'] - for op in ops: - with self.assertRaises(NotImplementedError): - getattr(p, op)(d, axis=0) + with catch_warnings(record=True): + p = Panel(np.arange(3 * 4 * 
5).reshape(3, 4, 5), + items=['ItemA', 'ItemB', 'ItemC'], + major_axis=pd.date_range('20130101', periods=4), + minor_axis=list('ABCDE')) + d = p.sum(axis=1).iloc[0] + ops = ['add', 'sub', 'mul', 'truediv', + 'floordiv', 'div', 'mod', 'pow'] + for op in ops: + with self.assertRaises(NotImplementedError): + getattr(p, op)(d, axis=0) def test_select(self): - p = self.panel + with catch_warnings(record=True): + p = self.panel - # select items - result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') - expected = p.reindex(items=['ItemA', 'ItemC']) - self.assert_panel_equal(result, expected) + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + assert_panel_equal(result, expected) - # select major_axis - result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') - new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] - expected = p.reindex(major=new_major) - self.assert_panel_equal(result, expected) + # select major_axis + result = p.select(lambda x: x >= datetime( + 2000, 1, 15), axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + assert_panel_equal(result, expected) - # select minor_axis - result = p.select(lambda x: x in ('D', 'A'), axis=2) - expected = p.reindex(minor=['A', 'D']) - self.assert_panel_equal(result, expected) + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=2) + expected = p.reindex(minor=['A', 'D']) + assert_panel_equal(result, expected) - # corner case, empty thing - result = p.select(lambda x: x in ('foo', ), axis='items') - self.assert_panel_equal(result, p.reindex(items=[])) + # corner case, empty thing + result = p.select(lambda x: x in ('foo', ), axis='items') + assert_panel_equal(result, p.reindex(items=[])) def test_get_value(self): for item in self.panel.items: @@ -396,27 +413,28 @@ def test_get_value(self): def test_abs(self): - result = self.panel.abs() - result2 = abs(self.panel) - expected = np.abs(self.panel) - self.assert_panel_equal(result, expected) - self.assert_panel_equal(result2, expected) - - df = self.panel['ItemA'] - result = df.abs() - result2 = abs(df) - expected = np.abs(df) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) + with catch_warnings(record=True): + result = self.panel.abs() + result2 = abs(self.panel) + expected = np.abs(self.panel) + assert_panel_equal(result, expected) + assert_panel_equal(result2, expected) - s = df['A'] - result = s.abs() - result2 = abs(s) - expected = np.abs(s) - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - self.assertEqual(result.name, 'A') - self.assertEqual(result2.name, 'A') + df = self.panel['ItemA'] + result = df.abs() + result2 = abs(df) + expected = np.abs(df) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + s = df['A'] + result = s.abs() + result2 = abs(s) + expected = np.abs(s) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + self.assertEqual(result.name, 'A') + self.assertEqual(result2.name, 'A') class CheckIndexing(object): @@ -425,188 +443,200 @@ def test_getitem(self): self.assertRaises(Exception, self.panel.__getitem__, 'ItemQ') def test_delitem_and_pop(self): - expected = self.panel['ItemA'] - result = self.panel.pop('ItemA') - assert_frame_equal(expected, result) - self.assertNotIn('ItemA', self.panel.items) + with catch_warnings(record=True): + expected = 
self.panel['ItemA'] + result = self.panel.pop('ItemA') + assert_frame_equal(expected, result) + self.assertNotIn('ItemA', self.panel.items) - del self.panel['ItemB'] - self.assertNotIn('ItemB', self.panel.items) - self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') + del self.panel['ItemB'] + self.assertNotIn('ItemB', self.panel.items) + self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') - values = np.empty((3, 3, 3)) - values[0] = 0 - values[1] = 1 - values[2] = 2 + values = np.empty((3, 3, 3)) + values[0] = 0 + values[1] = 1 + values[2] = 2 - panel = Panel(values, lrange(3), lrange(3), lrange(3)) + panel = Panel(values, lrange(3), lrange(3), lrange(3)) - # did we delete the right row? + # did we delete the right row? - panelc = panel.copy() - del panelc[0] - assert_frame_equal(panelc[1], panel[1]) - assert_frame_equal(panelc[2], panel[2]) + panelc = panel.copy() + del panelc[0] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[2], panel[2]) - panelc = panel.copy() - del panelc[1] - assert_frame_equal(panelc[0], panel[0]) - assert_frame_equal(panelc[2], panel[2]) + panelc = panel.copy() + del panelc[1] + assert_frame_equal(panelc[0], panel[0]) + assert_frame_equal(panelc[2], panel[2]) - panelc = panel.copy() - del panelc[2] - assert_frame_equal(panelc[1], panel[1]) - assert_frame_equal(panelc[0], panel[0]) + panelc = panel.copy() + del panelc[2] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[0], panel[0]) def test_setitem(self): - # LongPanel with one item - lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() - with tm.assertRaises(ValueError): - self.panel['ItemE'] = lp + with catch_warnings(record=True): - # DataFrame - df = self.panel['ItemA'][2:].filter(items=['A', 'B']) - self.panel['ItemF'] = df - self.panel['ItemE'] = df + # LongPanel with one item + lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() + with tm.assertRaises(ValueError): + self.panel['ItemE'] = lp - df2 = self.panel['ItemF'] + # DataFrame + df = self.panel['ItemA'][2:].filter(items=['A', 'B']) + self.panel['ItemF'] = df + self.panel['ItemE'] = df - assert_frame_equal(df, df2.reindex(index=df.index, columns=df.columns)) + df2 = self.panel['ItemF'] - # scalar - self.panel['ItemG'] = 1 - self.panel['ItemE'] = True - self.assertEqual(self.panel['ItemG'].values.dtype, np.int64) - self.assertEqual(self.panel['ItemE'].values.dtype, np.bool_) + assert_frame_equal(df, df2.reindex( + index=df.index, columns=df.columns)) - # object dtype - self.panel['ItemQ'] = 'foo' - self.assertEqual(self.panel['ItemQ'].values.dtype, np.object_) + # scalar + self.panel['ItemG'] = 1 + self.panel['ItemE'] = True + self.assertEqual(self.panel['ItemG'].values.dtype, np.int64) + self.assertEqual(self.panel['ItemE'].values.dtype, np.bool_) - # boolean dtype - self.panel['ItemP'] = self.panel['ItemA'] > 0 - self.assertEqual(self.panel['ItemP'].values.dtype, np.bool_) + # object dtype + self.panel['ItemQ'] = 'foo' + self.assertEqual(self.panel['ItemQ'].values.dtype, np.object_) - self.assertRaises(TypeError, self.panel.__setitem__, 'foo', - self.panel.loc[['ItemP']]) + # boolean dtype + self.panel['ItemP'] = self.panel['ItemA'] > 0 + self.assertEqual(self.panel['ItemP'].values.dtype, np.bool_) - # bad shape - p = Panel(np.random.randn(4, 3, 2)) - with tm.assertRaisesRegexp(ValueError, - r"shape of value must be \(3, 2\), " - r"shape of given object was \(4, 2\)"): - p[0] = np.random.randn(4, 2) + self.assertRaises(TypeError, self.panel.__setitem__, 'foo', + 
self.panel.loc[['ItemP']]) + + # bad shape + p = Panel(np.random.randn(4, 3, 2)) + with tm.assertRaisesRegexp(ValueError, + r"shape of value must be \(3, 2\), " + r"shape of given object was \(4, 2\)"): + p[0] = np.random.randn(4, 2) def test_setitem_ndarray(self): - timeidx = date_range(start=datetime(2009, 1, 1), - end=datetime(2009, 12, 31), - freq=MonthEnd()) - lons_coarse = np.linspace(-177.5, 177.5, 72) - lats_coarse = np.linspace(-87.5, 87.5, 36) - P = Panel(items=timeidx, major_axis=lons_coarse, - minor_axis=lats_coarse) - data = np.random.randn(72 * 36).reshape((72, 36)) - key = datetime(2009, 2, 28) - P[key] = data - - assert_almost_equal(P[key].values, data) + with catch_warnings(record=True): + timeidx = date_range(start=datetime(2009, 1, 1), + end=datetime(2009, 12, 31), + freq=MonthEnd()) + lons_coarse = np.linspace(-177.5, 177.5, 72) + lats_coarse = np.linspace(-87.5, 87.5, 36) + P = Panel(items=timeidx, major_axis=lons_coarse, + minor_axis=lats_coarse) + data = np.random.randn(72 * 36).reshape((72, 36)) + key = datetime(2009, 2, 28) + P[key] = data + + assert_almost_equal(P[key].values, data) def test_set_minor_major(self): - # GH 11014 - df1 = DataFrame(['a', 'a', 'a', np.nan, 'a', np.nan]) - df2 = DataFrame([1.0, np.nan, 1.0, np.nan, 1.0, 1.0]) - panel = Panel({'Item1': df1, 'Item2': df2}) + with catch_warnings(record=True): + # GH 11014 + df1 = DataFrame(['a', 'a', 'a', np.nan, 'a', np.nan]) + df2 = DataFrame([1.0, np.nan, 1.0, np.nan, 1.0, 1.0]) + panel = Panel({'Item1': df1, 'Item2': df2}) - newminor = notnull(panel.iloc[:, :, 0]) - panel.loc[:, :, 'NewMinor'] = newminor - assert_frame_equal(panel.loc[:, :, 'NewMinor'], - newminor.astype(object)) + newminor = notnull(panel.iloc[:, :, 0]) + panel.loc[:, :, 'NewMinor'] = newminor + assert_frame_equal(panel.loc[:, :, 'NewMinor'], + newminor.astype(object)) - newmajor = notnull(panel.iloc[:, 0, :]) - panel.loc[:, 'NewMajor', :] = newmajor - assert_frame_equal(panel.loc[:, 'NewMajor', :], - newmajor.astype(object)) + newmajor = notnull(panel.iloc[:, 0, :]) + panel.loc[:, 'NewMajor', :] = newmajor + assert_frame_equal(panel.loc[:, 'NewMajor', :], + newmajor.astype(object)) def test_major_xs(self): - ref = self.panel['ItemA'] + with catch_warnings(record=True): + ref = self.panel['ItemA'] - idx = self.panel.major_axis[5] - xs = self.panel.major_xs(idx) + idx = self.panel.major_axis[5] + xs = self.panel.major_xs(idx) - result = xs['ItemA'] - assert_series_equal(result, ref.xs(idx), check_names=False) - self.assertEqual(result.name, 'ItemA') + result = xs['ItemA'] + assert_series_equal(result, ref.xs(idx), check_names=False) + self.assertEqual(result.name, 'ItemA') - # not contained - idx = self.panel.major_axis[0] - BDay() - self.assertRaises(Exception, self.panel.major_xs, idx) + # not contained + idx = self.panel.major_axis[0] - BDay() + self.assertRaises(Exception, self.panel.major_xs, idx) def test_major_xs_mixed(self): - self.panel['ItemD'] = 'foo' - xs = self.panel.major_xs(self.panel.major_axis[0]) - self.assertEqual(xs['ItemA'].dtype, np.float64) - self.assertEqual(xs['ItemD'].dtype, np.object_) + with catch_warnings(record=True): + self.panel['ItemD'] = 'foo' + xs = self.panel.major_xs(self.panel.major_axis[0]) + self.assertEqual(xs['ItemA'].dtype, np.float64) + self.assertEqual(xs['ItemD'].dtype, np.object_) def test_minor_xs(self): - ref = self.panel['ItemA'] + with catch_warnings(record=True): + ref = self.panel['ItemA'] - idx = self.panel.minor_axis[1] - xs = self.panel.minor_xs(idx) + idx = 
self.panel.minor_axis[1] + xs = self.panel.minor_xs(idx) - assert_series_equal(xs['ItemA'], ref[idx], check_names=False) + assert_series_equal(xs['ItemA'], ref[idx], check_names=False) - # not contained - self.assertRaises(Exception, self.panel.minor_xs, 'E') + # not contained + self.assertRaises(Exception, self.panel.minor_xs, 'E') def test_minor_xs_mixed(self): - self.panel['ItemD'] = 'foo' + with catch_warnings(record=True): + self.panel['ItemD'] = 'foo' - xs = self.panel.minor_xs('D') - self.assertEqual(xs['ItemA'].dtype, np.float64) - self.assertEqual(xs['ItemD'].dtype, np.object_) + xs = self.panel.minor_xs('D') + self.assertEqual(xs['ItemA'].dtype, np.float64) + self.assertEqual(xs['ItemD'].dtype, np.object_) def test_xs(self): - itemA = self.panel.xs('ItemA', axis=0) - expected = self.panel['ItemA'] - assert_frame_equal(itemA, expected) + with catch_warnings(record=True): + itemA = self.panel.xs('ItemA', axis=0) + expected = self.panel['ItemA'] + assert_frame_equal(itemA, expected) - # get a view by default - itemA_view = self.panel.xs('ItemA', axis=0) - itemA_view.values[:] = np.nan - self.assertTrue(np.isnan(self.panel['ItemA'].values).all()) + # get a view by default + itemA_view = self.panel.xs('ItemA', axis=0) + itemA_view.values[:] = np.nan + self.assertTrue(np.isnan(self.panel['ItemA'].values).all()) - # mixed-type yields a copy - self.panel['strings'] = 'foo' - result = self.panel.xs('D', axis=2) - self.assertIsNotNone(result.is_copy) + # mixed-type yields a copy + self.panel['strings'] = 'foo' + result = self.panel.xs('D', axis=2) + self.assertIsNotNone(result.is_copy) def test_getitem_fancy_labels(self): - p = self.panel + with catch_warnings(record=True): + p = self.panel - items = p.items[[1, 0]] - dates = p.major_axis[::2] - cols = ['D', 'C', 'F'] + items = p.items[[1, 0]] + dates = p.major_axis[::2] + cols = ['D', 'C', 'F'] - # all 3 specified - assert_panel_equal(p.loc[items, dates, cols], - p.reindex(items=items, major=dates, minor=cols)) + # all 3 specified + assert_panel_equal(p.loc[items, dates, cols], + p.reindex(items=items, major=dates, minor=cols)) - # 2 specified - assert_panel_equal(p.loc[:, dates, cols], - p.reindex(major=dates, minor=cols)) + # 2 specified + assert_panel_equal(p.loc[:, dates, cols], + p.reindex(major=dates, minor=cols)) - assert_panel_equal(p.loc[items, :, cols], - p.reindex(items=items, minor=cols)) + assert_panel_equal(p.loc[items, :, cols], + p.reindex(items=items, minor=cols)) - assert_panel_equal(p.loc[items, dates, :], - p.reindex(items=items, major=dates)) + assert_panel_equal(p.loc[items, dates, :], + p.reindex(items=items, major=dates)) - # only 1 - assert_panel_equal(p.loc[items, :, :], p.reindex(items=items)) + # only 1 + assert_panel_equal(p.loc[items, :, :], p.reindex(items=items)) - assert_panel_equal(p.loc[:, dates, :], p.reindex(major=dates)) + assert_panel_equal(p.loc[:, dates, :], p.reindex(major=dates)) - assert_panel_equal(p.loc[:, :, cols], p.reindex(minor=cols)) + assert_panel_equal(p.loc[:, :, cols], p.reindex(minor=cols)) def test_getitem_fancy_slice(self): pass @@ -646,127 +676,132 @@ def test_getitem_fancy_xs(self): assert_series_equal(p.loc[:, date, col], p.major_xs(date).loc[col]) def test_getitem_fancy_xs_check_view(self): - item = 'ItemB' - date = self.panel.major_axis[5] - - # make sure it's always a view - NS = slice(None, None) - - # DataFrames - comp = assert_frame_equal - self._check_view(item, comp) - self._check_view((item, NS), comp) - self._check_view((item, NS, NS), comp) - self._check_view((NS, 
date), comp) - self._check_view((NS, date, NS), comp) - self._check_view((NS, NS, 'C'), comp) - - # Series - comp = assert_series_equal - self._check_view((item, date), comp) - self._check_view((item, date, NS), comp) - self._check_view((item, NS, 'C'), comp) - self._check_view((NS, date, 'C'), comp) + with catch_warnings(record=True): + item = 'ItemB' + date = self.panel.major_axis[5] + + # make sure it's always a view + NS = slice(None, None) + + # DataFrames + comp = assert_frame_equal + self._check_view(item, comp) + self._check_view((item, NS), comp) + self._check_view((item, NS, NS), comp) + self._check_view((NS, date), comp) + self._check_view((NS, date, NS), comp) + self._check_view((NS, NS, 'C'), comp) + + # Series + comp = assert_series_equal + self._check_view((item, date), comp) + self._check_view((item, date, NS), comp) + self._check_view((item, NS, 'C'), comp) + self._check_view((NS, date, 'C'), comp) def test_getitem_callable(self): - p = self.panel - # GH 12533 + with catch_warnings(record=True): + p = self.panel + # GH 12533 - assert_frame_equal(p[lambda x: 'ItemB'], p.loc['ItemB']) - assert_panel_equal(p[lambda x: ['ItemB', 'ItemC']], - p.loc[['ItemB', 'ItemC']]) + assert_frame_equal(p[lambda x: 'ItemB'], p.loc['ItemB']) + assert_panel_equal(p[lambda x: ['ItemB', 'ItemC']], + p.loc[['ItemB', 'ItemC']]) def test_ix_setitem_slice_dataframe(self): - a = Panel(items=[1, 2, 3], major_axis=[11, 22, 33], - minor_axis=[111, 222, 333]) - b = DataFrame(np.random.randn(2, 3), index=[111, 333], - columns=[1, 2, 3]) + with catch_warnings(record=True): + a = Panel(items=[1, 2, 3], major_axis=[11, 22, 33], + minor_axis=[111, 222, 333]) + b = DataFrame(np.random.randn(2, 3), index=[111, 333], + columns=[1, 2, 3]) - a.loc[:, 22, [111, 333]] = b + a.loc[:, 22, [111, 333]] = b - assert_frame_equal(a.loc[:, 22, [111, 333]], b) + assert_frame_equal(a.loc[:, 22, [111, 333]], b) def test_ix_align(self): - from pandas import Series - b = Series(np.random.randn(10), name=0) - b.sort_values() - df_orig = Panel(np.random.randn(3, 10, 2)) - df = df_orig.copy() + with catch_warnings(record=True): + from pandas import Series + b = Series(np.random.randn(10), name=0) + b.sort_values() + df_orig = Panel(np.random.randn(3, 10, 2)) + df = df_orig.copy() - df.loc[0, :, 0] = b - assert_series_equal(df.loc[0, :, 0].reindex(b.index), b) + df.loc[0, :, 0] = b + assert_series_equal(df.loc[0, :, 0].reindex(b.index), b) - df = df_orig.swapaxes(0, 1) - df.loc[:, 0, 0] = b - assert_series_equal(df.loc[:, 0, 0].reindex(b.index), b) + df = df_orig.swapaxes(0, 1) + df.loc[:, 0, 0] = b + assert_series_equal(df.loc[:, 0, 0].reindex(b.index), b) - df = df_orig.swapaxes(1, 2) - df.loc[0, 0, :] = b - assert_series_equal(df.loc[0, 0, :].reindex(b.index), b) + df = df_orig.swapaxes(1, 2) + df.loc[0, 0, :] = b + assert_series_equal(df.loc[0, 0, :].reindex(b.index), b) def test_ix_frame_align(self): - p_orig = tm.makePanel() - df = p_orig.iloc[0].copy() - assert_frame_equal(p_orig['ItemA'], df) + with catch_warnings(record=True): + p_orig = tm.makePanel() + df = p_orig.iloc[0].copy() + assert_frame_equal(p_orig['ItemA'], df) - p = p_orig.copy() - p.iloc[0, :, :] = df - assert_panel_equal(p, p_orig) + p = p_orig.copy() + p.iloc[0, :, :] = df + assert_panel_equal(p, p_orig) - p = p_orig.copy() - p.iloc[0] = df - assert_panel_equal(p, p_orig) + p = p_orig.copy() + p.iloc[0] = df + assert_panel_equal(p, p_orig) - p = p_orig.copy() - p.iloc[0, :, :] = df - assert_panel_equal(p, p_orig) + p = p_orig.copy() + p.iloc[0, :, :] = df + 
assert_panel_equal(p, p_orig)

-        p = p_orig.copy()
-        p.iloc[0] = df
-        assert_panel_equal(p, p_orig)
+            p = p_orig.copy()
+            p.iloc[0] = df
+            assert_panel_equal(p, p_orig)

-        p = p_orig.copy()
-        p.loc['ItemA'] = df
-        assert_panel_equal(p, p_orig)
+            p = p_orig.copy()
+            p.loc['ItemA'] = df
+            assert_panel_equal(p, p_orig)

-        p = p_orig.copy()
-        p.loc['ItemA', :, :] = df
-        assert_panel_equal(p, p_orig)
+            p = p_orig.copy()
+            p.loc['ItemA', :, :] = df
+            assert_panel_equal(p, p_orig)

-        p = p_orig.copy()
-        p['ItemA'] = df
-        assert_panel_equal(p, p_orig)
+            p = p_orig.copy()
+            p['ItemA'] = df
+            assert_panel_equal(p, p_orig)

-        p = p_orig.copy()
-        p.iloc[0, [0, 1, 3, 5], -2:] = df
-        out = p.iloc[0, [0, 1, 3, 5], -2:]
-        assert_frame_equal(out, df.iloc[[0, 1, 3, 5], [2, 3]])
+            p = p_orig.copy()
+            p.iloc[0, [0, 1, 3, 5], -2:] = df
+            out = p.iloc[0, [0, 1, 3, 5], -2:]
+            assert_frame_equal(out, df.iloc[[0, 1, 3, 5], [2, 3]])

-        # GH3830, panel assignent by values/frame
-        for dtype in ['float64', 'int64']:
+            # GH3830, panel assignment by values/frame
+            for dtype in ['float64', 'int64']:

-            panel = Panel(np.arange(40).reshape((2, 4, 5)),
-                          items=['a1', 'a2'], dtype=dtype)
-            df1 = panel.iloc[0]
-            df2 = panel.iloc[1]
+                panel = Panel(np.arange(40).reshape((2, 4, 5)),
+                              items=['a1', 'a2'], dtype=dtype)
+                df1 = panel.iloc[0]
+                df2 = panel.iloc[1]

-            tm.assert_frame_equal(panel.loc['a1'], df1)
-            tm.assert_frame_equal(panel.loc['a2'], df2)
+                tm.assert_frame_equal(panel.loc['a1'], df1)
+                tm.assert_frame_equal(panel.loc['a2'], df2)

-            # Assignment by Value Passes for 'a2'
-            panel.loc['a2'] = df1.values
-            tm.assert_frame_equal(panel.loc['a1'], df1)
-            tm.assert_frame_equal(panel.loc['a2'], df1)
+                # Assignment by Value Passes for 'a2'
+                panel.loc['a2'] = df1.values
+                tm.assert_frame_equal(panel.loc['a1'], df1)
+                tm.assert_frame_equal(panel.loc['a2'], df1)

-            # Assignment by DataFrame Ok w/o loc 'a2'
-            panel['a2'] = df2
-            tm.assert_frame_equal(panel.loc['a1'], df1)
-            tm.assert_frame_equal(panel.loc['a2'], df2)
+                # Assignment by DataFrame Ok w/o loc 'a2'
+                panel['a2'] = df2
+                tm.assert_frame_equal(panel.loc['a1'], df1)
+                tm.assert_frame_equal(panel.loc['a2'], df2)

-            # Assignment by DataFrame Fails for 'a2'
-            panel.loc['a2'] = df2
-            tm.assert_frame_equal(panel.loc['a1'], df1)
-            tm.assert_frame_equal(panel.loc['a2'], df2)
+                # Assignment by DataFrame Fails for 'a2'
+                panel.loc['a2'] = df2
+                tm.assert_frame_equal(panel.loc['a1'], df1)
+                tm.assert_frame_equal(panel.loc['a2'], df2)

     def _check_view(self, indexer, comp):
         cp = self.panel.copy()
@@ -776,57 +811,60 @@ def _check_view(self, indexer, comp):
         comp(cp.loc[indexer].reindex_like(obj), obj)

     def test_logical_with_nas(self):
-        d = Panel({'ItemA': {'a': [np.nan, False]},
-                   'ItemB': {'a': [True, True]}})
+        with catch_warnings(record=True):
+            d = Panel({'ItemA': {'a': [np.nan, False]},
+                       'ItemB': {'a': [True, True]}})

-        result = d['ItemA'] | d['ItemB']
-        expected = DataFrame({'a': [np.nan, True]})
-        assert_frame_equal(result, expected)
+            result = d['ItemA'] | d['ItemB']
+            expected = DataFrame({'a': [np.nan, True]})
+            assert_frame_equal(result, expected)

-        # this is autodowncasted here
-        result = d['ItemA'].fillna(False) | d['ItemB']
-        expected = DataFrame({'a': [True, True]})
-        assert_frame_equal(result, expected)
+            # this is autodowncasted here
+            result = d['ItemA'].fillna(False) | d['ItemB']
+            expected = DataFrame({'a': [True, True]})
+            assert_frame_equal(result, expected)

     def test_neg(self):
-        # what to do?
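# --- editor's sketch, not part of the upstream patch -----------------------
# The catch_warnings(record=True) wrapper added around every test body in
# this file records any warning raised inside the block -- such as the
# deprecation warnings these Panel tests now emit -- into a list instead of
# letting it reach the test runner. A minimal standalone illustration of
# the mechanism, assuming only the standard library:
import warnings

def _noisy():
    # stands in for any call that emits a DeprecationWarning
    warnings.warn("Panel is deprecated", DeprecationWarning)
    return 42

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # make sure the warning actually fires
    assert _noisy() == 42            # runs without surfacing the warning
assert issubclass(caught[-1].category, DeprecationWarning)
# ----------------------------------------------------------------------------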
- assert_panel_equal(-self.panel, -1 * self.panel) + with catch_warnings(record=True): + assert_panel_equal(-self.panel, -1 * self.panel) def test_invert(self): - assert_panel_equal(-(self.panel < 0), ~(self.panel < 0)) + with catch_warnings(record=True): + assert_panel_equal(-(self.panel < 0), ~(self.panel < 0)) def test_comparisons(self): - p1 = tm.makePanel() - p2 = tm.makePanel() + with catch_warnings(record=True): + p1 = tm.makePanel() + p2 = tm.makePanel() - tp = p1.reindex(items=p1.items + ['foo']) - df = p1[p1.items[0]] + tp = p1.reindex(items=p1.items + ['foo']) + df = p1[p1.items[0]] - def test_comp(func): + def test_comp(func): - # versus same index - result = func(p1, p2) - self.assert_numpy_array_equal(result.values, - func(p1.values, p2.values)) + # versus same index + result = func(p1, p2) + self.assert_numpy_array_equal(result.values, + func(p1.values, p2.values)) - # versus non-indexed same objs - self.assertRaises(Exception, func, p1, tp) + # versus non-indexed same objs + self.assertRaises(Exception, func, p1, tp) - # versus different objs - self.assertRaises(Exception, func, p1, df) + # versus different objs + self.assertRaises(Exception, func, p1, df) - # versus scalar - result3 = func(self.panel, 0) - self.assert_numpy_array_equal(result3.values, - func(self.panel.values, 0)) + # versus scalar + result3 = func(self.panel, 0) + self.assert_numpy_array_equal(result3.values, + func(self.panel.values, 0)) - with np.errstate(invalid='ignore'): - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) + with np.errstate(invalid='ignore'): + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_get_value(self): for item in self.panel.items: @@ -840,28 +878,26 @@ def test_get_value(self): self.panel.get_value('a') def test_set_value(self): - for item in self.panel.items: - for mjr in self.panel.major_axis[::2]: - for mnr in self.panel.minor_axis: - self.panel.set_value(item, mjr, mnr, 1.) - assert_almost_equal(self.panel[item][mnr][mjr], 1.) - - # resize - res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) - tm.assertIsInstance(res, Panel) - self.assertIsNot(res, self.panel) - self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) - - res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) - self.assertTrue(is_float_dtype(res3['ItemE'].values)) - with tm.assertRaisesRegexp(TypeError, - "There must be an argument for each axis" - " plus the value provided"): - self.panel.set_value('a') - - -_panel = tm.makePanel() -tm.add_nans(_panel) + with catch_warnings(record=True): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + self.panel.set_value(item, mjr, mnr, 1.) + assert_almost_equal(self.panel[item][mnr][mjr], 1.) 
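# --- editor's sketch, not part of the upstream patch -----------------------
# record=True silences whatever fires inside the block; when a test should
# instead *assert* that a deprecation is raised, the same context manager
# works the other way around. A small illustrative helper along those lines
# (assert_deprecated is a hypothetical name, not a pandas API):
import warnings

def assert_deprecated(func, *args, **kwargs):
    # call func and fail unless it emitted a DeprecationWarning
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = func(*args, **kwargs)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught), \
        "expected a DeprecationWarning"
    return result
# ----------------------------------------------------------------------------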
+ + # resize + res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) + tm.assertIsInstance(res, Panel) + self.assertIsNot(res, self.panel) + self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) + self.assertTrue(is_float_dtype(res3['ItemE'].values)) + with tm.assertRaisesRegexp(TypeError, + "There must be an argument " + "for each axis" + " plus the value provided"): + self.panel.set_value('a') class TestPanel(tm.TestCase, PanelTests, CheckIndexing, SafeForLongAndSparse, @@ -872,292 +908,315 @@ def assert_panel_equal(cls, x, y): assert_panel_equal(x, y) def setUp(self): - self.panel = _panel.copy() + self.panel = make_test_panel() self.panel.major_axis.name = None self.panel.minor_axis.name = None self.panel.items.name = None def test_constructor(self): - # with BlockManager - wp = Panel(self.panel._data) - self.assertIs(wp._data, self.panel._data) - - wp = Panel(self.panel._data, copy=True) - self.assertIsNot(wp._data, self.panel._data) - assert_panel_equal(wp, self.panel) - - # strings handled prop - wp = Panel([[['foo', 'foo', 'foo', ], ['foo', 'foo', 'foo']]]) - self.assertEqual(wp.values.dtype, np.object_) - - vals = self.panel.values - - # no copy - wp = Panel(vals) - self.assertIs(wp.values, vals) - - # copy - wp = Panel(vals, copy=True) - self.assertIsNot(wp.values, vals) - - # GH #8285, test when scalar data is used to construct a Panel - # if dtype is not passed, it should be inferred - value_and_dtype = [(1, 'int64'), (3.14, 'float64'), - ('foo', np.object_)] - for (val, dtype) in value_and_dtype: - wp = Panel(val, items=range(2), major_axis=range(3), - minor_axis=range(4)) - vals = np.empty((2, 3, 4), dtype=dtype) - vals.fill(val) - assert_panel_equal(wp, Panel(vals, dtype=dtype)) - - # test the case when dtype is passed - wp = Panel(1, items=range(2), major_axis=range(3), minor_axis=range(4), - dtype='float32') - vals = np.empty((2, 3, 4), dtype='float32') - vals.fill(1) - assert_panel_equal(wp, Panel(vals, dtype='float32')) + with catch_warnings(record=True): + # with BlockManager + wp = Panel(self.panel._data) + self.assertIs(wp._data, self.panel._data) + + wp = Panel(self.panel._data, copy=True) + self.assertIsNot(wp._data, self.panel._data) + assert_panel_equal(wp, self.panel) + + # strings handled prop + wp = Panel([[['foo', 'foo', 'foo', ], ['foo', 'foo', 'foo']]]) + self.assertEqual(wp.values.dtype, np.object_) + + vals = self.panel.values + + # no copy + wp = Panel(vals) + self.assertIs(wp.values, vals) + + # copy + wp = Panel(vals, copy=True) + self.assertIsNot(wp.values, vals) + + # GH #8285, test when scalar data is used to construct a Panel + # if dtype is not passed, it should be inferred + value_and_dtype = [(1, 'int64'), (3.14, 'float64'), + ('foo', np.object_)] + for (val, dtype) in value_and_dtype: + wp = Panel(val, items=range(2), major_axis=range(3), + minor_axis=range(4)) + vals = np.empty((2, 3, 4), dtype=dtype) + vals.fill(val) + assert_panel_equal(wp, Panel(vals, dtype=dtype)) + + # test the case when dtype is passed + wp = Panel(1, items=range(2), major_axis=range(3), + minor_axis=range(4), + dtype='float32') + vals = np.empty((2, 3, 4), dtype='float32') + vals.fill(1) + assert_panel_equal(wp, Panel(vals, dtype='float32')) def test_constructor_cast(self): - zero_filled = self.panel.fillna(0) + with catch_warnings(record=True): + zero_filled = self.panel.fillna(0) - casted = Panel(zero_filled._data, dtype=int) - casted2 = Panel(zero_filled.values, dtype=int) + casted = 
Panel(zero_filled._data, dtype=int) + casted2 = Panel(zero_filled.values, dtype=int) - exp_values = zero_filled.values.astype(int) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - casted = Panel(zero_filled._data, dtype=np.int32) - casted2 = Panel(zero_filled.values, dtype=np.int32) + casted = Panel(zero_filled._data, dtype=np.int32) + casted2 = Panel(zero_filled.values, dtype=np.int32) - exp_values = zero_filled.values.astype(np.int32) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + exp_values = zero_filled.values.astype(np.int32) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - # can't cast - data = [[['foo', 'bar', 'baz']]] - self.assertRaises(ValueError, Panel, data, dtype=float) + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) def test_constructor_empty_panel(self): - empty = Panel() - self.assertEqual(len(empty.items), 0) - self.assertEqual(len(empty.major_axis), 0) - self.assertEqual(len(empty.minor_axis), 0) + with catch_warnings(record=True): + empty = Panel() + self.assertEqual(len(empty.items), 0) + self.assertEqual(len(empty.major_axis), 0) + self.assertEqual(len(empty.minor_axis), 0) def test_constructor_observe_dtype(self): - # GH #411 - panel = Panel(items=lrange(3), major_axis=lrange(3), - minor_axis=lrange(3), dtype='O') - self.assertEqual(panel.values.dtype, np.object_) + with catch_warnings(record=True): + # GH #411 + panel = Panel(items=lrange(3), major_axis=lrange(3), + minor_axis=lrange(3), dtype='O') + self.assertEqual(panel.values.dtype, np.object_) def test_constructor_dtypes(self): - # GH #797 - - def _check_dtype(panel, dtype): - for i in panel.items: - self.assertEqual(panel[i].values.dtype.name, dtype) - - # only nan holding types allowed here - for dtype in ['float64', 'float32', 'object']: - panel = Panel(items=lrange(2), major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel(np.array(np.random.randn(2, 10, 5), dtype=dtype), - items=lrange(2), - major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel(np.array(np.random.randn(2, 10, 5), dtype='O'), - items=lrange(2), - major_axis=lrange(10), - minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - panel = Panel(np.random.randn(2, 10, 5), items=lrange( - 2), major_axis=lrange(10), minor_axis=lrange(5), dtype=dtype) - _check_dtype(panel, dtype) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: - df1 = DataFrame(np.random.randn(2, 5), - index=lrange(2), columns=lrange(5)) - df2 = DataFrame(np.random.randn(2, 5), - index=lrange(2), columns=lrange(5)) - panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype) - _check_dtype(panel, dtype) + with catch_warnings(record=True): + # GH #797 + + def _check_dtype(panel, dtype): + for i in panel.items: + self.assertEqual(panel[i].values.dtype.name, dtype) + + # only nan holding types allowed here + for dtype in ['float64', 'float32', 'object']: + panel = Panel(items=lrange(2), 
major_axis=lrange(10), + minor_axis=lrange(5), dtype=dtype) + _check_dtype(panel, dtype) + + for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: + panel = Panel(np.array(np.random.randn(2, 10, 5), dtype=dtype), + items=lrange(2), + major_axis=lrange(10), + minor_axis=lrange(5), dtype=dtype) + _check_dtype(panel, dtype) + + for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: + panel = Panel(np.array(np.random.randn(2, 10, 5), dtype='O'), + items=lrange(2), + major_axis=lrange(10), + minor_axis=lrange(5), dtype=dtype) + _check_dtype(panel, dtype) + + for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: + panel = Panel( + np.random.randn(2, 10, 5), + items=lrange(2), major_axis=lrange(10), + minor_axis=lrange(5), + dtype=dtype) + _check_dtype(panel, dtype) + + for dtype in ['float64', 'float32', 'int64', 'int32', 'object']: + df1 = DataFrame(np.random.randn(2, 5), + index=lrange(2), columns=lrange(5)) + df2 = DataFrame(np.random.randn(2, 5), + index=lrange(2), columns=lrange(5)) + panel = Panel.from_dict({'a': df1, 'b': df2}, dtype=dtype) + _check_dtype(panel, dtype) def test_constructor_fails_with_not_3d_input(self): - with tm.assertRaisesRegexp(ValueError, - "The number of dimensions required is 3"): - Panel(np.random.randn(10, 2)) + with catch_warnings(record=True): + with tm.assertRaisesRegexp(ValueError, "The number of dimensions required is 3"): # noqa + Panel(np.random.randn(10, 2)) def test_consolidate(self): - self.assertTrue(self.panel._data.is_consolidated()) + with catch_warnings(record=True): + self.assertTrue(self.panel._data.is_consolidated()) - self.panel['foo'] = 1. - self.assertFalse(self.panel._data.is_consolidated()) + self.panel['foo'] = 1. + self.assertFalse(self.panel._data.is_consolidated()) - panel = self.panel._consolidate() - self.assertTrue(panel._data.is_consolidated()) + panel = self.panel._consolidate() + self.assertTrue(panel._data.is_consolidated()) def test_ctor_dict(self): - itema = self.panel['ItemA'] - itemb = self.panel['ItemB'] + with catch_warnings(record=True): + itema = self.panel['ItemA'] + itemb = self.panel['ItemB'] - d = {'A': itema, 'B': itemb[5:]} - d2 = {'A': itema._series, 'B': itemb[5:]._series} - d3 = {'A': None, - 'B': DataFrame(itemb[5:]._series), - 'C': DataFrame(itema._series)} + d = {'A': itema, 'B': itemb[5:]} + d2 = {'A': itema._series, 'B': itemb[5:]._series} + d3 = {'A': None, + 'B': DataFrame(itemb[5:]._series), + 'C': DataFrame(itema._series)} - wp = Panel.from_dict(d) - wp2 = Panel.from_dict(d2) # nested Dict + wp = Panel.from_dict(d) + wp2 = Panel.from_dict(d2) # nested Dict - # TODO: unused? - wp3 = Panel.from_dict(d3) # noqa + # TODO: unused? 
+ wp3 = Panel.from_dict(d3) # noqa - self.assert_index_equal(wp.major_axis, self.panel.major_axis) - assert_panel_equal(wp, wp2) + self.assert_index_equal(wp.major_axis, self.panel.major_axis) + assert_panel_equal(wp, wp2) - # intersect - wp = Panel.from_dict(d, intersect=True) - self.assert_index_equal(wp.major_axis, itemb.index[5:]) + # intersect + wp = Panel.from_dict(d, intersect=True) + self.assert_index_equal(wp.major_axis, itemb.index[5:]) - # use constructor - assert_panel_equal(Panel(d), Panel.from_dict(d)) - assert_panel_equal(Panel(d2), Panel.from_dict(d2)) - assert_panel_equal(Panel(d3), Panel.from_dict(d3)) + # use constructor + assert_panel_equal(Panel(d), Panel.from_dict(d)) + assert_panel_equal(Panel(d2), Panel.from_dict(d2)) + assert_panel_equal(Panel(d3), Panel.from_dict(d3)) - # a pathological case - d4 = {'A': None, 'B': None} + # a pathological case + d4 = {'A': None, 'B': None} - # TODO: unused? - wp4 = Panel.from_dict(d4) # noqa + # TODO: unused? + wp4 = Panel.from_dict(d4) # noqa - assert_panel_equal(Panel(d4), Panel(items=['A', 'B'])) + assert_panel_equal(Panel(d4), Panel(items=['A', 'B'])) - # cast - dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) - for k, v in compat.iteritems(d)) - result = Panel(dcasted, dtype=int) - expected = Panel(dict((k, v.astype(int)) - for k, v in compat.iteritems(dcasted))) - assert_panel_equal(result, expected) + # cast + dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) + for k, v in compat.iteritems(d)) + result = Panel(dcasted, dtype=int) + expected = Panel(dict((k, v.astype(int)) + for k, v in compat.iteritems(dcasted))) + assert_panel_equal(result, expected) - result = Panel(dcasted, dtype=np.int32) - expected = Panel(dict((k, v.astype(np.int32)) - for k, v in compat.iteritems(dcasted))) - assert_panel_equal(result, expected) + result = Panel(dcasted, dtype=np.int32) + expected = Panel(dict((k, v.astype(np.int32)) + for k, v in compat.iteritems(dcasted))) + assert_panel_equal(result, expected) def test_constructor_dict_mixed(self): - data = dict((k, v.values) for k, v in self.panel.iteritems()) - result = Panel(data) - exp_major = Index(np.arange(len(self.panel.major_axis))) - self.assert_index_equal(result.major_axis, exp_major) + with catch_warnings(record=True): + data = dict((k, v.values) for k, v in self.panel.iteritems()) + result = Panel(data) + exp_major = Index(np.arange(len(self.panel.major_axis))) + self.assert_index_equal(result.major_axis, exp_major) - result = Panel(data, items=self.panel.items, - major_axis=self.panel.major_axis, - minor_axis=self.panel.minor_axis) - assert_panel_equal(result, self.panel) + result = Panel(data, items=self.panel.items, + major_axis=self.panel.major_axis, + minor_axis=self.panel.minor_axis) + assert_panel_equal(result, self.panel) - data['ItemC'] = self.panel['ItemC'] - result = Panel(data) - assert_panel_equal(result, self.panel) + data['ItemC'] = self.panel['ItemC'] + result = Panel(data) + assert_panel_equal(result, self.panel) - # corner, blow up - data['ItemB'] = data['ItemB'][:-1] - self.assertRaises(Exception, Panel, data) + # corner, blow up + data['ItemB'] = data['ItemB'][:-1] + self.assertRaises(Exception, Panel, data) - data['ItemB'] = self.panel['ItemB'].values[:, :-1] - self.assertRaises(Exception, Panel, data) + data['ItemB'] = self.panel['ItemB'].values[:, :-1] + self.assertRaises(Exception, Panel, data) def test_ctor_orderedDict(self): - keys = list(set(np.random.randint(0, 5000, 100)))[ - :50] # unique random int keys - d = OrderedDict([(k, mkdf(10, 5)) 
for k in keys]) - p = Panel(d) - self.assertTrue(list(p.items) == keys) + with catch_warnings(record=True): + keys = list(set(np.random.randint(0, 5000, 100)))[ + :50] # unique random int keys + d = OrderedDict([(k, mkdf(10, 5)) for k in keys]) + p = Panel(d) + self.assertTrue(list(p.items) == keys) - p = Panel.from_dict(d) - self.assertTrue(list(p.items) == keys) + p = Panel.from_dict(d) + self.assertTrue(list(p.items) == keys) def test_constructor_resize(self): - data = self.panel._data - items = self.panel.items[:-1] - major = self.panel.major_axis[:-1] - minor = self.panel.minor_axis[:-1] - - result = Panel(data, items=items, major_axis=major, minor_axis=minor) - expected = self.panel.reindex(items=items, major=major, minor=minor) - assert_panel_equal(result, expected) + with catch_warnings(record=True): + data = self.panel._data + items = self.panel.items[:-1] + major = self.panel.major_axis[:-1] + minor = self.panel.minor_axis[:-1] + + result = Panel(data, items=items, + major_axis=major, minor_axis=minor) + expected = self.panel.reindex( + items=items, major=major, minor=minor) + assert_panel_equal(result, expected) - result = Panel(data, items=items, major_axis=major) - expected = self.panel.reindex(items=items, major=major) - assert_panel_equal(result, expected) + result = Panel(data, items=items, major_axis=major) + expected = self.panel.reindex(items=items, major=major) + assert_panel_equal(result, expected) - result = Panel(data, items=items) - expected = self.panel.reindex(items=items) - assert_panel_equal(result, expected) + result = Panel(data, items=items) + expected = self.panel.reindex(items=items) + assert_panel_equal(result, expected) - result = Panel(data, minor_axis=minor) - expected = self.panel.reindex(minor=minor) - assert_panel_equal(result, expected) + result = Panel(data, minor_axis=minor) + expected = self.panel.reindex(minor=minor) + assert_panel_equal(result, expected) def test_from_dict_mixed_orient(self): - df = tm.makeDataFrame() - df['foo'] = 'bar' + with catch_warnings(record=True): + df = tm.makeDataFrame() + df['foo'] = 'bar' - data = {'k1': df, 'k2': df} + data = {'k1': df, 'k2': df} - panel = Panel.from_dict(data, orient='minor') + panel = Panel.from_dict(data, orient='minor') - self.assertEqual(panel['foo'].values.dtype, np.object_) - self.assertEqual(panel['A'].values.dtype, np.float64) + self.assertEqual(panel['foo'].values.dtype, np.object_) + self.assertEqual(panel['A'].values.dtype, np.float64) def test_constructor_error_msgs(self): - def testit(): - Panel(np.random.randn(3, 4, 5), lrange(4), lrange(5), lrange(5)) + with catch_warnings(record=True): + def testit(): + Panel(np.random.randn(3, 4, 5), + lrange(4), lrange(5), lrange(5)) - assertRaisesRegexp(ValueError, - r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(4, 5, 5\)", - testit) + assertRaisesRegexp(ValueError, + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(4, 5, 5\)", + testit) - def testit(): - Panel(np.random.randn(3, 4, 5), lrange(5), lrange(4), lrange(5)) + def testit(): + Panel(np.random.randn(3, 4, 5), + lrange(5), lrange(4), lrange(5)) - assertRaisesRegexp(ValueError, - r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(5, 4, 5\)", - testit) + assertRaisesRegexp(ValueError, + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(5, 4, 5\)", + testit) - def testit(): - Panel(np.random.randn(3, 4, 5), lrange(5), lrange(5), lrange(4)) + def testit(): + Panel(np.random.randn(3, 4, 5), + lrange(5), lrange(5), lrange(4)) - 
assertRaisesRegexp(ValueError, - r"Shape of passed values is \(3, 4, 5\), " - r"indices imply \(5, 5, 4\)", - testit) + assertRaisesRegexp(ValueError, + r"Shape of passed values is \(3, 4, 5\), " + r"indices imply \(5, 5, 4\)", + testit) def test_conform(self): - df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) - conformed = self.panel.conform(df) + with catch_warnings(record=True): + df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) + conformed = self.panel.conform(df) - tm.assert_index_equal(conformed.index, self.panel.major_axis) - tm.assert_index_equal(conformed.columns, self.panel.minor_axis) + tm.assert_index_equal(conformed.index, self.panel.major_axis) + tm.assert_index_equal(conformed.columns, self.panel.minor_axis) def test_convert_objects(self): + with catch_warnings(record=True): - # GH 4937 - p = Panel(dict(A=dict(a=['1', '1.0']))) - expected = Panel(dict(A=dict(a=[1, 1.0]))) - result = p._convert(numeric=True, coerce=True) - assert_panel_equal(result, expected) + # GH 4937 + p = Panel(dict(A=dict(a=['1', '1.0']))) + expected = Panel(dict(A=dict(a=[1, 1.0]))) + result = p._convert(numeric=True, coerce=True) + assert_panel_equal(result, expected) def test_dtypes(self): @@ -1166,875 +1225,940 @@ def test_dtypes(self): assert_series_equal(result, expected) def test_astype(self): - # GH7271 - data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) - panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + with catch_warnings(record=True): + # GH7271 + data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f']) - str_data = np.array([[['1', '2'], ['3', '4']], - [['5', '6'], ['7', '8']]]) - expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f']) - assert_panel_equal(panel.astype(str), expected) + str_data = np.array([[['1', '2'], ['3', '4']], + [['5', '6'], ['7', '8']]]) + expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + assert_panel_equal(panel.astype(str), expected) - self.assertRaises(NotImplementedError, panel.astype, {0: str}) + self.assertRaises(NotImplementedError, panel.astype, {0: str}) def test_apply(self): - # GH1148 - - # ufunc - applied = self.panel.apply(np.sqrt) - with np.errstate(invalid='ignore'): - expected = np.sqrt(self.panel.values) - assert_almost_equal(applied.values, expected) - - # ufunc same shape - result = self.panel.apply(lambda x: x * 2, axis='items') - expected = self.panel * 2 - assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x * 2, axis='major_axis') - expected = self.panel * 2 - assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x * 2, axis='minor_axis') - expected = self.panel * 2 - assert_panel_equal(result, expected) - - # reduction to DataFrame - result = self.panel.apply(lambda x: x.dtype, axis='items') - expected = DataFrame(np.dtype('float64'), index=self.panel.major_axis, - columns=self.panel.minor_axis) - assert_frame_equal(result, expected) - result = self.panel.apply(lambda x: x.dtype, axis='major_axis') - expected = DataFrame(np.dtype('float64'), index=self.panel.minor_axis, - columns=self.panel.items) - assert_frame_equal(result, expected) - result = self.panel.apply(lambda x: x.dtype, axis='minor_axis') - expected = DataFrame(np.dtype('float64'), index=self.panel.major_axis, - columns=self.panel.items) - assert_frame_equal(result, expected) + with catch_warnings(record=True): + # GH1148 - # reductions via other dims - expected = self.panel.sum(0) - result = self.panel.apply(lambda x: x.sum(), 
axis='items') - assert_frame_equal(result, expected) - expected = self.panel.sum(1) - result = self.panel.apply(lambda x: x.sum(), axis='major_axis') - assert_frame_equal(result, expected) - expected = self.panel.sum(2) - result = self.panel.apply(lambda x: x.sum(), axis='minor_axis') - assert_frame_equal(result, expected) + # ufunc + applied = self.panel.apply(np.sqrt) + with np.errstate(invalid='ignore'): + expected = np.sqrt(self.panel.values) + assert_almost_equal(applied.values, expected) - # pass kwargs - result = self.panel.apply(lambda x, y: x.sum() + y, axis='items', y=5) - expected = self.panel.sum(0) + 5 - assert_frame_equal(result, expected) + # ufunc same shape + result = self.panel.apply(lambda x: x * 2, axis='items') + expected = self.panel * 2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, axis='major_axis') + expected = self.panel * 2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, axis='minor_axis') + expected = self.panel * 2 + assert_panel_equal(result, expected) + + # reduction to DataFrame + result = self.panel.apply(lambda x: x.dtype, axis='items') + expected = DataFrame(np.dtype('float64'), + index=self.panel.major_axis, + columns=self.panel.minor_axis) + assert_frame_equal(result, expected) + result = self.panel.apply(lambda x: x.dtype, axis='major_axis') + expected = DataFrame(np.dtype('float64'), + index=self.panel.minor_axis, + columns=self.panel.items) + assert_frame_equal(result, expected) + result = self.panel.apply(lambda x: x.dtype, axis='minor_axis') + expected = DataFrame(np.dtype('float64'), + index=self.panel.major_axis, + columns=self.panel.items) + assert_frame_equal(result, expected) + + # reductions via other dims + expected = self.panel.sum(0) + result = self.panel.apply(lambda x: x.sum(), axis='items') + assert_frame_equal(result, expected) + expected = self.panel.sum(1) + result = self.panel.apply(lambda x: x.sum(), axis='major_axis') + assert_frame_equal(result, expected) + expected = self.panel.sum(2) + result = self.panel.apply(lambda x: x.sum(), axis='minor_axis') + assert_frame_equal(result, expected) + + # pass kwargs + result = self.panel.apply( + lambda x, y: x.sum() + y, axis='items', y=5) + expected = self.panel.sum(0) + 5 + assert_frame_equal(result, expected) def test_apply_slabs(self): + with catch_warnings(record=True): - # same shape as original - result = self.panel.apply(lambda x: x * 2, - axis=['items', 'major_axis']) - expected = (self.panel * 2).transpose('minor_axis', 'major_axis', - 'items') - assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x * 2, - axis=['major_axis', 'items']) - assert_panel_equal(result, expected) - - result = self.panel.apply(lambda x: x * 2, - axis=['items', 'minor_axis']) - expected = (self.panel * 2).transpose('major_axis', 'minor_axis', - 'items') - assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x * 2, - axis=['minor_axis', 'items']) - assert_panel_equal(result, expected) - - result = self.panel.apply(lambda x: x * 2, - axis=['major_axis', 'minor_axis']) - expected = self.panel * 2 - assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x * 2, - axis=['minor_axis', 'major_axis']) - assert_panel_equal(result, expected) - - # reductions - result = self.panel.apply(lambda x: x.sum(0), axis=[ - 'items', 'major_axis' - ]) - expected = self.panel.sum(1).T - assert_frame_equal(result, expected) + # same shape as original + result = self.panel.apply(lambda x: x 
* 2, + axis=['items', 'major_axis']) + expected = (self.panel * 2).transpose('minor_axis', 'major_axis', + 'items') + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, + axis=['major_axis', 'items']) + assert_panel_equal(result, expected) - result = self.panel.apply(lambda x: x.sum(1), axis=[ - 'items', 'major_axis' - ]) - expected = self.panel.sum(0) - assert_frame_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, + axis=['items', 'minor_axis']) + expected = (self.panel * 2).transpose('major_axis', 'minor_axis', + 'items') + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, + axis=['minor_axis', 'items']) + assert_panel_equal(result, expected) + + result = self.panel.apply(lambda x: x * 2, + axis=['major_axis', 'minor_axis']) + expected = self.panel * 2 + assert_panel_equal(result, expected) + result = self.panel.apply(lambda x: x * 2, + axis=['minor_axis', 'major_axis']) + assert_panel_equal(result, expected) - # transforms - f = lambda x: ((x.T - x.mean(1)) / x.std(1)).T + # reductions + result = self.panel.apply(lambda x: x.sum(0), axis=[ + 'items', 'major_axis' + ]) + expected = self.panel.sum(1).T + assert_frame_equal(result, expected) # make sure that we don't trigger any warnings with catch_warnings(record=True): + result = self.panel.apply(lambda x: x.sum(1), axis=[ + 'items', 'major_axis' + ]) + expected = self.panel.sum(0) + assert_frame_equal(result, expected) + + # transforms + f = lambda x: ((x.T - x.mean(1)) / x.std(1)).T + + # make sure that we don't trigger any warnings result = self.panel.apply(f, axis=['items', 'major_axis']) expected = Panel(dict([(ax, f(self.panel.loc[:, :, ax])) for ax in self.panel.minor_axis])) assert_panel_equal(result, expected) - result = self.panel.apply(f, axis=['major_axis', 'minor_axis']) - expected = Panel(dict([(ax, f(self.panel.loc[ax])) - for ax in self.panel.items])) - assert_panel_equal(result, expected) - - result = self.panel.apply(f, axis=['minor_axis', 'items']) - expected = Panel(dict([(ax, f(self.panel.loc[:, ax])) - for ax in self.panel.major_axis])) - assert_panel_equal(result, expected) - - # with multi-indexes - # GH7469 - index = MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ( - 'two', 'a'), ('two', 'b')]) - dfa = DataFrame(np.array(np.arange(12, dtype='int64')).reshape( - 4, 3), columns=list("ABC"), index=index) - dfb = DataFrame(np.array(np.arange(10, 22, dtype='int64')).reshape( - 4, 3), columns=list("ABC"), index=index) - p = Panel({'f': dfa, 'g': dfb}) - result = p.apply(lambda x: x.sum(), axis=0) - - # on windows this will be in32 - result = result.astype('int64') - expected = p.sum(0) - assert_frame_equal(result, expected) + result = self.panel.apply(f, axis=['major_axis', 'minor_axis']) + expected = Panel(dict([(ax, f(self.panel.loc[ax])) + for ax in self.panel.items])) + assert_panel_equal(result, expected) + + result = self.panel.apply(f, axis=['minor_axis', 'items']) + expected = Panel(dict([(ax, f(self.panel.loc[:, ax])) + for ax in self.panel.major_axis])) + assert_panel_equal(result, expected) + + # with multi-indexes + # GH7469 + index = MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ( + 'two', 'a'), ('two', 'b')]) + dfa = DataFrame(np.array(np.arange(12, dtype='int64')).reshape( + 4, 3), columns=list("ABC"), index=index) + dfb = DataFrame(np.array(np.arange(10, 22, dtype='int64')).reshape( + 4, 3), columns=list("ABC"), index=index) + p = Panel({'f': dfa, 'g': dfb}) + result = p.apply(lambda x: x.sum(), axis=0) + + # 
on windows this will be in32 + result = result.astype('int64') + expected = p.sum(0) + assert_frame_equal(result, expected) def test_apply_no_or_zero_ndim(self): - # GH10332 - self.panel = Panel(np.random.rand(5, 5, 5)) + with catch_warnings(record=True): + # GH10332 + self.panel = Panel(np.random.rand(5, 5, 5)) - result_int = self.panel.apply(lambda df: 0, axis=[1, 2]) - result_float = self.panel.apply(lambda df: 0.0, axis=[1, 2]) - result_int64 = self.panel.apply(lambda df: np.int64(0), axis=[1, 2]) - result_float64 = self.panel.apply(lambda df: np.float64(0.0), - axis=[1, 2]) + result_int = self.panel.apply(lambda df: 0, axis=[1, 2]) + result_float = self.panel.apply(lambda df: 0.0, axis=[1, 2]) + result_int64 = self.panel.apply( + lambda df: np.int64(0), axis=[1, 2]) + result_float64 = self.panel.apply(lambda df: np.float64(0.0), + axis=[1, 2]) - expected_int = expected_int64 = Series([0] * 5) - expected_float = expected_float64 = Series([0.0] * 5) + expected_int = expected_int64 = Series([0] * 5) + expected_float = expected_float64 = Series([0.0] * 5) - assert_series_equal(result_int, expected_int) - assert_series_equal(result_int64, expected_int64) - assert_series_equal(result_float, expected_float) - assert_series_equal(result_float64, expected_float64) + assert_series_equal(result_int, expected_int) + assert_series_equal(result_int64, expected_int64) + assert_series_equal(result_float, expected_float) + assert_series_equal(result_float64, expected_float64) def test_reindex(self): - ref = self.panel['ItemB'] + with catch_warnings(record=True): + ref = self.panel['ItemB'] - # items - result = self.panel.reindex(items=['ItemA', 'ItemB']) - assert_frame_equal(result['ItemB'], ref) + # items + result = self.panel.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['ItemB'], ref) - # major - new_major = list(self.panel.major_axis[:10]) - result = self.panel.reindex(major=new_major) - assert_frame_equal(result['ItemB'], ref.reindex(index=new_major)) + # major + new_major = list(self.panel.major_axis[:10]) + result = self.panel.reindex(major=new_major) + assert_frame_equal(result['ItemB'], ref.reindex(index=new_major)) - # raise exception put both major and major_axis - self.assertRaises(Exception, self.panel.reindex, major_axis=new_major, - major=new_major) + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel.reindex, + major_axis=new_major, + major=new_major) - # minor - new_minor = list(self.panel.minor_axis[:2]) - result = self.panel.reindex(minor=new_minor) - assert_frame_equal(result['ItemB'], ref.reindex(columns=new_minor)) + # minor + new_minor = list(self.panel.minor_axis[:2]) + result = self.panel.reindex(minor=new_minor) + assert_frame_equal(result['ItemB'], ref.reindex(columns=new_minor)) - # this ok - result = self.panel.reindex() - assert_panel_equal(result, self.panel) - self.assertFalse(result is self.panel) + # this ok + result = self.panel.reindex() + assert_panel_equal(result, self.panel) + self.assertFalse(result is self.panel) - # with filling - smaller_major = self.panel.major_axis[::5] - smaller = self.panel.reindex(major=smaller_major) + # with filling + smaller_major = self.panel.major_axis[::5] + smaller = self.panel.reindex(major=smaller_major) - larger = smaller.reindex(major=self.panel.major_axis, method='pad') + larger = smaller.reindex(major=self.panel.major_axis, method='pad') - assert_frame_equal(larger.major_xs(self.panel.major_axis[1]), - smaller.major_xs(smaller_major[0])) + 
assert_frame_equal(larger.major_xs(self.panel.major_axis[1]), + smaller.major_xs(smaller_major[0])) - # don't necessarily copy - result = self.panel.reindex(major=self.panel.major_axis, copy=False) - assert_panel_equal(result, self.panel) - self.assertTrue(result is self.panel) + # don't necessarily copy + result = self.panel.reindex( + major=self.panel.major_axis, copy=False) + assert_panel_equal(result, self.panel) + self.assertTrue(result is self.panel) def test_reindex_multi(self): + with catch_warnings(record=True): - # with and without copy full reindexing - result = self.panel.reindex(items=self.panel.items, - major=self.panel.major_axis, - minor=self.panel.minor_axis, copy=False) - - self.assertIs(result.items, self.panel.items) - self.assertIs(result.major_axis, self.panel.major_axis) - self.assertIs(result.minor_axis, self.panel.minor_axis) - - result = self.panel.reindex(items=self.panel.items, - major=self.panel.major_axis, - minor=self.panel.minor_axis, copy=False) - assert_panel_equal(result, self.panel) - - # multi-axis indexing consistency - # GH 5900 - df = DataFrame(np.random.randn(4, 3)) - p = Panel({'Item1': df}) - expected = Panel({'Item1': df}) - expected['Item2'] = np.nan - - items = ['Item1', 'Item2'] - major_axis = np.arange(4) - minor_axis = np.arange(3) - - results = [] - results.append(p.reindex(items=items, major_axis=major_axis, - copy=True)) - results.append(p.reindex(items=items, major_axis=major_axis, - copy=False)) - results.append(p.reindex(items=items, minor_axis=minor_axis, - copy=True)) - results.append(p.reindex(items=items, minor_axis=minor_axis, - copy=False)) - results.append(p.reindex(items=items, major_axis=major_axis, - minor_axis=minor_axis, copy=True)) - results.append(p.reindex(items=items, major_axis=major_axis, - minor_axis=minor_axis, copy=False)) - - for i, r in enumerate(results): - assert_panel_equal(expected, r) + # with and without copy full reindexing + result = self.panel.reindex( + items=self.panel.items, + major=self.panel.major_axis, + minor=self.panel.minor_axis, copy=False) + + self.assertIs(result.items, self.panel.items) + self.assertIs(result.major_axis, self.panel.major_axis) + self.assertIs(result.minor_axis, self.panel.minor_axis) + + result = self.panel.reindex( + items=self.panel.items, + major=self.panel.major_axis, + minor=self.panel.minor_axis, copy=False) + assert_panel_equal(result, self.panel) + + # multi-axis indexing consistency + # GH 5900 + df = DataFrame(np.random.randn(4, 3)) + p = Panel({'Item1': df}) + expected = Panel({'Item1': df}) + expected['Item2'] = np.nan + + items = ['Item1', 'Item2'] + major_axis = np.arange(4) + minor_axis = np.arange(3) + + results = [] + results.append(p.reindex(items=items, major_axis=major_axis, + copy=True)) + results.append(p.reindex(items=items, major_axis=major_axis, + copy=False)) + results.append(p.reindex(items=items, minor_axis=minor_axis, + copy=True)) + results.append(p.reindex(items=items, minor_axis=minor_axis, + copy=False)) + results.append(p.reindex(items=items, major_axis=major_axis, + minor_axis=minor_axis, copy=True)) + results.append(p.reindex(items=items, major_axis=major_axis, + minor_axis=minor_axis, copy=False)) + + for i, r in enumerate(results): + assert_panel_equal(expected, r) def test_reindex_like(self): - # reindex_like - smaller = self.panel.reindex(items=self.panel.items[:-1], - major=self.panel.major_axis[:-1], - minor=self.panel.minor_axis[:-1]) - smaller_like = self.panel.reindex_like(smaller) - assert_panel_equal(smaller, smaller_like) + 
with catch_warnings(record=True):
+            # reindex_like
+            smaller = self.panel.reindex(items=self.panel.items[:-1],
+                                         major=self.panel.major_axis[:-1],
+                                         minor=self.panel.minor_axis[:-1])
+            smaller_like = self.panel.reindex_like(smaller)
+            assert_panel_equal(smaller, smaller_like)

     def test_take(self):
-        # axis == 0
-        result = self.panel.take([2, 0, 1], axis=0)
-        expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB'])
-        assert_panel_equal(result, expected)
+        with catch_warnings(record=True):
+            # axis == 0
+            result = self.panel.take([2, 0, 1], axis=0)
+            expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB'])
+            assert_panel_equal(result, expected)

-        # axis >= 1
-        result = self.panel.take([3, 0, 1, 2], axis=2)
-        expected = self.panel.reindex(minor=['D', 'A', 'B', 'C'])
-        assert_panel_equal(result, expected)
+            # axis >= 1
+            result = self.panel.take([3, 0, 1, 2], axis=2)
+            expected = self.panel.reindex(minor=['D', 'A', 'B', 'C'])
+            assert_panel_equal(result, expected)

-        # neg indices ok
-        expected = self.panel.reindex(minor=['D', 'D', 'B', 'C'])
-        result = self.panel.take([3, -1, 1, 2], axis=2)
-        assert_panel_equal(result, expected)
+            # neg indices ok
+            expected = self.panel.reindex(minor=['D', 'D', 'B', 'C'])
+            result = self.panel.take([3, -1, 1, 2], axis=2)
+            assert_panel_equal(result, expected)

-        self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2)
+            self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2)

     def test_sort_index(self):
-        import random
-
-        ritems = list(self.panel.items)
-        rmajor = list(self.panel.major_axis)
-        rminor = list(self.panel.minor_axis)
-        random.shuffle(ritems)
-        random.shuffle(rmajor)
-        random.shuffle(rminor)
-
-        random_order = self.panel.reindex(items=ritems)
-        sorted_panel = random_order.sort_index(axis=0)
-        assert_panel_equal(sorted_panel, self.panel)
-
-        # descending
-        random_order = self.panel.reindex(items=ritems)
-        sorted_panel = random_order.sort_index(axis=0, ascending=False)
-        assert_panel_equal(sorted_panel,
-                           self.panel.reindex(items=self.panel.items[::-1]))
-
-        random_order = self.panel.reindex(major=rmajor)
-        sorted_panel = random_order.sort_index(axis=1)
-        assert_panel_equal(sorted_panel, self.panel)
-
-        random_order = self.panel.reindex(minor=rminor)
-        sorted_panel = random_order.sort_index(axis=2)
-        assert_panel_equal(sorted_panel, self.panel)
+        with catch_warnings(record=True):
+            import random
+
+            ritems = list(self.panel.items)
+            rmajor = list(self.panel.major_axis)
+            rminor = list(self.panel.minor_axis)
+            random.shuffle(ritems)
+            random.shuffle(rmajor)
+            random.shuffle(rminor)
+
+            random_order = self.panel.reindex(items=ritems)
+            sorted_panel = random_order.sort_index(axis=0)
+            assert_panel_equal(sorted_panel, self.panel)
+
+            # descending
+            random_order = self.panel.reindex(items=ritems)
+            sorted_panel = random_order.sort_index(axis=0, ascending=False)
+            assert_panel_equal(
+                sorted_panel,
+                self.panel.reindex(items=self.panel.items[::-1]))
+
+            random_order = self.panel.reindex(major=rmajor)
+            sorted_panel = random_order.sort_index(axis=1)
+            assert_panel_equal(sorted_panel, self.panel)
+
+            random_order = self.panel.reindex(minor=rminor)
+            sorted_panel = random_order.sort_index(axis=2)
+            assert_panel_equal(sorted_panel, self.panel)

     def test_fillna(self):
-        filled = self.panel.fillna(0)
-        self.assertTrue(np.isfinite(filled.values).all())
-
-        filled = self.panel.fillna(method='backfill')
-        assert_frame_equal(filled['ItemA'],
-                           self.panel['ItemA'].fillna(method='backfill'))
-
-        panel = self.panel.copy()
-
panel['str'] = 'foo' - - filled = panel.fillna(method='backfill') - assert_frame_equal(filled['ItemA'], - panel['ItemA'].fillna(method='backfill')) - - empty = self.panel.reindex(items=[]) - filled = empty.fillna(0) - assert_panel_equal(filled, empty) - - self.assertRaises(ValueError, self.panel.fillna) - self.assertRaises(ValueError, self.panel.fillna, 5, method='ffill') - - self.assertRaises(TypeError, self.panel.fillna, [1, 2]) - self.assertRaises(TypeError, self.panel.fillna, (1, 2)) - - # limit not implemented when only value is specified - p = Panel(np.random.randn(3, 4, 5)) - p.iloc[0:2, 0:2, 0:2] = np.nan - self.assertRaises(NotImplementedError, lambda: p.fillna(999, limit=1)) - - # Test in place fillNA - # Expected result - expected = Panel([[[0, 1], [2, 1]], [[10, 11], [12, 11]]], - items=['a', 'b'], minor_axis=['x', 'y'], - dtype=np.float64) - # method='ffill' - p1 = Panel([[[0, 1], [2, np.nan]], [[10, 11], [12, np.nan]]], - items=['a', 'b'], minor_axis=['x', 'y'], - dtype=np.float64) - p1.fillna(method='ffill', inplace=True) - assert_panel_equal(p1, expected) - - # method='bfill' - p2 = Panel([[[0, np.nan], [2, 1]], [[10, np.nan], [12, 11]]], - items=['a', 'b'], minor_axis=['x', 'y'], dtype=np.float64) - p2.fillna(method='bfill', inplace=True) - assert_panel_equal(p2, expected) + with catch_warnings(record=True): + filled = self.panel.fillna(0) + self.assertTrue(np.isfinite(filled.values).all()) + + filled = self.panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + self.panel['ItemA'].fillna(method='backfill')) + + panel = self.panel.copy() + panel['str'] = 'foo' + + filled = panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + panel['ItemA'].fillna(method='backfill')) + + empty = self.panel.reindex(items=[]) + filled = empty.fillna(0) + assert_panel_equal(filled, empty) + + self.assertRaises(ValueError, self.panel.fillna) + self.assertRaises(ValueError, self.panel.fillna, 5, method='ffill') + + self.assertRaises(TypeError, self.panel.fillna, [1, 2]) + self.assertRaises(TypeError, self.panel.fillna, (1, 2)) + + # limit not implemented when only value is specified + p = Panel(np.random.randn(3, 4, 5)) + p.iloc[0:2, 0:2, 0:2] = np.nan + self.assertRaises(NotImplementedError, + lambda: p.fillna(999, limit=1)) + + # Test in place fillNA + # Expected result + expected = Panel([[[0, 1], [2, 1]], [[10, 11], [12, 11]]], + items=['a', 'b'], minor_axis=['x', 'y'], + dtype=np.float64) + # method='ffill' + p1 = Panel([[[0, 1], [2, np.nan]], [[10, 11], [12, np.nan]]], + items=['a', 'b'], minor_axis=['x', 'y'], + dtype=np.float64) + p1.fillna(method='ffill', inplace=True) + assert_panel_equal(p1, expected) + + # method='bfill' + p2 = Panel([[[0, np.nan], [2, 1]], [[10, np.nan], [12, 11]]], + items=['a', 'b'], minor_axis=['x', 'y'], + dtype=np.float64) + p2.fillna(method='bfill', inplace=True) + assert_panel_equal(p2, expected) def test_ffill_bfill(self): - assert_panel_equal(self.panel.ffill(), - self.panel.fillna(method='ffill')) - assert_panel_equal(self.panel.bfill(), - self.panel.fillna(method='bfill')) + with catch_warnings(record=True): + assert_panel_equal(self.panel.ffill(), + self.panel.fillna(method='ffill')) + assert_panel_equal(self.panel.bfill(), + self.panel.fillna(method='bfill')) def test_truncate_fillna_bug(self): - # #1823 - result = self.panel.truncate(before=None, after=None, axis='items') + with catch_warnings(record=True): + # #1823 + result = self.panel.truncate(before=None, after=None, axis='items') - # it works! 
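The conversion running through this hunk, and the rest of the patch, is mechanical: each test body that touches the deprecated Panel API is re-indented one level under catch_warnings(record=True). As a standalone reference for the idiom, here is a minimal sketch, assuming a pandas build where Panel still exists but warns on use; the test name is made up for the example:

from warnings import catch_warnings

import numpy as np
import pandas as pd


def test_panel_stays_quiet():
    with catch_warnings(record=True):
        # Panel construction and use may emit DeprecationWarning or
        # FutureWarning; record=True captures warnings raised inside
        # this block only, so the rest of the suite still reports its
        # own warnings normally.
        p = pd.Panel(np.random.randn(2, 3, 4))
        assert p.fillna(0).shape == (2, 3, 4)

Recording, rather than filtering with warnings.simplefilter('ignore'), also leaves the captured list available should a test later need to assert on a specific warning.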
-        result.fillna(value=0.0)
+            # it works!
+            result.fillna(value=0.0)

     def test_swapaxes(self):
-        result = self.panel.swapaxes('items', 'minor')
-        self.assertIs(result.items, self.panel.minor_axis)
+        with catch_warnings(record=True):
+            result = self.panel.swapaxes('items', 'minor')
+            self.assertIs(result.items, self.panel.minor_axis)

-        result = self.panel.swapaxes('items', 'major')
-        self.assertIs(result.items, self.panel.major_axis)
+            result = self.panel.swapaxes('items', 'major')
+            self.assertIs(result.items, self.panel.major_axis)

-        result = self.panel.swapaxes('major', 'minor')
-        self.assertIs(result.major_axis, self.panel.minor_axis)
+            result = self.panel.swapaxes('major', 'minor')
+            self.assertIs(result.major_axis, self.panel.minor_axis)

-        panel = self.panel.copy()
-        result = panel.swapaxes('major', 'minor')
-        panel.values[0, 0, 1] = np.nan
-        expected = panel.swapaxes('major', 'minor')
-        assert_panel_equal(result, expected)
+            panel = self.panel.copy()
+            result = panel.swapaxes('major', 'minor')
+            panel.values[0, 0, 1] = np.nan
+            expected = panel.swapaxes('major', 'minor')
+            assert_panel_equal(result, expected)

-        # this should also work
-        result = self.panel.swapaxes(0, 1)
-        self.assertIs(result.items, self.panel.major_axis)
+            # this should also work
+            result = self.panel.swapaxes(0, 1)
+            self.assertIs(result.items, self.panel.major_axis)

-        # this works, but returns a copy
-        result = self.panel.swapaxes('items', 'items')
-        assert_panel_equal(self.panel, result)
-        self.assertNotEqual(id(self.panel), id(result))
+            # this works, but returns a copy
+            result = self.panel.swapaxes('items', 'items')
+            assert_panel_equal(self.panel, result)
+            self.assertNotEqual(id(self.panel), id(result))

     def test_transpose(self):
-        result = self.panel.transpose('minor', 'major', 'items')
-        expected = self.panel.swapaxes('items', 'minor')
-        assert_panel_equal(result, expected)
-
-        # test kwargs
-        result = self.panel.transpose(items='minor', major='major',
-                                      minor='items')
-        expected = self.panel.swapaxes('items', 'minor')
-        assert_panel_equal(result, expected)
-
-        # test mixture of args
-        result = self.panel.transpose('minor', major='major', minor='items')
-        expected = self.panel.swapaxes('items', 'minor')
-        assert_panel_equal(result, expected)
-
-        result = self.panel.transpose('minor', 'major', minor='items')
-        expected = self.panel.swapaxes('items', 'minor')
-        assert_panel_equal(result, expected)
-
-        # duplicate axes
-        with tm.assertRaisesRegexp(TypeError,
-                                   'not enough/duplicate arguments'):
-            self.panel.transpose('minor', maj='major', minor='items')
+        with catch_warnings(record=True):
+            result = self.panel.transpose('minor', 'major', 'items')
+            expected = self.panel.swapaxes('items', 'minor')
+            assert_panel_equal(result, expected)

-        with tm.assertRaisesRegexp(ValueError, 'repeated axis in transpose'):
-            self.panel.transpose('minor', 'major', major='minor',
-                                 minor='items')
+            # test kwargs
+            result = self.panel.transpose(items='minor', major='major',
+                                          minor='items')
+            expected = self.panel.swapaxes('items', 'minor')
+            assert_panel_equal(result, expected)

-        result = self.panel.transpose(2, 1, 0)
-        assert_panel_equal(result, expected)
+            # test mixture of args
+            result = self.panel.transpose(
+                'minor', major='major', minor='items')
+            expected = self.panel.swapaxes('items', 'minor')
+            assert_panel_equal(result, expected)

-        result = self.panel.transpose('minor', 'items', 'major')
-        expected = self.panel.swapaxes('items', 'minor')
-        expected = expected.swapaxes('major', 'minor')
-        assert_panel_equal(result,
expected) + result = self.panel.transpose('minor', + 'major', + minor='items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) - result = self.panel.transpose(2, 0, 1) - assert_panel_equal(result, expected) + # duplicate axes + with tm.assertRaisesRegexp(TypeError, + 'not enough/duplicate arguments'): + self.panel.transpose('minor', maj='major', minor='items') - self.assertRaises(ValueError, self.panel.transpose, 0, 0, 1) + with tm.assertRaisesRegexp(ValueError, + 'repeated axis in transpose'): + self.panel.transpose('minor', 'major', major='minor', + minor='items') + + result = self.panel.transpose(2, 1, 0) + assert_panel_equal(result, expected) + + result = self.panel.transpose('minor', 'items', 'major') + expected = self.panel.swapaxes('items', 'minor') + expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + result = self.panel.transpose(2, 0, 1) + assert_panel_equal(result, expected) + + self.assertRaises(ValueError, self.panel.transpose, 0, 0, 1) def test_transpose_copy(self): - panel = self.panel.copy() - result = panel.transpose(2, 0, 1, copy=True) - expected = panel.swapaxes('items', 'minor') - expected = expected.swapaxes('major', 'minor') - assert_panel_equal(result, expected) + with catch_warnings(record=True): + panel = self.panel.copy() + result = panel.transpose(2, 0, 1, copy=True) + expected = panel.swapaxes('items', 'minor') + expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) - panel.values[0, 1, 1] = np.nan - self.assertTrue(notnull(result.values[1, 0, 1])) + panel.values[0, 1, 1] = np.nan + self.assertTrue(notnull(result.values[1, 0, 1])) def test_to_frame(self): - # filtered - filtered = self.panel.to_frame() - expected = self.panel.to_frame().dropna(how='any') - assert_frame_equal(filtered, expected) - - # unfiltered - unfiltered = self.panel.to_frame(filter_observations=False) - assert_panel_equal(unfiltered.to_panel(), self.panel) - - # names - self.assertEqual(unfiltered.index.names, ('major', 'minor')) - - # unsorted, round trip - df = self.panel.to_frame(filter_observations=False) - unsorted = df.take(np.random.permutation(len(df))) - pan = unsorted.to_panel() - assert_panel_equal(pan, self.panel) - - # preserve original index names - df = DataFrame(np.random.randn(6, 2), - index=[['a', 'a', 'b', 'b', 'c', 'c'], - [0, 1, 0, 1, 0, 1]], - columns=['one', 'two']) - df.index.names = ['foo', 'bar'] - df.columns.name = 'baz' - - rdf = df.to_panel().to_frame() - self.assertEqual(rdf.index.names, df.index.names) - self.assertEqual(rdf.columns.names, df.columns.names) + with catch_warnings(record=True): + # filtered + filtered = self.panel.to_frame() + expected = self.panel.to_frame().dropna(how='any') + assert_frame_equal(filtered, expected) + + # unfiltered + unfiltered = self.panel.to_frame(filter_observations=False) + assert_panel_equal(unfiltered.to_panel(), self.panel) + + # names + self.assertEqual(unfiltered.index.names, ('major', 'minor')) + + # unsorted, round trip + df = self.panel.to_frame(filter_observations=False) + unsorted = df.take(np.random.permutation(len(df))) + pan = unsorted.to_panel() + assert_panel_equal(pan, self.panel) + + # preserve original index names + df = DataFrame(np.random.randn(6, 2), + index=[['a', 'a', 'b', 'b', 'c', 'c'], + [0, 1, 0, 1, 0, 1]], + columns=['one', 'two']) + df.index.names = ['foo', 'bar'] + df.columns.name = 'baz' + + rdf = df.to_panel().to_frame() + self.assertEqual(rdf.index.names, df.index.names) + 
self.assertEqual(rdf.columns.names, df.columns.names) def test_to_frame_mixed(self): - panel = self.panel.fillna(0) - panel['str'] = 'foo' - panel['bool'] = panel['ItemA'] > 0 - - lp = panel.to_frame() - wp = lp.to_panel() - self.assertEqual(wp['bool'].values.dtype, np.bool_) - # Previously, this was mutating the underlying index and changing its - # name - assert_frame_equal(wp['bool'], panel['bool'], check_names=False) - - # GH 8704 - # with categorical - df = panel.to_frame() - df['category'] = df['str'].astype('category') - - # to_panel - # TODO: this converts back to object - p = df.to_panel() - expected = panel.copy() - expected['category'] = 'foo' - assert_panel_equal(p, expected) + with catch_warnings(record=True): + panel = self.panel.fillna(0) + panel['str'] = 'foo' + panel['bool'] = panel['ItemA'] > 0 + + lp = panel.to_frame() + wp = lp.to_panel() + self.assertEqual(wp['bool'].values.dtype, np.bool_) + # Previously, this was mutating the underlying + # index and changing its name + assert_frame_equal(wp['bool'], panel['bool'], check_names=False) + + # GH 8704 + # with categorical + df = panel.to_frame() + df['category'] = df['str'].astype('category') + + # to_panel + # TODO: this converts back to object + p = df.to_panel() + expected = panel.copy() + expected['category'] = 'foo' + assert_panel_equal(p, expected) def test_to_frame_multi_major(self): - idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( - 2, 'two')]) - df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], - columns=['A', 'B', 'C'], index=idx) - wp = Panel({'i1': df, 'i2': df}) - expected_idx = MultiIndex.from_tuples( - [ - (1, 'one', 'A'), (1, 'one', 'B'), - (1, 'one', 'C'), (1, 'two', 'A'), - (1, 'two', 'B'), (1, 'two', 'C'), - (2, 'one', 'A'), (2, 'one', 'B'), - (2, 'one', 'C'), (2, 'two', 'A'), - (2, 'two', 'B'), (2, 'two', 'C') - ], - names=[None, None, 'minor']) - expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, - 'c', 1, 4, 'd', 1], - 'i2': [1, 'a', 1, 2, 'b', - 1, 3, 'c', 1, 4, 'd', 1]}, - index=expected_idx) - result = wp.to_frame() - assert_frame_equal(result, expected) - - wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. 
GH #5773 - result = wp.to_frame() - assert_frame_equal(result, expected[1:]) - - idx = MultiIndex.from_tuples([(1, 'two'), (1, 'one'), (2, 'one'), ( - np.nan, 'two')]) - df = DataFrame([[1, 'a', 1], [2, 'b', 1], [3, 'c', 1], [4, 'd', 1]], - columns=['A', 'B', 'C'], index=idx) - wp = Panel({'i1': df, 'i2': df}) - ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), - (1, 'two', 'C'), - (1, 'one', 'A'), - (1, 'one', 'B'), - (1, 'one', 'C'), - (2, 'one', 'A'), - (2, 'one', 'B'), - (2, 'one', 'C'), - (np.nan, 'two', 'A'), - (np.nan, 'two', 'B'), - (np.nan, 'two', 'C')], - names=[None, None, 'minor']) - expected.index = ex_idx - result = wp.to_frame() - assert_frame_equal(result, expected) + with catch_warnings(record=True): + idx = MultiIndex.from_tuples( + [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], + [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + expected_idx = MultiIndex.from_tuples( + [ + (1, 'one', 'A'), (1, 'one', 'B'), + (1, 'one', 'C'), (1, 'two', 'A'), + (1, 'two', 'B'), (1, 'two', 'C'), + (2, 'one', 'A'), (2, 'one', 'B'), + (2, 'one', 'C'), (2, 'two', 'A'), + (2, 'two', 'B'), (2, 'two', 'C') + ], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1, 'a', 1, 2, 'b', 1, 3, + 'c', 1, 4, 'd', 1], + 'i2': [1, 'a', 1, 2, 'b', + 1, 3, 'c', 1, 4, 'd', 1]}, + index=expected_idx) + result = wp.to_frame() + assert_frame_equal(result, expected) + + wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. GH #5773 + result = wp.to_frame() + assert_frame_equal(result, expected[1:]) + + idx = MultiIndex.from_tuples( + [(1, 'two'), (1, 'one'), (2, 'one'), (np.nan, 'two')]) + df = DataFrame([[1, 'a', 1], [2, 'b', 1], + [3, 'c', 1], [4, 'd', 1]], + columns=['A', 'B', 'C'], index=idx) + wp = Panel({'i1': df, 'i2': df}) + ex_idx = MultiIndex.from_tuples([(1, 'two', 'A'), (1, 'two', 'B'), + (1, 'two', 'C'), + (1, 'one', 'A'), + (1, 'one', 'B'), + (1, 'one', 'C'), + (2, 'one', 'A'), + (2, 'one', 'B'), + (2, 'one', 'C'), + (np.nan, 'two', 'A'), + (np.nan, 'two', 'B'), + (np.nan, 'two', 'C')], + names=[None, None, 'minor']) + expected.index = ex_idx + result = wp.to_frame() + assert_frame_equal(result, expected) def test_to_frame_multi_major_minor(self): - cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) - idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( - 2, 'two'), (3, 'three'), (4, 'four')]) - df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], - ['a', 'b', 'w', 'x'], - ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], - [-5, -6, -7, -8]], columns=cols, index=idx) - wp = Panel({'i1': df, 'i2': df}) - - exp_idx = MultiIndex.from_tuples( - [(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'), - (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'), - (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'), - (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'), - (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'), - (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'), - (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'), - (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'), - (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'), - (3, 'three', 'C_B', 'C_1'), (3, 'three', 'C_B', 'C_2'), - (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'), - (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')], - names=[None, None, None, None]) - exp_val = [[1, 1], [2, 2], [11, 11], [12, 12], [3, 3], [4, 4], - [13, 13], [14, 14], ['a', 'a'], ['b', 'b'], 
['w', 'w'], - ['x', 'x'], ['c', 'c'], ['d', 'd'], ['y', 'y'], ['z', 'z'], - [-1, -1], [-2, -2], [-3, -3], [-4, -4], [-5, -5], [-6, -6], - [-7, -7], [-8, -8]] - result = wp.to_frame() - expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx) - assert_frame_equal(result, expected) + with catch_warnings(record=True): + cols = MultiIndex(levels=[['C_A', 'C_B'], ['C_1', 'C_2']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + idx = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( + 2, 'two'), (3, 'three'), (4, 'four')]) + df = DataFrame([[1, 2, 11, 12], [3, 4, 13, 14], + ['a', 'b', 'w', 'x'], + ['c', 'd', 'y', 'z'], [-1, -2, -3, -4], + [-5, -6, -7, -8]], columns=cols, index=idx) + wp = Panel({'i1': df, 'i2': df}) + + exp_idx = MultiIndex.from_tuples( + [(1, 'one', 'C_A', 'C_1'), (1, 'one', 'C_A', 'C_2'), + (1, 'one', 'C_B', 'C_1'), (1, 'one', 'C_B', 'C_2'), + (1, 'two', 'C_A', 'C_1'), (1, 'two', 'C_A', 'C_2'), + (1, 'two', 'C_B', 'C_1'), (1, 'two', 'C_B', 'C_2'), + (2, 'one', 'C_A', 'C_1'), (2, 'one', 'C_A', 'C_2'), + (2, 'one', 'C_B', 'C_1'), (2, 'one', 'C_B', 'C_2'), + (2, 'two', 'C_A', 'C_1'), (2, 'two', 'C_A', 'C_2'), + (2, 'two', 'C_B', 'C_1'), (2, 'two', 'C_B', 'C_2'), + (3, 'three', 'C_A', 'C_1'), (3, 'three', 'C_A', 'C_2'), + (3, 'three', 'C_B', 'C_1'), (3, 'three', 'C_B', 'C_2'), + (4, 'four', 'C_A', 'C_1'), (4, 'four', 'C_A', 'C_2'), + (4, 'four', 'C_B', 'C_1'), (4, 'four', 'C_B', 'C_2')], + names=[None, None, None, None]) + exp_val = [[1, 1], [2, 2], [11, 11], [12, 12], + [3, 3], [4, 4], + [13, 13], [14, 14], ['a', 'a'], + ['b', 'b'], ['w', 'w'], + ['x', 'x'], ['c', 'c'], ['d', 'd'], [ + 'y', 'y'], ['z', 'z'], + [-1, -1], [-2, -2], [-3, -3], [-4, -4], + [-5, -5], [-6, -6], + [-7, -7], [-8, -8]] + result = wp.to_frame() + expected = DataFrame(exp_val, columns=['i1', 'i2'], index=exp_idx) + assert_frame_equal(result, expected) def test_to_frame_multi_drop_level(self): - idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')]) - df = DataFrame({'A': [np.nan, 1, 2]}, index=idx) - wp = Panel({'i1': df, 'i2': df}) - result = wp.to_frame() - exp_idx = MultiIndex.from_tuples([(2, 'one', 'A'), (2, 'two', 'A')], - names=[None, None, 'minor']) - expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx) - assert_frame_equal(result, expected) + with catch_warnings(record=True): + idx = MultiIndex.from_tuples([(1, 'one'), (2, 'one'), (2, 'two')]) + df = DataFrame({'A': [np.nan, 1, 2]}, index=idx) + wp = Panel({'i1': df, 'i2': df}) + result = wp.to_frame() + exp_idx = MultiIndex.from_tuples( + [(2, 'one', 'A'), (2, 'two', 'A')], + names=[None, None, 'minor']) + expected = DataFrame({'i1': [1., 2], 'i2': [1., 2]}, index=exp_idx) + assert_frame_equal(result, expected) def test_to_panel_na_handling(self): - df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)), - index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1], - [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]]) + with catch_warnings(record=True): + df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)), + index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]]) - panel = df.to_panel() - self.assertTrue(isnull(panel[0].loc[1, [0, 1]]).all()) + panel = df.to_panel() + self.assertTrue(isnull(panel[0].loc[1, [0, 1]]).all()) def test_to_panel_duplicates(self): # #2441 - df = DataFrame({'a': [0, 0, 1], 'b': [1, 1, 1], 'c': [1, 2, 3]}) - idf = df.set_index(['a', 'b']) - assertRaisesRegexp(ValueError, 'non-uniquely indexed', idf.to_panel) + with catch_warnings(record=True): + df = DataFrame({'a': [0, 0, 1], 
'b': [1, 1, 1], 'c': [1, 2, 3]}) + idf = df.set_index(['a', 'b']) + assertRaisesRegexp( + ValueError, 'non-uniquely indexed', idf.to_panel) def test_panel_dups(self): + with catch_warnings(record=True): - # GH 4960 - # duplicates in an index + # GH 4960 + # duplicates in an index - # items - data = np.random.randn(5, 100, 5) - no_dup_panel = Panel(data, items=list("ABCDE")) - panel = Panel(data, items=list("AACDE")) + # items + data = np.random.randn(5, 100, 5) + no_dup_panel = Panel(data, items=list("ABCDE")) + panel = Panel(data, items=list("AACDE")) - expected = no_dup_panel['A'] - result = panel.iloc[0] - assert_frame_equal(result, expected) + expected = no_dup_panel['A'] + result = panel.iloc[0] + assert_frame_equal(result, expected) - expected = no_dup_panel['E'] - result = panel.loc['E'] - assert_frame_equal(result, expected) + expected = no_dup_panel['E'] + result = panel.loc['E'] + assert_frame_equal(result, expected) - expected = no_dup_panel.loc[['A', 'B']] - expected.items = ['A', 'A'] - result = panel.loc['A'] - assert_panel_equal(result, expected) + expected = no_dup_panel.loc[['A', 'B']] + expected.items = ['A', 'A'] + result = panel.loc['A'] + assert_panel_equal(result, expected) - # major - data = np.random.randn(5, 5, 5) - no_dup_panel = Panel(data, major_axis=list("ABCDE")) - panel = Panel(data, major_axis=list("AACDE")) + # major + data = np.random.randn(5, 5, 5) + no_dup_panel = Panel(data, major_axis=list("ABCDE")) + panel = Panel(data, major_axis=list("AACDE")) - expected = no_dup_panel.loc[:, 'A'] - result = panel.iloc[:, 0] - assert_frame_equal(result, expected) + expected = no_dup_panel.loc[:, 'A'] + result = panel.iloc[:, 0] + assert_frame_equal(result, expected) - expected = no_dup_panel.loc[:, 'E'] - result = panel.loc[:, 'E'] - assert_frame_equal(result, expected) + expected = no_dup_panel.loc[:, 'E'] + result = panel.loc[:, 'E'] + assert_frame_equal(result, expected) - expected = no_dup_panel.loc[:, ['A', 'B']] - expected.major_axis = ['A', 'A'] - result = panel.loc[:, 'A'] - assert_panel_equal(result, expected) + expected = no_dup_panel.loc[:, ['A', 'B']] + expected.major_axis = ['A', 'A'] + result = panel.loc[:, 'A'] + assert_panel_equal(result, expected) - # minor - data = np.random.randn(5, 100, 5) - no_dup_panel = Panel(data, minor_axis=list("ABCDE")) - panel = Panel(data, minor_axis=list("AACDE")) + # minor + data = np.random.randn(5, 100, 5) + no_dup_panel = Panel(data, minor_axis=list("ABCDE")) + panel = Panel(data, minor_axis=list("AACDE")) - expected = no_dup_panel.loc[:, :, 'A'] - result = panel.iloc[:, :, 0] - assert_frame_equal(result, expected) + expected = no_dup_panel.loc[:, :, 'A'] + result = panel.iloc[:, :, 0] + assert_frame_equal(result, expected) - expected = no_dup_panel.loc[:, :, 'E'] - result = panel.loc[:, :, 'E'] - assert_frame_equal(result, expected) + expected = no_dup_panel.loc[:, :, 'E'] + result = panel.loc[:, :, 'E'] + assert_frame_equal(result, expected) - expected = no_dup_panel.loc[:, :, ['A', 'B']] - expected.minor_axis = ['A', 'A'] - result = panel.loc[:, :, 'A'] - assert_panel_equal(result, expected) + expected = no_dup_panel.loc[:, :, ['A', 'B']] + expected.minor_axis = ['A', 'A'] + result = panel.loc[:, :, 'A'] + assert_panel_equal(result, expected) def test_filter(self): pass def test_compound(self): - compounded = self.panel.compound() + with catch_warnings(record=True): + compounded = self.panel.compound() - assert_series_equal(compounded['ItemA'], - (1 + self.panel['ItemA']).product(0) - 1, - check_names=False) + 
assert_series_equal(compounded['ItemA'], + (1 + self.panel['ItemA']).product(0) - 1, + check_names=False) def test_shift(self): - # major - idx = self.panel.major_axis[0] - idx_lag = self.panel.major_axis[1] - shifted = self.panel.shift(1) - assert_frame_equal(self.panel.major_xs(idx), shifted.major_xs(idx_lag)) - - # minor - idx = self.panel.minor_axis[0] - idx_lag = self.panel.minor_axis[1] - shifted = self.panel.shift(1, axis='minor') - assert_frame_equal(self.panel.minor_xs(idx), shifted.minor_xs(idx_lag)) - - # items - idx = self.panel.items[0] - idx_lag = self.panel.items[1] - shifted = self.panel.shift(1, axis='items') - assert_frame_equal(self.panel[idx], shifted[idx_lag]) - - # negative numbers, #2164 - result = self.panel.shift(-1) - expected = Panel(dict((i, f.shift(-1)[:-1]) - for i, f in self.panel.iteritems())) - assert_panel_equal(result, expected) - - # mixed dtypes #6959 - data = [('item ' + ch, makeMixedDataFrame()) for ch in list('abcde')] - data = dict(data) - mixed_panel = Panel.from_dict(data, orient='minor') - shifted = mixed_panel.shift(1) - assert_series_equal(mixed_panel.dtypes, shifted.dtypes) + with catch_warnings(record=True): + # major + idx = self.panel.major_axis[0] + idx_lag = self.panel.major_axis[1] + shifted = self.panel.shift(1) + assert_frame_equal(self.panel.major_xs(idx), + shifted.major_xs(idx_lag)) + + # minor + idx = self.panel.minor_axis[0] + idx_lag = self.panel.minor_axis[1] + shifted = self.panel.shift(1, axis='minor') + assert_frame_equal(self.panel.minor_xs(idx), + shifted.minor_xs(idx_lag)) + + # items + idx = self.panel.items[0] + idx_lag = self.panel.items[1] + shifted = self.panel.shift(1, axis='items') + assert_frame_equal(self.panel[idx], shifted[idx_lag]) + + # negative numbers, #2164 + result = self.panel.shift(-1) + expected = Panel(dict((i, f.shift(-1)[:-1]) + for i, f in self.panel.iteritems())) + assert_panel_equal(result, expected) + + # mixed dtypes #6959 + data = [('item ' + ch, makeMixedDataFrame()) + for ch in list('abcde')] + data = dict(data) + mixed_panel = Panel.from_dict(data, orient='minor') + shifted = mixed_panel.shift(1) + assert_series_equal(mixed_panel.dtypes, shifted.dtypes) def test_tshift(self): # PeriodIndex - ps = tm.makePeriodPanel() - shifted = ps.tshift(1) - unshifted = shifted.tshift(-1) + with catch_warnings(record=True): + ps = tm.makePeriodPanel() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) - assert_panel_equal(unshifted, ps) + assert_panel_equal(unshifted, ps) - shifted2 = ps.tshift(freq='B') - assert_panel_equal(shifted, shifted2) + shifted2 = ps.tshift(freq='B') + assert_panel_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=BDay()) - assert_panel_equal(shifted, shifted3) + shifted3 = ps.tshift(freq=BDay()) + assert_panel_equal(shifted, shifted3) - assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') + assertRaisesRegexp(ValueError, 'does not match', + ps.tshift, freq='M') - # DatetimeIndex - panel = _panel - shifted = panel.tshift(1) - unshifted = shifted.tshift(-1) + # DatetimeIndex + panel = make_test_panel() + shifted = panel.tshift(1) + unshifted = shifted.tshift(-1) - assert_panel_equal(panel, unshifted) + assert_panel_equal(panel, unshifted) - shifted2 = panel.tshift(freq=panel.major_axis.freq) - assert_panel_equal(shifted, shifted2) + shifted2 = panel.tshift(freq=panel.major_axis.freq) + assert_panel_equal(shifted, shifted2) - inferred_ts = Panel(panel.values, items=panel.items, - major_axis=Index(np.asarray(panel.major_axis)), - 
minor_axis=panel.minor_axis) - shifted = inferred_ts.tshift(1) - unshifted = shifted.tshift(-1) - assert_panel_equal(shifted, panel.tshift(1)) - assert_panel_equal(unshifted, inferred_ts) + inferred_ts = Panel(panel.values, items=panel.items, + major_axis=Index(np.asarray(panel.major_axis)), + minor_axis=panel.minor_axis) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_panel_equal(shifted, panel.tshift(1)) + assert_panel_equal(unshifted, inferred_ts) - no_freq = panel.iloc[:, [0, 5, 7], :] - self.assertRaises(ValueError, no_freq.tshift) + no_freq = panel.iloc[:, [0, 5, 7], :] + self.assertRaises(ValueError, no_freq.tshift) def test_pct_change(self): - df1 = DataFrame({'c1': [1, 2, 5], 'c2': [3, 4, 6]}) - df2 = df1 + 1 - df3 = DataFrame({'c1': [3, 4, 7], 'c2': [5, 6, 8]}) - wp = Panel({'i1': df1, 'i2': df2, 'i3': df3}) - # major, 1 - result = wp.pct_change() # axis='major' - expected = Panel({'i1': df1.pct_change(), - 'i2': df2.pct_change(), - 'i3': df3.pct_change()}) - assert_panel_equal(result, expected) - result = wp.pct_change(axis=1) - assert_panel_equal(result, expected) - # major, 2 - result = wp.pct_change(periods=2) - expected = Panel({'i1': df1.pct_change(2), - 'i2': df2.pct_change(2), - 'i3': df3.pct_change(2)}) - assert_panel_equal(result, expected) - # minor, 1 - result = wp.pct_change(axis='minor') - expected = Panel({'i1': df1.pct_change(axis=1), - 'i2': df2.pct_change(axis=1), - 'i3': df3.pct_change(axis=1)}) - assert_panel_equal(result, expected) - result = wp.pct_change(axis=2) - assert_panel_equal(result, expected) - # minor, 2 - result = wp.pct_change(periods=2, axis='minor') - expected = Panel({'i1': df1.pct_change(periods=2, axis=1), - 'i2': df2.pct_change(periods=2, axis=1), - 'i3': df3.pct_change(periods=2, axis=1)}) - assert_panel_equal(result, expected) - # items, 1 - result = wp.pct_change(axis='items') - expected = Panel({'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], - 'c2': [np.nan, np.nan, np.nan]}), - 'i2': DataFrame({'c1': [1, 0.5, .2], - 'c2': [1. / 3, 0.25, 1. / 6]}), - 'i3': DataFrame({'c1': [.5, 1. / 3, 1. / 6], - 'c2': [.25, .2, 1. / 7]})}) - assert_panel_equal(result, expected) - result = wp.pct_change(axis=0) - assert_panel_equal(result, expected) - # items, 2 - result = wp.pct_change(periods=2, axis='items') - expected = Panel({'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], - 'c2': [np.nan, np.nan, np.nan]}), - 'i2': DataFrame({'c1': [np.nan, np.nan, np.nan], - 'c2': [np.nan, np.nan, np.nan]}), - 'i3': DataFrame({'c1': [2, 1, .4], - 'c2': [2. / 3, .5, 1. 
/ 3]})}) - assert_panel_equal(result, expected) + with catch_warnings(record=True): + df1 = DataFrame({'c1': [1, 2, 5], 'c2': [3, 4, 6]}) + df2 = df1 + 1 + df3 = DataFrame({'c1': [3, 4, 7], 'c2': [5, 6, 8]}) + wp = Panel({'i1': df1, 'i2': df2, 'i3': df3}) + # major, 1 + result = wp.pct_change() # axis='major' + expected = Panel({'i1': df1.pct_change(), + 'i2': df2.pct_change(), + 'i3': df3.pct_change()}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=1) + assert_panel_equal(result, expected) + # major, 2 + result = wp.pct_change(periods=2) + expected = Panel({'i1': df1.pct_change(2), + 'i2': df2.pct_change(2), + 'i3': df3.pct_change(2)}) + assert_panel_equal(result, expected) + # minor, 1 + result = wp.pct_change(axis='minor') + expected = Panel({'i1': df1.pct_change(axis=1), + 'i2': df2.pct_change(axis=1), + 'i3': df3.pct_change(axis=1)}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=2) + assert_panel_equal(result, expected) + # minor, 2 + result = wp.pct_change(periods=2, axis='minor') + expected = Panel({'i1': df1.pct_change(periods=2, axis=1), + 'i2': df2.pct_change(periods=2, axis=1), + 'i3': df3.pct_change(periods=2, axis=1)}) + assert_panel_equal(result, expected) + # items, 1 + result = wp.pct_change(axis='items') + expected = Panel( + {'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i2': DataFrame({'c1': [1, 0.5, .2], + 'c2': [1. / 3, 0.25, 1. / 6]}), + 'i3': DataFrame({'c1': [.5, 1. / 3, 1. / 6], + 'c2': [.25, .2, 1. / 7]})}) + assert_panel_equal(result, expected) + result = wp.pct_change(axis=0) + assert_panel_equal(result, expected) + # items, 2 + result = wp.pct_change(periods=2, axis='items') + expected = Panel( + {'i1': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i2': DataFrame({'c1': [np.nan, np.nan, np.nan], + 'c2': [np.nan, np.nan, np.nan]}), + 'i3': DataFrame({'c1': [2, 1, .4], + 'c2': [2. / 3, .5, 1. 
/ 3]})}) + assert_panel_equal(result, expected) def test_round(self): - values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], - [-1566.213, 88.88], [-12, 94.5]], - [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12], - [272.212, -99.99], [23, -76.5]]] - evalues = [[[float(np.around(i)) for i in j] for j in k] - for k in values] - p = Panel(values, items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B']) - expected = Panel(evalues, items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B']) - result = p.round() - self.assert_panel_equal(expected, result) + with catch_warnings(record=True): + values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], + [-1566.213, 88.88], [-12, 94.5]], + [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12], + [272.212, -99.99], [23, -76.5]]] + evalues = [[[float(np.around(i)) for i in j] for j in k] + for k in values] + p = Panel(values, items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B']) + expected = Panel(evalues, items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B']) + result = p.round() + assert_panel_equal(expected, result) def test_numpy_round(self): - values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], - [-1566.213, 88.88], [-12, 94.5]], - [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12], - [272.212, -99.99], [23, -76.5]]] - evalues = [[[float(np.around(i)) for i in j] for j in k] - for k in values] - p = Panel(values, items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B']) - expected = Panel(evalues, items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B']) - result = np.round(p) - self.assert_panel_equal(expected, result) - - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.round, p, out=p) + with catch_warnings(record=True): + values = [[[-3.2, 2.2], [0, -4.8213], [3.123, 123.12], + [-1566.213, 88.88], [-12, 94.5]], + [[-5.82, 3.5], [6.21, -73.272], [-9.087, 23.12], + [272.212, -99.99], [23, -76.5]]] + evalues = [[[float(np.around(i)) for i in j] for j in k] + for k in values] + p = Panel(values, items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B']) + expected = Panel(evalues, items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B']) + result = np.round(p) + assert_panel_equal(expected, result) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.round, p, out=p) def test_multiindex_get(self): - ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)], - names=['first', 'second']) - wp = Panel(np.random.random((4, 5, 5)), - items=ind, - major_axis=np.arange(5), - minor_axis=np.arange(5)) - f1 = wp['a'] - f2 = wp.loc['a'] - assert_panel_equal(f1, f2) - - self.assertTrue((f1.items == [1, 2]).all()) - self.assertTrue((f2.items == [1, 2]).all()) - - ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - names=['first', 'second']) + with catch_warnings(record=True): + ind = MultiIndex.from_tuples( + [('a', 1), ('a', 2), ('b', 1), ('b', 2)], + names=['first', 'second']) + wp = Panel(np.random.random((4, 5, 5)), + items=ind, + major_axis=np.arange(5), + minor_axis=np.arange(5)) + f1 = wp['a'] + f2 = wp.loc['a'] + assert_panel_equal(f1, f2) + + self.assertTrue((f1.items == [1, 2]).all()) + 
self.assertTrue((f2.items == [1, 2]).all()) + + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) def test_multiindex_blocks(self): - ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - names=['first', 'second']) - wp = Panel(self.panel._data) - wp.items = ind - f1 = wp['a'] - self.assertTrue((f1.items == [1, 2]).all()) + with catch_warnings(record=True): + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) + wp = Panel(self.panel._data) + wp.items = ind + f1 = wp['a'] + self.assertTrue((f1.items == [1, 2]).all()) - f1 = wp[('b', 1)] - self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) + f1 = wp[('b', 1)] + self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) def test_repr_empty(self): - empty = Panel() - repr(empty) + with catch_warnings(record=True): + empty = Panel() + repr(empty) def test_rename(self): - mapper = {'ItemA': 'foo', 'ItemB': 'bar', 'ItemC': 'baz'} + with catch_warnings(record=True): + mapper = {'ItemA': 'foo', 'ItemB': 'bar', 'ItemC': 'baz'} - renamed = self.panel.rename_axis(mapper, axis=0) - exp = Index(['foo', 'bar', 'baz']) - self.assert_index_equal(renamed.items, exp) + renamed = self.panel.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_index_equal(renamed.items, exp) - renamed = self.panel.rename_axis(str.lower, axis=2) - exp = Index(['a', 'b', 'c', 'd']) - self.assert_index_equal(renamed.minor_axis, exp) + renamed = self.panel.rename_axis(str.lower, axis=2) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_index_equal(renamed.minor_axis, exp) - # don't copy - renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) - renamed_nocopy['foo'] = 3. - self.assertTrue((self.panel['ItemA'].values == 3).all()) + # don't copy + renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. 
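Worth noting while reading the test_rename hunk below: its copy=False leg asserts aliasing, not merely equal labels. The behavior it pins down can be sketched outside the test harness; this is a sketch only, assuming the 0.20-era rename_axis(mapper, axis, copy) signature on Panel:

from warnings import catch_warnings

import numpy as np
import pandas as pd

with catch_warnings(record=True):
    p = pd.Panel(np.random.randn(3, 4, 2),
                 items=['ItemA', 'ItemB', 'ItemC'])
    # copy=False hands back a panel sharing p's data blocks, so
    # writing through the renamed view mutates the original panel,
    # which is exactly what the assertion in the test checks.
    renamed = p.rename_axis({'ItemA': 'foo'}, axis=0, copy=False)
    renamed['foo'] = 3.
    assert (p['ItemA'].values == 3).all()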
+ self.assertTrue((self.panel['ItemA'].values == 3).all()) def test_get_attr(self): assert_frame_equal(self.panel['ItemA'], self.panel.ItemA) @@ -2046,12 +2170,13 @@ def test_get_attr(self): assert_frame_equal(self.panel['i'], self.panel.i) def test_from_frame_level1_unsorted(self): - tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1), - ('MSFT', 1)] - midx = MultiIndex.from_tuples(tuples) - df = DataFrame(np.random.rand(5, 4), index=midx) - p = df.to_panel() - assert_frame_equal(p.minor_xs(2), df.xs(2, level=1).sort_index()) + with catch_warnings(record=True): + tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1), + ('MSFT', 1)] + midx = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.rand(5, 4), index=midx) + p = df.to_panel() + assert_frame_equal(p.minor_xs(2), df.xs(2, level=1).sort_index()) def test_to_excel(self): try: @@ -2094,162 +2219,191 @@ def test_to_excel_xlsxwriter(self): assert_frame_equal(df, recdf) def test_dropna(self): - p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) - p.loc[:, ['b', 'd'], 0] = np.nan + with catch_warnings(record=True): + p = Panel(np.random.randn(4, 5, 6), major_axis=list('abcde')) + p.loc[:, ['b', 'd'], 0] = np.nan - result = p.dropna(axis=1) - exp = p.loc[:, ['a', 'c', 'e'], :] - assert_panel_equal(result, exp) - inp = p.copy() - inp.dropna(axis=1, inplace=True) - assert_panel_equal(inp, exp) + result = p.dropna(axis=1) + exp = p.loc[:, ['a', 'c', 'e'], :] + assert_panel_equal(result, exp) + inp = p.copy() + inp.dropna(axis=1, inplace=True) + assert_panel_equal(inp, exp) - result = p.dropna(axis=1, how='all') - assert_panel_equal(result, p) + result = p.dropna(axis=1, how='all') + assert_panel_equal(result, p) - p.loc[:, ['b', 'd'], :] = np.nan - result = p.dropna(axis=1, how='all') - exp = p.loc[:, ['a', 'c', 'e'], :] - assert_panel_equal(result, exp) + p.loc[:, ['b', 'd'], :] = np.nan + result = p.dropna(axis=1, how='all') + exp = p.loc[:, ['a', 'c', 'e'], :] + assert_panel_equal(result, exp) - p = Panel(np.random.randn(4, 5, 6), items=list('abcd')) - p.loc[['b'], :, 0] = np.nan + p = Panel(np.random.randn(4, 5, 6), items=list('abcd')) + p.loc[['b'], :, 0] = np.nan - result = p.dropna() - exp = p.loc[['a', 'c', 'd']] - assert_panel_equal(result, exp) + result = p.dropna() + exp = p.loc[['a', 'c', 'd']] + assert_panel_equal(result, exp) - result = p.dropna(how='all') - assert_panel_equal(result, p) + result = p.dropna(how='all') + assert_panel_equal(result, p) - p.loc['b'] = np.nan - result = p.dropna(how='all') - exp = p.loc[['a', 'c', 'd']] - assert_panel_equal(result, exp) + p.loc['b'] = np.nan + result = p.dropna(how='all') + exp = p.loc[['a', 'c', 'd']] + assert_panel_equal(result, exp) def test_drop(self): - df = DataFrame({"A": [1, 2], "B": [3, 4]}) - panel = Panel({"One": df, "Two": df}) + with catch_warnings(record=True): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + panel = Panel({"One": df, "Two": df}) - def check_drop(drop_val, axis_number, aliases, expected): - try: - actual = panel.drop(drop_val, axis=axis_number) - assert_panel_equal(actual, expected) - for alias in aliases: - actual = panel.drop(drop_val, axis=alias) + def check_drop(drop_val, axis_number, aliases, expected): + try: + actual = panel.drop(drop_val, axis=axis_number) assert_panel_equal(actual, expected) - except AssertionError: - pprint_thing("Failed with axis_number %d and aliases: %s" % - (axis_number, aliases)) - raise - # Items - expected = Panel({"One": df}) - check_drop('Two', 0, ['items'], expected) - - 
self.assertRaises(ValueError, panel.drop, 'Three') - - # errors = 'ignore' - dropped = panel.drop('Three', errors='ignore') - assert_panel_equal(dropped, panel) - dropped = panel.drop(['Two', 'Three'], errors='ignore') - expected = Panel({"One": df}) - assert_panel_equal(dropped, expected) - - # Major - exp_df = DataFrame({"A": [2], "B": [4]}, index=[1]) - expected = Panel({"One": exp_df, "Two": exp_df}) - check_drop(0, 1, ['major_axis', 'major'], expected) - - exp_df = DataFrame({"A": [1], "B": [3]}, index=[0]) - expected = Panel({"One": exp_df, "Two": exp_df}) - check_drop([1], 1, ['major_axis', 'major'], expected) - - # Minor - exp_df = df[['B']] - expected = Panel({"One": exp_df, "Two": exp_df}) - check_drop(["A"], 2, ['minor_axis', 'minor'], expected) - - exp_df = df[['A']] - expected = Panel({"One": exp_df, "Two": exp_df}) - check_drop("B", 2, ['minor_axis', 'minor'], expected) + for alias in aliases: + actual = panel.drop(drop_val, axis=alias) + assert_panel_equal(actual, expected) + except AssertionError: + pprint_thing("Failed with axis_number %d and aliases: %s" % + (axis_number, aliases)) + raise + # Items + expected = Panel({"One": df}) + check_drop('Two', 0, ['items'], expected) + + self.assertRaises(ValueError, panel.drop, 'Three') + + # errors = 'ignore' + dropped = panel.drop('Three', errors='ignore') + assert_panel_equal(dropped, panel) + dropped = panel.drop(['Two', 'Three'], errors='ignore') + expected = Panel({"One": df}) + assert_panel_equal(dropped, expected) + + # Major + exp_df = DataFrame({"A": [2], "B": [4]}, index=[1]) + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop(0, 1, ['major_axis', 'major'], expected) + + exp_df = DataFrame({"A": [1], "B": [3]}, index=[0]) + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop([1], 1, ['major_axis', 'major'], expected) + + # Minor + exp_df = df[['B']] + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop(["A"], 2, ['minor_axis', 'minor'], expected) + + exp_df = df[['A']] + expected = Panel({"One": exp_df, "Two": exp_df}) + check_drop("B", 2, ['minor_axis', 'minor'], expected) def test_update(self): - pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + with catch_warnings(record=True): + pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - other = Panel([[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) + other = Panel( + [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) - pan.update(other) + pan.update(other) - expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]], - [[3.6, 2., 3], [1.5, np.nan, 7], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], [1.5, np.nan, 3.]], + [[3.6, 2., 3], [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - assert_panel_equal(pan, expected) + assert_panel_equal(pan, expected) def test_update_from_dict(self): - pan = Panel({'one': DataFrame([[1.5, np.nan, 3], [1.5, np.nan, 3], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]]), - 'two': DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]])}) - - other = {'two': DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]])} - - pan.update(other) - - expected = Panel( - {'two': DataFrame([[3.6, 2., 3], [1.5, np.nan, 
7], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]]), - 'one': DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]])}) - - assert_panel_equal(pan, expected) + with catch_warnings(record=True): + pan = Panel({'one': DataFrame([[1.5, np.nan, 3], + [1.5, np.nan, 3], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]), + 'two': DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]])}) + + other = {'two': DataFrame( + [[3.6, 2., np.nan], [np.nan, np.nan, 7]])} + + pan.update(other) + + expected = Panel( + {'two': DataFrame([[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]), + 'one': DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]])}) + + assert_panel_equal(pan, expected) def test_update_nooverwrite(self): - pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + with catch_warnings(record=True): + pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - other = Panel([[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) + other = Panel( + [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) - pan.update(other, overwrite=False) + pan.update(other, overwrite=False) - expected = Panel([[[1.5, np.nan, 3], [1.5, np.nan, 3], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]], - [[1.5, 2., 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + expected = Panel([[[1.5, np.nan, 3], [1.5, np.nan, 3], + [1.5, np.nan, 3.], [1.5, np.nan, 3.]], + [[1.5, 2., 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - assert_panel_equal(pan, expected) + assert_panel_equal(pan, expected) def test_update_filtered(self): - pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + with catch_warnings(record=True): + pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - other = Panel([[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) + other = Panel( + [[[3.6, 2., np.nan], [np.nan, np.nan, 7]]], items=[1]) - pan.update(other, filter_func=lambda x: x > 2) + pan.update(other, filter_func=lambda x: x > 2) - expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]], - [[1.5, np.nan, 3], [1.5, np.nan, 7], - [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) + expected = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], [1.5, np.nan, 3.]], + [[1.5, np.nan, 3], [1.5, np.nan, 7], + [1.5, np.nan, 3.], [1.5, np.nan, 3.]]]) - assert_panel_equal(pan, expected) + assert_panel_equal(pan, expected) def test_update_raise(self): - pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]) + with catch_warnings(record=True): + pan = Panel([[[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]) - self.assertRaises(Exception, pan.update, *(pan, ), - **{'raise_conflict': True}) + self.assertRaises(Exception, pan.update, *(pan, ), + 
**{'raise_conflict': True}) def test_all_any(self): self.assertTrue((self.panel.all(axis=0).values == nanall( @@ -2276,90 +2430,95 @@ class TestLongPanel(tm.TestCase): """ def setUp(self): - import warnings - warnings.filterwarnings(action='ignore', category=FutureWarning) - - panel = tm.makePanel() - tm.add_nans(panel) - + panel = make_test_panel() self.panel = panel.to_frame() self.unfiltered_panel = panel.to_frame(filter_observations=False) def test_ops_differently_indexed(self): - # trying to set non-identically indexed panel - wp = self.panel.to_panel() - wp2 = wp.reindex(major=wp.major_axis[:-1]) - lp2 = wp2.to_frame() + with catch_warnings(record=True): + # trying to set non-identically indexed panel + wp = self.panel.to_panel() + wp2 = wp.reindex(major=wp.major_axis[:-1]) + lp2 = wp2.to_frame() - result = self.panel + lp2 - assert_frame_equal(result.reindex(lp2.index), lp2 * 2) + result = self.panel + lp2 + assert_frame_equal(result.reindex(lp2.index), lp2 * 2) - # careful, mutation - self.panel['foo'] = lp2['ItemA'] - assert_series_equal(self.panel['foo'].reindex(lp2.index), lp2['ItemA'], - check_names=False) + # careful, mutation + self.panel['foo'] = lp2['ItemA'] + assert_series_equal(self.panel['foo'].reindex(lp2.index), + lp2['ItemA'], + check_names=False) def test_ops_scalar(self): - result = self.panel.mul(2) - expected = DataFrame.__mul__(self.panel, 2) - assert_frame_equal(result, expected) + with catch_warnings(record=True): + result = self.panel.mul(2) + expected = DataFrame.__mul__(self.panel, 2) + assert_frame_equal(result, expected) def test_combineFrame(self): - wp = self.panel.to_panel() - result = self.panel.add(wp['ItemA'].stack(), axis=0) - assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2) + with catch_warnings(record=True): + wp = self.panel.to_panel() + result = self.panel.add(wp['ItemA'].stack(), axis=0) + assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2) def test_combinePanel(self): - wp = self.panel.to_panel() - result = self.panel.add(self.panel) - wide_result = result.to_panel() - assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA']) + with catch_warnings(record=True): + wp = self.panel.to_panel() + result = self.panel.add(self.panel) + wide_result = result.to_panel() + assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA']) - # one item - result = self.panel.add(self.panel.filter(['ItemA'])) + # one item + result = self.panel.add(self.panel.filter(['ItemA'])) def test_combine_scalar(self): - result = self.panel.mul(2) - expected = DataFrame(self.panel._data) * 2 - assert_frame_equal(result, expected) + with catch_warnings(record=True): + result = self.panel.mul(2) + expected = DataFrame(self.panel._data) * 2 + assert_frame_equal(result, expected) def test_combine_series(self): - s = self.panel['ItemA'][:10] - result = self.panel.add(s, axis=0) - expected = DataFrame.add(self.panel, s, axis=0) - assert_frame_equal(result, expected) + with catch_warnings(record=True): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) - s = self.panel.iloc[5] - result = self.panel + s - expected = DataFrame.add(self.panel, s, axis=1) - assert_frame_equal(result, expected) + s = self.panel.iloc[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) def test_operators(self): - wp = self.panel.to_panel() - result = (self.panel + 1).to_panel() - assert_frame_equal(wp['ItemA'] + 
1, result['ItemA'])
+        with catch_warnings(record=True):
+            wp = self.panel.to_panel()
+            result = (self.panel + 1).to_panel()
+            assert_frame_equal(wp['ItemA'] + 1, result['ItemA'])

     def test_arith_flex_panel(self):
-        ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod']
-        if not compat.PY3:
-            aliases = {}
-        else:
-            aliases = {'div': 'truediv'}
-        self.panel = self.panel.to_panel()
-
-        for n in [np.random.randint(-50, -1), np.random.randint(1, 50), 0]:
-            for op in ops:
-                alias = aliases.get(op, op)
-                f = getattr(operator, alias)
-                exp = f(self.panel, n)
-                result = getattr(self.panel, op)(n)
-                assert_panel_equal(result, exp, check_panel_type=True)
-
-                # rops
-                r_f = lambda x, y: f(y, x)
-                exp = r_f(self.panel, n)
-                result = getattr(self.panel, 'r' + op)(n)
-                assert_panel_equal(result, exp)
+        with catch_warnings(record=True):
+            ops = ['add', 'sub', 'mul', 'div',
+                   'truediv', 'pow', 'floordiv', 'mod']
+            if not compat.PY3:
+                aliases = {}
+            else:
+                aliases = {'div': 'truediv'}
+            self.panel = self.panel.to_panel()
+
+            for n in [np.random.randint(-50, -1), np.random.randint(1, 50), 0]:
+                for op in ops:
+                    alias = aliases.get(op, op)
+                    f = getattr(operator, alias)
+                    exp = f(self.panel, n)
+                    result = getattr(self.panel, op)(n)
+                    assert_panel_equal(result, exp, check_panel_type=True)
+
+                    # rops
+                    r_f = lambda x, y: f(y, x)
+                    exp = r_f(self.panel, n)
+                    result = getattr(self.panel, 'r' + op)(n)
+                    assert_panel_equal(result, exp)

     def test_sort(self):
         def is_sorted(arr):
@@ -2382,43 +2541,44 @@ def test_to_sparse(self):
                           self.panel.to_sparse)

     def test_truncate(self):
-        dates = self.panel.index.levels[0]
-        start, end = dates[1], dates[5]
+        with catch_warnings(record=True):
+            dates = self.panel.index.levels[0]
+            start, end = dates[1], dates[5]

-        trunced = self.panel.truncate(start, end).to_panel()
-        expected = self.panel.to_panel()['ItemA'].truncate(start, end)
+            trunced = self.panel.truncate(start, end).to_panel()
+            expected = self.panel.to_panel()['ItemA'].truncate(start, end)

-        # TODO truncate drops index.names
-        assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+            # TODO truncate drops index.names
+            assert_frame_equal(trunced['ItemA'], expected, check_names=False)

-        trunced = self.panel.truncate(before=start).to_panel()
-        expected = self.panel.to_panel()['ItemA'].truncate(before=start)
+            trunced = self.panel.truncate(before=start).to_panel()
+            expected = self.panel.to_panel()['ItemA'].truncate(before=start)

-        # TODO truncate drops index.names
-        assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+            # TODO truncate drops index.names
+            assert_frame_equal(trunced['ItemA'], expected, check_names=False)

-        trunced = self.panel.truncate(after=end).to_panel()
-        expected = self.panel.to_panel()['ItemA'].truncate(after=end)
+            trunced = self.panel.truncate(after=end).to_panel()
+            expected = self.panel.to_panel()['ItemA'].truncate(after=end)

-        # TODO truncate drops index.names
-        assert_frame_equal(trunced['ItemA'], expected, check_names=False)
+            # TODO truncate drops index.names
+            assert_frame_equal(trunced['ItemA'], expected, check_names=False)

-        # truncate on dates that aren't in there
-        wp = self.panel.to_panel()
-        new_index = wp.major_axis[::5]
+            # truncate on dates that aren't in there
+            wp = self.panel.to_panel()
+            new_index = wp.major_axis[::5]

-        wp2 = wp.reindex(major=new_index)
+            wp2 = wp.reindex(major=new_index)

-        lp2 = wp2.to_frame()
-        lp_trunc = lp2.truncate(wp.major_axis[2],
wp.major_axis[-2]) - wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2]) + wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2]) - assert_panel_equal(wp_trunc, lp_trunc.to_panel()) + assert_panel_equal(wp_trunc, lp_trunc.to_panel()) - # throw proper exception - self.assertRaises(Exception, lp2.truncate, wp.major_axis[-2], - wp.major_axis[2]) + # throw proper exception + self.assertRaises(Exception, lp2.truncate, wp.major_axis[-2], + wp.major_axis[2]) def test_axis_dummies(self): from pandas.core.reshape import make_axis_dummies @@ -2449,82 +2609,70 @@ def test_get_dummies(self): self.assert_numpy_array_equal(dummies.values, minor_dummies.values) def test_mean(self): - means = self.panel.mean(level='minor') + with catch_warnings(record=True): + means = self.panel.mean(level='minor') - # test versus Panel version - wide_means = self.panel.to_panel().mean('major') - assert_frame_equal(means, wide_means) + # test versus Panel version + wide_means = self.panel.to_panel().mean('major') + assert_frame_equal(means, wide_means) def test_sum(self): - sums = self.panel.sum(level='minor') + with catch_warnings(record=True): + sums = self.panel.sum(level='minor') - # test versus Panel version - wide_sums = self.panel.to_panel().sum('major') - assert_frame_equal(sums, wide_sums) + # test versus Panel version + wide_sums = self.panel.to_panel().sum('major') + assert_frame_equal(sums, wide_sums) def test_count(self): - index = self.panel.index + with catch_warnings(record=True): + index = self.panel.index - major_count = self.panel.count(level=0)['ItemA'] - labels = index.labels[0] - for i, idx in enumerate(index.levels[0]): - self.assertEqual(major_count[i], (labels == i).sum()) + major_count = self.panel.count(level=0)['ItemA'] + labels = index.labels[0] + for i, idx in enumerate(index.levels[0]): + self.assertEqual(major_count[i], (labels == i).sum()) - minor_count = self.panel.count(level=1)['ItemA'] - labels = index.labels[1] - for i, idx in enumerate(index.levels[1]): - self.assertEqual(minor_count[i], (labels == i).sum()) + minor_count = self.panel.count(level=1)['ItemA'] + labels = index.labels[1] + for i, idx in enumerate(index.levels[1]): + self.assertEqual(minor_count[i], (labels == i).sum()) def test_join(self): - lp1 = self.panel.filter(['ItemA', 'ItemB']) - lp2 = self.panel.filter(['ItemC']) + with catch_warnings(record=True): + lp1 = self.panel.filter(['ItemA', 'ItemB']) + lp2 = self.panel.filter(['ItemC']) - joined = lp1.join(lp2) + joined = lp1.join(lp2) - self.assertEqual(len(joined.columns), 3) + self.assertEqual(len(joined.columns), 3) - self.assertRaises(Exception, lp1.join, - self.panel.filter(['ItemB', 'ItemC'])) + self.assertRaises(Exception, lp1.join, + self.panel.filter(['ItemB', 'ItemC'])) def test_pivot(self): - from pandas.core.reshape import _slow_pivot - - one, two, three = (np.array([1, 2, 3, 4, 5]), - np.array(['a', 'b', 'c', 'd', 'e']), - np.array([1, 2, 3, 5, 4.])) - df = pivot(one, two, three) - self.assertEqual(df['a'][1], 1) - self.assertEqual(df['b'][2], 2) - self.assertEqual(df['c'][3], 3) - self.assertEqual(df['d'][4], 5) - self.assertEqual(df['e'][5], 4) - assert_frame_equal(df, _slow_pivot(one, two, three)) - - # weird overlap, TODO: test? 
- a, b, c = (np.array([1, 2, 3, 4, 4]), - np.array(['a', 'a', 'a', 'a', 'a']), - np.array([1., 2., 3., 4., 5.])) - self.assertRaises(Exception, pivot, a, b, c) - - # corner case, empty - df = pivot(np.array([]), np.array([]), np.array([])) - - -def test_monotonic(): - pos = np.array([1, 2, 3, 5]) - - def _monotonic(arr): - return not (arr[1:] < arr[:-1]).any() - - assert _monotonic(pos) - - neg = np.array([1, 2, 3, 4, 3]) - - assert not _monotonic(neg) - - neg2 = np.array([5, 1, 2, 3, 4, 5]) - - assert not _monotonic(neg2) + with catch_warnings(record=True): + from pandas.core.reshape import _slow_pivot + + one, two, three = (np.array([1, 2, 3, 4, 5]), + np.array(['a', 'b', 'c', 'd', 'e']), + np.array([1, 2, 3, 5, 4.])) + df = pivot(one, two, three) + self.assertEqual(df['a'][1], 1) + self.assertEqual(df['b'][2], 2) + self.assertEqual(df['c'][3], 3) + self.assertEqual(df['d'][4], 5) + self.assertEqual(df['e'][5], 4) + assert_frame_equal(df, _slow_pivot(one, two, three)) + + # weird overlap, TODO: test? + a, b, c = (np.array([1, 2, 3, 4, 4]), + np.array(['a', 'a', 'a', 'a', 'a']), + np.array([1., 2., 3., 4., 5.])) + self.assertRaises(Exception, pivot, a, b, c) + + # corner case, empty + df = pivot(np.array([]), np.array([]), np.array([])) def test_panel_index(): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index ceb12c6c03074..5fc31e9321f31 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -10,8 +10,8 @@ from distutils.version import LooseVersion import pandas as pd -from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat, Timestamp) +from pandas import (Series, DataFrame, bdate_range, isnull, + notnull, concat, Timestamp, Index) import pandas.stats.moments as mom import pandas.core.window as rwindow import pandas.tseries.offsets as offsets @@ -172,7 +172,7 @@ def test_agg_consistency(self): tm.assert_index_equal(result, expected) result = r['A'].agg([np.sum, np.mean]).columns - expected = pd.Index(['sum', 'mean']) + expected = Index(['sum', 'mean']) tm.assert_index_equal(result, expected) result = r.agg({'A': [np.sum, np.mean]}).columns @@ -1688,6 +1688,160 @@ def _check_ew_structures(self, func, name): self.assertEqual(type(frame_result), DataFrame) +class TestPairwise(object): + + # GH 7738 + df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], + columns=['C', 'C']), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), + DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], + columns=[1, 0.]), + DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], + columns=[0, 1.]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], + columns=[1., 'X']), ] + df2 = DataFrame([[None, 1, 1], [None, 1, 2], + [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) + s = Series([1, 1, 3, 8]) + + def compare(self, result, expected): + + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) + def test_no_flex(self, f): + + # DataFrame methods (which do not call _flex_binary_moment()) + + results = [f(df) for df in 
self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True)]) + def test_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=True + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], + df.index, + check_names=False) + tm.assert_index_equal(result.index.levels[1], + df.columns, + check_names=False) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), ]) + def test_no_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=False + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]) + def test_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=True + results = [f(df, self.df2) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], + df.index, + check_names=False) + tm.assert_index_equal(result.index.levels[1], + self.df2.columns, + check_names=False) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]) + def test_no_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=False + results = [f(df, self.df2) if df.columns.is_unique else None + for df in self.df1s] + for (df, result) in zip(self.df1s, results): + if result is not None: + with catch_warnings(record=True): + # we can have int and str columns + expected_index = df.index.union(self.df2.index) + expected_columns = df.columns.union(self.df2.columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) + else: + tm.assertRaisesRegexp( + ValueError, "'arg1' columns are not unique", f, df, 
+ self.df2) + tm.assertRaisesRegexp( + ValueError, "'arg2' columns are not unique", f, + self.df2, df) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), ]) + def test_pairwise_with_series(self, f): + + # DataFrame with a Series + results = ([f(df, self.s) for df in self.df1s] + + [f(self.s, df) for df in self.df1s]) + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + # create the data only once as we are not setting it def _create_consistency_data(): def create_series(): @@ -2083,21 +2237,6 @@ def test_expanding_consistency(self): assert_equal(expanding_f_result, expanding_apply_f_result) - if (name in ['cov', 'corr']) and isinstance(x, - DataFrame): - # test pairwise=True - expanding_f_result = expanding_f(x, pairwise=True) - expected = Panel(items=x.index, - major_axis=x.columns, - minor_axis=x.columns) - for i, _ in enumerate(x.columns): - for j, _ in enumerate(x.columns): - expected.iloc[:, i, j] = getattr( - x.iloc[:, i].expanding( - min_periods=min_periods), - name)(x.iloc[:, j]) - tm.assert_panel_equal(expanding_f_result, expected) - @tm.slow def test_rolling_consistency(self): @@ -2203,25 +2342,6 @@ def cases(): assert_equal(rolling_f_result, rolling_apply_f_result) - if (name in ['cov', 'corr']) and isinstance( - x, DataFrame): - # test pairwise=True - rolling_f_result = rolling_f(x, - pairwise=True) - expected = Panel(items=x.index, - major_axis=x.columns, - minor_axis=x.columns) - for i, _ in enumerate(x.columns): - for j, _ in enumerate(x.columns): - expected.iloc[:, i, j] = ( - getattr( - x.iloc[:, i] - .rolling(window=window, - min_periods=min_periods, - center=center), - name)(x.iloc[:, j])) - tm.assert_panel_equal(rolling_f_result, expected) - # binary moments def test_rolling_cov(self): A = self.series @@ -2257,11 +2377,11 @@ def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - panel = get_result(self.frame) - actual = panel.loc[:, 1, 5] + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(actual, expected, check_names=False) - self.assertEqual(actual.name, 5) + tm.assert_series_equal(result, expected, check_names=False) def test_flex_binary_moment(self): # GH3155 @@ -2429,17 +2549,14 @@ def test_expanding_cov_pairwise(self): rolling_result = self.frame.rolling(window=len(self.frame), min_periods=1).corr() - for i in result.items: - tm.assert_almost_equal(result[i], rolling_result[i]) + tm.assert_frame_equal(result, rolling_result) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() rolling_result = self.frame.rolling(window=len(self.frame), min_periods=1).corr() - - for i in result.items: - tm.assert_almost_equal(result[i], rolling_result[i]) + tm.assert_frame_equal(result, rolling_result) def test_expanding_cov_diff_index(self): # GH 7512 @@ -2507,8 +2624,6 @@ def test_rolling_functions_window_non_shrinkage(self): s_expected = Series(np.nan, index=s.index) df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], 
columns=['A', 'B']) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - df_expected_panel = Panel(items=df.index, major_axis=df.columns, - minor_axis=df.columns) functions = [lambda x: (x.rolling(window=10, min_periods=5) .cov(x, pairwise=False)), @@ -2540,13 +2655,24 @@ def test_rolling_functions_window_non_shrinkage(self): # scipy needed for rolling_window continue + def test_rolling_functions_window_non_shrinkage_binary(self): + + # corr/cov return a MI DataFrame + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], + columns=Index(['A', 'B'], name='foo'), + index=Index(range(4), name='bar')) + df_expected = DataFrame( + columns=Index(['A', 'B'], name='foo'), + index=pd.MultiIndex.from_product([df.index, df.columns], + names=['bar', 'foo']), + dtype='float64') functions = [lambda x: (x.rolling(window=10, min_periods=5) .cov(x, pairwise=True)), lambda x: (x.rolling(window=10, min_periods=5) .corr(x, pairwise=True))] for f in functions: - df_result_panel = f(df) - tm.assert_panel_equal(df_result_panel, df_expected_panel) + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) def test_moment_functions_zero_length(self): # GH 8056 @@ -2554,13 +2680,9 @@ def test_moment_functions_zero_length(self): s_expected = s df1 = DataFrame() df1_expected = df1 - df1_expected_panel = Panel(items=df1.index, major_axis=df1.columns, - minor_axis=df1.columns) df2 = DataFrame(columns=['a']) df2['a'] = df2['a'].astype('float64') df2_expected = df2 - df2_expected_panel = Panel(items=df2.index, major_axis=df2.columns, - minor_axis=df2.columns) functions = [lambda x: x.expanding().count(), lambda x: x.expanding(min_periods=5).cov( @@ -2613,6 +2735,23 @@ def test_moment_functions_zero_length(self): # scipy needed for rolling_window continue + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=Index(['a'], name='foo'), + index=Index([], name='bar')) + df2['a'] = df2['a'].astype('float64') + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([])) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product([df2.index, df2.columns], + names=['bar', 'foo']), + columns=Index(['a'], name='foo'), + dtype='float64') + functions = [lambda x: (x.expanding(min_periods=5) .cov(x, pairwise=True)), lambda x: (x.expanding(min_periods=5) @@ -2623,24 +2762,33 @@ def test_moment_functions_zero_length(self): .corr(x, pairwise=True)), ] for f in functions: - df1_result_panel = f(df1) - tm.assert_panel_equal(df1_result_panel, df1_expected_panel) + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) - df2_result_panel = f(df2) - tm.assert_panel_equal(df2_result_panel, df2_expected_panel) + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) def test_expanding_cov_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=['A', 'B']) - df1a = DataFrame([[1, 5], [3, 9]], index=[0, 2], columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) - df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) - result1 = df1.expanding().cov(df2a, pairwise=True)[2] - result2 = df1.expanding().cov(df2a, pairwise=True)[2] - result3 = df1a.expanding().cov(df2, pairwise=True)[2] - result4 = df1a.expanding().cov(df2a, pairwise=True)[2] - expected = DataFrame([[-3., -5.], [-6., -10.]], index=['A', 'B'], - columns=['X', 'Y']) + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], + columns=Index(['A', 
'B'], name='foo')) + df1a = DataFrame([[1, 5], [3, 9]], + index=[0, 2], + columns=Index(['A', 'B'], name='foo')) + df2 = DataFrame([[5, 6], [None, None], [2, 1]], + columns=Index(['X', 'Y'], name='foo')) + df2a = DataFrame([[5, 6], [2, 1]], + index=[0, 2], + columns=Index(['X', 'Y'], name='foo')) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(['A', 'B'], name='foo'), + index=Index(['X', 'Y'], name='foo')) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -2648,149 +2796,30 @@ def test_expanding_cov_pairwise_diff_length(self): def test_expanding_corr_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 2], [3, 2], [3, 4]], columns=['A', 'B']) - df1a = DataFrame([[1, 2], [3, 4]], index=[0, 2], columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) - df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) - result1 = df1.expanding().corr(df2, pairwise=True)[2] - result2 = df1.expanding().corr(df2a, pairwise=True)[2] - result3 = df1a.expanding().corr(df2, pairwise=True)[2] - result4 = df1a.expanding().corr(df2a, pairwise=True)[2] - expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], index=['A', 'B'], - columns=['X', 'Y']) + df1 = DataFrame([[1, 2], [3, 2], [3, 4]], + columns=['A', 'B'], + index=Index(range(3), name='bar')) + df1a = DataFrame([[1, 2], [3, 4]], + index=Index([0, 2], name='bar'), + columns=['A', 'B']) + df2 = DataFrame([[5, 6], [None, None], [2, 1]], + columns=['X', 'Y'], + index=Index(range(3), name='bar')) + df2a = DataFrame([[5, 6], [2, 1]], + index=Index([0, 2], name='bar'), + columns=['X', 'Y']) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], + columns=['A', 'B'], + index=Index(['X', 'Y'])) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) tm.assert_frame_equal(result4, expected) - def test_pairwise_stats_column_names_order(self): - # GH 7738 - df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], - columns=['C', 'C']), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], - columns=[1, 0.]), - DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], - columns=[0, 1.]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], - columns=[1., 'X']), ] - df2 = DataFrame([[None, 1, 1], [None, 1, 2], - [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) - s = Series([1, 1, 3, 8]) - - # suppress warnings about incomparable objects, as we are deliberately - # testing with such column labels - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - 
message=".*incomparable objects.*", - category=RuntimeWarning) - - # DataFrame methods (which do not call _flex_binary_moment()) - for f in [lambda x: x.cov(), lambda x: x.corr(), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.index, df.columns) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - # compare internal values, as columns can be different - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with itself, pairwise=True - for f in [lambda x: x.expanding().cov(pairwise=True), - lambda x: x.expanding().corr(pairwise=True), - lambda x: x.rolling(window=3).cov(pairwise=True), - lambda x: x.rolling(window=3).corr(pairwise=True), - lambda x: x.ewm(com=3).cov(pairwise=True), - lambda x: x.ewm(com=3).corr(pairwise=True), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.items, df.index) - tm.assert_index_equal(result.major_axis, df.columns) - tm.assert_index_equal(result.minor_axis, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with itself, pairwise=False - for f in [lambda x: x.expanding().cov(pairwise=False), - lambda x: x.expanding().corr(pairwise=False), - lambda x: x.rolling(window=3).cov(pairwise=False), - lambda x: x.rolling(window=3).corr(pairwise=False), - lambda x: x.ewm(com=3).cov(pairwise=False), - lambda x: x.ewm(com=3).corr(pairwise=False), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with another DataFrame, pairwise=True - for f in [lambda x, y: x.expanding().cov(y, pairwise=True), - lambda x, y: x.expanding().corr(y, pairwise=True), - lambda x, y: x.rolling(window=3).cov(y, pairwise=True), - lambda x, y: x.rolling(window=3).corr(y, pairwise=True), - lambda x, y: x.ewm(com=3).cov(y, pairwise=True), - lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]: - results = [f(df, df2) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.items, df.index) - tm.assert_index_equal(result.major_axis, df.columns) - tm.assert_index_equal(result.minor_axis, df2.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with another DataFrame, pairwise=False - for f in [lambda x, y: x.expanding().cov(y, pairwise=False), - lambda x, y: x.expanding().corr(y, pairwise=False), - lambda x, y: x.rolling(window=3).cov(y, pairwise=False), - lambda x, y: x.rolling(window=3).corr(y, pairwise=False), - lambda x, y: x.ewm(com=3).cov(y, pairwise=False), - lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]: - results = [f(df, df2) if df.columns.is_unique else None - for df in df1s] - for (df, result) in zip(df1s, results): - if result is not None: - expected_index = df.index.union(df2.index) - expected_columns = df.columns.union(df2.columns) - tm.assert_index_equal(result.index, expected_index) - tm.assert_index_equal(result.columns, expected_columns) - else: - tm.assertRaisesRegexp( - ValueError, "'arg1' columns are not unique", f, df, - df2) - tm.assertRaisesRegexp( - ValueError, "'arg2' columns are not unique", 
f, - df2, df) - - # DataFrame with a Series - for f in [lambda x, y: x.expanding().cov(y), - lambda x, y: x.expanding().corr(y), - lambda x, y: x.rolling(window=3).cov(y), - lambda x, y: x.rolling(window=3).corr(y), - lambda x, y: x.ewm(com=3).cov(y), - lambda x, y: x.ewm(com=3).corr(y), ]: - results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index c41924a7987bd..623c5fa02fcb2 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1,4 +1,5 @@ from warnings import catch_warnings + import numpy as np from numpy.random import randn @@ -1283,8 +1284,9 @@ def test_concat_mixed_objs(self): assert_frame_equal(result, expected) # invalid concatente of mixed dims - panel = tm.makePanel() - self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) + with catch_warnings(record=True): + panel = tm.makePanel() + self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) def test_empty_dtype_coerce(self): @@ -1322,56 +1324,59 @@ def test_dtype_coerceion(self): tm.assert_series_equal(result.dtypes, df.dtypes) def test_panel_concat_other_axes(self): - panel = tm.makePanel() + with catch_warnings(record=True): + panel = tm.makePanel() - p1 = panel.iloc[:, :5, :] - p2 = panel.iloc[:, 5:, :] + p1 = panel.iloc[:, :5, :] + p2 = panel.iloc[:, 5:, :] - result = concat([p1, p2], axis=1) - tm.assert_panel_equal(result, panel) + result = concat([p1, p2], axis=1) + tm.assert_panel_equal(result, panel) - p1 = panel.iloc[:, :, :2] - p2 = panel.iloc[:, :, 2:] + p1 = panel.iloc[:, :, :2] + p2 = panel.iloc[:, :, 2:] - result = concat([p1, p2], axis=2) - tm.assert_panel_equal(result, panel) + result = concat([p1, p2], axis=2) + tm.assert_panel_equal(result, panel) - # if things are a bit misbehaved - p1 = panel.iloc[:2, :, :2] - p2 = panel.iloc[:, :, 2:] - p1['ItemC'] = 'baz' + # if things are a bit misbehaved + p1 = panel.iloc[:2, :, :2] + p2 = panel.iloc[:, :, 2:] + p1['ItemC'] = 'baz' - result = concat([p1, p2], axis=2) + result = concat([p1, p2], axis=2) - expected = panel.copy() - expected['ItemC'] = expected['ItemC'].astype('O') - expected.loc['ItemC', :, :2] = 'baz' - tm.assert_panel_equal(result, expected) + expected = panel.copy() + expected['ItemC'] = expected['ItemC'].astype('O') + expected.loc['ItemC', :, :2] = 'baz' + tm.assert_panel_equal(result, expected) def test_panel_concat_buglet(self): - # #2257 - def make_panel(): - index = 5 - cols = 3 - - def df(): - return DataFrame(np.random.randn(index, cols), - index=["I%s" % i for i in range(index)], - columns=["C%s" % i for i in range(cols)]) - return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) - - panel1 = make_panel() - panel2 = make_panel() - - panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) - for x in panel2.major_axis]), - axis=1) - - panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) - panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) - - # it works! 
- concat([panel1, panel3], axis=1, verify_integrity=True) + with catch_warnings(record=True): + # #2257 + def make_panel(): + index = 5 + cols = 3 + + def df(): + return DataFrame(np.random.randn(index, cols), + index=["I%s" % i for i in range(index)], + columns=["C%s" % i for i in range(cols)]) + return Panel(dict([("Item%s" % x, df()) + for x in ['A', 'B', 'C']])) + + panel1 = make_panel() + panel2 = make_panel() + + panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) + for x in panel2.major_axis]), + axis=1) + + panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) + panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) + + # it works! + concat([panel1, panel3], axis=1, verify_integrity=True) def test_panel4d_concat(self): with catch_warnings(record=True): diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index 2e35f5c1badbb..efd6dda02ab4b 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from warnings import catch_warnings import numpy as np from datetime import datetime from pandas.util import testing as tm @@ -76,14 +77,15 @@ def test_isnull(self): tm.assert_frame_equal(result, expected) # panel - for p in [tm.makePanel(), tm.makePeriodPanel(), - tm.add_nans(tm.makePanel())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel_equal(result, expected) + with catch_warnings(record=True): + for p in [tm.makePanel(), tm.makePeriodPanel(), + tm.add_nans(tm.makePanel())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel_equal(result, expected) # panel 4d - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with catch_warnings(record=True): for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: result = isnull(p) expected = p.apply(isnull) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9a9f3c6c6b945..9d7b004374318 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1747,8 +1747,10 @@ def makePeriodPanel(nper=None): def makePanel4D(nper=None): - return Panel4D(dict(l1=makePanel(nper), l2=makePanel(nper), - l3=makePanel(nper))) + with warnings.catch_warnings(record=True): + d = dict(l1=makePanel(nper), l2=makePanel(nper), + l3=makePanel(nper)) + return Panel4D(d) def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, From 5d17a94506c9234abb6578e232a5bb4a921c851c Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 7 Apr 2017 15:47:29 -0400 Subject: [PATCH 19/22] ENH: Support malformed row handling in Python engine (#15925) --- doc/source/io.rst | 4 +- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/io/parsers.py | 168 ++++++++++++------- pandas/tests/io/parser/common.py | 42 ++++- pandas/tests/io/parser/python_parser_only.py | 9 +- 5 files changed, 155 insertions(+), 71 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 5cec27c329a7f..f4676f3ad964e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -342,11 +342,11 @@ error_bad_lines : boolean, default ``True`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If ``False``, then these "bad lines" will dropped from the DataFrame that is - returned (only valid with C parser). See :ref:`bad lines ` + returned. See :ref:`bad lines ` below. 
warn_bad_lines : boolean, default ``True`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for - each "bad line" will be output (only valid with C parser). + each "bad line" will be output. .. _io.dtypes: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 132f20cb73142..d3207ffa86c6a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -368,9 +368,10 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) -- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) - A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels `. (:issue:`15694`) - :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`) +- ``pd.read_csv()`` will now raise a ``ParserError`` error whenever any parsing error occurs (:issue:`15913`, :issue:`15925`) +- ``pd.read_csv()`` now supports the ``error_bad_lines`` and ``warn_bad_lines`` arguments for the Python parser (:issue:`15925`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a85f9cda50879..10f8c53987471 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -263,10 +263,10 @@ Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will dropped from the DataFrame that is - returned. (Only valid with C parser) + returned. warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each - "bad line" will be output. (Only valid with C parser). + "bad line" will be output. low_memory : boolean, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. 
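(As a quick aside on the two flags documented above: the Python engine now honours both. A minimal sketch, using the toy data from this patch's own tests; ``StringIO`` here comes from ``pandas.compat``.)

    import pandas as pd
    from pandas.compat import StringIO

    data = 'a\n1\n1,2,3\n4\n5,6,7'   # rows 3 and 5 have too many fields

    # error_bad_lines=True (the default) raises ParserError; with
    # error_bad_lines=False the offending rows are dropped, and
    # warn_bad_lines=True reports each skipped line on stderr.
    out = pd.read_csv(StringIO(data), engine='python',
                      error_bad_lines=False, warn_bad_lines=True)
    # out is equivalent to DataFrame({'a': [1, 4]})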
To ensure no mixed @@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds): _python_unsupported = set([ 'low_memory', 'buffer_lines', - 'error_bad_lines', - 'warn_bad_lines', 'float_precision', ]) _deprecated_args = set([ @@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds): self.usecols, _ = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] + self.warn_bad_lines = kwds['warn_bad_lines'] + self.error_bad_lines = kwds['error_bad_lines'] + self.names_passed = kwds['names'] or None self.na_filter = kwds['na_filter'] @@ -2469,16 +2470,19 @@ def _next_line(self): next(self.data) while True: - orig_line = self._next_iter_line() - line = self._check_comments([orig_line])[0] + orig_line = self._next_iter_line(row_num=self.pos + 1) self.pos += 1 - if (not self.skip_blank_lines and - (self._empty(orig_line) or line)): - break - elif self.skip_blank_lines: - ret = self._check_empty([line]) - if ret: - line = ret[0] + + if orig_line is not None: + line = self._check_comments([orig_line])[0] + + if self.skip_blank_lines: + ret = self._check_empty([line]) + + if ret: + line = ret[0] + break + elif self._empty(orig_line) or line: break # This was the first line of the file, @@ -2491,7 +2495,28 @@ def _next_line(self): self.buf.append(line) return line - def _next_iter_line(self, **kwargs): + def _alert_malformed(self, msg, row_num): + """ + Alert a user about a malformed row. + + If `self.error_bad_lines` is True, the alert will be `ParserError`. + If `self.warn_bad_lines` is True, the alert will be printed out. + + Parameters + ---------- + msg : The error message to display. + row_num : The row number where the parsing error occurred. + Because this row number is displayed, we 1-index, + even though we 0-index internally. + """ + + if self.error_bad_lines: + raise ParserError(msg) + elif self.warn_bad_lines: + base = 'Skipping line {row_num}: '.format(row_num=row_num) + sys.stderr.write(base + msg + '\n') + + def _next_iter_line(self, row_num): """ Wrapper around iterating through `self.data` (CSV source). @@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs): Parameters ---------- - kwargs : Keyword arguments used to customize the error message. + row_num : The row number of the line being parsed. """ try: return next(self.data) except csv.Error as e: - msg = str(e) - - if 'NULL byte' in msg: - msg = ('NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead') - elif 'newline inside string' in msg: - msg = ('EOF inside string starting with ' - 'line ' + str(kwargs['row_num'])) - - if self.skipfooter > 0: - reason = ('Error could possibly be due to ' - 'parsing errors in the skipped footer rows ' - '(the skipfooter keyword is only applied ' - 'after Python\'s csv library has parsed ' - 'all rows).') - msg += '. ' + reason - - raise csv.Error(msg) + if self.warn_bad_lines or self.error_bad_lines: + msg = str(e) + + if 'NULL byte' in msg: + msg = ('NULL byte detected. This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead') + elif 'newline inside string' in msg: + msg = ('EOF inside string starting with ' + 'line ' + str(row_num)) + + if self.skipfooter > 0: + reason = ('Error could possibly be due to ' + 'parsing errors in the skipped footer rows ' + '(the skipfooter keyword is only applied ' + 'after Python\'s csv library has parsed ' + 'all rows).') + msg += '. 
' + reason + + self._alert_malformed(msg, row_num) + return None def _check_comments(self, lines): if self.comment is None: @@ -2657,42 +2684,57 @@ def _get_index_name(self, columns): return index_name, orig_names, columns def _rows_to_cols(self, content): + if self.skipfooter < 0: + raise ValueError('skip footer cannot be negative') + col_len = self.num_original_columns if self._implicit_index: col_len += len(self.index_col) - # see gh-13320 - zipped_content = list(lib.to_object_array( - content, min_width=col_len).T) - zip_len = len(zipped_content) - - if self.skipfooter < 0: - raise ValueError('skip footer cannot be negative') + max_len = max([len(row) for row in content]) - # Loop through rows to verify lengths are correct. - if (col_len != zip_len and + # Check that there are no rows with too many + # elements in their row (rows with too few + # elements are padded with NaN). + if (max_len > col_len and self.index_col is not False and self.usecols is None): - i = 0 - for (i, l) in enumerate(content): - if len(l) != col_len: - break - footers = 0 - if self.skipfooter: - footers = self.skipfooter + footers = self.skipfooter if self.skipfooter else 0 + bad_lines = [] - row_num = self.pos - (len(content) - i + footers) + iter_content = enumerate(content) + content_len = len(content) + content = [] - msg = ('Expected %d fields in line %d, saw %d' % - (col_len, row_num + 1, zip_len)) - if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: - # see gh-13374 - reason = ('Error could possibly be due to quotes being ' - 'ignored when a multi-char delimiter is used.') - msg += '. ' + reason - raise ValueError(msg) + for (i, l) in iter_content: + actual_len = len(l) + + if actual_len > col_len: + if self.error_bad_lines or self.warn_bad_lines: + row_num = self.pos - (content_len - i + footers) + bad_lines.append((row_num, actual_len)) + + if self.error_bad_lines: + break + else: + content.append(l) + + for row_num, actual_len in bad_lines: + msg = ('Expected %d fields in line %d, saw %d' % + (col_len, row_num + 1, actual_len)) + if len(self.delimiter) > 1 and self.quoting != csv.QUOTE_NONE: + # see gh-13374 + reason = ('Error could possibly be due to quotes being ' + 'ignored when a multi-char delimiter is used.') + msg += '. 
' + reason + + self._alert_malformed(msg, row_num + 1) + + # see gh-13320 + zipped_content = list(lib.to_object_array( + content, min_width=col_len).T) if self.usecols: if self._implicit_index: @@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None): while True: new_row = self._next_iter_line( - row_num=self.pos + rows) - new_rows.append(new_row) + row_num=self.pos + rows + 1) rows += 1 + if new_row is not None: + new_rows.append(new_row) + except StopIteration: if self.skiprows: new_rows = [row for i, row in enumerate(new_rows) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 36d5f2dd5274b..ee0f00506cef3 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -19,7 +19,7 @@ from pandas import compat from pandas.compat import (StringIO, BytesIO, PY3, range, lrange, u) -from pandas.errors import DtypeWarning, EmptyDataError +from pandas.errors import DtypeWarning, EmptyDataError, ParserError from pandas.io.common import URLError from pandas.io.parsers import TextFileReader, TextParser @@ -1569,7 +1569,7 @@ def test_null_byte_char(self): tm.assert_frame_equal(out, expected) else: msg = "NULL byte detected" - with tm.assertRaisesRegexp(csv.Error, msg): + with tm.assertRaisesRegexp(ParserError, msg): self.read_csv(StringIO(data), names=cols) def test_utf8_bom(self): @@ -1695,3 +1695,41 @@ class InvalidBuffer(object): with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(mock.Mock()) + + def test_skip_bad_lines(self): + # see gh-15925 + data = 'a\n1\n1,2,3\n4\n5,6,7' + + with tm.assertRaises(ParserError): + self.read_csv(StringIO(data)) + + with tm.assertRaises(ParserError): + self.read_csv(StringIO(data), error_bad_lines=True) + + stderr = sys.stderr + expected = DataFrame({'a': [1, 4]}) + + sys.stderr = StringIO() + try: + out = self.read_csv(StringIO(data), + error_bad_lines=False, + warn_bad_lines=False) + tm.assert_frame_equal(out, expected) + + val = sys.stderr.getvalue() + self.assertEqual(val, '') + finally: + sys.stderr = stderr + + sys.stderr = StringIO() + try: + out = self.read_csv(StringIO(data), + error_bad_lines=False, + warn_bad_lines=True) + tm.assert_frame_equal(out, expected) + + val = sys.stderr.getvalue() + self.assertTrue('Skipping line 3' in val) + self.assertTrue('Skipping line 5' in val) + finally: + sys.stderr = stderr diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py index 36356315419c4..9a1eb94270e28 100644 --- a/pandas/tests/io/parser/python_parser_only.py +++ b/pandas/tests/io/parser/python_parser_only.py @@ -14,6 +14,7 @@ import pandas.util.testing as tm from pandas import DataFrame, Index from pandas import compat +from pandas.errors import ParserError from pandas.compat import StringIO, BytesIO, u @@ -213,13 +214,13 @@ def test_multi_char_sep_quotes(self): data = 'a,,b\n1,,a\n2,,"2,,b"' msg = 'ignored when a multi-char delimiter is used' - with tm.assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ParserError, msg): self.read_csv(StringIO(data), sep=',,') # We expect no match, so there should be an assertion # error out of the inner context manager. 
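(A short aside before the assertion below: errors that the Python engine previously surfaced as ``ValueError`` or ``csv.Error`` are now normalised to ``ParserError``. A hedged sketch with the same toy input as this test:)

    import pandas as pd
    from pandas.compat import StringIO
    from pandas.errors import ParserError

    data = 'a,,b\n1,,a\n2,,"2,,b"'
    try:
        # a multi-char separator uses the Python engine and ignores
        # quoting, so the third row parses to too many fields
        pd.read_csv(StringIO(data), sep=',,', engine='python')
    except ParserError as err:
        print(err)  # notes that quotes are ignored with a multi-char sep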
with tm.assertRaises(AssertionError): - with tm.assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ParserError, msg): self.read_csv(StringIO(data), sep=',,', quoting=csv.QUOTE_NONE) @@ -231,11 +232,11 @@ def test_skipfooter_bad_row(self): for data in ('a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'): - with tm.assertRaisesRegexp(csv.Error, msg): + with tm.assertRaisesRegexp(ParserError, msg): self.read_csv(StringIO(data), skipfooter=1) # We expect no match, so there should be an assertion # error out of the inner context manager. with tm.assertRaises(AssertionError): - with tm.assertRaisesRegexp(csv.Error, msg): + with tm.assertRaisesRegexp(ParserError, msg): self.read_csv(StringIO(data)) From 6d90a436674a1e285d4577553b5cb75906a1bd27 Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Fri, 7 Apr 2017 16:37:41 -0400 Subject: [PATCH 20/22] BUG: use entire size of DatetimeTZBlock when coercing result (#15855) (#15924) * BUG: use entire size of DatetimeTZBlock when coercing result (#15855) * Moved test * Removed unnecessary 'self' * Removed unnecessary 'self', again --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/internals.py | 2 +- pandas/tests/frame/test_missing.py | 14 ++++++++++++++ pandas/tests/series/test_missing.py | 12 ++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d3207ffa86c6a..0b98e57c606a3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1144,6 +1144,7 @@ Conversion - Bug in ``DataFrame.fillna()`` where the argument ``downcast`` was ignored when fillna value was of type ``dict`` (:issue:`15277`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) - Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) +- Bug in ``DataFrame.fillna()`` with tz-aware datetimes (:issue:`15855`) Indexing ^^^^^^^^ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8db801f8e7212..57361886eab8c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2475,7 +2475,7 @@ def _try_coerce_result(self, result): if isinstance(result, np.ndarray): # allow passing of > 1dim if its trivial if result.ndim > 1: - result = result.reshape(len(result)) + result = result.reshape(np.prod(result.shape)) result = self.values._shallow_copy(result) return result diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 93c3ba78a0abf..eacf032bbcc85 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -257,6 +257,20 @@ def test_fillna(self): result = df.fillna(value={'Date': df['Date2']}) assert_frame_equal(result, expected) + # with timezone + # GH 15855 + df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), + pd.NaT]}) + exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), + pd.Timestamp('2012-11-11 00:00:00+01:00')]}) + assert_frame_equal(df.fillna(method='pad'), exp) + + df = pd.DataFrame({'A': [pd.NaT, + pd.Timestamp('2012-11-11 00:00:00+01:00')]}) + exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), + pd.Timestamp('2012-11-11 00:00:00+01:00')]}) + assert_frame_equal(df.fillna(method='bfill'), exp) + def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 7174283494fe7..ea49abeee21c5 100644 --- 
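(Before the Series-level regression test that follows, a compact sketch of the behaviour restored by this fix, mirroring the frame test above; see gh-15855:)

    import pandas as pd

    s = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT])

    # the tz-aware block previously coerced an ndim > 1 result with
    # reshape(len(result)), dropping part of the block; padding now
    # fills the missing entry with the preceding tz-aware timestamp
    s.fillna(method='pad')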
a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -258,6 +258,18 @@ def test_datetime64_tz_fillna(self): self.assert_series_equal(expected, result) self.assert_series_equal(pd.isnull(s), null_loc) + # with timezone + # GH 15855 + df = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]) + exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), + pd.Timestamp('2012-11-11 00:00:00+01:00')]) + assert_series_equal(df.fillna(method='pad'), exp) + + df = pd.Series([pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')]) + exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), + pd.Timestamp('2012-11-11 00:00:00+01:00')]) + assert_series_equal(df.fillna(method='bfill'), exp) + def test_datetime64tz_fillna_round_issue(self): # GH 14872 From bc2fa160b9d281889b344e7bc15352998e7b0955 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 7 Apr 2017 18:42:29 -0400 Subject: [PATCH 21/22] BUG/DOC: Add documentation in types/common.py (#15941) * DOC: document internal methods in types/common.py Partially addresses gh-15895. * BUG: Catch TypeError when calling _get_dtype The following functions were not catching the TypeError raised by _get_dtype: 1) is_string_dtype 2) is_string_like_dtype 3) is_timedelta64_ns_dtype Thus, when "None" was passed in, an Exception was raised instead of returning False, as other functions did. * TST: use ids to have nice parameterized function names --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/types/test_common.py | 27 ++++ pandas/types/common.py | 253 +++++++++++++++++++++++++++--- 3 files changed, 259 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0b98e57c606a3..436d51da6e873 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1145,6 +1145,7 @@ Conversion - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) - Bug in ``DataFrame`` construction with nulls and datetimes in a list-like (:issue:`15869`) - Bug in ``DataFrame.fillna()`` with tz-aware datetimes (:issue:`15855`) +- Bug in ``is_string_dtype``, ``is_timedelta64_ns_dtype``, and ``is_string_like_dtype`` in which an error was raised when ``None`` was passed in (:issue:`15941`) Indexing ^^^^^^^^ diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py index c15f219c8fad6..21772bab44d01 100644 --- a/pandas/tests/types/test_common.py +++ b/pandas/tests/types/test_common.py @@ -80,3 +80,30 @@ def test_dtype_equal_strict(): assert not is_dtype_equal( pandas_dtype('datetime64[ns, US/Eastern]'), pandas_dtype('datetime64[ns, CET]')) + + # see gh-15941: no exception should be raised + assert not is_dtype_equal(None, None) + + +def get_is_dtype_funcs(): + """ + Get all functions in pandas.types.common that + begin with 'is_' and end with 'dtype' + + """ + import pandas.types.common as com + + fnames = [f for f in dir(com) if (f.startswith('is_') and + f.endswith('dtype'))] + return [getattr(com, fname) for fname in fnames] + + +@pytest.mark.parametrize('func', + get_is_dtype_funcs(), + ids=lambda x: x.__name__) +def test_get_dtype_error_catch(func): + # see gh-15941 + # + # No exception should be raised. + + assert not func(None) diff --git a/pandas/types/common.py b/pandas/types/common.py index 017805673defe..7ab2e068ac69f 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -31,6 +31,20 @@ def _ensure_float(arr): + """ + Ensure that an array object has a float dtype if possible. 
+ + Parameters + ---------- + arr : ndarray, Series + The array whose data type we want to enforce as float. + + Returns + ------- + float_arr : The original array cast to the float dtype if + possible. Otherwise, the original array is returned. + """ + if issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr @@ -46,6 +60,20 @@ def _ensure_float(arr): def _ensure_categorical(arr): + """ + Ensure that an array-like object is a Categorical (if not already). + + Parameters + ---------- + arr : array-like + The array that we want to convert into a Categorical. + + Returns + ------- + cat_arr : The original array cast as a Categorical. If it already + is a Categorical, we return as is. + """ + if not is_categorical(arr): from pandas import Categorical arr = Categorical(arr) @@ -116,8 +144,40 @@ def is_categorical_dtype(arr_or_dtype): def is_string_dtype(arr_or_dtype): - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) + """ + Check whether the provided array or dtype is of the string dtype. + + Parameters + ---------- + arr_or_dtype : ndarray, dtype, type + The array or dtype to check. + + Returns + ------- + boolean : Whether or not the array or dtype is of the string dtype. + + Examples + -------- + >>> is_string_dtype(str) + True + >>> is_string_dtype(object) + True + >>> is_string_dtype(int) + False + >>> + >>> is_string_dtype(np.array(['a', 'b'])) + True + >>> is_string_dtype(np.array([1, 2])) + False + """ + + # TODO: gh-15585: consider making the checks stricter. + + try: + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) + except TypeError: + return False def is_period_arraylike(arr): @@ -209,8 +269,40 @@ def is_datetime64_ns_dtype(arr_or_dtype): def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype) - return tipo == _TD_DTYPE + """ + Check whether the provided array or dtype is of the timedelta64[ns] dtype. + + This is a very specific dtype, so generic ones like `np.timedelta64` + will return False if passed into this function. + + Parameters + ---------- + arr_or_dtype : ndarray, dtype, type + The array or dtype to check. + + Returns + ------- + boolean : Whether or not the array or dtype + is of the timedelta64[ns] dtype. + + Examples + -------- + >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]') + True + >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]') # Wrong frequency + False + >>> + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + True + >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) + False + """ + + try: + tipo = _get_dtype(arr_or_dtype) + return tipo == _TD_DTYPE + except TypeError: + return False def is_datetime_or_timedelta_dtype(arr_or_dtype): @@ -220,10 +312,21 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): def _is_unorderable_exception(e): """ - return a boolean if we an unorderable exception error message + Check if the exception raised is an unorderable exception. - These are different error message for PY>=3<=3.5 and PY>=3.6 + The error message differs for 3 <= PY <= 3.5 and PY >= 3.6, so + we need to condition based on Python version. + + Parameters + ---------- + e : Exception or sub-class + The exception object to check. + + Returns + ------- + boolean : Whether or not the exception raised is an unorderable exception. 
""" + if PY36: return "'>' not supported between instances of" in str(e) @@ -302,9 +405,39 @@ def is_numeric_dtype(arr_or_dtype): def is_string_like_dtype(arr_or_dtype): - # exclude object as its a mixed dtype - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('S', 'U') + """ + Check whether the provided array or dtype is of a string-like dtype. + + Unlike `is_string_dtype`, the object dtype is excluded because it + is a mixed dtype. + + Parameters + ---------- + arr_or_dtype : ndarray, dtype, type + The array or dtype to check. + + Returns + ------- + boolean : Whether or not the array or dtype is of the string dtype. + + Examples + -------- + >>> is_string_like_dtype(str) + True + >>> is_string_like_dtype(object) + False + >>> + >>> is_string_like_dtype(np.array(['a', 'b'])) + True + >>> is_string_like_dtype(np.array([1, 2])) + False + """ + + try: + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('S', 'U') + except TypeError: + return False def is_float_dtype(arr_or_dtype): @@ -346,7 +479,22 @@ def is_complex_dtype(arr_or_dtype): def _coerce_to_dtype(dtype): - """ coerce a string / np.dtype to a dtype """ + """ + Coerce a string or np.dtype to a pandas or numpy + dtype if possible. + + If we cannot convert to a pandas dtype initially, + we convert to a numpy dtype. + + Parameters + ---------- + dtype : The dtype that we want to coerce. + + Returns + ------- + pd_or_np_dtype : The coerced dtype. + """ + if is_categorical_dtype(dtype): dtype = CategoricalDtype() elif is_datetime64tz_dtype(dtype): @@ -359,8 +507,27 @@ def _coerce_to_dtype(dtype): def _get_dtype(arr_or_dtype): + """ + Get the dtype instance associated with an array + or dtype object. + + Parameters + ---------- + arr_or_dtype : ndarray, Series, dtype, type + The array-like or dtype object whose dtype we want to extract. + + Returns + ------- + obj_dtype : The extract dtype instance from the + passed in array or dtype object. + + Raises + ------ + TypeError : The passed in object is None. + """ + if arr_or_dtype is None: - raise TypeError + raise TypeError("Cannot deduce dtype from null object") if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype elif isinstance(arr_or_dtype, type): @@ -385,6 +552,21 @@ def _get_dtype(arr_or_dtype): def _get_dtype_type(arr_or_dtype): + """ + Get the type (NOT dtype) instance associated with + an array or dtype object. + + Parameters + ---------- + arr_or_dtype : ndarray, Series, dtype, type + The array-like or dtype object whose type we want to extract. + + Returns + ------- + obj_type : The extract type instance from the + passed in array or dtype object. + """ + if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type elif isinstance(arr_or_dtype, type): @@ -410,16 +592,27 @@ def _get_dtype_type(arr_or_dtype): def _get_dtype_from_object(dtype): - """Get a numpy dtype.type-style object. This handles the datetime64[ns] - and datetime64[ns, TZ] compat + """ + Get a numpy dtype.type-style object for a dtype object. - Notes - ----- - If nothing can be found, returns ``object``. + This methods also includes handling of the datetime64[ns] and + datetime64[ns, TZ] objects. + + If no dtype can be found, we return ``object``. + + Parameters + ---------- + dtype : dtype, type + The dtype object whose numpy dtype.type-style + object we want to extract. + + Returns + ------- + dtype_object : The extracted numpy dtype.type-style object. 
""" - # type object from a dtype if isinstance(dtype, type) and issubclass(dtype, np.generic): + # Type object from a dtype return dtype elif is_categorical(dtype): return CategoricalDtype().type @@ -429,7 +622,7 @@ def _get_dtype_from_object(dtype): try: _validate_date_like_dtype(dtype) except TypeError: - # should still pass if we don't have a datelike + # Should still pass if we don't have a date-like pass return dtype.type elif isinstance(dtype, string_types): @@ -444,10 +637,11 @@ def _get_dtype_from_object(dtype): try: return _get_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): - # handles cases like _get_dtype(int) - # i.e., python objects that are valid dtypes (unlike user-defined - # types, in general) - # TypeError handles the float16 typecode of 'e' + # Handles cases like _get_dtype(int) i.e., + # Python objects that are valid dtypes + # (unlike user-defined types, in general) + # + # TypeError handles the float16 type code of 'e' # further handle internal types pass @@ -455,6 +649,21 @@ def _get_dtype_from_object(dtype): def _validate_date_like_dtype(dtype): + """ + Check whether the dtype is a date-like dtype. Raises an error if invalid. + + Parameters + ---------- + dtype : dtype, type + The dtype to check. + + Raises + ------ + TypeError : The dtype could not be casted to a date-like dtype. + ValueError : The dtype is an illegal date-like dtype (e.g. the + the frequency provided is too specific) + """ + try: typ = np.datetime_data(dtype)[0] except ValueError as e: From d60f490b4e764ab9af170adf4d88639014804fe6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 7 Apr 2017 18:59:27 -0400 Subject: [PATCH 22/22] DOC: minor doc corrections --- ci/requirements-3.5_DOC.run | 1 + doc/source/computation.rst | 4 ++-- doc/source/dsintro.rst | 4 ++-- doc/source/whatsnew/v0.20.0.txt | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ci/requirements-3.5_DOC.run b/ci/requirements-3.5_DOC.run index 644a16f51f4b6..740d4714e96b4 100644 --- a/ci/requirements-3.5_DOC.run +++ b/ci/requirements-3.5_DOC.run @@ -18,4 +18,5 @@ sqlalchemy numexpr bottleneck statsmodels +xarray pyqt=4.11.4 diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 2423f1a342994..a37cbc96b2d8c 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -557,7 +557,7 @@ can even be omitted: correls.loc['2002-09-22':] You can efficiently retrieve the time series of correlations between two -columns using ``.loc`` indexing: +columns by reshaping and indexing: .. ipython:: python :suppress: @@ -567,7 +567,7 @@ columns using ``.loc`` indexing: .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.loc[:, ('A', 'C')].plot() + correls.unstack(1)[('A', 'C')].plot() .. _stats.aggregate: diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 2b11b23b1d1c2..0086cb0f94747 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -953,8 +953,8 @@ datatype support, and manipulation routines. As a result, supporting efficient i routines for ``Series``, ``DataFrame`` and ``Panel`` has contributed to an increasingly fragmented and difficult-to-understand codebase. -The 3-d structure of a ``Panel`` is much less common for many types of data analysis, -than the 1-d of the ``Series`` or the 2-D of the ``DataFrame``. Going forward it makes sense for +The 3-D structure of a ``Panel`` is much less common for many types of data analysis, +than the 1-D of the ``Series`` or the 2-D of the ``DataFrame``. 
Going forward it makes sense for pandas to focus on these areas exclusively. Oftentimes, one can simply use a MultiIndex ``DataFrame`` for easily working with higher dimensional data. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 436d51da6e873..d571c0f2d9620 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -435,7 +435,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data are -with a ``MultiIndex``on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion. See the documentation :ref:`Deprecate Panel `. (:issue:`13563`). .. ipython:: python @@ -874,8 +874,8 @@ Window Binary Corr/Cov operations return a MultiIndex DataFrame A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object, will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, -see :ref:`here <_whatsnew_0200.api_breaking.deprecate_panel>`. These are equivalent in function, -but MultiIndexed ``DataFrame`` s enjoy more support in pandas. +see :ref:`here `. These are equivalent in function, +but a MultiIndexed ``DataFrame`` enjoys more support in pandas. See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) .. ipython:: python