diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f392687a0a3fd..1a500bdc65ce3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -26,6 +26,7 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ +- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) - ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 39a5da0aa6912..0dc6a7a1e9c7b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -6,6 +6,8 @@ import warnings from datetime import datetime, timedelta from functools import partial +import inspect +import collections import numpy as np from pandas._libs import lib, tslib @@ -479,6 +481,42 @@ def _dict_compat(d): for key, value in iteritems(d)) +def standardize_mapping(into): + """ + Helper function to standardize a supplied mapping. + + .. versionadded:: 0.21.0 + + Parameters + ---------- + into : instance or subclass of collections.Mapping + Must be a class, an initialized collections.defaultdict, + or an instance of a collections.Mapping subclass. + + Returns + ------- + mapping : a collections.Mapping subclass or other constructor + a callable object that can accept an iterator to create + the desired Mapping. 
+ + See Also + -------- + DataFrame.to_dict + Series.to_dict + """ + if not inspect.isclass(into): + if isinstance(into, collections.defaultdict): + return partial( + collections.defaultdict, into.default_factory) + into = type(into) + if not issubclass(into, collections.Mapping): + raise TypeError('unsupported type: {}'.format(into)) + elif into == collections.defaultdict: + raise TypeError( + 'to_dict() only accepts initialized defaultdicts') + return into + + def sentinel_factory(): class Sentinel(object): pass diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d437102e4d18..3b0cc5619a1cd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -63,7 +63,8 @@ _default_index, _values_from_object, _maybe_box_datetimelike, - _dict_compat) + _dict_compat, + standardize_mapping) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, @@ -860,7 +861,7 @@ def from_dict(cls, data, orient='columns', dtype=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_dict(self, orient='dict'): + def to_dict(self, orient='dict', into=dict): """Convert DataFrame to dictionary. Parameters @@ -882,32 +883,85 @@ def to_dict(self, orient='dict'): Abbreviations are allowed. `s` indicates `series` and `sp` indicates `split`. + into : class, default dict + The collections.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + .. 
versionadded:: 0.21.0 + Returns ------- - result : dict like {column -> {index -> value}} + result : collections.Mapping like {column -> {index -> value}} + + Examples + -------- + >>> df = pd.DataFrame( + {'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['a', 'b']) + >>> df + col1 col2 + a 1 0.50 + b 2 0.75 + >>> df.to_dict() + {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}} + + You can specify the return orientation. + + >>> df.to_dict('series') + {'col1': a 1 + b 2 + Name: col1, dtype: int64, 'col2': a 0.50 + b 0.75 + Name: col2, dtype: float64} + >>> df.to_dict('split') + {'columns': ['col1', 'col2'], + 'data': [[1.0, 0.5], [2.0, 0.75]], + 'index': ['a', 'b']} + >>> df.to_dict('records') + [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}] + >>> df.to_dict('index') + {'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}} + + You can also specify the mapping type. + + >>> from collections import OrderedDict, defaultdict + >>> df.to_dict(into=OrderedDict) + OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])), + ('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))]) + + If you want a `defaultdict`, you need to initialize it: + + >>> dd = defaultdict(list) + >>> df.to_dict('records', into=dd) + [defaultdict(<class 'list'>, {'col2': 0.5, 'col1': 1.0}), + defaultdict(<class 'list'>, {'col2': 0.75, 'col1': 2.0})] """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " "columns will be omitted.", UserWarning) + # GH16122 + into_c = standardize_mapping(into) if orient.lower().startswith('d'): - return dict((k, v.to_dict()) for k, v in compat.iteritems(self)) + return into_c( + (k, v.to_dict(into)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('l'): - return dict((k, v.tolist()) for k, v in compat.iteritems(self)) + return into_c((k, v.tolist()) for k, v in compat.iteritems(self)) elif orient.lower().startswith('sp'): - return {'index': self.index.tolist(), - 'columns': self.columns.tolist(), - 'data': 
lib.map_infer(self.values.ravel(), - _maybe_box_datetimelike) - .reshape(self.values.shape).tolist()} + return into_c((('index', self.index.tolist()), + ('columns', self.columns.tolist()), + ('data', lib.map_infer(self.values.ravel(), + _maybe_box_datetimelike) + .reshape(self.values.shape).tolist()))) elif orient.lower().startswith('s'): - return dict((k, _maybe_box_datetimelike(v)) - for k, v in compat.iteritems(self)) + return into_c((k, _maybe_box_datetimelike(v)) + for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [dict((k, _maybe_box_datetimelike(v)) - for k, v in zip(self.columns, row)) + return [into_c((k, _maybe_box_datetimelike(v)) + for k, v in zip(self.columns, row)) for row in self.values] elif orient.lower().startswith('i'): - return dict((k, v.to_dict()) for k, v in self.iterrows()) + return into_c((k, v.to_dict(into)) for k, v in self.iterrows()) else: raise ValueError("orient '%s' not understood" % orient) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6ec163bbaa73d..129f291e5f843 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -46,7 +46,8 @@ _maybe_match_name, SettingWithCopyError, _maybe_box_datetimelike, - _dict_compat) + _dict_compat, + standardize_mapping) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, Float64Index, _ensure_index) from pandas.core.indexing import check_bool_indexer, maybe_convert_indices @@ -1074,15 +1075,39 @@ def tolist(self): """ Convert Series to a nested list """ return list(self.asobject) - def to_dict(self): + def to_dict(self, into=dict): """ - Convert Series to {label -> value} dict + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + .. 
versionadded:: 0.21.0 Returns ------- - value_dict : dict - """ - return dict(compat.iteritems(self)) + value_dict : collections.Mapping + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(dd) + defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + # GH16122 + into_c = standardize_mapping(into) + return into_c(compat.iteritems(self)) def to_frame(self, name=None): """ diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index e0cdca7904db7..34dd138ee1c80 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import pytest +import collections import numpy as np from pandas import compat @@ -13,50 +14,6 @@ class TestDataFrameConvertTo(TestData): - def test_to_dict(self): - test_data = { - 'A': {'1': 1, '2': 2}, - 'B': {'1': '1', '2': '2', '3': '3'}, - } - recons_data = DataFrame(test_data).to_dict() - - for k, v in compat.iteritems(test_data): - for k2, v2 in compat.iteritems(v): - assert v2 == recons_data[k][k2] - - recons_data = DataFrame(test_data).to_dict("l") - - for k, v in compat.iteritems(test_data): - for k2, v2 in compat.iteritems(v): - assert v2 == recons_data[k][int(k2) - 1] - - recons_data = DataFrame(test_data).to_dict("s") - - for k, v in compat.iteritems(test_data): - for k2, v2 in compat.iteritems(v): - assert v2 == recons_data[k][k2] - - recons_data = DataFrame(test_data).to_dict("sp") - expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'], - 'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]} - tm.assert_dict_equal(recons_data, expected_split) - - recons_data = DataFrame(test_data).to_dict("r") - expected_records = [{'A': 1.0, 'B': '1'}, - {'A': 2.0, 'B': '2'}, - {'A': np.nan, 'B': '3'}] - assert 
isinstance(recons_data, list) - assert len(recons_data) == 3 - for l, r in zip(recons_data, expected_records): - tm.assert_dict_equal(l, r) - - # GH10844 - recons_data = DataFrame(test_data).to_dict("i") - - for k, v in compat.iteritems(test_data): - for k2, v2 in compat.iteritems(v): - assert v2 == recons_data[k2][k] - def test_to_dict_timestamp(self): # GH11247 @@ -190,17 +147,85 @@ def test_to_records_with_unicode_column_names(self): ) tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize('mapping', [ + dict, + collections.defaultdict(list), + collections.OrderedDict]) + def test_to_dict(self, mapping): + test_data = { + 'A': {'1': 1, '2': 2}, + 'B': {'1': '1', '2': '2', '3': '3'}, + } + + # GH16122 + recons_data = DataFrame(test_data).to_dict(into=mapping) + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + assert (v2 == recons_data[k][k2]) + + recons_data = DataFrame(test_data).to_dict("l", mapping) + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + assert (v2 == recons_data[k][int(k2) - 1]) + + recons_data = DataFrame(test_data).to_dict("s", mapping) + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + assert (v2 == recons_data[k][k2]) + + recons_data = DataFrame(test_data).to_dict("sp", mapping) + expected_split = {'columns': ['A', 'B'], 'index': ['1', '2', '3'], + 'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']]} + tm.assert_dict_equal(recons_data, expected_split) + + recons_data = DataFrame(test_data).to_dict("r", mapping) + expected_records = [{'A': 1.0, 'B': '1'}, + {'A': 2.0, 'B': '2'}, + {'A': np.nan, 'B': '3'}] + assert isinstance(recons_data, list) + assert (len(recons_data) == 3) + for l, r in zip(recons_data, expected_records): + tm.assert_dict_equal(l, r) + + # GH10844 + recons_data = DataFrame(test_data).to_dict("i") + + for k, v in compat.iteritems(test_data): + for k2, v2 in compat.iteritems(v): + assert (v2 == recons_data[k2][k]) + + df 
= DataFrame(test_data) + df['duped'] = df[df.columns[0]] + recons_data = df.to_dict("i") + comp_data = test_data.copy() + comp_data['duped'] = comp_data[df.columns[0]] + for k, v in compat.iteritems(comp_data): + for k2, v2 in compat.iteritems(v): + assert (v2 == recons_data[k2][k]) + + @pytest.mark.parametrize('mapping', [ + list, + collections.defaultdict, + []]) + def test_to_dict_errors(self, mapping): + # GH16122 + df = DataFrame(np.random.randn(3, 3)) + with pytest.raises(TypeError): + df.to_dict(into=mapping) -@pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern']) -def test_to_records_datetimeindex_with_tz(tz): - # GH13937 - dr = date_range('2016-01-01', periods=10, - freq='S', tz=tz) + @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern']) + def test_to_records_datetimeindex_with_tz(self, tz): + # GH13937 + dr = date_range('2016-01-01', periods=10, + freq='S', tz=tz) - df = DataFrame({'datetime': dr}, index=dr) + df = DataFrame({'datetime': dr}, index=dr) - expected = df.to_records() - result = df.tz_convert("UTC").to_records() + expected = df.to_records() + result = df.tz_convert("UTC").to_records() - # both converted to UTC, so they are equal - tm.assert_numpy_array_equal(result, expected) + # both converted to UTC, so they are equal + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index d1c9e5a6d16cf..503185de427f1 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -2,6 +2,8 @@ # pylint: disable-msg=E1101,W0612 from datetime import datetime +import collections +import pytest import numpy as np import pandas as pd @@ -126,9 +128,6 @@ def test_to_frame(self): dict(testdifferent=self.ts.values), index=self.ts.index) assert_frame_equal(rs, xp) - def test_to_dict(self): - tm.assert_series_equal(Series(self.ts.to_dict(), name='ts'), self.ts) - def test_timeseries_periodindex(self): # GH2891 from pandas import period_range @@ -167,6 +166,19 
@@ class SubclassedFrame(DataFrame): expected = SubclassedFrame({'X': [1, 2, 3]}) assert_frame_equal(result, expected) + @pytest.mark.parametrize('mapping', ( + dict, + collections.defaultdict(list), + collections.OrderedDict)) + def test_to_dict(self, mapping): + # GH16122 + ts = TestData().ts + tm.assert_series_equal( + Series(ts.to_dict(mapping), name='ts'), ts) + from_method = Series(ts.to_dict(collections.Counter)) + from_constructor = Series(collections.Counter(ts.iteritems())) + tm.assert_series_equal(from_method, from_constructor) + class TestSeriesToList(TestData): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index d7dbaccb87ee8..4893f99f7cf0f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import pytest +import collections +from functools import partial import numpy as np @@ -195,3 +197,26 @@ def test_dict_compat(): assert (com._dict_compat(data_datetime64) == expected) assert (com._dict_compat(expected) == expected) assert (com._dict_compat(data_unchanged) == data_unchanged) + + +def test_standardize_mapping(): + # No uninitialized defaultdicts + with pytest.raises(TypeError): + com.standardize_mapping(collections.defaultdict) + + # No non-mapping subtypes, instance + with pytest.raises(TypeError): + com.standardize_mapping([]) + + # No non-mapping subtypes, class + with pytest.raises(TypeError): + com.standardize_mapping(list) + + fill = {'bad': 'data'} + assert (com.standardize_mapping(fill) == dict) + + # Convert instance to type + assert (com.standardize_mapping({}) == dict) + + dd = collections.defaultdict(list) + assert isinstance(com.standardize_mapping(dd), partial)