Skip to content

Commit

Permalink
PERF: SparseDataFrame._init_dict uses intermediary dict, not DataFrame (#16883)
Browse files Browse the repository at this point in the history

Closes gh-16773.
  • Loading branch information
kernc authored and gfyoung committed Jul 17, 2017
1 parent ec927a4 commit 0bd871f
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 6 deletions.
8 changes: 8 additions & 0 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import repeat

from .pandas_vb_common import *
import scipy.sparse
from pandas import SparseSeries, SparseDataFrame
Expand Down Expand Up @@ -27,6 +29,12 @@ class sparse_frame_constructor(object):
def time_sparse_frame_constructor(self):
SparseDataFrame(columns=np.arange(100), index=np.arange(1000))

def time_sparse_from_scipy(self):
SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005))

def time_sparse_from_dict(self):
SparseDataFrame(dict(zip(range(1000), repeat([0]))))


class sparse_series_from_coo(object):
goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)


.. _whatsnew_0210.bug_fixes:
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def _init_dict(self, data, index, columns, dtype=None):
sp_maker = lambda x: SparseArray(x, kind=self._default_kind,
fill_value=self._default_fill_value,
copy=True, dtype=dtype)
sdict = DataFrame()
sdict = {}
for k, v in compat.iteritems(data):
if isinstance(v, Series):
# Force alignment, no copy necessary
Expand All @@ -163,11 +163,8 @@ def _init_dict(self, data, index, columns, dtype=None):

# TODO: figure out how to handle this case, all nan's?
# add in any other columns we want to have (completeness)
nan_vec = np.empty(len(index))
nan_vec.fill(nan)
for c in columns:
if c not in sdict:
sdict[c] = sp_maker(nan_vec)
nan_arr = sp_maker(np.full(len(index), np.nan))
sdict.update((c, nan_arr) for c in columns if c not in sdict)

return to_manager(sdict, columns, index)

Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,10 @@ def test_dataframe_dummies_preserve_categorical_dtype(self):
class TestGetDummiesSparse(TestGetDummies):
sparse = True

@pytest.mark.xfail(reason='nan in index is problematic (GH 16894)')
def test_include_na(self):
super(TestGetDummiesSparse, self).test_include_na()


class TestMakeAxisDummies(object):

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/sparse/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,8 @@ def test_as_blocks(self):
assert list(df_blocks.keys()) == ['float64']
tm.assert_frame_equal(df_blocks['float64'], df)

@pytest.mark.xfail(reason='nan column names in _init_dict problematic '
'(GH 16894)')
def test_nan_columnname(self):
# GH 8822
nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan])
Expand Down

0 comments on commit 0bd871f

Please sign in to comment.