Skip to content

Commit

Permalink
BUG: handle non-numeric aggregates in pure python Series aggregation,…
Browse files Browse the repository at this point in the history
… GH #612
  • Loading branch information
wesm committed Jan 12, 2012
1 parent fbb1102 commit 71e9046
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 7 deletions.
17 changes: 14 additions & 3 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,15 +414,26 @@ def _aggregate_series(self, obj, func, group_index, ngroups):

def _aggregate_series_pure_python(self, obj, func, ngroups):
counts = np.zeros(ngroups, dtype=int)
result = np.empty(ngroups, dtype=float)
result.fill(np.nan)
result = None

for label, group in self._generator_factory(obj):
if group is None:
continue
res = func(group)
if result is None:
try:
assert(not isinstance(res, np.ndarray))
assert(not (isinstance(res, list) and
len(res) == len(self.dummy)))

result = np.empty(ngroups, dtype='O')
except Exception:
raise ValueError('function does not reduce')

counts[label] = group.shape[0]
result[label] = func(group)
result[label] = res

result = lib.maybe_convert_objects(result)
return result, counts

def _python_apply_general(self, func, *args, **kwargs):
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,28 @@ def test_series_agg_multikey(self):
expected = grouped.sum()
assert_series_equal(result, expected)

def test_series_agg_multi_pure_python(self):
    # GH #612: an aggregate that returns a non-numeric value must give the
    # same frame as a plain lambda returning that value.
    key_a = ['foo'] * 4 + ['bar'] * 4 + ['foo'] * 3
    key_b = ['one', 'one', 'one', 'two'] * 2 + ['two', 'two', 'one']
    finish = ['dull', 'dull', 'shiny', 'dull',
              'dull', 'shiny', 'shiny', 'dull',
              'shiny', 'shiny', 'shiny']

    # Random columns are drawn in D, E, F order.
    col_d = np.random.randn(11)
    col_e = np.random.randn(11)
    col_f = np.random.randn(11)

    data = DataFrame({'A': key_a,
                      'B': key_b,
                      'C': finish,
                      'D': col_d,
                      'E': col_e,
                      'F': col_f})

    def bad(x):
        # NOTE(review): presumably this only holds when x owns its whole
        # buffer, so the fast path fails and the pure-Python fallback is
        # exercised -- confirm against the groupby implementation.
        assert(len(x.base) == len(x))
        return 'foo'

    expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
    result = data.groupby(['A', 'B']).agg(bad)
    assert_frame_equal(result, expected)

def test_series_index_name(self):
grouped = self.df.ix[:, ['C']].groupby(self.df['A'])
result = grouped.agg(lambda x: x.mean())
Expand Down
14 changes: 10 additions & 4 deletions vb_suite/stat_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
# Setup source that star-imports pandas_vb_common; it is prepended to the
# benchmark-specific setup string below.
common_setup = """from pandas_vb_common import *
"""

#----------------------------------------------------------------------
# nanops

# Setup for the Series benchmark: 100000 random-normal values in which every
# second element, starting at position 0, is overwritten with NaN.
setup = common_setup + """
s = Series(np.random.randn(100000))
s[::2] = np.nan
"""

# Benchmark of the statement "s.std()" run against the half-NaN Series
# built by the setup string above.
stat_ops_series_std = Benchmark("s.std()", setup)

#----------------------------------------------------------------------
# ops by level

Expand All @@ -19,21 +29,17 @@

# NOTE(review): `setup` (and the `df` it builds) is defined in lines outside
# this view -- presumably a frame with a multi-level index; confirm.
# NOTE(review): `repeat` and `start_date` are Benchmark options whose exact
# semantics are defined by the benchmark harness, not in this file.

# Frame sum over a single index level.
stat_ops_level_frame_sum = \
Benchmark("df.sum(level=1)", setup,
name='stat_ops_level_frame_sum',
start_date=datetime(2011, 11, 15))

# Frame sum over two index levels at once.
stat_ops_level_frame_sum_multiple = \
Benchmark("df.sum(level=[0, 1])", setup, repeat=1,
name='stat_ops_level_frame_sum_multiple',
start_date=datetime(2011, 11, 15))

# Sum of the single column df[1] over one index level.
stat_ops_level_series_sum = \
Benchmark("df[1].sum(level=1)", setup,
name='stat_ops_level_series_sum',
start_date=datetime(2011, 11, 15))

# Sum of the single column df[1] over two index levels at once.
stat_ops_level_series_sum_multiple = \
Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1,
name='stat_ops_level_series_sum_multiple',
start_date=datetime(2011, 11, 15))

0 comments on commit 71e9046

Please sign in to comment.