BUG: handle non-numeric aggregates in pure python Series aggregation, GH #612
pandas-dev · Jan 12, 2012 · 71e9046 · 71e9046
1 parent fbb1102
commit 71e9046
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 7 deletions.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -414,15 +414,26 @@ def _aggregate_series(self, obj, func, group_index, ngroups):
 
     def _aggregate_series_pure_python(self, obj, func, ngroups):
         counts = np.zeros(ngroups, dtype=int)
-        result = np.empty(ngroups, dtype=float)
-        result.fill(np.nan)
+        result = None
 
         for label, group in self._generator_factory(obj):
             if group is None:
                 continue
+            res = func(group)
+            if result is None:
+                try:
+                    assert(not isinstance(res, np.ndarray))
+                    assert(not (isinstance(res, list) and
+                                len(res) == len(self.dummy)))
+
+                    result = np.empty(ngroups, dtype='O')
+                except Exception:
+                    raise ValueError('function does not reduce')
+
             counts[label] = group.shape[0]
-            result[label] = func(group)
+            result[label] = res
 
+        result = lib.maybe_convert_objects(result)
         return result, counts
 
     def _python_apply_general(self, func, *args, **kwargs):

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -343,6 +343,28 @@ def test_series_agg_multikey(self):
         expected = grouped.sum()
         assert_series_equal(result, expected)
 
+    def test_series_agg_multi_pure_python(self):
+        data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
+                                 'bar', 'bar', 'bar', 'bar',
+                                 'foo', 'foo', 'foo'],
+                          'B' : ['one', 'one', 'one', 'two',
+                                 'one', 'one', 'one', 'two',
+                                 'two', 'two', 'one'],
+                          'C' : ['dull', 'dull', 'shiny', 'dull',
+                                 'dull', 'shiny', 'shiny', 'dull',
+                                 'shiny', 'shiny', 'shiny'],
+                          'D' : np.random.randn(11),
+                          'E' : np.random.randn(11),
+                          'F' : np.random.randn(11)})
+
+        def bad(x):
+            assert(len(x.base) == len(x))
+            return 'foo'
+
+        result = data.groupby(['A', 'B']).agg(bad)
+        expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
+        assert_frame_equal(result, expected)
+
     def test_series_index_name(self):
         grouped = self.df.ix[:, ['C']].groupby(self.df['A'])
         result = grouped.agg(lambda x: x.mean())

diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py
@@ -4,6 +4,16 @@
 common_setup = """from pandas_vb_common import *
 """
 
+#----------------------------------------------------------------------
+# nanops
+
+setup = common_setup + """
+s = Series(np.random.randn(100000))
+s[::2] = np.nan
+"""
+
+stat_ops_series_std = Benchmark("s.std()", setup)
+
 #----------------------------------------------------------------------
 # ops by level
 
@@ -19,21 +29,17 @@
 
 stat_ops_level_frame_sum = \
     Benchmark("df.sum(level=1)", setup,
-              name='stat_ops_level_frame_sum',
               start_date=datetime(2011, 11, 15))
 
 stat_ops_level_frame_sum_multiple = \
     Benchmark("df.sum(level=[0, 1])", setup, repeat=1,
-              name='stat_ops_level_frame_sum_multiple',
               start_date=datetime(2011, 11, 15))
 
 stat_ops_level_series_sum = \
     Benchmark("df[1].sum(level=1)", setup,
-              name='stat_ops_level_series_sum',
               start_date=datetime(2011, 11, 15))
 
 stat_ops_level_series_sum_multiple = \
     Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1,
-              name='stat_ops_level_series_sum_multiple',
               start_date=datetime(2011, 11, 15))