diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index a808b83119a402..c261891aa8897f 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1160,6 +1160,7 @@ Reshaping
 - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
 - Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
 - Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
+- Stop :func:`concat` and :meth:`DataFrame.append` from sorting columns by default. Use ``sort=True`` to retain the old behavior (:issue:`4588`)
 
 Other
 ^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 30521760327b46..ae9d240afcb936 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list(list lists):
+def fast_unique_multiple_list(list lists, bint sort=True):
     cdef:
         list buf
         Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
             if val not in table:
                 table[val] = stub
                 uniques.append(val)
-    try:
-        uniques.sort()
-    except Exception:
-        pass
+    if sort:
+        try:
+            uniques.sort()
+        except Exception:
+            pass
 
     return uniques
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f33ef9597f4569..f82305ac3913a7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5982,7 +5982,8 @@ def infer(x):
     # ----------------------------------------------------------------------
     # Merging / joining methods
 
-    def append(self, other, ignore_index=False, verify_integrity=False):
+    def append(self, other, ignore_index=False,
+               verify_integrity=False, sort=False):
         """
         Append rows of `other` to the end of this frame, returning a new
         object. Columns not in this frame are added as new columns.
@@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
             If True, do not use the index labels.
         verify_integrity : boolean, default False
             If True, raise ValueError on creating index with duplicates.
+        sort : boolean, default False
+            Sort columns if the given object doesn't have the same columns
 
         Returns
         -------
@@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
         else:
             to_concat = [self, other]
         return concat(to_concat, ignore_index=ignore_index,
-                      verify_integrity=verify_integrity)
+                      verify_integrity=verify_integrity,
+                      sort=sort)
 
     def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
              sort=False):
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 2e5ec8b554ce76..75232e3db7e550 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -31,17 +31,17 @@
            '_all_indexes_same']
 
 
-def _get_objs_combined_axis(objs, intersect=False, axis=0):
+def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
     # Extract combined index: return intersection or union (depending on the
     # value of "intersect") of indexes on given axis, or None if all objects
     # lack indexes (e.g. they are numpy arrays)
     obs_idxes = [obj._get_axis(axis) for obj in objs
                  if hasattr(obj, '_get_axis')]
     if obs_idxes:
-        return _get_combined_index(obs_idxes, intersect=intersect)
+        return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
 
 
-def _get_combined_index(indexes, intersect=False):
+def _get_combined_index(indexes, intersect=False, sort=True):
     # TODO: handle index names!
     indexes = com._get_distinct_objs(indexes)
     if len(indexes) == 0:
@@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
         for other in indexes[1:]:
             index = index.intersection(other)
         return index
-    union = _union_indexes(indexes)
+    union = _union_indexes(indexes, sort=sort)
     return _ensure_index(union)
 
 
-def _union_indexes(indexes):
+def _union_indexes(indexes, sort=True):
     if len(indexes) == 0:
         raise AssertionError('Must have at least 1 Index to union')
     if len(indexes) == 1:
@@ -74,7 +74,8 @@ def conv(i):
                 i = i.tolist()
             return i
 
-        return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
+        return Index(
+            lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
 
     if kind == 'special':
         result = indexes[0]
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index 20f4384a3d6984..3630edbcbf58f3 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -20,7 +20,7 @@
 
 def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
            keys=None, levels=None, names=None, verify_integrity=False,
-           copy=True):
+           sort=False, copy=True):
     """
     Concatenate pandas objects along a particular axis with optional set logic
     along the other axes.
@@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
     verify_integrity : boolean, default False
         Check whether the new concatenated axis contains duplicates. This can
         be very expensive relative to the actual data concatenation
+    sort : boolean, default False
+        Sort columns if the columns of the passed objects are not all the same
     copy : boolean, default True
         If False, do not copy data unnecessarily
 
@@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
                        ignore_index=ignore_index, join=join,
                        keys=keys, levels=levels, names=names,
                        verify_integrity=verify_integrity,
-                       copy=copy)
+                       copy=copy, sort=sort)
     return op.get_result()
 
 
@@ -220,7 +222,8 @@ class _Concatenator(object):
 
     def __init__(self, objs, axis=0, join='outer', join_axes=None,
                  keys=None, levels=None, names=None,
-                 ignore_index=False, verify_integrity=False, copy=True):
+                 ignore_index=False, verify_integrity=False, copy=True,
+                 sort=False):
         if isinstance(objs, (NDFrame, compat.string_types)):
             raise TypeError('first argument must be an iterable of pandas '
                             'objects, you passed an object of type '
@@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
         self.keys = keys
         self.names = names or getattr(keys, 'names', None)
         self.levels = levels
+        self.sort = sort
 
         self.ignore_index = ignore_index
         self.verify_integrity = verify_integrity
@@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
         data_axis = self.objs[0]._get_block_manager_axis(i)
         try:
             return _get_objs_combined_axis(self.objs, axis=data_axis,
-                                           intersect=self.intersect)
+                                           intersect=self.intersect,
+                                           sort=self.sort)
         except IndexError:
             types = [type(x).__name__ for x in self.objs]
             raise TypeError("Cannot concatenate list of {types}"
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 437b4179c580a3..ed3dff56254aaf 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -852,8 +852,9 @@ def test_append_dtype_coerce(self):
                                   dt.datetime(2013, 1, 2, 0, 0),
                                   dt.datetime(2013, 1, 3, 0, 0),
                                   dt.datetime(2013, 1, 4, 0, 0)],
-                                  name='start_time')], axis=1)
-        result = df1.append(df2, ignore_index=True)
+                                  name='start_time')],
+                          axis=1, sort=True)
+        result = df1.append(df2, ignore_index=True, sort=True)
         assert_frame_equal(result, expected)
 
     def test_append_missing_column_proper_upcast(self):
@@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self):
         frame1.index = Index(["x", "y", "z"])
         frame2.index = Index(["x", "y", "q"])
 
-        v1 = concat([frame1, frame2], axis=1, ignore_index=True)
+        v1 = concat([frame1, frame2], axis=1,
+                    ignore_index=True, sort=True)
 
         nan = np.nan
         expected = DataFrame([[nan, nan, nan, 4.3],
@@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self):
         # must reindex, #2603
         s = Series(randn(3), index=['c', 'a', 'b'], name='A')
         s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
-        result = concat([s, s2], axis=1)
+        result = concat([s, s2], axis=1, sort=True)
         expected = DataFrame({'A': s, 'B': s2})
         assert_frame_equal(result, expected)
 
@@ -2070,8 +2072,6 @@ def test_concat_order(self):
                 for i in range(100)]
         result = pd.concat(dfs).columns
        expected = dfs[0].columns
-        if PY2:
-            expected = expected.sort_values()
         tm.assert_index_equal(result, expected)
 
     def test_concat_datetime_timezone(self):
@@ -2155,3 +2155,24 @@ def test_concat_empty_and_non_empty_series_regression():
     expected = s1
     result = pd.concat([s1, s2])
     tm.assert_series_equal(result, expected)
+
+
+def test_concat_preserve_column_order_differing_columns():
+    # GH 4588 regression test
+    # for new columns in concat
+    dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
+    dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])
+    result = pd.concat([dfa, dfb])
+    assert result.columns.tolist() == ['C', 'A', 'Z']
+
+
+def test_concat_preserve_column_order_uneven_data():
+    # GH 4588 regression test
+    # add to column, concat with uneven data
+    df = pd.DataFrame()
+    df['b'] = [1, 2, 3]
+    df['c'] = [1, 2, 3]
+    df['a'] = [1, 2, 3]
+    df2 = pd.DataFrame({'a': [4, 5]})
+    df3 = pd.concat([df, df2])
+    assert df3.columns.tolist() == ['b', 'c', 'a']
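
For reviewers, a minimal usage sketch of the behavior change this patch introduces (not part of the diff; the frame names here are illustrative only). It mirrors the new regression tests above: with the new default ``sort=False``, ``concat`` and ``DataFrame.append`` keep the columns in order of first appearance, while ``sort=True`` restores the previously sorted output.

    import pandas as pd

    df1 = pd.DataFrame(columns=['c', 'a'], data=[[1, 2]])
    df2 = pd.DataFrame(columns=['c', 'b'], data=[[3, 4]])

    # New default: column order of the inputs is preserved
    print(pd.concat([df1, df2], sort=False).columns.tolist())  # ['c', 'a', 'b']
    print(df1.append(df2, sort=False).columns.tolist())        # ['c', 'a', 'b']

    # Opt back in to the old, sorted behavior
    print(pd.concat([df1, df2], sort=True).columns.tolist())   # ['a', 'b', 'c']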