Skip to content

Commit

Permalink
Stop concat from attempting to sort mismatched columns by default
Browse files Browse the repository at this point in the history
Preserve column order upon concatenation to obey
least astonishment principle.

Allow old behavior to be enabled by adding a boolean switch, sort, to
concat and DataFrame.append; the switch is disabled by default.

Close #4588
  • Loading branch information
brycepg committed Apr 5, 2018
1 parent 6d610a4 commit bcf835a
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 23 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,7 @@ Reshaping
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)
- Bug in :class:`Series` constructor with a ``dtype=str``, previously raised in some cases (:issue:`19853`)
- Stop :func:`concat` and :meth:`DataFrame.append` from sorting columns by default when the columns of the passed objects do not match. Use ``sort=True`` to retain the old behavior (:issue:`4588`)

Other
^^^^^
Expand Down
11 changes: 6 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):

@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list(list lists):
def fast_unique_multiple_list(list lists, bint sort=True):
cdef:
list buf
Py_ssize_t k = len(lists)
Expand All @@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
if val not in table:
table[val] = stub
uniques.append(val)
try:
uniques.sort()
except Exception:
pass
if sort:
try:
uniques.sort()
except Exception:
pass

return uniques

Expand Down
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5982,7 +5982,8 @@ def infer(x):
# ----------------------------------------------------------------------
# Merging / joining methods

def append(self, other, ignore_index=False, verify_integrity=False):
def append(self, other, ignore_index=False,
verify_integrity=False, sort=False):
"""
Append rows of `other` to the end of this frame, returning a new
object. Columns not in this frame are added as new columns.
Expand All @@ -5995,6 +5996,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
If True, do not use the index labels.
verify_integrity : boolean, default False
If True, raise ValueError on creating index with duplicates.
sort : boolean, default False
Sort the resulting columns if the columns of `other` do not match the columns of this frame.
Returns
-------
Expand Down Expand Up @@ -6103,7 +6106,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
else:
to_concat = [self, other]
return concat(to_concat, ignore_index=ignore_index,
verify_integrity=verify_integrity)
verify_integrity=verify_integrity,
sort=sort)

def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
sort=False):
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@
'_all_indexes_same']


def _get_objs_combined_axis(objs, intersect=False, axis=0):
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
# Extract combined index: return intersection or union (depending on the
# value of "intersect") of indexes on given axis, or None if all objects
# lack indexes (e.g. they are numpy arrays)
obs_idxes = [obj._get_axis(axis) for obj in objs
if hasattr(obj, '_get_axis')]
if obs_idxes:
return _get_combined_index(obs_idxes, intersect=intersect)
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)


def _get_combined_index(indexes, intersect=False):
def _get_combined_index(indexes, intersect=False, sort=True):
# TODO: handle index names!
indexes = com._get_distinct_objs(indexes)
if len(indexes) == 0:
Expand All @@ -53,11 +53,11 @@ def _get_combined_index(indexes, intersect=False):
for other in indexes[1:]:
index = index.intersection(other)
return index
union = _union_indexes(indexes)
union = _union_indexes(indexes, sort=sort)
return _ensure_index(union)


def _union_indexes(indexes):
def _union_indexes(indexes, sort=True):
if len(indexes) == 0:
raise AssertionError('Must have at least 1 Index to union')
if len(indexes) == 1:
Expand All @@ -74,7 +74,8 @@ def conv(i):
i = i.tolist()
return i

return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
return Index(
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))

if kind == 'special':
result = indexes[0]
Expand Down
13 changes: 9 additions & 4 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
keys=None, levels=None, names=None, verify_integrity=False,
copy=True):
sort=False, copy=True):
"""
Concatenate pandas objects along a particular axis with optional set logic
along the other axes.
Expand Down Expand Up @@ -60,6 +60,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
verify_integrity : boolean, default False
Check whether the new concatenated axis contains duplicates. This can
be very expensive relative to the actual data concatenation
sort : boolean, default False
Sort the combined non-concatenation axis if the passed objects do not
all share the same labels on that axis
copy : boolean, default True
If False, do not copy data unnecessarily
Expand Down Expand Up @@ -209,7 +211,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
ignore_index=ignore_index, join=join,
keys=keys, levels=levels, names=names,
verify_integrity=verify_integrity,
copy=copy)
copy=copy, sort=sort)
return op.get_result()


Expand All @@ -220,7 +222,8 @@ class _Concatenator(object):

def __init__(self, objs, axis=0, join='outer', join_axes=None,
keys=None, levels=None, names=None,
ignore_index=False, verify_integrity=False, copy=True):
ignore_index=False, verify_integrity=False, copy=True,
sort=False):
if isinstance(objs, (NDFrame, compat.string_types)):
raise TypeError('first argument must be an iterable of pandas '
'objects, you passed an object of type '
Expand Down Expand Up @@ -355,6 +358,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
self.keys = keys
self.names = names or getattr(keys, 'names', None)
self.levels = levels
self.sort = sort

self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
Expand Down Expand Up @@ -447,7 +451,8 @@ def _get_comb_axis(self, i):
data_axis = self.objs[0]._get_block_manager_axis(i)
try:
return _get_objs_combined_axis(self.objs, axis=data_axis,
intersect=self.intersect)
intersect=self.intersect,
sort=self.sort)
except IndexError:
types = [type(x).__name__ for x in self.objs]
raise TypeError("Cannot concatenate list of {types}"
Expand Down
33 changes: 27 additions & 6 deletions pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,8 +852,9 @@ def test_append_dtype_coerce(self):
dt.datetime(2013, 1, 2, 0, 0),
dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 4, 0, 0)],
name='start_time')], axis=1)
result = df1.append(df2, ignore_index=True)
name='start_time')],
axis=1, sort=True)
result = df1.append(df2, ignore_index=True, sort=True)
assert_frame_equal(result, expected)

def test_append_missing_column_proper_upcast(self):
Expand Down Expand Up @@ -1011,7 +1012,8 @@ def test_concat_ignore_index(self):
frame1.index = Index(["x", "y", "z"])
frame2.index = Index(["x", "y", "q"])

v1 = concat([frame1, frame2], axis=1, ignore_index=True)
v1 = concat([frame1, frame2], axis=1,
ignore_index=True, sort=True)

nan = np.nan
expected = DataFrame([[nan, nan, nan, 4.3],
Expand Down Expand Up @@ -1463,7 +1465,7 @@ def test_concat_series_axis1(self):
# must reindex, #2603
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
result = concat([s, s2], axis=1)
result = concat([s, s2], axis=1, sort=True)
expected = DataFrame({'A': s, 'B': s2})
assert_frame_equal(result, expected)

Expand Down Expand Up @@ -2070,8 +2072,6 @@ def test_concat_order(self):
for i in range(100)]
result = pd.concat(dfs).columns
expected = dfs[0].columns
if PY2:
expected = expected.sort_values()
tm.assert_index_equal(result, expected)

def test_concat_datetime_timezone(self):
Expand Down Expand Up @@ -2155,3 +2155,24 @@ def test_concat_empty_and_non_empty_series_regression():
expected = s1
result = pd.concat([s1, s2])
tm.assert_series_equal(result, expected)


def test_concat_preserve_column_order_differing_columns():
    # GH 4588 regression test
    # for new columns in concat
    #
    # The two frames share column 'C' but each has one column the other
    # lacks, and neither column list is in alphabetical order.
    dfa = pd.DataFrame(columns=['C', 'A'], data=[[1, 2]])
    dfb = pd.DataFrame(columns=['C', 'Z'], data=[[5, 6]])

    # Default: columns keep order of first appearance, no sorting.
    result = pd.concat([dfa, dfb])
    assert result.columns.tolist() == ['C', 'A', 'Z']

    # Opt-in: sort=True gives the previous, sorted column order.
    result_sorted = pd.concat([dfa, dfb], sort=True)
    assert result_sorted.columns.tolist() == ['A', 'C', 'Z']


def test_concat_preserve_column_order_uneven_data():
    # GH 4588 regression test
    # add to column, concat with uneven data
    #
    # Columns are inserted one at a time in non-alphabetical order; the
    # second frame carries only one of those columns and fewer rows.
    df = pd.DataFrame()
    df['b'] = [1, 2, 3]
    df['c'] = [1, 2, 3]
    df['a'] = [1, 2, 3]
    df2 = pd.DataFrame({'a': [4, 5]})

    result = pd.concat([df, df2])

    # Insertion order of the first frame must be kept in the result.
    assert result.columns.tolist() == ['b', 'c', 'a']

0 comments on commit bcf835a

Please sign in to comment.