Skip to content

Commit

Permalink
Implement new dupcol keyword that indicates what to do with duplicate…
Browse files Browse the repository at this point in the history
… columns in joins and DataFrame constructors
  • Loading branch information
leei committed Jul 31, 2023
1 parent e341cc7 commit 33c947e
Show file tree
Hide file tree
Showing 12 changed files with 502 additions and 214 deletions.
125 changes: 74 additions & 51 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df),

"""
rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol};
makeunique::Bool=false)
makeunique::Bool=false, dupcol::Symbol=:error)
rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
makeunique::Bool=false)
makeunique::Bool=false, dupcol::Symbol=:error)
rename!(df::AbstractDataFrame, (from => to)::Pair...)
rename!(df::AbstractDataFrame, d::AbstractDict)
rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair})
Expand Down Expand Up @@ -179,9 +179,9 @@ julia> rename!(df, [:a, :b, :c])
1 │ 1 2 3
julia> rename!(df, [:a, :b, :a])
ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically.
ERROR: ArgumentError: Duplicate variable names: :a. Pass dupcol=:makeunique to make them unique using a suffix automatically.
julia> rename!(df, [:a, :b, :a], makeunique=true)
julia> rename!(df, [:a, :b, :a], dupcol=:makeunique)
1×3 DataFrame
Row │ a b a_1
│ Int64 Int64 Int64
Expand All @@ -197,16 +197,16 @@ julia> rename!(uppercase, df)
```
"""
function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol};
makeunique::Bool=false)
rename!(index(df), vals, makeunique=makeunique)
makeunique::Bool=false, dupcol::Symbol=:error)
rename!(index(df), vals, makeunique=makeunique, dupcol=dupcol)
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
_drop_all_nonnote_metadata!(parent(df))
return df
end

function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
makeunique::Bool=false)
rename!(index(df), Symbol.(vals), makeunique=makeunique)
makeunique::Bool=false, dupcol::Symbol=:error)
rename!(index(df), Symbol.(vals), makeunique=makeunique, dupcol=dupcol)
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
_drop_all_nonnote_metadata!(parent(df))
return df
Expand Down Expand Up @@ -353,9 +353,9 @@ julia> rename(uppercase, df)
```
"""
rename(df::AbstractDataFrame, vals::AbstractVector{Symbol};
makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol)
rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
makeunique::Bool=false, dupcol::Symbol=:error) = rename!(copy(df), vals, makeunique=makeunique, dupcol=dupcol)
# Non-mutating variant: forward all positional arguments to `rename!`,
# operating on a copy of `df` so the passed data frame is not renamed in place.
function rename(df::AbstractDataFrame, args...)
    df_copy = copy(df)
    return rename!(df_copy, args...)
end
# Non-mutating variant taking a function: forward `f` to `rename!`,
# operating on a copy of `df` so the passed data frame is not renamed in place.
function rename(f::Function, df::AbstractDataFrame)
    df_copy = copy(df)
    return rename!(f, df_copy)
end

Expand Down Expand Up @@ -1536,13 +1536,20 @@ end

"""
hcat(df::AbstractDataFrame...;
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
Horizontally concatenate data frames.
If `makeunique=false` (the default) column names of passed objects must be unique.
If `makeunique=true` then duplicate column names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).
The `makeunique` keyword argument is deprecated in favor of `dupcol`.
If `dupcol=:error` (the default) then column names of passed objects must be unique.
If `dupcol=:makeunique` then duplicate column names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).
If `dupcol=:update` then columns with duplicate names will be combined: the left-hand
column is overwritten by the non-missing values from the right-hand column(s).
If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will
contain copied columns from the source data frames.
Expand Down Expand Up @@ -1575,7 +1582,7 @@ julia> df2 = DataFrame(A=4:6, B=4:6)
2 │ 5 5
3 │ 6 6
julia> df3 = hcat(df1, df2, makeunique=true)
julia> df3 = hcat(df1, df2, dupcol=:makeunique)
3×4 DataFrame
Row │ A B A_1 B_1
│ Int64 Int64 Int64 Int64
Expand All @@ -1587,32 +1594,32 @@ julia> df3 = hcat(df1, df2, makeunique=true)
julia> df3.A === df1.A
false
julia> df3 = hcat(df1, df2, makeunique=true, copycols=false);
julia> df3 = hcat(df1, df2, dupcol=:makeunique, copycols=false);
julia> df3.A === df1.A
true
```
"""
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true)
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
df = DataFrame(df, copycols=copycols)
_drop_all_nonnote_metadata!(df)
return df
end

# TODO: after deprecation remove AbstractVector methods
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) =
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols)
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) =
hcat!(x, df, makeunique=makeunique, copycols=copycols)
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
hcat!(x, df, makeunique=makeunique, dupcol=dupcol, copycols=copycols)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame;
makeunique::Bool=false, copycols::Bool=true) =
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
hcat!(DataFrame(df1, copycols=copycols), df2,
makeunique=makeunique, copycols=copycols)
makeunique=makeunique, dupcol=dupcol, copycols=copycols)
Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame},
y::Union{AbstractVector, AbstractDataFrame}...;
makeunique::Bool=false, copycols::Bool=true) =
hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y...,
makeunique=makeunique, copycols=copycols)
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
hcat!(hcat(df, x, makeunique=makeunique, dupcol=dupcol, copycols=copycols), y...,
makeunique=makeunique, dupcol=dupcol, copycols=copycols)

"""
vcat(dfs::AbstractDataFrame...;
Expand Down Expand Up @@ -2870,6 +2877,10 @@ const INSERTCOLS_ARGUMENTS =
- `makeunique` : defines what to do if `name` already exists in `df`;
if it is `false` an error will be thrown; if it is `true` a new unique name will
be generated by adding a suffix
- `dupcol` : defines what to do if `name` already exists in `df`;
if it is `:error` an error will be thrown; if it is `:makeunique` a new unique name will
be generated by adding a suffix; if it is `:update` then the existing column will be
updated with the non-missing values of the new column
- `copycols` : whether vectors passed as columns should be copied
If `val` is an `AbstractRange` then the result of `collect(val)` is inserted.
Expand All @@ -2891,7 +2902,7 @@ const INSERTCOLS_ARGUMENTS =

"""
insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref)
function and return the newly created data frame.
Expand Down Expand Up @@ -2922,7 +2933,7 @@ julia> insertcols(df, 1, :b => 'a':'c')
2 │ b 2
3 │ c 3
julia> insertcols(df, :c => 2:4, :c => 3:5, makeunique=true)
julia> insertcols(df, :c => 2:4, :c => 3:5, dupcol=:makeunique)
3×3 DataFrame
Row │ a c c_1
│ Int64 Int64 Int64
Expand All @@ -2942,13 +2953,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true)
```
"""
insertcols(df::AbstractDataFrame, args...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
insertcols!(copy(df), args...;
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)

"""
insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
Insert a column into a data frame in place. Return the updated data frame.
Expand Down Expand Up @@ -2979,7 +2990,7 @@ julia> insertcols!(df, 1, :b => 'a':'c')
2 │ b 2
3 │ c 3
julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true)
julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, dupcol=:makeunique)
3×4 DataFrame
Row │ b c c_1 a
│ Char Int64 Int64 Int64
Expand All @@ -2999,7 +3010,10 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
```
"""
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)

dupcol = _dupcol(dupcol, makeunique)

if !is_column_insertion_allowed(df)
throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
"SubDataFrame created with `:` as column selector"))
Expand All @@ -3025,15 +3039,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
"$(ncol(df)) columns at index $col_ind"))
end

if !makeunique
if dupcol == :error
if !allunique(first.(name_cols))
throw(ArgumentError("Names of columns to be inserted into a data frame " *
"must be unique when `makeunique=true`"))
"must be unique when `dupcol=:error`"))
end
for (n, _) in name_cols
if hasproperty(df, n)
throw(ArgumentError("Column $n is already present in the data frame " *
"which is not allowed when `makeunique=true`"))
"which is not allowed when `dupcol=:error`"))
end
end
end
Expand Down Expand Up @@ -3103,19 +3117,28 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
dfp[!, name] = item_new
else
if hasproperty(dfp, name)
@assert makeunique
k = 1
while true
nn = Symbol("$(name)_$k")
if !hasproperty(dfp, nn)
name = nn
break
if dupcol == :makeunique
k = 1
while true
nn = Symbol("$(name)_$k")
if !hasproperty(dfp, nn)
name = nn
break
end
k += 1
end
k += 1
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
else
@assert dupcol == :update
# Just update without adding to index
dfp[!, name] = _update_missing.(dfp[!, name], item_new)
col_ind -= 1
end
else
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
end
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
end
col_ind += 1
end
Expand All @@ -3134,22 +3157,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
end

insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
insertcols!(df, ncol(df)+1, name_cols..., after=after,
makeunique=makeunique, copycols=copycols)
makeunique=makeunique, dupcol=dupcol, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true) =
insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, dupcol=dupcol, copycols=copycols)

function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
if col isa SymbolOrString
col_ind = Int(columnindex(df, col))
if col_ind == 0
Expand All @@ -3173,7 +3196,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
end

function insertcols!(df::AbstractDataFrame; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, dupcol::Symbol=:error, copycols::Bool=true)
_drop_all_nonnote_metadata!(parent(df))
return df
end
Expand Down
16 changes: 8 additions & 8 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,7 @@ julia> permutedims(df2, 1, "different_name")
"""
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
dest_namescol::Union{Symbol, AbstractString};
makeunique::Bool=false, strict::Bool=true)
makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true)

if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
Expand Down Expand Up @@ -854,26 +854,26 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,

if ncol(df_notsrc) == 0
df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names,
makeunique=makeunique, copycols=false)
makeunique=makeunique, dupcol=dupcol, copycols=false)
else
m = permutedims(Matrix(df_notsrc))
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique)
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, dupcol=dupcol)
end
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false)
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, dupcol=dupcol, copycols=false)
_copy_table_note_metadata!(out_df, df)
return out_df
end

function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
makeunique::Bool=false, strict::Bool=true)
makeunique::Bool=false, dupcol::Symbol=:error, strict::Bool=true)
if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
dest_namescol = _names(df)[src_namescol]
else
dest_namescol = src_namescol
end
return permutedims(df, src_namescol, dest_namescol;
makeunique=makeunique, strict=strict)
makeunique=makeunique, dupcol=dupcol, strict=strict)
end

function Base.permutedims(df::AbstractDataFrame)
Expand All @@ -883,8 +883,8 @@ function Base.permutedims(df::AbstractDataFrame)
end

function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector;
makeunique::Bool=false)
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique)
makeunique::Bool=false, dupcol::Symbol=:error)
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, dupcol=dupcol)
_copy_table_note_metadata!(out_df, df)
return out_df
end
Loading

0 comments on commit 33c947e

Please sign in to comment.