Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow rename when selecting #1975

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,21 +150,21 @@ rename(f::Function, df::AbstractDataFrame)

* `::AbstractDataFrame` : the updated result

New names are processed sequentially. A new name must not already exist in the `DataFrame`
at the moment an attempt to rename a column is performed.
Each name is changed at most once. Permutation of names is allowed.

**Examples**

```julia
df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
rename(df, :i => :A, :x => :X)
rename(df, :x => :y, :y => :x)
rename(df, [:i => :A, :x => :X])
rename(df, Dict(:i => :A, :x => :X))
rename(x -> Symbol(uppercase(string(x))), df)
rename(df) do x
Symbol(uppercase(string(x)))
end
rename!(df, Dict(:i =>: A, :x => :X))
rename!(df, Dict(:i => :A, :x => :X))
```

"""
Expand Down
59 changes: 58 additions & 1 deletion src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ end

# df[MultiRowIndex, MultiColumnIndex] => DataFrame
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T},
col_inds::Union{AbstractVector, Regex, Not, Between, All}) where T
col_inds::Union{Regex, Not, Between, All}) where T
@boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
"rows at index $row_inds"))
Expand All @@ -382,6 +382,28 @@ end
return DataFrame(new_columns, Index(_names(df)[selected_columns]), copycols=false)
end


# df[MultiRowIndex, MultiColumnIndex] => DataFrame
@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T},
                               col_inds::AbstractVector) where T
    # `col_inds` may contain `:old => :new` pairs. When at least one such pair is
    # present, every `Symbol` entry is used as is and every other entry contributes
    # its first element as the column to select; the pairs are then applied as
    # renames to the resulting data frame.
    @boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
        throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
                          "rows at index $row_inds"))
    end
    isrename(c) = c isa Pair{Symbol, Symbol}
    if any(isrename, col_inds)
        sources = [c isa Symbol ? c : c[1] for c in col_inds]
        renames = filter(isrename, col_inds)
    else
        sources = col_inds
        renames = Pair{Symbol, Symbol}[]
    end
    selected_columns = index(df)[sources]
    # Computing integer indices once for all columns is faster
    selected_rows = T === Bool ? findall(row_inds) : row_inds
    picked = _columns(df)[selected_columns]
    new_columns = AbstractVector[col[selected_rows] for col in picked]
    result = DataFrame(new_columns, Index(_names(df)[selected_columns]), copycols=false)
    return rename!(result, renames...)
end

@inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, ::Colon) where T
@boundscheck if !checkindex(Bool, axes(df, 1), row_inds)
throw(BoundsError("attempt to access a data frame with $(nrow(df)) " *
Expand Down Expand Up @@ -831,6 +853,16 @@ function select!(df::DataFrame, inds::AbstractVector{Int})
end

# Single integer selector: wrap it in a one-element vector and delegate.
function select!(df::DataFrame, c::Int)
    return select!(df, [c])
end

function select!(df::DataFrame, c::AbstractVector{T}) where T
    # Without any `:old => :new` pair this is a plain selection: resolve `c`
    # through the index of `df` and delegate.
    isrename(s) = s isa Pair{Symbol, Symbol}
    any(isrename, c) || return select!(df, index(df)[c])
    # Otherwise select the source columns first, then apply the renames.
    sources = [s isa Symbol ? s : s[1] for s in c]
    renames = filter(isrename, c)
    return rename!(select!(df, index(df)[sources]), renames...)
end

# Fallback for any other single selector: resolve it through the index of `df`
# and delegate.
function select!(df::DataFrame, c::Any)
    return select!(df, index(df)[c])
end

# Several selectors are combined with `All` and handled as a single selector.
function select!(df::DataFrame, c, cs...)
    return select!(df, All(c, cs...))
end

Expand Down Expand Up @@ -859,6 +891,9 @@ If `copycols=false`, then returned `DataFrame` shares column vectors with `df`.
If `df` is a `SubDataFrame` then a `SubDataFrame` is returned if `copycols=false`
and a `DataFrame` with freshly allocated columns otherwise.

If `df` is a `DataFrame`, columns can also be renamed while they are selected by
passing `:old => :new` pairs in a vector (renaming is not yet supported for
`SubDataFrame`).

### Examples

```jldoctest
Expand All @@ -879,6 +914,15 @@ julia> select(d, :b)
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │

julia> select(d, [:b=>:x])
3×1 DataFrame
│ Row │ x │
│ │ Int64 │
├─────┼───────┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │
```

"""
Expand All @@ -888,8 +932,21 @@ select(df::DataFrame, inds::AbstractVector{Int}; copycols::Bool=true) =

# Single integer selector: wrap it in a one-element vector and delegate.
function select(df::DataFrame, c::Int; copycols::Bool=true)
    return select(df, [c], copycols=copycols)
end

# Vector of symbols or integers: resolve it through the index of `df` and delegate.
function select(df::DataFrame, c::AbstractVector{T};
                copycols::Bool=true) where T<:Union{Symbol,Integer}
    return select(df, index(df)[c], copycols=copycols)
end

function select(df::DataFrame, c::AbstractVector{T}; copycols::Bool=true) where T
    # Without any `:old => :new` pair this is a plain selection: resolve `c`
    # through the index of `df` and delegate.
    isrename(s) = s isa Pair{Symbol, Symbol}
    any(isrename, c) || return select(df, index(df)[c], copycols=copycols)
    # Otherwise select the source columns first, then rename the new data frame.
    sources = [s isa Symbol ? s : s[1] for s in c]
    renames = filter(isrename, c)
    selected = select(df, index(df)[sources], copycols=copycols)
    return rename!(selected, renames...)
end

# Fallback for any other single selector: resolve it through the index of `df`
# and delegate.
function select(df::DataFrame, c::Any; copycols::Bool=true)
    return select(df, index(df)[c], copycols=copycols)
end

# Several selectors are combined with `All` and handled as a single selector.
function select(df::DataFrame, c, cs...; copycols::Bool=true)
    return select(df, All(c, cs...), copycols=copycols)
end

Expand Down
31 changes: 29 additions & 2 deletions src/other/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,41 @@ function names!(x::Index, nms::Vector{Symbol}; makeunique::Bool=false)
end

# Rename entries of `x` in place following `nms`, an iterable of `from => to`
# pairs, and return `x`.
#
# Each name may be used at most once as a source and at most once as a target.
# A target may be a name that currently exists as long as that name is itself
# renamed by another pair, so permutations such as `[:a => :b, :b => :a]` work.
#
# An `ArgumentError` is thrown, after `x` has been rolled back to its state on
# entry, when a source is repeated, a target is repeated, a source does not
# exist in `x`, or a target still collides with an existing name after all
# pairs have been processed.
function rename!(x::Index, nms)
    xbackup = copy(x)
    # Roll `x` back to its state on entry so that a failed call has no visible effect.
    function restore!()
        merge!(empty!(x.lookup), xbackup.lookup)
        x.names .= xbackup.names
        return nothing
    end
    processedfrom = Set{Symbol}()
    processedto = Set{Symbol}()
    # Targets that collided with a name still present in `x`, mapped to the column
    # that held that name. An entry is released once that name is renamed away.
    toholder = Dict{Symbol,Int}()
    for (from, to) in nms
        if from ∈ processedfrom
            restore!()
            throw(ArgumentError("Tried renaming $from multiple times."))
        end
        if to ∈ processedto
            restore!()
            throw(ArgumentError("Tried renaming to $to multiple times."))
        end
        push!(processedfrom, from)
        push!(processedto, to)
        from == to && continue # No change, nothing to do
        if !haskey(xbackup, from)
            restore!()
            throw(ArgumentError("Tried renaming $from to $to, when $from does not exist in the Index."))
        end
        if haskey(x, to)
            # Remember the column currently named `to`; its lookup entry is about
            # to be overwritten.
            toholder[to] = x.lookup[to]
        end
        # If `from` was overwritten by an earlier pair, its column is in `toholder`;
        # otherwise it is still in the lookup table.
        col = haskey(toholder, from) ? pop!(toholder, from) : pop!(x.lookup, from)
        x.lookup[to] = col
        x.names[col] = to
    end
    # Anything left here is a target whose previous owner was never renamed away.
    if !isempty(toholder)
        restore!()
        throw(ArgumentError("Tried renaming to $(first(keys(toholder))), when it already exists in the Index."))
    end
    return x
end

Expand Down
98 changes: 98 additions & 0 deletions test/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,36 @@ end
@test d.b === df.b
end

@testset "select! rename" begin
    df = DataFrame(a=1, b=2, c=3, d=4, e=5)
    # Keep references to the original column vectors so that identity can be
    # checked after each in-place selection.
    dfa, dfb, dfc, dfd, dfe = df.a, df.b, df.c, df.d, df.e

    # A single column selected and renamed
    d = copy(df, copycols=false)
    select!(d, [:a=>:b])
    @test names(d) == [:b]
    @test d.b === dfa

    # Two names swapped, plus one column kept under its own name
    d = copy(df, copycols=false)
    select!(d, [:b=>:a, :a=>:b, :e])
    @test names(d) == [:a, :b, :e]
    @test d.b === dfa
    @test d.a === dfb
    @test d.e === dfe

    # Every column renamed to a fresh name
    d = copy(df, copycols=false)
    select!(d, [:a=>:aa, :b=>:bb, :c=>:cc, :d=>:dd, :e=>:ee])
    @test names(d) == [:aa, :bb, :cc, :dd, :ee]
    @test d.aa === dfa
    @test d.bb === dfb
    @test d.cc === dfc
    @test d.dd === dfd
    @test d.ee === dfe
end

@testset "select" begin
df = DataFrame(a=1, b=2, c=3, d=4, e=5)
@test_throws BoundsError select(df, 0)
Expand Down Expand Up @@ -819,6 +849,36 @@ end
@test d.b === df.b
end

@testset "select rename" begin
    df = DataFrame(a=1, b=2, c=3, d=4, e=5)

    # Default `copycols`: the renamed column is equal to, but not the same
    # object as, the source column.
    d = select(df, [:a=>:b])
    @test names(d) == [:b]
    @test d.b !== df.a
    @test d.b == df.a

    # Two names swapped, plus one column kept under its own name
    d = select(df, [:b=>:a, :a=>:b, :e])
    @test names(d) == [:a, :b, :e]
    @test d.b !== df.a
    @test d.a !== df.b
    @test d.e !== df.e
    @test d.b == df.a
    @test d.a == df.b
    @test d.e == df.e

    # `copycols=false`: the result shares the source column vectors
    d = select(df, [:e, :b=>:a, :c], copycols=false)
    @test names(d) == [:e, :a, :c]
    @test d.e === df.e
    @test d.a === df.b
    @test d.c === df.c

    # A target name (`:a`) that is itself renamed by a later pair
    d = select(df, [:e=>:a, :b, :a=>:c], copycols=false)
    @test names(d) == [:a, :b, :c]
    @test d.a === df.e
    @test d.b === df.b
    @test d.c === df.a
end

@testset "deleterows!" begin
df = DataFrame(a=[1, 2], b=[3.0, 4.0])
@test deleterows!(df, 1) === df
Expand Down Expand Up @@ -1127,6 +1187,33 @@ end
@test names(df) == [:A_4, :B_4]
@test rename!(x->Symbol(lowercase(string(x))), df) === df
@test names(df) == [:a_4, :b_4]

df = DataFrame(A = 1:3, B = 'A':'C', C = [:x, :y, :z])
@test rename!(df, :A => :B, :B => :A) === df
@test names(df) == [:B, :A, :C]
@test rename!(df, :A => :B, :B => :A, :C => :D) === df
@test names(df) == [:A, :B, :D]
@test rename!(df, :A => :B, :B => :C, :D => :A) === df
@test names(df) == [:B, :C, :A]
@test rename!(df, :A => :C, :B => :A, :C => :B) === df
@test names(df) == [:A, :B, :C]
@test rename!(df, :A => :A, :B => :B, :C => :C) === df
@test names(df) == [:A, :B, :C]

@test_throws ArgumentError rename!(df, :X => :Y)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :X, :X => :Y)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :B)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :X, :A => :X)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :X, :B => :X)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :B, :B => :A, :C => :B)
@test names(df) == [:A, :B, :C]
@test_throws ArgumentError rename!(df, :A => :B, :B => :A, :A => :X)
@test names(df) == [:A, :B, :C]
end

@testset "size" begin
Expand Down Expand Up @@ -1527,6 +1614,17 @@ end
@test df[:, [:y,:x]][!, :x] !== x
end

@testset "test getindex with rename" begin
    x, y = [1,3], [2,4]
    df = DataFrame(x=x, y=y, copycols=false)
    # Build a fresh selector for every indexing call
    sel() = [:x=>:t]
    # `!` returns the stored vector itself; `:` returns an equal but distinct vector
    @test df[!, sel()].t === x
    @test df[:, sel()].t == x
    @test df[:, sel()].t !== x
    # Row subsets combined with a renaming column selector
    @test df[1:1, sel()].t == x[1:1]
    @test df[Not(2), sel()].t == x[Not(2)]
end

@testset "test corner case of getindex" begin
df = DataFrame(x=[1], y=[1])
@test_throws ArgumentError df[true, 1:2]
Expand Down