Skip to content

Commit

Permalink
add mapcols! and repeat!, fix corner cases of repeat
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Apr 28, 2020
1 parent 34b8d6d commit 3dd7ff0
Show file tree
Hide file tree
Showing 7 changed files with 305 additions and 71 deletions.
2 changes: 2 additions & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ hcat
insertcols!
length
mapcols
mapcols!
names
ncol
ndims
Expand All @@ -67,6 +68,7 @@ push!
rename
rename!
repeat
repeat!
select
select!
show
Expand Down
2 changes: 2 additions & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,15 @@ export AbstractDataFrame,
insertcols!,
leftjoin,
mapcols,
mapcols!,
ncol,
nonunique,
nrow,
order,
outerjoin,
rename!,
rename,
repeat!,
rightjoin,
select!,
select,
Expand Down
13 changes: 9 additions & 4 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1494,8 +1494,11 @@ julia> repeat(df, inner = 2, outer = 3)
│ 12 │ 2 │ 4 │
```
"""
function Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1)
    # Apply the same `inner`/`outer` repetition to every column; `mapcols`
    # gathers the repeated columns into the returned data frame.
    repeatcol(col) = repeat(col, inner = inner, outer = outer)
    return mapcols(repeatcol, df)
end
function Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1)
    # Validate both keywords before touching any column so that the error
    # names the offending keyword argument.
    if inner < 0
        throw(ArgumentError("inner keyword argument must be non-negative"))
    end
    if outer < 0
        throw(ArgumentError("outer keyword argument must be non-negative"))
    end
    # Every column is repeated with identical settings.
    return mapcols(df) do col
        repeat(col, inner = inner, outer = outer)
    end
end

"""
repeat(df::AbstractDataFrame, count::Integer)
Expand Down Expand Up @@ -1524,8 +1527,10 @@ julia> repeat(df, 2)
│ 4 │ 2 │ 4 │
```
"""
function Base.repeat(df::AbstractDataFrame, count::Integer)
    # Repeat each column `count` times; `mapcols` builds the resulting data frame.
    repeatcol(col) = repeat(col, count)
    return mapcols(repeatcol, df)
end
function Base.repeat(df::AbstractDataFrame, count::Integer)
    # A negative repetition count is rejected before any column is processed.
    if count < 0
        throw(ArgumentError("count must be non-negative"))
    end
    return mapcols(df) do col
        repeat(col, count)
    end
end

##############################################################################
##
Expand Down
178 changes: 122 additions & 56 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -195,62 +195,6 @@ where `name` is the column name of the column `col`.
"""
function Base.pairs(itr::DataFrameColumns)
    # Pair each key of the column iterator with the corresponding column.
    return Base.Iterators.Pairs(itr, keys(itr))
end

"""
mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
Return a `DataFrame` where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars.
Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
`DataFrame`. If `f` returns its argument then it gets copied before being stored.
# Examples
```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> mapcols(x -> x.^2, df)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 121 │
│ 2 │ 4 │ 144 │
│ 3 │ 9 │ 169 │
│ 4 │ 16 │ 196 │
```
"""
function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
    # note: `f` must return a consistent length
    newcols = AbstractVector[]
    # `nothing` until the first column is processed; afterwards `true` when the
    # results are vectors and `false` when they are scalars
    vectorresults = nothing
    for col in eachcol(df)
        res = f(col)
        isvec = res isa AbstractVector
        if vectorresults === nothing
            vectorresults = isvec
        elseif vectorresults != isvec
            throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
        end
        if isvec
            # copy when `f` returned its argument so no column of `df` is reused
            push!(newcols, res === col ? copy(res) : res)
        else
            # a scalar becomes a one-element column
            push!(newcols, [res])
        end
    end
    return DataFrame(newcols, _names(df), copycols=false)
end

function Base.parent(itr::Union{DataFrameRows, DataFrameColumns})
    # Read the `df` field of the iterator directly with `getfield`.
    return getfield(itr, :df)
end
# Both methods forward to `names` of the parent of the iterator.
function Base.names(itr::Union{DataFrameRows, DataFrameColumns})
    return names(parent(itr))
end

function Base.names(itr::Union{DataFrameRows, DataFrameColumns}, cols)
    return names(parent(itr), cols)
end
Expand Down Expand Up @@ -320,3 +264,125 @@ Base.show(dfcs::DataFrameColumns;
eltypes::Bool = true) =
show(stdout, dfcs, allrows=allrows, allcols=allcols, splitcols=splitcols,
rowlabel=rowlabel, summary=summary, eltypes=eltypes)

"""
mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
Return a `DataFrame` where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).
Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
`DataFrame`. If `f` returns its argument then it gets copied before being stored.
# Examples
```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> mapcols(x -> x.^2, df)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 121 │
│ 2 │ 4 │ 144 │
│ 3 │ 9 │ 169 │
│ 4 │ 16 │ 196 │
```
"""
function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
    # note: `f` must return a consistent length
    results = AbstractVector[]
    sawscalar = false
    sawvector = false
    for col in eachcol(df)
        out = f(col)
        isvector = out isa AbstractVector
        # a vector after a scalar, or a scalar after a vector, is rejected
        if (isvector && sawscalar) || (!isvector && sawvector)
            throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
        end
        if isvector
            sawvector = true
            # never reuse a column of `df`: copy when `f` handed back its argument
            push!(results, out === col ? copy(out) : out)
        else
            sawscalar = true
            # wrap the scalar into a one-element column
            push!(results, [out])
        end
    end
    return DataFrame(results, _names(df), copycols=false)
end

"""
mapcols!(f::Union{Function,Type}, df::DataFrame)
Update a `DataFrame` in-place where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars
(all values other than `AbstractVector` are considered to be a scalar).
Note that `mapcols!` reuses the columns from `df` if they are returned by `f`.
# Examples
```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> mapcols!(x -> x.^2, df);
julia> df
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 121 │
│ 2 │ 4 │ 144 │
│ 3 │ 9 │ 169 │
│ 4 │ 16 │ 196 │
```
"""
function mapcols!(f::Union{Function,Type}, df::DataFrame)
    # note: `f` must return a consistent length
    # All results are collected first and stored only after every check has
    # passed, so `df` is left unchanged when `f` throws or returns
    # inconsistent results.
    vs = AbstractVector[]
    seenscalar = false
    seenvector = false
    for v in eachcol(df)
        fv = f(v)
        if fv isa AbstractVector
            if seenscalar
                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
            end
            seenvector = true
            # unlike `mapcols`, the result is stored as-is, even if it is `v` itself
            push!(vs, fv)
        else
            if seenvector
                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
            end
            seenscalar = true
            # a scalar becomes a one-element column
            push!(vs, [fv])
        end
    end

    # A data frame without columns produces no results. `extrema` throws an
    # `ArgumentError` on an empty collection, so there is nothing to validate
    # or store: return `df` unchanged.
    isempty(vs) && return df

    len_min, len_max = extrema(length(v) for v in vs)
    if len_min != len_max
        throw(DimensionMismatch("lengths of returned vectors must be identical"))
    end
    _columns(df) .= vs

    return df
end
76 changes: 76 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1592,3 +1592,79 @@ function Base.push!(df::DataFrame, row::Any; promote::Bool=false)
end
df
end

"""
repeat!(df::DataFrame; inner::Integer = 1, outer::Integer = 1)
Update a data frame `df` in-place by repeating its rows. `inner` specifies how many
times each row is repeated, and `outer` specifies how many times the full set
of rows is repeated. Columns of `df` are freshly allocated.
# Example
```jldoctest
julia> df = DataFrame(a = 1:2, b = 3:4)
2×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 4 │
julia> repeat!(df, inner = 2, outer = 3);
julia> df
12×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 3 │
│ 2 │ 1 │ 3 │
│ 3 │ 2 │ 4 │
│ 4 │ 2 │ 4 │
│ 5 │ 1 │ 3 │
│ 6 │ 1 │ 3 │
│ 7 │ 2 │ 4 │
│ 8 │ 2 │ 4 │
│ 9 │ 1 │ 3 │
│ 10 │ 1 │ 3 │
│ 11 │ 2 │ 4 │
│ 12 │ 2 │ 4 │
```
"""
function repeat!(df::DataFrame; inner::Integer = 1, outer::Integer = 1)
    # Validate both keywords before any column is replaced.
    if inner < 0
        throw(ArgumentError("inner keyword argument must be non-negative"))
    end
    if outer < 0
        throw(ArgumentError("outer keyword argument must be non-negative"))
    end
    # `mapcols!` stores the repeated columns in `df` and returns `df`.
    return mapcols!(df) do col
        repeat(col, inner = inner, outer = outer)
    end
end

"""
repeat!(df::DataFrame, count::Integer)
Update a data frame `df` in-place by repeating its rows the number of times
specified by `count`. Columns of `df` are freshly allocated.
# Example
```jldoctest
julia> df = DataFrame(a = 1:2, b = 3:4)
2×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 4 │
julia> repeat!(df, 2)
4×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 4 │
│ 3 │ 1 │ 3 │
│ 4 │ 2 │ 4 │
```
"""
function repeat!(df::DataFrame, count::Integer)
    # A negative repetition count is rejected before any column is replaced.
    if count < 0
        throw(ArgumentError("count must be non-negative"))
    end
    # `mapcols!` stores the repeated columns in `df` and returns `df`.
    return mapcols!(df) do col
        repeat(col, count)
    end
end
24 changes: 24 additions & 0 deletions test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ end
@test mapcols(sum, df_mapcols) == DataFrame(a=55, b=155)
@test mapcols(x -> 1, df_mapcols) == DataFrame(a=1, b=1)
@test_throws ArgumentError mapcols(x -> x[1] == 1 ? 0 : [0], df_mapcols)
@test_throws DimensionMismatch mapcols(x -> x[1] == 1 ? [1] : [1,2], df_mapcols)
@test_throws ArgumentError mapcols(x -> x[1] == 1 ? x : 0, df_mapcols)
@test_throws ArgumentError mapcols(x -> x[1] != 1 ? x : 0, df_mapcols)
df_mapcols2 = mapcols(x -> x, df_mapcols)
Expand All @@ -59,6 +60,29 @@ end
@test df_mapcols2.b !== df_mapcols.b
end

@testset "mapcols!" begin
    # scalar results: every column collapses to a single row
    df = DataFrame(a=1:10, b=11:20)
    mapcols!(sum, df)
    @test df == DataFrame(a=55, b=155)

    df = DataFrame(a=1:10, b=11:20)
    mapcols!(_ -> 1, df)
    @test df == DataFrame(a=1, b=1)

    # failing calls must throw the expected error ...
    df = DataFrame(a=1:10, b=11:20)
    failing = [
        (ArgumentError, x -> (x[1] == 1 ? 0 : [0])),
        (DimensionMismatch, x -> (x[1] == 1 ? [1] : [1,2])),
        (ArgumentError, x -> (x[1] == 1 ? x : 0)),
        (ArgumentError, x -> (x[1] != 1 ? x : 0)),
    ]
    for (errtype, fun) in failing
        @test_throws errtype mapcols!(fun, df)
    end
    # ... and leave the data frame unchanged
    @test df == DataFrame(a=1:10, b=11:20)

    # returning the argument keeps the very same column objects
    col_a, col_b = df.a, df.b
    mapcols!(identity, df)
    @test col_a === df.a
    @test col_b === df.b
end

@testset "SubDataFrame" begin
df = DataFrame([11:16 21:26 31:36 41:46])
sdf = view(df, [3,1,4], [3,1,4])
Expand Down
Loading

0 comments on commit 3dd7ff0

Please sign in to comment.