add mapcols! and repeat!, fix corner cases of repeat

JuliaData · Apr 28, 2020 · 3dd7ff0 · 3dd7ff0
1 parent 34b8d6d
commit 3dd7ff0
Show file tree

Hide file tree

Showing 7 changed files with 305 additions and 71 deletions.
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -57,6 +57,7 @@ hcat
 insertcols!
 length
 mapcols
+mapcols!
 names
 ncol
 ndims
@@ -67,6 +68,7 @@ push!
 rename
 rename!
 repeat
+repeat!
 select
 select!
 show

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -45,13 +45,15 @@ export AbstractDataFrame,
        insertcols!,
        leftjoin,
        mapcols,
+       mapcols!,
        ncol,
        nonunique,
        nrow,
        order,
        outerjoin,
        rename!,
        rename,
+       repeat!,
        rightjoin,
        select!,
        select,

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -1494,8 +1494,11 @@ julia> repeat(df, inner = 2, outer = 3)
 │ 12  │ 2     │ 4     │
 ```
 """
-Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) =
-    mapcols(x -> repeat(x, inner = inner, outer = outer), df)
+function Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1)
+    inner >= 0 || throw(ArgumentError("inner keyword argument must be non-negative"))
+    outer >= 0 || throw(ArgumentError("outer keyword argument must be non-negative"))
+    return mapcols(col -> repeat(col, inner = inner, outer = outer), df)
+end
 
 """
     repeat(df::AbstractDataFrame, count::Integer)
@@ -1524,8 +1527,10 @@ julia> repeat(df, 2)
 │ 4   │ 2     │ 4     │
 ```
 """
-Base.repeat(df::AbstractDataFrame, count::Integer) =
-    mapcols(x -> repeat(x, count), df)
+function Base.repeat(df::AbstractDataFrame, count::Integer)
+    count >= 0 || throw(ArgumentError("count must be non-negative"))
+    return mapcols(col -> repeat(col, count), df)
+end
 
 ##############################################################################
 ##

diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl
@@ -195,62 +195,6 @@ where `name` is the column name of the column `col`.
 """
 Base.pairs(itr::DataFrameColumns) = Base.Iterators.Pairs(itr, keys(itr))
 
-"""
-    mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
-
-Return a `DataFrame` where each column of `df` is transformed using function `f`.
-`f` must return `AbstractVector` objects all with the same length or scalars.
-
-Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
-`DataFrame`. If `f` returns its argument then it gets copied before being stored.
-
-# Examples
-```jldoctest
-julia> df = DataFrame(x=1:4, y=11:14)
-4×2 DataFrame
-│ Row │ x     │ y     │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 11    │
-│ 2   │ 2     │ 12    │
-│ 3   │ 3     │ 13    │
-│ 4   │ 4     │ 14    │
-
-julia> mapcols(x -> x.^2, df)
-4×2 DataFrame
-│ Row │ x     │ y     │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 121   │
-│ 2   │ 4     │ 144   │
-│ 3   │ 9     │ 169   │
-│ 4   │ 16    │ 196   │
-```
-"""
-function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
-    # note: `f` must return a consistent length
-    vs = AbstractVector[]
-    seenscalar = false
-    seenvector = false
-    for v in eachcol(df)
-        fv = f(v)
-        if fv isa AbstractVector
-            if seenscalar
-                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
-            end
-            seenvector = true
-            push!(vs, fv === v ? copy(fv) : fv)
-        else
-            if seenvector
-                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
-            end
-            seenscalar = true
-            push!(vs, [fv])
-        end
-    end
-    DataFrame(vs, _names(df), copycols=false)
-end
-
 Base.parent(itr::Union{DataFrameRows, DataFrameColumns}) = getfield(itr, :df)
 Base.names(itr::Union{DataFrameRows, DataFrameColumns}) = names(parent(itr))
 Base.names(itr::Union{DataFrameRows, DataFrameColumns}, cols) = names(parent(itr), cols)
@@ -320,3 +264,125 @@ Base.show(dfcs::DataFrameColumns;
           eltypes::Bool = true) =
     show(stdout, dfcs, allrows=allrows, allcols=allcols, splitcols=splitcols,
          rowlabel=rowlabel, summary=summary, eltypes=eltypes)
+
+"""
+    mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
+
+Return a `DataFrame` where each column of `df` is transformed using function `f`.
+`f` must return either only `AbstractVector` objects, all with the same length, or only
+scalars (any other value); mixing the two throws an `ArgumentError`.
+
+Note that `mapcols` guarantees not to reuse the columns from `df` in the returned
+`DataFrame`. If `f` returns its argument then it gets copied before being stored.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(x=1:4, y=11:14)
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 11    │
+│ 2   │ 2     │ 12    │
+│ 3   │ 3     │ 13    │
+│ 4   │ 4     │ 14    │
+
+julia> mapcols(x -> x.^2, df)
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 121   │
+│ 2   │ 4     │ 144   │
+│ 3   │ 9     │ 169   │
+│ 4   │ 16    │ 196   │
+```
+"""
+function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
+    # note: `f` must return a consistent length; lengths are not compared in this function
+    vs = AbstractVector[]
+    seenscalar = false
+    seenvector = false
+    for v in eachcol(df)
+        fv = f(v)
+        if fv isa AbstractVector
+            if seenscalar
+                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
+            end
+            seenvector = true
+            push!(vs, fv === v ? copy(fv) : fv) # never alias a column of `df`
+        else
+            if seenvector
+                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
+            end
+            seenscalar = true
+            push!(vs, [fv]) # a scalar becomes a one-element column
+        end
+    end
+    return DataFrame(vs, _names(df), copycols=false)
+end
+
+"""
+    mapcols!(f::Union{Function,Type}, df::DataFrame)
+
+Update `df` in-place by replacing each column with the result of calling `f` on it and
+return `df` (a `df` without columns is returned unchanged). `f` must return either only
+scalars (any value that is not an `AbstractVector`) or only `AbstractVector`s of one length.
+
+Vectors returned by `f` are stored without copying, except ranges, which are collected.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(x=1:4, y=11:14)
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 11    │
+│ 2   │ 2     │ 12    │
+│ 3   │ 3     │ 13    │
+│ 4   │ 4     │ 14    │
+
+julia> mapcols!(x -> x.^2, df);
+
+julia> df
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 121   │
+│ 2   │ 4     │ 144   │
+│ 3   │ 9     │ 169   │
+│ 4   │ 16    │ 196   │
+```
+"""
+function mapcols!(f::Union{Function,Type}, df::DataFrame)
+    # note: `f` must return a consistent length
+    vs = AbstractVector[]
+    seenscalar = false
+    seenvector = false
+    for v in eachcol(df)
+        fv = f(v)
+        if fv isa AbstractVector
+            if seenscalar
+                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
+            end
+            seenvector = true
+            push!(vs, fv isa AbstractRange ? collect(fv) : fv)
+        else
+            if seenvector
+                throw(ArgumentError("mixing scalars and vectors in mapcols not allowed"))
+            end
+            seenscalar = true
+            push!(vs, [fv])
+        end
+    end
+
+    # validate before mutating `df`; `any` is false when `df` has no columns
+    if any(v -> length(v) != length(first(vs)), vs)
+        throw(DimensionMismatch("lengths of returned vectors must be identical"))
+    end
+    _columns(df) .= vs
+
+    return df
+end
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -1592,3 +1592,79 @@ function Base.push!(df::DataFrame, row::Any; promote::Bool=false)
     end
     df
 end
+
+"""
+    repeat!(df::DataFrame; inner::Integer = 1, outer::Integer = 1)
+
+Repeat the rows of `df` in-place and return `df`. Each row is repeated `inner` times
+and the resulting full set of rows is then repeated `outer` times; a negative value of
+either throws an `ArgumentError`. Columns of `df` are freshly allocated.
+
+# Example
+```jldoctest
+julia> df = DataFrame(a = 1:2, b = 3:4)
+2×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 3     │
+│ 2   │ 2     │ 4     │
+
+julia> repeat!(df, inner = 2, outer = 3);
+
+julia> df
+12×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 3     │
+│ 2   │ 1     │ 3     │
+│ 3   │ 2     │ 4     │
+│ 4   │ 2     │ 4     │
+│ 5   │ 1     │ 3     │
+│ 6   │ 1     │ 3     │
+│ 7   │ 2     │ 4     │
+│ 8   │ 2     │ 4     │
+│ 9   │ 1     │ 3     │
+│ 10  │ 1     │ 3     │
+│ 11  │ 2     │ 4     │
+│ 12  │ 2     │ 4     │
+```
+"""
+function repeat!(df::DataFrame; inner::Integer = 1, outer::Integer = 1)
+    inner >= 0 || throw(ArgumentError("inner keyword argument must be non-negative"))
+    outer >= 0 || throw(ArgumentError("outer keyword argument must be non-negative"))
+    return mapcols!(col -> repeat(col, inner = inner, outer = outer), df)
+end
+
+"""
+    repeat!(df::DataFrame, count::Integer)
+
+Update a data frame `df` in-place by repeating its rows `count` times and return it.
+Columns of `df` are freshly allocated. A negative `count` throws an `ArgumentError`.
+
+# Example
+```jldoctest
+julia> df = DataFrame(a = 1:2, b = 3:4)
+2×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 3     │
+│ 2   │ 2     │ 4     │
+
+julia> repeat!(df, 2)
+4×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 3     │
+│ 2   │ 2     │ 4     │
+│ 3   │ 1     │ 3     │
+│ 4   │ 2     │ 4     │
+```
+"""
+function repeat!(df::DataFrame, count::Integer)
+    count < 0 && throw(ArgumentError("count must be non-negative"))
+    return mapcols!(x -> repeat(x, count), df)
+end
diff --git a/test/iteration.jl b/test/iteration.jl
@@ -51,6 +51,7 @@ end
     @test mapcols(sum, df_mapcols) == DataFrame(a=55, b=155)
     @test mapcols(x -> 1, df_mapcols) == DataFrame(a=1, b=1)
     @test_throws ArgumentError mapcols(x -> x[1] == 1 ? 0 : [0], df_mapcols)
+    @test_throws DimensionMismatch mapcols(x -> x[1] == 1 ? [1] : [1,2], df_mapcols)
     @test_throws ArgumentError mapcols(x -> x[1] == 1 ? x : 0, df_mapcols)
     @test_throws ArgumentError mapcols(x -> x[1] != 1 ? x : 0, df_mapcols)
     df_mapcols2 = mapcols(x -> x, df_mapcols)
@@ -59,6 +60,29 @@ end
     @test df_mapcols2.b !== df_mapcols.b
 end
 
+@testset "mapcols!" begin
+    df_mapcols = DataFrame(a=1:10, b=11:20)
+    mapcols!(sum, df_mapcols) # scalar results become one-row columns
+    @test df_mapcols == DataFrame(a=55, b=155)
+
+    df_mapcols = DataFrame(a=1:10, b=11:20)
+    mapcols!(x -> 1, df_mapcols)
+    @test df_mapcols == DataFrame(a=1, b=1)
+
+    df_mapcols = DataFrame(a=1:10, b=11:20)
+    @test_throws ArgumentError mapcols!(x -> x[1] == 1 ? 0 : [0], df_mapcols)
+    @test_throws DimensionMismatch mapcols!(x -> x[1] == 1 ? [1] : [1,2], df_mapcols)
+    @test_throws ArgumentError mapcols!(x -> x[1] == 1 ? x : 0, df_mapcols)
+    @test_throws ArgumentError mapcols!(x -> x[1] != 1 ? x : 0, df_mapcols)
+    @test df_mapcols == DataFrame(a=1:10, b=11:20) # failed calls left `df_mapcols` intact
+
+    a = df_mapcols.a
+    b = df_mapcols.b
+    mapcols!(x -> x, df_mapcols)
+    @test a === df_mapcols.a # columns returned by `f` are stored as-is, not copied
+    @test b === df_mapcols.b
+end
+
 @testset "SubDataFrame" begin
     df = DataFrame([11:16 21:26 31:36 41:46])
     sdf = view(df, [3,1,4], [3,1,4])