From 17da97173b62d13668be39d23ab92c8cdf9181af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 26 Sep 2022 23:19:07 +0200 Subject: [PATCH 1/5] rename valuestransform to valuesfunction in unstack --- NEWS.md | 2 +- docs/src/man/reshaping_and_pivoting.md | 4 +-- src/abstractdataframe/reshape.jl | 40 +++++++++++------------ test/metadata.jl | 4 +-- test/multithreading.jl | 6 ++-- test/reshape.jl | 44 +++++++++++++------------- 6 files changed, 50 insertions(+), 50 deletions(-) diff --git a/NEWS.md b/NEWS.md index 45317dc54d..4402e4597b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,7 +18,7 @@ for a more flexible handling of values stored in a column that will become a new header ([#3004](https://github.com/JuliaData/DataFrames.jl/issues/3004)) -* `unstack` now allows passing a function in `valuestransform` keyword argument; +* `unstack` now allows passing a function in `valuesfunction` keyword argument; this allows for a convenient creation of two dimensional pivot tables ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998)) * `filter` for `GroupedDataFrame` now accepts `ungroup` keyword argument diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index 21d6138abc..e2974e400f 100755 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -297,7 +297,7 @@ Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. To do aggregation, use the split-apply-combine functions in combination with -`unstack` or use the `valuestransform` keyword argument in `unstack`. Here is an example: +`unstack` or use the `valuesfunction` keyword argument in `unstack`. 
Here is an example: ```jldoctest reshape julia> using Statistics @@ -357,7 +357,7 @@ julia> unstack(agg, :variable, :Species, :vmean) 4 │ PetalWidth 0.244 1.326 2.026 5 │ id 25.5 75.5 125.5 -julia> unstack(d, :variable, :Species, :value, valuestransform=mean) +julia> unstack(d, :variable, :Species, :value, valuesfunction=mean) 5×4 DataFrame Row │ variable Iris-setosa Iris-versicolor Iris-virginica │ String Float64? Float64? Float64? diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index b974886cad..9226366549 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -215,15 +215,15 @@ end """ unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuestransform=nothing, + allowduplicates::Bool=false, valuesfunction=nothing, fill=missing, threads::Bool=true) unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuestransform=nothing, + allowduplicates::Bool=false, valuesfunction=nothing, fill=missing, threads::Bool=true) unstack(df::AbstractDataFrame; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuestransform=nothing, + allowduplicates::Bool=false, valuesfunction=nothing, fill=missing, threads::Bool=true) Unstack data frame `df`, i.e. convert it from long to wide format. @@ -252,18 +252,18 @@ Row and column keys will be ordered in the order of their first appearance. - `allowduplicates`: if `false` (the default) then an error an error will be thrown if combination of `rowkeys` and `colkey` contains duplicate entries; if `true` then the last encountered `value` will be retained; - this keyword argument is ignored if `valuestransform` keyword argument is passed. 
-- `valuestransform`: if passed then `allowduplicates` is ignored and instead + this keyword argument is ignored if `valuesfunction` keyword argument is passed. +- `valuesfunction`: if passed then `allowduplicates` is ignored and instead the passed function will be called on a vector view containing all elements for each combination of `rowkeys` and `colkey` present in the data. - `fill`: missing row/column combinations are filled with this value. The default is `missing`. If the `value` column is a `CategoricalVector` and `fill` is not `missing` then in order to keep unstacked value columns also `CategoricalVector` the `fill` must be passed as `CategoricalValue` -- `threads`: whether `valuestransform` may be run in separate tasks which +- `threads`: whether `valuesfunction` may be run in separate tasks which can execute in parallel (possibly being applied to multiple groups at the same time). Whether or not tasks are actually spawned and their number are determined automatically. - Set to `false` if `valuestransform` requires serial execution or is not thread-safe. + Set to `false` if `valuesfunction` requires serial execution or is not thread-safe. Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata for row keys columns are preserved. @@ -401,14 +401,14 @@ julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4]) 2 │ a 2 3 │ b 4 -julia> unstack(df, :cols, :values, valuestransform=copy) +julia> unstack(df, :cols, :values, valuesfunction=copy) 1×2 DataFrame Row │ a b │ Array…? Array…? ─────┼────────────────── 1 │ [1, 2] [4] -julia> unstack(df, :cols, :values, valuestransform=sum) +julia> unstack(df, :cols, :values, valuesfunction=sum) 1×2 DataFrame Row │ a b │ Int64? Int64? 
@@ -419,7 +419,7 @@ julia> unstack(df, :cols, :values, valuestransform=sum) function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, values::ColumnIndex; renamecols::Function=identity, allowmissing::Bool=false, allowduplicates::Bool=false, - valuestransform=nothing, fill=missing, + valuesfunction=nothing, fill=missing, threads::Bool=true) # first make sure that rowkeys are unique and # normalize all selectors as a strings @@ -428,7 +428,7 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, colkey = only(names(df, colkey)) values = only(names(df, values)) - if !isnothing(valuestransform) + if !isnothing(valuesfunction) # potentially colkey can be also part of rowkeys so we need to do unique groupcols = unique!([rowkeys; colkey]) @assert groupcols isa Vector{String} @@ -441,16 +441,16 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, end gdf = groupby(df, groupcols) - if check_aggregate(valuestransform, df[!, values]) isa AbstractAggregate - # if valuestransform function is AbstractAggregate + if check_aggregate(valuesfunction, df[!, values]) isa AbstractAggregate + # if valuesfunction function is AbstractAggregate # then we are sure it will return a scalar number so we can # leave it as is and be sure we use fast path in combine - agg_fun = valuestransform + agg_fun = valuesfunction else - # in general valuestransform function could return e.g. a vector, + # in general valuesfunction function could return e.g. 
a vector, # which would get expanded to multiple rows so we protect it with # Ref that will get unwrapped by combine - agg_fun = Ref∘valuestransform + agg_fun = Ref∘valuesfunction end df_op = combine(gdf, values => agg_fun => values_out, threads=threads) @@ -478,22 +478,22 @@ end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex; renamecols::Function=identity, allowmissing::Bool=false, allowduplicates::Bool=false, - valuestransform=nothing, fill=missing, + valuesfunction=nothing, fill=missing, threads::Bool=true) colkey_int = index(df)[colkey] value_int = index(df)[values] return unstack(df, Not(colkey_int, value_int), colkey_int, value_int, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, valuestransform=valuestransform, + allowduplicates=allowduplicates, valuesfunction=valuesfunction, fill=fill, threads=threads) end unstack(df::AbstractDataFrame; renamecols::Function=identity, allowmissing::Bool=false, allowduplicates::Bool=false, - valuestransform=nothing, fill=missing, + valuesfunction=nothing, fill=missing, threads::Bool=true) = unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, valuestransform=valuestransform, + allowduplicates=allowduplicates, valuesfunction=valuesfunction, fill=fill, threads=threads) # we take into account the fact that idx, starts and ends are computed lazily diff --git a/test/metadata.jl b/test/metadata.jl index 40d6c06c7e..df02346169 100644 --- a/test/metadata.jl +++ b/test/metadata.jl @@ -1327,7 +1327,7 @@ end @test check_allnotemetadata(res) @test getfield(res, :metadata) === nothing @test getfield(res, :colmetadata) === nothing - res = unstack(long, :a, :variable, :value, valuestransform=copy) + res = unstack(long, :a, :variable, :value, valuesfunction=copy) @test check_allnotemetadata(res) @test getfield(res, :metadata) === nothing @test getfield(res, :colmetadata) === nothing @@ -1361,7 +1361,7 @@ end 
@test isempty(colmetadatakeys(res, :c)) @test isempty(colmetadatakeys(res, :d)) - res = unstack(long, :a, :variable, :value, valuestransform=copy) + res = unstack(long, :a, :variable, :value, valuesfunction=copy) @test check_allnotemetadata(res) @test collect(metadatakeys(res)) == ["name"] @test metadata(res, "name") == "empty" diff --git a/test/multithreading.jl b/test/multithreading.jl index c81ee0ac1f..65ab239820 100644 --- a/test/multithreading.jl +++ b/test/multithreading.jl @@ -237,15 +237,15 @@ end m = Ref(0) n = Ref(0) unstack(df, - allowduplicates=true, valuestransform=x -> (l[] += 1), + allowduplicates=true, valuesfunction=x -> (l[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) == unstack(df, :variable, :value, - allowduplicates=true, valuestransform=x -> (m[] += 1), + allowduplicates=true, valuesfunction=x -> (m[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) == unstack(df, :id, :variable, :value, - allowduplicates=true, valuestransform=x -> (n[] += 1), + allowduplicates=true, valuesfunction=x -> (n[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) diff --git a/test/reshape.jl b/test/reshape.jl index 6c80544ac3..f2676b028a 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -769,17 +769,17 @@ end df = DataFrame(x=[:one, :two, :one], y=[1, 2, 3]) @test_throws ArgumentError unstack(df, :x, :y) @test unstack(df, :x, :y, allowduplicates=true) == DataFrame(one=3, two=2) - @test unstack(df, :x, :y, valuestransform=identity) == + @test unstack(df, :x, :y, valuesfunction=identity) == DataFrame(one=[[1, 3]], two=[[2]]) - @test unstack(df, :x, :y, valuestransform=last) == + @test unstack(df, :x, :y, valuesfunction=last) == DataFrame(one=3, two=2) - @test unstack(df, :x, :y, valuestransform=first) == + @test unstack(df, :x, :y, valuesfunction=first) == DataFrame(one=1, two=2) - @test unstack(df, :x, :y, valuestransform=length) == + @test unstack(df, :x, :y, valuesfunction=length) == 
DataFrame(one=2, two=1) end -@testset "valuestransform" begin +@testset "valuesfunction" begin df = DataFrame(rowid=[1, 1, 1, 1, 2, 2], colid=[1, 1, 2, 2, 3, 3], values=1:6) @test_throws ArgumentError unstack(df, :rowid, :colid, :values) @test unstack(df, :rowid, :colid, :values, allowduplicates=true) ≅ @@ -788,42 +788,42 @@ end @test unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) == DataFrame("rowid" => 1:2, "1" => [2, 0], "2" => [4, 0], "3" => [0, 6]) - @test unstack(df, :rowid, :colid, :values, valuestransform=identity) ≅ + @test unstack(df, :rowid, :colid, :values, valuesfunction=identity) ≅ DataFrame("rowid" => 1:2, "1" => [1:2, missing], "2" => [3:4, missing], "3" => [missing, 5:6]) @test unstack(df, :rowid, :colid, :values, - valuestransform=identity, fill=Int[]) == + valuesfunction=identity, fill=Int[]) == DataFrame("rowid" => 1:2, "1" => [1:2, []], "2" => [3:4, []], "3" => [[], 5:6]) - @test unstack(df, :rowid, :colid, :values, valuestransform=sum) ≅ + @test unstack(df, :rowid, :colid, :values, valuesfunction=sum) ≅ DataFrame("rowid" => 1:2, "1" => [3, missing], "2" => [7, missing], "3" => [missing, 11]) - @test unstack(df, :rowid, :colid, :values, valuestransform=sum, fill=0) == + @test unstack(df, :rowid, :colid, :values, valuesfunction=sum, fill=0) == DataFrame("rowid" => 1:2, "1" => [3, 0], "2" => [7, 0], "3" => [0, 11]) - @test unstack(df, :rowid, :colid, :values, valuestransform=sum, fill="X") == + @test unstack(df, :rowid, :colid, :values, valuesfunction=sum, fill="X") == DataFrame("rowid" => 1:2, "1" => [3, "X"], "2" => [7, "X"], "3" => ["X", 11]) - @test unstack(df, :rowid, :colid, :values, valuestransform=length) ≅ + @test unstack(df, :rowid, :colid, :values, valuesfunction=length) ≅ DataFrame("rowid" => 1:2, "1" => [2, missing], "2" => [2, missing], "3" => [missing, 2]) - @test unstack(df, :rowid, :colid, :values, valuestransform=length, fill=0) == + @test unstack(df, :rowid, :colid, :values, valuesfunction=length, fill=0) == 
DataFrame("rowid" => 1:2, "1" => [2, 0], "2" => [2, 0], "3" => [0, 2]) @test unstack(df, :rowid, :colid, :values, - valuestransform=x -> isempty(x) ? missing : length(x)) ≅ + valuesfunction=x -> isempty(x) ? missing : length(x)) ≅ DataFrame("rowid" => 1:2, "1" => [2, missing], "2" => [2, missing], "3" => [missing, 2]) @test unstack(df, :rowid, :colid, :values, - valuestransform=x -> isempty(x) ? missing : x) ≅ + valuesfunction=x -> isempty(x) ? missing : x) ≅ DataFrame("rowid" => 1:2, "1" => [1:2, missing], "2" => [3:4, missing], "3" => [missing, 5:6]) df = DataFrame(rowid=[2, 2, 2, 2, 1, 1], colid=[2, 2, 1, 1, 3, 3], values=1:6) - @test unstack(df, :rowid, :colid, :values, valuestransform=identity) ≅ + @test unstack(df, :rowid, :colid, :values, valuesfunction=identity) ≅ DataFrame("rowid" => [2,1], "2" => [1:2, missing], "1" => [3:4, missing], "3" => [missing, 5:6]) - @test unstack(df, :rowid, :colid, :values, valuestransform=identity, fill="X") == + @test unstack(df, :rowid, :colid, :values, valuesfunction=identity, fill="X") == DataFrame("rowid" => [2,1], "2" => [1:2, "X"], "1" => [3:4, "X"], "3" => ["X", 5:6]) @@ -831,22 +831,22 @@ end # check correctness of row and column ordering for _ in 1:10 df = DataFrame(rowid=rand(1:10, 50), colid=rand(1:10, 50), values=1:50) - res = unstack(df, :rowid, :colid, :values, valuestransform=last) + res = unstack(df, :rowid, :colid, :values, valuesfunction=last) @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) @test res.rowid == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) - res = unstack(df, :rowid, :colid, :values, valuestransform=last, fill=0) + res = unstack(df, :rowid, :colid, :values, valuesfunction=last, fill=0) @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) @test res.rowid == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) df.rowid=categorical(df.rowid, levels=shuffle(unique(df.rowid))) df.colid=categorical(df.colid, 
levels=shuffle(unique(df.colid))) - res = unstack(df, :rowid, :colid, :values, valuestransform=last) + res = unstack(df, :rowid, :colid, :values, valuesfunction=last) @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) @test unwrap.(res.rowid) == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) - res = unstack(df, :rowid, :colid, :values, valuestransform=last, fill=0) + res = unstack(df, :rowid, :colid, :values, valuesfunction=last, fill=0) @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) @test unwrap.(res.rowid) == unique(df.rowid) @@ -893,7 +893,7 @@ end for c in (:a, :b, :c, "a", "b", "c", 1, 2, 3) for v in (:a, :b, :c, "a", "b", "c", 1, 2, 3) @test unstack(df, r, c, v) ≅ - broadcast(x -> x isa Vector ? only(x) : x, unstack(df, r, c, v, valuestransform=copy)) + broadcast(x -> x isa Vector ? only(x) : x, unstack(df, r, c, v, valuesfunction=copy)) end end end @@ -906,7 +906,7 @@ end d=["a", missing, missing], e=[missing, "b", missing], f=[missing, missing, "c"]) - @test unstack(df, 3, 2, 1, valuestransform=only) ≅ + @test unstack(df, 3, 2, 1, valuesfunction=only) ≅ DataFrame(values_out_3490283_11=["g", "h", "i"], d=["a", missing, missing], e=[missing, "b", missing], From 8db0c0eb0314f5b9b6534a10b857d13fd5077ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 1 Oct 2022 12:15:24 +0200 Subject: [PATCH 2/5] change kwarg to combine and deprecate allowduplicates --- NEWS.md | 11 ++- docs/src/man/reshaping_and_pivoting.md | 4 +- src/abstractdataframe/reshape.jl | 118 +++++++++++++------------ test/deprecated.jl | 15 ++++ test/metadata.jl | 4 +- test/multithreading.jl | 9 +- test/reshape.jl | 70 +++++++-------- 7 files changed, 128 insertions(+), 103 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4402e4597b..16e7a44079 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,9 +18,10 @@ for a more flexible handling of values stored in a column that will become a new header 
([#3004](https://github.com/JuliaData/DataFrames.jl/issues/3004)) -* `unstack` now allows passing a function in `valuesfunction` keyword argument; +* `unstack` now allows passing a function in `combine` keyword argument; this allows for a convenient creation of two dimensional pivot tables - ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998)) + ([#2998](https://github.com/JuliaData/DataFrames.jl/issues/2998), + [#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185)) * `filter` for `GroupedDataFrame` now accepts `ungroup` keyword argument ([#3021](https://github.com/JuliaData/DataFrames.jl/issues/3021)) * Add special syntax for `eachindex`, `groupindices`, and `proprow` @@ -65,6 +66,12 @@ or older it is an in place operation. ([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022)) +## Deprecations + +* `allowduplicates` keyword argument in `unstack` is deprecated; + the `combine` keyword argument should be used instead + ([#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185)) + ## Internal changes * `DataFrame` is now a `mutable struct` and has three new fields diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md index e2974e400f..e3cde762b6 100755 --- a/docs/src/man/reshaping_and_pivoting.md +++ b/docs/src/man/reshaping_and_pivoting.md @@ -297,7 +297,7 @@ Id columns -- `RepeatedVector` This repeats the original columns N times where N is the number of columns stacked. To do aggregation, use the split-apply-combine functions in combination with -`unstack` or use the `valuesfunction` keyword argument in `unstack`. Here is an example: +`unstack` or use the `combine` keyword argument in `unstack`. 
Here is an example: ```jldoctest reshape julia> using Statistics @@ -357,7 +357,7 @@ julia> unstack(agg, :variable, :Species, :vmean) 4 │ PetalWidth 0.244 1.326 2.026 5 │ id 25.5 75.5 125.5 -julia> unstack(d, :variable, :Species, :value, valuesfunction=mean) +julia> unstack(d, :variable, :Species, :value, combine=mean) 5×4 DataFrame Row │ variable Iris-setosa Iris-versicolor Iris-virginica │ String Float64? Float64? Float64? diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 9226366549..00ce50187d 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -215,20 +215,17 @@ end """ unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuesfunction=nothing, - fill=missing, threads::Bool=true) + combine=only, fill=missing, threads::Bool=true) unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuesfunction=nothing, - fill=missing, threads::Bool=true) + combine=only, fill=missing, threads::Bool=true) unstack(df::AbstractDataFrame; renamecols::Function=identity, allowmissing::Bool=false, - allowduplicates::Bool=false, valuesfunction=nothing, - fill=missing, threads::Bool=true) + combine=only, fill=missing, threads::Bool=true) Unstack data frame `df`, i.e. convert it from long to wide format. -Row and column keys will be ordered in the order of their first appearance. +Row and column keys are ordered in the order of their first appearance. # Positional arguments - `df` : the AbstractDataFrame to be unstacked @@ -246,27 +243,25 @@ Row and column keys will be ordered in the order of their first appearance. return the name of the column to be created (typically as a string or a `Symbol`). Duplicates in resulting names when converted to `Symbol` are not allowed. By default no transformation is performed. 
-- `allowmissing`: if `false` (the default) then an error will be thrown if +- `allowmissing`: if `false` (the default) then an error is thrown if `colkey` contains `missing` values; if `true` then a column referring to - `missing` value will be created. -- `allowduplicates`: if `false` (the default) then an error an error will be - thrown if combination of `rowkeys` and `colkey` contains duplicate entries; if - `true` then the last encountered `value` will be retained; - this keyword argument is ignored if `valuesfunction` keyword argument is passed. -- `valuesfunction`: if passed then `allowduplicates` is ignored and instead - the passed function will be called on a vector view containing all elements - for each combination of `rowkeys` and `colkey` present in the data. + `missing` value is created. +- `combine`: if `only` (the default) then an error is thrown if combination + of `rowkeys` and `colkey` contains duplicate entries. Otherwise the passed + value must be a function that is called on a vector view containing all + elements for each combination of `rowkeys` and `colkey` present in the data. - `fill`: missing row/column combinations are filled with this value. The default is `missing`. If the `value` column is a `CategoricalVector` and `fill` is not `missing` then in order to keep unstacked value columns also `CategoricalVector` the `fill` must be passed as `CategoricalValue` -- `threads`: whether `valuesfunction` may be run in separate tasks which - can execute in parallel (possibly being applied to multiple groups at the same time). - Whether or not tasks are actually spawned and their number are determined automatically. - Set to `false` if `valuesfunction` requires serial execution or is not thread-safe. +- `threads`: whether `combine` function may be run in separate tasks which can + execute in parallel (possibly being applied to multiple groups at the same + time). 
Whether or not tasks are actually spawned and their number are + determined automatically. Set to `false` if `combine` requires serial + execution or is not thread-safe. -Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata -for row keys columns are preserved. +Metadata: table-level `:note`-style metadata and column-level `:note`-style +metadata for row keys columns are preserved. # Examples @@ -401,14 +396,14 @@ julia> df = DataFrame(cols=["a", "a", "b"], values=[1, 2, 4]) 2 │ a 2 3 │ b 4 -julia> unstack(df, :cols, :values, valuesfunction=copy) +julia> unstack(df, :cols, :values, combine=copy) 1×2 DataFrame Row │ a b │ Array…? Array…? ─────┼────────────────── 1 │ [1, 2] [4] -julia> unstack(df, :cols, :values, valuesfunction=sum) +julia> unstack(df, :cols, :values, combine=sum) 1×2 DataFrame Row │ a b │ Int64? Int64? @@ -418,9 +413,13 @@ julia> unstack(df, :cols, :values, valuesfunction=sum) """ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, values::ColumnIndex; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, - valuesfunction=nothing, fill=missing, - threads::Bool=true) + allowmissing::Bool=false, allowduplicates::Bool=false, + combine=only, fill=missing, threads::Bool=true) + if allowduplicates + Base.depwarn("allowduplicates keyword argument is deprecated. 
" * + "Pass `combine=last` instead of allowduplicates=true.", :unstack) + combine = last + end # first make sure that rowkeys are unique and # normalize all selectors as a strings # if some of the selectors are wrong we will get an early error here @@ -428,7 +427,7 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, colkey = only(names(df, colkey)) values = only(names(df, values)) - if !isnothing(valuesfunction) + if combine !== only # potentially colkey can be also part of rowkeys so we need to do unique groupcols = unique!([rowkeys; colkey]) @assert groupcols isa Vector{String} @@ -441,60 +440,67 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, end gdf = groupby(df, groupcols) - if check_aggregate(valuesfunction, df[!, values]) isa AbstractAggregate - # if valuesfunction function is AbstractAggregate + if check_aggregate(combine, df[!, values]) isa AbstractAggregate + # if combine function is AbstractAggregate # then we are sure it will return a scalar number so we can # leave it as is and be sure we use fast path in combine - agg_fun = valuesfunction + agg_fun = combine else - # in general valuesfunction function could return e.g. a vector, + # in general combine function could return e.g. 
a vector, # which would get expanded to multiple rows so we protect it with # Ref that will get unwrapped by combine - agg_fun = Ref∘valuesfunction + agg_fun = Ref∘combine end - df_op = combine(gdf, values => agg_fun => values_out, - threads=threads) + df_op = DataFrames.combine(gdf, values => agg_fun => values_out, + threads=threads) group_rows = find_group_row(gdf) if !issorted(group_rows) df_op = df_op[sortperm(group_rows), :] end - # set allowduplicates to true as we should not have any duplicates now - # and allowduplicates=true is a bit faster - allowduplicates = true + # we should not have any duplicates in df_op now + noduplicates = true else df_op = df values_out = values + noduplicates = false end g_rowkey = groupby(df_op, rowkeys) g_colkey = groupby(df_op, colkey) valuecol = df_op[!, values_out] return _unstack(df_op, index(df_op)[rowkeys], index(df_op)[colkey], g_colkey, - valuecol, g_rowkey, renamecols, - allowmissing, allowduplicates, fill) + valuecol, g_rowkey, renamecols, allowmissing, noduplicates, fill) end function unstack(df::AbstractDataFrame, colkey::ColumnIndex, values::ColumnIndex; - renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, - valuesfunction=nothing, fill=missing, - threads::Bool=true) + renamecols::Function=identity, allowmissing::Bool=false, + allowduplicates::Bool=false, combine=only, fill=missing, + threads::Bool=true) + if allowduplicates + Base.depwarn("allowduplicates keyword argument is deprecated. 
" * + "Pass `combine=last` instead of allowduplicates=true.", :unstack) + combine = last + end colkey_int = index(df)[colkey] value_int = index(df)[values] return unstack(df, Not(colkey_int, value_int), colkey_int, value_int, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, valuesfunction=valuesfunction, + combine=combine, fill=fill, threads=threads) end -unstack(df::AbstractDataFrame; renamecols::Function=identity, - allowmissing::Bool=false, allowduplicates::Bool=false, - valuesfunction=nothing, fill=missing, - threads::Bool=true) = +function unstack(df::AbstractDataFrame; renamecols::Function=identity, + allowmissing::Bool=false, allowduplicates::Bool=false, + combine=only, fill=missing, threads::Bool=true) + if allowduplicates + Base.depwarn("allowduplicates keyword argument is deprecated. " * + "Pass `combine=last` instead of allowduplicates=true.", :unstack) + combine = last + end unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing, - allowduplicates=allowduplicates, valuesfunction=valuesfunction, - fill=fill, threads=threads) + combine=combine, fill=fill, threads=threads) +end # we take into account the fact that idx, starts and ends are computed lazily # so we rather directly reference the gdf.groups @@ -521,8 +527,7 @@ end function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, colkey::Int, g_colkey::GroupedDataFrame, valuecol::AbstractVector, g_rowkey::GroupedDataFrame, - renamecols::Function, allowmissing::Bool, - allowduplicates::Bool, fill) + renamecols::Function, allowmissing::Bool, noduplicates::Bool, fill) rowref = g_rowkey.groups row_group_row_idxs = find_group_row(g_rowkey) Nrow = length(g_rowkey) @@ -543,8 +548,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, Nrow), fill) for _ in 1:Ncol] - # use a separate path for allowduplicates to reduce memory use and increase speed - if allowduplicates + # use a separate path for noduplicates to reduce 
memory use and increase speed + if noduplicates for (k, (row_id, col_id, val)) in enumerate(zip(rowref, colref, valuecol)) unstacked_val[col_id][row_id] = val end @@ -556,7 +561,8 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, bad_var = colref_map[col_id] throw(ArgumentError("Duplicate entries in unstack at row $k for key "* "$bad_key and variable $bad_var. " * - "Pass allowduplicates=true to allow them.")) + "Pass `combine` keyword argument to specify " * + "how they should be handled.")) end unstacked_val[col_id][row_id] = val mask_filled[row_id, col_id] = true diff --git a/test/deprecated.jl b/test/deprecated.jl index 6e2035564f..beaba2770b 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -140,4 +140,19 @@ end @test df == DataFrame(x=1, y=1:4) end +@testset "deprecated allowduplicates in unstack" begin + df = DataFrame(row=[1, 1, 2, 2], variable=["x", "x", "y", "y"], value=1:4) + @test_throws ArgumentError unstack(df, :row, :variable, :value) + @test unstack(df, :row, :variable, :value, allowduplicates=true) ≅ + DataFrame(row=1:2, x=[2, missing], y=[missing, 4]) + @test unstack(df, :variable, :value, allowduplicates=true) ≅ + DataFrame(row=1:2, x=[2, missing], y=[missing, 4]) + @test unstack(df, allowduplicates=true) ≅ + DataFrame(row=1:2, x=[2, missing], y=[missing, 4]) + @test unstack(df, :variable, :value, allowduplicates=true) ≅ + DataFrame(row=1:2, x=[2, missing], y=[missing, 4]) + @test unstack(df, :row, :variable, :value, allowduplicates=true) ≅ + unstack(df, :row, :variable, :value, combine=last) +end + end # module diff --git a/test/metadata.jl b/test/metadata.jl index df02346169..b9cccfc7f9 100644 --- a/test/metadata.jl +++ b/test/metadata.jl @@ -1327,7 +1327,7 @@ end @test check_allnotemetadata(res) @test getfield(res, :metadata) === nothing @test getfield(res, :colmetadata) === nothing - res = unstack(long, :a, :variable, :value, valuesfunction=copy) + res = unstack(long, :a, :variable, :value, combine=copy) 
@test check_allnotemetadata(res) @test getfield(res, :metadata) === nothing @test getfield(res, :colmetadata) === nothing @@ -1361,7 +1361,7 @@ end @test isempty(colmetadatakeys(res, :c)) @test isempty(colmetadatakeys(res, :d)) - res = unstack(long, :a, :variable, :value, valuesfunction=copy) + res = unstack(long, :a, :variable, :value, combine=copy) @test check_allnotemetadata(res) @test collect(metadatakeys(res)) == ["name"] @test metadata(res, "name") == "empty" diff --git a/test/multithreading.jl b/test/multithreading.jl index 65ab239820..a1826e76da 100644 --- a/test/multithreading.jl +++ b/test/multithreading.jl @@ -236,16 +236,13 @@ end l = Ref(0) m = Ref(0) n = Ref(0) - unstack(df, - allowduplicates=true, valuesfunction=x -> (l[] += 1), + unstack(df, combine=x -> (l[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) == - unstack(df, :variable, :value, - allowduplicates=true, valuesfunction=x -> (m[] += 1), + unstack(df, :variable, :value, combine=x -> (m[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) == - unstack(df, :id, :variable, :value, - allowduplicates=true, valuesfunction=x -> (n[] += 1), + unstack(df, :id, :variable, :value, combine=x -> (n[] += 1), threads=false) == DataFrame(id=1:3, a=[1, 3, 5], b=[2, 4, 6]) diff --git a/test/reshape.jl b/test/reshape.jl index f2676b028a..f766f0bfaa 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -144,8 +144,8 @@ end variable=["a", "b", "a", "b"], value=[3, 4, 5, 6]) @test_throws ArgumentError unstack(df, :id, :variable, :value) @test_throws ArgumentError unstack(df, :variable, :value) - a = unstack(df, :id, :variable, :value, allowduplicates=true) - b = unstack(df, :variable, :value, allowduplicates=true) + a = unstack(df, :id, :variable, :value, combine=last) + b = unstack(df, :variable, :value, combine=last) @test a ≅ DataFrame(id=[1, 2], a=[5, missing], b=[missing, 6]) @test b ≅ DataFrame(id=[1, 2], id2=[1, 2], a=[5, missing], b=[missing, 6]) @@ -157,8 +157,8 @@ 
end df = DataFrame(variable=["x", "x"], value=[missing, missing], id=[1, 1]) @test_throws ArgumentError unstack(df, :variable, :value) @test_throws ArgumentError unstack(df, :id, :variable, :value) - @test unstack(df, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) - @test unstack(df, :id, :variable, :value, allowduplicates=true) ≅ DataFrame(id=1, x=missing) + @test unstack(df, :variable, :value, combine=last) ≅ DataFrame(id=1, x=missing) + @test unstack(df, :id, :variable, :value, combine=last) ≅ DataFrame(id=1, x=missing) end @testset "missing values in colkey" begin @@ -546,9 +546,9 @@ end df[4, 1:2] .= 1 @test_throws ArgumentError unstack(df, :id, :var, :val) @test_throws ArgumentError unstack(df, [:id, :id2], :var, :val) - @test unstack(df, :id, :var, :val, allowduplicates=true) ≅ + @test unstack(df, :id, :var, :val, combine=last) ≅ DataFrame(id=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) - @test unstack(df, [:id, :id2], :var, :val, allowduplicates=true) ≅ + @test unstack(df, [:id, :id2], :var, :val, combine=last) ≅ DataFrame(id=1:3, id2=1:3, a=[4, missing, 7], b=2:3:8, c=3:3:9) df = DataFrame(id=repeat(1:3, inner=3), @@ -768,62 +768,62 @@ end df = DataFrame(x=[:one, :two, :one], y=[1, 2, 3]) @test_throws ArgumentError unstack(df, :x, :y) - @test unstack(df, :x, :y, allowduplicates=true) == DataFrame(one=3, two=2) - @test unstack(df, :x, :y, valuesfunction=identity) == + @test unstack(df, :x, :y, combine=last) == DataFrame(one=3, two=2) + @test unstack(df, :x, :y, combine=identity) == DataFrame(one=[[1, 3]], two=[[2]]) - @test unstack(df, :x, :y, valuesfunction=last) == + @test unstack(df, :x, :y, combine=last) == DataFrame(one=3, two=2) - @test unstack(df, :x, :y, valuesfunction=first) == + @test unstack(df, :x, :y, combine=first) == DataFrame(one=1, two=2) - @test unstack(df, :x, :y, valuesfunction=length) == + @test unstack(df, :x, :y, combine=length) == DataFrame(one=2, two=1) end -@testset "valuesfunction" begin +@testset "combine 
kwarg" begin df = DataFrame(rowid=[1, 1, 1, 1, 2, 2], colid=[1, 1, 2, 2, 3, 3], values=1:6) @test_throws ArgumentError unstack(df, :rowid, :colid, :values) - @test unstack(df, :rowid, :colid, :values, allowduplicates=true) ≅ + @test unstack(df, :rowid, :colid, :values, combine=last) ≅ DataFrame("rowid" => 1:2, "1" => [2, missing], "2" => [4, missing], "3" => [missing, 6]) - @test unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) == + @test unstack(df, :rowid, :colid, :values, combine=last, fill=0) == DataFrame("rowid" => 1:2, "1" => [2, 0], "2" => [4, 0], "3" => [0, 6]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=identity) ≅ + @test unstack(df, :rowid, :colid, :values, combine=identity) ≅ DataFrame("rowid" => 1:2, "1" => [1:2, missing], "2" => [3:4, missing], "3" => [missing, 5:6]) @test unstack(df, :rowid, :colid, :values, - valuesfunction=identity, fill=Int[]) == + combine=identity, fill=Int[]) == DataFrame("rowid" => 1:2, "1" => [1:2, []], "2" => [3:4, []], "3" => [[], 5:6]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=sum) ≅ + @test unstack(df, :rowid, :colid, :values, combine=sum) ≅ DataFrame("rowid" => 1:2, "1" => [3, missing], "2" => [7, missing], "3" => [missing, 11]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=sum, fill=0) == + @test unstack(df, :rowid, :colid, :values, combine=sum, fill=0) == DataFrame("rowid" => 1:2, "1" => [3, 0], "2" => [7, 0], "3" => [0, 11]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=sum, fill="X") == + @test unstack(df, :rowid, :colid, :values, combine=sum, fill="X") == DataFrame("rowid" => 1:2, "1" => [3, "X"], "2" => [7, "X"], "3" => ["X", 11]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=length) ≅ + @test unstack(df, :rowid, :colid, :values, combine=length) ≅ DataFrame("rowid" => 1:2, "1" => [2, missing], "2" => [2, missing], "3" => [missing, 2]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=length, fill=0) == + @test 
unstack(df, :rowid, :colid, :values, combine=length, fill=0) == DataFrame("rowid" => 1:2, "1" => [2, 0], "2" => [2, 0], "3" => [0, 2]) @test unstack(df, :rowid, :colid, :values, - valuesfunction=x -> isempty(x) ? missing : length(x)) ≅ + combine=x -> isempty(x) ? missing : length(x)) ≅ DataFrame("rowid" => 1:2, "1" => [2, missing], "2" => [2, missing], "3" => [missing, 2]) @test unstack(df, :rowid, :colid, :values, - valuesfunction=x -> isempty(x) ? missing : x) ≅ + combine=x -> isempty(x) ? missing : x) ≅ DataFrame("rowid" => 1:2, "1" => [1:2, missing], "2" => [3:4, missing], "3" => [missing, 5:6]) df = DataFrame(rowid=[2, 2, 2, 2, 1, 1], colid=[2, 2, 1, 1, 3, 3], values=1:6) - @test unstack(df, :rowid, :colid, :values, valuesfunction=identity) ≅ + @test unstack(df, :rowid, :colid, :values, combine=identity) ≅ DataFrame("rowid" => [2,1], "2" => [1:2, missing], "1" => [3:4, missing], "3" => [missing, 5:6]) - @test unstack(df, :rowid, :colid, :values, valuesfunction=identity, fill="X") == + @test unstack(df, :rowid, :colid, :values, combine=identity, fill="X") == DataFrame("rowid" => [2,1], "2" => [1:2, "X"], "1" => [3:4, "X"], "3" => ["X", 5:6]) @@ -831,24 +831,24 @@ end # check correctness of row and column ordering for _ in 1:10 df = DataFrame(rowid=rand(1:10, 50), colid=rand(1:10, 50), values=1:50) - res = unstack(df, :rowid, :colid, :values, valuesfunction=last) - @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) + res = unstack(df, :rowid, :colid, :values, combine=last) + @test res ≅ unstack(df, :rowid, :colid, :values, combine=last) @test res.rowid == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) - res = unstack(df, :rowid, :colid, :values, valuesfunction=last, fill=0) - @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) + res = unstack(df, :rowid, :colid, :values, combine=last, fill=0) + @test res ≅ unstack(df, :rowid, :colid, :values, combine=last, fill=0) @test res.rowid == 
unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) df.rowid=categorical(df.rowid, levels=shuffle(unique(df.rowid))) df.colid=categorical(df.colid, levels=shuffle(unique(df.colid))) - res = unstack(df, :rowid, :colid, :values, valuesfunction=last) - @test res ≅ unstack(df, :rowid, :colid, :values, allowduplicates=true) + res = unstack(df, :rowid, :colid, :values, combine=last) + @test res ≅ unstack(df, :rowid, :colid, :values, combine=last) @test unwrap.(res.rowid) == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) - res = unstack(df, :rowid, :colid, :values, valuesfunction=last, fill=0) + res = unstack(df, :rowid, :colid, :values, combine=last, fill=0) @test res ≅ - unstack(df, :rowid, :colid, :values, allowduplicates=true, fill=0) + unstack(df, :rowid, :colid, :values, combine=last, fill=0) @test unwrap.(res.rowid) == unique(df.rowid) @test names(res, Not(1)) == string.(unique(df.colid)) end @@ -893,7 +893,7 @@ end for c in (:a, :b, :c, "a", "b", "c", 1, 2, 3) for v in (:a, :b, :c, "a", "b", "c", 1, 2, 3) @test unstack(df, r, c, v) ≅ - broadcast(x -> x isa Vector ? only(x) : x, unstack(df, r, c, v, valuesfunction=copy)) + broadcast(x -> x isa Vector ? 
only(x) : x, unstack(df, r, c, v, combine=copy)) end end end @@ -906,7 +906,7 @@ end d=["a", missing, missing], e=[missing, "b", missing], f=[missing, missing, "c"]) - @test unstack(df, 3, 2, 1, valuesfunction=only) ≅ + @test unstack(df, 3, 2, 1, combine=only) ≅ DataFrame(values_out_3490283_11=["g", "h", "i"], d=["a", missing, missing], e=[missing, "b", missing], From 9bbe6bcdead35f47219d6bb6b8190e66df3d0861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Oct 2022 11:36:12 +0200 Subject: [PATCH 3/5] Update NEWS.md Co-authored-by: Milan Bouchet-Valat --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 16e7a44079..156924509e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -68,7 +68,7 @@ # Deprecations -* `allowduplicates` keyword argument in `unstack` is deprecated, use +* `allowduplicates` keyword argument in `unstack` is deprecated, `combine` should be used instead ([#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185)) From d651499ccba118a7cc83005ec3d6e8a5996c3167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 2 Oct 2022 11:39:30 +0200 Subject: [PATCH 4/5] improve docstings --- NEWS.md | 2 +- src/abstractdataframe/reshape.jl | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 156924509e..3a6d73c9b4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,7 +69,7 @@ # Deprecations * `allowduplicates` keyword argument in `unstack` is deprecated, - `combine` should be used instead + `combine` keyword argument should be used instead ([#3185](https://github.com/JuliaData/DataFrames.jl/pull/3185)) ## Internal changes diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 00ce50187d..da717359fd 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -263,6 +263,12 @@ Row and column keys are ordered in the order of their first appearance. 
Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata for row keys columns are preserved. +# Deprecations + +- `allowduplicates` keyword argument is deprecated; instead use `combine` + keyword argument; an equivalent to `allowduplicates=true` is `combine=last` + and to `allowduplicates=false` is `combine=only` (the default); + # Examples ```jldoctest From 91f3a0967ed242ebe74852df2a05996a7f7584c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 3 Oct 2022 08:25:15 +0200 Subject: [PATCH 5/5] Update src/abstractdataframe/reshape.jl --- src/abstractdataframe/reshape.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index da717359fd..96a3ab9aeb 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -423,7 +423,7 @@ function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex, combine=only, fill=missing, threads::Bool=true) if allowduplicates Base.depwarn("allowduplicates keyword argument is deprecated. " * - "Pass `combine=last` instead of allowduplicates=true.", :unstack) + "Pass `combine=last` instead of `allowduplicates=true`.", :unstack) combine = last end # first make sure that rowkeys are unique and