diff --git a/NEWS.md b/NEWS.md index 1c22d9dd19..52c0447b15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,13 +28,16 @@ ([#3012](https://github.com/JuliaData/DataFrames.jl/issues/3012)) * Guarantee that `permute!` and `invpermute!` throw on invalid input ([#3035](https://github.com/JuliaData/DataFrames.jl/pull/3035)) +* Add `allcombinations` function that returns a data frame created + from all combinations of the passed vectors + ([#3031](https://github.com/JuliaData/DataFrames.jl/pull/3031)) ## Previously announced breaking changes * On Julia 1.7 or newer broadcasting assignment into an existing column of a data frame replaces it. Under Julia 1.6 or older it is an in place operation. - ([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022) + ([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022)) ## Performance diff --git a/Project.toml b/Project.toml index 5b8e3e5dce..ebc02f7ce2 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] CategoricalArrays = "0.10.0" Compat = "3.17" -DataAPI = "1.9" +DataAPI = "1.10" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2, 1" diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 807a05fbc3..dcb6758343 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -43,6 +43,7 @@ Pages = ["functions.md"] ## Constructing data frames ```@docs +allcombinations copy similar ``` diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 9b253210e3..63b6b77a01 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -11,6 +11,7 @@ using PrettyTables using Random import DataAPI, + DataAPI.allcombinations, DataAPI.All, DataAPI.Between, DataAPI.Cols, @@ -39,6 +40,7 @@ export AbstractDataFrame, GroupedDataFrame, SubDataFrame, Tables, + allcombinations, allowmissing!, antijoin, columnindex, diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 
a753959b7c..cbfeec278b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1519,14 +1519,8 @@ function fillcombinations(df::AbstractDataFrame, indexcols; # Create a vector of vectors of unique values in each column uniquevals = [] for col in colind - # levels drops missing, handle the case where missing values are present # All levels are retained, missing is added only if present - # TODO: change this after DataAPI.jl levels supports missing - if any(ismissing, df[!, col]) - tempcol = vcat(levels(df[!, col]), missing) - else - tempcol = levels(df[!, col]) - end + tempcol = levels(df[!, col], skipmissing=false) push!(uniquevals, tempcol) end @@ -2528,12 +2522,12 @@ function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.inv nrow(df) != length(p) && throw(DimensionMismatch("Permutation does not have a correct length " * "(expected $(nrow(df)) but got $(length(p)))")) - + cp = _compile_permutation!(Base.copymutable(p)) isempty(cp) && return df - if fun === Base.invpermute!! + if fun === Base.invpermute!! reverse!(@view cp[1:end-1]) end @@ -2544,14 +2538,14 @@ function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.inv _cycle_permute!(col, cp) end end - + return df end -# convert a classical permutation to zero terminated cycle +# convert a classical permutation to zero terminated cycle # notation, zeroing the original permutation in the process. function _compile_permutation!(p::AbstractVector{<:Integer}) - firstindex(p) == 1 || + firstindex(p) == 1 || throw(ArgumentError("Permutation vectors must have 1-based indexing")) # this length is sufficient because we do not record 1-cycles, # so the worst case is all 2-cycles. 
One extra element gives the
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index ce973d6694..cf5917a29b 100755
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -1572,3 +1572,107 @@ function _replace_columns!(df::DataFrame, newdf::DataFrame)
     copy!(index(df).lookup, index(newdf).lookup)
     return df
 end
+
+function allcombinations(::Type{DataFrame}; kwargs...)
+    isempty(kwargs) && return DataFrame()
+    return allcombinations(DataFrame, kwargs...)
+end
+
+function allcombinations(::Type{DataFrame},
+                         pairs::Pair{<:AbstractString, <:Any}...)
+    # string names are converted to `Symbol`s and passed to the main method
+    sympairs = map(p -> Symbol(first(p)) => last(p), pairs)
+    return allcombinations(DataFrame, sympairs...)
+end
+
+# Turn a value passed to `allcombinations` into the vector that gets expanded:
+# vectors are kept as is, `Ref`s and 0-dimensional arrays are unwrapped,
+# and any other non-array value becomes a one-element vector.
+_allcombinations_column(v::AbstractVector) = v
+_allcombinations_column(v::AbstractArray{<:Any, 0}) = _allcombinations_single(v[])
+_allcombinations_column(v::Ref) = _allcombinations_single(v[])
+_allcombinations_column(::AbstractArray) =
+    throw(ArgumentError("adding AbstractArray other than AbstractVector " *
+                        "as a column of a data frame is not allowed"))
+_allcombinations_column(v) = _allcombinations_single(v)
+
+# One-element column holding `x`
+_allcombinations_single(x) = fill!(Tables.allocatecolumn(typeof(x), 1), x)
+
+"""
+    allcombinations(DataFrame, pairs::Pair...)
+    allcombinations(DataFrame; kwargs...)
+
+Create a `DataFrame` holding one row for every combination of the values
+passed for each column. Values of the first column vary fastest.
+
+Columns are given either as `name => values` pairs passed as positional
+arguments or as `name=values` keyword arguments. Column names must be
+`Symbol`s or strings (all of the same kind) and must be unique.
+
+If `values` is a vector it is used as is. A `Ref` or a `0`-dimensional
+`AbstractArray` is unwrapped and its content is treated as having length one.
+A value of any other type (except `AbstractArray`) is also treated as
+having length one for expansion.
+
+An `ArgumentError` is thrown if column names are not unique or if an
+`AbstractArray` that is neither a vector nor `0`-dimensional is passed.
+
+# Examples
+
+```jldoctest
+julia> allcombinations(DataFrame, a=1:2, b='a':'c')
+6×2 DataFrame
+ Row │ a      b
+     │ Int64  Char
+─────┼─────────────
+   1 │     1  a
+   2 │     2  a
+   3 │     1  b
+   4 │     2  b
+   5 │     1  c
+   6 │     2  c
+
+julia> allcombinations(DataFrame, "a" => 1:2, "b" => 'a':'c', "c" => "const")
+6×3 DataFrame
+ Row │ a      b     c
+     │ Int64  Char  String
+─────┼─────────────────────
+   1 │     1  a     const
+   2 │     2  a     const
+   3 │     1  b     const
+   4 │     2  b     const
+   5 │     1  c     const
+   6 │     2  c     const
+```
+"""
+function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
+    colnames = map(first, pairs)
+    allunique(colnames) ||
+        throw(ArgumentError("All column names passed to allcombinations must be unique"))
+    colvalues = map(p -> _allcombinations_column(last(p)), pairs)
+    @assert length(colvalues) == length(colnames)
+    @assert all(x -> x isa AbstractVector, colvalues)
+
+    # total number of rows; computed on `BigInt`s so that a product not fitting
+    # in `Int` makes the conversion throw rather than silently wrap around
+    nrows = Int(prod(col -> big(length(col)), colvalues))
+    result = DataFrame()
+    # product of lengths of the columns processed so far
+    cumlen = 1
+    for (name, col) in zip(colnames, colvalues)
+        prev_cumlen = cumlen
+        cumlen *= length(col)
+        if cumlen == 0
+            # some column is empty, so the result has no rows
+            reps, leftover = 0, 0
+        else
+            reps, leftover = divrem(nrows, cumlen)
+        end
+        @assert iszero(leftover)
+        result[!, name] = repeat(col, inner=prev_cumlen, outer=reps)
+    end
+    @assert cumlen == nrows
+    @assert size(result) == (nrows, length(colnames))
+    return result
+end
diff --git a/test/data.jl b/test/data.jl
index f17db1f83f..2a1b7fe478 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -615,4 +615,72 @@ end
     @test_throws ArgumentError fillcombinations(df, 2)
 end
 
+@testset "allcombinations" begin
+    @test allcombinations(DataFrame) == DataFrame()
+    @test allcombinations(DataFrame, a=1:2, b=3:4) ==
+          allcombinations(DataFrame, "a" => 1:2, "b" => 3:4) ==
+          allcombinations(DataFrame, :a => 1:2, :b => 3:4) ==
+          DataFrame(a=[1, 2, 1, 2], b=[3, 3, 4, 4])
+    @test_throws MethodError allcombinations(DataFrame, "a" => 1:2, :b => 3:4)
+    @test_throws ArgumentError allcombinations(DataFrame, "a" => 1:2, "a" => 3:4)
+
+    res = allcombinations(DataFrame, a=categorical(["a", "b", "a"], levels=["c", "b", "a"]))
+    @test res == DataFrame(a=["a", "b", "a"])
+    @test res.a isa CategoricalVector
+    @test levels(res.a) == ["c", "b", "a"]
+
+    @test allcombinations(DataFrame, a=categorical(["a", "b", "a"]),
+                          b=Ref([1, 2]),
+                          c=fill(1:2),
+                          d=DataFrame(p=1, q=2)) ==
+          DataFrame(a=categorical(["a", "b", "a"]),
+                    b=Ref([1, 2]),
+                    c=fill(1:2),
+                    d=DataFrame(p=1, q=2))
+    @test allcombinations(DataFrame, a=categorical(["a", "b", "a"]),
+                          b=Ref([1, 2]),
+                          c=fill(1:2),
+                          d=DataFrame(p=1, q=2),
+                          e=1:2) ==
+          DataFrame(a=categorical(["a", "b", "a", "a", "b", "a"]),
+                    b=Ref([1, 2]),
+                    c=fill(1:2),
+                    d=DataFrame(p=1, q=2),
+                    e=[1, 1, 1, 2, 2, 2])
+    @test_throws ArgumentError allcombinations(DataFrame, a=[1 2; 3 4])
+
+    @test allcombinations(DataFrame, a=[1, 1, 1], b=[2, 2, 2]) ==
+          DataFrame(a=fill(1, 9), b=fill(2, 9))
+    @test allcombinations(DataFrame, a=[1, 1, 1], b='a':'b', c=[2, 2, 2]) ==
+          DataFrame(a=fill(1, 18), b=repeat('a':'b', inner=3, outer=3), c=fill(2, 18))
+
+    res = allcombinations(DataFrame, b=categorical(String[], levels=["a"]))
+    @test nrow(res) == 0
+    @test names(res) == ["b"]
+    @test typeof(res.b) <: CategoricalVector{String}
+    @test levels(res.b) == ["a"]
+
+    res = allcombinations(DataFrame, b=categorical(String[], levels=["a"]), c='a':'b')
+    @test nrow(res) == 0
+    @test names(res) == ["b", "c"]
+    @test typeof(res.b) <: CategoricalVector{String}
+    @test levels(res.b) == ["a"]
+    @test typeof(res.c) === Vector{Char}
+
+    res = allcombinations(DataFrame, a=1:3, b=categorical(String[], levels=["a"]))
+    @test nrow(res) == 0
+    @test names(res) == ["a", "b"]
+    @test typeof(res.a) === Vector{Int}
+    @test typeof(res.b) <: CategoricalVector{String}
+    @test levels(res.b) == ["a"]
+
+    res = allcombinations(DataFrame, a=1:3, b=categorical(String[], levels=["a"]), c='a':'b')
+    @test nrow(res) == 0
+    @test names(res) == ["a", "b", "c"]
+    @test typeof(res.a) === Vector{Int}
+    @test typeof(res.b) <: CategoricalVector{String}
+    @test levels(res.b) == ["a"]
+    @test typeof(res.c) === Vector{Char}
+end
+
 end # module