Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add allcombinations #3031

Merged
merged 12 commits into from
Apr 25, 2022
5 changes: 4 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,16 @@
([#3012](https://github.com/JuliaData/DataFrames.jl/issues/3012))
* Guarantee that `permute!` and `invpermute!` throw on invalid input
([#3035](https://github.com/JuliaData/DataFrames.jl/pull/3035))
* Add `allcombinations` function that returns a data frame created
from all combinations of the passed vectors
([#3031](https://github.com/JuliaData/DataFrames.jl/pull/3031))

## Previously announced breaking changes

* On Julia 1.7 or newer, broadcasting assignment
into an existing column of a data frame replaces it. Under Julia 1.6
or older, it is an in-place operation.
([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022)
([#3022](https://github.com/JuliaData/DataFrames.jl/pull/3022))

## Performance

Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[compat]
CategoricalArrays = "0.10.0"
Compat = "3.17"
DataAPI = "1.9"
DataAPI = "1.10"
InvertedIndices = "1"
IteratorInterfaceExtensions = "0.1.1, 1"
Missings = "0.4.2, 1"
Expand Down
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Pages = ["functions.md"]

## Constructing data frames
```@docs
allcombinations
copy
similar
```
Expand Down
2 changes: 2 additions & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ using PrettyTables
using Random

import DataAPI,
DataAPI.allcombinations,
DataAPI.All,
DataAPI.Between,
DataAPI.Cols,
Expand Down Expand Up @@ -39,6 +40,7 @@ export AbstractDataFrame,
GroupedDataFrame,
SubDataFrame,
Tables,
allcombinations,
allowmissing!,
antijoin,
columnindex,
Expand Down
18 changes: 6 additions & 12 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1519,14 +1519,8 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
# Create a vector of vectors of unique values in each column
uniquevals = []
for col in colind
# levels drops missing, handle the case where missing values are present
# All levels are retained, missing is added only if present
# TODO: change this after DataAPI.jl levels supports missing
if any(ismissing, df[!, col])
tempcol = vcat(levels(df[!, col]), missing)
else
tempcol = levels(df[!, col])
end
tempcol = levels(df[!, col], skipmissing=false)
bkamins marked this conversation as resolved.
Show resolved Hide resolved
push!(uniquevals, tempcol)
end

Expand Down Expand Up @@ -2528,12 +2522,12 @@ function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.inv
nrow(df) != length(p) &&
throw(DimensionMismatch("Permutation does not have a correct length " *
"(expected $(nrow(df)) but got $(length(p)))"))

cp = _compile_permutation!(Base.copymutable(p))

isempty(cp) && return df

if fun === Base.invpermute!!
if fun === Base.invpermute!!
reverse!(@view cp[1:end-1])
end

Expand All @@ -2544,14 +2538,14 @@ function _permutation_helper!(fun::Union{typeof(Base.permute!!), typeof(Base.inv
_cycle_permute!(col, cp)
end
end

return df
end

# convert a classical permutation to zero terminated cycle
# convert a classical permutation to zero terminated cycle
# notation, zeroing the original permutation in the process.
function _compile_permutation!(p::AbstractVector{<:Integer})
firstindex(p) == 1 ||
firstindex(p) == 1 ||
throw(ArgumentError("Permutation vectors must have 1-based indexing"))
# this length is sufficient because we do not record 1-cycles,
# so the worst case is all 2-cycles. One extra element gives the
Expand Down
87 changes: 87 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1572,3 +1572,90 @@ function _replace_columns!(df::DataFrame, newdf::DataFrame)
copy!(index(df).lookup, index(newdf).lookup)
return df
end

# Keyword-argument entry point: without keywords return an empty data frame,
# otherwise forward the keywords as `Symbol => value` pairs to the positional method.
function allcombinations(::Type{DataFrame}; kwargs...)
    isempty(kwargs) && return DataFrame()
    return allcombinations(DataFrame, kwargs...)
end

# String-named entry point: convert every column name to a `Symbol` and forward
# to the `Pair{Symbol}` method.
function allcombinations(::Type{DataFrame}, pairs::Pair{<:AbstractString, <:Any}...)
    sympairs = map(p -> Symbol(first(p)) => last(p), pairs)
    return allcombinations(DataFrame, sympairs...)
end

"""
    allcombinations(DataFrame, pairs::Pair...)
    allcombinations(DataFrame; kwargs...)

Create a `DataFrame` from all combinations of values in passed arguments.

Arguments associating a column name with values to expand can be specified
either as `Pair`s passed as positional arguments, or as keyword arguments.
Column names must be `Symbol`s or strings and must be unique.

Column value can be a vector which is consumed as is or an object of any other
type (except `AbstractArray`). In the latter case the passed value is treated
as having length one for expansion. As a particular rule values stored in a `Ref`
or a `0`-dimensional `AbstractArray` are unwrapped and treated as having length one.

# Examples

```jldoctest
julia> allcombinations(DataFrame, a=1:2, b='a':'c')
6×2 DataFrame
 Row │ a      b
     │ Int64  Char
─────┼─────────────
   1 │     1  a
   2 │     2  a
   3 │     1  b
   4 │     2  b
   5 │     1  c
   6 │     2  c

julia> allcombinations(DataFrame, "a" => 1:2, "b" => 'a':'c', "c" => "const")
6×3 DataFrame
 Row │ a      b     c
     │ Int64  Char  String
─────┼─────────────────────
   1 │     1  a     const
   2 │     2  a     const
   3 │     1  b     const
   4 │     2  b     const
   5 │     1  c     const
   6 │     2  c     const
```
"""
function allcombinations(::Type{DataFrame}, pairs::Pair{Symbol, <:Any}...)
    # Column names are the first elements of the pairs; duplicates are rejected.
    colnames = first.(pairs)
    if !allunique(colnames)
        throw(ArgumentError("All column names passed to allcombinations must be unique"))
    end

    # Normalize every value to an AbstractVector:
    # * vectors are used as they are,
    # * `Ref`s and 0-dimensional arrays are unwrapped into a one-element column,
    # * other arrays (matrices etc.) are an error,
    # * anything else becomes a one-element column holding that value.
    colvalues = map(pairs) do p
        v = last(p)
        if v isa AbstractVector
            return v
        elseif v isa Union{AbstractArray{<:Any, 0}, Ref}
            x = v[]
            return fill!(Tables.allocatecolumn(typeof(x), 1), x)
        elseif v isa AbstractArray
            throw(ArgumentError("adding AbstractArray other than AbstractVector " *
                                "as a column of a data frame is not allowed"))
        else
            return fill!(Tables.allocatecolumn(typeof(v), 1), v)
        end
    end
    @assert length(colvalues) == length(colnames)
    @assert all(x -> x isa AbstractVector, colvalues)

    # The number of rows is the product of the column lengths. It is accumulated
    # as BigInt so it cannot silently wrap around; `Int` throws if it does not fit.
    target_rows = Int(prod(x -> big(length(x)), colvalues))
    out_df = DataFrame()
    # `inner` is the running product of the lengths of the columns handled so far.
    inner = 1
    for (val, cname) in zip(colvalues, colnames)
        len = length(val)
        last_inner = inner
        inner *= len
        # Once an empty column has been seen `inner` is 0 and the result has no rows;
        # avoid dividing by zero in that case.
        outer, remv = inner == 0 ? (0, 0) : divrem(target_rows, inner)
        @assert iszero(remv)
        # Each element is repeated `last_inner` times in a row (so earlier columns
        # vary faster) and the whole pattern is tiled `outer` times.
        out_df[!, cname] = repeat(val, inner=last_inner, outer=outer)
    end
    @assert inner == target_rows
    @assert size(out_df) == (target_rows, length(colnames))
    return out_df
end
68 changes: 68 additions & 0 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -615,4 +615,72 @@ end
@test_throws ArgumentError fillcombinations(df, 2)
end

@testset "allcombinations" begin
    # Fresh zero-length categorical vector that still carries the level "a".
    emptycat() = categorical(String[], levels=["a"])

    # With no columns requested the result is an empty data frame.
    @test allcombinations(DataFrame) == DataFrame()

    # Keyword, string-pair and symbol-pair call forms give the same result.
    @test allcombinations(DataFrame, a=1:2, b=3:4) ==
          allcombinations(DataFrame, "a" => 1:2, "b" => 3:4) ==
          allcombinations(DataFrame, :a => 1:2, :b => 3:4) ==
          DataFrame(a=[1, 2, 1, 2], b=[3, 3, 4, 4])
    # Mixed string/symbol names and duplicated names are errors.
    @test_throws MethodError allcombinations(DataFrame, "a" => 1:2, :b => 3:4)
    @test_throws ArgumentError allcombinations(DataFrame, "a" => 1:2, "a" => 3:4)

    # A categorical input keeps its type and its levels (including unused ones).
    df_lvl = allcombinations(DataFrame,
                             a=categorical(["a", "b", "a"], levels=["c", "b", "a"]))
    @test df_lvl == DataFrame(a=["a", "b", "a"])
    @test df_lvl.a isa CategoricalVector
    @test levels(df_lvl.a) == ["c", "b", "a"]

    # `Ref`, 0-dimensional arrays and non-array objects count as length one.
    @test allcombinations(DataFrame, a=categorical(["a", "b", "a"]),
                          b=Ref([1, 2]),
                          c=fill(1:2),
                          d=DataFrame(p=1, q=2)) ==
          DataFrame(a=categorical(["a", "b", "a"]),
                    b=Ref([1, 2]),
                    c=fill(1:2),
                    d=DataFrame(p=1, q=2))
    @test allcombinations(DataFrame, a=categorical(["a", "b", "a"]),
                          b=Ref([1, 2]),
                          c=fill(1:2),
                          d=DataFrame(p=1, q=2),
                          e=1:2) ==
          DataFrame(a=categorical(["a", "b", "a", "a", "b", "a"]),
                    b=Ref([1, 2]),
                    c=fill(1:2),
                    d=DataFrame(p=1, q=2),
                    e=[1, 1, 1, 2, 2, 2])
    # Arrays that are not vectors are rejected.
    @test_throws ArgumentError allcombinations(DataFrame, a=[1 2; 3 4])

    # Repeated values in the inputs are not deduplicated.
    @test allcombinations(DataFrame, a=[1, 1, 1], b=[2, 2, 2]) ==
          DataFrame(a=fill(1, 9), b=fill(2, 9))
    @test allcombinations(DataFrame, a=[1, 1, 1], b='a':'b', c=[2, 2, 2]) ==
          DataFrame(a=fill(1, 18), b=repeat('a':'b', inner=3, outer=3), c=fill(2, 18))

    # An empty column as the only input.
    df_only = allcombinations(DataFrame, b=emptycat())
    @test nrow(df_only) == 0
    @test names(df_only) == ["b"]
    @test typeof(df_only.b) <: CategoricalVector{String}
    @test levels(df_only.b) == ["a"]

    # An empty column in the first position.
    df_first = allcombinations(DataFrame, b=emptycat(), c='a':'b')
    @test nrow(df_first) == 0
    @test names(df_first) == ["b", "c"]
    @test typeof(df_first.b) <: CategoricalVector{String}
    @test levels(df_first.b) == ["a"]
    @test typeof(df_first.c) === Vector{Char}

    # An empty column in the last position.
    df_last = allcombinations(DataFrame, a=1:3, b=emptycat())
    @test nrow(df_last) == 0
    @test names(df_last) == ["a", "b"]
    @test typeof(df_last.a) === Vector{Int}
    @test typeof(df_last.b) <: CategoricalVector{String}
    @test levels(df_last.b) == ["a"]

    # An empty column between two non-empty ones.
    df_mid = allcombinations(DataFrame, a=1:3, b=emptycat(), c='a':'b')
    @test nrow(df_mid) == 0
    @test names(df_mid) == ["a", "b", "c"]
    @test typeof(df_mid.a) === Vector{Int}
    @test typeof(df_mid.b) <: CategoricalVector{String}
    @test levels(df_mid.b) == ["a"]
    @test typeof(df_mid.c) === Vector{Char}
end

end # module