Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

deprecate stackdf and meltdf #2031

Merged
merged 9 commits into from
Dec 6, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ map
melt
stack
unstack
stackdf
meltdf
```

## Basics
Expand Down
2 changes: 0 additions & 2 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ export AbstractDataFrame,
insertcols!,
mapcols,
melt,
meltdf,
ncol,
nonunique,
nrow,
Expand All @@ -49,7 +48,6 @@ export AbstractDataFrame,
select,
select!,
stack,
stackdf,
unique!,
unstack

Expand Down
167 changes: 64 additions & 103 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
stack(df::AbstractDataFrame, [measure_vars], [id_vars];
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
melt(df::AbstractDataFrame, [id_vars], [measure_vars];
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)

Stack a data frame `df`, i.e. convert it from wide to long format.

Expand All @@ -12,9 +12,10 @@ column `variable_name` (`:variable` by default) a vector of `Symbol`s holding
the name of the corresponding `measure_vars` variable,
and with columns for each of the `id_vars`.

See also [`stackdf`](@ref) and [`meltdf`](@ref) for stacking methods that return a
view into the original data frame. See [`unstack`](@ref) for converting from
long to wide format.
If `view=true`, return a stacked view of the data frame (long format) instead of a copy.
The result is a view because its columns are special `AbstractVector`s
that return indexed views into the original data frame.
bkamins marked this conversation as resolved.
Show resolved Hide resolved


# Arguments
- `df` : the AbstractDataFrame to be stacked
Expand All @@ -31,6 +32,8 @@ long to wide format.
of each of `measure_vars`
- `value_name` : the name of the new stacked column containing the values from
each of `measure_vars`
- `view` : whether the stacked data frame should be a view rather than contain
bkamins marked this conversation as resolved.
Show resolved Hide resolved
freshly allocated vectors.

# Examples
```julia
Expand All @@ -50,7 +53,9 @@ d1s_name = melt(d1, [:a, :b, :e], variable_name=:somemeasure)

function stack(df::AbstractDataFrame, measure_vars::AbstractVector{<:Integer},
id_vars::AbstractVector{<:Integer}; variable_name::Symbol=:variable,
value_name::Symbol=:value)
value_name::Symbol=:value, view::Bool=false)
view && return _stackdf(df, measure_vars, id_vars, variable_name=variable_name,
bkamins marked this conversation as resolved.
Show resolved Hide resolved
value_name=value_name)
N = length(measure_vars)
cnames = names(df)[id_vars]
insert!(cnames, 1, value_name)
Expand All @@ -60,60 +65,63 @@ function stack(df::AbstractDataFrame, measure_vars::AbstractVector{<:Integer},
[repeat(df[!, c], outer=N) for c in id_vars]...], # id_var columns
cnames, copycols=false)
end

function stack(df::AbstractDataFrame, measure_var::Int, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stack(df, [measure_var], [id_var];
variable_name=variable_name, value_name=value_name)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
stack(df, [measure_var], [id_var],
variable_name=variable_name, value_name=value_name, view=view)
end

function stack(df::AbstractDataFrame, measure_vars::AbstractVector{<:Integer}, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stack(df, measure_vars, [id_var];
variable_name=variable_name, value_name=value_name)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
stack(df, measure_vars, [id_var],
variable_name=variable_name, value_name=value_name, view=view)
end

function stack(df::AbstractDataFrame, measure_var::Int, id_vars::AbstractVector{<:Integer};
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
stack(df, [measure_var], id_vars;
variable_name=variable_name, value_name=value_name)
variable_name=variable_name, value_name=value_name, view=view)
end

function stack(df::AbstractDataFrame, measure_vars, id_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
stack(df, index(df)[measure_vars], index(df)[id_vars];
variable_name=variable_name, value_name=value_name)
variable_name=variable_name, value_name=value_name, view=view)
end

# Default choice of measure variables when the caller specifies none: build one
# `Bool` per column of `df`, `true` where the column's element type is a subtype
# of `Union{AbstractFloat, Missing}` (i.e. floating-point, optionally with missings).
function numeric_vars(df::AbstractDataFrame)
    return [eltype(column) <: Union{AbstractFloat, Missing} for column in eachcol(df)]
end

function stack(df::AbstractDataFrame, measure_vars = numeric_vars(df);
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
mv_inds = index(df)[measure_vars]
stack(df, mv_inds, setdiff(1:ncol(df), mv_inds);
variable_name=variable_name, value_name=value_name)
variable_name=variable_name, value_name=value_name, view=view)
end

function melt(df::AbstractDataFrame, id_vars::ColumnIndex;
variable_name::Symbol=:variable, value_name::Symbol=:value)
melt(df, [id_vars]; variable_name=variable_name, value_name=value_name)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
melt(df, [id_vars]; variable_name=variable_name, value_name=value_name, view=view)
end

function melt(df::AbstractDataFrame, id_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
id_inds = index(df)[id_vars]
stack(df, setdiff(1:ncol(df), id_inds), id_inds;
variable_name=variable_name, value_name=value_name)
variable_name=variable_name, value_name=value_name, view=view)
end

function melt(df::AbstractDataFrame, id_vars, measure_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false)
stack(df, measure_vars, id_vars; variable_name=variable_name,
value_name=value_name)
value_name=value_name, view=view)
end
melt(df::AbstractDataFrame; variable_name::Symbol=:variable, value_name::Symbol=:value) =
stack(df; variable_name=variable_name, value_name=value_name)

##############################################################################
##
## unstack()
##
##############################################################################
melt(df::AbstractDataFrame;
variable_name::Symbol=:variable, value_name::Symbol=:value, view::Bool=false) =
stack(df; variable_name=variable_name, value_name=value_name, view=view)

"""
unstack(df::AbstractDataFrame, rowkeys::Union{Integer, Symbol},
Expand Down Expand Up @@ -300,15 +308,6 @@ end
function unstack(df::AbstractDataFrame; renamecols::Function=identity)
    # Convenience method: delegate to the positional form with the columns
    # `:variable` and `:value`, forwarding `renamecols` unchanged.
    return unstack(df, :variable, :value, renamecols=renamecols)
end

##############################################################################
##
## Reshaping using referencing (issue #145)
## New AbstractVector types (all read only):
## StackedVector
## RepeatedVector
##
##############################################################################

"""
StackedVector <: AbstractVector{Any}

Expand Down Expand Up @@ -413,99 +412,61 @@ function CategoricalArrays.CategoricalArray(v::RepeatedVector)
res
end

"""
stackdf(df::AbstractDataFrame, [measure_vars], [id_vars];
variable_name::Symbol=:variable, value_name::Symbol=:value)
meltdf(df::AbstractDataFrame, [id_vars], [measure_vars];
variable_name::Symbol=:variable, value_name::Symbol=:value)

Return a stacked view of a data frame (long format).

Like [`stack`](@ref) and [`melt`](@ref), but a view is returned rather than data
copies.

Return a `DataFrame` with a column `value_name` (`:value` by default)
holding the values of the stacked columns (`measure_vars`), with
column `variable_name` (`:variable` by default) a vector of `Symbol`s with the `measure_vars` name,
and with columns for each of the `id_vars`.

The result is a view because the columns are special `AbstractVectors`
that return indexed views into the original data frame.

# Arguments
- `df` : the wide AbstractDataFrame
- `measure_vars` : the columns to be stacked (the measurement
variables), a normal column indexing type, like a `Symbol`,
`Vector{Symbol}`, Int, etc.; for `meltdf`, defaults to all
variables that are not `id_vars`
- `id_vars` : the identifier columns that are repeated during
stacking, a normal column indexing type; for `stackdf` defaults to all
variables that are not `measure_vars`

# Examples
```julia
d1 = DataFrame(a = repeat([1:3;], inner = [4]),
b = repeat([1:4;], inner = [3]),
c = randn(12),
d = randn(12),
e = map(string, 'a':'l'))

d1s = stackdf(d1, [:c, :d])
d1s2 = stackdf(d1, [:c, :d], [:a])
d1m = meltdf(d1, [:a, :b, :e])
```
"""
(stackdf, meltdf)

function stackdf(df::AbstractDataFrame, measure_vars::AbstractVector{<:Integer},
function _stackdf(df::AbstractDataFrame, measure_vars::AbstractVector{<:Integer},
id_vars::AbstractVector{<:Integer}; variable_name::Symbol=:variable,
bkamins marked this conversation as resolved.
Show resolved Hide resolved
value_name::Symbol=:value)
N = length(measure_vars)
cnames = names(df)[id_vars]
insert!(cnames, 1, value_name)
insert!(cnames, 1, variable_name)
DataFrame(AbstractVector[RepeatedVector(_names(df)[measure_vars], nrow(df), 1), # variable
StackedVector(Any[df[!, c] for c in measure_vars]), # value
[RepeatedVector(df[!, c], 1, N) for c in id_vars]...], # id_var columns
StackedVector(Any[df[!, c] for c in measure_vars]), # value
[RepeatedVector(df[!, c], 1, N) for c in id_vars]...], # id_var columns
cnames, copycols=false)
end
function stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int;

function _stackdf(df::AbstractDataFrame, measure_var::Int, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stackdf(df, [measure_var], [id_var]; variable_name=variable_name,
_stackdf(df, [measure_var], [id_var]; variable_name=variable_name,
value_name=value_name)
end
function stackdf(df::AbstractDataFrame, measure_vars, id_var::Int;

function _stackdf(df::AbstractDataFrame, measure_vars, id_var::Int;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stackdf(df, measure_vars, [id_var]; variable_name=variable_name,
_stackdf(df, measure_vars, [id_var]; variable_name=variable_name,
value_name=value_name)
end
function stackdf(df::AbstractDataFrame, measure_var::Int, id_vars;

function _stackdf(df::AbstractDataFrame, measure_var::Int, id_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stackdf(df, [measure_var], id_vars; variable_name=variable_name,
_stackdf(df, [measure_var], id_vars; variable_name=variable_name,
value_name=value_name)
end
function stackdf(df::AbstractDataFrame, measure_vars, id_vars;

function _stackdf(df::AbstractDataFrame, measure_vars, id_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stackdf(df, index(df)[measure_vars], index(df)[id_vars];
_stackdf(df, index(df)[measure_vars], index(df)[id_vars];
variable_name=variable_name, value_name=value_name)
end
function stackdf(df::AbstractDataFrame, measure_vars = numeric_vars(df);

function _stackdf(df::AbstractDataFrame, measure_vars = numeric_vars(df);
variable_name::Symbol=:variable, value_name::Symbol=:value)
m_inds = index(df)[measure_vars]
stackdf(df, m_inds, setdiff(1:ncol(df), m_inds);
_stackdf(df, m_inds, setdiff(1:ncol(df), m_inds);
variable_name=variable_name, value_name=value_name)
end

function meltdf(df::AbstractDataFrame, id_vars; variable_name::Symbol=:variable,
function _meltdf(df::AbstractDataFrame, id_vars; variable_name::Symbol=:variable,
value_name::Symbol=:value)
id_inds = index(df)[id_vars]
stackdf(df, setdiff(1:ncol(df), id_inds), id_inds;
_stackdf(df, setdiff(1:ncol(df), id_inds), id_inds;
variable_name=variable_name, value_name=value_name)
end
function meltdf(df::AbstractDataFrame, id_vars, measure_vars;

function _meltdf(df::AbstractDataFrame, id_vars, measure_vars;
variable_name::Symbol=:variable, value_name::Symbol=:value)
stackdf(df, measure_vars, id_vars; variable_name=variable_name,
_stackdf(df, measure_vars, id_vars; variable_name=variable_name,
value_name=value_name)
end
meltdf(df::AbstractDataFrame; variable_name::Symbol=:variable, value_name::Symbol=:value) =
stackdf(df; variable_name=variable_name, value_name=value_name)
_meltdf(df::AbstractDataFrame; variable_name::Symbol=:variable, value_name::Symbol=:value) =
_stackdf(df; variable_name=variable_name, value_name=value_name)
3 changes: 3 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1545,3 +1545,6 @@ import Base: setproperty!
import DataAPI: describe
# Deprecate the `describe` method that takes a leading `io` argument: the call is
# forwarded to `describe(df, stats...; cols)` with `io` dropped.
@deprecate describe(io::IO, df::AbstractDataFrame, stats::Union{Symbol, Pair{Symbol}}...;
cols=:) describe(df, stats..., cols=cols)

# `stackdf` and `meltdf` are deprecated in favor of `stack` and `melt` called with
# `view=true`; all positional and keyword arguments are forwarded unchanged.
@deprecate stackdf(args...; kwargs...) stack(args...; kwargs..., view=true)
@deprecate meltdf(args...; kwargs...) melt(args...; kwargs..., view=true)
34 changes: 21 additions & 13 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -253,11 +253,19 @@ end
@test size(dx) == (0, 3)
@test names(dx) == [:variable, :value, :a]

@test stackdf(d1, :a) == stackdf(d1, [:a])
@test stack(d1, :a, view=true) == stack(d1, [:a], view=true)
@test all(isa.(eachcol(stackdf(d1, :a)),
[DataFrames.RepeatedVector;
DataFrames.StackedVector;
fill(DataFrames.RepeatedVector, 4)]))
@test all(isa.(eachcol(meltdf(d1, [:b, :c, :d, :e])),
[DataFrames.RepeatedVector;
DataFrames.StackedVector;
fill(DataFrames.RepeatedVector, 4)]))

# Tests of RepeatedVector and StackedVector indexing
d1s = stackdf(d1, [:a, :b])
@test d1s == stackdf(d1, r"[ab]")
d1s = stack(d1, [:a, :b], view=true)
@test d1s == stack(d1, r"[ab]", view=true)
@test d1s[!, 1] isa DataFrames.RepeatedVector
@test ndims(d1s[!, 1]) == 1
@test ndims(typeof(d1s[!, 1])) == 1
Expand All @@ -281,24 +289,24 @@ end
@test [d1s[!, 1][1:12]; d1s[!, 1][13:24]] == d1s[!, 1]
@test [d1s[!, 2][1:12]; d1s[!, 2][13:24]] == d1s[!, 2]

d1s2 = stackdf(d1, [:c, :d])
@test d1s2 == stackdf(d1, r"[cd]")
d1s3 = stackdf(d1)
d1m = meltdf(d1, [:c, :d, :e])
@test d1m == meltdf(d1, r"[cde]")
d1s2 = stack(d1, [:c, :d], view=true)
@test d1s2 == stack(d1, r"[cd]", view=true)
d1s3 = stack(d1, view=true)
d1m = melt(d1, [:c, :d, :e], view=true)
@test d1m == melt(d1, r"[cde]", view=true)
@test d1s[1:12, :c] == d1[!, :c]
@test d1s[13:24, :c] == d1[!, :c]
@test d1s2 == d1s3
@test names(d1s) == [:variable, :value, :c, :d, :e]
@test d1s == d1m
d1m = meltdf(d1[:, [1,3,4]], :a)
d1m = melt(d1[:, [1,3,4]], :a, view=true)
@test names(d1m) == [:variable, :value, :a]

d1s_named = stackdf(d1, [:a, :b], variable_name=:letter, value_name=:someval)
@test d1s_named == stackdf(d1, r"[ab]", variable_name=:letter, value_name=:someval)
d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true)
@test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval, view=true)
@test names(d1s_named) == [:letter, :someval, :c, :d, :e]
d1m_named = meltdf(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval)
@test d1m_named == meltdf(d1, r"[cde]", variable_name=:letter, value_name=:someval)
d1m_named = melt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval, view=true)
@test d1m_named == melt(d1, r"[cde]", variable_name=:letter, value_name=:someval, view=true)
@test names(d1m_named) == [:letter, :someval, :c, :d, :e]

d1s[!, :id] = Union{Int, Missing}[1:12; 1:12]
Expand Down