Skip to content

Commit

Permalink
Make eachcol names argument default to false
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Apr 1, 2019
1 parent 2b4e64d commit d4dfbb4
Show file tree
Hide file tree
Showing 14 changed files with 132 additions and 160 deletions.
5 changes: 2 additions & 3 deletions docs/src/lib/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,10 @@ Similarly, the `eachcol` function returns a value of the `DataFrameColumns` type
serves as an iterator over columns of an `AbstractDataFrame`.
The return value can have two concrete types:

* If the `eachcol` function is called with the `names` argument set to `true` (currently the default,
but in the future the default will change to `false`) then it returns a value of the
* If the `eachcol` function is called with the `names` argument set to `true` then it returns a value of the
`DataFrameColumns{<:AbstractDataFrame, Pair{Symbol, AbstractVector}}` type, which is an
iterator returning a pair containing the column name and the column vector.
* If the `eachcol` function is called with `names` argument set to `false` then it returns a value of the
* If the `eachcol` function is called with `names` argument set to `false` (the default) then it returns a value of the
`DataFrameColumns{<:AbstractDataFrame, AbstractVector}` type, which is an
iterator returning the column vector only.

Expand Down
42 changes: 21 additions & 21 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ eltypes(df)
```
"""
eltypes(df::AbstractDataFrame) = eltype.(columns(df))
eltypes(df::AbstractDataFrame) = eltype.(eachcol(df))

Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df))
function Base.size(df::AbstractDataFrame, i::Integer)
Expand Down Expand Up @@ -244,7 +244,7 @@ that is different than the number of rows present in `df`.
"""
function Base.similar(df::AbstractDataFrame, rows::Integer = size(df, 1))
rows < 0 && throw(ArgumentError("the number of rows must be non-negative"))
DataFrame(AbstractVector[similar(x, rows) for x in columns(df)], copy(index(df)))
DataFrame(AbstractVector[similar(x, rows) for x in eachcol(df)], copy(index(df)))
end

##############################################################################
Expand Down Expand Up @@ -341,15 +341,15 @@ describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{<:Symbol}}...)
**Arguments**
* `df` : the `AbstractDataFrame`
* `stats::Union{Symbol, Pair{<:Symbol}}...` : the summary statistics to report.
Arguments can be:
* A symbol from the list `:mean`, `:std`, `:min`, `:q25`,
`:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and
* `stats::Union{Symbol, Pair{<:Symbol}}...` : the summary statistics to report.
Arguments can be:
* A symbol from the list `:mean`, `:std`, `:min`, `:q25`,
`:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and
`:nmissing`. The default statistics used
are `:mean`, `:min`, `:median`, `:max`, `:nunique`, `:nmissing`, and `:eltype`.
* `:all` as the only `Symbol` argument to return all statistics.
* `:all` as the only `Symbol` argument to return all statistics.
* A `name => function` pair where `name` is a `Symbol`. This will create
a column of summary statistics with the provided name.
a column of summary statistics with the provided name.
**Result**
Expand Down Expand Up @@ -388,16 +388,16 @@ describe(df, :min, :sum => sum)
```
"""
StatsBase.describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{Symbol}}...) =
StatsBase.describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{Symbol}}...) =
_describe(df, collect(stats))

# TODO: un-comment this method definition after the deprecation period of
# the `stats` keyword for `describe`.
# StatsBase.describe(df::AbstractDataFrame) =
# the `stats` keyword for `describe`.
# StatsBase.describe(df::AbstractDataFrame) =
# _describe(df, [:mean, :min, :median, :max, :nunique, :nmissing, :eltype])

function _describe(df::AbstractDataFrame, stats::AbstractVector)
predefined_funs = Symbol[s for s in stats if s isa Symbol]
function _describe(df::AbstractDataFrame, stats::AbstractVector)
predefined_funs = Symbol[s for s in stats if s isa Symbol]

allowed_fields = [:mean, :std, :min, :q25, :median, :q75,
:max, :nunique, :nmissing, :first, :last, :eltype]
Expand All @@ -406,8 +406,8 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector)
predefined_funs = allowed_fields
i = findfirst(s -> s == :all, stats)
splice!(stats, i, allowed_fields) # insert in the stats vector to get a good order
elseif :all in predefined_funs
throw(ArgumentError("`:all` must be the only `Symbol` argument."))
elseif :all in predefined_funs
throw(ArgumentError("`:all` must be the only `Symbol` argument."))
elseif !issubset(predefined_funs, allowed_fields)
not_allowed = join(setdiff(predefined_funs, allowed_fields), ", :")
allowed_msg = "\nAllowed fields are: :" * join(allowed_fields, ", :")
Expand All @@ -417,7 +417,7 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector)
custom_funs = Pair[s for s in stats if s isa Pair]

ordered_names = [s isa Symbol ? s : s[1] for s in stats]

if !allunique(ordered_names)
duplicate_names = unique(ordered_names[nonunique(DataFrame(ordered_names = ordered_names))])
throw(ArgumentError("Duplicate names not allowed. Duplicated value(s) are: :$(join(duplicate_names, ", "))"))
Expand All @@ -428,7 +428,7 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector)
data[:variable] = names(df)

# An array of Dicts for summary statistics
column_stats_dicts = map(columns(df)) do col
column_stats_dicts = map(eachcol(df)) do col
if eltype(col) >: Missing
t = collect(skipmissing(col))
d = get_stats(t, predefined_funs)
Expand All @@ -438,11 +438,11 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector)
get_stats!(d, col, custom_funs)
end

if :nmissing in predefined_funs
if :nmissing in predefined_funs
d[:nmissing] = eltype(col) >: Missing ? count(ismissing, col) : nothing
end

if :first in predefined_funs
if :first in predefined_funs
d[:first] = isempty(col) ? nothing : first(col)
end

Expand Down Expand Up @@ -509,7 +509,7 @@ function get_stats(col::AbstractVector, stats::AbstractVector{Symbol})
end

function get_stats!(d::Dict, col::AbstractVector, stats::AbstractVector{<:Pair})
for stat in stats
for stat in stats
d[stat[1]] = try stat[2](col) catch end
end
end
Expand Down Expand Up @@ -843,7 +843,7 @@ function Base.convert(::Type{Matrix{T}}, df::AbstractDataFrame) where T
n, p = size(df)
res = Matrix{T}(undef, n, p)
idx = 1
for (name, col) in zip(names(df), columns(df))
for (name, col) in eachcol(df, true)
try
copyto!(res, idx, col)
catch err
Expand Down
46 changes: 17 additions & 29 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,12 @@ struct DataFrameColumns{T<:AbstractDataFrame, V} <: AbstractVector{V}
end

"""
eachcol(df::AbstractDataFrame, names::Bool=true)
eachcol(df::AbstractDataFrame, names::Bool=false)
Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by column.
If `names` is equal to `true` (currently the default, in the future the default
will be set to `false`) iteration returns a pair consisting of column name
and column vector.
If `names` is equal to `false` then column vectors are yielded.
If `names` is equal to `false` (the default) iteration returns column vectors.
If `names` is equal to `true` pairs consisting of column name and column vector
are yielded.
**Examples**
Expand All @@ -120,48 +119,37 @@ julia> df = DataFrame(x=1:4, y=11:14)
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> collect(eachcol(df, true))
2-element Array{Pair{Symbol,AbstractArray{T,1} where T},1}:
:x => [1, 2, 3, 4]
:y => [11, 12, 13, 14]
julia> collect(eachcol(df, false))
julia> collect(eachcol(df))
2-element Array{AbstractArray{T,1} where T,1}:
[1, 2, 3, 4]
[11, 12, 13, 14]
julia> sum.(eachcol(df, false))
2-element Array{Int64,1}:
10
50
julia> map(eachcol(df, false)) do col
julia> map(eachcol(df)) do col
maximum(col) - minimum(col)
end
2-element Array{Int64,1}:
3
3
julia> sum.(eachcol(df))
2-element Array{Int64,1}:
10
50
julia> collect(eachcol(df, true))
2-element Array{Pair{Symbol,AbstractArray{T,1} where T},1}:
:x => [1, 2, 3, 4]
:y => [11, 12, 13, 14]
```
"""
@inline function eachcol(df::T, names::Bool) where T<: AbstractDataFrame
@inline function eachcol(df::T, names::Bool=false) where T<: AbstractDataFrame
if names
DataFrameColumns{T, Pair{Symbol, AbstractVector}}(df)
else
DataFrameColumns{T, AbstractVector}(df)
end
end

# TODO: remove this method after deprecation
# and add default argument value above
function eachcol(df::AbstractDataFrame)
Base.depwarn("In the future eachcol will have names argument set to false by default", :eachcol)
eachcol(df, true)
end

# TODO: remove this method after deprecation
# this is left to make sure we do not forget to properly fix columns calls
columns(df::AbstractDataFrame) = eachcol(df, false)

Base.size(itr::DataFrameColumns) = (size(itr.df, 2),)
Base.IndexStyle(::Type{<:DataFrameColumns}) = Base.IndexLinear()

Expand Down
8 changes: 4 additions & 4 deletions src/abstractdataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol,
cols = Vector{AbstractVector}(undef, ncleft + ncol(dfr_noon))
# inner and left joins preserve non-missingness of the left frame
_similar_left = kind == :inner || kind == :left ? similar : similar_missing
for (i, col) in enumerate(columns(joiner.dfl))
for (i, col) in enumerate(eachcol(joiner.dfl))
cols[i] = _similar_left(col, nrow)
copyto!(cols[i], view(col, all_orig_left_ixs))
end
# inner and right joins preserve non-missingness of the right frame
_similar_right = kind == :inner || kind == :right ? similar : similar_missing
for (i, col) in enumerate(columns(dfr_noon))
for (i, col) in enumerate(eachcol(dfr_noon))
cols[i+ncleft] = _similar_right(col, nrow)
copyto!(cols[i+ncleft], view(col, all_orig_right_ixs))
permute!(cols[i+ncleft], right_perm)
Expand Down Expand Up @@ -407,7 +407,7 @@ end
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false)
r1, r2 = size(df1, 1), size(df2, 1)
colindex = merge(index(df1), index(df2), makeunique=makeunique)
cols = Any[[repeat(c, inner=r2) for c in columns(df1)];
[repeat(c, outer=r1) for c in columns(df2)]]
cols = Any[[repeat(c, inner=r2) for c in eachcol(df1)];
[repeat(c, outer=r1) for c in eachcol(df2)]]
DataFrame(cols, colindex)
end
10 changes: 5 additions & 5 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import Base: @deprecate

import Base: keys, values, insert!
@deprecate keys(df::AbstractDataFrame) names(df)
@deprecate values(df::AbstractDataFrame) columns(df)
@deprecate values(df::AbstractDataFrame) eachcol(df)
@deprecate insert!(df::DataFrame, df2::AbstractDataFrame) (foreach(col -> df[col] = df2[col], names(df2)); df)

@deprecate pool categorical
Expand Down Expand Up @@ -1325,11 +1325,11 @@ import Base: vcat
@deprecate showcols(df::AbstractDataFrame, all::Bool=false, values::Bool=true) describe(df, :eltype, :nmissing, :first, :last)
@deprecate showcols(io::IO, df::AbstractDataFrame, all::Bool=false, values::Bool=true) show(io, describe(df, :eltype, :nmissing, :first, :last), all)
function StatsBase.describe(df::AbstractDataFrame; stats=nothing)
if stats === nothing
_describe(df, [:mean, :min, :median,
:max, :nunique, :nmissing,
if stats === nothing
_describe(df, [:mean, :min, :median,
:max, :nunique, :nmissing,
:eltype])
elseif stats === :all
elseif stats === :all
Base.depwarn("The `stats` keyword argument has been deprecated. Use describe(df, stats...) instead.", :describe)
_describe(df, [:mean, :std, :min, :q25, :median, :q75,
:max, :nunique, :nmissing, :first, :last, :eltype])
Expand Down
4 changes: 2 additions & 2 deletions src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ function _combine(f::Union{AbstractVector{<:Pair}, Tuple{Vararg{Pair}},
incols = gd.parent[first(p)]
else
df = gd.parent[collect(first(p))]
incols = NamedTuple{Tuple(names(df))}(columns(df))
incols = NamedTuple{Tuple(names(df))}(eachcol(df))
end
firstres = do_call(fun, gd, incols, 1)
idx, outcols, _ = _combine_with_first(wrap(firstres), fun, gd, incols)
Expand Down Expand Up @@ -715,7 +715,7 @@ function _combine(f::Any, gd::GroupedDataFrame)
fun = last(f)
elseif f isa Pair
df = gd.parent[collect(first(f))]
incols = NamedTuple{Tuple(names(df))}(columns(df))
incols = NamedTuple{Tuple(names(df))}(eachcol(df))
fun = last(f)
else
incols = nothing
Expand Down
15 changes: 7 additions & 8 deletions test/cat.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
module TestCat

using Test, Random, DataFrames
using DataFrames: columns
const ≅ = isequal

#
Expand Down Expand Up @@ -158,7 +157,7 @@ end
@testset "vcat >2 args" begin
empty_dfs = [DataFrame(), DataFrame(), DataFrame()]
@test vcat(empty_dfs...) == reduce(vcat, empty_dfs) == DataFrame()

df = DataFrame(x = trues(1), y = falses(1))
dfs = [df, df, df]
@test vcat(dfs...) ==reduce(vcat, dfs) == DataFrame(x = trues(3), y = falses(3))
Expand All @@ -167,13 +166,13 @@ end
@testset "vcat mixed coltypes" begin
df = vcat(DataFrame([[1]], [:x]), DataFrame([[1.0]], [:x]))
@test df == DataFrame([[1.0, 1.0]], [:x])
@test typeof.(columns(df)) == [Vector{Float64}]
@test typeof.(eachcol(df)) == [Vector{Float64}]
df = vcat(DataFrame([[1]], [:x]), DataFrame([["1"]], [:x]))
@test df == DataFrame([[1, "1"]], [:x])
@test typeof.(columns(df)) == [Vector{Any}]
@test typeof.(eachcol(df)) == [Vector{Any}]
df = vcat(DataFrame([Union{Missing, Int}[1]], [:x]), DataFrame([[1]], [:x]))
@test df == DataFrame([[1, 1]], [:x])
@test typeof.(columns(df)) == [Vector{Union{Missing, Int}}]
@test typeof.(eachcol(df)) == [Vector{Union{Missing, Int}}]
df = vcat(DataFrame([CategoricalArray([1])], [:x]), DataFrame([[1]], [:x]))
@test df == DataFrame([[1, 1]], [:x])
@test df[:x] isa Vector{Int}
Expand All @@ -188,14 +187,14 @@ end
df = vcat(DataFrame([Union{Int, Missing}[1]], [:x]),
DataFrame([["1"]], [:x]))
@test df == DataFrame([[1, "1"]], [:x])
@test typeof.(columns(df)) == [Vector{Any}]
@test typeof.(eachcol(df)) == [Vector{Any}]
df = vcat(DataFrame([CategoricalArray([1])], [:x]),
DataFrame([CategoricalArray(["1"])], [:x]))
@test df == DataFrame([[1, "1"]], [:x])
@test df[:x] isa CategoricalVector{Any}
df = vcat(DataFrame([trues(1)], [:x]), DataFrame([[false]], [:x]))
@test df == DataFrame([[true, false]], [:x])
@test typeof.(columns(df)) == [Vector{Bool}]
@test typeof.(eachcol(df)) == [Vector{Bool}]
end

@testset "vcat out of order" begin
Expand All @@ -213,7 +212,7 @@ end
@test vcat(df2, df1, df2) == DataFrame([[2, 4, 6, 7, 8, 9, 2, 4, 6],
[8, 10, 12, 4, 5, 6, 8, 10, 12],
[14, 16, 18, 1, 2, 3, 14, 16, 18]] ,[:C, :B, :A])

@test size(vcat(df1, df1, df1, df2, df2, df2)) == (18, 3)
df3 = df1[[1, 3, 2]]
res = vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df3)
Expand Down
7 changes: 3 additions & 4 deletions test/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module TestConstructors

using Test, DataFrames
using DataFrames: Index, _columns, index
using DataFrames: columns
const ≅ = isequal

#
Expand Down Expand Up @@ -147,13 +146,13 @@ end
@testset "column types" begin
df = DataFrame(A = 1:3, B = 2:4, C = 3:5)
answer = [Array{Int,1}, Array{Int,1}, Array{Int,1}]
@test map(typeof, columns(df)) == answer
@test map(typeof, eachcol(df)) == answer
df[:D] = [4, 5, missing]
push!(answer, Vector{Union{Int, Missing}})
@test map(typeof, columns(df)) == answer
@test map(typeof, eachcol(df)) == answer
df[:E] = 'c'
push!(answer, Vector{Char})
@test map(typeof, columns(df)) == answer
@test map(typeof, eachcol(df)) == answer
end

@testset "categorical constructor" begin
Expand Down
Loading

0 comments on commit d4dfbb4

Please sign in to comment.