diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 4a76d91271..30dfdc41c5 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -29,6 +29,7 @@ meltdf ```@docs allowmissing! +columns completecases describe disallowmissing! @@ -41,6 +42,7 @@ filter filter! head insertcols! +mapcols names! nonunique rename! diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md index a446879826..120dfeef20 100644 --- a/docs/src/lib/types.md +++ b/docs/src/lib/types.md @@ -33,9 +33,21 @@ and reflects changes done to the parent after the creation of the view. Typically objects of the `DataFrameRow` type are encountered when returned by the `eachrow` function. In the future accessing a single row of a data frame via `getindex` or `view` will return a `DataFrameRow`. -Additionally the `eachrow` and `eachcol` functions return values of the `DFRowIterator` and `DFColumnIterator` types respectively. -Those types are not exported and should not be constructed directly. -They respectively serve as iterators over rows and columns of an `AbstractDataFrame`. +Additionally, the `eachrow` function returns a value of the `DataFrameRows` type, which +serves as an iterator over rows of an `AbstractDataFrame`, returning `DataFrameRow` objects. + +Similarly, the `eachcol` and `columns` functions return a value of the `DataFrameColumns` type, which +serves as an iterator over columns of an `AbstractDataFrame`. +The difference between the return value of `eachcol` and `columns` is the following: + +* The `eachcol` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, Pair{Symbol, AbstractVector}}` type, which is an + iterator returning a pair containing the column name and the column vector. +* The `columns` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, AbstractVector}` type, which is an + iterator returning the column vector only. 
+ +The `DataFrameRows` and `DataFrameColumns` types are subtypes of `AbstractVector` and support its interface +with the exception that they are read-only. Note that they are not exported and should not be constructed directly; +use the `eachrow`, `eachcol` and `columns` functions to obtain them. ## Types specification @@ -45,6 +57,6 @@ DataFrame DataFrameRow GroupedDataFrame SubDataFrame -DFRowIterator -DFColumnIterator +DataFrameRows +DataFrameColumns ``` diff --git a/src/DataFrames.jl b/src/DataFrames.jl index e53d807563..b234e3479e 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -27,6 +27,7 @@ export AbstractDataFrame, aggregate, by, categorical!, + columns, colwise, combine, completecases, @@ -42,6 +43,7 @@ export AbstractDataFrame, groupby, head, insertcols!, + mapcols, melt, meltdf, names!, diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index ae66e5cc97..ee5c40ef19 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -81,22 +81,6 @@ abstract type AbstractDataFrame end ## ############################################################################## -struct Cols{T <: AbstractDataFrame} <: AbstractVector{AbstractVector} - df::T -end -function Base.iterate(itr::Cols, st=1) - st > length(itr.df) && return nothing - return (itr.df[st], st + 1) -end -Base.length(itr::Cols) = length(itr.df) -Base.size(itr::Cols, ix) = ix==1 ? length(itr) : throw(ArgumentError("Incorrect dimension")) -Base.size(itr::Cols) = (length(itr.df),) -Base.IndexStyle(::Type{<:Cols}) = IndexLinear() -Base.getindex(itr::Cols, inds...) = getindex(itr.df, inds...) - -# N.B. 
where stored as a vector, 'columns(x) = x.vector' is a bit cheaper -columns(df::T) where {T <: AbstractDataFrame} = Cols{T}(df) - Base.names(df::AbstractDataFrame) = names(index(df)) _names(df::AbstractDataFrame) = _names(index(df)) @@ -218,7 +202,7 @@ eltypes(df) ``` """ -eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(undef, size(df,2)), columns(df)) +eltypes(df::AbstractDataFrame) = eltype.(columns(df)) Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df)) function Base.size(df::AbstractDataFrame, i::Integer) @@ -1096,7 +1080,7 @@ julia> repeat(df, inner = 2, outer = 3) ``` """ Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) = - map(x -> repeat(x, inner = inner, outer = outer), eachcol(df)) + mapcols(x -> repeat(x, inner = inner, outer = outer), df) """ repeat(df::AbstractDataFrame, count::Integer) @@ -1126,7 +1110,7 @@ julia> repeat(df, 2) ``` """ Base.repeat(df::AbstractDataFrame, count::Integer) = - map(x -> repeat(x, count), eachcol(df)) + mapcols(x -> repeat(x, count), df) ############################################################################## ## diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl index 69529f7f8c..2fc226ba80 100644 --- a/src/abstractdataframe/io.jl +++ b/src/abstractdataframe/io.jl @@ -213,11 +213,11 @@ struct DataFrameStream{T} columns::T header::Vector{String} end -DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(columns(df)), string.(names(df))) +DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(_columns(df)), string.(names(df))) # DataFrame Data.Source implementation Data.schema(df::DataFrame) = - Data.Schema(Type[eltype(A) for A in columns(df)], string.(names(df)), size(df, 1)) + Data.Schema(Type[eltype(A) for A in _columns(df)], string.(names(df)), size(df, 1)) Data.isdone(source::DataFrame, row, col, rows, cols) = row > rows || col > cols function Data.isdone(source::DataFrame, row, col) @@ -276,26 +276,25 @@ function DataFrame(sch::Data.Schema{R}, 
::Type{S}=Data.Field, # to the # of rows in the source newsize = ifelse(S == Data.Column || !R, 0, ifelse(append, sinkrows + sch.rows, sch.rows)) - foreach(col->resize!(col, newsize), columns(sink)) + foreach(col->resize!(col, newsize), _columns(sink)) sch.rows = newsize end # take care of a possible reference from source by addint to WeakRefStringArrays if !isempty(reference) foreach(col-> col isa WeakRefStringArray && push!(col.data, reference), - sink.columns) + _columns(sink)) end - sink = DataFrameStream(sink) + return DataFrameStream(sink) else # allocating a fresh DataFrame Sink; append is irrelevant # for Data.Column or unknown # of rows in Data.Field, we only ever append!, # so just allocate empty columns rows = ifelse(S == Data.Column, 0, ifelse(!R, 0, sch.rows)) names = Data.header(sch) - sink = DataFrameStream( - Tuple(allocate(types[i], rows, reference) for i = 1:length(types)), names) sch.rows = rows + return DataFrameStream(Tuple(allocate(types[i], rows, reference) + for i = 1:length(types)), names) end - return sink end DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool; diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index d31558c322..8df76f4a2b 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -8,56 +8,51 @@ # Iteration by rows """ - DFRowIterator{<:AbstractDataFrame} + DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}} Iterator over rows of an `AbstractDataFrame`, with each row represented as a `DataFrameRow`. A value of this type is returned by the [`eachrow`](@link) function. """ -struct DFRowIterator{T <: AbstractDataFrame} +struct DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}} df::T end """ eachrow(df::AbstractDataFrame) -Return a `DFRowIterator` that iterates an `AbstractDataFrame` row by row, +Return a `DataFrameRows` that iterates an `AbstractDataFrame` row by row, with each row represented as a `DataFrameRow`. 
""" -eachrow(df::AbstractDataFrame) = DFRowIterator(df) +eachrow(df::AbstractDataFrame) = DataFrameRows(df) -function Base.iterate(itr::DFRowIterator, i=1) - i > size(itr.df, 1) && return nothing - return (DataFrameRow(itr.df, i), i + 1) +Base.size(itr::DataFrameRows) = (size(itr.df, 1), ) +Base.IndexStyle(::Type{<:DataFrameRows}) = Base.IndexLinear() +@inline function Base.getindex(itr::DataFrameRows, i::Int) + @boundscheck checkbounds(itr, i) + return DataFrameRow(itr.df, i) end -Base.eltype(::DFRowIterator{T}) where {T} = DataFrameRow{T} -Base.size(itr::DFRowIterator) = (size(itr.df, 1), ) -Base.length(itr::DFRowIterator) = size(itr.df, 1) -Base.getindex(itr::DFRowIterator, i) = DataFrameRow(itr.df, i) # Iteration by columns """ - DFColumnIterator{<:AbstractDataFrame} + DataFrameColumns{<:AbstractDataFrame, V} <: AbstractVector{V} Iterator over columns of an `AbstractDataFrame`. -Each returned value is a tuple consisting of column name and column vector. - -A value of this type is returned by the [`eachcol`](@link) function. +If `V` is `Pair{Symbol,AbstractVector}` (which is the case when calling +[`eachcol`](@link)) then each returned value is a pair consisting of +column name and column vector. If `V` is `AbstractVector` (a value returned by +the [`columns`](@link) function) then each returned value is a column vector. """ -struct DFColumnIterator{T <: AbstractDataFrame} +struct DataFrameColumns{T<:AbstractDataFrame, V} <: AbstractVector{V} df::T end """ eachcol(df::AbstractDataFrame) -Return a `DFColumnIterator` that iterates an `AbstractDataFrame` column by column. -Iteration returns a tuple consisting of column name and column vector. - -`DFColumnIterator` has a custom implementation of the `map` function which -returns a `DataFrame` and assumes that a function argument passed do -the `map` function accepts takes only a column vector. +Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by column. 
+Iteration returns a pair consisting of column name and column vector. **Examples** @@ -72,28 +67,122 @@ julia> df = DataFrame(x=1:4, y=11:14) │ 3 │ 3 │ 13 │ │ 4 │ 4 │ 14 │ -julia> map(sum, eachcol(df)) -1×2 DataFrame +julia> collect(eachcol(df)) +2-element Array{Pair{Symbol,AbstractArray{T,1} where T},1}: + :x => [1, 2, 3, 4] + :y => [11, 12, 13, 14] +``` +""" +eachcol(df::T) where T<: AbstractDataFrame = + DataFrameColumns{T, Pair{Symbol, AbstractVector}}(df) + +""" + columns(df::AbstractDataFrame) + +Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by +column, yielding column vectors. + +**Examples** + +```jldoctest +julia> df = DataFrame(x=1:4, y=11:14) +4×2 DataFrame │ Row │ x │ y │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ -│ 1 │ 10 │ 50 │ +│ 1 │ 1 │ 11 │ +│ 2 │ 2 │ 12 │ +│ 3 │ 3 │ 13 │ +│ 4 │ 4 │ 14 │ + +julia> collect(columns(df)) +2-element Array{AbstractArray{T,1} where T,1}: + [1, 2, 3, 4] + [11, 12, 13, 14] + +julia> sum.(columns(df)) +2-element Array{Int64,1}: + 10 + 50 + +julia> map(columns(df)) do col + maximum(col) - minimum(col) + end +2-element Array{Int64,1}: + 3 + 3 ``` """ -eachcol(df::AbstractDataFrame) = DFColumnIterator(df) +columns(df::T) where T<: AbstractDataFrame = + DataFrameColumns{T, AbstractVector}(df) + +Base.size(itr::DataFrameColumns) = (size(itr.df, 2),) +Base.IndexStyle(::Type{<:DataFrameColumns}) = Base.IndexLinear() + +@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame, + Pair{Symbol, AbstractVector}}, + j::Int) + @boundscheck checkbounds(itr, j) + Base.depwarn("Indexing into a return value of eachcol will return a pair " * + "of column name and column value", :getindex) + itr.df[j] + # after deprecation replace by: + # _names(itr.df)[j] => itr.df[j] +end + +@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame, AbstractVector}, + j::Int) + @boundscheck checkbounds(itr, j) + itr.df[j] +end -function Base.iterate(itr::DFColumnIterator, j=1) +# TODO: remove this 
after deprecation period of getindex of DataFrameColumns +function Base.iterate(itr::DataFrameColumns{<:AbstractDataFrame, + Pair{Symbol, AbstractVector}}, j=1) j > size(itr.df, 2) && return nothing - return ((_names(itr.df)[j], itr.df[j]), j + 1) + return (_names(itr.df)[j] => itr.df[j], j + 1) +end + +# TODO: remove this after deprecation period of getindex of DataFrameColumns +function Base.collect(itr::DataFrameColumns{<:AbstractDataFrame, + Pair{Symbol, AbstractVector}}) + Pair{Symbol, AbstractVector}[v for v in itr] end -Base.eltype(::DFColumnIterator) = Tuple{Symbol, AbstractVector} -Base.size(itr::DFColumnIterator) = (size(itr.df, 2), ) -Base.length(itr::DFColumnIterator) = size(itr.df, 2) -Base.getindex(itr::DFColumnIterator, j) = itr.df[j] -function Base.map(f::Union{Function,Type}, dfci::DFColumnIterator) + +""" + mapcols(f::Union{Function,Type}, df::AbstractDataFrame) + +Return a `DataFrame` where each column of `df` is transformed using function `f`. +`f` must return `AbstractVector` objects all with the same length or scalars. 
+ +**Examples** + +```jldoctest +julia> df = DataFrame(x=1:4, y=11:14) +4×2 DataFrame +│ Row │ x │ y │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 11 │ +│ 2 │ 2 │ 12 │ +│ 3 │ 3 │ 13 │ +│ 4 │ 4 │ 14 │ + +julia> mapcols(x -> x.^2, df) +4×2 DataFrame +│ Row │ x │ y │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 121 │ +│ 2 │ 4 │ 144 │ +│ 3 │ 9 │ 169 │ +│ 4 │ 16 │ 196 │ +``` +""" +function mapcols(f::Union{Function,Type}, df::AbstractDataFrame) # note: `f` must return a consistent length res = DataFrame() - for (n, v) in eachcol(dfci.df) + for (n, v) in eachcol(df) res[n] = f(v) end res diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index f2cb37fa13..4cff773383 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -230,10 +230,10 @@ end ############################################################################## index(df::DataFrame) = getfield(df, :colindex) -columns(df::DataFrame) = getfield(df, :columns) +_columns(df::DataFrame) = getfield(df, :columns) # note: these type assertions are required to pass tests -nrow(df::DataFrame) = ncol(df) > 0 ? length(columns(df)[1])::Int : 0 +nrow(df::DataFrame) = ncol(df) > 0 ? 
length(_columns(df)[1])::Int : 0 ncol(df::DataFrame) = length(index(df)) ############################################################################## @@ -247,13 +247,13 @@ const ColumnIndex = Union{Integer, Symbol} # df[SingleColumnIndex] => AbstractVector, the same vector function Base.getindex(df::DataFrame, col_ind::ColumnIndex) selected_column = index(df)[col_ind] - return columns(df)[selected_column] + return _columns(df)[selected_column] end # df[MultiColumnIndex] => DataFrame function Base.getindex(df::DataFrame, col_inds::AbstractVector) selected_columns = index(df)[col_inds] - new_columns = columns(df)[selected_columns] + new_columns = _columns(df)[selected_columns] return DataFrame(new_columns, Index(_names(df)[selected_columns])) end @@ -263,7 +263,7 @@ Base.getindex(df::DataFrame, col_inds::Colon) = copy(df) # df[SingleRowIndex, SingleColumnIndex] => Scalar function Base.getindex(df::DataFrame, row_ind::Integer, col_ind::ColumnIndex) selected_column = index(df)[col_ind] - return columns(df)[selected_column][row_ind] + return _columns(df)[selected_column][row_ind] end # df[SingleRowIndex, MultiColumnIndex] => DataFrame (will be DatFrameRow) @@ -274,7 +274,7 @@ function Base.getindex(df::DataFrame, row_ind::Integer, col_inds::AbstractVector Base.depwarn("Selecting a single row from a `DataFrame` will return a `DataFrameRow` in the future. " * "To get a `DataFrame` use `df[row_ind:row_ind, col_inds]`.", :getindex) selected_columns = index(df)[col_inds] - new_columns = AbstractVector[[dv[row_ind]] for dv in columns(df)[selected_columns]] + new_columns = AbstractVector[[dv[row_ind]] for dv in _columns(df)[selected_columns]] return DataFrame(new_columns, Index(_names(df)[selected_columns])) end @@ -285,20 +285,20 @@ function Base.getindex(df::DataFrame, row_ind::Integer, ::Colon) end Base.depwarn("Selecting a single row from a `DataFrame` will return a `DataFrameRow` in the future. 
" * "To get a `DataFrame` use `df[row_ind:row_ind, :]`.", :getindex) - new_columns = AbstractVector[[dv[row_ind]] for dv in columns(df)] + new_columns = AbstractVector[[dv[row_ind]] for dv in _columns(df)] return DataFrame(new_columns, copy(index(df))) end # df[MultiRowIndex, SingleColumnIndex] => AbstractVector, copy function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_ind::ColumnIndex) selected_column = index(df)[col_ind] - return columns(df)[selected_column][row_inds] + return _columns(df)[selected_column][row_inds] end # df[MultiRowIndex, MultiColumnIndex] => DataFrame function Base.getindex(df::DataFrame, row_inds::AbstractVector, col_inds::AbstractVector) selected_columns = index(df)[col_inds] - new_columns = AbstractVector[dv[row_inds] for dv in columns(df)[selected_columns]] + new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)[selected_columns]] return DataFrame(new_columns, Index(_names(df)[selected_columns])) end @@ -312,7 +312,7 @@ end # df[MultiRowIndex, :] => DataFrame function Base.getindex(df::DataFrame, row_inds::AbstractVector, ::Colon) - new_columns = AbstractVector[dv[row_inds] for dv in columns(df)] + new_columns = AbstractVector[dv[row_inds] for dv in _columns(df)] return DataFrame(new_columns, copy(index(df))) end @@ -351,15 +351,15 @@ function insert_single_column!(df::DataFrame, dv = isa(v, AbstractRange) ? 
collect(v) : v if haskey(index(df), col_ind) j = index(df)[col_ind] - columns(df)[j] = dv + _columns(df)[j] = dv else if typeof(col_ind) <: Symbol push!(index(df), col_ind) - push!(columns(df), dv) + push!(_columns(df), dv) else if ncol(df) + 1 == Int(col_ind) push!(index(df), nextcolname(df)) - push!(columns(df), dv) + push!(_columns(df), dv) else throw(ArgumentError("Cannot assign to non-existent column: $col_ind")) end @@ -370,7 +370,7 @@ end function insert_single_entry!(df::DataFrame, v::Any, row_ind::Real, col_ind::ColumnIndex) if haskey(index(df), col_ind) - columns(df)[index(df)[col_ind]][row_ind] = v + _columns(df)[index(df)[col_ind]][row_ind] = v return v else error("Cannot assign to non-existent column: $col_ind") @@ -382,7 +382,7 @@ function insert_multiple_entries!(df::DataFrame, row_inds::AbstractVector{<:Real}, col_ind::ColumnIndex) if haskey(index(df), col_ind) - columns(df)[index(df)[col_ind]][row_inds] .= v + _columns(df)[index(df)[col_ind]][row_inds] .= v return v else error("Cannot assign to non-existent column: $col_ind") @@ -616,7 +616,7 @@ function Base.setindex!(df::DataFrame, new_df::DataFrame, row_inds::Colon, col_inds::Colon=Colon()) - setfield!(df, :columns, copy(columns(new_df))) + setfield!(df, :columns, copy(_columns(new_df))) setfield!(df, :colindex, copy(index(new_df))) df end @@ -639,7 +639,7 @@ Base.setindex!(df::DataFrame, v, ::Colon, col_inds) = ## ############################################################################## -Base.empty!(df::DataFrame) = (empty!(columns(df)); empty!(index(df)); df) +Base.empty!(df::DataFrame) = (empty!(_columns(df)); empty!(index(df)); df) """ Insert a column into a data frame in place. 
@@ -728,7 +728,7 @@ function insertcols!(df::DataFrame, col_ind::Int, name_col::Pair{Symbol, <:Abstr end end insert!(index(df), col_ind, name) - insert!(columns(df), col_ind, item) + insert!(_columns(df), col_ind, item) df end @@ -749,12 +749,12 @@ end # A copy of a DataFrame points to the original column vectors but # gets its own Index. -Base.copy(df::DataFrame) = DataFrame(copy(columns(df)), copy(index(df))) +Base.copy(df::DataFrame) = DataFrame(copy(_columns(df)), copy(index(df))) # Deepcopy is recursive -- if a column is a vector of DataFrames, each of # those DataFrames is deepcopied. function Base.deepcopy(df::DataFrame) - DataFrame(deepcopy(columns(df)), deepcopy(index(df))) + DataFrame(deepcopy(_columns(df)), deepcopy(index(df))) end ############################################################################## @@ -766,7 +766,7 @@ end function deletecols!(df::DataFrame, inds::Vector{Int}) for ind in sort(inds, rev = true) if 1 <= ind <= ncol(df) - splice!(columns(df), ind) + splice!(_columns(df), ind) delete!(index(df), ind) else throw(ArgumentError("Can't delete a non-existent DataFrame column")) @@ -779,7 +779,7 @@ deletecols!(df::DataFrame, c::Any) = deletecols!(df, index(df)[c]) function deleterows!(df::DataFrame, ind::Union{Integer, UnitRange{Int}}) for i in 1:ncol(df) - columns(df)[i] = deleteat!(columns(df)[i], ind) + _columns(df)[i] = deleteat!(_columns(df)[i], ind) end df end @@ -805,7 +805,7 @@ function deleterows!(df::DataFrame, ind::AbstractVector{Int}) keep[ikeep:end] = idf:n for i in 1:ncol(df) - columns(df)[i] = columns(df)[i][keep] + _columns(df)[i] = _columns(df)[i][keep] end df end @@ -1011,11 +1011,11 @@ function Base.push!(df::DataFrame, iterable::Any) i = 1 for t in iterable try - push!(columns(df)[i], t) + push!(_columns(df)[i], t) catch #clean up partial row for j in 1:(i - 1) - pop!(columns(df)[j]) + pop!(_columns(df)[j]) end msg = "Error adding $t to column :$(_names(df)[i]). Possible type mis-match." 
throw(ArgumentError(msg)) @@ -1083,7 +1083,7 @@ function permutecols!(df::DataFrame, p::AbstractVector) if !(length(p) == size(df, 2) && isperm(p)) throw(ArgumentError("$p is not a valid column permutation for this DataFrame")) end - permute!(columns(df), p) + permute!(_columns(df), p) @inbounds permute!(index(df), p) df end diff --git a/src/dataframe/sort.jl b/src/dataframe/sort.jl index 968784ece0..d51084a2f6 100644 --- a/src/dataframe/sort.jl +++ b/src/dataframe/sort.jl @@ -88,7 +88,7 @@ end function Base.sort!(df::DataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering) p = sortperm(df, a, o) pp = similar(p) - c = columns(df) + c = _columns(df) for (i,col) in enumerate(c) # Check if this column has been sorted already diff --git a/src/dataframerow/utils.jl b/src/dataframerow/utils.jl index ebcca753ef..118df0dce6 100644 --- a/src/dataframerow/utils.jl +++ b/src/dataframerow/utils.jl @@ -73,9 +73,8 @@ end function hashrows(df::AbstractDataFrame, skipmissing::Bool) rhashes = zeros(UInt, nrow(df)) missings = fill(false, skipmissing ? nrow(df) : 0) - cols = columns(df) - for i in 1:ncol(df) - hashrows_col!(rhashes, missings, cols[i], i == 1) + for (i, col) in enumerate(columns(df)) + hashrows_col!(rhashes, missings, col, i == 1) end return (rhashes, missings) end diff --git a/src/deprecated.jl b/src/deprecated.jl index 6e88fe3937..28826b43e2 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -10,7 +10,7 @@ import Base: @deprecate import Base: keys, values, insert! @deprecate keys(df::AbstractDataFrame) names(df) -@deprecate values(df::AbstractDataFrame) DataFrames.columns(df) +@deprecate values(df::AbstractDataFrame) columns(df) @deprecate insert!(df::DataFrame, df2::AbstractDataFrame) (foreach(col -> df[col] = df2[col], names(df2)); df) @deprecate pool categorical @@ -1382,12 +1382,6 @@ import Base: setindex! 
# TODO: START: Deprecations to be removed after getindex deprecation period finishes -function Base.iterate(itr::Cols{SubDataFrame}, st=1) - st > length(itr.df) && return nothing - # after deprecation we will return a view, now we materialize a vector - return (itr.df[:, st], st + 1) -end - # after deprecation we will return a view, now we materialize a vector Base.get(df::SubDataFrame, key::Any, default::Any) = haskey(df, key) ? df[:, key] : default @@ -1422,12 +1416,28 @@ function Base.hash(df::SubDataFrame, h::UInt) return h end -function Base.iterate(itr::DFColumnIterator{<:SubDataFrame}, j=1) +function Base.getindex(itr::DataFrameColumns{<:SubDataFrame, + Pair{Symbol, AbstractVector}}, j::Int) + Base.depwarn("Indexing into a return value of eachcol on SubDataFrame will return a pair " * + "of column name and a view of column value", :getindex) + itr.df[:, j] +end + + +function Base.getindex(itr::DataFrameColumns{<:SubDataFrame,AbstractVector}, j) + Base.depwarn("Indexing into a return value of columns on SubDataFrame will return a" * + " view of column value", :getindex) + itr.df[:, j] +end + +function Base.iterate(itr::DataFrameColumns{<:SubDataFrame, + Pair{Symbol, AbstractVector}}, j=1) + Base.depwarn("iterating over value of eachcol on SubDataFrame will return a" * + " pair of column name and a view of column value", :getindex) j > size(itr.df, 2) && return nothing - return ((_names(itr.df)[j], itr.df[:, j]), j + 1) + return (_names(itr.df)[j] => itr.df[:, j], j + 1) end -Base.getindex(itr::DFColumnIterator{<:SubDataFrame}, j) = itr.df[:, j] function showrowindices(io::IO, df::SubDataFrame, @@ -1732,10 +1742,17 @@ function hashrows(df::SubDataFrame, skipmissing::Bool) return (rhashes, missings) end +function Base.getproperty(df::SubDataFrame, col_ind::Symbol) + Base.depwarn("`sdf.col_ind` will create a view of `parent(sdf).col_ind` in the future." 
* + " Use sdf[:, col_ind]` to get a freshly allocated vector.", :getproperty) + getindex(df, :, col_ind) +end + # TODO: END: Deprecations to be removed after getindex deprecation period finishes import Base: map @deprecate map(f::Function, sdf::SubDataFrame) f(sdf) +@deprecate map(f::Union{Function,Type}, dfc::DataFrameColumns{<:AbstractDataFrame, Pair{Symbol, AbstractVector}}) mapcols(f, dfc.df) import Base: length -@deprecate length(df::AbstractDataFrame) size(df, 2) \ No newline at end of file +@deprecate length(df::AbstractDataFrame) size(df, 2) diff --git a/test/cat.jl b/test/cat.jl index f3724b7a87..982be685df 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -1,6 +1,5 @@ module TestCat using Test, Random, DataFrames - using DataFrames: columns const ≅ = isequal # diff --git a/test/constructors.jl b/test/constructors.jl index 89994d79c7..3900dd2a5a 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -1,6 +1,6 @@ module TestConstructors using Test, DataFrames - using DataFrames: Index, columns, index + using DataFrames: Index, _columns, index const ≅ = isequal # @@ -8,7 +8,8 @@ module TestConstructors # @testset "constructors" begin df = DataFrame() - @test columns(df) == Any[] + @test isempty(_columns(df)) + @test _columns(df) isa Vector{AbstractVector} @test index(df) == Index() df = DataFrame(Any[CategoricalVector{Union{Float64, Missing}}(zeros(3)), diff --git a/test/dataframe.jl b/test/dataframe.jl index 630c5fd6b9..a685adb082 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -1,6 +1,6 @@ module TestDataFrame using Dates, DataFrames, LinearAlgebra, Statistics, Random, Test - using DataFrames: columns + using DataFrames: _columns const ≅ = isequal const ≇ = !isequal @@ -529,13 +529,13 @@ module TestDataFrame @test all(c -> typeof(c) <: CategoricalVector{Union{Int, Missing}}, columns(categorical!(deepcopy(df), [:A,:B]))) @test findfirst(c -> typeof(c) <: CategoricalVector{Union{Int, Missing}}, - columns(categorical!(deepcopy(df), [:A]))) == 1 
+ _columns(categorical!(deepcopy(df), [:A]))) == 1 @test findfirst(c -> typeof(c) <: CategoricalVector{Union{Int, Missing}}, - columns(categorical!(deepcopy(df), :A))) == 1 + _columns(categorical!(deepcopy(df), :A))) == 1 @test findfirst(c -> typeof(c) <: CategoricalVector{Union{Int, Missing}}, - columns(categorical!(deepcopy(df), [1]))) == 1 + _columns(categorical!(deepcopy(df), [1]))) == 1 @test findfirst(c -> typeof(c) <: CategoricalVector{Union{Int, Missing}}, - columns(categorical!(deepcopy(df), 1))) == 1 + _columns(categorical!(deepcopy(df), 1))) == 1 @testset "categorical!" begin df = DataFrame([["a", "b"], ['a', 'b'], [true, false], 1:2, ["x", "y"]]) diff --git a/test/grouping.jl b/test/grouping.jl index e8906af04b..1edf7b80fd 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -92,6 +92,7 @@ module TestGrouping c = Vector{Union{Float64, Missing}}(randn(8))) cols = [:a, :b] + f1(df) = DataFrame(cmax = maximum(df[:, :c])) f2(df) = (cmax = maximum(df[:, :c]),) f3(df) = maximum(df[:, :c]) diff --git a/test/iteration.jl b/test/iteration.jl index f1ea17849d..f25fc87168 100644 --- a/test/iteration.jl +++ b/test/iteration.jl @@ -1,10 +1,6 @@ module TestIteration using Test, DataFrames - dv = [1, 2, missing] - dm = Union{Int, Missing}[1 2; 3 4] - df = Array{Union{Int, Missing}}(zeros(2, 2, 2)) - df = DataFrame(A = Vector{Union{Int, Missing}}(1:2), B = Vector{Union{Int, Missing}}(2:3)) @test size(eachrow(df)) == (size(df, 1),) @@ -19,18 +15,42 @@ module TestIteration end @test size(eachcol(df)) == (size(df, 2),) + @test size(columns(df)) == (size(df, 2),) @test length(eachcol(df)) == size(df, 2) - @test eachcol(df)[1] == df[1] - @test collect(eachcol(df)) isa Vector{Tuple{Symbol, AbstractVector}} - @test eltype(eachcol(df)) == Tuple{Symbol, AbstractVector} + @test length(columns(df)) == size(df, 2) + @test eachcol(df)[1] == df[1] # this will be (:A => df[1]) after deprecation + @test columns(df)[1] == df[1] + @test collect(eachcol(df)) isa Vector{Pair{Symbol, 
AbstractVector}} + @test collect(eachcol(df)) == [:A => [1, 2], :B => [2, 3]] + @test collect(columns(df)) isa Vector{AbstractVector} + @test collect(columns(df)) == [[1, 2], [2, 3]] + @test eltype(eachcol(df)) == Pair{Symbol, AbstractVector} + @test eltype(columns(df)) == AbstractVector for col in eachcol(df) - @test isa(col, Tuple{Symbol, AbstractVector}) + @test typeof(col) <: Pair{Symbol, <:AbstractVector} + end + for col in columns(df) + @test isa(col, AbstractVector) end - @test map(x -> minimum(convert(Array, x)), eachrow(df)) == Any[1,2] + @test map(x -> minimum(convert(Array, x)), eachrow(df)) == [1,2] @test map(Vector, eachrow(df)) == [[1, 2], [2, 3]] - @test map(minimum, eachcol(df)) == DataFrame(A = [1], B = [2]) - @test eltypes(map(Vector{Float64}, eachcol(df))) == [Float64, Float64] + @test mapcols(minimum, df) == DataFrame(A = [1], B = [2]) + @test map(minimum, eachcol(df)) == DataFrame(A = [1], B = [2]) # this is deprecated + @test map(minimum, columns(df)) == [1, 2] + @test eltypes(mapcols(Vector{Float64}, df)) == [Float64, Float64] + @test eltypes(map(Vector{Float64}, eachcol(df))) == [Float64, Float64] # this is deprecated + @test eltype(map(Vector{Float64}, columns(df))) == Vector{Float64} + + # test mapcols corner cases + # this behavior might change when we rework setindex! to follow standard broadcasting rules + # notice that now mixing vectors and scalars is allowed in some cases but not in others + # this is likely to change + df_mapcols = DataFrame(a=1:10, b=11:20) + @test mapcols(sum, df_mapcols) == DataFrame(a=55, b=155) + @test mapcols(x -> x[1] == 1 ? 0 : [0], df_mapcols) == DataFrame(a=0, b=0) + @test mapcols(x -> x[1] == 1 ? x : 0, df_mapcols) == DataFrame(a=1:10, b=0) + @test_throws ArgumentError mapcols(x -> x[1] != 1 ? 
x : 0, df_mapcols) row = DataFrameRow(df, 1) diff --git a/test/join.jl b/test/join.jl index 96fd17cd52..7a5037fc94 100644 --- a/test/join.jl +++ b/test/join.jl @@ -1,6 +1,6 @@ module TestJoin using Test, DataFrames - using DataFrames: similar_missing, columns + using DataFrames: similar_missing const ≅ = isequal name = DataFrame(ID = Union{Int, Missing}[1, 2, 3],