Make eachcol, columns and eachrow return an AbstractVector type

JuliaData · Nov 14, 2018 · 5ebb714 · 5ebb714
1 parent e8b1efd
commit 5ebb714
Show file tree

Hide file tree

Showing 16 changed files with 252 additions and 127 deletions.
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -29,6 +29,7 @@ meltdf
 
 ```@docs
 allowmissing!
+columns
 completecases
 describe
 disallowmissing!
@@ -41,6 +42,7 @@ filter
 filter!
 head
 insertcols!
+mapcols
 names!
 nonunique
 rename!

diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md
@@ -33,9 +33,21 @@ and reflects changes done to the parent after the creation of the view.
 Typically objects of the `DataFrameRow` type are encountered when returned by the `eachrow` function.
 In the future accessing a single row of a data frame via `getindex` or `view` will return a `DataFrameRow`.
 
-Additionally the `eachrow` and `eachcol` functions return values of the `DFRowIterator` and `DFColumnIterator` types respectively.
-Those types are not exported and should not be constructed directly.
-They respectively serve as iterators over rows and columns of an `AbstractDataFrame`.
+Additionally, the `eachrow` function returns a value of the `DataFrameRows` type, which
+serves as an iterator over rows of an `AbstractDataFrame`, returning `DataFrameRow` objects.
+
+Similarly, the `eachcol` and `columns` functions return a value of the `DataFrameColumns` type, which
+serves as an iterator over columns of an `AbstractDataFrame`.
+The difference between the return value of `eachcol` and `columns` is the following:
+
+* The `eachcol` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, Pair{Symbol, AbstractVector}}` type, which is an
+  iterator returning a pair containing the column name and the column vector.
+* The `columns` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, AbstractVector}` type, which is an
+  iterator returning the column vector only.
+
+The `DataFrameRows` and `DataFrameColumns` types are subtypes of `AbstractVector` and support its interface
+with the exception that they are read-only. Note that they are not exported and should not be constructed directly;
+use the `eachrow`, `eachcol` and `columns` functions to obtain them.
 
 ## Types specification
 
@@ -45,6 +57,6 @@ DataFrame
 DataFrameRow
 GroupedDataFrame
 SubDataFrame
-DFRowIterator
-DFColumnIterator
+DataFrameRows
+DataFrameColumns
 ```
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -27,6 +27,7 @@ export AbstractDataFrame,
        aggregate,
        by,
        categorical!,
+       columns,
        colwise,
        combine,
        completecases,
@@ -42,6 +43,7 @@ export AbstractDataFrame,
        groupby,
        head,
        insertcols!,
+       mapcols,
        melt,
        meltdf,
        names!,

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -81,22 +81,6 @@ abstract type AbstractDataFrame end
 ##
 ##############################################################################
 
-struct Cols{T <: AbstractDataFrame} <: AbstractVector{AbstractVector}
-    df::T
-end
-function Base.iterate(itr::Cols, st=1)
-    st > length(itr.df) && return nothing
-    return (itr.df[st], st + 1)
-end
-Base.length(itr::Cols) = length(itr.df)
-Base.size(itr::Cols, ix) = ix==1 ? length(itr) : throw(ArgumentError("Incorrect dimension"))
-Base.size(itr::Cols) = (length(itr.df),)
-Base.IndexStyle(::Type{<:Cols}) = IndexLinear()
-Base.getindex(itr::Cols, inds...) = getindex(itr.df, inds...)
-
-# N.B. where stored as a vector, 'columns(x) = x.vector' is a bit cheaper
-columns(df::T) where {T <: AbstractDataFrame} = Cols{T}(df)
-
 Base.names(df::AbstractDataFrame) = names(index(df))
 _names(df::AbstractDataFrame) = _names(index(df))
 
@@ -218,7 +202,7 @@ eltypes(df)
 ```
 
 """
-eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(undef, size(df,2)), columns(df))
+eltypes(df::AbstractDataFrame) = eltype.(columns(df))
 
 Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df))
 function Base.size(df::AbstractDataFrame, i::Integer)
@@ -1096,7 +1080,7 @@ julia> repeat(df, inner = 2, outer = 3)
 ```
 """
 Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) =
-    map(x -> repeat(x, inner = inner, outer = outer), eachcol(df))
+    mapcols(x -> repeat(x, inner = inner, outer = outer), df)
 
 """
     repeat(df::AbstractDataFrame, count::Integer)
@@ -1126,7 +1110,7 @@ julia> repeat(df, 2)
 ```
 """
 Base.repeat(df::AbstractDataFrame, count::Integer) =
-    map(x -> repeat(x, count), eachcol(df))
+    mapcols(x -> repeat(x, count), df)
 
 ##############################################################################
 ##

diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl
@@ -213,11 +213,11 @@ struct DataFrameStream{T}
     columns::T
     header::Vector{String}
 end
-DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(columns(df)), string.(names(df)))
+DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(_columns(df)), string.(names(df)))
 
 # DataFrame Data.Source implementation
 Data.schema(df::DataFrame) =
-    Data.Schema(Type[eltype(A) for A in columns(df)], string.(names(df)), size(df, 1))
+    Data.Schema(Type[eltype(A) for A in _columns(df)], string.(names(df)), size(df, 1))
 
 Data.isdone(source::DataFrame, row, col, rows, cols) = row > rows || col > cols
 function Data.isdone(source::DataFrame, row, col)
@@ -276,26 +276,25 @@ function DataFrame(sch::Data.Schema{R}, ::Type{S}=Data.Field,
                 # to the # of rows in the source
             newsize = ifelse(S == Data.Column || !R, 0,
                         ifelse(append, sinkrows + sch.rows, sch.rows))
-            foreach(col->resize!(col, newsize), columns(sink))
+            foreach(col->resize!(col, newsize), _columns(sink))
             sch.rows = newsize
         end
         # take care of a possible reference from source by adding to WeakRefStringArrays
         if !isempty(reference)
             foreach(col-> col isa WeakRefStringArray && push!(col.data, reference),
-                sink.columns)
+                    _columns(sink))
         end
-        sink = DataFrameStream(sink)
+        return DataFrameStream(sink)
     else
         # allocating a fresh DataFrame Sink; append is irrelevant
         # for Data.Column or unknown # of rows in Data.Field, we only ever append!,
             # so just allocate empty columns
         rows = ifelse(S == Data.Column, 0, ifelse(!R, 0, sch.rows))
         names = Data.header(sch)
-        sink = DataFrameStream(
-                Tuple(allocate(types[i], rows, reference) for i = 1:length(types)), names)
         sch.rows = rows
+        return DataFrameStream(Tuple(allocate(types[i], rows, reference)
+                                     for i = 1:length(types)), names)
     end
-    return sink
 end
 
 DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;

diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl
@@ -8,56 +8,51 @@
 
 # Iteration by rows
 """
-    DFRowIterator{<:AbstractDataFrame}
+    DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}}
 
 Iterator over rows of an `AbstractDataFrame`,
 with each row represented as a `DataFrameRow`.
 
 A value of this type is returned by the [`eachrow`](@ref) function.
 """
-struct DFRowIterator{T <: AbstractDataFrame}
+struct DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}}
     df::T
 end
 
 """
     eachrow(df::AbstractDataFrame)
 
-Return a `DFRowIterator` that iterates an `AbstractDataFrame` row by row,
+Return a `DataFrameRows` that iterates an `AbstractDataFrame` row by row,
 with each row represented as a `DataFrameRow`.
 """
-eachrow(df::AbstractDataFrame) = DFRowIterator(df)
+eachrow(df::AbstractDataFrame) = DataFrameRows(df)
 
-function Base.iterate(itr::DFRowIterator, i=1)
-    i > size(itr.df, 1) && return nothing
-    return (DataFrameRow(itr.df, i), i + 1)
+Base.size(itr::DataFrameRows) = (size(itr.df, 1), )
+Base.IndexStyle(::Type{<:DataFrameRows}) = Base.IndexLinear()
+@inline function Base.getindex(itr::DataFrameRows, i::Int)
+    @boundscheck checkbounds(itr, i)
+    return DataFrameRow(itr.df, i)
 end
-Base.eltype(::DFRowIterator{T}) where {T} = DataFrameRow{T}
-Base.size(itr::DFRowIterator) = (size(itr.df, 1), )
-Base.length(itr::DFRowIterator) = size(itr.df, 1)
-Base.getindex(itr::DFRowIterator, i) = DataFrameRow(itr.df, i)
 
 # Iteration by columns
 """
-    DFColumnIterator{<:AbstractDataFrame}
+    DataFrameColumns{<:AbstractDataFrame, V} <: AbstractVector{V}
 
 Iterator over columns of an `AbstractDataFrame`.
-Each returned value is a tuple consisting of column name and column vector.
-
-A value of this type is returned by the [`eachcol`](@link) function.
+If `V` is `Pair{Symbol,AbstractVector}` (which is the case when calling
+[`eachcol`](@ref)) then each returned value is a pair consisting of
+column name and column vector. If `V` is `AbstractVector` (a value returned by
+the [`columns`](@ref) function) then each returned value is a column vector.
 """
-struct DFColumnIterator{T <: AbstractDataFrame}
+struct DataFrameColumns{T<:AbstractDataFrame, V} <: AbstractVector{V}
     df::T
 end
 
 """
     eachcol(df::AbstractDataFrame)
 
-Return a `DFColumnIterator` that iterates an `AbstractDataFrame` column by column.
-Iteration returns a tuple consisting of column name and column vector.
-
-`DFColumnIterator` has a custom implementation of the `map` function which
-returns a `DataFrame` and assumes that a function argument passed do
-the `map` function accepts takes only a column vector.
+Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by column.
+Iteration returns a pair consisting of column name and column vector.
 
 **Examples**
 
@@ -72,28 +67,122 @@ julia> df = DataFrame(x=1:4, y=11:14)
 │ 3   │ 3     │ 13    │
 │ 4   │ 4     │ 14    │
 
-julia> map(sum, eachcol(df))
-1×2 DataFrame
+julia> collect(eachcol(df))
+2-element Array{Pair{Symbol,AbstractArray{T,1} where T},1}:
+ :x => [1, 2, 3, 4]
+ :y => [11, 12, 13, 14]
+```
+"""
+eachcol(df::T) where T<: AbstractDataFrame =
+    DataFrameColumns{T, Pair{Symbol, AbstractVector}}(df)
+
+"""
+    columns(df::AbstractDataFrame)
+
+Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by
+column, yielding column vectors.
+
+**Examples**
+
+```jldoctest
+julia> df = DataFrame(x=1:4, y=11:14)
+4×2 DataFrame
 │ Row │ x     │ y     │
 │     │ Int64 │ Int64 │
 ├─────┼───────┼───────┤
-│ 1   │ 10    │ 50    │
+│ 1   │ 1     │ 11    │
+│ 2   │ 2     │ 12    │
+│ 3   │ 3     │ 13    │
+│ 4   │ 4     │ 14    │
+
+julia> collect(columns(df))
+2-element Array{AbstractArray{T,1} where T,1}:
+ [1, 2, 3, 4]
+ [11, 12, 13, 14]
+
+julia> sum.(columns(df))
+2-element Array{Int64,1}:
+ 10
+ 50
+
+julia> map(columns(df)) do col
+           maximum(col) - minimum(col)
+       end
+2-element Array{Int64,1}:
+ 3
+ 3
 ```
 """
-eachcol(df::AbstractDataFrame) = DFColumnIterator(df)
+columns(df::T) where T<: AbstractDataFrame =
+    DataFrameColumns{T, AbstractVector}(df)
+
+Base.size(itr::DataFrameColumns) = (size(itr.df, 2),)
+Base.IndexStyle(::Type{<:DataFrameColumns}) = Base.IndexLinear()
+
+@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame,
+                                                     Pair{Symbol, AbstractVector}},
+                               j::Int)
+    @boundscheck checkbounds(itr, j)
+    Base.depwarn("Indexing into a return value of eachcol will return a pair " *
+                 "of column name and column value", :getindex)
+    itr.df[j]
+    # after deprecation replace by:
+    # _names(itr.df)[j] => itr.df[j]
+end
+
+@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame, AbstractVector},
+                               j::Int)
+    @boundscheck checkbounds(itr, j)
+    itr.df[j]
+end
 
-function Base.iterate(itr::DFColumnIterator, j=1)
+# TODO: remove this after deprecation period of getindex of DataFrameColumns
+function Base.iterate(itr::DataFrameColumns{<:AbstractDataFrame,
+                                            Pair{Symbol, AbstractVector}}, j=1)
     j > size(itr.df, 2) && return nothing
-    return ((_names(itr.df)[j], itr.df[j]), j + 1)
+    return (_names(itr.df)[j] => itr.df[j], j + 1)
+end
+
+# TODO: remove this after deprecation period of getindex of DataFrameColumns
+function Base.collect(itr::DataFrameColumns{<:AbstractDataFrame,
+                                            Pair{Symbol, AbstractVector}})
+    Pair{Symbol, AbstractVector}[v for v in itr]
 end
-Base.eltype(::DFColumnIterator) = Tuple{Symbol, AbstractVector}
-Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
-Base.length(itr::DFColumnIterator) = size(itr.df, 2)
-Base.getindex(itr::DFColumnIterator, j) = itr.df[j]
-function Base.map(f::Union{Function,Type}, dfci::DFColumnIterator)
+
+"""
+    mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
+
+Return a `DataFrame` where each column of `df` is transformed using function `f`.
+`f` must return either `AbstractVector` objects, all of the same length, or scalars.
+
+**Examples**
+
+```jldoctest
+julia> df = DataFrame(x=1:4, y=11:14)
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 11    │
+│ 2   │ 2     │ 12    │
+│ 3   │ 3     │ 13    │
+│ 4   │ 4     │ 14    │
+
+julia> mapcols(x -> x.^2, df)
+4×2 DataFrame
+│ Row │ x     │ y     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 121   │
+│ 2   │ 4     │ 144   │
+│ 3   │ 9     │ 169   │
+│ 4   │ 16    │ 196   │
+```
+"""
+function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
     # note: `f` must return a consistent length
     res = DataFrame()
-    for (n, v) in eachcol(dfci.df)
+    for (n, v) in eachcol(df)
         res[n] = f(v)
     end
     res