Skip to content

Commit

Permalink
Make eachcol, columns and eachrow return an AbstractVector type
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins committed Nov 14, 2018
1 parent e8b1efd commit 5ebb714
Show file tree
Hide file tree
Showing 16 changed files with 252 additions and 127 deletions.
2 changes: 2 additions & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ meltdf

```@docs
allowmissing!
columns
completecases
describe
disallowmissing!
Expand All @@ -41,6 +42,7 @@ filter
filter!
head
insertcols!
mapcols
names!
nonunique
rename!
Expand Down
22 changes: 17 additions & 5 deletions docs/src/lib/types.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,21 @@ and reflects changes done to the parent after the creation of the view.
Typically objects of the `DataFrameRow` type are encountered when returned by the `eachrow` function.
In the future accessing a single row of a data frame via `getindex` or `view` will return a `DataFrameRow`.

Additionally the `eachrow` and `eachcol` functions return values of the `DFRowIterator` and `DFColumnIterator` types respectively.
Those types are not exported and should not be constructed directly.
They respectively serve as iterators over rows and columns of an `AbstractDataFrame`.
Additionally, the `eachrow` function returns a value of the `DataFrameRows` type, which
serves as an iterator over rows of an `AbstractDataFrame`, returning `DataFrameRow` objects.

Similarly, the `eachcol` and `columns` functions return a value of the `DataFrameColumns` type, which
serves as an iterator over columns of an `AbstractDataFrame`.
The difference between the return value of `eachcol` and `columns` is the following:

* The `eachcol` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, Pair{Symbol, AbstractVector}}` type, which is an
iterator returning a pair containing the column name and the column vector.
* The `columns` function returns a value of the `DataFrameColumns{<:AbstractDataFrame, AbstractVector}` type, which is an
iterator returning the column vector only.

The `DataFrameRows` and `DataFrameColumns` types are subtypes of `AbstractVector` and support its interface
with the exception that they are read-only. Note that they are not exported and should not be constructed directly;
use the `eachrow`, `eachcol` and `columns` functions instead.

## Types specification

Expand All @@ -45,6 +57,6 @@ DataFrame
DataFrameRow
GroupedDataFrame
SubDataFrame
DFRowIterator
DFColumnIterator
DataFrameRows
DataFrameColumns
```
2 changes: 2 additions & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ export AbstractDataFrame,
aggregate,
by,
categorical!,
columns,
colwise,
combine,
completecases,
Expand All @@ -42,6 +43,7 @@ export AbstractDataFrame,
groupby,
head,
insertcols!,
mapcols,
melt,
meltdf,
names!,
Expand Down
22 changes: 3 additions & 19 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,22 +81,6 @@ abstract type AbstractDataFrame end
##
##############################################################################

# Wrapper that exposes the columns of a data frame through the `AbstractVector`
# interface. The declared element type is the abstract `AbstractVector`, so the
# wrapped columns may have different concrete types.
struct Cols{T <: AbstractDataFrame} <: AbstractVector{AbstractVector}
# the wrapped data frame; its concrete type is recorded in the parameter `T`
df::T
end
# Iteration protocol for `Cols`: `st` is the position of the next column to
# yield and starts at 1. Returns `nothing` once `st` exceeds `length(itr.df)`,
# otherwise the column at `st` together with the following state.
function Base.iterate(itr::Cols, st=1)
    if st > length(itr.df)
        return nothing
    else
        column = itr.df[st]
        return (column, st + 1)
    end
end
# Length of a `Cols` is whatever `length` reports for the wrapped data frame.
function Base.length(itr::Cols)
    return length(itr.df)
end

# Extent along dimension `ix`. Only dimension 1 is accepted; any other value
# raises an `ArgumentError`.
function Base.size(itr::Cols, ix)
    ix == 1 || throw(ArgumentError("Incorrect dimension"))
    return length(itr)
end

# Shape as a one-element tuple.
function Base.size(itr::Cols)
    return (length(itr.df),)
end

# `Cols` is addressed with a single linear index.
function Base.IndexStyle(::Type{<:Cols})
    return IndexLinear()
end

# Indexing is delegated unchanged to the wrapped data frame.
function Base.getindex(itr::Cols, inds...)
    return getindex(itr.df, inds...)
end

# Wrap `df` in a `Cols` parameterized by the concrete type of `df`.
# N.B. where stored as a vector, 'columns(x) = x.vector' is a bit cheaper
function columns(df::T) where {T <: AbstractDataFrame}
    return Cols{T}(df)
end

# Column names of `df`, obtained by calling `names` on the object returned by
# `index(df)`.
function Base.names(df::AbstractDataFrame)
    idx = index(df)
    return names(idx)
end

# Internal counterpart of `names`: forwards to `_names` on `index(df)`.
function _names(df::AbstractDataFrame)
    idx = index(df)
    return _names(idx)
end

Expand Down Expand Up @@ -218,7 +202,7 @@ eltypes(df)
```
"""
eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(undef, size(df,2)), columns(df))
eltypes(df::AbstractDataFrame) = eltype.(columns(df))

Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df))
function Base.size(df::AbstractDataFrame, i::Integer)
Expand Down Expand Up @@ -1096,7 +1080,7 @@ julia> repeat(df, inner = 2, outer = 3)
```
"""
Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) =
map(x -> repeat(x, inner = inner, outer = outer), eachcol(df))
mapcols(x -> repeat(x, inner = inner, outer = outer), df)

"""
repeat(df::AbstractDataFrame, count::Integer)
Expand Down Expand Up @@ -1126,7 +1110,7 @@ julia> repeat(df, 2)
```
"""
Base.repeat(df::AbstractDataFrame, count::Integer) =
map(x -> repeat(x, count), eachcol(df))
mapcols(x -> repeat(x, count), df)

##############################################################################
##
Expand Down
15 changes: 7 additions & 8 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,11 @@ struct DataFrameStream{T}
columns::T
header::Vector{String}
end
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(columns(df)), string.(names(df)))
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(_columns(df)), string.(names(df)))

# DataFrame Data.Source implementation
Data.schema(df::DataFrame) =
Data.Schema(Type[eltype(A) for A in columns(df)], string.(names(df)), size(df, 1))
Data.Schema(Type[eltype(A) for A in _columns(df)], string.(names(df)), size(df, 1))

Data.isdone(source::DataFrame, row, col, rows, cols) = row > rows || col > cols
function Data.isdone(source::DataFrame, row, col)
Expand Down Expand Up @@ -276,26 +276,25 @@ function DataFrame(sch::Data.Schema{R}, ::Type{S}=Data.Field,
# to the # of rows in the source
newsize = ifelse(S == Data.Column || !R, 0,
ifelse(append, sinkrows + sch.rows, sch.rows))
foreach(col->resize!(col, newsize), columns(sink))
foreach(col->resize!(col, newsize), _columns(sink))
sch.rows = newsize
end
# take care of a possible reference from source by adding to WeakRefStringArrays
if !isempty(reference)
foreach(col-> col isa WeakRefStringArray && push!(col.data, reference),
sink.columns)
_columns(sink))
end
sink = DataFrameStream(sink)
return DataFrameStream(sink)
else
# allocating a fresh DataFrame Sink; append is irrelevant
# for Data.Column or unknown # of rows in Data.Field, we only ever append!,
# so just allocate empty columns
rows = ifelse(S == Data.Column, 0, ifelse(!R, 0, sch.rows))
names = Data.header(sch)
sink = DataFrameStream(
Tuple(allocate(types[i], rows, reference) for i = 1:length(types)), names)
sch.rows = rows
return DataFrameStream(Tuple(allocate(types[i], rows, reference)
for i = 1:length(types)), names)
end
return sink
end

DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
Expand Down
157 changes: 123 additions & 34 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,56 +8,51 @@

# Iteration by rows
"""
DFRowIterator{<:AbstractDataFrame}
DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}}
Iterator over rows of an `AbstractDataFrame`,
with each row represented as a `DataFrameRow`.
A value of this type is returned by the [`eachrow`](@ref) function.
"""
struct DFRowIterator{T <: AbstractDataFrame}
struct DataFrameRows{T<:AbstractDataFrame} <: AbstractVector{DataFrameRow{T}}
df::T
end

"""
eachrow(df::AbstractDataFrame)
Return a `DFRowIterator` that iterates an `AbstractDataFrame` row by row,
Return a `DataFrameRows` that iterates an `AbstractDataFrame` row by row,
with each row represented as a `DataFrameRow`.
"""
eachrow(df::AbstractDataFrame) = DFRowIterator(df)
eachrow(df::AbstractDataFrame) = DataFrameRows(df)

function Base.iterate(itr::DFRowIterator, i=1)
i > size(itr.df, 1) && return nothing
return (DataFrameRow(itr.df, i), i + 1)
# A `DataFrameRows` has one element per row of the wrapped data frame.
function Base.size(itr::DataFrameRows)
    nrows = size(itr.df, 1)
    return (nrows, )
end

# Rows are addressed with a single linear index.
function Base.IndexStyle(::Type{<:DataFrameRows})
    return Base.IndexLinear()
end
@inline function Base.getindex(itr::DataFrameRows, i::Int)
    # Bounds are checked against the iterator itself; the check can be elided
    # by callers that use `@inbounds`.
    @boundscheck checkbounds(itr, i)
    # Build a `DataFrameRow` for row `i` of the wrapped data frame.
    parent_df = itr.df
    return DataFrameRow(parent_df, i)
end
Base.eltype(::DFRowIterator{T}) where {T} = DataFrameRow{T}
Base.size(itr::DFRowIterator) = (size(itr.df, 1), )
Base.length(itr::DFRowIterator) = size(itr.df, 1)
Base.getindex(itr::DFRowIterator, i) = DataFrameRow(itr.df, i)

# Iteration by columns
"""
DFColumnIterator{<:AbstractDataFrame}
DataFrameColumns{<:AbstractDataFrame, V} <: AbstractVector{V}
Iterator over columns of an `AbstractDataFrame`.
Each returned value is a tuple consisting of column name and column vector.
A value of this type is returned by the [`eachcol`](@ref) function.
If `V` is `Pair{Symbol,AbstractVector}` (which is the case when calling
[`eachcol`](@ref)) then each returned value is a pair consisting of
column name and column vector. If `V` is `AbstractVector` (a value returned by
the [`columns`](@ref) function) then each returned value is a column vector.
"""
struct DFColumnIterator{T <: AbstractDataFrame}
struct DataFrameColumns{T<:AbstractDataFrame, V} <: AbstractVector{V}
df::T
end

"""
eachcol(df::AbstractDataFrame)
Return a `DFColumnIterator` that iterates an `AbstractDataFrame` column by column.
Iteration returns a tuple consisting of column name and column vector.
`DFColumnIterator` has a custom implementation of the `map` function which
returns a `DataFrame` and assumes that the function argument passed to
the `map` function takes only a column vector.
Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by column.
Iteration returns a pair consisting of column name and column vector.
**Examples**
Expand All @@ -72,28 +67,122 @@ julia> df = DataFrame(x=1:4, y=11:14)
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> map(sum, eachcol(df))
1×2 DataFrame
julia> collect(eachcol(df))
2-element Array{Pair{Symbol,AbstractArray{T,1} where T},1}:
:x => [1, 2, 3, 4]
:y => [11, 12, 13, 14]
```
"""
function eachcol(df::T) where {T<:AbstractDataFrame}
    # The second type parameter selects the element type of the iterator:
    # here, pairs of column name and column vector.
    return DataFrameColumns{T, Pair{Symbol, AbstractVector}}(df)
end

"""
columns(df::AbstractDataFrame)
Return a `DataFrameColumns` that iterates an `AbstractDataFrame` column by
column, yielding column vectors.
**Examples**
```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 10 │ 50 │
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> collect(columns(df))
2-element Array{AbstractArray{T,1} where T,1}:
[1, 2, 3, 4]
[11, 12, 13, 14]
julia> sum.(columns(df))
2-element Array{Int64,1}:
10
50
julia> map(columns(df)) do col
maximum(col) - minimum(col)
end
2-element Array{Int64,1}:
3
3
```
"""
eachcol(df::AbstractDataFrame) = DFColumnIterator(df)
function columns(df::T) where {T<:AbstractDataFrame}
    # The second type parameter selects the element type of the iterator:
    # here, bare column vectors.
    return DataFrameColumns{T, AbstractVector}(df)
end

# A `DataFrameColumns` has one element per column of the wrapped data frame.
function Base.size(itr::DataFrameColumns)
    ncols = size(itr.df, 2)
    return (ncols,)
end

# Columns are addressed with a single linear index.
function Base.IndexStyle(::Type{<:DataFrameColumns})
    return Base.IndexLinear()
end

@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame,
                                                     Pair{Symbol, AbstractVector}},
                               j::Int)
    # Indexing the pair-typed iterator currently yields only the column vector
    # and emits a deprecation warning announcing the change to a pair.
    @boundscheck checkbounds(itr, j)
    msg = "Indexing into a return value of eachcol will return a pair " *
          "of column name and column value"
    Base.depwarn(msg, :getindex)
    parent_df = itr.df
    return parent_df[j]
    # after deprecation replace by:
    # _names(itr.df)[j] => itr.df[j]
end

@inline function Base.getindex(itr::DataFrameColumns{<:AbstractDataFrame, AbstractVector},
                               j::Int)
    # Bounds are checked against the iterator itself; the check can be elided
    # by callers that use `@inbounds`.
    @boundscheck checkbounds(itr, j)
    # Return the `j`-th column of the wrapped data frame.
    parent_df = itr.df
    return parent_df[j]
end

function Base.iterate(itr::DFColumnIterator, j=1)
# TODO: remove this after deprecation period of getindex of DataFrameColumns
function Base.iterate(itr::DataFrameColumns{<:AbstractDataFrame,
Pair{Symbol, AbstractVector}}, j=1)
j > size(itr.df, 2) && return nothing
return ((_names(itr.df)[j], itr.df[j]), j + 1)
return (_names(itr.df)[j] => itr.df[j], j + 1)
end

# TODO: remove this after deprecation period of getindex of DataFrameColumns
# Materialize the pair-typed iterator by walking it with the iteration
# protocol, so the result holds `name => column` pairs.
function Base.collect(itr::DataFrameColumns{<:AbstractDataFrame,
                                            Pair{Symbol, AbstractVector}})
    collected = Pair{Symbol, AbstractVector}[]
    for name_col in itr
        push!(collected, name_col)
    end
    return collected
end
Base.eltype(::DFColumnIterator) = Tuple{Symbol, AbstractVector}
Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
Base.getindex(itr::DFColumnIterator, j) = itr.df[j]
function Base.map(f::Union{Function,Type}, dfci::DFColumnIterator)

"""
mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
Return a `DataFrame` where each column of `df` is transformed using function `f`.
`f` must return `AbstractVector` objects all with the same length or scalars.
**Examples**
```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │
julia> mapcols(x -> x.^2, df)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 121 │
│ 2 │ 4 │ 144 │
│ 3 │ 9 │ 169 │
│ 4 │ 16 │ 196 │
```
"""
function mapcols(f::Union{Function,Type}, df::AbstractDataFrame)
# note: `f` must return a consistent length
res = DataFrame()
for (n, v) in eachcol(dfci.df)
for (n, v) in eachcol(df)
res[n] = f(v)
end
res
Expand Down
Loading

0 comments on commit 5ebb714

Please sign in to comment.