Skip to content

Commit

Permalink
Merge pull request #536 from garborg/updatejoin
Browse files Browse the repository at this point in the history
Update join
  • Loading branch information
johnmyleswhite committed Feb 9, 2014
2 parents 2bc72f2 + 4d01b1f commit fba8b12
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 58 deletions.
12 changes: 3 additions & 9 deletions src/dataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,15 @@ eachcol(df::AbstractDataFrame) = DFColumnIterator(df)

# Iteration protocol for DFColumnIterator: the state `j` is a column position,
# starting at 1 and finishing once it passes the number of columns.
Base.start(itr::DFColumnIterator) = 1
Base.done(itr::DFColumnIterator, j::Int) = j > size(itr.df, 2)
# Each step yields a (column name, column) tuple together with the next state.
# (An earlier definition with the same signature that yielded only the column
# was overwritten by this one and has been dropped.)
Base.next(itr::DFColumnIterator, j::Int) = ((names(itr.df)[j], itr.df[j]), j + 1)
# The iterator has one element per column of the wrapped data frame.
Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
# Indexing returns `itr.df[:, j]` only, without the name that iteration yields.
Base.getindex(itr::DFColumnIterator, j::Any) = itr.df[:, j]
# Apply `f` to every column of the data frame wrapped by `dfci` and collect the
# results in a new DataFrame, storing each result under the name of the column
# it was computed from.
function Base.map(f::Function, dfci::DFColumnIterator)
    # note: `f` must return a consistent length
    res = DataFrame()
    # eachcol yields (name, column) tuples, so results are assigned by name
    # directly and no separate renaming pass is needed.
    for (n, v) in eachcol(dfci.df)
        res[n] = f(v)
    end
    res
end

# Iteration matches that of Associative types (experimental)
# The state `i` is a column position; iteration begins at the first column.
Base.start(df::AbstractDataFrame) = 1
# Finished once the position has moved past the last column.
Base.done(df::AbstractDataFrame, i) = i > ncol(df)
# Yields a (column name, column) tuple plus the next position.
Base.next(df::AbstractDataFrame, i) = ((names(df)[i], df[i]), i + 1)
26 changes: 23 additions & 3 deletions src/dataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,24 @@ end

# Convenience method: forwards to the two-argument method, passing
# DEFAULT_POOLED_REF_TYPE as the second argument.
DataArrays.PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE)

# Union(Vector{T}, ByteString, Nothing
function Base.join(df1::AbstractDataFrame,
df2::AbstractDataFrame;
on::Any = nothing,
on::Union(Symbol, Vector{Symbol}) = Symbol[],
kind::Symbol = :inner)
if on == nothing
if kind == :cross
if on != Symbol[]
throw(ArgumentError("Cross joins don't use argument 'on'."))
end
return crossjoin(df1, df2)
elseif on == Symbol[]
depwarn("Natural joins are deprecated, use argument 'on'.", :AbstractDataFrame)
on = intersect(names(df1), names(df2))
if length(on) > 1
throw(ArgumentError("Key omitted from join with multiple shared names."))
end
#throw(ArgumentError("Missing join argument 'on'."))
end

dv1, dv2 = PooledDataVecs(df1[on], df2[on])
left_indexer, leftonly_indexer, right_indexer, rightonly_indexer =
join_idx(dv1.refs, dv2.refs, length(dv1.pool))
Expand Down Expand Up @@ -169,3 +176,16 @@ function Base.join(df1::AbstractDataFrame,
throw(ArgumentError("Unknown kind of join requested"))
end
end

# Cross join: return a new DataFrame that pairs every row of `df1` with every
# row of `df2`. Accepts any AbstractDataFrame, matching the argument types of
# `join`, which dispatches here for `kind = :cross`.
#
# NOTE(review): columns are assigned by name, so a name shared by `df1` and
# `df2` is presumably taken from `df2` only -- confirm whether that is intended.
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
    d = DataFrame()
    # Columns of `df1` are expanded with (times = 1, each = rows of df2);
    # columns of `df2` with (times = rows of df1, each = 1).
    addx!(d, df1, 1, size(df2, 1))
    addx!(d, df2, size(df1, 1), 1)
    d
end

# Add every column of `x` to `d` under its original name, expanded with
# `rep(column, times, each)`.
function addx!(d::DataFrame, x::AbstractDataFrame, times::Int, each::Int)
    for (n, v) in eachcol(x)
        d[n] = rep(v, times, each)
    end
end
75 changes: 42 additions & 33 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ Base.@deprecate EachCol eachcol
Base.@deprecate subset sub

# Deprecated constructor: emits a deprecation warning and returns the argument
# itself (no copy is made).
function DataFrame(df::DataFrame)
    depwarn("DataFrame(::DataFrame) is deprecated, use convert(DataFrame, DataFrame) instead",
            :DataFrame)
    return df
end

function DataFrame(x::Union(Number, String))
depwarn("DataFrame(::Union(Number, String)) is deprecated, use DataFrame(Vector{Any}) instead",
:DataFrame)
depwarn("DataFrame(::Union(Number, String)) is deprecated, use DataFrame(Vector{Any}) instead",
:DataFrame)
cols = {DataArray([x], falses(1))}
colind = Index(gennames(1))
return DataFrame(cols, colind)
Expand All @@ -31,8 +31,8 @@ end
# TODO: Replace this with convert call.
# Convert a standard Matrix to a DataFrame w/ pre-specified names
function DataFrame(x::Matrix, cn::Vector = gennames(size(x, 2)))
depwarn("DataFrame(::Matrix, ::Vector)) is deprecated, use convert(DataFrame, Matrix) instead",
:DataFrame)
depwarn("DataFrame(::Matrix, ::Vector)) is deprecated, use convert(DataFrame, Matrix) instead",
:DataFrame)
n = length(cn)
cols = Array(Any, n)
for i in 1:n
Expand All @@ -42,17 +42,26 @@ function DataFrame(x::Matrix, cn::Vector = gennames(size(x, 2)))
end

# Deprecated constructor taking column names as strings: warns once, converts
# the names to symbols and forwards to the symbol-based constructor.
function DataFrame{T<:String}(columns::Vector{Any}, cnames::Vector{T})
    depwarn("DataFrame(::Vector{Any}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{Any}, ::Vector{Symbol}) instead",
            :DataFrame)
    DataFrame(columns, map(symbol, cnames))
end

# Deprecated constructor taking a vector of Associative objects and string
# keys: warns once, converts the keys to symbols and forwards to the
# symbol-based constructor.
function DataFrame{D <: Associative, T <: String}(ds::Vector{D}, ks::Vector{T})
    depwarn("DataFrame(::Vector{D<:Associative}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{D<:Associative}, ::Vector{Symbol}) instead",
            :DataFrame)
    DataFrame(ds, map(symbol, ks))
end

# Deprecated default iteration over an AbstractDataFrame, modelled on
# Associative types (experimental). A deprecation warning is issued when
# iteration starts; each step then yields a (column name, column) tuple.
function Base.start(df::AbstractDataFrame)
    depwarn("Default AbstractDataFrame iterator is deprecated, use eachcol(::AbstractDataFrame) instead",
            :AbstractDataFrame)
    return 1
end

# Iteration is over once the position has passed the last column.
function Base.done(df::AbstractDataFrame, i)
    return i > ncol(df)
end

# Produce the current (name, column) tuple and advance the position by one.
function Base.next(df::AbstractDataFrame, i)
    item = (names(df)[i], df[i])
    return (item, i + 1)
end

##############################################################################
##
## Dict conversion
Expand All @@ -63,7 +72,7 @@ end
##############################################################################

function dict(adf::AbstractDataFrame, flatten::Bool = false)
depwarn("dict(::AbstractDataFrame, ::Bool) is deprecated", :dict)
depwarn("dict(::AbstractDataFrame, ::Bool) is deprecated", :dict)
res = Dict{Symbol, Any}()
if flatten && size(adf, 1) == 1
for colname in names(adf)
Expand All @@ -78,62 +87,62 @@ function dict(adf::AbstractDataFrame, flatten::Bool = false)
end

# Deprecated: pool! with a string column name. Warns once, then forwards to
# the symbol-based method.
function pool!(df::AbstractDataFrame, cname::String)
    depwarn("pool!(::AbstractDataFrame, ::String) is deprecated, use pool!(::AbstractDataFrame, ::Symbol) instead", :pool!)
    pool!(df, symbol(cname))
end

# Deprecated: pool! with a vector of string column names. Warns once, then
# forwards to the symbol-based method.
function pool!{T<:String}(df::AbstractDataFrame, cname::Vector{T})
    depwarn("pool!(::AbstractDataFrame, ::Vector{T<:String}) is deprecated, use pool!(::AbstractDataFrame, ::Vector{T<:Symbol}) instead", :pool!)
    # The parameter is `cname`; the body previously referred to an undefined
    # `cnames`, so every call failed.
    pool!(df, map(symbol, cname))
end

# Deprecated: column lookup by string. Warns once, then indexes with the
# equivalent symbol.
function Base.getindex(df::DataFrame, col_ind::String)
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(df, symbol(col_ind))
end

# Deprecated: column lookup by a vector of strings. Warns once, then indexes
# with the equivalent symbols.
function Base.getindex{T<:String}(df::DataFrame, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(df, map(symbol, col_inds))
end

# Deprecated: row/column lookup with a string column name. Warns once, then
# indexes with the equivalent symbol.
function Base.getindex(df::DataFrame, row_ind, col_ind::String)
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(df, row_ind, symbol(col_ind))
end

# Deprecated: row/column lookup with a vector of string column names. Warns
# once, then indexes with the equivalent symbols.
function Base.getindex{T<:String}(df::DataFrame, row_ind, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(df, row_ind, map(symbol, col_inds))
end

# Deprecated: looking up a string in an AbstractIndex. Warns once, then looks
# up the equivalent symbol.
function Base.getindex(x::AbstractIndex, idx::String)
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(x, symbol(idx))
end

# Deprecated: looking up a vector of strings in an AbstractIndex. Warns once,
# then looks up the equivalent symbols.
function Base.getindex{T<:String}(x::AbstractIndex, idx::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
    getindex(x, map(symbol, idx))
end

# Deprecated: column assignment by string name. Warns once, then assigns
# through the equivalent symbol.
function Base.setindex!(df::DataFrame, v, col_ind::String)
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    setindex!(df, v, symbol(col_ind))
end

# Deprecated: column assignment by a vector of string names. Warns once, then
# assigns through the equivalent symbols.
function Base.setindex!{T<:String}(df::DataFrame, v, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    # The parameter is `col_inds`; the body previously referred to an undefined
    # `col_ind`, so every call failed.
    setindex!(df, v, map(symbol, col_inds))
end

# Deprecated: row/column assignment with a string column name. Warns once,
# then assigns through the equivalent symbol.
function Base.setindex!(df::DataFrame, v, row_ind, col_ind::String)
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    setindex!(df, v, row_ind, symbol(col_ind))
end

# Deprecated: row/column assignment with a vector of string column names.
# Warns once, then assigns through the equivalent symbols.
#
# NOTE(review): every sibling method extends `Base.setindex!`, and the warning
# below names :setindex!, yet this method extends `Base.assign`. It looks like
# a leftover from an older name -- confirm whether it should be
# `Base.setindex!` so that indexed assignment actually reaches it.
function Base.assign{T<:String}(df::DataFrame, v, row_ind, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    # The parameter is `col_inds`; the body previously referred to an undefined
    # `col_ind`.
    setindex!(df, v, row_ind, map(symbol, col_inds))
end

Expand Down
2 changes: 1 addition & 1 deletion test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ module TestIteration
end

# eachcol yields (column name, column) tuples, so each element is checked
# against the tuple type rather than against a bare AbstractDataVector.
for col in eachcol(df)
    @assert isa(col, (Symbol, AbstractDataVector))
end

@assert isequal(map(x -> minimum(array(x)), eachrow(df)), {1,2})
Expand Down
43 changes: 31 additions & 12 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,49 @@ module TestJoin
name = DataFrame(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Joe Blogs"])
job = DataFrame(ID = [1, 2, 4], Job = ["Lawyer", "Doctor", "Farmer"])

# Join on symbols or vectors of symbols
join(name, job, on = :ID)
join(name, job, on = [:ID])

# Soon we won't allow natural joins
#@test_throws join(name, job)

# Test output of various join types
outer = DataFrame(ID = [1, 2, 3, 4],
Name = @data(["John Doe", "Jane Doe", "Joe Blogs", NA]),
Job = @data(["Lawyer", "Doctor", NA, "Farmer"]))

# Tests use current column ordering but don't promote it
# (Tests use current column ordering but don't promote it)
right = outer[!isna(outer[:Job]), [:Name, :ID, :Job]]
left = outer[!isna(outer[:Name]), :]
inner = left[!isna(left[:Job]), :]
semi = inner[:, [:ID, :Name]]
anti = left[isna(left[:Job]), [:ID, :Name]]

@test isequal(join(name, job), inner)
@test isequal(join(name, job, kind = :inner), inner)
@test isequal(join(name, job, kind = :outer), outer)
@test isequal(join(name, job, kind = :left), left)
@test isequal(join(name, job, kind = :right), right)
@test isequal(join(name, job, kind = :semi), semi)
@test isequal(join(name, job, kind = :anti), anti)
@test isequal(join(name, job, on = :ID), inner)
@test isequal(join(name, job, on = :ID, kind = :inner), inner)
@test isequal(join(name, job, on = :ID, kind = :outer), outer)
@test isequal(join(name, job, on = :ID, kind = :left), left)
@test isequal(join(name, job, on = :ID, kind = :right), right)
@test isequal(join(name, job, on = :ID, kind = :semi), semi)
@test isequal(join(name, job, on = :ID, kind = :anti), anti)

# Join on multiple keys
df1 = DataFrame(A = 1, B = 2, C = 3)
df2 = DataFrame(A = 1, B = 2, D = 4)

# Join key detection expects a single shared column
@test_throws join(df1, df2)

join(df1, df2, on = [:A, :B])
end

# Test output of cross joins
df1 = DataFrame(A = 1:2, B = 'a':'b')
df2 = DataFrame(A = 1:3, C = 3:5)

cross = DataFrame(A = [1, 1, 1, 2, 2, 2],
B = ['a', 'a', 'a', 'b', 'b', 'b'],
C = [3, 4, 5, 3, 4, 5])

@test join(df1, df2[[:C]], kind = :cross) == cross

# Cross joins don't take keys
@test_throws join(df1, df2, on = :A, kind = :cross)
end

0 comments on commit fba8b12

Please sign in to comment.