Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update join #536

Merged
merged 2 commits into from
Feb 9, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions src/dataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,15 @@ eachcol(df::AbstractDataFrame) = DFColumnIterator(df)

Base.start(itr::DFColumnIterator) = 1
Base.done(itr::DFColumnIterator, j::Int) = j > size(itr.df, 2)
Base.next(itr::DFColumnIterator, j::Int) = (itr.df[:, j], j + 1)
Base.next(itr::DFColumnIterator, j::Int) = ((names(itr.df)[j], itr.df[j]), j + 1)
Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
Base.getindex(itr::DFColumnIterator, j::Any) = itr.df[:, j]
function Base.map(f::Function, dfci::DFColumnIterator)
# note: `f` must return a consistent length
res = DataFrame()
for i = 1:size(dfci.df, 2)
res[i] = f(dfci[i])
for (n, v) in eachcol(dfci.df)
res[n] = f(v)
end
names!(res, names(dfci.df))
res
end

# Iteration matches that of Associative types (experimental)
Base.start(df::AbstractDataFrame) = 1
Base.done(df::AbstractDataFrame, i) = i > ncol(df)
Base.next(df::AbstractDataFrame, i) = ((names(df)[i], df[i]), i + 1)
26 changes: 23 additions & 3 deletions src/dataframe/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,24 @@ end

DataArrays.PooledDataArray(df::AbstractDataFrame) = PooledDataArray(df, DEFAULT_POOLED_REF_TYPE)

# Union(Vector{T}, ByteString, Nothing
function Base.join(df1::AbstractDataFrame,
df2::AbstractDataFrame;
on::Any = nothing,
on::Union(Symbol, Vector{Symbol}) = Symbol[],
kind::Symbol = :inner)
if on == nothing
if kind == :cross
if on != Symbol[]
throw(ArgumentError("Cross joins don't use argument 'on'."))
end
return crossjoin(df1, df2)
elseif on == Symbol[]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the basic implementation should take in an array of Symbols and then we use a quick wrapper that puts a single symbol into an array? Could conceivably be faster because of decreased type uncertainty.

depwarn("Natural joins are deprecated, use argument 'on'.", :AbstractDataFrame)
on = intersect(names(df1), names(df2))
if length(on) > 1
throw(ArgumentError("Key omitted from join with multiple shared names."))
end
#throw(ArgumentError("Missing join argument 'on'."))
end

dv1, dv2 = PooledDataVecs(df1[on], df2[on])
left_indexer, leftonly_indexer, right_indexer, rightonly_indexer =
join_idx(dv1.refs, dv2.refs, length(dv1.pool))
Expand Down Expand Up @@ -169,3 +176,16 @@ function Base.join(df1::AbstractDataFrame,
throw(ArgumentError("Unknown kind of join requested"))
end
end

# Build the cross join of `df1` and `df2`.
#
# Each column of `df1` is passed to `rep` with `times = 1` and
# `each = size(df2, 1)`; each column of `df2` is passed with
# `times = size(df1, 1)` and `each = 1`. All resulting columns are stored in
# a fresh `DataFrame` under their original names.
#
# Both arguments are typed `AbstractDataFrame` because `join`, which forwards
# here for `kind = :cross`, is itself defined on `AbstractDataFrame`; a
# narrower signature would raise a `MethodError` for non-`DataFrame` inputs.
#
# NOTE(review): a column name shared by both inputs is assigned twice through
# `d[n] = ...`; presumably the `df2` column replaces the `df1` one -- confirm.
function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame)
    d = DataFrame()
    addx!(d, df1, 1, size(df2, 1))
    addx!(d, df2, size(df1, 1), 1)
    d
end

# Store every column of `x` in `d` under the same name, after expanding it
# with `rep(v, times, each)`. `eachcol` yields `(name, column)` pairs.
function addx!(d::DataFrame, x::AbstractDataFrame, times::Int, each::Int)
    for (n, v) in eachcol(x)
        d[n] = rep(v, times, each)
    end
end
75 changes: 42 additions & 33 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ Base.@deprecate EachCol eachcol
Base.@deprecate subset sub

function DataFrame(df::DataFrame)
depwarn("DataFrame(::DataFrame) is deprecated, use convert(DataFrame, DataFrame) instead",
:DataFrame)
return df
depwarn("DataFrame(::DataFrame) is deprecated, use convert(DataFrame, DataFrame) instead",
:DataFrame)
return df
end

function DataFrame(x::Union(Number, String))
depwarn("DataFrame(::Union(Number, String)) is deprecated, use DataFrame(Vector{Any}) instead",
:DataFrame)
depwarn("DataFrame(::Union(Number, String)) is deprecated, use DataFrame(Vector{Any}) instead",
:DataFrame)
cols = {DataArray([x], falses(1))}
colind = Index(gennames(1))
return DataFrame(cols, colind)
Expand All @@ -31,8 +31,8 @@ end
# TODO: Replace this with convert call.
# Convert a standard Matrix to a DataFrame w/ pre-specified names
function DataFrame(x::Matrix, cn::Vector = gennames(size(x, 2)))
depwarn("DataFrame(::Matrix, ::Vector)) is deprecated, use convert(DataFrame, Matrix) instead",
:DataFrame)
depwarn("DataFrame(::Matrix, ::Vector)) is deprecated, use convert(DataFrame, Matrix) instead",
:DataFrame)
n = length(cn)
cols = Array(Any, n)
for i in 1:n
Expand All @@ -42,17 +42,26 @@ function DataFrame(x::Matrix, cn::Vector = gennames(size(x, 2)))
end

function DataFrame{T<:String}(columns::Vector{Any}, cnames::Vector{T})
depwarn("DataFrame(::Vector{Any}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{Any}, ::Vector{Symbol}) instead",
:DataFrame)
DataFrame(columns, map(symbol, cnames))
depwarn("DataFrame(::Vector{Any}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{Any}, ::Vector{Symbol}) instead",
:DataFrame)
DataFrame(columns, map(symbol, cnames))
end

function DataFrame{D <: Associative, T <: String}(ds::Vector{D}, ks::Vector{T})
depwarn("DataFrame(::Vector{D<:Associative}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{D<:Associative}, ::Vector{Symbol}) instead",
:DataFrame)
DataFrame(ds, map(symbol, ks))
depwarn("DataFrame(::Vector{D<:Associative}, ::Vector{T<:String}) is deprecated, use DataFrame(::Vector{D<:Associative}, ::Vector{Symbol}) instead",
:DataFrame)
DataFrame(ds, map(symbol, ks))
end

# Iteration matches that of Associative types (experimental)
# Deprecated default iteration over an AbstractDataFrame: warn, then return
# the initial iteration state `1` (the state is a column index, as used by
# the accompanying `done`/`next` methods).
function Base.start(df::AbstractDataFrame)
    # The second argument of `depwarn` is the symbol of the calling function,
    # so it must be `:start` rather than the name of the argument type.
    depwarn("Default AbstractDataFrame iterator is deprecated, use eachcol(::AbstractDataFrame) instead",
            :start)
    return 1
end
Base.done(df::AbstractDataFrame, i) = i > ncol(df)
Base.next(df::AbstractDataFrame, i) = ((names(df)[i], df[i]), i + 1)

##############################################################################
##
## Dict conversion
Expand All @@ -63,7 +72,7 @@ end
##############################################################################

function dict(adf::AbstractDataFrame, flatten::Bool = false)
depwarn("dict(::AbstractDataFrame, ::Bool) is deprecated", :dict)
depwarn("dict(::AbstractDataFrame, ::Bool) is deprecated", :dict)
res = Dict{Symbol, Any}()
if flatten && size(adf, 1) == 1
for colname in names(adf)
Expand All @@ -78,62 +87,62 @@ function dict(adf::AbstractDataFrame, flatten::Bool = false)
end

function pool!(df::AbstractDataFrame, cname::String)
depwarn("pool!(::AbstractDataFrame, ::String) is deprecated, use pool!(::AbstractDataFrame, ::Symbol) instead", :pool!)
pool!(df, symbol(cname))
depwarn("pool!(::AbstractDataFrame, ::String) is deprecated, use pool!(::AbstractDataFrame, ::Symbol) instead", :pool!)
pool!(df, symbol(cname))
end

# Deprecated: `pool!` with column names given as a vector of strings.
# Emits a deprecation warning, converts each name in `cname` to a Symbol and
# forwards to the `pool!` method that takes a vector of symbols.
function pool!{T<:String}(df::AbstractDataFrame, cname::Vector{T})
    depwarn("pool!(::AbstractDataFrame, ::Vector{T<:String}) is deprecated, use pool!(::AbstractDataFrame, ::Vector{T<:Symbol}) instead", :pool!)
    # Forward the `cname` argument; no variable named `cnames` exists here.
    pool!(df, map(symbol, cname))
end

function Base.getindex(df::DataFrame, col_ind::String)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(df, symbol(col_ind))
end

function Base.getindex{T<:String}(df::DataFrame, col_inds::AbstractVector{T})
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(df, map(symbol, col_inds))
end

function Base.getindex(df::DataFrame, row_ind, col_ind::String)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(df, row_ind, symbol(col_ind))
end

function Base.getindex{T<:String}(df::DataFrame, row_ind, col_inds::AbstractVector{T})
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(df, row_ind, map(symbol, col_inds))
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(df, row_ind, map(symbol, col_inds))
end

function Base.getindex(x::AbstractIndex, idx::String)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(x, symbol(idx))
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(x, symbol(idx))
end

function Base.getindex{T<:String}(x::AbstractIndex, idx::AbstractVector{T})
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(x, map(symbol, idx))
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :getindex)
getindex(x, map(symbol, idx))
end

function Base.setindex!(df::DataFrame, v, col_ind::String)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
setindex!(df, v, symbol(col_ind))
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
setindex!(df, v, symbol(col_ind))
end

# Deprecated: assign `v` to the columns of `df` named by a vector of strings.
# Emits a deprecation warning, converts each name in `col_inds` to a Symbol
# and forwards to the symbol-based `setindex!`.
function Base.setindex!{T<:String}(df::DataFrame, v, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    # Forward the `col_inds` argument; `col_ind` is not defined in this method.
    setindex!(df, v, map(symbol, col_inds))
end

function Base.setindex!(df::DataFrame, v, row_ind, col_ind::String)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
setindex!(df, v, row_ind, symbol(col_ind))
end

# Deprecated: assign `v` at rows `row_ind` and the columns named by a vector
# of strings. Emits a deprecation warning, converts each name in `col_inds`
# to a Symbol and forwards to the symbol-based `setindex!`.
#
# NOTE(review): this method extends `Base.assign` while its siblings extend
# `Base.setindex!` and the warning is tagged `:setindex!`; this looks like a
# leftover from the assign -> setindex! rename -- confirm before renaming.
function Base.assign{T<:String}(df::DataFrame, v, row_ind, col_inds::AbstractVector{T})
    depwarn("indexing DataFrames with strings is deprecated; use symbols instead", :setindex!)
    # Forward the `col_inds` argument; `col_ind` is not defined in this method.
    setindex!(df, v, row_ind, map(symbol, col_inds))
end

Expand Down
2 changes: 1 addition & 1 deletion test/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ module TestIteration
end

for col in eachcol(df)
@assert isa(col, AbstractDataVector)
@assert isa(col, (Symbol, AbstractDataVector))
end

@assert isequal(map(x -> minimum(array(x)), eachrow(df)), {1,2})
Expand Down
43 changes: 31 additions & 12 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,49 @@ module TestJoin
name = DataFrame(ID = [1, 2, 3], Name = ["John Doe", "Jane Doe", "Joe Blogs"])
job = DataFrame(ID = [1, 2, 4], Job = ["Lawyer", "Doctor", "Farmer"])

# Join on symbols or vectors of symbols
join(name, job, on = :ID)
join(name, job, on = [:ID])

# Soon we won't allow natural joins
#@test_throws join(name, job)

# Test output of various join types
outer = DataFrame(ID = [1, 2, 3, 4],
Name = @data(["John Doe", "Jane Doe", "Joe Blogs", NA]),
Job = @data(["Lawyer", "Doctor", NA, "Farmer"]))

# Tests use current column ordering but don't promote it
# (Tests use current column ordering but don't promote it)
right = outer[!isna(outer[:Job]), [:Name, :ID, :Job]]
left = outer[!isna(outer[:Name]), :]
inner = left[!isna(left[:Job]), :]
semi = inner[:, [:ID, :Name]]
anti = left[isna(left[:Job]), [:ID, :Name]]

@test isequal(join(name, job), inner)
@test isequal(join(name, job, kind = :inner), inner)
@test isequal(join(name, job, kind = :outer), outer)
@test isequal(join(name, job, kind = :left), left)
@test isequal(join(name, job, kind = :right), right)
@test isequal(join(name, job, kind = :semi), semi)
@test isequal(join(name, job, kind = :anti), anti)
@test isequal(join(name, job, on = :ID), inner)
@test isequal(join(name, job, on = :ID, kind = :inner), inner)
@test isequal(join(name, job, on = :ID, kind = :outer), outer)
@test isequal(join(name, job, on = :ID, kind = :left), left)
@test isequal(join(name, job, on = :ID, kind = :right), right)
@test isequal(join(name, job, on = :ID, kind = :semi), semi)
@test isequal(join(name, job, on = :ID, kind = :anti), anti)

# Join on multiple keys
df1 = DataFrame(A = 1, B = 2, C = 3)
df2 = DataFrame(A = 1, B = 2, D = 4)

# Join key detection expects a single shared column
@test_throws join(df1, df2)

join(df1, df2, on = [:A, :B])
end

# Test output of cross joins
df1 = DataFrame(A = 1:2, B = 'a':'b')
df2 = DataFrame(A = 1:3, C = 3:5)

cross = DataFrame(A = [1, 1, 1, 2, 2, 2],
B = ['a', 'a', 'a', 'b', 'b', 'b'],
C = [3, 4, 5, 3, 4, 5])

@test join(df1, df2[[:C]], kind = :cross) == cross

# Cross joins don't take keys
@test_throws join(df1, df2, on = :A, kind = :cross)
end