Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-do the @orderby backend #191

Merged
merged 20 commits into from
Oct 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
order of rows returned after `DataFrames.transform(gd::GroupedDataFrame, args...)`.
* `@select` now supports `GroupedDataFrame` with the same behavior as
`DataFrames.select(df::GroupedDataFrame, args...)` ([#180])
* `@orderby(gd::GroupedDataFrame, args...)` is now reserved and will error.
* Restrictions are imposed on the types of column references allowed when using `cols`.
Mixing integer column references with other types now errors. ([#183])
Mixing integer column references with other types now errors. ([#183])
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,17 @@ df2 = @byrow df begin
end
```

## `@orderby`

Sort rows in a `DataFrame` by values in one or several columns or a
transformation of columns.

```julia
d = DataFrame(x = [3, 3, 3, 2, 1, 1, 1, 2, 1, 1], n = 1:10);
@orderby(d, -1 .* :n)
@orderby(d, :x, :n .- mean(:x))
```

## Working with column names programmatically with `cols`

DataFramesMeta.jl provides the special syntax `cols` for referring to
Expand Down Expand Up @@ -320,7 +331,7 @@ The following operations are now included:
GroupedDataFrame.

- `orderby(g, d -> mean(d[:a]))` and `@orderby(g, mean(:a))` -- Sort
groups based on the given criteria. Returns a GroupedDataFrame.
rows by the given criteria. Returns a `DataFrame`.

- `DataFrame(g)` -- Convert groups back to a DataFrame with the same
group orderings.
Expand Down
106 changes: 54 additions & 52 deletions src/DataFramesMeta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,10 @@ function fun_to_vec(kw::Expr; nolhs = false)
end
else
if kw.args[1] isa Symbol
# cols(n) = f(:x) becomes [:x] => _f => n
# y = f(:x) becomes [:x] => _f => :y
output = QuoteNode(kw.args[1])
elseif onearg(kw.args[1], :cols)
# y = f(:x) becomes [:x] => _f => :y
# cols(n) = f(:x) becomes [:x] => _f => n
output = kw.args[1].args[2]
end
t = quote
Expand All @@ -141,7 +141,7 @@ function fun_to_vec(kw::Expr; nolhs = false)
end
end

fun_to_vec(kw::QuoteNode) = kw
fun_to_vec(kw::QuoteNode; nolhs = false) = kw

function make_source_concrete(x::AbstractVector)
if isempty(x) || isconcretetype(eltype(x))
Expand Down Expand Up @@ -397,76 +397,78 @@ end
##
##############################################################################

# needed on Julia 1.0 till #1489 in DataFrames is merged
orderby(d::DataFrame, arg::DataFrame) = d[sortperm(arg), :]

function orderby(d::AbstractDataFrame, args...)
D = typeof(d)(args...)
d[sortperm(D), :]
function orderby_helper(x, args...)
t = (fun_to_vec(arg; nolhs = true) for arg in args)
quote
$DataFramesMeta.orderby($x, $(t...))
end
end

orderby(d::AbstractDataFrame, f::Function) = d[sortperm(f(d)), :]
orderby(g::GroupedDataFrame, f::Function) = g[sortperm([f(x) for x in g])]

orderbyconstructor(d::AbstractDataFrame) = (x...) -> DataFrame(Any[x...], Symbol.(1:length(x)))
orderbyconstructor(d) = x -> x
function orderby(x::AbstractDataFrame, @nospecialize(args...))
t = DataFrames.select(x, args...; copycols = false)
x[sortperm(t), :]
end

function orderby_helper(d, args...)
_D = gensym()
quote
let $_D = $d
$orderby($_D, $(with_anonymous(:($orderbyconstructor($_D)($(args...))))))
end
end
function orderby(x::GroupedDataFrame, @nospecialize(args...))
throw(ArgumentError("@orderby with a GroupedDataFrame is reserved"))
end

"""
@orderby(d, i...)

Sort by criteria. Normally used to sort groups in GroupedDataFrames.
Sort rows by values in one or several columns or a transformation of columns.
Always returns a fresh `DataFrame`. Does not accept a `GroupedDataFrame`.

When given a `DataFrame`, `@orderby` applies the transformation
given by its arguments (but does not create new columns) and sorts
the given `DataFrame` on the result, returning a new `DataFrame`.

### Arguments

* `d` : an AbstractDataFrame or GroupedDataFrame
* `d` : an AbstractDataFrame
* `i...` : expression for sorting

### Examples

```jldoctest
julia> using DataFrames, DataFramesMeta, Statistics

julia> d = DataFrame(n = 1:20, x = [3, 3, 3, 3, 1, 1, 1, 2, 1, 1,
2, 1, 1, 2, 2, 2, 3, 1, 1, 2]);

julia> g = groupby(d, :x);

julia> @orderby(g, mean(:n))
GroupedDataFrame 3 groups with keys: Symbol[:x]
First Group:
5×2 SubDataFrame{Array{Int64,1}}
│ Row │ n │ x │
├─────┼────┼───┤
│ 1 │ 1 │ 3 │
│ 2 │ 2 │ 3 │
│ 3 │ 3 │ 3 │
│ 4 │ 4 │ 3 │
│ 5 │ 17 │ 3 │
Last Group:
6×2 SubDataFrame{Array{Int64,1}}
│ Row │ n │ x │
├─────┼────┼───┤
│ 1 │ 8 │ 2 │
│ 2 │ 11 │ 2 │
│ 3 │ 14 │ 2 │
│ 4 │ 15 │ 2 │
│ 5 │ 16 │ 2 │
│ 6 │ 20 │ 2 │
julia> d = DataFrame(x = [3, 3, 3, 2, 1, 1, 1, 2, 1, 1], n = 1:10);

julia> @orderby(d, -1 .* :n)
10×2 DataFrame
│ Row │ x │ n │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 10 │
│ 2 │ 1 │ 9 │
│ 3 │ 2 │ 8 │
│ 4 │ 1 │ 7 │
│ 5 │ 1 │ 6 │
│ 6 │ 1 │ 5 │
│ 7 │ 2 │ 4 │
│ 8 │ 3 │ 3 │
│ 9 │ 3 │ 2 │
│ 10 │ 3 │ 1 │

julia> @orderby(d, :x, :n .- mean(:n))
10×2 DataFrame
│ Row │ x │ n │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 5 │
│ 2 │ 1 │ 6 │
│ 3 │ 1 │ 7 │
│ 4 │ 1 │ 9 │
│ 5 │ 1 │ 10 │
│ 6 │ 2 │ 4 │
│ 7 │ 2 │ 8 │
│ 8 │ 3 │ 1 │
│ 9 │ 3 │ 2 │
│ 10 │ 3 │ 3 │
```

"""
macro orderby(d, args...)
# I don't esc just the input because I want _DF to be visible to the user
esc(orderby_helper(d, args...))
end

Expand Down
16 changes: 15 additions & 1 deletion test/dataframes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,21 @@ end
@test @where(df, :A .> 1, :A .< 4, :B .> 1) == df[map(&, df.A .> 1, df.A .< 4, df.B .> 1),:]
end

@testset "orderby" begin
df = DataFrame(
g = [1, 1, 1, 2, 2],
i = 1:5,
t = ["a", "b", "c", "c", "e"],
y = [:v, :w, :x, :y, :z],
c = [:g, :quote, :body, :transform, missing]
)

gd = groupby(df, :g)
pdeffebach marked this conversation as resolved.
Show resolved Hide resolved

@test @orderby(df, :c).i == [3, 1, 2, 4, 5]
@test @orderby(df, -:g).i == [4, 5, 1, 2, 3]
@test @orderby(df, :t).i == [1, 2, 3, 4, 5]
end

@test DataFramesMeta.orderby(df, df[[1, 3, 2], :]) == df[[1, 3, 2], :]

end # module
3 changes: 0 additions & 3 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,8 @@ g = groupby(d, :x, sort=true)
@test DataFrame(@where(g, length(:x) > 5)) == DataFrame(DataFramesMeta.where(g, x -> length(x.x) > 5))
@test DataFrame(@where(g, length(:x) > 5))[!, :n][1:3] == [5, 6, 7]

@test DataFrame(DataFramesMeta.orderby(g, x -> mean(x.n))) == DataFrame(@orderby(g, mean(:n)))

@test @based_on(g, nsum = sum(:n)).nsum == [99, 84, 27]


@testset "@based_on" begin
df = DataFrame(
g = [1, 1, 1, 2, 2],
Expand Down
24 changes: 11 additions & 13 deletions test/linqmacro.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,26 @@ using DataFramesMeta
using Statistics
using Random

Random.seed!(100)
n = 100
df = DataFrame(a = rand(1:3, n),
b = ["a","b","c","d"][rand(1:4, n)],
x = rand(n))
df = DataFrame(a = repeat(1:5, outer = 20),
b = repeat(["a", "b", "c", "d"], inner = 25),
x = repeat(1:20, inner = 5))

x = @where(df, :a .> 2, :b .!= "c")
x = @transform(x, y = 10 * :x)
x = @orderby(x, :x .- mean(:x))
x = @by(x, :b, meanX = mean(:x), meanY = mean(:y))
x = @orderby(x, -:meanX)
x = @select(x, var = :b, :meanX, :meanY)

x1 = @linq transform(where(df, :a .> 2, :b .!= "c"), y = 10 * :x)
x1 = @linq by(x1, :b, meanX = mean(:x), meanY = mean(:y))
x1 = @linq select(orderby(x1, -:meanX), var = :b, :meanX, :meanY)
x1 = @linq by(orderby(x1, :x .- mean(:x)), :b, meanX = mean(:x), meanY = mean(:y))
x1 = @linq select(x1, var = :b, :meanX, :meanY)

## chaining
xlinq = @linq df |>
where(:a .> 2, :b .!= "c") |>
transform(y = 10 * :x) |>
orderby(:x .- mean(:x)) |>
by(:b, meanX = mean(:x), meanY = mean(:y)) |>
orderby(-:meanX) |>
select(var = :b, :meanX, :meanY)

@test x == x1
Expand All @@ -36,17 +34,17 @@ xlinq = @linq df |>
xlinq2 = @linq df |>
where(:a .> 2, :b .!= "c") |>
transform(y = 10 * :x) |>
orderby(:x .- mean(:x)) |>
groupby(:b) |>
orderby(-mean(:x)) |>
based_on(meanX = mean(:x), meanY = mean(:y))

@test xlinq2[!, [:meanX, :meanY]] == xlinq[!, [:meanX, :meanY]]

xlinq3 = @linq df |>
where(:a .> 2, :b .!= "c") |>
transform(y = 10 * :x) |>
orderby(:x .- mean(:x)) |>
DataFrames.groupby(:b) |>
orderby(-mean(:x)) |>
based_on(meanX = mean(:x), meanY = mean(:y))

@test xlinq3[!, [:meanX, :meanY]] == xlinq[!, [:meanX, :meanY]]
Expand All @@ -68,8 +66,8 @@ xlinq3 = @linq df |>
xlinq3 = @linq df |>
where(cols(a_sym) .> 2, :b .!= "c") |>
transform(cols(y_str) = 10 * cols(x_sym)) |>
DataFrames.groupby(b_str) |>
orderby(-mean(cols(x_sym))) |>
orderby(cols(x_sym) .- mean(cols(x_sym))) |>
groupby(b_str) |>
based_on(cols("meanX") = mean(:x), meanY = mean(:y))

@test isequal(xlinq3, DataFrame(b = "d", meanX = 40.0, meanY = 400.0))
Expand Down