diff --git a/docs/src/querycommands.md b/docs/src/querycommands.md index f97b3027..305bcb8e 100644 --- a/docs/src/querycommands.md +++ b/docs/src/querycommands.md @@ -283,6 +283,37 @@ println(x) │ 2 │ 2 │ 2 │ ``` +## Split-Apply-Combine (a.k.a. `dplyr`) + +`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise. + +#### Example + +```jldoctest +using Query, DataFrames + +df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + +x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame +end + +println(x) + +# Output + +2×4 DataFrames.DataFrame +│ Row │ group │ mage │ oldest │ youngest │ +├─────┼───────┼──────┼────────┼──────────┤ +│ 1 │ a │ 20.0 │ 30.0 │ 10.0 │ +│ 2 │ b │ 23.0 │ 33.0 │ 13.0 │ + +``` + ## Range variables The `@let` statement introduces new range variables in a query expression. The syntax for the range statement is `@let <range variable> = <value_selector>`. `<range variable>` specifies the name of the new range variable and `<value_selector>` is any Julia expression that returns the value that should be assigned to the new range variable. 
diff --git a/example/25-ab-syntax.jl b/example/25-ab-syntax.jl new file mode 100644 index 00000000..230afd1a --- /dev/null +++ b/example/25-ab-syntax.jl @@ -0,0 +1,14 @@ +using Query +using DataFrames + +df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + +x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame +end + +println(x) diff --git a/src/Query.jl b/src/Query.jl index 703a8012..075cedcf 100644 --- a/src/Query.jl +++ b/src/Query.jl @@ -5,7 +5,7 @@ using NamedTuples using DataStructures using IterableTables using DataValues -import MacroTools +using MacroTools: postwalk import Base.start import Base.next diff --git a/src/query_translation.jl b/src/query_translation.jl index 4f57b054..b33615ed 100644 --- a/src/query_translation.jl +++ b/src/query_translation.jl @@ -1,5 +1,5 @@ function helper_namedtuples_replacement(ex) - return MacroTools.postwalk(ex) do x + return postwalk(ex) do x if x isa Expr && x.head==:cell1d new_ex = Expr(:macrocall, Symbol("@NT"), x.args...) @@ -24,7 +24,7 @@ end function helper_replace_anon_func_syntax(ex) if !(isa(ex, Expr) && ex.head==:->) new_symb = gensym() - new_ex = MacroTools.postwalk(ex) do x + new_ex = postwalk(ex) do x if isa(x, Symbol) && x==:_ return new_symb else @@ -52,6 +52,16 @@ function query_expression_translation_phase_A(qe) end i+=1 end + + for i in 1:length(qe) + qe[i] = postwalk(qe[i]) do x + if x isa Expr && x.head==:call && x.args[1]==:(..) 
+ return :(map(i->i.$(x.args[3]),$(x.args[2]))) + else + return x + end + end + end end function query_expression_translation_phase_B(qe) diff --git a/test/runtests.jl b/test/runtests.jl index 34474bf9..35758acb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -550,6 +550,7 @@ q = collect(Query.@select(source_df, i->get(i.children))) include("test_indexedtables.jl") include("test_pipesyntax.jl") +include("test_dplyr-syntax.jl") end @@ -576,7 +577,8 @@ end "../example/21-nulls.jl", "../example/22-datastreams-sink.jl", "../example/23-dict-sink.jl", - "../example/24-DataTable.jl"] + "../example/24-DataTable.jl", + "../example/25-ab-syntax.jl"] color = Base.have_color ? "--color=yes" : "--color=no" compilecache = "--compilecache=" * (Bool(Base.JLOptions().use_compilecache) ? "yes" : "no") diff --git a/test/test_dplyr-syntax.jl b/test/test_dplyr-syntax.jl new file mode 100644 index 00000000..3fb5ef6d --- /dev/null +++ b/test/test_dplyr-syntax.jl @@ -0,0 +1,27 @@ +using Query +using DataFrames +using Base.Test + + + +@testset "a..b Syntax (dplyr API)" begin + + df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + + x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame + end + + @test x isa DataFrame + @test size(x) == (2,4) + @test x[1,:mage] == 20 + @test x[2,:mage] == 23 + @test x[1,:oldest] == 30 + @test x[2,:oldest] == 33 + @test x[1,:youngest] == 10 + @test x[2,:youngest] == 13 +end