From 516a2fc4492468809a53bbb93aec4cf975240d33 Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Wed, 7 Jun 2017 10:39:35 -0700 Subject: [PATCH 1/7] Add a..b syntax --- REQUIRE | 1 + src/Query.jl | 1 + src/query_translation.jl | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/REQUIRE b/REQUIRE index 0d42b987..c5962cab 100644 --- a/REQUIRE +++ b/REQUIRE @@ -5,3 +5,4 @@ Requires 0.4.3 Documenter 0.9.0 IterableTables 0.1.0 DataValues 0.0.2 +MacroTools 0.3.6 diff --git a/src/Query.jl b/src/Query.jl index 70b7e520..eef11c60 100644 --- a/src/Query.jl +++ b/src/Query.jl @@ -5,6 +5,7 @@ using NamedTuples using DataStructures using IterableTables using DataValues +using MacroTools: postwalk, prewalk, @capture import Base.start import Base.next diff --git a/src/query_translation.jl b/src/query_translation.jl index 12c7c400..cf2dd219 100644 --- a/src/query_translation.jl +++ b/src/query_translation.jl @@ -13,6 +13,16 @@ function query_expression_translation_phase_A(qe) end i+=1 end + + for i in 1:length(qe) + qe[i] = postwalk(qe[i]) do x + if x isa Expr && x.head==:call && x.args[1]==:(..) 
+ return :(map(i->i.$(x.args[3]),$(x.args[2]))) + else + return x + end + end + end end function query_expression_translation_phase_B(qe) From 30d640aa7596a6e8c34a11437123a9dff6673a8c Mon Sep 17 00:00:00 2001 From: florian oswald Date: Fri, 23 Jun 2017 23:39:55 +0200 Subject: [PATCH 2/7] added example and test for a..b --- example/25-ab-syntax.jl | 14 ++++++++++++++ test/test_dplyr-syntax.jl | 27 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 example/25-ab-syntax.jl create mode 100644 test/test_dplyr-syntax.jl diff --git a/example/25-ab-syntax.jl b/example/25-ab-syntax.jl new file mode 100644 index 00000000..230afd1a --- /dev/null +++ b/example/25-ab-syntax.jl @@ -0,0 +1,14 @@ +using Query +using DataFrames + +df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + +x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame +end + +println(x) diff --git a/test/test_dplyr-syntax.jl b/test/test_dplyr-syntax.jl new file mode 100644 index 00000000..3fb5ef6d --- /dev/null +++ b/test/test_dplyr-syntax.jl @@ -0,0 +1,27 @@ +using Query +using DataFrames +using Base.Test + + + +@testset "a..b Syntax (dplyr API)" begin + + df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + + x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame + end + + @test x isa DataFrame + @test size(x) == (2,4) + @test x[1,:mage] == 20 + @test x[2,:mage] == 23 + @test x[1,:oldest] == 30 + @test x[2,:oldest] == 33 + @test x[1,:youngest] == 10 + @test x[2,:youngest] == 
13 +end From e8c0f1dee23b813581638382d641effb10a3603f Mon Sep 17 00:00:00 2001 From: florian oswald Date: Sun, 25 Jun 2017 11:43:01 +0200 Subject: [PATCH 3/7] added docs for dplyr --- docs/src/querycommands.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/src/querycommands.md b/docs/src/querycommands.md index f97b3027..305bcb8e 100644 --- a/docs/src/querycommands.md +++ b/docs/src/querycommands.md @@ -283,6 +283,37 @@ println(x) │ 2 │ 2 │ 2 │ ``` +## Split-Apply-Combine (a.k.a. `dplyr`) + +`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise. + +#### Example + +```jldoctest +using Query, DataFrames + +df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), + age=vcat([10., 20., 30.],[10., 20., 30.].+3), + children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b]) + +x = @from i in df begin + @group i by i.state into g + @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)} + @collect DataFrame +end + +println(x) + +# Output + +2×4 DataFrames.DataFrame +│ Row │ group │ mage │ oldest │ youngest │ +├─────┼───────┼──────┼────────┼──────────┤ +│ 1 │ a │ 20.0 │ 30.0 │ 10.0 │ +│ 2 │ b │ 23.0 │ 33.0 │ 13.0 │ + +``` + ## Range variables The `@let` statement introduces new range variables in a query expression. The syntax for the range statement is `@let <range variable> = <value_selector>`. 
`<range variable>` specifies the name of the new range variable and `<value_selector>` is any julia expression that returns the value that should be assigned to the new range variable. From 83583840b8bf3a390be68a42b109a1d21b9afc3b Mon Sep 17 00:00:00 2001 From: florian oswald Date: Sun, 25 Jun 2017 23:19:19 +0200 Subject: [PATCH 4/7] started with datatable --- perf/Rdatatable.jl | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 perf/Rdatatable.jl diff --git a/perf/Rdatatable.jl b/perf/Rdatatable.jl new file mode 100644 index 00000000..e023d132 --- /dev/null +++ b/perf/Rdatatable.jl @@ -0,0 +1,96 @@ + + + + +module QueryPerf + +using Query, DataFrames, StatsBase, RCall + + function R_bench(N,K) + + R""" + library(data.table) + N <- $N + K <- $K + # copied from https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping + set.seed(1) + DT <- data.table( + id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) + id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) + id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) + id4 = sample(K, N, TRUE), # large groups (int) + id5 = sample(K, N, TRUE), # large groups (int) + id6 = sample(N/K, N, TRUE), # small groups (int) + v1 = sample(5, N, TRUE), # int in range [1,5] + v2 = sample(5, N, TRUE), # int in range [1,5] + v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 + ) + + timings <- list() + + timings$sum1 <- system.time( DT[, sum(v1), keyby=id1] )[3] + timings$sum2 <- system.time( DT[, sum(v1), keyby=id1] )[3] + timings$sum3 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3] + timings$sum4 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3] + timings$sum_mean1 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3] + timings$sum_mean2 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3] + timings$mean7_9_by_id4_1 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3] + timings$mean7_9_by_id4_2 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3] + timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3] + timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3] + """ + @rget timings + return timings + end + + function createData(N::Int,K::Int) + + N = 1_000 + K = 100 + + df = DataFrame(id1 = sample(["id$x" for x in 1:K],N), + id2 = sample(["id$x" for x in 1:K],N), + id3 = sample(["id$x" for x in 1:(N/K)],N*K), + id4 = sample(1:K,N), + id5 = sample(1:K,N), + id6 = sample(1:(N/K),N*K), + v1 = sample(1:5,N), + v2 = sample(1:5,N), + v3 = sample(round(rand(100),4),N)) + + return df + end + + function bench1(df::DataFrame) + + t1 = @from i in df begin + @group i by i.id1 into g + @select r=sum(g..v1) + @collect DataFrame + end + return nothing + end + + function run_benches(N=1_000,K=100) + # get small data for JIT warmup + d_ = createData(10,3) + # warm up + bench1(d_) + + # get real data + d = createData(N,K) + # measure + t1 = @elapsed bench1(d) + + end + + +end # module + + + + + + + + From bd3dfdbd5c051c914f011c1dc57ba44e8bbcc274 Mon Sep 17 00:00:00 2001 From: florian oswald Date: Sun, 25 Jun 2017 23:22:07 +0200 Subject: [PATCH 5/7] Revert "started with datatable" This reverts commit 83583840b8bf3a390be68a42b109a1d21b9afc3b. 
--- perf/Rdatatable.jl | 96 ---------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 perf/Rdatatable.jl diff --git a/perf/Rdatatable.jl b/perf/Rdatatable.jl deleted file mode 100644 index e023d132..00000000 --- a/perf/Rdatatable.jl +++ /dev/null @@ -1,96 +0,0 @@ - - - - -module QueryPerf - -using Query, DataFrames, StatsBase, RCall - - function R_bench(N,K) - - R""" - library(data.table) - N <- $N - K <- $K - # copied from https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping - set.seed(1) - DT <- data.table( - id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) - id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) - id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) - id4 = sample(K, N, TRUE), # large groups (int) - id5 = sample(K, N, TRUE), # large groups (int) - id6 = sample(N/K, N, TRUE), # small groups (int) - v1 = sample(5, N, TRUE), # int in range [1,5] - v2 = sample(5, N, TRUE), # int in range [1,5] - v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 - ) - - timings <- list() - - timings$sum1 <- system.time( DT[, sum(v1), keyby=id1] )[3] - timings$sum2 <- system.time( DT[, sum(v1), keyby=id1] )[3] - timings$sum3 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3] - timings$sum4 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3] - timings$sum_mean1 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3] - timings$sum_mean2 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3] - timings$mean7_9_by_id4_1 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3] - timings$mean7_9_by_id4_2 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3] - timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3] - timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3] - """ - @rget timings - return timings - end - - function createData(N::Int,K::Int) - - N = 1_000 - K = 100 - - df = DataFrame(id1 = sample(["id$x" for x in 1:K],N), - id2 = sample(["id$x" for x in 1:K],N), - id3 = sample(["id$x" for x in 1:(N/K)],N*K), - id4 = sample(1:K,N), - id5 = sample(1:K,N), - id6 = sample(1:(N/K),N*K), - v1 = sample(1:5,N), - v2 = sample(1:5,N), - v3 = sample(round(rand(100),4),N)) - - return df - end - - function bench1(df::DataFrame) - - t1 = @from i in df begin - @group i by i.id1 into g - @select r=sum(g..v1) - @collect DataFrame - end - return nothing - end - - function run_benches(N=1_000,K=100) - # get small data for JIT warmup - d_ = createData(10,3) - # warm up - bench1(d_) - - # get real data - d = createData(N,K) - # measure - t1 = @elapsed bench1(d) - - end - - -end # module - - - - - - - - From 2c2f5d3481c00bbaf057ed85fdfc5925db93d9ff Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Fri, 21 Jul 2017 20:30:18 -0700 Subject: [PATCH 6/7] Run dplyr tests --- test/runtests.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 34474bf9..35758acb 100644 
--- a/test/runtests.jl +++ b/test/runtests.jl @@ -550,6 +550,7 @@ q = collect(Query.@select(source_df, i->get(i.children))) include("test_indexedtables.jl") include("test_pipesyntax.jl") +include("test_dplyr-syntax.jl") end @@ -576,7 +577,8 @@ end "../example/21-nulls.jl", "../example/22-datastreams-sink.jl", "../example/23-dict-sink.jl", - "../example/24-DataTable.jl"] + "../example/24-DataTable.jl", + "../example/25-ab-syntax.jl"] color = Base.have_color ? "--color=yes" : "--color=no" compilecache = "--compilecache=" * (Bool(Base.JLOptions().use_compilecache) ? "yes" : "no") From 083e115a792c684666c8c0158d370e16749b4722 Mon Sep 17 00:00:00 2001 From: David Anthoff Date: Fri, 21 Jul 2017 20:52:31 -0700 Subject: [PATCH 7/7] Fix use of MacroTools --- src/Query.jl | 2 +- src/query_translation.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Query.jl b/src/Query.jl index 2e5d3911..075cedcf 100644 --- a/src/Query.jl +++ b/src/Query.jl @@ -5,7 +5,7 @@ using NamedTuples using DataStructures using IterableTables using DataValues -using MacroTools: postwalk, prewalk, @capture +using MacroTools: postwalk import Base.start import Base.next diff --git a/src/query_translation.jl b/src/query_translation.jl index aa7ef49b..b33615ed 100644 --- a/src/query_translation.jl +++ b/src/query_translation.jl @@ -1,5 +1,5 @@ function helper_namedtuples_replacement(ex) - return MacroTools.postwalk(ex) do x + return postwalk(ex) do x if x isa Expr && x.head==:cell1d new_ex = Expr(:macrocall, Symbol("@NT"), x.args...) @@ -24,7 +24,7 @@ end function helper_replace_anon_func_syntax(ex) if !(isa(ex, Expr) && ex.head==:->) new_symb = gensym() - new_ex = MacroTools.postwalk(ex) do x + new_ex = postwalk(ex) do x if isa(x, Symbol) && x==:_ return new_symb else