From 516a2fc4492468809a53bbb93aec4cf975240d33 Mon Sep 17 00:00:00 2001
From: David Anthoff <anthoff@berkeley.edu>
Date: Wed, 7 Jun 2017 10:39:35 -0700
Subject: [PATCH 1/7] Add a..b syntax

---
 REQUIRE                  |  1 +
 src/Query.jl             |  1 +
 src/query_translation.jl | 10 ++++++++++
 3 files changed, 12 insertions(+)

diff --git a/REQUIRE b/REQUIRE
index 0d42b987..c5962cab 100644
--- a/REQUIRE
+++ b/REQUIRE
@@ -5,3 +5,4 @@ Requires 0.4.3
 Documenter 0.9.0
 IterableTables 0.1.0
 DataValues 0.0.2
+MacroTools 0.3.6
diff --git a/src/Query.jl b/src/Query.jl
index 70b7e520..eef11c60 100644
--- a/src/Query.jl
+++ b/src/Query.jl
@@ -5,6 +5,7 @@ using NamedTuples
 using DataStructures
 using IterableTables
 using DataValues
+using MacroTools: postwalk, prewalk, @capture
 
 import Base.start
 import Base.next
diff --git a/src/query_translation.jl b/src/query_translation.jl
index 12c7c400..cf2dd219 100644
--- a/src/query_translation.jl
+++ b/src/query_translation.jl
@@ -13,6 +13,16 @@ function query_expression_translation_phase_A(qe)
 		end
 		i+=1
 	end
+
+	for i in 1:length(qe)
+		qe[i] = postwalk(qe[i]) do x
+			if x isa Expr && x.head==:call && x.args[1]==:(..)
+				return :(map(i->i.$(x.args[3]),$(x.args[2])))
+			else
+				return x
+			end
+		end
+	end
 end
 
 function query_expression_translation_phase_B(qe)

From 30d640aa7596a6e8c34a11437123a9dff6673a8c Mon Sep 17 00:00:00 2001
From: florian oswald <florian.oswald@gmail.com>
Date: Fri, 23 Jun 2017 23:39:55 +0200
Subject: [PATCH 2/7] added example and test for a..b

---
 example/25-ab-syntax.jl   | 14 ++++++++++++++
 test/test_dplyr-syntax.jl | 27 +++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 example/25-ab-syntax.jl
 create mode 100644 test/test_dplyr-syntax.jl

diff --git a/example/25-ab-syntax.jl b/example/25-ab-syntax.jl
new file mode 100644
index 00000000..230afd1a
--- /dev/null
+++ b/example/25-ab-syntax.jl
@@ -0,0 +1,14 @@
+using Query
+using DataFrames
+
+df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), 
+     age=vcat([10., 20., 30.],[10., 20., 30.].+3), 
+     children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b])
+
+x = @from i in df begin
+    @group i by i.state into g
+    @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
+    @collect DataFrame
+end
+
+println(x)
diff --git a/test/test_dplyr-syntax.jl b/test/test_dplyr-syntax.jl
new file mode 100644
index 00000000..3fb5ef6d
--- /dev/null
+++ b/test/test_dplyr-syntax.jl
@@ -0,0 +1,27 @@
+using Query
+using DataFrames
+using Base.Test
+
+
+
+@testset "a..b Syntax (dplyr API)" begin
+
+    df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), 
+                   age=vcat([10., 20., 30.],[10., 20., 30.].+3), 
+                   children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b])
+
+    x = @from i in df begin
+        @group i by i.state into g
+        @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
+        @collect DataFrame
+    end
+
+    @test x isa DataFrame
+    @test size(x) == (2,4)
+    @test x[1,:mage] == 20
+    @test x[2,:mage] == 23
+    @test x[1,:oldest] == 30
+    @test x[2,:oldest] == 33
+    @test x[1,:youngest] == 10
+    @test x[2,:youngest] == 13
+end

From e8c0f1dee23b813581638382d641effb10a3603f Mon Sep 17 00:00:00 2001
From: florian oswald <florian.oswald@gmail.com>
Date: Sun, 25 Jun 2017 11:43:01 +0200
Subject: [PATCH 3/7] added docs for dplyr

---
 docs/src/querycommands.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/docs/src/querycommands.md b/docs/src/querycommands.md
index f97b3027..305bcb8e 100644
--- a/docs/src/querycommands.md
+++ b/docs/src/querycommands.md
@@ -283,6 +283,37 @@ println(x)
 │ 2   │ 2   │ 2     │
 ```
 
+## Split-Apply-Combine (a.k.a. `dplyr`)
+
+`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise. Inside a query, `g..var` is rewritten to `map(i->i.var, g)`, i.e. it collects the value of `var` from every element of the group `g`.
+
+#### Example
+
+```jldoctest
+using Query, DataFrames
+
+df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]), 
+     age=vcat([10., 20., 30.],[10., 20., 30.].+3), 
+     children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b])
+
+x = @from i in df begin
+    @group i by i.state into g
+    @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
+    @collect DataFrame
+end
+
+println(x)
+
+# output
+
+2×4 DataFrames.DataFrame
+│ Row │ group │ mage │ oldest │ youngest │
+├─────┼───────┼──────┼────────┼──────────┤
+│ 1   │ a     │ 20.0 │ 30.0   │ 10.0     │
+│ 2   │ b     │ 23.0 │ 33.0   │ 13.0     │
+
+```
+
 ## Range variables
 
 The `@let` statement introduces new range variables in a query expression. The syntax for the range statement is `@let <range variable> = <value selector>`. `<range variable>` specifies the name of the new range variable and `<value selector>` is any julia expression that returns the value that should be assigned to the new range variable.

From 83583840b8bf3a390be68a42b109a1d21b9afc3b Mon Sep 17 00:00:00 2001
From: florian oswald <florian.oswald@gmail.com>
Date: Sun, 25 Jun 2017 23:19:19 +0200
Subject: [PATCH 4/7] started with datatable

---
 perf/Rdatatable.jl | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 perf/Rdatatable.jl

diff --git a/perf/Rdatatable.jl b/perf/Rdatatable.jl
new file mode 100644
index 00000000..e023d132
--- /dev/null
+++ b/perf/Rdatatable.jl
@@ -0,0 +1,96 @@
+
+
+
+
+module QueryPerf
+
+using Query, DataFrames, StatsBase, RCall
+
+    function R_bench(N,K)
+
+        R"""
+        library(data.table)
+        N <- $N
+        K <- $K
+        # copied from https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping
+        set.seed(1)
+        DT <- data.table(
+          id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
+          id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
+          id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
+          id4 = sample(K, N, TRUE),                          # large groups (int)
+          id5 = sample(K, N, TRUE),                          # large groups (int)
+          id6 = sample(N/K, N, TRUE),                        # small groups (int)
+          v1 =  sample(5, N, TRUE),                          # int in range [1,5]
+          v2 =  sample(5, N, TRUE),                          # int in range [1,5]
+          v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
+        )
+
+        timings <- list()
+
+        timings$sum1 <- system.time( DT[, sum(v1), keyby=id1] )[3]
+        timings$sum2 <- system.time( DT[, sum(v1), keyby=id1] )[3]
+        timings$sum3 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3]
+        timings$sum4 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3]
+        timings$sum_mean1 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3]
+        timings$sum_mean2 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3]
+        timings$mean7_9_by_id4_1 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3]
+        timings$mean7_9_by_id4_2 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3]
+        timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3]
+        timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3]
+        """
+        @rget timings
+        return timings
+    end
+
+    function createData(N::Int,K::Int)
+
+        N = 1_000
+        K = 100
+
+        df = DataFrame(id1 = sample(["id$x" for x in 1:K],N),
+                       id2 = sample(["id$x" for x in 1:K],N),
+                       id3 = sample(["id$x" for x in 1:(N/K)],N*K),
+                       id4 = sample(1:K,N),
+                       id5 = sample(1:K,N),
+                       id6 = sample(1:(N/K),N*K),
+                       v1 = sample(1:5,N),
+                       v2 = sample(1:5,N),
+                       v3 = sample(round(rand(100),4),N))
+
+        return df
+    end
+
+    function bench1(df::DataFrame)
+
+        t1 = @from i in df begin
+                 @group i by i.id1 into g
+                 @select r=sum(g..v1)
+                 @collect DataFrame 
+            end
+        return nothing
+    end
+
+    function run_benches(N=1_000,K=100)
+        # get small data for JIT warmup
+        d_ = createData(10,3)
+        # warm up
+        bench1(d_)
+
+        # get real data
+        d = createData(N,K)
+        # measure
+        t1 = @elapsed bench1(d)
+
+    end
+
+
+end # module
+
+
+
+
+
+
+
+

From bd3dfdbd5c051c914f011c1dc57ba44e8bbcc274 Mon Sep 17 00:00:00 2001
From: florian oswald <florian.oswald@gmail.com>
Date: Sun, 25 Jun 2017 23:22:07 +0200
Subject: [PATCH 5/7] Revert "started with datatable"

This reverts commit 83583840b8bf3a390be68a42b109a1d21b9afc3b.
---
 perf/Rdatatable.jl | 96 ----------------------------------------------
 1 file changed, 96 deletions(-)
 delete mode 100644 perf/Rdatatable.jl

diff --git a/perf/Rdatatable.jl b/perf/Rdatatable.jl
deleted file mode 100644
index e023d132..00000000
--- a/perf/Rdatatable.jl
+++ /dev/null
@@ -1,96 +0,0 @@
-
-
-
-
-module QueryPerf
-
-using Query, DataFrames, StatsBase, RCall
-
-    function R_bench(N,K)
-
-        R"""
-        library(data.table)
-        N <- $N
-        K <- $K
-        # copied from https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping
-        set.seed(1)
-        DT <- data.table(
-          id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
-          id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
-          id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
-          id4 = sample(K, N, TRUE),                          # large groups (int)
-          id5 = sample(K, N, TRUE),                          # large groups (int)
-          id6 = sample(N/K, N, TRUE),                        # small groups (int)
-          v1 =  sample(5, N, TRUE),                          # int in range [1,5]
-          v2 =  sample(5, N, TRUE),                          # int in range [1,5]
-          v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
-        )
-
-        timings <- list()
-
-        timings$sum1 <- system.time( DT[, sum(v1), keyby=id1] )[3]
-        timings$sum2 <- system.time( DT[, sum(v1), keyby=id1] )[3]
-        timings$sum3 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3]
-        timings$sum4 <- system.time( DT[, sum(v1), keyby="id1,id2"] )[3]
-        timings$sum_mean1 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3]
-        timings$sum_mean2 <- system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )[3]
-        timings$mean7_9_by_id4_1 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3]
-        timings$mean7_9_by_id4_2 <- system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )[3]
-        timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3]
-        timings$sum7_9_by_id6_2 <- system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )[3]
-        """
-        @rget timings
-        return timings
-    end
-
-    function createData(N::Int,K::Int)
-
-        N = 1_000
-        K = 100
-
-        df = DataFrame(id1 = sample(["id$x" for x in 1:K],N),
-                       id2 = sample(["id$x" for x in 1:K],N),
-                       id3 = sample(["id$x" for x in 1:(N/K)],N*K),
-                       id4 = sample(1:K,N),
-                       id5 = sample(1:K,N),
-                       id6 = sample(1:(N/K),N*K),
-                       v1 = sample(1:5,N),
-                       v2 = sample(1:5,N),
-                       v3 = sample(round(rand(100),4),N))
-
-        return df
-    end
-
-    function bench1(df::DataFrame)
-
-        t1 = @from i in df begin
-                 @group i by i.id1 into g
-                 @select r=sum(g..v1)
-                 @collect DataFrame 
-            end
-        return nothing
-    end
-
-    function run_benches(N=1_000,K=100)
-        # get small data for JIT warmup
-        d_ = createData(10,3)
-        # warm up
-        bench1(d_)
-
-        # get real data
-        d = createData(N,K)
-        # measure
-        t1 = @elapsed bench1(d)
-
-    end
-
-
-end # module
-
-
-
-
-
-
-
-

From 2c2f5d3481c00bbaf057ed85fdfc5925db93d9ff Mon Sep 17 00:00:00 2001
From: David Anthoff <anthoff@berkeley.edu>
Date: Fri, 21 Jul 2017 20:30:18 -0700
Subject: [PATCH 6/7] Run dplyr tests

---
 test/runtests.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index 34474bf9..35758acb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -550,6 +550,7 @@ q = collect(Query.@select(source_df, i->get(i.children)))
 
 include("test_indexedtables.jl")
 include("test_pipesyntax.jl")
+include("test_dplyr-syntax.jl")
 
 end
 
@@ -576,7 +577,8 @@ end
         "../example/21-nulls.jl",
         "../example/22-datastreams-sink.jl",
         "../example/23-dict-sink.jl",
-        "../example/24-DataTable.jl"]
+        "../example/24-DataTable.jl",
+        "../example/25-ab-syntax.jl"]
 
     color = Base.have_color ? "--color=yes" : "--color=no"
     compilecache = "--compilecache=" * (Bool(Base.JLOptions().use_compilecache) ? "yes" : "no")

From 083e115a792c684666c8c0158d370e16749b4722 Mon Sep 17 00:00:00 2001
From: David Anthoff <anthoff@berkeley.edu>
Date: Fri, 21 Jul 2017 20:52:31 -0700
Subject: [PATCH 7/7] Fix use of MacroTools

---
 src/Query.jl             | 2 +-
 src/query_translation.jl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Query.jl b/src/Query.jl
index 2e5d3911..075cedcf 100644
--- a/src/Query.jl
+++ b/src/Query.jl
@@ -5,7 +5,7 @@ using NamedTuples
 using DataStructures
 using IterableTables
 using DataValues
-using MacroTools: postwalk, prewalk, @capture
+using MacroTools: postwalk
 
 import Base.start
 import Base.next
diff --git a/src/query_translation.jl b/src/query_translation.jl
index aa7ef49b..b33615ed 100644
--- a/src/query_translation.jl
+++ b/src/query_translation.jl
@@ -1,5 +1,5 @@
 function helper_namedtuples_replacement(ex)
-	return MacroTools.postwalk(ex) do x
+	return postwalk(ex) do x
 		if x isa Expr && x.head==:cell1d
 			new_ex = Expr(:macrocall, Symbol("@NT"), x.args...)
 
@@ -24,7 +24,7 @@ end
 function helper_replace_anon_func_syntax(ex)
 	if !(isa(ex, Expr) && ex.head==:->)
 		new_symb = gensym()
-		new_ex = MacroTools.postwalk(ex) do x
+		new_ex = postwalk(ex) do x
 			if isa(x, Symbol) && x==:_
 				return new_symb
 			else