Skip to content

Commit

Permalink
SQL: Fix bug regarding histograms usage in scripting (#36866)
Browse files Browse the repository at this point in the history
Allow scripts to correctly reference grouping functions
Fix bug in translation of date/time functions mixed with histograms.
Enhance Verifier to prevent histograms being nested inside other
 functions inside GROUP BY (as it implies double grouping)
Extend Histogram docs

(cherry picked from commit ac032a0)
(cherry picked from commit 3a0fd4c)
  • Loading branch information
costin committed Dec 20, 2018
1 parent b8d6c85 commit 3f18092
Show file tree
Hide file tree
Showing 15 changed files with 234 additions and 47 deletions.
24 changes: 24 additions & 0 deletions docs/reference/sql/functions/grouping.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ The histogram function takes all matching values and divides them into buckets w
bucket_key = Math.floor(value / interval) * interval
----

NOTE: The histogram in SQL does *NOT* return empty buckets for missing intervals, unlike the traditional <<search-aggregations-bucket-histogram-aggregation, histogram>> and <<search-aggregations-bucket-datehistogram-aggregation, date histogram>> aggregations. Such behavior does not fit conceptually in SQL, which treats all missing values as `NULL`; as such, the histogram places all missing values in the `NULL` group.

`Histogram` can be applied on either numeric fields:


Expand All @@ -51,4 +53,26 @@ or date/time fields:
include-tagged::{sql-specs}/docs.csv-spec[histogramDate]
----

Expressions inside the histogram are also supported as long as the
return type is numeric:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramNumericExpression]
----

Do note that histograms (and grouping functions in general) allow custom expressions but cannot have any functions applied to them in the `GROUP BY`. In other words, the following statement is *NOT* allowed:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[expressionOnHistogramNotAllowed]
----

as it requires two groupings (one for histogram followed by a second for applying the function on top of the histogram groups).

Instead, one can rewrite the query so that the expression applied on top of the histogram is moved _inside_ of it:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramDateExpression]
----
50 changes: 46 additions & 4 deletions x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,51 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
null |10
;

histogramDateWithDateFunction-Ignore
SELECT YEAR(HISTOGRAM(birth_date, INTERVAL 1 YEAR)) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
histogramDateWithMonthOnTop
schema::h:i|c:l
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10
;

histogramDateWithYearOnTop
schema::h:i|c:l
SELECT HISTOGRAM(YEAR(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
1964 |5
1962 |13
1960 |16
1958 |16
1956 |9
1954 |12
1952 |19
null |10
;



histogramNumericWithExpression
schema::h:i|c:l
SELECT HISTOGRAM(emp_no % 100, 10) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
90 |10
80 |10
70 |10
60 |10
50 |10
40 |10
30 |10
20 |10
10 |10
0 |10
;
45 changes: 45 additions & 0 deletions x-pack/plugin/sql/qa/src/main/resources/docs.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,27 @@ SELECT HISTOGRAM(salary, 5000) AS h FROM emp GROUP BY h;
// end::histogramNumeric
;

histogramNumericExpression
schema::h:i|c:l
// tag::histogramNumericExpression
SELECT HISTOGRAM(salary % 100, 10) AS h, COUNT(*) AS c FROM emp GROUP BY h;

h | c
---------------+---------------
0 |10
10 |15
20 |10
30 |14
40 |9
50 |9
60 |8
70 |13
80 |3
90 |9

// end::histogramNumericExpression
;

histogramDate
schema::h:ts|c:l
// tag::histogramDate
Expand Down Expand Up @@ -752,6 +773,30 @@ null |10
// end::histogramDate
;

expressionOnHistogramNotAllowed-Ignore
// Documentation-only example of a statement that is NOT allowed (a function applied on top of
// a histogram inside GROUP BY); the -Ignore suffix keeps it from being executed.
// The case is closed with a lone ';' line like every other case in this spec.
// tag::expressionOnHistogramNotAllowed
SELECT MONTH(HISTOGRAM(birth_date, 2)) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
// end::expressionOnHistogramNotAllowed
;

histogramDateExpression
schema::h:i|c:l
// tag::histogramDateExpression
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10

// end::histogramDateExpression
;

///////////////////////////////
//
// Date/Time
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.Functions;
import org.elasticsearch.xpack.sql.expression.function.Score;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In;
Expand Down Expand Up @@ -224,6 +226,7 @@ Collection<Failure> verify(LogicalPlan plan) {
validateConditional(p, localFailures);

checkFilterOnAggs(p, localFailures);
checkFilterOnGrouping(p, localFailures);

if (!groupingFailures.contains(p)) {
checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures);
Expand Down Expand Up @@ -419,7 +422,7 @@ private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Node<?> sourc
return true;
}
// skip aggs (allowed to refer to non-group columns)
if (Functions.isAggregate(e)) {
if (Functions.isAggregate(e) || Functions.isGrouping(e)) {
return true;
}

Expand Down Expand Up @@ -448,6 +451,21 @@ private static boolean checkGroupByAgg(LogicalPlan p, Set<Failure> localFailures
}
}));

a.groupings().forEach(e -> {
if (Functions.isGrouping(e) == false) {
e.collectFirstChildren(c -> {
if (Functions.isGrouping(c)) {
localFailures.add(fail(c,
"Cannot combine [%s] grouping function inside GROUP BY, found [%s];"
+ " consider moving the expression inside the histogram",
Expressions.name(c), Expressions.name(e)));
return true;
}
return false;
});
}
});

if (!localFailures.isEmpty()) {
return false;
}
Expand Down Expand Up @@ -547,19 +565,30 @@ private static void checkFilterOnAggs(LogicalPlan p, Set<Failure> localFailures)
if (p instanceof Filter) {
Filter filter = (Filter) p;
if ((filter.child() instanceof Aggregate) == false) {
filter.condition().forEachDown(f -> {
if (Functions.isAggregate(f) || Functions.isGrouping(f)) {
String type = Functions.isAggregate(f) ? "aggregate" : "grouping";
localFailures.add(fail(f,
"Cannot use WHERE filtering on %s function [%s], use HAVING instead", type, Expressions.name(f)));
filter.condition().forEachDown(e -> {
if (Functions.isAggregate(e) || e instanceof AggregateFunctionAttribute) {
localFailures.add(
fail(e, "Cannot use WHERE filtering on aggregate function [%s], use HAVING instead", Expressions.name(e)));
}

}, Function.class);
}, Expression.class);
}
}
}


/**
 * Rejects filters whose condition refers to a grouping function.
 *
 * Walks the condition of a {@link Filter} top-down and records one failure for every
 * expression that is either a grouping function or a {@link GroupingFunctionAttribute}.
 * Unlike the aggregate check, this one does not look at the filter's child: it applies
 * to every filter. Plans that are not filters are ignored.
 *
 * @param p             the plan node to inspect
 * @param localFailures collector receiving the failures found on this node
 */
private static void checkFilterOnGrouping(LogicalPlan p, Set<Failure> localFailures) {
    // nothing to verify unless the node is a filter
    if ((p instanceof Filter) == false) {
        return;
    }

    ((Filter) p).condition().forEachDown(node -> {
        boolean refersToGrouping = Functions.isGrouping(node) || node instanceof GroupingFunctionAttribute;
        if (refersToGrouping) {
            localFailures.add(
                    fail(node, "Cannot filter on grouping function [%s], use its argument instead", Expressions.name(node)));
        }
    }, Expression.class);
}


private static void checkForScoreInsideFunctions(LogicalPlan p, Set<Failure> localFailures) {
// Make sure that SCORE is only used in "top level" functions
p.forEachExpressions(e ->
Expand Down Expand Up @@ -647,4 +676,4 @@ private static boolean areTypesCompatible(DataType left, DataType right) {
(left.isNumeric() && right.isNumeric());
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,9 @@ public static Integer weekOfYear(Object dateTime, String tzId) {
}

public static ZonedDateTime asDateTime(Object dateTime) {
if (dateTime == null) {
return null;
}
if (dateTime instanceof JodaCompatibleZonedDateTime) {
return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.expression.gen.script;

import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;

/**
 * Script parameter that wraps a reference to a grouping function
 * (a {@link GroupingFunctionAttribute}).
 */
class Grouping extends Param<GroupingFunctionAttribute> {

    Grouping(GroupingFunctionAttribute attribute) {
        super(attribute);
    }

    /**
     * @return the function id of the wrapped grouping attribute
     */
    String groupName() {
        GroupingFunctionAttribute attribute = value();
        return attribute.functionId();
    }

    /**
     * @return the prefix used for grouping parameters, {@code "g"}
     */
    @Override
    public String prefix() {
        return "g";
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,25 +85,15 @@ Map<String, String> asAggPaths() {
String s = a.aggProperty() != null ? a.aggProperty() : a.aggName();
map.put(p.prefix() + aggs++, s);
}
}

return map;
}

// return the agg refs
List<String> asAggRefs() {
List<String> refs = new ArrayList<>();

for (Param<?> p : params) {
if (p instanceof Agg) {
refs.add(((Agg) p).aggName());
if (p instanceof Grouping) {
Grouping g = (Grouping) p;
map.put(p.prefix() + aggs++, g.groupName());
}
}

return refs;
return map;
}


private static List<Param<?>> flatten(List<Param<?>> params) {
List<Param<?>> flatten = emptyList();

Expand All @@ -116,6 +106,9 @@ private static List<Param<?>> flatten(List<Param<?>> params) {
else if (p instanceof Agg) {
flatten.add(p);
}
else if (p instanceof Grouping) {
flatten.add(p);
}
else if (p instanceof Var) {
flatten.add(p);
}
Expand All @@ -131,4 +124,4 @@ else if (p instanceof Var) {
public String toString() {
return params.toString();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.sql.expression.gen.script;

import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -28,6 +29,11 @@ public ParamsBuilder agg(AggregateFunctionAttribute agg) {
return this;
}

/**
 * Appends a parameter that refers to the given grouping function.
 *
 * @param grouping the grouping function attribute to reference
 * @return this builder, to allow chaining
 */
public ParamsBuilder grouping(GroupingFunctionAttribute grouping) {
    Grouping groupingParam = new Grouping(grouping);
    params.add(groupingParam);
    return this;
}

public ParamsBuilder script(Params ps) {
params.add(new Script(ps));
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@ public Params params() {
return params;
}

public List<String> aggRefs() {
return params.asAggRefs();
}

public Map<String, String> aggPaths() {
return params.asAggPaths();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.xpack.sql.expression.Expressions;
import org.elasticsearch.xpack.sql.expression.FieldAttribute;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime;
import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
Expand All @@ -37,6 +38,9 @@ default ScriptTemplate asScript(Expression exp) {
if (attr instanceof AggregateFunctionAttribute) {
return scriptWithAggregate((AggregateFunctionAttribute) attr);
}
if (attr instanceof GroupingFunctionAttribute) {
return scriptWithGrouping((GroupingFunctionAttribute) attr);
}
if (attr instanceof FieldAttribute) {
return scriptWithField((FieldAttribute) attr);
}
Expand Down Expand Up @@ -83,6 +87,16 @@ default ScriptTemplate scriptWithAggregate(AggregateFunctionAttribute aggregate)
dataType());
}

/**
 * Builds the script template used when the expression refers to a grouping function.
 *
 * The grouping is passed to the script as a single parameter. When the grouping's data type
 * is {@code DataType.DATE}, the parameter is wrapped in {@code {sql}.asDateTime(...)};
 * otherwise it is used as-is. The resulting template is typed with this generator's
 * {@code dataType()}.
 *
 * @param grouping the grouping function attribute being referenced
 * @return the script template for the grouping reference
 */
default ScriptTemplate scriptWithGrouping(GroupingFunctionAttribute grouping) {
    boolean isDate = grouping.dataType() == DataType.DATE;
    // date groupings go through asDateTime, everything else is referenced directly
    String template = isDate ? "{sql}.asDateTime({})" : "{}";

    return new ScriptTemplate(
            processScript(template),
            paramsBuilder().grouping(grouping).build(),
            dataType());
}

default ScriptTemplate scriptWithField(FieldAttribute field) {
return new ScriptTemplate(processScript("doc[{}].value"),
paramsBuilder().variable(field.name()).build(),
Expand Down
Loading

0 comments on commit 3f18092

Please sign in to comment.