Skip to content

Commit

Permalink
feat: Databricks Aggregate functions
Browse files Browse the repository at this point in the history
Signed-off-by: Andreas Reichel <andreas@manticore-projects.com>
  • Loading branch information
manticore-projects committed May 30, 2024
1 parent 973c253 commit a4491aa
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ enum TranspiledFunction {

, ANY, APPROX_PERCENTILE, ARRAY_AGG, COLLECT_LIST, COLLECT_SET, COUNT, COUNT_IF, FIRST, FIRST_VALUE, LAST, LAST_VALUE

, PERCENTILE, PERCENTILE_APPROX, REGR_INTERCEPT, REGR_SLOPE, KURTOSIS, SKEWNESS, STD
, PERCENTILE, PERCENTILE_APPROX, REGR_INTERCEPT, REGR_SLOPE, KURTOSIS, SKEWNESS, STD, NTH_VALUE

, TRY_AVG, TRY_SUM
, TRY_AVG, TRY_SUM, PERCENT_RANK

;
// @FORMATTER:ON
Expand Down Expand Up @@ -586,9 +586,29 @@ public void visit(AnalyticExpression function) {
case TRY_AVG:
warning("TRY error handling not supported.");
function.setName("Avg");
break;
case TRY_SUM:
warning("TRY error handling not supported.");
function.setName("Sum");
break;
case NTH_VALUE:
// optional trailing ignoreNulls argument: read via getDefaultValue() and mapped to IGNORE/RESPECT NULLS
if (function.getDefaultValue() != null) {
if (function.getDefaultValue().toString().equalsIgnoreCase("TRUE")) {
function.setNullHandling(Function.NullHandling.IGNORE_NULLS);
} else if (function.getDefaultValue().toString().equalsIgnoreCase("FALSE")) {
function.setNullHandling(Function.NullHandling.RESPECT_NULLS);
}
warning("ignoreNulls parameter not supported, use IGNORE/RESPECT NULLS instead.");
function.setDefaultValue(null);
}
break;
case PERCENT_RANK:
if (function.getExpression() != null) {
warning("PERCENT_RANK needs 0 parameters, got 1");
function.setExpression(null);
}
break;
}
}
if (rewrittenExpression == null) {
Expand Down
58 changes: 30 additions & 28 deletions src/main/java/ai/starlake/transpiler/schemas/SchemaProvider.java
Original file line number Diff line number Diff line change
@@ -1,34 +1,36 @@
package ai.starlake.transpiler.schemas
import java.util.Map;



package ai.starlake.transpiler.schemas;

import java.util.Map;

/**
* Source of table and field metadata, exposed as maps from names to field types (all keys and
* field types are plain {@code String}s).
*
* Declared without an access modifier, so it is only visible inside its own package.
*
* NOTE(review): {@code getTables()} documents its key as "schema name and table name", but the
* key type is a single {@code String}; how the two names are combined is not shown here --
* confirm against the implementations.
*/
interface SchemaProvider {
/**
* Get all tables in the schema
* @return Map of tables with schema name and table name as key and map of field name and field type as value
*/
Map<String, Map<String, String>> getTables();


/**
* Get all fields in the table
* @param schemaName schema name
* @param tableName table name
* @return Map of field name and field type
*/
Map<String, String> getTable(String schemaName, String tableName);

/**
* Get table regardless of schema name
* @param tableName table name
* @return Map of schema name where the table is found and map of field name and field type. Returning more than one key means
* the table is found in multiple schemas and the resolution is ambiguous.
* In the future, resolution may be done by jsqltranspiler based on the context.
*/
Map<String, Map<String, String>> getTables(String tableName);
/**
* Get all tables in the schema
*
* @return Map of tables with schema name and table name as key and map of field name and field
* type as value
*/
Map<String, Map<String, String>> getTables();


/**
* Get all fields in the table
*
* @param schemaName schema name
* @param tableName table name
* @return Map of field name and field type
*/
Map<String, String> getTable(String schemaName, String tableName);

/**
* Get table regardless of schema name
*
* @param tableName table name
* @return Map of schema name where the table is found and map of field name and field type.
* Returning more than one key means the table is found in multiple schemas and the
* resolution is ambiguous. In the future, resolution may be done by jsqltranspiler based
* on the context.
*/
Map<String, Map<String, String>> getTables(String tableName);

}

Binary file modified src/site/sphinx/_static/JSQLTranspiler.ods
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,109 @@ SELECT stddev_pop(DISTINCT col) AS stddev_pop FROM VALUES (1), (2), (3), (3) AS

-- result
"stddev_pop"
"0.816496580927726"
"0.816496580927726"


-- provided
SELECT a, b, cume_dist() OVER (PARTITION BY a ORDER BY b) AS cume_dist
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;

-- result
"a","b","cume_dist"
"A1","1","0.6666666666666666"
"A1","1","0.6666666666666666"
"A1","2","1.0"
"A2","3","1.0"


-- provided
SELECT a, b, lag(b) OVER (PARTITION BY a ORDER BY b) AS lag
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;

-- result
"a","b","lag"
"A1","1",""
"A1","1","1"
"A1","2","1"
"A2","3",""


-- provided
SELECT a, b, lead(b) OVER (PARTITION BY a ORDER BY b) AS lead
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2
;

-- result
"a","b","lead"
"A1","1","1"
"A1","1","2"
"A1","2",""
"A2","3",""


-- provided
SELECT a, b, nth_value(b, 2) OVER (PARTITION BY a ORDER BY b) AS nth_value
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;

-- result
"a","b","nth_value"
"A1","1","1"
"A1","1","1"
"A1","2","1"
"A2","3",""


-- provided
SELECT a,
b,
dense_rank() OVER(PARTITION BY a ORDER BY b) AS dense_rank,
rank() OVER(PARTITION BY a ORDER BY b) AS rank,
row_number() OVER(PARTITION BY a ORDER BY b) AS row_number
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2,3;

-- result
"a","b","dense_rank","rank","row_number"
"A1","1","1","1","1"
"A1","1","1","1","2"
"A1","2","2","3","3"
"A2","3","1","1","1"

-- provided
SELECT a, b, ntile(2) OVER (PARTITION BY a ORDER BY b) AS ntile
FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;

-- result
"a","b","ntile"
"A1","1","1"
"A1","1","1"
"A1","2","2"
"A2","3","1"


-- provided
SELECT a, b, percent_rank(b) OVER (PARTITION BY a ORDER BY b) AS percent_rank
FROM VALUES ('A1', 2), ('A1', 1), ('A1', 3), ('A1', 6), ('A1', 7), ('A1', 7), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;

-- expected
SELECT a, b, percent_rank() OVER (PARTITION BY a ORDER BY b) AS percent_rank
FROM VALUES ('A1', 2), ('A1', 1), ('A1', 3), ('A1', 6), ('A1', 7), ('A1', 7), ('A2', 3), ('A1', 1) tab(a, b)
ORDER BY 1,2;


-- result
"a","b","percent_rank"
"A1","1","0.0"
"A1","1","0.0"
"A1","2","0.3333333333333333"
"A1","3","0.5"
"A1","6","0.6666666666666666"
"A1","7","0.8333333333333334"
"A1","7","0.8333333333333334"
"A2","3","0.0"

0 comments on commit a4491aa

Please sign in to comment.