feat: Databricks Aggregate functions

Signed-off-by: Andreas Reichel <andreas@manticore-projects.com>
starlake-ai · May 30, 2024 · a4491aa · a4491aa
1 parent 973c253
commit a4491aa
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 31 deletions.
diff --git a/src/main/java/ai/starlake/transpiler/databricks/DatabricksExpressionTranspiler.java b/src/main/java/ai/starlake/transpiler/databricks/DatabricksExpressionTranspiler.java
@@ -59,9 +59,9 @@ enum TranspiledFunction {
 
     , ANY, APPROX_PERCENTILE, ARRAY_AGG, COLLECT_LIST, COLLECT_SET, COUNT, COUNT_IF, FIRST, FIRST_VALUE, LAST, LAST_VALUE
 
-    , PERCENTILE, PERCENTILE_APPROX, REGR_INTERCEPT, REGR_SLOPE, KURTOSIS, SKEWNESS, STD
+    , PERCENTILE, PERCENTILE_APPROX, REGR_INTERCEPT, REGR_SLOPE, KURTOSIS, SKEWNESS, STD, NTH_VALUE
 
-    , TRY_AVG, TRY_SUM
+    , TRY_AVG, TRY_SUM, PERCENT_RANK
 
     ;
     // @FORMATTER:ON
@@ -586,9 +586,29 @@ public void visit(AnalyticExpression function) {
         case TRY_AVG:
           warning("TRY error handling not supported.");
           function.setName("Avg");
+          break;
         case TRY_SUM:
           warning("TRY error handling not supported.");
           function.setName("Sum");
+          break;
+        case NTH_VALUE:
+          // trailing ignoreNulls argument is read as the default value; map to IGNORE/RESPECT NULLS
+          if (function.getDefaultValue() != null) {
+            if (function.getDefaultValue().toString().equalsIgnoreCase("TRUE")) {
+              function.setNullHandling(Function.NullHandling.IGNORE_NULLS);
+            } else if (function.getDefaultValue().toString().equalsIgnoreCase("FALSE")) {
+              function.setNullHandling(Function.NullHandling.RESPECT_NULLS);
+            }
+            warning("ignoreNulls parameter not supported, use IGNORE/RESPECT NULLS instead.");
+            function.setDefaultValue(null);
+          }
+          break;
+        case PERCENT_RANK:
+          if (function.getExpression() != null) {
+            warning("PERCENT_RANK needs 0 parameters, got 1");
+            function.setExpression(null);
+          }
+          break;
       }
     }
     if (rewrittenExpression == null) {

diff --git a/src/main/java/ai/starlake/transpiler/schemas/SchemaProvider.java b/src/main/java/ai/starlake/transpiler/schemas/SchemaProvider.java
@@ -1,34 +1,36 @@
-package ai.starlake.transpiler.schemas
-import java.util.Map;
-
-
-
+package ai.starlake.transpiler.schemas;
 
+import java.util.Map;
 
 interface SchemaProvider {
-    /**
-     * Get all tables in the schema
-     * @return Map of tables with schema name and table name as key and map of field name and field type as value
-     */
-    Map<String, Map<String, String>> getTables();
-
-
-    /**
-     * Get all fields in the table
-     * @param schemaName schema name
-     * @param tableName table name
-     * @return Map of field name and field type
-     */
-    Map<String, String> getTable(String schemaName, String tableName);
-
-    /**
-     * Get table regardless of schema name
-     * @param tableName table name
-     * @return Map of schema name where the table is found and map of field name and field type. Returning more than one key means
-     * the table is found in multiple schemas and the resolution is ambiguous.
-     * In the future, resolution may be done by jsqltranspiler based on the context.
-     */
-    Map<String, Map<String, String>> getTables(String tableName);
+  /**
+   * Get all tables in the schema
+   * 
+   * @return Map of tables with schema name and table name as key and map of field name and field
+   *         type as value
+   */
+  Map<String, Map<String, String>> getTables();
+
+
+  /**
+   * Get all fields in the table
+   * 
+   * @param schemaName schema name
+   * @param tableName table name
+   * @return Map of field name and field type
+   */
+  Map<String, String> getTable(String schemaName, String tableName);
+
+  /**
+   * Get all tables with the given name, regardless of schema name
+   * 
+   * @param tableName table name
+   * @return Map of schema name where the table is found and map of field name and field type.
+   *         Returning more than one key means the table is found in multiple schemas and the
+   *         resolution is ambiguous. In the future, resolution may be done by jsqltranspiler based
+   *         on the context.
+   */
+  Map<String, Map<String, String>> getTables(String tableName);
 
 }
 
diff --git a/src/site/sphinx/_static/JSQLTranspiler.ods b/src/site/sphinx/_static/JSQLTranspiler.ods
diff --git a/src/test/resources/ai/starlake/transpiler/databricks/aggregate_function.sql b/src/test/resources/ai/starlake/transpiler/databricks/aggregate_function.sql
@@ -366,4 +366,109 @@ SELECT stddev_pop(DISTINCT col) AS stddev_pop FROM VALUES (1), (2), (3), (3) AS
 
 -- result
 "stddev_pop"
-"0.816496580927726"
+"0.816496580927726"
+
+
+-- provided
+SELECT a, b, cume_dist() OVER (PARTITION BY a ORDER BY b) AS cume_dist
+    FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+    ORDER BY 1,2;
+
+-- result
+"a","b","cume_dist"
+"A1","1","0.6666666666666666"
+"A1","1","0.6666666666666666"
+"A1","2","1.0"
+"A2","3","1.0"
+
+
+-- provided
+SELECT a, b, lag(b) OVER (PARTITION BY a ORDER BY b) AS lag
+    FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+    ORDER BY 1,2;
+
+-- result
+"a","b","lag"
+"A1","1",""
+"A1","1","1"
+"A1","2","1"
+"A2","3",""
+
+
+-- provided
+SELECT a, b, lead(b) OVER (PARTITION BY a ORDER BY b) AS lead
+    FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+    ORDER BY 1,2
+    ;
+
+-- result
+"a","b","lead"
+"A1","1","1"
+"A1","1","2"
+"A1","2",""
+"A2","3",""
+
+
+-- provided
+SELECT a, b, nth_value(b, 2) OVER (PARTITION BY a ORDER BY b) AS nth_value
+    FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+ORDER BY 1,2;
+
+-- result
+"a","b","nth_value"
+"A1","1","1"
+"A1","1","1"
+"A1","2","1"
+"A2","3",""
+
+
+-- provided
+SELECT a,
+         b,
+         dense_rank() OVER(PARTITION BY a ORDER BY b) AS dense_rank,
+         rank() OVER(PARTITION BY a ORDER BY b) AS rank,
+         row_number() OVER(PARTITION BY a ORDER BY b) AS row_number
+    FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+ORDER BY 1,2,3;
+
+-- result
+"a","b","dense_rank","rank","row_number"
+"A1","1","1","1","1"
+"A1","1","1","1","2"
+"A1","2","2","3","3"
+"A2","3","1","1","1"
+
+-- provided
+ SELECT a, b, ntile(2) OVER (PARTITION BY a ORDER BY b) AS ntile
+ FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b)
+ ORDER BY 1,2;
+
+-- result
+"a","b","ntile"
+"A1","1","1"
+"A1","1","1"
+"A1","2","2"
+"A2","3","1"
+
+
+-- provided
+SELECT a, b, percent_rank(b) OVER (PARTITION BY a ORDER BY b) AS percent_rank
+    FROM VALUES ('A1', 2), ('A1', 1), ('A1', 3), ('A1', 6), ('A1', 7), ('A1', 7), ('A2', 3), ('A1', 1) tab(a, b)
+    ORDER BY 1,2;
+
+-- expected
+SELECT a, b, percent_rank() OVER (PARTITION BY a ORDER BY b) AS percent_rank
+    FROM VALUES ('A1', 2), ('A1', 1), ('A1', 3), ('A1', 6), ('A1', 7), ('A1', 7), ('A2', 3), ('A1', 1) tab(a, b)
+    ORDER BY 1,2;
+
+
+-- result
+"a","b","percent_rank"
+"A1","1","0.0"
+"A1","1","0.0"
+"A1","2","0.3333333333333333"
+"A1","3","0.5"
+"A1","6","0.6666666666666666"
+"A1","7","0.8333333333333334"
+"A1","7","0.8333333333333334"
+"A2","3","0.0"