Skip to content

Commit

Permalink
fix: complete DataBricks text functions
Browse files Browse the repository at this point in the history
Signed-off-by: Andreas Reichel <andreas@manticore-projects.com>
  • Loading branch information
manticore-projects committed May 6, 2024
1 parent 9f312ca commit 66a3720
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import ai.starlake.transpiler.JSQLTranspiler;
import ai.starlake.transpiler.redshift.RedshiftExpressionTranspiler;
import net.sf.jsqlparser.expression.AnalyticExpression;
import net.sf.jsqlparser.expression.ArrayExpression;
import net.sf.jsqlparser.expression.BinaryExpression;
import net.sf.jsqlparser.expression.CastExpression;
import net.sf.jsqlparser.expression.Expression;
Expand All @@ -36,8 +37,7 @@ public DatabricksExpressionTranspiler(JSQLTranspiler transpiler, StringBuilder b

enum TranspiledFunction {
// @FORMATTER:OFF
DATE_FROM_PARTS, BINARY, BITMAP_COUNT, BTRIM, CHAR, CHAR_LENGTH, CHARACTER_LENGTH, CHARINDEX, ENDSWITH, STARTSWITH
, FIND_IN_SET, LEVENSHTEIN, LOCATE, LTRIM, RTRIM, POSITION, REGEXP_REGEX, REGEXP_LIKE, REGEXP_EXTRACT, REGEXP_SUBSTR, SHA2, SPACE, SPLIT, STRING, SUBSTR
DATE_FROM_PARTS, BINARY, BITMAP_COUNT, BTRIM, CHAR, CHAR_LENGTH, CHARACTER_LENGTH, CHARINDEX, ENDSWITH, STARTSWITH, FIND_IN_SET, LEVENSHTEIN, LOCATE, LTRIM, RTRIM, POSITION, REGEXP_REGEX, REGEXP_LIKE, REGEXP_EXTRACT, REGEXP_SUBSTR, SHA2, SPACE, SPLIT, STRING, SUBSTR, SUBSTRING_INDEX, TRY_TO_BINARY, TO_BINARY, UNBASE64, ENCODE, DECODE


, ARRAY
Expand Down Expand Up @@ -96,7 +96,7 @@ public void visit(Function function) {

if (UnsupportedFunction.from(function) != null) {
throw new RuntimeException(
"Unsupported: " + functionName + " is not supported by DuckDB (yet).");
"Unsupported: " + functionName + " is not supported by DuckDB (yet).");
} else if (functionName.endsWith("$$")) {
// workaround to avoid transpiling already-transpiled functions a second time
// @todo: figure out a better way to achieve that
Expand Down Expand Up @@ -125,7 +125,7 @@ public void visit(Function function) {
function.setName("Encode");
break;
case BITMAP_COUNT:
if (paramCount==1) {
if (paramCount == 1) {
function.setName("Bit_Count");
function.setParameters(new CastExpression(parameters.get(0), "BIT"));
}
Expand All @@ -151,11 +151,9 @@ public void visit(Function function) {
case 3:
// ifplus( instr(substr('abcbarbar', 5), 'bar'), 0, 5-1)
function.setName("IfPlus");
function.setParameters(
new Function("InStr", new Function("SubStr", parameters.get(1), parameters.get(2)), parameters.get(0))
, new LongValue(0)
, BinaryExpression.subtract( parameters.get(2), new LongValue(1))
);
function.setParameters(new Function("InStr",
new Function("SubStr", parameters.get(1), parameters.get(2)), parameters.get(0)),
new LongValue(0), BinaryExpression.subtract(parameters.get(2), new LongValue(1)));
break;
}
break;
Expand All @@ -169,25 +167,26 @@ public void visit(Function function) {
function.setName("Starts_With");
break;
case FIND_IN_SET:
//list_position(str_split_regex('abc,b,ab,c,def', ','), 'ab')
if (paramCount==2) {
// list_position(str_split_regex('abc,b,ab,c,def', ','), 'ab')
if (paramCount == 2) {
function.setName("List_position");
function.setParameters(
new Function("Str_Split_Regex", parameters.get(1), new StringValue(","))
, parameters.get(0)
);
new Function("Str_Split_Regex", parameters.get(1), new StringValue(",")),
parameters.get(0));
}
break;
case LEVENSHTEIN:
if (paramCount==3) {
function.setName("Least");
function.setParameters( new Function("Levenshtein", parameters.get(0), parameters.get(1)), parameters.get(2));
if (paramCount == 3) {
function.setName("Least");
function.setParameters(
new Function("Levenshtein", parameters.get(0), parameters.get(1)),
parameters.get(2));
}
break;
case LTRIM:
case RTRIM:
if (paramCount==2) {
function.setParameters( parameters.get(1), parameters.get(0));
if (paramCount == 2) {
function.setParameters(parameters.get(1), parameters.get(0));
}
break;
case REGEXP_REGEX:
Expand All @@ -208,14 +207,14 @@ public void visit(Function function) {
}
break;
case SHA2:
if (paramCount==2) {
if (paramCount == 2) {
warning("Only 256bits supported.");
function.setName("Sha256");
function.setParameters(parameters.get(0));
}
break;
case SPACE:
if (paramCount==1) {
if (paramCount == 1) {
function.setName("Repeat");
function.setParameters(new StringValue(" "), parameters.get(0));
}
Expand All @@ -231,13 +230,55 @@ public void visit(Function function) {
}
break;
case STRING:
if (paramCount==1) {
if (paramCount == 1) {
rewrittenExpression = new CastExpression(parameters.get(0), "VARCHAR");
}
break;
case SUBSTR:
function.setName("SubString");
break;
case SUBSTRING_INDEX:
// substring_index('www.apache.org', '.', 2)
// list_aggregate(regexp_split_to_array('www.apache.org', regexp_escape('.'))[1:2],
// 'string_agg', '.')

if (paramCount == 3) {
function.setName("List_aggregate");
function.setParameters(new ArrayExpression(
new Function("RegExp_Split_To_Array", parameters.get(0),
new Function("RegExp_Escape", parameters.get(1))),
new LongValue(1), parameters.get(2)), new StringValue("string_agg"),
parameters.get(1));
}
break;
case TRY_TO_BINARY:
warning("TRY is not supported.");
case TO_BINARY:
switch (paramCount) {
case 2:
String p = parameters.get(1).toString().toLowerCase();
if (p.equals("'hex'")) {
function.setName("UnHex");
function.setParameters(parameters.get(0));
} else if (p.equals("'base64'")) {
function.setName("From_Base64");
function.setParameters(parameters.get(0));
} else if (p.equals("'utf-8'")) {
function.setName("Encode");
function.setParameters(parameters.get(0));
}
break;
case 1:
function.setName("UnHex");
}
break;
case ENCODE:
case DECODE:
if (paramCount == 2) {
warning("CHARSET parameter not supported.");
function.setParameters(parameters.get(0));
}
break;
}
}
if (rewrittenExpression == null) {
Expand All @@ -252,7 +293,7 @@ public void visit(AnalyticExpression function) {

if (UnsupportedFunction.from(function) != null) {
throw new RuntimeException(
"Unsupported: " + functionName + " is not supported by DuckDB (yet).");
"Unsupported: " + functionName + " is not supported by DuckDB (yet).");
} else if (functionName.endsWith("$$")) {
// workaround to avoid transpiling already-transpiled functions a second time
// @todo: figure out a better way to achieve that
Expand All @@ -261,7 +302,7 @@ public void visit(AnalyticExpression function) {
return;
}

if (function.getNullHandling()!=null && function.isIgnoreNullsOutside()) {
if (function.getNullHandling() != null && function.isIgnoreNullsOutside()) {
function.setIgnoreNullsOutside(false);
}

Expand Down
Binary file modified src/main/resources/doc/JSQLTranspiler.ods
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,69 @@ SELECT SubString('Spark SQL' FROM 5 FOR 1) AS s;
"k"


-- provided
SELECT substring_index('www.apache.org', '.', 2) AS i;

-- expected
select list_aggregate(regexp_split_to_array('www.apache.org', regexp_escape('.'))[1:2], 'string_agg', '.') AS i;

-- result
"i"
"www.apache"


-- provided
SELECT cast(to_binary('537061726B') AS STRING) AS s;

-- expected
SELECT cast(unhex('537061726B') AS String) AS s;

-- result
"s"
"Spark"


-- provided
SELECT cast(to_binary('537061726B', 'hex') AS STRING) AS s;

-- expected
SELECT cast(unhex('537061726B') AS String) AS s;

-- result
"s"
"Spark"

-- provided
SELECT cast(to_binary('U3Bhcms=', 'base64') AS STRING) AS s;

-- expected
SELECT cast(From_Base64('U3Bhcms=') AS STRING) AS s;

-- result
"s"
"Spark"


-- provided
SELECT hex(to_binary('서울시(Seoul)', 'UTF-8')) AS s;

-- expected
SELECT Hex(encode('서울시(Seoul)')) AS s;

-- result
"s"
"EC849CEC9AB8EC8B9C2853656F756C29"


-- provided
SELECT cast(unbase64('U3BhcmsgU1FM') AS STRING) AS s;

-- expected
SELECT cast(From_Base64('U3BhcmsgU1FM') AS STRING) AS s;

-- result
"s"
"Spark SQL"



Expand Down

0 comments on commit 66a3720

Please sign in to comment.