From 9782f1e695dff5cf20bdba23d261b885b3b967b6 Mon Sep 17 00:00:00 2001 From: Richard Tia Date: Fri, 23 Sep 2022 14:24:57 -0700 Subject: [PATCH 1/2] feat: add functions for splitting strings --- extensions/functions_string.yaml | 122 ++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 27 deletions(-) diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml index a2bbb43a4..a197a007e 100644 --- a/extensions/functions_string.yaml +++ b/extensions/functions_string.yaml @@ -101,13 +101,13 @@ scalar_functions: impls: - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "varchar" name: "input" @@ -120,13 +120,13 @@ scalar_functions: return: "varchar" - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "string" name: "input" @@ -523,13 +523,13 @@ scalar_functions: impls: - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ 
DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "varchar" name: "input" @@ -542,13 +542,13 @@ scalar_functions: return: i64 - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "string" name: "input" @@ -620,13 +620,13 @@ scalar_functions: impls: - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "string" name: "input" @@ -637,13 +637,13 @@ scalar_functions: return: i64 - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "varchar" name: "input" @@ -654,13 +654,13 @@ scalar_functions: return: i64 - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: 
multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "fixedchar" name: "input" @@ -1015,13 +1015,13 @@ scalar_functions: impls: - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "string" name: "input" @@ -1041,13 +1041,13 @@ scalar_functions: return: "string" - args: - name: case_sensitivity - options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII] + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] required: false - name: multiline - options: [ MULTILINE_DISABLED, MULTILINE_ENABLED] + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] required: false - name: dotall - options: [ DOTALL_DISABLED, DOTALL_ENABLED] + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] required: false - value: "varchar" name: "input" @@ -1263,6 +1263,74 @@ scalar_functions: - value: i32 name: "count" return: "string" + - + name: string_split + description: >- + Split a string into a list of strings, based on a specified `separator` character. + impls: + - args: + - value: "varchar<L1>" + name: "input" + description: The input string. + - value: "varchar<L2>" + name: "separator" + description: A character used for splitting the string. + return: "List<varchar<L1>>" + - args: + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "separator" + description: A character used for splitting the string. 
+ return: "List<string>" + - + name: regex_string_split + description: >- + Split a string into a list of strings, based on a regular expression pattern. The + regular expression pattern should follow the International Components for Unicode + implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html). + + The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. + Enabling the `multiline` option will treat the input string as multiple lines. This makes + the `^` and `$` characters match at the beginning and end of any line, instead of just the + beginning and end of the input string. Enabling the `dotall` option makes the `.` character + match line terminator characters in a string. + impls: + - args: + - name: case_sensitivity + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + required: false + - name: multiline + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + required: false + - name: dotall + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] + required: false + - value: "varchar<L1>" + name: "input" + description: The input string. + - value: "varchar<L2>" + name: "pattern" + description: The regular expression to search for within the input string. + return: "List<varchar<L1>>" + - args: + - name: case_sensitivity + options: [ CASE_SENSITIVE, CASE_INSENSITIVE, CASE_INSENSITIVE_ASCII ] + required: false + - name: multiline + options: [ MULTILINE_DISABLED, MULTILINE_ENABLED ] + required: false + - name: dotall + options: [ DOTALL_DISABLED, DOTALL_ENABLED ] + required: false + - value: "string" + name: "input" + description: The input string. + - value: "string" + name: "pattern" + description: The regular expression to search for within the input string. 
+ return: "List<string>" aggregate_functions: From 2f4fd59a2fe0ad5885d80f932ec8e9e366b813f1 Mon Sep 17 00:00:00 2001 From: Richard Tia Date: Mon, 26 Sep 2022 09:13:37 -0700 Subject: [PATCH 2/2] fix: update description of regex_string_split --- extensions/functions_string.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extensions/functions_string.yaml b/extensions/functions_string.yaml index a197a007e..896fb3c2e 100644 --- a/extensions/functions_string.yaml +++ b/extensions/functions_string.yaml @@ -1288,8 +1288,10 @@ scalar_functions: name: regex_string_split description: >- Split a string into a list of strings, based on a regular expression pattern. The - regular expression pattern should follow the International Components for Unicode - implementation (https://unicode-org.github.io/icu/userguide/strings/regexp.html). + substrings matched by the pattern will be used as the separators to split the input + string and will not be included in the resulting list. The regular expression + pattern should follow the International Components for Unicode implementation + (https://unicode-org.github.io/icu/userguide/strings/regexp.html). The `case_sensitivity` option specifies case-sensitive or case-insensitive matching. Enabling the `multiline` option will treat the input string as multiple lines. This makes