Skip to content

Commit

Permalink
[SPARK-49611][SQL] Introduce TVF all_collations()
Browse files Browse the repository at this point in the history
  • Loading branch information
panbingkun committed Sep 12, 2024
1 parent 8023504 commit 48c614e
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,7 @@ object TableFunctionRegistry {
generator[PosExplode]("posexplode"),
generator[PosExplode]("posexplode_outer", outer = true),
generator[Stack]("stack"),
generator[AllCollations]("all_collations"),
generator[SQLKeywords]("sql_keywords"),
generator[VariantExplode]("variant_explode"),
generator[VariantExplode]("variant_explode_outer", outer = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions

import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
Expand All @@ -28,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter}
import org.apache.spark.sql.catalyst.trees.TreePattern.{GENERATOR, TreePattern}
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, MapData}
import org.apache.spark.sql.catalyst.util.SQLKeywordUtils._
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.internal.SQLConf
Expand Down Expand Up @@ -618,3 +619,45 @@ case class SQLKeywords() extends LeafExpression with Generator with CodegenFallb

override def prettyName: String = "sql_keywords"
}

/**
 * Generator behind the `all_collations` table-valued function.
 *
 * Emits one row for every entry returned by `CollationFactory.listCollations()`, filled in from
 * the metadata that `CollationFactory.loadCollationMeta` yields for that entry. The expression
 * takes no arguments and does not read its input row.
 */
@ExpressionDescription(
usage = """_FUNC_() - Get Spark SQL all collations""",
examples = """
Examples:
> SELECT * FROM _FUNC_() LIMIT 2;
SYSTEM BUILTIN UTF8_BINARY NULL NULL ACCENT_SENSITIVE CASE_SENSITIVE NO_PAD NULL
SYSTEM BUILTIN UTF8_LCASE NULL NULL ACCENT_SENSITIVE CASE_INSENSITIVE NO_PAD NULL
""",
since = "4.0.0",
group = "generator_funcs")
case class AllCollations() extends LeafExpression with Generator with CodegenFallback {

  override def prettyName: String = "all_collations"

  /**
   * Nine string columns, in output order. Only LANGUAGE, COUNTRY and ICU_VERSION are nullable.
   */
  override def elementSchema: StructType = {
    // (column name, nullable) pairs; the order here is the column order of each produced row.
    val columns = Seq(
      "COLLATION_CATALOG" -> false,
      "COLLATION_SCHEMA" -> false,
      "COLLATION_NAME" -> false,
      "LANGUAGE" -> true,
      "COUNTRY" -> true,
      "ACCENT_SENSITIVITY" -> false,
      "CASE_SENSITIVITY" -> false,
      "PAD_ATTRIBUTE" -> false,
      "ICU_VERSION" -> true)

    columns.foldLeft(new StructType()) { case (schema, (columnName, isNullable)) =>
      schema.add(columnName, StringType, isNullable)
    }
  }

  /**
   * Builds one row per listed collation.
   *
   * @param input ignored; this generator has no children to evaluate against it
   * @return the rows, in the order the collations are listed by `CollationFactory`
   */
  override def eval(input: InternalRow): IterableOnce[InternalRow] = {
    // Used for the nullable columns: an absent Java string must surface as SQL NULL.
    def orNull(text: String): UTF8String =
      if (text == null) null else UTF8String.fromString(text)

    for (identifier <- CollationFactory.listCollations().asScala) yield {
      val meta = CollationFactory.loadCollationMeta(identifier)
      // The two boolean flags are rendered as fixed labels rather than true/false.
      val accentLabel = if (meta.accentSensitivity) "ACCENT_SENSITIVE" else "ACCENT_INSENSITIVE"
      val caseLabel = if (meta.caseSensitivity) "CASE_SENSITIVE" else "CASE_INSENSITIVE"

      InternalRow(
        UTF8String.fromString(meta.catalog),
        UTF8String.fromString(meta.schema),
        UTF8String.fromString(meta.collationName),
        orNull(meta.language),
        orNull(meta.country),
        UTF8String.fromString(accentLabel),
        UTF8String.fromString(caseLabel),
        UTF8String.fromString(meta.padAttribute),
        orNull(meta.icuVersion))
    }
  }
}
42 changes: 42 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1666,4 +1666,46 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
}

test("TVF all_collations()") {
  // Lower bound only, so that newly added collations do not break the test.
  assert(sql("SELECT * FROM all_collations()").collect().length >= 562)

  // Expected sensitivity columns follow the collation name: an `_AI` suffix means
  // accent-insensitive and a `_CI` suffix means case-insensitive; without the suffix the
  // collation is sensitive in that dimension. If these assertions fail with the two columns
  // swapped, the fault is in the collation metadata feeding the generator, not in this test.

  // LIMIT 10 pins which rows are generated first (UTF8_BINARY, UTF8_LCASE, UNICODE*, af*).
  // NOTE(review): checkAnswer appears to ignore row order for plans without a sort, so this
  // checks the set of the first ten rows rather than their relative order - confirm.
  val df = sql("SELECT * FROM all_collations() limit 10")
  checkAnswer(df,
    Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
      "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
      Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
        "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
      Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
        "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
        "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
        "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
        "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
        "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
        "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
        "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
        "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

  // Filtering on the generated column: a non-ICU collation has NULL language, country and
  // ICU version.
  checkAnswer(sql("SELECT * FROM all_collations() WHERE COLLATION_NAME LIKE '%UTF8_BINARY%'"),
    Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
      "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null))

  // A locale with language, script and country yields exactly its four sensitivity variants.
  checkAnswer(sql("SELECT * FROM all_collations() WHERE COLLATION_NAME LIKE '%zh_Hant_HKG%'"),
    Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR China",
      "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR China",
        "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI", "Chinese", "Hong Kong SAR China",
        "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
      Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China",
        "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
}
}

0 comments on commit 48c614e

Please sign in to comment.