Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-49611][SQL] Introduce TVF collations() & remove the SHOW COLLATIONS command #48087

Closed
wants to merge 13 commits into from
1 change: 0 additions & 1 deletion docs/sql-ref-ansi-compliance.md
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,6 @@ Below is a list of all the keywords in Spark SQL.
|CODEGEN|non-reserved|non-reserved|non-reserved|
|COLLATE|reserved|non-reserved|reserved|
|COLLATION|reserved|non-reserved|reserved|
|COLLATIONS|reserved|non-reserved|reserved|
|COLLECTION|non-reserved|non-reserved|non-reserved|
|COLUMN|reserved|non-reserved|reserved|
|COLUMNS|non-reserved|non-reserved|non-reserved|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ CLUSTERED: 'CLUSTERED';
CODEGEN: 'CODEGEN';
COLLATE: 'COLLATE';
COLLATION: 'COLLATION';
COLLATIONS: 'COLLATIONS';
COLLECTION: 'COLLECTION';
COLUMN: 'COLUMN';
COLUMNS: 'COLUMNS';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,6 @@ statement
| SHOW PARTITIONS identifierReference partitionSpec? #showPartitions
| SHOW identifier? FUNCTIONS ((FROM | IN) ns=identifierReference)?
(LIKE? (legacy=multipartIdentifier | pattern=stringLit))? #showFunctions
| SHOW COLLATIONS (LIKE? pattern=stringLit)? #showCollations
| SHOW CREATE TABLE identifierReference (AS SERDE)? #showCreateTable
| SHOW CURRENT namespace #showCurrentNamespace
| SHOW CATALOGS (LIKE? pattern=stringLit)? #showCatalogs
Expand Down Expand Up @@ -1868,7 +1867,6 @@ nonReserved
| CODEGEN
| COLLATE
| COLLATION
| COLLATIONS
| COLLECTION
| COLUMN
| COLUMNS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,7 @@ object TableFunctionRegistry {
generator[PosExplode]("posexplode"),
generator[PosExplode]("posexplode_outer", outer = true),
generator[Stack]("stack"),
generator[Collations]("collations"),
generator[SQLKeywords]("sql_keywords"),
generator[VariantExplode]("variant_explode"),
generator[VariantExplode]("variant_explode_outer", outer = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import java.util.concurrent.TimeUnit
import javax.annotation.concurrent.GuardedBy

import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
import scala.util.{Failure, Success, Try}

import com.google.common.cache.{Cache, CacheBuilder}
Expand All @@ -40,8 +39,7 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Expression, Expre
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, SubqueryAlias, View}
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, StringUtils}
import org.apache.spark.sql.catalyst.util.CollationFactory.CollationMeta
import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils}
import org.apache.spark.sql.connector.catalog.CatalogManager
import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
Expand Down Expand Up @@ -1901,17 +1899,6 @@ class SessionCatalog(
.filter(isTemporaryFunction)
}

/**
* List all built-in collations with the given pattern.
*/
def listCollations(pattern: Option[String]): Seq[CollationMeta] = {
val collationIdentifiers = CollationFactory.listCollations().asScala.toSeq
val filteredCollationNames = StringUtils.filterPattern(
collationIdentifiers.map(_.getName), pattern.getOrElse("*")).toSet
collationIdentifiers.filter(ident => filteredCollationNames.contains(ident.getName)).map(
CollationFactory.loadCollationMeta)
}

// -----------------
// | Other methods |
// -----------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions

import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why this import?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it is necessary to use from java(List) to scala(Iterable), as follows:
image


import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
Expand All @@ -28,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter}
import org.apache.spark.sql.catalyst.trees.TreePattern.{GENERATOR, TreePattern}
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, MapData}
import org.apache.spark.sql.catalyst.util.SQLKeywordUtils._
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.internal.SQLConf
Expand Down Expand Up @@ -618,3 +619,44 @@ case class SQLKeywords() extends LeafExpression with Generator with CodegenFallb

override def prettyName: String = "sql_keywords"
}

@ExpressionDescription(
usage = """_FUNC_() - Get all of the Spark SQL string collations""",
examples = """
Examples:
> SELECT * FROM _FUNC_() WHERE NAME = 'UTF8_BINARY';
SYSTEM BUILTIN UTF8_BINARY NULL NULL ACCENT_SENSITIVE CASE_SENSITIVE NO_PAD NULL
""",
since = "4.0.0",
group = "generator_funcs")
case class Collations() extends LeafExpression with Generator with CodegenFallback {
override def elementSchema: StructType = new StructType()
.add("CATALOG", StringType, nullable = false)
.add("SCHEMA", StringType, nullable = false)
.add("NAME", StringType, nullable = false)
.add("LANGUAGE", StringType)
.add("COUNTRY", StringType)
.add("ACCENT_SENSITIVITY", StringType, nullable = false)
.add("CASE_SENSITIVITY", StringType, nullable = false)
cloud-fan marked this conversation as resolved.
Show resolved Hide resolved
.add("PAD_ATTRIBUTE", StringType, nullable = false)
.add("ICU_VERSION", StringType)

override def eval(input: InternalRow): IterableOnce[InternalRow] = {
CollationFactory.listCollations().asScala.map(CollationFactory.loadCollationMeta).map { m =>
InternalRow(
UTF8String.fromString(m.catalog),
UTF8String.fromString(m.schema),
UTF8String.fromString(m.collationName),
UTF8String.fromString(m.language),
UTF8String.fromString(m.country),
UTF8String.fromString(
if (m.accentSensitivity) "ACCENT_SENSITIVE" else "ACCENT_INSENSITIVE"),
UTF8String.fromString(
if (m.caseSensitivity) "CASE_SENSITIVE" else "CASE_INSENSITIVE"),
UTF8String.fromString(m.padAttribute),
UTF8String.fromString(m.icuVersion))
}
}

override def prettyName: String = "collations"
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLOSE
COALESCE
COLLATE
COLLATION
COLLATIONS
COLLECT
COLUMN
COMMIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1096,16 +1096,4 @@ class SparkSqlAstBuilder extends AstBuilder {
withIdentClause(ctx.identifierReference(), UnresolvedNamespace(_)),
cleanedProperties)
}

/**
* Create a [[ShowCollationsCommand]] command.
* Expected format:
* {{{
* SHOW COLLATIONS (LIKE? pattern=stringLit)?;
* }}}
*/
override def visitShowCollations(ctx: ShowCollationsContext): LogicalPlan = withOrigin(ctx) {
val pattern = Option(ctx.pattern).map(x => string(visitStringLit(x)))
ShowCollationsCommand(pattern)
}
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLUSTERED false
CODEGEN false
COLLATE true
COLLATION true
COLLATIONS true
COLLECTION false
COLUMN true
COLUMNS false
Expand Down Expand Up @@ -384,7 +383,6 @@ CAST
CHECK
COLLATE
COLLATION
COLLATIONS
COLUMN
CONSTRAINT
CREATE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLUSTERED false
CODEGEN false
COLLATE false
COLLATION false
COLLATIONS false
COLLECTION false
COLUMN false
COLUMNS false
Expand Down
79 changes: 55 additions & 24 deletions sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1625,38 +1625,38 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
}
}

test("show collations") {
assert(sql("SHOW COLLATIONS").collect().length >= 562)
test("TVF collations()") {
assert(sql("SELECT * FROM collations()").collect().length >= 562)

// verify that the output ordering is as expected (UTF8_BINARY, UTF8_LCASE, etc.)
val df = sql("SHOW COLLATIONS").limit(10)
val df = sql("SELECT * FROM collations() limit 10")
checkAnswer(df,
Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SHOW COLLATIONS LIKE '*UTF8_BINARY*'"),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%UTF8_BINARY%'"),
Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null))

checkAnswer(sql("SHOW COLLATIONS '*zh_Hant_HKG*'"),
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%zh_Hant_HKG%'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR China",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR China",
Expand All @@ -1665,5 +1665,36 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE COUNTRY = 'Singapore'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hans_SGP", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI", "Chinese", "Singapore",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI", "Chinese", "Singapore",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE LANGUAGE = 'English' " +
"and COUNTRY = 'United States'"),
Seq(Row("SYSTEM", "BUILTIN", "en_USA", "English", "United States",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_AI", "English", "United States",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_CI", "English", "United States",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_CI_AI", "English", "United States",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT NAME, LANGUAGE, ACCENT_SENSITIVITY, CASE_SENSITIVITY " +
"FROM collations() WHERE COUNTRY = 'United States'"),
Seq(Row("en_USA", "English", "ACCENT_SENSITIVE", "CASE_SENSITIVE"),
Row("en_USA_AI", "English", "ACCENT_SENSITIVE", "CASE_INSENSITIVE"),
Row("en_USA_CI", "English", "ACCENT_INSENSITIVE", "CASE_SENSITIVE"),
Row("en_USA_CI_AI", "English", "ACCENT_INSENSITIVE", "CASE_INSENSITIVE")))

checkAnswer(sql("SELECT NAME FROM collations() WHERE ICU_VERSION is null"),
Seq(Row("UTF8_BINARY"), Row("UTF8_LCASE")))
}
}
Loading
Loading