Skip to content

Commit

Permalink
[SPARK-43049][SQL] Use CLOB instead of VARCHAR(255) for StringType fo…
Browse files Browse the repository at this point in the history
…r Oracle JDBC

### What changes were proposed in this pull request?

Use CLOB instead of VARCHAR(255) for StringType for Oracle JDBC

### Why are the changes needed?

- Fix the insufficient-length issue when storing a Spark string in Oracle.
- Make room for Spark VarcharType mapping

### Does this PR introduce _any_ user-facing change?

Yes. Using APIs such as DDL and `df.write.jdbc` with Oracle to store strings will now result in CLOB columns.
### How was this patch tested?

new tests.

Closes apache#40683 from yaooqinn/SPARK-43049.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
  • Loading branch information
yaooqinn committed Apr 7, 2023
1 parent fa6e55b commit 529f2d5
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSpark
}


test("SPARK-12941: String datatypes to be mapped to Varchar in Oracle") {
// SPARK-43049: Use CLOB instead of VARCHAR(255) for StringType for Oracle JDBC
test("SPARK-12941: String datatypes to be mapped to CLOB in Oracle") {
// create a sample dataframe with string type
val df1 = sparkContext.parallelize(Seq(("foo"))).toDF("x")
// write the dataframe to the oracle table tbl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,26 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTes
var t = spark.table(tbl)
var expectedSchema = new StructType().add("ID", DecimalType(10, 0), true, defaultMetadata)
assert(t.schema === expectedSchema)
sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE STRING")
sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE LONG")
t = spark.table(tbl)
expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata)
expectedSchema = new StructType().add("ID", DecimalType(19, 0), true, defaultMetadata)
assert(t.schema === expectedSchema)
// Update column type from STRING to INTEGER
// Update column type from LONG to INTEGER
val msg1 = intercept[AnalysisException] {
sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE INTEGER")
}.getMessage
assert(msg1.contains(
s"Cannot update $catalogName.alt_table field ID: string cannot be cast to int"))
s"Cannot update $catalogName.alt_table field ID: decimal(19,0) cannot be cast to int"))
}

/**
 * Upper-cases `tableName` using `Locale.ROOT`, so the result does not vary with the
 * JVM's default locale.
 *
 * @param tableName the table name to convert
 * @return `tableName` in upper case
 */
override def caseConvert(tableName: String): String = {
  val rootLocale = Locale.ROOT
  tableName.toUpperCase(rootLocale)
}

test("SPARK-43049: Use CLOB instead of VARCHAR(255) for StringType for Oracle JDBC") {
  // Round-trips a 256-character string through a `string` column and checks that the
  // full length comes back, i.e. the value is longer than a 255-character limit allows.
  val tableName = catalogName + ".t1"
  withTable(tableName) {
    sql(s"CREATE TABLE $tableName(c1 string)")
    sql(s"INSERT INTO $tableName SELECT rpad('hi', 256, 'spark')")
    // Read the stored length back through the catalog rather than trusting the insert.
    val storedLength = sql(s"SELECT char_length(c1) from $tableName").head().get(0)
    assert(storedLength === 256)
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ private case object OracleDialect extends JdbcDialect {
case DoubleType => Some(JdbcType("NUMBER(19, 4)", java.sql.Types.DOUBLE))
case ByteType => Some(JdbcType("NUMBER(3)", java.sql.Types.SMALLINT))
case ShortType => Some(JdbcType("NUMBER(5)", java.sql.Types.SMALLINT))
case StringType => Some(JdbcType("VARCHAR2(255)", java.sql.Types.VARCHAR))
case StringType => Some(JdbcType("CLOB", java.sql.Types.CLOB))
case _ => None
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession {
// Resolves the JDBC dialect for a `jdbc:oracle` URL and checks the database type
// definition that the dialect reports for Spark's StringType.
test("SPARK 12941: The data type mapping for StringType to Oracle") {
val oracleDialect = JdbcDialects.get("jdbc:oracle://127.0.0.1/db")
assert(oracleDialect.getJDBCType(StringType).
map(_.databaseTypeDefinition).get == "VARCHAR2(255)")
map(_.databaseTypeDefinition).get == "CLOB")
}

test("SPARK-16625: General data types to be mapped to Oracle") {
Expand All @@ -1285,7 +1285,7 @@ class JDBCSuite extends QueryTest with SharedSparkSession {
assert(getJdbcType(oracleDialect, DoubleType) == "NUMBER(19, 4)")
assert(getJdbcType(oracleDialect, ByteType) == "NUMBER(3)")
assert(getJdbcType(oracleDialect, ShortType) == "NUMBER(5)")
assert(getJdbcType(oracleDialect, StringType) == "VARCHAR2(255)")
assert(getJdbcType(oracleDialect, StringType) == "CLOB")
assert(getJdbcType(oracleDialect, BinaryType) == "BLOB")
assert(getJdbcType(oracleDialect, DateType) == "DATE")
assert(getJdbcType(oracleDialect, TimestampType) == "TIMESTAMP")
Expand Down

0 comments on commit 529f2d5

Please sign in to comment.