# [Spark] Add OPTIMIZE FULL history support (#3852)

#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

Adds an `isFull` operation parameter to the commit history recorded for the `OPTIMIZE tbl FULL` command.
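
As a concrete illustration of the new behavior (a minimal sketch, assuming an active `SparkSession` named `spark` and an existing clustered table `tbl` — both placeholders):

```scala
// After this change, DESCRIBE HISTORY exposes whether an OPTIMIZE run on a
// clustered table was a FULL run via the new `isFull` operation parameter.
spark.sql("OPTIMIZE tbl FULL")
spark.sql("DESCRIBE HISTORY tbl")
  .selectExpr("version", "operation", "operationParameters['isFull'] AS isFull")
  .show()
// Expected shape of the latest row: operation = OPTIMIZE, isFull = true.
// A plain `OPTIMIZE tbl` on the same clustered table would record isFull = false,
// and a non-clustered table would not record the key at all.
```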

## How was this patch tested?
Existing unit tests.


## Does this PR introduce _any_ user-facing changes?

No
dabao521 authored Nov 12, 2024
1 parent 4f54313 commit fbdd347
Showing 3 changed files with 54 additions and 8 deletions.
**File 1/3 — `object DeltaOperations`:**

```diff
@@ -684,20 +684,25 @@ object DeltaOperations {
   val ZORDER_PARAMETER_KEY = "zOrderBy"
   /** parameter key to indicate clustering columns */
   val CLUSTERING_PARAMETER_KEY = "clusterBy"
+  /** parameter key to indicate the operation for `OPTIMIZE tbl FULL` */
+  val CLUSTERING_IS_FULL_KEY = "isFull"
 
   /** Recorded when optimizing the table. */
   case class Optimize(
       predicate: Seq[Expression],
       zOrderBy: Seq[String] = Seq.empty,
       auto: Boolean = false,
-      clusterBy: Option[Seq[String]] = None
+      clusterBy: Option[Seq[String]] = None,
+      isFull: Boolean = false
   ) extends OptimizeOrReorg(OPTIMIZE_OPERATION_NAME, predicate) {
     override val parameters: Map[String, Any] = super.parameters ++ Map(
       // When clustering columns are specified, set the zOrderBy key to empty.
       ZORDER_PARAMETER_KEY -> JsonUtils.toJson(if (clusterBy.isEmpty) zOrderBy else Seq.empty),
       CLUSTERING_PARAMETER_KEY -> JsonUtils.toJson(clusterBy.getOrElse(Seq.empty)),
       AUTO_COMPACTION_PARAMETER_KEY -> auto
     )
+      // `isFull` is not relevant for non-clustering tables, so skip it.
+      .++(clusterBy.filter(_.nonEmpty).map(_ => CLUSTERING_IS_FULL_KEY -> isFull))
 
     override val operationMetrics: Set[String] = DeltaOperationMetrics.OPTIMIZE
```
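
The subtle point above is that `isFull` is appended to the parameters map only when `clusterBy` holds a non-empty column list. A standalone sketch of that `Option`-based append (plain Scala with simplified types, not the Delta source):

```scala
object IsFullParamSketch extends App {
  // Mirrors the `.++(clusterBy.filter(_.nonEmpty).map(...))` pattern above:
  // the key is appended only for tables with clustering columns.
  def params(clusterBy: Option[Seq[String]], isFull: Boolean): Map[String, Any] =
    Map("clusterBy" -> clusterBy.getOrElse(Seq.empty).mkString(",")) ++
      clusterBy.filter(_.nonEmpty).map(_ => "isFull" -> isFull)

  assert(params(Some(Seq("c1", "c2")), isFull = true).contains("isFull"))
  assert(!params(None, isFull = true).contains("isFull"))            // no clustering
  assert(!params(Some(Seq.empty), isFull = true).contains("isFull")) // empty columns
}
```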
**File 2/3 — `class OptimizeExecutor`:**

```diff
@@ -576,7 +576,8 @@ class OptimizeExecutor(
         predicate = partitionPredicate,
         zOrderBy = zOrderByColumns,
         auto = isAutoCompact,
-        clusterBy = if (isClusteredTable) Option(clusteringColumns).filter(_.nonEmpty) else None)
+        clusterBy = if (isClusteredTable) Option(clusteringColumns).filter(_.nonEmpty) else None,
+        isFull = optimizeContext.isFull)
     }
   }
 
```
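
For context, a heavily simplified sketch of how the flag travels (these case classes are stand-ins, not the actual Delta types): `OPTIMIZE ... FULL` already set `isFull` on the optimize context, and this one-line change copies it into the recorded operation:

```scala
object IsFullFlowSketch extends App {
  // Stand-ins for the optimize context and the recorded Optimize operation.
  case class OptimizeContext(isFull: Boolean = false)
  case class Optimize(clusterBy: Option[Seq[String]], isFull: Boolean)

  // Mirrors the change above: thread context.isFull into the operation.
  def recordedOperation(ctx: OptimizeContext, clusteringColumns: Seq[String]): Optimize =
    Optimize(clusterBy = Option(clusteringColumns).filter(_.nonEmpty), isFull = ctx.isFull)

  assert(recordedOperation(OptimizeContext(isFull = true), Seq("c1")).isFull)
}
```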
**File 3/3 — `trait ClusteredTableTestUtilsBase`:**

```diff
@@ -19,6 +19,7 @@ package org.apache.spark.sql.delta.skipping
 import org.apache.spark.sql.delta.skipping.clustering.{ClusteredTableUtils, ClusteringColumn, ClusteringColumnInfo}
 import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec
 import org.apache.spark.sql.delta.{DeltaLog, Snapshot}
+import org.apache.spark.sql.delta.DeltaOperations
 import org.apache.spark.sql.delta.DeltaOperations.{CLUSTERING_PARAMETER_KEY, ZORDER_PARAMETER_KEY}
 import org.apache.spark.sql.delta.commands.optimize.OptimizeMetrics
 import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsBaseSuite
@@ -53,10 +54,41 @@ trait ClusteredTableTestUtilsBase
    * @param postHook callback triggered with OptimizeMetrics returned by the OPTIMIZE command
    */
   def runOptimize(table: String)(postHook: OptimizeMetrics => Unit): Unit = {
+    // Verify Delta history operation parameters' clusterBy
+    val isPathBasedTable = table.startsWith("tahoe.") || table.startsWith("delta.")
+    var (deltaLog, snapshot) = if (isPathBasedTable) {
+      // Path based table e.g. delta.`path-to-directory` or tahoe.`path-to-directory`. Strip
+      // 6 characters to extract table path.
+      DeltaLog.forTableWithSnapshot(spark, table.drop(6).replace("`", ""))
+    } else {
+      DeltaLog.forTableWithSnapshot(spark, TableIdentifier(table))
+    }
+    val beforeVersion = snapshot.version
+
     postHook(optimizeTable(table).select($"metrics.*").as[OptimizeMetrics].head())
+    snapshot = deltaLog.update()
+    val afterVersion = snapshot.version
 
-    // Verify Delta history operation parameters' clusterBy
-    verifyDescribeHistoryOperationParameters(table)
+    val shouldCheckFullStatus = deltaLog.history.getHistory(Some(1)).headOption.exists { h =>
+      Seq(DeltaOperations.OPTIMIZE_OPERATION_NAME
+      ).contains(h.operation)
+    }
+
+    // Note: Only expect isFull status when the table has non-empty clustering columns and
+    // clustering table feature, otherwise the OPTIMIZE will fall back to compaction and
+    // isFull status will not be relevant anymore.
+    val expectedOperationParameters = ClusteredTableUtils
+      .getClusteringColumnsOptional(snapshot)
+      .filter { cols =>
+        cols.nonEmpty &&
+        shouldCheckFullStatus &&
+        ClusteredTableUtils.isSupported(snapshot.protocol) &&
+        afterVersion > beforeVersion
+      }
+      .map(_ => Map(DeltaOperations.CLUSTERING_IS_FULL_KEY -> false))
+      .getOrElse(Map.empty)
+    verifyDescribeHistoryOperationParameters(
+      table, expectedOperationParameters = expectedOperationParameters)
   }
 
   /**
```
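
A quick standalone check of the table-name parsing used in `runOptimize` above: the `delta.` and `tahoe.` prefixes are both exactly six characters, so `drop(6)` plus stripping backticks recovers the filesystem path (the path below is a placeholder):

```scala
object TablePathSketch extends App {
  for (name <- Seq("delta.`/tmp/my-table`", "tahoe.`/tmp/my-table`")) {
    // drop(6) removes either prefix; the backticks around the path are stripped.
    val path = name.drop(6).replace("`", "")
    assert(path == "/tmp/my-table")
  }
}
```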
```diff
@@ -69,7 +101,8 @@ trait ClusteredTableTestUtilsBase
     postHook(sql(s"OPTIMIZE $table FULL").select($"metrics.*").as[OptimizeMetrics].head())
 
     // Verify Delta history operation parameters' clusterBy
-    verifyDescribeHistoryOperationParameters(table)
+    verifyDescribeHistoryOperationParameters(table, expectedOperationParameters = Map(
+      DeltaOperations.CLUSTERING_IS_FULL_KEY -> true))
   }
 
   def verifyClusteringColumnsInDomainMetadata(
@@ -83,8 +116,8 @@ trait ClusteredTableTestUtilsBase
 
   // Verify the operation parameters of the last history event contains `clusterBy`.
   protected def verifyDescribeHistoryOperationParameters(
-      table: String
-  ): Unit = {
+      table: String,
+      expectedOperationParameters: Map[String, Any] = Map.empty): Unit = {
     val clusterBySupportedOperations = Set(
       "CREATE TABLE",
       "REPLACE TABLE",
@@ -115,7 +148,8 @@ trait ClusteredTableTestUtilsBase
       "add the operation to the appropriate case in " +
      "verifyDescribeHistoryOperationParameters. " +
       s"table: $table, lastOperation: ${lastEvent.operation} " +
-      s"lastOperationParameters: $lastOperationParameters"
+      s"lastOperationParameters: $lastOperationParameters " +
+      s"expectedOperationParameters: $expectedOperationParameters"
     try {
       assert(assertion, debugMsg)
     } catch {
@@ -140,6 +174,12 @@ trait ClusteredTableTestUtilsBase
       doAssert(!lastOperationParameters.contains(CLUSTERING_PARAMETER_KEY))
     }
 
+    // Validate caller provided operator parameters from the last commit.
+    for ((operationParameterKey, value) <- expectedOperationParameters) {
+      // Convert value to string since value is stored as toString in operationParameters.
+      doAssert(lastOperationParameters(operationParameterKey) === value.toString)
+    }
+
     // Check clusterBy
     lastEvent.operation match {
       case "CLUSTER BY" =>
```
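
Because `operationParameters` stores every value as a string, the new validation loop compares against `value.toString`. A minimal standalone version of that check (the parameter values here are made up):

```scala
object ParamCheckSketch extends App {
  // History stores parameter values as strings, so a Boolean expectation of
  // `true` must match the string "true".
  val lastOperationParameters = Map("isFull" -> "true", "clusterBy" -> "[\"c1\"]")
  val expected: Map[String, Any] = Map("isFull" -> true)
  for ((key, value) <- expected) {
    assert(lastOperationParameters(key) == value.toString)
  }
}
```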
