apache · zhztheplayer · Mar 8, 2023 · Mar 7, 2023 · Mar 7, 2023 · jinchengchenghh
diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxDataTypeValidationSuite.scala b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxDataTypeValidationSuite.scala
@@ -63,7 +63,7 @@ class VeloxDataTypeValidationSuite extends WholeStageTransformerSuite {
       .set("spark.sql.files.maxPartitionBytes", "1g")
       .set("spark.sql.shuffle.partitions", "1")
       .set("spark.memory.offHeap.size", "2g")
-      .set("spark.unsafe.exceptionOnMemoryLeak", "false")
+      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
       .set("spark.sql.autoBroadcastJoinThreshold", "10M")
       .set("spark.sql.sources.useV1SourceList", "avro")
   }

diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxStringFunctionsSuite.scala
@@ -49,7 +49,7 @@ class VeloxStringFunctionsSuite extends WholeStageTransformerSuite {
       .set("spark.sql.files.maxPartitionBytes", "1g")
       .set("spark.sql.shuffle.partitions", "1")
       .set("spark.memory.offHeap.size", "2g")
-      .set("spark.unsafe.exceptionOnMemoryLeak", "false")
+      .set("spark.unsafe.exceptionOnMemoryLeak", "true")
       .set("spark.sql.autoBroadcastJoinThreshold", "-1")
       .set("spark.sql.sources.useV1SourceList", "avro")
       .set("spark.sql.optimizer.excludedRules", ConstantFolding.ruleName + "," +
@@ -289,7 +289,7 @@ class VeloxStringFunctionsSuite extends WholeStageTransformerSuite {
       s"from $LINEITEM_TABLE limit $LENGTH") { checkOperatorMatch[ProjectExecTransformer] }
     runQueryAndCompare(s"select l_orderkey, like(l_comment, 'a_%b') " +
       s"from $LINEITEM_TABLE limit $LENGTH") { checkOperatorMatch[ProjectExecTransformer] }
-    runQueryAndCompare(s"select l_orderkey, like('l_comment', 'a\\__b') " +
+    runQueryAndCompare(s"select l_orderkey, like(l_comment, 'a\\__b') " +
       s"from $LINEITEM_TABLE limit $LENGTH") { checkOperatorMatch[ProjectExecTransformer] }
     runQueryAndCompare(s"select l_orderkey, like(l_comment, 'abc_') " +
       s"from $LINEITEM_TABLE limit $LENGTH") { checkOperatorMatch[ProjectExecTransformer] }
@@ -353,7 +353,7 @@ class VeloxStringFunctionsSuite extends WholeStageTransformerSuite {
   }
 
   test("regexp_extract_all") {
-    runQueryAndCompare(s"select l_orderkey, regexp_extract_all('l_comment', '([a-z])', 1) " +
+    runQueryAndCompare(s"select l_orderkey, regexp_extract_all(l_comment, '([a-z])', 1) " +
       s"from $LINEITEM_TABLE limit 5") { checkOperatorMatch[ProjectExecTransformer] }
     // fall back because of unsupported cast(array)
     runQueryAndCompare(s"select l_orderkey, l_comment, " +

diff --git a/gluten-data/src/main/java/io/glutenproject/vectorized/ArrowWritableColumnVector.java b/gluten-data/src/main/java/io/glutenproject/vectorized/ArrowWritableColumnVector.java
@@ -48,6 +48,7 @@
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.spark.sql.catalyst.util.DateTimeUtils;
 import org.apache.spark.sql.execution.datasources.v2.arrow.SparkSchemaUtil;
+import org.apache.spark.sql.execution.vectorized.WritableColumnVector;
 import org.apache.spark.sql.execution.vectorized.WritableColumnVectorShim;
 import org.apache.spark.sql.types.ArrayType;
 import org.apache.spark.sql.types.DataType;
@@ -244,7 +245,7 @@ private void createVectorAccessor(ValueVector vector, ValueVector dictionary) {
     } else if (vector instanceof MapVector) {
       MapVector mapVector = (MapVector) vector;
       accessor = new MapAccessor(mapVector);
-      childColumns = new ArrowWritableColumnVector[2];
+      reallocateChildColumns(2);
       final StructVector structVector = (StructVector) mapVector.getDataVector();
       final FieldVector keyChild = structVector.getChild(MapVector.KEY_NAME);
       final FieldVector valueChild = structVector.getChild(MapVector.VALUE_NAME);
@@ -255,14 +256,14 @@ private void createVectorAccessor(ValueVector vector, ValueVector dictionary) {
     } else if (vector instanceof ListVector) {
       ListVector listVector = (ListVector) vector;
       accessor = new ArrayAccessor(listVector);
-      childColumns = new ArrowWritableColumnVector[1];
+      reallocateChildColumns(1);
       childColumns[0] = new ArrowWritableColumnVector(
           listVector.getDataVector(), 0, listVector.size(), false);
     } else if (vector instanceof StructVector) {
       StructVector structVector = (StructVector) vector;
       accessor = new StructAccessor(structVector);
 
-      childColumns = new ArrowWritableColumnVector[structVector.size()];
+      reallocateChildColumns(structVector.size());
       for (int i = 0; i < childColumns.length; ++i) {
         childColumns[i] = new ArrowWritableColumnVector(structVector.getVectorById(i),
             i, structVector.size(), false);
@@ -275,6 +276,19 @@ private void createVectorAccessor(ValueVector vector, ValueVector dictionary) {
     }
   }
 
+  // The child columns may already be created in super class's constructor
+  //  org.apache.spark.sql.execution.vectorized
+  //    .WritableColumnVector#WritableColumnVector(int, DataType).
+  //  So we close them then create new ones.
+  private void reallocateChildColumns(int width) {
+    if (childColumns != null) {
+      for (WritableColumnVector column : childColumns) {
+        column.close();
+      }
+    }
+    childColumns = new ArrowWritableColumnVector[width];
+  }
+
   private ArrowVectorWriter createVectorWriter(ValueVector vector) {
     if (vector instanceof BitVector) {
       return new BooleanWriter((BitVector) vector);
@@ -355,14 +369,6 @@ public void close() {
     closed = true;
     vectorCount.getAndDecrement();
     super.close();
-    // TODO: close Arrow Allocated Memory
-    if (childColumns != null) {
-      for (int i = 0; i < childColumns.length; i++) {
-        childColumns[i].close();
-        childColumns[i] = null;
-      }
-      childColumns = null;
-    }
     vector.close();
     if (dictionaryVector != null) {
       dictionaryVector.close();

diff --git a/gluten-data/src/main/scala/io/glutenproject/execution/GlutenColumnarRules.scala b/gluten-data/src/main/scala/io/glutenproject/execution/GlutenColumnarRules.scala
@@ -17,6 +17,7 @@
 
 package io.glutenproject.execution
 
+import io.glutenproject.backendsapi.BackendsApiManager
 import io.glutenproject.columnarbatch.ArrowColumnarBatches
 import io.glutenproject.memory.arrowalloc.ArrowBufferAllocators
 import io.glutenproject.utils.{LogicalPlanSelector, QueryPlanSelector}
@@ -132,9 +133,10 @@ case class LoadArrowData(child: SparkPlan) extends UnaryExecNode {
 
   override protected def doExecuteColumnar(): RDD[ColumnarBatch] = {
     child.executeColumnar().mapPartitions { itr =>
-      itr.map { cb =>
-        ArrowColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance(), cb)
-      }
+      BackendsApiManager.getIteratorApiInstance.genCloseableColumnBatchIterator(
+        itr.map { cb =>
+          ArrowColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance(), cb)
+        })
     }
   }