Skip to content

Commit

Permalink
[GLUTEN-1476][CORE] Use correct field name in struct type (#1878)
Browse files Browse the repository at this point in the history
Use correct field name in struct type
  • Loading branch information
rui-mo authored Jun 8, 2023
1 parent 5e75698 commit 20bdbdd
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
public class StructNode implements TypeNode, Serializable {
private final Boolean nullable;
private final ArrayList<TypeNode> types = new ArrayList<>();
private final ArrayList<String> names = new ArrayList<>();

/**
 * Builds a struct type node that carries both the child field types and the
 * corresponding field names.
 *
 * @param nullable whether the struct type itself is nullable
 * @param types    child type nodes, one per field; copied into this node
 * @param names    field names aligned index-by-index with {@code types}; copied into this node
 */
public StructNode(Boolean nullable, ArrayList<TypeNode> types, ArrayList<String> names) {
  this.nullable = nullable;
  // Defensive copies: keep this node independent of the caller's lists.
  for (TypeNode fieldType : types) {
    this.types.add(fieldType);
  }
  for (String fieldName : names) {
    this.names.add(fieldName);
  }
}

public StructNode(Boolean nullable, ArrayList<TypeNode> types) {
this.nullable = nullable;
Expand All @@ -43,7 +50,9 @@ public Type toProtobuf() {
for (TypeNode typeNode : types) {
structBuilder.addTypes(typeNode.toProtobuf());
}

for (String name : names) {
structBuilder.addNames(name);
}
Type.Builder builder = Type.newBuilder();
builder.setStruct(structBuilder.build());
return builder.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public static TypeNode makeTimestamp(Boolean nullable) {
return new TimestampTypeNode(nullable);
}

/**
 * Creates a struct type node whose fields carry explicit names.
 *
 * @param nullable whether the struct type is nullable
 * @param types    child type nodes, one per struct field
 * @param names    field names aligned index-by-index with {@code types}
 * @return a new {@link StructNode} holding both the types and the names
 */
public static TypeNode makeStruct(
    Boolean nullable, ArrayList<TypeNode> types, ArrayList<String> names) {
  return new StructNode(nullable, types, names);
}

/**
 * Creates a struct type node without field names.
 *
 * @param nullable whether the struct type is nullable
 * @param types    child type nodes, one per struct field
 * @return a new {@link StructNode} holding only the child types
 */
public static TypeNode makeStruct(Boolean nullable, ArrayList<TypeNode> types) {
  return new StructNode(nullable, types);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ message Type {
repeated Type types = 1;
uint32 type_variation_reference = 2;
Nullability nullability = 3;
repeated string names = 4;

This comment has been minimized.

Copy link
@winningsix

winningsix Jun 12, 2023

Contributor

Hmm, should we use an ordinal-based approach instead?

}

message List {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,17 +184,19 @@ object ConverterUtils extends Logging {
case TimestampType =>
TypeBuilder.makeTimestamp(nullable)
case m: MapType =>
TypeBuilder.makeMap(nullable, getTypeNode(m.keyType, false),
TypeBuilder.makeMap(nullable, getTypeNode(m.keyType, nullable = false),
getTypeNode(m.valueType, m.valueContainsNull))
case a: ArrayType =>
TypeBuilder.makeList(nullable, getTypeNode(a.elementType, a.containsNull))
case s: StructType =>
val fieldNodes = new java.util.ArrayList[TypeNode]
val fieldNames = new java.util.ArrayList[String]
for (structField <- s.fields) {
fieldNodes.add(getTypeNode(structField.dataType, structField.nullable))
fieldNames.add(structField.name)
}
TypeBuilder.makeStruct(nullable, fieldNodes)
case n: NullType =>
TypeBuilder.makeStruct(nullable, fieldNodes, fieldNames)
case _: NullType =>
TypeBuilder.makeNothing()
case unknown =>
throw new UnsupportedOperationException(s"Type $unknown not supported.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,59 +17,5 @@

package org.apache.spark.sql

import org.apache.spark.sql.catalyst.plans.logical.ColumnStat
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneUTC
import org.apache.spark.sql.functions.timestamp_seconds
import org.apache.spark.sql.types.{DataType, DateType, TimestampType}

import java.util.TimeZone
import java.util.concurrent.TimeUnit

// Gluten override of Spark's StatisticsCollectionSuite: verifies that column
// statistics written under one JVM default time zone are read back correctly
// under a different one.
class GlutenStatisticsCollectionSuite extends StatisticsCollectionSuite with GlutenSQLTestsTrait {

import testImplicits._

test(GlutenTestConstants.GLUTEN_TEST +
"store and retrieve column stats in different time zones") {
// TODO: bug fix on TableScan.
// val (start, end) = (0, TimeUnit.DAYS.toSeconds(2))
// Range of epoch seconds used to populate the test table. The original
// 2-day range is commented out above pending the TableScan fix; a smaller
// range is used instead.
val (start, end) = (0, 200)

// Writes a table of `t`-typed values with `srcTimeZone` as the JVM default,
// computes column statistics, then switches the default to `dstTimeZone`
// before reading the stats back and handing them to `checker`.
def checkTimestampStats(t: DataType,
srcTimeZone: TimeZone,
dstTimeZone: TimeZone)(checker: ColumnStat => Unit): Unit = {
val table = "time_table"
val column = "T"
// Remember the JVM default so it can be restored even if the body throws.
val original = TimeZone.getDefault
try {
withTable(table) {
// Write + analyze under the source time zone.
TimeZone.setDefault(srcTimeZone)
spark.range(start, end)
.select(timestamp_seconds($"id").cast(t).as(column))
.write.saveAsTable(table)
sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS $column")

// Read the stored stats back under the destination time zone; the
// values must be time-zone independent.
TimeZone.setDefault(dstTimeZone)
val stats = getCatalogTable(table)
.stats.get.colStats(column).toPlanStat(column, t)
checker(stats)
}
} finally {
// Always restore the original JVM default time zone: it is global,
// mutable state shared with every other test.
TimeZone.setDefault(original)
}
}

// Exercise every "outstanding" zone as the destination, with UTC as the source.
DateTimeTestUtils.outstandingZoneIds.foreach { zid =>
val timeZone = TimeZone.getTimeZone(zid)
// DateType stats are stored as days-since-epoch (Int).
checkTimestampStats(DateType, TimeZoneUTC, timeZone) { stats =>
assert(stats.min.get.asInstanceOf[Int] == TimeUnit.SECONDS.toDays(start))
assert(stats.max.get.asInstanceOf[Int] == TimeUnit.SECONDS.toDays(end - 1))
}
// TimestampType stats are stored as microseconds-since-epoch (Long).
checkTimestampStats(TimestampType, TimeZoneUTC, timeZone) { stats =>
assert(stats.min.get.asInstanceOf[Long] == TimeUnit.SECONDS.toMicros(start))
assert(stats.max.get.asInstanceOf[Long] == TimeUnit.SECONDS.toMicros(end - 1))
}
}
}
}

0 comments on commit 20bdbdd

Please sign in to comment.