ORC-709: FIX Boolean to StringGroup schema evolution (#594)

### What changes were proposed in this pull request?
Add a special ConvertTreeReader for Boolean, using StringGroupFromAnyIntegerTreeReader for String/Char/Varchar types.

### Why are the changes needed?
To properly handle Boolean to String/Char/Varchar conversions.

### How was this patch tested?
TestSchemaEvolution.testBooleanToStringEvolution
apache · Dec 30, 2020 · 40495ba · 40495ba
1 parent 60b03ef
commit 40495ba
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 4 deletions.
diff --git a/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -1774,12 +1774,64 @@ public void nextVector(ColumnVector previousVector,
     }
   }
 
+  private static TypeReader createBooleanConvertTreeReader(int columnId,
+                                                           TypeDescription fileType,
+                                                           TypeDescription readerType,
+                                                           Context context) throws IOException {
+
+    // CONVERT from BOOLEAN to schema type.
+    // Only the string group gets a BOOLEAN-specific reader; the rest reuse the any-integer ones.
+    switch (readerType.getCategory()) {
+
+    case BOOLEAN:
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+      if (fileType.getCategory() == readerType.getCategory()) { // same category: nothing to convert
+        throw new IllegalArgumentException("No conversion of type " +
+            readerType.getCategory() + " to self needed");
+      }
+      return new AnyIntegerFromAnyIntegerTreeReader(columnId, fileType, readerType,
+          context);
+
+    case FLOAT:
+    case DOUBLE:
+      return new DoubleFromAnyIntegerTreeReader(columnId, fileType, context);
+
+    case DECIMAL:
+      return new DecimalFromAnyIntegerTreeReader(columnId, fileType, context);
+
+    case STRING:
+    case CHAR:
+    case VARCHAR:
+      return new StringGroupFromBooleanTreeReader(columnId, fileType, readerType,
+          context);
+
+    case TIMESTAMP:
+    case TIMESTAMP_INSTANT:
+      return new TimestampFromAnyIntegerTreeReader(columnId, fileType, context,
+          readerType.getCategory() == Category.TIMESTAMP_INSTANT); // true only for TIMESTAMP_INSTANT
+
+    // Not currently supported conversion(s); each of these falls through to the throw below:
+    case BINARY:
+    case DATE:
+    case STRUCT:
+    case LIST:
+    case MAP:
+    case UNION:
+    default:
+      throw new IllegalArgumentException("Unsupported type " +
+          readerType.getCategory());
+    }
+  }
+
   private static TypeReader createAnyIntegerConvertTreeReader(int columnId,
                                                               TypeDescription fileType,
                                                               TypeDescription readerType,
                                                               Context context) throws IOException {
 
-    // CONVERT from (BOOLEAN, BYTE, SHORT, INT, LONG) to schema type.
+    // CONVERT from (BYTE, SHORT, INT, LONG) to schema type.
     //
     switch (readerType.getCategory()) {
 
@@ -2065,7 +2117,7 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
                                                           TypeDescription readerType,
                                                           Context context) throws IOException {
 
-    // CONVERT from DATE to schema type.
+    // CONVERT from BINARY to schema type.
     switch (readerType.getCategory()) {
 
     case STRING:
@@ -2145,7 +2197,8 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
    *   DecimalFromStringGroupTreeReader (written)
    *
    * To STRING, CHAR, VARCHAR:
-   *   Convert from (BOOLEAN, BYTE, SHORT, INT, LONG) using to string conversion
+   *   Convert from (BYTE, SHORT, INT, LONG) using to string conversion
+   *   Convert from BOOLEAN using boolean (TRUE/FALSE) conversion
    *   Convert from (FLOAT, DOUBLE) using to string conversion
    *   Convert from DECIMAL using HiveDecimal.toString
    *   Convert from CHAR by stripping pads
@@ -2155,6 +2208,7 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
    *   Convert from BINARY using Text.decode
    *
    *   StringGroupFromAnyIntegerTreeReader (written)
+   *   StringGroupFromBooleanTreeReader (written)
    *   StringGroupFromFloatTreeReader (written)
    *   StringGroupFromDoubleTreeReader (written)
    *   StringGroupFromDecimalTreeReader (written)
@@ -2233,13 +2287,15 @@ public static TypeReader createConvertTreeReader(TypeDescription readerType,
 
     switch (fileType.getCategory()) {
 
-    case BOOLEAN:
     case BYTE:
     case SHORT:
     case INT:
     case LONG:
       return createAnyIntegerConvertTreeReader(columnId, fileType, readerType, context);
 
+    case BOOLEAN:
+      return createBooleanConvertTreeReader(columnId, fileType, readerType, context);
+
     case FLOAT:
     case DOUBLE:
       return createDoubleConvertTreeReader(columnId, fileType, readerType, context);

diff --git a/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java b/java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
@@ -579,6 +579,37 @@ public void testDecimalToDecimal64Evolution() throws Exception {
     rows.close();
   }
 
+  @Test
+  public void testBooleanToStringEvolution() throws Exception {
+    testFilePath = new Path(workDir, "TestSchemaEvolution." +
+      testCaseName.getMethodName() + ".orc");
+    TypeDescription schema = TypeDescription.createBoolean(); // file is written as BOOLEAN
+    Writer writer = OrcFile.createWriter(testFilePath,
+      OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+        .bufferSize(10000));
+    VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
+    LongColumnVector lcv = new LongColumnVector(1024);
+    batch.cols[0] = lcv;
+    batch.reset();
+    batch.size = 3; // three rows are written: true, false, true
+    lcv.vector[0] = 1L; // True
+    lcv.vector[1] = 0L; // False
+    lcv.vector[2] = 1L; // True
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+      OrcFile.readerOptions(conf).filesystem(fs));
+    TypeDescription schemaOnRead = TypeDescription.createString(); // read the same file back as STRING
+    RecordReader rows = reader.rows(reader.options().schema(schemaOnRead));
+    batch = schemaOnRead.createRowBatch();
+    rows.nextBatch(batch);
+    assertEquals("TRUE", ((BytesColumnVector) batch.cols[0]).toString(0)); // expected text is upper-case
+    assertEquals("FALSE", ((BytesColumnVector) batch.cols[0]).toString(1));
+    assertEquals("TRUE", ((BytesColumnVector) batch.cols[0]).toString(2));
+    rows.close();
+  }
+
   @Test
   public void testCharToStringEvolution() throws IOException {
     TypeDescription fileType = TypeDescription.fromString("struct<x:char(10)>");