Skip to content

Commit

Permalink
ORC-709: FIX Boolean to StringGroup schema evolution (#594)
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Add a dedicated ConvertTreeReader factory for Boolean, so that conversions to String/Char/Varchar types use StringGroupFromBooleanTreeReader instead of the generic StringGroupFromAnyIntegerTreeReader.

### Why are the changes needed?
Properly handle Boolean to String/Char/Varchar conversions

### How was this patch tested?
TestSchemaEvolution.testBooleanToStringEvolution
  • Loading branch information
pgaref authored Dec 30, 2020
1 parent 60b03ef commit 40495ba
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1774,12 +1774,64 @@ public void nextVector(ColumnVector previousVector,
}
}

/**
 * Selects the conversion reader for a column whose file type is BOOLEAN,
 * based on the category requested by the reader schema.
 *
 * @param columnId   id of the column being read
 * @param fileType   type of the column as stored in the file
 * @param readerType type requested by the reader schema
 * @param context    reader context passed through to the created reader
 * @return a reader that converts the file values to {@code readerType}
 * @throws IOException if creating the underlying reader fails
 * @throws IllegalArgumentException if an integer-family target has the same
 *         category as the file type, or the target category is unsupported
 */
private static TypeReader createBooleanConvertTreeReader(int columnId,
                                                         TypeDescription fileType,
                                                         TypeDescription readerType,
                                                         Context context) throws IOException {

  // CONVERT from BOOLEAN to schema type.
  final Category target = readerType.getCategory();

  switch (target) {

    // Integer family (BOOLEAN included): only a real change of category
    // needs a converting reader.
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
      if (fileType.getCategory() != target) {
        return new AnyIntegerFromAnyIntegerTreeReader(
            columnId, fileType, readerType, context);
      }
      throw new IllegalArgumentException(
          "No conversion of type " + target + " to self needed");

    case FLOAT:
    case DOUBLE:
      return new DoubleFromAnyIntegerTreeReader(columnId, fileType, context);

    case DECIMAL:
      return new DecimalFromAnyIntegerTreeReader(columnId, fileType, context);

    // String family gets the boolean-specific reader rather than the
    // generic integer-to-string one.
    case STRING:
    case CHAR:
    case VARCHAR:
      return new StringGroupFromBooleanTreeReader(
          columnId, fileType, readerType, context);

    case TIMESTAMP:
    case TIMESTAMP_INSTANT: {
      boolean instantType = target == Category.TIMESTAMP_INSTANT;
      return new TimestampFromAnyIntegerTreeReader(
          columnId, fileType, context, instantType);
    }

    // Not currently supported conversion(s):
    case BINARY:
    case DATE:
    case STRUCT:
    case LIST:
    case MAP:
    case UNION:
    default:
      throw new IllegalArgumentException("Unsupported type " + target);
  }
}

private static TypeReader createAnyIntegerConvertTreeReader(int columnId,
TypeDescription fileType,
TypeDescription readerType,
Context context) throws IOException {

// CONVERT from (BOOLEAN, BYTE, SHORT, INT, LONG) to schema type.
// CONVERT from (BYTE, SHORT, INT, LONG) to schema type.
//
switch (readerType.getCategory()) {

Expand Down Expand Up @@ -2065,7 +2117,7 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
TypeDescription readerType,
Context context) throws IOException {

// CONVERT from DATE to schema type.
// CONVERT from BINARY to schema type.
switch (readerType.getCategory()) {

case STRING:
Expand Down Expand Up @@ -2145,7 +2197,8 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
* DecimalFromStringGroupTreeReader (written)
*
* To STRING, CHAR, VARCHAR:
* Convert from (BOOLEAN, BYTE, SHORT, INT, LONG) using to string conversion
* Convert from (BYTE, SHORT, INT, LONG) using to string conversion
* Convert from BOOLEAN using boolean (True/False) conversion
* Convert from (FLOAT, DOUBLE) using to string conversion
* Convert from DECIMAL using HiveDecimal.toString
* Convert from CHAR by stripping pads
Expand All @@ -2155,6 +2208,7 @@ private static TypeReader createBinaryConvertTreeReader(int columnId,
* Convert from BINARY using Text.decode
*
* StringGroupFromAnyIntegerTreeReader (written)
* StringGroupFromBooleanTreeReader (written)
* StringGroupFromFloatTreeReader (written)
* StringGroupFromDoubleTreeReader (written)
* StringGroupFromDecimalTreeReader (written)
Expand Down Expand Up @@ -2233,13 +2287,15 @@ public static TypeReader createConvertTreeReader(TypeDescription readerType,

switch (fileType.getCategory()) {

case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
return createAnyIntegerConvertTreeReader(columnId, fileType, readerType, context);

case BOOLEAN:
return createBooleanConvertTreeReader(columnId, fileType, readerType, context);

case FLOAT:
case DOUBLE:
return createDoubleConvertTreeReader(columnId, fileType, readerType, context);
Expand Down
31 changes: 31 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestSchemaEvolution.java
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,37 @@ public void testDecimalToDecimal64Evolution() throws Exception {
rows.close();
}

/**
 * Writes a BOOLEAN column (true, false, true) and reads it back with a
 * STRING reader schema, expecting the values "TRUE", "FALSE", "TRUE".
 */
@Test
public void testBooleanToStringEvolution() throws Exception {
  testFilePath = new Path(workDir,
      "TestSchemaEvolution." + testCaseName.getMethodName() + ".orc");

  // ---- write side: a single boolean column with three rows ----
  TypeDescription fileSchema = TypeDescription.createBoolean();
  Writer orcWriter = OrcFile.createWriter(
      testFilePath,
      OrcFile.writerOptions(conf)
          .setSchema(fileSchema)
          .stripeSize(100000)
          .bufferSize(10000));

  VectorizedRowBatch writeBatch = new VectorizedRowBatch(1, 1024);
  LongColumnVector flags = new LongColumnVector(1024);
  writeBatch.cols[0] = flags;
  writeBatch.reset();

  long[] written = {1L, 0L, 1L}; // True, False, True
  writeBatch.size = written.length;
  for (int row = 0; row < written.length; ++row) {
    flags.vector[row] = written[row];
  }
  orcWriter.addRowBatch(writeBatch);
  orcWriter.close();

  // ---- read side: same file, but ask for a string column ----
  Reader orcReader = OrcFile.createReader(
      testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  TypeDescription stringSchema = TypeDescription.createString();
  RecordReader rows =
      orcReader.rows(orcReader.options().schema(stringSchema));

  VectorizedRowBatch readBatch = stringSchema.createRowBatch();
  rows.nextBatch(readBatch);

  BytesColumnVector strings = (BytesColumnVector) readBatch.cols[0];
  String[] expected = {"TRUE", "FALSE", "TRUE"};
  for (int row = 0; row < expected.length; ++row) {
    assertEquals(expected[row], strings.toString(row));
  }
  rows.close();
}

@Test
public void testCharToStringEvolution() throws IOException {
TypeDescription fileType = TypeDescription.fromString("struct<x:char(10)>");
Expand Down

0 comments on commit 40495ba

Please sign in to comment.