Skip to content

Commit

Permalink
CDPD-45931: ORC-676. Add getRawDataSizeFromColIndices back to ReaderImpl (apache#555)
Browse files Browse the repository at this point in the history

ORC-676: Add getRawDataSizeFromColIndices back to ReaderImpl

This closes apache#555
Change-Id: I1aa67c7d0e85e4bf9a13e8a3667e3a7bdfe960d0
  • Loading branch information
dongjoon-hyun authored and Dmitriy Fingerman committed Nov 17, 2022
1 parent e311bd9 commit 7e4d514
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
17 changes: 16 additions & 1 deletion java/core/src/java/org/apache/orc/impl/ReaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.security.Key;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;

import org.apache.orc.EncryptionAlgorithm;
Expand Down Expand Up @@ -795,6 +794,22 @@ public long getRawDataSizeFromColIndices(List<Integer> colIndices) {
return getRawDataSizeFromColIndices(include, schema, fileStats);
}

/**
 * Variant of {@code getRawDataSizeFromColIndices} that starts from a list of
 * protobuf types instead of an already-built {@link TypeDescription}.
 *
 * <p>The protobuf types are converted to a schema (conversion starts at
 * index 0 of {@code types}), an include mask covering every column id of
 * that schema is allocated, and for each requested column the whole id range
 * {@code [getId(), getMaximumId()]} of the matching subtype is switched on.
 * The result is whatever the {@code boolean[]}-based overload returns for
 * that mask.
 *
 * @param colIndices column ids to select; each one is looked up with
 *                   {@code schema.findSubtype}
 * @param types      protobuf type list the schema is rebuilt from
 * @param stats      column statistics handed unchanged to the
 *                   {@code boolean[]}-based overload
 * @return the value computed by the {@code boolean[]}-based overload
 * @throws FileFormatException declared by this method; presumably raised
 *                             while converting {@code types} (confirm
 *                             against {@code OrcUtils})
 */
public static long getRawDataSizeFromColIndices(
    List<Integer> colIndices,
    List<OrcProto.Type> types,
    List<OrcProto.ColumnStatistics> stats)
    throws FileFormatException {
  final TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);
  final boolean[] include = new boolean[schema.getMaximumId() + 1];

  for (int selectedId : colIndices) {
    final TypeDescription selected = schema.findSubtype(selectedId);
    // Switch on the selected column together with every id up to its
    // maximum id, so the whole subtree range is included.
    int id = selected.getId();
    final int lastId = selected.getMaximumId();
    while (id <= lastId) {
      include[id++] = true;
    }
  }

  return getRawDataSizeFromColIndices(include, schema, stats);
}

static long getRawDataSizeFromColIndices(boolean[] include,
TypeDescription schema,
List<OrcProto.ColumnStatistics> stats) {
Expand Down
25 changes: 25 additions & 0 deletions java/core/src/test/org/apache/orc/impl/TestReaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
Expand All @@ -32,11 +33,13 @@
import org.apache.hadoop.io.Text;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.StripeStatistics;
import org.apache.orc.TestVectorOrcFile;
import org.junit.Test;
import org.apache.orc.TypeDescription;
import org.junit.Before;
import org.junit.Rule;
import org.junit.rules.ExpectedException;
Expand Down Expand Up @@ -191,4 +194,26 @@ public void testOrcTailStripeStats() throws Exception {
assertEquals(-28550000, tsStats.getMaximumUtc());
}
}

/**
 * Checks that the list-of-ids overload of
 * {@code ReaderImpl.getRawDataSizeFromColIndices} agrees with the
 * include-mask overload when every column id of the file schema is selected.
 */
@Test
public void testGetRawDataSizeFromColIndices() throws Exception {
  Configuration conf = new Configuration();
  Path path = new Path(workDir, "orc_split_elim_new.orc");
  FileSystem fs = path.getFileSystem(conf);
  try (ReaderImpl reader = (ReaderImpl) OrcFile.createReader(path,
      OrcFile.readerOptions(conf).filesystem(fs))) {
    TypeDescription schema = reader.getSchema();

    // Select every column id twice: once as a boolean mask, once as a list.
    boolean[] allColumnsMask = new boolean[schema.getMaximumId() + 1];
    List<Integer> allColumnIds = new ArrayList<>(allColumnsMask.length);
    for (int id = 0; id < allColumnsMask.length; ++id) {
      allColumnsMask[id] = true;
      allColumnIds.add(id);
    }

    List<OrcProto.Type> protoTypes = OrcUtils.getOrcTypes(schema);
    List<OrcProto.ColumnStatistics> fileStats =
        reader.getFileTail().getFooter().getStatisticsList();

    long expected =
        ReaderImpl.getRawDataSizeFromColIndices(allColumnsMask, schema, fileStats);
    long actual =
        ReaderImpl.getRawDataSizeFromColIndices(allColumnIds, protoTypes, fileStats);
    assertEquals(expected, actual);
  }
}
}

0 comments on commit 7e4d514

Please sign in to comment.