From 949c7444ef98680026d9f2b93d7299b76cdfb030 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Tue, 26 Nov 2019 09:57:16 -0800 Subject: [PATCH] ORC-27: Add support for proleptic Gregorian calendar for better support of dates before 1600AD. Fixes #455 Signed-off-by: Owen O'Malley --- .../core/src/java/org/apache/orc/OrcConf.java | 11 +- .../core/src/java/org/apache/orc/OrcFile.java | 40 +++- java/core/src/java/org/apache/orc/Reader.java | 5 + .../java/org/apache/orc/StripeStatistics.java | 15 +- .../apache/orc/impl/ColumnStatisticsImpl.java | 61 ++++-- .../java/org/apache/orc/impl/DateUtils.java | 182 +++++++++++++++++ .../java/org/apache/orc/impl/ReaderImpl.java | 31 ++- .../org/apache/orc/impl/RecordReaderImpl.java | 15 +- .../apache/orc/impl/StripeStatisticsImpl.java | 9 +- .../apache/orc/impl/TreeReaderFactory.java | 48 +++++ .../java/org/apache/orc/impl/TypeUtils.java | 4 +- .../java/org/apache/orc/impl/WriterImpl.java | 34 +++- .../impl/reader/ReaderEncryptionVariant.java | 6 +- .../orc/impl/writer/DateTreeWriter.java | 9 + .../orc/impl/writer/TimestampTreeWriter.java | 3 + .../apache/orc/impl/writer/WriterContext.java | 5 + .../apache/orc/TestProlepticConversions.java | 190 ++++++++++++++++++ .../org/apache/orc/TestStringDictionary.java | 5 + .../apache/orc/impl/TestRecordReaderImpl.java | 7 +- java/pom.xml | 2 +- .../java/org/apache/orc/tools/FileDump.java | 12 +- .../org/apache/orc/tools/JsonFileDump.java | 21 +- .../resources/orc-file-dump-bloomfilter.out | 1 + .../resources/orc-file-dump-bloomfilter2.out | 1 + .../orc-file-dump-dictionary-threshold.out | 1 + .../src/test/resources/orc-file-dump.json | 1 + .../src/test/resources/orc-file-dump.out | 1 + .../src/test/resources/orc-file-has-null.out | 1 + proto/orc_proto.proto | 10 + 29 files changed, 670 insertions(+), 61 deletions(-) create mode 100644 java/core/src/java/org/apache/orc/impl/DateUtils.java create mode 100644 java/core/src/test/org/apache/orc/TestProlepticConversions.java diff --git 
a/java/core/src/java/org/apache/orc/OrcConf.java b/java/core/src/java/org/apache/orc/OrcConf.java index 7cca1dbf6b..bf83431c1c 100644 --- a/java/core/src/java/org/apache/orc/OrcConf.java +++ b/java/core/src/java/org/apache/orc/OrcConf.java @@ -165,7 +165,16 @@ public enum OrcConf { Integer.MAX_VALUE - 1024, "When reading stripes >2GB, specify max limit for the chunk size."), ENCRYPTION("orc.encrypt", "orc.encrypt", null, "The list of keys and columns to encrypt with"), DATA_MASK("orc.mask", "orc.mask", null, "The masks to apply to the encrypted columns"), - KEY_PROVIDER("orc.key.provider", "orc.key.provider", "hadoop", "The kind of KeyProvider to use for encryption.") + KEY_PROVIDER("orc.key.provider", "orc.key.provider", "hadoop", "The kind of KeyProvider to use for encryption."), + PROLEPTIC_GREGORIAN("orc.proleptic.gregorian", "orc.proleptic.gregorian", false, + "Should we read and write dates & times using the proleptic Gregorian calendar\n" + + "instead of the hybrid Julian Gregorian? Hive before 3.1 and Spark before 3.0\n" + + "used hybrid."), + PROLEPTIC_GREGORIAN_DEFAULT("orc.proleptic.gregorian.default", + "orc.proleptic.gregorian.default", false, + "This value controls whether pre-ORC 27 files are using the hybrid or proleptic\n" + + "calendar. 
Only Hive 3.1 and the C++ library wrote using the proleptic, so hybrid\n" + + "is the default.") ; private final String attribute; diff --git a/java/core/src/java/org/apache/orc/OrcFile.java b/java/core/src/java/org/apache/orc/OrcFile.java index e1dced0ab6..45249ef400 100644 --- a/java/core/src/java/org/apache/orc/OrcFile.java +++ b/java/core/src/java/org/apache/orc/OrcFile.java @@ -38,7 +38,6 @@ import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.WriterImpl; import org.apache.orc.impl.WriterInternal; -import org.apache.orc.impl.reader.ReaderEncryptionVariant; import org.apache.orc.impl.writer.WriterImplV2; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -283,9 +282,11 @@ public static class ReaderOptions { // For now keeping this around to avoid complex surgery private FileMetadata fileMetadata; private boolean useUTCTimestamp; + private boolean useProlepticGregorian; public ReaderOptions(Configuration conf) { this.conf = conf; + this.useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf); } public ReaderOptions filesystem(FileSystem fs) { @@ -313,6 +314,18 @@ public ReaderOptions setKeyProvider(KeyProvider provider) { return this; } + /** + * Should the reader convert dates and times to the proleptic Gregorian + * calendar? + * @param newValue should it use the proleptic Gregorian calendar? 
+ * @return this + */ + public ReaderOptions convertToProlepticGregorian(boolean newValue) { + this.useProlepticGregorian = newValue; + return this; + } + + public Configuration getConfiguration() { return conf; } @@ -354,6 +367,9 @@ public boolean getUseUTCTimestamp() { return useUTCTimestamp; } + public boolean getConvertToProlepticGregorian() { + return useProlepticGregorian; + } } public static ReaderOptions readerOptions(Configuration conf) { @@ -434,6 +450,7 @@ public static class WriterOptions implements Cloneable { private String encryption; private String masks; private KeyProvider provider; + private boolean useProlepticGregorian; private Map keyOverrides = new HashMap<>(); protected WriterOptions(Properties tableProperties, Configuration conf) { @@ -479,6 +496,7 @@ protected WriterOptions(Properties tableProperties, Configuration conf) { OrcConf.WRITE_VARIABLE_LENGTH_BLOCKS.getBoolean(tableProperties,conf); directEncodingColumns = OrcConf.DIRECT_ENCODING_COLUMNS.getString( tableProperties, conf); + useProlepticGregorian = OrcConf.PROLEPTIC_GREGORIAN.getBoolean(conf); } /** @@ -800,6 +818,17 @@ public WriterOptions setKeyProvider(KeyProvider provider) { return this; } + /** + * Should the writer use the proleptic Gregorian calendar for + * times and dates. 
+ * @param newValue true if we should use the proleptic calendar + * @return this + */ + public WriterOptions setProlepticGregorian(boolean newValue) { + this.useProlepticGregorian = newValue; + return this; + } + public KeyProvider getKeyProvider() { return provider; } @@ -919,6 +948,10 @@ public String getMasks() { public Map getKeyOverrides() { return keyOverrides; } + + public boolean getProlepticGregorian() { + return useProlepticGregorian; + } } /** @@ -1129,6 +1162,11 @@ static boolean readerIsCompatible(Reader firstReader, LOG.info("Can't merge {} because it has different encryption variants", path); return false; } + if (firstReader.writerUsedProlepticGregorian() != + reader.writerUsedProlepticGregorian()) { + LOG.info("Can't merge {} because it uses a different calendar", path); + return false; + } return true; } diff --git a/java/core/src/java/org/apache/orc/Reader.java b/java/core/src/java/org/apache/orc/Reader.java index 4fbffdbf8d..9be6f90abd 100644 --- a/java/core/src/java/org/apache/orc/Reader.java +++ b/java/core/src/java/org/apache/orc/Reader.java @@ -495,4 +495,9 @@ public boolean getTolerateMissingSchema() { * @return Serialized file metadata read from disk for the purposes of caching, etc. */ ByteBuffer getSerializedFileFooter(); + + /** + * Was the file written using the proleptic Gregorian calendar. 
+ */ + boolean writerUsedProlepticGregorian(); } diff --git a/java/core/src/java/org/apache/orc/StripeStatistics.java b/java/core/src/java/org/apache/orc/StripeStatistics.java index f2f26ad4e8..d02bc3357e 100644 --- a/java/core/src/java/org/apache/orc/StripeStatistics.java +++ b/java/core/src/java/org/apache/orc/StripeStatistics.java @@ -19,6 +19,7 @@ package org.apache.orc; import org.apache.orc.impl.ColumnStatisticsImpl; +import org.apache.orc.impl.ReaderImpl; import java.util.List; @@ -28,15 +29,19 @@ public class StripeStatistics { protected final List cs; protected final TypeDescription schema; + private final ReaderImpl reader; - public StripeStatistics(List list) { - this(null, list); + public StripeStatistics(List list, + ReaderImpl reader) { + this(null, list, reader); } public StripeStatistics(TypeDescription schema, - List list) { + List list, + ReaderImpl reader) { this.schema = schema; this.cs = list; + this.reader = reader; } private int getBase() { @@ -53,7 +58,9 @@ public ColumnStatistics[] getColumnStatistics() { int base = getBase(); for (int c = 0; c < result.length; ++c) { TypeDescription column = schema == null ? null : schema.findSubtype(base + c); - result[c] = ColumnStatisticsImpl.deserialize(column, cs.get(c)); + result[c] = reader == null + ? 
ColumnStatisticsImpl.deserialize(column, cs.get(c)) + : ColumnStatisticsImpl.deserialize(column, cs.get(c), reader); } return result; } diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index b73cafe4cf..ad58bdd214 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -1479,15 +1479,19 @@ private static final class DateStatisticsImpl extends ColumnStatisticsImpl DateStatisticsImpl() { } - DateStatisticsImpl(OrcProto.ColumnStatistics stats) { + DateStatisticsImpl(OrcProto.ColumnStatistics stats, + boolean writerUsedProlepticGregorian, + boolean convertToProlepticGregorian) { super(stats); OrcProto.DateStatistics dateStats = stats.getDateStatistics(); // min,max values serialized/deserialized as int (days since epoch) if (dateStats.hasMaximum()) { - maximum = dateStats.getMaximum(); + maximum = DateUtils.convertDate(dateStats.getMaximum(), + writerUsedProlepticGregorian, convertToProlepticGregorian); } if (dateStats.hasMinimum()) { - minimum = dateStats.getMinimum(); + minimum = DateUtils.convertDate(dateStats.getMinimum(), + writerUsedProlepticGregorian, convertToProlepticGregorian); } } @@ -1640,23 +1644,31 @@ private static class TimestampStatisticsImpl extends ColumnStatisticsImpl TimestampStatisticsImpl() { } - TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) { + TimestampStatisticsImpl(OrcProto.ColumnStatistics stats, + boolean writerUsedProlepticGregorian, + boolean convertToProlepticGregorian) { super(stats); OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics(); // min,max values serialized/deserialized as int (milliseconds since epoch) if (timestampStats.hasMaximum()) { - maximum = SerializationUtils.convertToUtc(TimeZone.getDefault(), - timestampStats.getMaximum()); + maximum = DateUtils.convertTime( + 
SerializationUtils.convertToUtc(TimeZone.getDefault(), + timestampStats.getMaximum()), + writerUsedProlepticGregorian, convertToProlepticGregorian); } if (timestampStats.hasMinimum()) { - minimum = SerializationUtils.convertToUtc(TimeZone.getDefault(), - timestampStats.getMinimum()); + minimum = DateUtils.convertTime( + SerializationUtils.convertToUtc(TimeZone.getDefault(), + timestampStats.getMinimum()), + writerUsedProlepticGregorian, convertToProlepticGregorian); } if (timestampStats.hasMaximumUtc()) { - maximum = timestampStats.getMaximumUtc(); + maximum = DateUtils.convertTime(timestampStats.getMaximumUtc(), + writerUsedProlepticGregorian, convertToProlepticGregorian); } if (timestampStats.hasMinimumUtc()) { - minimum = timestampStats.getMinimumUtc(); + minimum = DateUtils.convertTime(timestampStats.getMinimumUtc(), + writerUsedProlepticGregorian, convertToProlepticGregorian); } } @@ -1795,8 +1807,10 @@ private static final class TimestampInstantStatisticsImpl extends TimestampStati TimestampInstantStatisticsImpl() { } - TimestampInstantStatisticsImpl(OrcProto.ColumnStatistics stats) { - super(stats); + TimestampInstantStatisticsImpl(OrcProto.ColumnStatistics stats, + boolean writerUsedProlepticGregorian, + boolean convertToProlepticGregorian) { + super(stats, writerUsedProlepticGregorian, convertToProlepticGregorian); } @Override @@ -2008,6 +2022,20 @@ public static ColumnStatisticsImpl create(TypeDescription schema) { public static ColumnStatisticsImpl deserialize(TypeDescription schema, OrcProto.ColumnStatistics stats) { + return deserialize(schema, stats, false, false); + } + + public static ColumnStatisticsImpl deserialize(TypeDescription schema, + OrcProto.ColumnStatistics stats, + ReaderImpl reader) { + return deserialize(schema, stats, reader.writerUsedProlepticGregorian(), + reader.options.getConvertToProlepticGregorian()); + } + + public static ColumnStatisticsImpl deserialize(TypeDescription schema, + OrcProto.ColumnStatistics stats, + boolean 
writerUsedProlepticGregorian, + boolean convertToProlepticGregorian) { if (stats.hasBucketStatistics()) { return new BooleanStatisticsImpl(stats); } else if (stats.hasIntStatistics()) { @@ -2026,12 +2054,15 @@ public static ColumnStatisticsImpl deserialize(TypeDescription schema, return new DecimalStatisticsImpl(stats); } } else if (stats.hasDateStatistics()) { - return new DateStatisticsImpl(stats); + return new DateStatisticsImpl(stats, writerUsedProlepticGregorian, + convertToProlepticGregorian); } else if (stats.hasTimestampStatistics()) { return schema == null || schema.getCategory() == TypeDescription.Category.TIMESTAMP ? - new TimestampStatisticsImpl(stats) : - new TimestampInstantStatisticsImpl(stats); + new TimestampStatisticsImpl(stats, + writerUsedProlepticGregorian, convertToProlepticGregorian) : + new TimestampInstantStatisticsImpl(stats, + writerUsedProlepticGregorian, convertToProlepticGregorian); } else if(stats.hasBinaryStatistics()) { return new BinaryStatisticsImpl(stats); } else { diff --git a/java/core/src/java/org/apache/orc/impl/DateUtils.java b/java/core/src/java/org/apache/orc/impl/DateUtils.java new file mode 100644 index 0000000000..8ac574c6c8 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/DateUtils.java @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.TimeZone;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Conversion utilities from the hybrid Julian/Gregorian calendar to/from the
+ * proleptic Gregorian.
+ *
+ * The semantics here are to hold the string representation constant and change
+ * the epoch offset rather than holding the instant in time constant and change
+ * the string representation.
+ *
+ * These utilities will be fast for the common case (> 1582 AD), but slow for
+ * old dates, because every value before the switch over is formatted to a
+ * string with one calendar and parsed back with the other.
+ *
+ * All conversions are done in UTC. Each thread gets its own formatters and
+ * each formatter gets its own calendar, so the static methods may be called
+ * concurrently.
+ *
+ * NOTE(review): the patterns carry no era field, so dates before 1 AD
+ * presumably do not round trip -- confirm before relying on BC values.
+ */
+public class DateUtils {
+  /**
+   * Build a formatter for the given pattern that uses a private copy of the
+   * given calendar.
+   *
+   * SimpleDateFormat mutates its calendar on every format and parse call, so
+   * handing the shared template calendar to several thread-local formatters
+   * would reintroduce the data race the ThreadLocal is meant to prevent.
+   * @param fmt the SimpleDateFormat pattern
+   * @param calendar the template calendar, which is cloned and never modified
+   * @return a new formatter that is safe to use from a single thread
+   */
+  private static SimpleDateFormat createFormatter(String fmt,
+                                                  GregorianCalendar calendar) {
+    SimpleDateFormat result = new SimpleDateFormat(fmt);
+    result.setCalendar((GregorianCalendar) calendar.clone());
+    return result;
+  }
+
+  private static final String DATE = "yyyy-MM-dd";
+  // Keep the milliseconds, otherwise converted timestamps are silently
+  // truncated to whole seconds.
+  private static final String TIME = DATE + " HH:mm:ss.SSS";
+  private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+  // Template calendar with the default Julian to Gregorian cut over.
+  private static final GregorianCalendar HYBRID = new GregorianCalendar();
+  private static final ThreadLocal<SimpleDateFormat> HYBRID_DATE_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(DATE, HYBRID));
+  private static final ThreadLocal<SimpleDateFormat> HYBRID_TIME_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(TIME, HYBRID));
+  private static final long SWITCHOVER_MILLIS;
+  private static final long SWITCHOVER_DAYS;
+  // Template calendar that applies the Gregorian rules to all of time.
+  private static final GregorianCalendar PROLEPTIC = new GregorianCalendar();
+  private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_DATE_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(DATE, PROLEPTIC));
+  private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_TIME_FORMAT =
+      ThreadLocal.withInitial(() -> createFormatter(TIME, PROLEPTIC));
+
+  static {
+    // The templates must be fully configured before the first formatter is
+    // created, because createFormatter copies them.
+    HYBRID.setTimeZone(UTC);
+    PROLEPTIC.setTimeZone(UTC);
+    PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE));
+
+    // Get the first day from which the two calendars agree with each other.
+    // Only values strictly before it need to be converted.
+    try {
+      SWITCHOVER_MILLIS = HYBRID_DATE_FORMAT.get().parse("1582-10-15").getTime();
+      SWITCHOVER_DAYS = TimeUnit.MILLISECONDS.toDays(SWITCHOVER_MILLIS);
+    } catch (ParseException e) {
+      throw new IllegalArgumentException("Can't parse switch over date", e);
+    }
+  }
+
+  /**
+   * Convert an epoch day from the hybrid Julian/Gregorian calendar to the
+   * proleptic Gregorian.
+   * @param hybrid day of epoch in the hybrid Julian/Gregorian
+   * @return day of epoch in the proleptic Gregorian
+   * @throws IllegalArgumentException if the formatted date can't be parsed
+   */
+  public static int convertDateToProleptic(int hybrid) {
+    int proleptic = hybrid;
+    if (hybrid < SWITCHOVER_DAYS) {
+      String dateStr = HYBRID_DATE_FORMAT.get().format(
+          new Date(TimeUnit.DAYS.toMillis(hybrid)));
+      try {
+        proleptic = (int) TimeUnit.MILLISECONDS.toDays(
+            PROLEPTIC_DATE_FORMAT.get().parse(dateStr).getTime());
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return proleptic;
+  }
+
+  /**
+   * Convert an epoch day from the proleptic Gregorian calendar to the hybrid
+   * Julian/Gregorian.
+   * @param proleptic day of epoch in the proleptic Gregorian
+   * @return day of epoch in the hybrid Julian/Gregorian
+   * @throws IllegalArgumentException if the formatted date can't be parsed
+   */
+  public static int convertDateToHybrid(int proleptic) {
+    int hybrid = proleptic;
+    if (proleptic < SWITCHOVER_DAYS) {
+      String dateStr = PROLEPTIC_DATE_FORMAT.get().format(
+          new Date(TimeUnit.DAYS.toMillis(proleptic)));
+      try {
+        hybrid = (int) TimeUnit.MILLISECONDS.toDays(
+            HYBRID_DATE_FORMAT.get().parse(dateStr).getTime());
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return hybrid;
+  }
+
+  /**
+   * Convert an epoch day between the two calendars.
+   * @param original day of epoch in the source calendar
+   * @param fromProleptic true if original uses the proleptic Gregorian
+   *                      calendar, false if it uses the hybrid calendar
+   * @param toProleptic true if the result should use the proleptic Gregorian
+   *                    calendar, false if it should use the hybrid calendar
+   * @return day of epoch in the target calendar, which is original unchanged
+   *         when both flags are equal
+   */
+  public static int convertDate(int original,
+                                boolean fromProleptic,
+                                boolean toProleptic) {
+    if (fromProleptic != toProleptic) {
+      return toProleptic
+          ? convertDateToProleptic(original)
+          : convertDateToHybrid(original);
+    } else {
+      return original;
+    }
+  }
+
+  /**
+   * Convert epoch millis between the two calendars.
+   * @param original millis of epoch in the source calendar
+   * @param fromProleptic true if original uses the proleptic Gregorian
+   *                      calendar, false if it uses the hybrid calendar
+   * @param toProleptic true if the result should use the proleptic Gregorian
+   *                    calendar, false if it should use the hybrid calendar
+   * @return millis of epoch in the target calendar, which is original
+   *         unchanged when both flags are equal
+   */
+  public static long convertTime(long original,
+                                 boolean fromProleptic,
+                                 boolean toProleptic) {
+    if (fromProleptic != toProleptic) {
+      return toProleptic
+          ? convertTimeToProleptic(original)
+          : convertTimeToHybrid(original);
+    } else {
+      return original;
+    }
+  }
+
+  /**
+   * Convert epoch millis from the hybrid Julian/Gregorian calendar to the
+   * proleptic Gregorian. The millisecond part of the value is preserved.
+   * @param hybrid millis of epoch in the hybrid Julian/Gregorian
+   * @return millis of epoch in the proleptic Gregorian
+   * @throws IllegalArgumentException if the formatted time can't be parsed
+   */
+  public static long convertTimeToProleptic(long hybrid) {
+    long proleptic = hybrid;
+    if (hybrid < SWITCHOVER_MILLIS) {
+      String dateStr = HYBRID_TIME_FORMAT.get().format(new Date(hybrid));
+      try {
+        proleptic = PROLEPTIC_TIME_FORMAT.get().parse(dateStr).getTime();
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return proleptic;
+  }
+
+  /**
+   * Convert epoch millis from the proleptic Gregorian calendar to the hybrid
+   * Julian/Gregorian. The millisecond part of the value is preserved.
+   * @param proleptic millis of epoch in the proleptic Gregorian
+   * @return millis of epoch in the hybrid Julian/Gregorian
+   * @throws IllegalArgumentException if the formatted time can't be parsed
+   */
+  public static long convertTimeToHybrid(long proleptic) {
+    long hybrid = proleptic;
+    if (proleptic < SWITCHOVER_MILLIS) {
+      String dateStr = PROLEPTIC_TIME_FORMAT.get().format(new Date(proleptic));
+      try {
+        hybrid = HYBRID_TIME_FORMAT.get().parse(dateStr).getTime();
+      } catch (ParseException e) {
+        throw new IllegalArgumentException("Can't parse " + dateStr, e);
+      }
+    }
+    return hybrid;
+  }
+
+  // Static utility class; never instantiated.
+  private DateUtils() {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
index a4c73c7455..3017a33df7 100644
--- a/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ReaderImpl.java
@@ -31,6 +31,7 @@
 import org.apache.orc.DataMaskDescription;
 import org.apache.orc.EncryptionVariant;
 import org.apache.orc.FileMetadata;
+import org.apache.orc.OrcConf;
 import org.apache.orc.OrcFile;
 import org.apache.orc.OrcUtils;
 import org.apache.orc.Reader;
@@ -299,7 +300,8 @@ public List getVariantStripeStatistics(EncryptionVariant varia
         if (codec != null) {
           compression.withCodec(codec).withBufferSize(bufferSize);
         }
-        return ((ReaderEncryptionVariant) variant).getStripeStatistics(null, compression);
+        return ((ReaderEncryptionVariant) variant).getStripeStatistics(null,
+            compression, this);
       }
     }
   }
@@ -347,10 +349,10 @@ public ColumnStatistics[] getStatistics() {
     return result;
   }
 
-  private static ColumnStatistics[] decryptFileStats(ReaderEncryptionVariant encryption,
-                                                     InStream.StreamOptions compression,
-                                                     OrcProto.Footer footer
-                                                     ) throws IOException {
+  private ColumnStatistics[] decryptFileStats(ReaderEncryptionVariant encryption,
+                                              InStream.StreamOptions compression,
+                                              OrcProto.Footer footer
+                                              ) throws IOException {
     Key key = encryption.getFileFooterKey();
     if (key == null) {
       return null;
@@ -373,19 +375,20 @@ private static ColumnStatistics[] decryptFileStats(ReaderEncryptionVariant encry TypeDescription root = encryption.getRoot(); for(int i= 0; i < result.length; ++i){ result[i] = ColumnStatisticsImpl.deserialize(root.findSubtype(root.getId() + i), - decrypted.getColumn(i)); + decrypted.getColumn(i), this); } return result; } } - public static ColumnStatistics[] deserializeStats( + public ColumnStatistics[] deserializeStats( TypeDescription schema, List fileStats) { ColumnStatistics[] result = new ColumnStatistics[fileStats.size()]; for(int i=0; i < result.length; ++i) { TypeDescription subschema = schema == null ? null : schema.findSubtype(i); - result[i] = ColumnStatisticsImpl.deserialize(subschema, fileStats.get(i)); + result[i] = ColumnStatisticsImpl.deserialize(subschema, fileStats.get(i), + this); } return result; } @@ -736,6 +739,14 @@ public ByteBuffer getSerializedFileFooter() { return tail.getSerializedTail(); } + @Override + public boolean writerUsedProlepticGregorian() { + OrcProto.Footer footer = tail.getFooter(); + return footer.hasCalendar() + ? 
footer.getCalendar() == OrcProto.CalendarKind.PROLEPTIC_GREGORIAN + : OrcConf.PROLEPTIC_GREGORIAN_DEFAULT.getBoolean(conf); + } + @Override public Options options() { return new Options(conf); @@ -894,7 +905,7 @@ private List convertFromProto(List List result = new ArrayList<>(list.size()); for (OrcProto.StripeStatistics ss : stripeStatistics) { result.add(new StripeStatisticsImpl(schema, - new ArrayList<>(ss.getColStatsList()))); + new ArrayList<>(ss.getColStatsList()), this)); } return result; } @@ -926,7 +937,7 @@ public List getStripeStatistics(boolean[] included) throws IOE if (variant != null) { TypeDescription variantType = variant.getRoot(); List colStats = - variant.getStripeStatistics(included, options); + variant.getStripeStatistics(included, options, this); for(int sub = c; sub <= variantType.getMaximumId(); ++sub) { if (included == null || included[sub]) { for(int s = 0; s < colStats.size(); ++s) { diff --git a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java index de55c855f9..f3b2079d43 100644 --- a/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java +++ b/java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java @@ -32,7 +32,6 @@ import org.apache.orc.CollectionColumnStatistics; import org.apache.orc.ColumnStatistics; import org.apache.orc.CompressionCodec; -import org.apache.orc.CompressionKind; import org.apache.orc.DataReader; import org.apache.orc.DateColumnStatistics; import org.apache.orc.DecimalColumnStatistics; @@ -166,7 +165,9 @@ protected RecordReaderImpl(ReaderImpl fileReader, rowIndexStride, evolution, writerVersion, - fileReader.useUTCTimestamp); + fileReader.useUTCTimestamp, + fileReader.writerUsedProlepticGregorian(), + fileReader.options.getConvertToProlepticGregorian()); } else { sargApp = null; } @@ -223,6 +224,8 @@ protected RecordReaderImpl(ReaderImpl fileReader, .skipCorrupt(skipCorrupt) .fileFormat(fileReader.getFileVersion()) 
.useUTCTimestamp(fileReader.useUTCTimestamp) + .setProlepticGregorian(fileReader.writerUsedProlepticGregorian(), + fileReader.options.getConvertToProlepticGregorian()) .setEncryption(encryption); reader = TreeReaderFactory.createTreeReader(evolution.getReaderSchema(), readerContext); @@ -868,15 +871,21 @@ public static class SargApplier { private SchemaEvolution evolution; private final long[] exceptionCount; private final boolean useUTCTimestamp; + private final boolean writerUsedProlepticGregorian; + private final boolean convertToProlepticGregorian; public SargApplier(SearchArgument sarg, long rowIndexStride, SchemaEvolution evolution, OrcFile.WriterVersion writerVersion, - boolean useUTCTimestamp) { + boolean useUTCTimestamp, + boolean writerUsedProlepticGregorian, + boolean convertToProlepticGregorian) { this.writerVersion = writerVersion; this.sarg = sarg; sargLeaves = sarg.getLeaves(); + this.writerUsedProlepticGregorian = writerUsedProlepticGregorian; + this.convertToProlepticGregorian = convertToProlepticGregorian; filterColumns = mapSargColumnsToOrcInternalColIdx(sargLeaves, evolution); this.rowIndexStride = rowIndexStride; diff --git a/java/core/src/java/org/apache/orc/impl/StripeStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/StripeStatisticsImpl.java index f4a83cdfbd..50557fbf87 100644 --- a/java/core/src/java/org/apache/orc/impl/StripeStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/StripeStatisticsImpl.java @@ -27,12 +27,13 @@ public class StripeStatisticsImpl extends StripeStatistics { public StripeStatisticsImpl(TypeDescription schema, - List list) { - super(schema, list); + List list, + ReaderImpl reader) { + super(schema, list, reader); } - public StripeStatisticsImpl(TypeDescription schema) { - super(schema, createList(schema)); + public StripeStatisticsImpl(TypeDescription schema, ReaderImpl reader) { + super(schema, createList(schema), reader); } /** diff --git 
a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index e2bf1f4f8c..3f66eaf1c6 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; @@ -65,6 +66,10 @@ public interface Context { OrcFile.Version getFileFormat(); ReaderEncryption getEncryption(); + + boolean useProlepticGregorian(); + + boolean fileUsedProlepticGregorian(); } public static class ReaderContext implements Context { @@ -74,6 +79,8 @@ public static class ReaderContext implements Context { private String writerTimezone; private OrcFile.Version fileFormat; private ReaderEncryption encryption; + private boolean useProlepticGregorian; + private boolean fileUsedProlepticGregorian; public ReaderContext setSchemaEvolution(SchemaEvolution evolution) { this.evolution = evolution; @@ -105,6 +112,13 @@ public ReaderContext fileFormat(OrcFile.Version version) { return this; } + public ReaderContext setProlepticGregorian(boolean file, + boolean reader) { + this.useProlepticGregorian = reader; + this.fileUsedProlepticGregorian = file; + return this; + } + @Override public SchemaEvolution getSchemaEvolution() { return evolution; @@ -134,6 +148,16 @@ public OrcFile.Version getFileFormat() { public ReaderEncryption getEncryption() { return encryption; } + + @Override + public boolean useProlepticGregorian() { + return useProlepticGregorian; + } + + @Override + public boolean fileUsedProlepticGregorian() { 
+ return fileUsedProlepticGregorian; + } } public abstract static class TreeReader { @@ -897,6 +921,8 @@ public static class TimestampTreeReader extends TreeReader { private TimeZone writerTimeZone; private boolean hasSameTZRules; private ThreadLocal threadLocalDateFormat; + private final boolean useProleptic; + private final boolean fileUsesProleptic; TimestampTreeReader(int columnId, Context context, boolean instantType) throws IOException { @@ -938,6 +964,8 @@ protected TimestampTreeReader(int columnId, InStream presentStream, InStream dat this.nanos = createIntegerReader(encoding.getKind(), nanosStream, false, context); } } + fileUsesProleptic = context.fileUsedProlepticGregorian(); + useProleptic = context.useProlepticGregorian(); } @Override @@ -1011,6 +1039,7 @@ public void nextVector(ColumnVector previousVector, boolean[] isNull, final int batchSize) throws IOException { TimestampColumnVector result = (TimestampColumnVector) previousVector; + result.changeCalendar(fileUsesProleptic, false); super.nextVector(previousVector, isNull, batchSize); result.setIsUTC(context.getUseUTCTimestamp()); @@ -1039,6 +1068,7 @@ public void nextVector(ColumnVector previousVector, } } } + result.changeCalendar(useProleptic, true); } private static int parseNanos(long serialized) { @@ -1062,6 +1092,9 @@ void skipRows(long items) throws IOException { public static class DateTreeReader extends TreeReader { protected IntegerReader reader = null; + private final boolean needsDateColumnVector; + private final boolean useProleptic; + private final boolean fileUsesProleptic; DateTreeReader(int columnId, Context context) throws IOException { this(columnId, null, null, null, context); @@ -1070,6 +1103,10 @@ public static class DateTreeReader extends TreeReader { protected DateTreeReader(int columnId, InStream present, InStream data, OrcProto.ColumnEncoding encoding, Context context) throws IOException { super(columnId, present, context); + useProleptic = context.useProlepticGregorian(); 
+ fileUsesProleptic = context.fileUsedProlepticGregorian(); + // if either side is proleptic, we need a DateColumnVector + needsDateColumnVector = useProleptic || fileUsesProleptic; if (data != null && encoding != null) { checkEncoding(encoding); reader = createIntegerReader(encoding.getKind(), data, true, context); @@ -1110,12 +1147,23 @@ public void nextVector(ColumnVector previousVector, boolean[] isNull, final int batchSize) throws IOException { final LongColumnVector result = (LongColumnVector) previousVector; + if (needsDateColumnVector) { + if (result instanceof DateColumnVector) { + ((DateColumnVector) result).changeCalendar(fileUsesProleptic, false); + } else { + throw new IllegalArgumentException("Can't use LongColumnVector to " + + "read proleptic Gregorian dates."); + } + } // Read present/isNull stream super.nextVector(result, isNull, batchSize); // Read value entries based on isNull entries reader.nextVector(result, result.vector, batchSize); + if (needsDateColumnVector) { + ((DateColumnVector) result).changeCalendar(useProleptic, true); + } } @Override diff --git a/java/core/src/java/org/apache/orc/impl/TypeUtils.java b/java/core/src/java/org/apache/orc/impl/TypeUtils.java index 32d58ef157..0a1e9c3441 100644 --- a/java/core/src/java/org/apache/orc/impl/TypeUtils.java +++ b/java/core/src/java/org/apache/orc/impl/TypeUtils.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; @@ -45,8 +46,9 @@ public static ColumnVector createColumn(TypeDescription schema, case SHORT: case INT: case LONG: - case DATE: return new LongColumnVector(maxSize); + case DATE: + return new DateColumnVector(maxSize); case 
TIMESTAMP: case TIMESTAMP_INSTANT: return new TimestampColumnVector(maxSize); diff --git a/java/core/src/java/org/apache/orc/impl/WriterImpl.java b/java/core/src/java/org/apache/orc/impl/WriterImpl.java index 31a401ac63..3d9f3805d1 100644 --- a/java/core/src/java/org/apache/orc/impl/WriterImpl.java +++ b/java/core/src/java/org/apache/orc/impl/WriterImpl.java @@ -139,6 +139,7 @@ public class WriterImpl implements WriterInternal, MemoryManager.Callback { // do we need to include the current encryption keys in the next stripe // information private boolean needKeyFlush; + private final boolean useProlepticGregorian; public WriterImpl(FileSystem fs, Path path, @@ -147,6 +148,7 @@ public WriterImpl(FileSystem fs, this.conf = opts.getConfiguration(); // clone it so that we can annotate it with encryption this.schema = opts.getSchema().clone(); + useProlepticGregorian = opts.getProlepticGregorian(); int numColumns = schema.getMaximumId() + 1; if (!opts.isEnforceBufferSize()) { opts.bufferSize(getEstimatedBufferSize(opts.getStripeSize(), numColumns, @@ -468,6 +470,11 @@ public boolean getUseUTCTimestamp() { public double getDictionaryKeySizeThreshold(int columnId) { return directEncodingColumns[columnId] ? 0.0 : dictionaryKeySizeThreshold; } + + @Override + public boolean getProlepticGregorian() { + return useProlepticGregorian; + } } @@ -633,6 +640,11 @@ private long writeFooter() throws IOException { rawDataSize = computeRawDataSize(); // serialize the types writeTypes(builder, schema); + if (hasDateOrTime(schema)) { + builder.setCalendar(useProlepticGregorian + ? 
OrcProto.CalendarKind.PROLEPTIC_GREGORIAN + : OrcProto.CalendarKind.JULIAN_GREGORIAN); + } // add the stripe information for(OrcProto.StripeInformation stripe: stripes) { builder.addStripes(stripe); @@ -766,7 +778,8 @@ public void appendStripe(byte[] stripe, int offset, int length, ) throws IOException { appendStripe(stripe, offset, length, stripeInfo, new StripeStatistics[]{ - new StripeStatisticsImpl(schema, stripeStatistics.getColStatsList())}); + new StripeStatisticsImpl(schema, stripeStatistics.getColStatsList(), + null)}); } @Override @@ -853,6 +866,25 @@ private static boolean hasTimestamp(TypeDescription schema) { return false; } + private static boolean hasDateOrTime(TypeDescription schema) { + switch (schema.getCategory()) { + case TIMESTAMP: + case TIMESTAMP_INSTANT: + case DATE: + return true; + default: + } + List children = schema.getChildren(); + if (children != null) { + for(TypeDescription child: children) { + if (hasDateOrTime(child)) { + return true; + } + } + } + return false; + } + private WriterEncryptionKey getKey(String keyName, KeyProvider provider) throws IOException { WriterEncryptionKey result = keys.get(keyName); diff --git a/java/core/src/java/org/apache/orc/impl/reader/ReaderEncryptionVariant.java b/java/core/src/java/org/apache/orc/impl/reader/ReaderEncryptionVariant.java index 182de63bf2..97fbd0468a 100644 --- a/java/core/src/java/org/apache/orc/impl/reader/ReaderEncryptionVariant.java +++ b/java/core/src/java/org/apache/orc/impl/reader/ReaderEncryptionVariant.java @@ -31,6 +31,7 @@ import org.apache.orc.impl.KeyProvider; import org.apache.orc.impl.BufferChunk; import org.apache.orc.impl.LocalKey; +import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.StripeStatisticsImpl; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -230,11 +231,12 @@ public long getStripeStatisticsLength() { * @return the stripe statistics for this variant. 
*/ public List getStripeStatistics(boolean[] columns, - InStream.StreamOptions compression + InStream.StreamOptions compression, + ReaderImpl reader ) throws IOException { StripeStatisticsImpl[] result = new StripeStatisticsImpl[stripeCount]; for(int s=0; s < result.length; ++s) { - result[s] = new StripeStatisticsImpl(column); + result[s] = new StripeStatisticsImpl(column, reader); } // create the objects long offset = stripeStatsOffset; diff --git a/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java index bc81d456a0..cc2bdb315c 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/DateTreeWriter.java @@ -19,6 +19,7 @@ package org.apache.orc.impl.writer; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.orc.OrcProto; @@ -34,6 +35,7 @@ public class DateTreeWriter extends TreeWriterBase { private final IntegerWriter writer; private final boolean isDirectV2; + private final boolean useProleptic; public DateTreeWriter(TypeDescription schema, WriterEncryptionVariant encryption, @@ -46,6 +48,7 @@ public DateTreeWriter(TypeDescription schema, if (rowIndexPosition != null) { recordPosition(rowIndexPosition); } + useProleptic = writer.getProlepticGregorian(); } @Override @@ -53,6 +56,12 @@ public void writeBatch(ColumnVector vector, int offset, int length) throws IOException { super.writeBatch(vector, offset, length); LongColumnVector vec = (LongColumnVector) vector; + if (vector instanceof DateColumnVector) { + ((DateColumnVector) vec).changeCalendar(useProleptic, true); + } else if (useProleptic) { + throw new IllegalArgumentException("Can't use LongColumnVector to write" + + " proleptic dates"); + } if 
(vector.isRepeating) { if (vector.noNulls || !vector.isNull[0]) { int value = (int) vec.vector[0]; diff --git a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java index 00313b57ef..e23413f9b6 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/TimestampTreeWriter.java @@ -47,6 +47,7 @@ public class TimestampTreeWriter extends TreeWriterBase { private final boolean alwaysUTC; private final TimeZone localTimezone; private final long epoch; + private final boolean useProleptic; public TimestampTreeWriter(TypeDescription schema, WriterEncryptionVariant encryption, @@ -81,6 +82,7 @@ public TimestampTreeWriter(TypeDescription schema, } catch (ParseException e) { throw new IOException("Unable to create base timestamp tree writer", e); } + useProleptic = writer.getProlepticGregorian(); } @Override @@ -96,6 +98,7 @@ public void writeBatch(ColumnVector vector, int offset, int length) throws IOException { super.writeBatch(vector, offset, length); TimestampColumnVector vec = (TimestampColumnVector) vector; + vec.changeCalendar(useProleptic, true); if (vector.isRepeating) { if (vector.noNulls || !vector.isNull[0]) { // ignore the bottom three digits from the vec.time field diff --git a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java index 73542ad976..ebb1ebbdf8 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java +++ b/java/core/src/java/org/apache/orc/impl/writer/WriterContext.java @@ -144,4 +144,9 @@ void writeStatistics(StreamName name, */ double getDictionaryKeySizeThreshold(int columnId); + /** + * Should we write the data using the proleptic Gregorian calendar? 
+ * @return true if we should use the proleptic Gregorian calendar + */ + boolean getProlepticGregorian(); } diff --git a/java/core/src/test/org/apache/orc/TestProlepticConversions.java b/java/core/src/test/org/apache/orc/TestProlepticConversions.java new file mode 100644 index 0000000000..fa1719b9c0 --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestProlepticConversions.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.File; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.GregorianCalendar; +import java.util.List; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class TestProlepticConversions { + + @Parameterized.Parameter + public boolean writerProlepticGregorian; + + @Parameterized.Parameter(1) + public boolean readerProlepticGregorian; + + @Parameterized.Parameters + public static Collection getParameters() { + List result = new ArrayList<>(); + final boolean[] BOOLEANS = new boolean[]{false, true}; + for(Boolean writer: BOOLEANS) { + for (Boolean reader: BOOLEANS) { + result.add(new Object[]{writer, reader}); + } + } + return result; + } + + private Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + + private final Configuration conf; + private final TimeZone UTC = TimeZone.getTimeZone("UTC"); + private final GregorianCalendar PROLEPTIC = new GregorianCalendar(); + private final GregorianCalendar HYBRID = new GregorianCalendar(); + { + conf = new Configuration(); + PROLEPTIC.setTimeZone(UTC); + PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE)); + HYBRID.setTimeZone(UTC); + } + + private FileSystem fs; + private Path 
testFilePath; + + @Rule + public TestName testCaseName = new TestName(); + + @Before + public void setupPath() throws Exception { + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestProlepticConversion." + + testCaseName.getMethodName().replaceFirst("\\[[0-9]+]", "") + ".orc"); + fs.delete(testFilePath, false); + } + + private SimpleDateFormat createParser(String format, GregorianCalendar calendar) { + SimpleDateFormat result = new SimpleDateFormat(format); + result.setCalendar(calendar); + return result; + } + + @Test + public void testReadWrite() throws Exception { + TypeDescription schema = TypeDescription.fromString( + "struct"); + try (Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf) + .setSchema(schema) + .fileSystem(fs) + .setProlepticGregorian(writerProlepticGregorian))) { + VectorizedRowBatch batch = schema.createRowBatchV2(); + batch.size = 1024; + DateColumnVector d = (DateColumnVector) batch.cols[0]; + TimestampColumnVector t = (TimestampColumnVector) batch.cols[1]; + TimestampColumnVector i = (TimestampColumnVector) batch.cols[2]; + d.changeCalendar(writerProlepticGregorian, false); + t.changeCalendar(writerProlepticGregorian, false); + i.changeCalendar(writerProlepticGregorian, false); + GregorianCalendar cal = writerProlepticGregorian ? 
PROLEPTIC : HYBRID; + SimpleDateFormat dateFormat = createParser("yyyy-MM-dd", cal); + SimpleDateFormat timeFormat = createParser("yyyy-MM-dd HH:mm:ss", cal); + for(int r=0; r < batch.size; ++r) { + d.vector[r] = TimeUnit.MILLISECONDS.toDays( + dateFormat.parse(String.format("%04d-01-23", r * 2 + 1)).getTime()); + Date val = timeFormat.parse( + String.format("%04d-03-21 %02d:12:34", 2 * r + 1, r % 24)); + t.time[r] = val.getTime(); + t.nanos[r] = 0; + i.time[r] = val.getTime(); + i.nanos[r] = 0; + } + writer.addRowBatch(batch); + } + try (Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf) + .filesystem(fs) + .convertToProlepticGregorian(readerProlepticGregorian)); + RecordReader rows = reader.rows(reader.options())) { + assertEquals(writerProlepticGregorian, reader.writerUsedProlepticGregorian()); + VectorizedRowBatch batch = reader.getSchema().createRowBatchV2(); + DateColumnVector d = (DateColumnVector) batch.cols[0]; + TimestampColumnVector t = (TimestampColumnVector) batch.cols[1]; + TimestampColumnVector i = (TimestampColumnVector) batch.cols[2]; + GregorianCalendar cal = readerProlepticGregorian ? 
PROLEPTIC : HYBRID; + SimpleDateFormat dateFormat = createParser("yyyy-MM-dd", cal); + SimpleDateFormat timeFormat = createParser("yyyy-MM-dd HH:mm:ss", cal); + + // Check the file statistics + ColumnStatistics[] colStats = reader.getStatistics(); + DateColumnStatistics dStats = (DateColumnStatistics) colStats[1]; + TimestampColumnStatistics tStats = (TimestampColumnStatistics) colStats[2]; + TimestampColumnStatistics iStats = (TimestampColumnStatistics) colStats[3]; + assertEquals("0001-01-23", dateFormat.format(dStats.getMinimum())); + assertEquals("2047-01-23", dateFormat.format(dStats.getMaximum())); + assertEquals("0001-03-21 00:12:34", timeFormat.format(tStats.getMinimum())); + assertEquals("2047-03-21 15:12:34", timeFormat.format(tStats.getMaximum())); + assertEquals("0001-03-21 00:12:34", timeFormat.format(iStats.getMinimum())); + assertEquals("2047-03-21 15:12:34", timeFormat.format(iStats.getMaximum())); + + // Check the stripe stats + List stripeStats = reader.getStripeStatistics(); + assertEquals(1, stripeStats.size()); + colStats = stripeStats.get(0).getColumnStatistics(); + dStats = (DateColumnStatistics) colStats[1]; + tStats = (TimestampColumnStatistics) colStats[2]; + iStats = (TimestampColumnStatistics) colStats[3]; + assertEquals("0001-01-23", dateFormat.format(dStats.getMinimum())); + assertEquals("2047-01-23", dateFormat.format(dStats.getMaximum())); + assertEquals("0001-03-21 00:12:34", timeFormat.format(tStats.getMinimum())); + assertEquals("2047-03-21 15:12:34", timeFormat.format(tStats.getMaximum())); + assertEquals("0001-03-21 00:12:34", timeFormat.format(iStats.getMinimum())); + assertEquals("2047-03-21 15:12:34", timeFormat.format(iStats.getMaximum())); + + // Check the data + assertTrue(rows.nextBatch(batch)); + assertEquals(1024, batch.size); + // Ensure the column vectors are using the right calendar + assertEquals(readerProlepticGregorian, d.isUsingProlepticCalendar()); + assertEquals(readerProlepticGregorian, 
t.usingProlepticCalendar()); + assertEquals(readerProlepticGregorian, i.usingProlepticCalendar()); + for(int r=0; r < batch.size; ++r) { + String expectedD = String.format("%04d-01-23", r * 2 + 1); + String expectedT = String.format("%04d-03-21 %02d:12:34", 2 * r + 1, r % 24); + assertEquals("row " + r, expectedD, dateFormat.format( + new Date(TimeUnit.DAYS.toMillis(d.vector[r])))); + assertEquals("row " + r, expectedT, timeFormat.format(t.asScratchTimestamp(r))); + assertEquals("row " + r, expectedT, timeFormat.format(i.asScratchTimestamp(r))); + } + } + } +} diff --git a/java/core/src/test/org/apache/orc/TestStringDictionary.java b/java/core/src/test/org/apache/orc/TestStringDictionary.java index 9ffe5334cb..ec7d547b15 100644 --- a/java/core/src/test/org/apache/orc/TestStringDictionary.java +++ b/java/core/src/test/org/apache/orc/TestStringDictionary.java @@ -270,6 +270,11 @@ public boolean getUseUTCTimestamp() { public double getDictionaryKeySizeThreshold(int column) { return OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getDouble(conf); } + + @Override + public boolean getProlepticGregorian() { + return false; + } } @Test diff --git a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index 83e43999ee..dc1271574b 100644 --- a/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -2141,7 +2141,7 @@ public void testPickRowGroups() throws Exception { .end().build(); RecordReaderImpl.SargApplier applier = new RecordReaderImpl.SargApplier(sarg, 1000, evolution, - OrcFile.WriterVersion.ORC_135, false); + OrcFile.WriterVersion.ORC_135, false, false, false); OrcProto.StripeInformation stripe = OrcProto.StripeInformation.newBuilder().setNumberOfRows(4000).build(); OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3]; @@ -2190,7 +2190,7 @@ public void testPickRowGroupsError() throws Exception { .end().build(); 
RecordReaderImpl.SargApplier applier = new RecordReaderImpl.SargApplier(sarg, 1000, evolution, - OrcFile.WriterVersion.ORC_135, false); + OrcFile.WriterVersion.ORC_135, false, false, false); OrcProto.StripeInformation stripe = OrcProto.StripeInformation.newBuilder().setNumberOfRows(3000).build(); OrcProto.RowIndex[] indexes = new OrcProto.RowIndex[3]; @@ -2238,7 +2238,8 @@ public void testPositionalEvolutionAddColumnPPD() throws IOException { .equals("y", PredicateLeaf.Type.BOOLEAN, true).end().build(); RecordReaderImpl.SargApplier applier = - new RecordReaderImpl.SargApplier(sarg, 1000, evo, OrcFile.WriterVersion.ORC_135, false); + new RecordReaderImpl.SargApplier(sarg, 1000, evo, + OrcFile.WriterVersion.ORC_135, false, false, false); OrcProto.StripeInformation stripe = OrcProto.StripeInformation.newBuilder().setNumberOfRows(2000).build(); diff --git a/java/pom.xml b/java/pom.xml index 6f9fa18a1d..5b7c47e32c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -70,7 +70,7 @@ 2.2.0 2.7.3 - 2.6.0 + 2.7.1 3.4.6 diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java b/java/tools/src/java/org/apache/orc/tools/FileDump.java index 6c8061a6ce..92711ebe62 100644 --- a/java/tools/src/java/org/apache/orc/tools/FileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java @@ -39,6 +39,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.orc.impl.ReaderImpl; import org.apache.orc.util.BloomFilter; import org.apache.orc.util.BloomFilterIO; import org.apache.orc.ColumnStatistics; @@ -47,7 +48,6 @@ import org.apache.orc.Reader; import org.apache.orc.TypeDescription; import org.apache.orc.Writer; -import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.ColumnStatisticsImpl; import org.apache.orc.impl.OrcAcidUtils; import org.apache.orc.impl.OrcIndex; @@ -345,6 +345,9 @@ private static void printMetaDataImpl(final String filename, if 
(reader.getCompressionKind() != CompressionKind.NONE) { System.out.println("Compression size: " + reader.getCompressionSize()); } + System.out.println("Calendar: " + (reader.writerUsedProlepticGregorian() + ? "Proleptic Gregorian" + : "Julian/Gregorian")); System.out.println("Type: " + reader.getSchema().toString()); printTypeAnnotations(reader.getSchema(), "root"); System.out.println("\nStripe Statistics:"); @@ -418,7 +421,7 @@ private static void printMetaDataImpl(final String filename, for (int col : rowIndexCols) { StringBuilder buf = new StringBuilder(); String rowIdxString = getFormattedRowIndices(col, - indices.getRowGroupIndex(), schema); + indices.getRowGroupIndex(), schema, (ReaderImpl) reader); buf.append(rowIdxString); String bloomFilString = getFormattedBloomFilters(col, indices, reader.getWriterVersion(), @@ -697,7 +700,8 @@ private static String getBloomFilterStats(BloomFilter bf) { private static String getFormattedRowIndices(int col, OrcProto.RowIndex[] rowGroupIndex, - TypeDescription schema) { + TypeDescription schema, + ReaderImpl reader) { StringBuilder buf = new StringBuilder(); OrcProto.RowIndex index; buf.append(" Row group indices for column ").append(col).append(":"); @@ -720,7 +724,7 @@ private static String getFormattedRowIndices(int col, buf.append("no stats at "); } else { ColumnStatistics cs = - ColumnStatisticsImpl.deserialize(colSchema, colStats); + ColumnStatisticsImpl.deserialize(colSchema, colStats, reader); buf.append(cs.toString()); } buf.append(" positions: "); diff --git a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java index 54124dc36f..61849f90b4 100644 --- a/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java +++ b/java/tools/src/java/org/apache/orc/tools/JsonFileDump.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -31,6 +31,7 @@ import org.apache.orc.TypeDescription; import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.OrcAcidUtils; +import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.RecordReaderImpl; import org.apache.orc.util.BloomFilter; import org.codehaus.jettison.json.JSONArray; @@ -53,16 +54,12 @@ import org.codehaus.jettison.json.JSONObject; import org.codehaus.jettison.json.JSONStringer; import org.codehaus.jettison.json.JSONWriter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * File dump tool with json formatted output. */ public class JsonFileDump { - private static final Logger LOG = LoggerFactory.getLogger(JsonFileDump.class); - public static void printJsonMetaData(List files, Configuration conf, List rowIndexCols, boolean prettyPrint, boolean printTimeZone) @@ -100,7 +97,9 @@ public static void printJsonMetaData(List files, writer.key("schemaString").value(reader.getSchema().toString()); writer.key("schema"); writeSchema(writer, reader.getSchema()); - + writer.key("calendar").value(reader.writerUsedProlepticGregorian() + ? 
"proleptic Gregorian" + : "Julian/Gregorian"); writer.key("stripeStatistics").array(); List stripeStatistics = reader.getStripeStatistics(); for (int n = 0; n < stripeStatistics.size(); n++) { @@ -191,7 +190,7 @@ public static void printJsonMetaData(List files, writer.object(); writer.key("columnId").value(col); writeRowGroupIndexes(writer, col, indices.getRowGroupIndex(), - reader.getSchema()); + reader.getSchema(), (ReaderImpl) reader); writeBloomFilterIndexes(writer, col, indices, reader.getWriterVersion(), reader.getSchema().findSubtype(col).getCategory(), @@ -442,9 +441,9 @@ private static void writeBloomFilterStats(JSONWriter writer, BloomFilter bf) } private static void writeRowGroupIndexes(JSONWriter writer, int col, - OrcProto.RowIndex[] rowGroupIndex, TypeDescription schema) - throws JSONException { - + OrcProto.RowIndex[] rowGroupIndex, + TypeDescription schema, + ReaderImpl reader) throws JSONException { OrcProto.RowIndex index; if (rowGroupIndex == null || (col >= rowGroupIndex.length) || ((index = rowGroupIndex[col]) == null)) { @@ -461,7 +460,7 @@ private static void writeRowGroupIndexes(JSONWriter writer, int col, } OrcProto.ColumnStatistics colStats = entry.getStatistics(); writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize( - schema.findSubtype(col), colStats)); + schema.findSubtype(col), colStats, reader)); writer.key("positions").array(); for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) { writer.value(entry.getPositions(posIx)); diff --git a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out index 31a322f461..d46682b5e1 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter.out @@ -3,6 +3,7 @@ File Version: 0.12 with ORC_14 Rows: 21000 Compression: ZLIB Compression size: 4096 +Calendar: Julian/Gregorian Type: struct Attributes on root test1: value1 diff --git 
a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out index b63b4e7339..0943a05eee 100644 --- a/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out +++ b/java/tools/src/test/resources/orc-file-dump-bloomfilter2.out @@ -3,6 +3,7 @@ File Version: 0.12 with ORC_14 Rows: 21000 Compression: ZLIB Compression size: 4096 +Calendar: Julian/Gregorian Type: struct Stripe Statistics: diff --git a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out index 1ae53df2a0..7b96b44bec 100644 --- a/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out +++ b/java/tools/src/test/resources/orc-file-dump-dictionary-threshold.out @@ -3,6 +3,7 @@ File Version: 0.12 with ORC_14 Rows: 21000 Compression: ZLIB Compression size: 4096 +Calendar: Julian/Gregorian Type: struct Stripe Statistics: diff --git a/java/tools/src/test/resources/orc-file-dump.json b/java/tools/src/test/resources/orc-file-dump.json index 8487d6ea7c..cc7cccfab7 100644 --- a/java/tools/src/test/resources/orc-file-dump.json +++ b/java/tools/src/test/resources/orc-file-dump.json @@ -28,6 +28,7 @@ } } }, + "calendar": "Julian\/Gregorian", "stripeStatistics": [ { "stripeNumber": 1, diff --git a/java/tools/src/test/resources/orc-file-dump.out b/java/tools/src/test/resources/orc-file-dump.out index dd7291c5a3..1fca5c534f 100644 --- a/java/tools/src/test/resources/orc-file-dump.out +++ b/java/tools/src/test/resources/orc-file-dump.out @@ -3,6 +3,7 @@ File Version: 0.12 with ORC_14 Rows: 21000 Compression: ZLIB Compression size: 4096 +Calendar: Julian/Gregorian Type: struct Stripe Statistics: diff --git a/java/tools/src/test/resources/orc-file-has-null.out b/java/tools/src/test/resources/orc-file-has-null.out index 65f8814d8b..e1a5413256 100644 --- a/java/tools/src/test/resources/orc-file-has-null.out +++ 
b/java/tools/src/test/resources/orc-file-has-null.out @@ -3,6 +3,7 @@ File Version: 0.12 with ORC_14 Rows: 20000 Compression: ZLIB Compression size: 4096 +Calendar: Julian/Gregorian Type: struct Stripe Statistics: diff --git a/proto/orc_proto.proto b/proto/orc_proto.proto index 09d752e17c..170f07754a 100644 --- a/proto/orc_proto.proto +++ b/proto/orc_proto.proto @@ -340,6 +340,15 @@ message Encryption { optional KeyProviderKind keyProvider = 4; } +enum CalendarKind { + UNKNOWN_CALENDAR = 0; + // The Java default (hybrid) calendar switches from Julian to Gregorian + // in October 1582. + JULIAN_GREGORIAN = 1; + // A calendar that extends the Gregorian calendar back forever. + PROLEPTIC_GREGORIAN = 2; +} + message Footer { optional uint64 headerLength = 1; optional uint64 contentLength = 2; @@ -359,6 +368,7 @@ message Footer { // information about the encryption in this file optional Encryption encryption = 10; + optional CalendarKind calendar = 11; } enum CompressionKind {