-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PARQUET-304: Add an option to make requested schema case insensitive in read path #210
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ | |
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.TreeMap; | ||
|
||
import org.apache.parquet.io.InvalidRecordException; | ||
|
||
|
@@ -38,7 +39,6 @@ public class GroupType extends Type { | |
|
||
private final List<Type> fields; | ||
private final Map<String, Integer> indexByName; | ||
|
||
/** | ||
* @param repetition OPTIONAL, REPEATED, REQUIRED | ||
* @param name the name of the field | ||
|
@@ -48,6 +48,10 @@ public GroupType(Repetition repetition, String name, List<Type> fields) { | |
this(repetition, name, null, fields, null); | ||
} | ||
|
||
public GroupType(Repetition repetition, String name, List<Type> fields, boolean isCaseSensitive) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should not be a property of the type itself, but a property of the selection |
||
this(repetition, name, null, fields, null, isCaseSensitive); | ||
} | ||
|
||
/** | ||
* @param repetition OPTIONAL, REPEATED, REQUIRED | ||
* @param name the name of the field | ||
|
@@ -87,12 +91,17 @@ public GroupType(Repetition repetition, String name, OriginalType originalType, | |
* @param id the id of the field | ||
*/ | ||
GroupType(Repetition repetition, String name, OriginalType originalType, List<Type> fields, ID id) { | ||
this(repetition, name, originalType, fields, id, true); | ||
} | ||
|
||
GroupType(Repetition repetition, String name, OriginalType originalType, List<Type> fields, ID id, boolean isCaseSensitive) { | ||
super(name, repetition, originalType, id); | ||
if (fields.isEmpty()) { | ||
throw new InvalidSchemaException("A group type can not be empty. Parquet does not support empty group without leaves. Empty group: " + name); | ||
} | ||
|
||
this.fields = fields; | ||
this.indexByName = new HashMap<String, Integer>(); | ||
this.indexByName = isCaseSensitive? new HashMap<String, Integer>() : new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER); | ||
for (int i = 0; i < fields.size(); i++) { | ||
indexByName.put(fields.get(i).getName(), i); | ||
} | ||
|
@@ -391,4 +400,16 @@ List<Type> mergeFields(GroupType toMerge, boolean strict) { | |
} | ||
return newFields; | ||
} | ||
|
||
public List<Type> getContainedFields(GroupType base) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd make this method optionally case sensitive. |
||
List<Type> newFields = new ArrayList<Type>(); | ||
List<Type> baseFields = base.getFields(); | ||
for (Type type : this.getFields()) { | ||
if (base.containsField(type.getName())) { | ||
newFields.add(type); | ||
} | ||
} | ||
return newFields; | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,13 @@ public MessageType(String name, List<Type> fields) { | |
super(Repetition.REPEATED, name, fields); | ||
} | ||
|
||
public MessageType(String name, List<Type> fields, boolean isCaseSensitive) { | ||
super(Repetition.REPEATED, name, fields, isCaseSensitive); | ||
} | ||
|
||
public MessageType(MessageType base, boolean isCaseSensitive) { | ||
super(Repetition.REPEATED, base.getName(), base.getFields(), isCaseSensitive); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment, I don't think the property belong here. |
||
/** | ||
* {@inheritDoc} | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,17 @@ protected ColumnPath toCanonical(ColumnPath value) { | |
} | ||
}; | ||
|
||
private static Canonicalizer<ColumnPath> pathsLowerCase = new Canonicalizer<ColumnPath>() { | ||
@Override | ||
protected ColumnPath toCanonical(ColumnPath value) { | ||
String[] path = new String[value.p.length]; | ||
for (int i = 0; i < value.p.length; i++) { | ||
path[i] = value.p[i].toLowerCase().intern(); | ||
} | ||
return new ColumnPath(path); | ||
} | ||
}; | ||
|
||
public static ColumnPath fromDotString(String path) { | ||
checkNotNull(path, "path"); | ||
return get(path.split("\\.")); | ||
|
@@ -48,6 +59,10 @@ public static ColumnPath get(String... path){ | |
return paths.canonicalize(new ColumnPath(path)); | ||
} | ||
|
||
public static ColumnPath getLowerCase(String... path){ | ||
return pathsLowerCase.canonicalize(new ColumnPath(path)); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems that the toLowerCase can be applied to path directly and then call the existing get(). There's no need for a separate Canonicalizer |
||
|
||
private final String[] p; | ||
|
||
private ColumnPath(String[] path) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,23 +40,34 @@ | |
public class RowGroupFilter implements Visitor<List<BlockMetaData>> { | ||
private final List<BlockMetaData> blocks; | ||
private final MessageType schema; | ||
private final boolean isCaseSensitive; | ||
|
||
public static List<BlockMetaData> filterRowGroups(Filter filter, List<BlockMetaData> blocks, MessageType schema) { | ||
return filterRowGroups(filter, blocks, schema, true); | ||
} | ||
|
||
public static List<BlockMetaData> filterRowGroups(Filter filter, List<BlockMetaData> blocks, MessageType schema, | ||
boolean isCaseSensitive) { | ||
checkNotNull(filter, "filter"); | ||
return filter.accept(new RowGroupFilter(blocks, schema)); | ||
return filter.accept(new RowGroupFilter(blocks, schema, isCaseSensitive)); | ||
} | ||
|
||
private RowGroupFilter(List<BlockMetaData> blocks, MessageType schema) { | ||
this(blocks, schema, true); | ||
} | ||
|
||
private RowGroupFilter(List<BlockMetaData> blocks, MessageType schema, boolean isCaseSensitive) { | ||
this.blocks = checkNotNull(blocks, "blocks"); | ||
this.schema = checkNotNull(schema, "schema"); | ||
this.isCaseSensitive = isCaseSensitive; | ||
} | ||
|
||
@Override | ||
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) { | ||
FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate(); | ||
|
||
// check that the schema of the filter matches the schema of the file | ||
SchemaCompatibilityValidator.validate(filterPredicate, schema); | ||
SchemaCompatibilityValidator.validate(filterPredicate, schema, isCaseSensitive); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if the predicate has the isCaseSensitive field it can be applied here. |
||
|
||
List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>(); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -169,13 +169,23 @@ public void initialize(MessageType fileSchema, | |
// initialize a ReadContext for this file | ||
ReadSupport.ReadContext readContext = readSupport.init(new InitContext( | ||
configuration, toSetMultiMap(fileMetadata), fileSchema)); | ||
this.requestedSchema = readContext.getRequestedSchema(); | ||
MessageType request = readContext.getRequestedSchema(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Whether the requestedContext is caseSensitive can com from the readContext. I don't think we need a Hadoop property for this. |
||
boolean isCaseSensitive = configuration.getBoolean(ParquetInputFormat.CASE_SENSITIVITY, true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer we don't use configuration deep in the code for such things. |
||
|
||
if (!isCaseSensitive) { | ||
MessageType base = new MessageType(request, false); | ||
this.requestedSchema = new MessageType(request.getName(), fileSchema.getContainedFields(base)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of adding isCaseSensitive in base, it can be passed to getContainedField |
||
} else { | ||
this.requestedSchema = request; | ||
} | ||
|
||
this.fileSchema = fileSchema; | ||
this.file = file; | ||
this.columnCount = requestedSchema.getPaths().size(); | ||
this.recordConverter = readSupport.prepareForRead( | ||
configuration, fileMetadata, fileSchema, readContext); | ||
this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); | ||
|
||
List<ColumnDescriptor> columns = requestedSchema.getColumns(); | ||
reader = new ParquetFileReader(configuration, file, blocks, columns); | ||
for (BlockMetaData block : blocks) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,6 +113,11 @@ public class ParquetInputFormat<T> extends FileInputFormat<Void, T> { | |
*/ | ||
public static final String FILTER_PREDICATE = "parquet.private.read.filter.predicate"; | ||
|
||
/** | ||
* key to configure case sensitivity | ||
*/ | ||
public static final String CASE_SENSITIVITY = "parquet.read.case.sensitivity"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we need a property in that case. |
||
|
||
/** | ||
* key to turn on or off task side metadata loading (default true) | ||
* if true then metadata is read on the task side and some tasks may finish immediately. | ||
|
@@ -126,10 +131,18 @@ public static void setTaskSideMetaData(Job job, boolean taskSideMetadata) { | |
ContextUtil.getConfiguration(job).setBoolean(TASK_SIDE_METADATA, taskSideMetadata); | ||
} | ||
|
||
public static void setCaseSensitiveRead(Job job, boolean isCaseSensitive) { | ||
ContextUtil.getConfiguration(job).setBoolean(CASE_SENSITIVITY, isCaseSensitive); | ||
} | ||
|
||
public static boolean isTaskSideMetaData(Configuration configuration) { | ||
return configuration.getBoolean(TASK_SIDE_METADATA, TRUE); | ||
} | ||
|
||
public static boolean isCaseSensitiveRead(Configuration configuration) { | ||
return configuration.getBoolean(CASE_SENSITIVITY, TRUE); | ||
} | ||
|
||
public static void setReadSupportClass(Job job, Class<?> readSupportClass) { | ||
ContextUtil.getConfiguration(job).set(READ_SUPPORT_CLASS, readSupportClass.getName()); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would think that isCaseSensitive should be a property of the FilterPredicate.