-
Notifications
You must be signed in to change notification settings - Fork 153
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Alternative approach to logical type variations. #25
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,242 +30,104 @@ message Type { | |
} | ||
|
||
message I8 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message U8 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message I16 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message U16 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message I32 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message U32 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message I64 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message U64 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message FP16 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message FP32 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message FP64 { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message String { | ||
PhysicalType physical_type = 1; | ||
bool dictionary_encoded = 2; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
ARROW_LARGE_STRING = 1; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message Binary { | ||
PhysicalType physical_type = 1; | ||
bool dictionary_encoded = 2; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
ARROW_LARGE_BINARY = 1; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message Timestamp { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message Date { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message Time { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message IntervalYear { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
message IntervalDay { | ||
PhysicalType physical_type = 1; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
ARROW_MONTH_DAY_NANO = 1; | ||
} | ||
Variation variation = 1; | ||
} | ||
|
||
// Start compound types. | ||
message FixedChar { | ||
int32 length = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message VarChar { | ||
int32 length = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message FixedBinary { | ||
int32 length = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message Decimal { | ||
int32 scale = 1; | ||
int32 precision = 2; | ||
PhysicalType physical_type = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
ARROW_128 = 1; | ||
} | ||
Variation variation = 3; | ||
} | ||
|
||
message Struct { | ||
repeated Type types = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message NamedStruct { | ||
|
||
repeated Pair pairs = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
Variation variation = 2; | ||
|
||
message Pair { | ||
string name = 1; | ||
Type type = 2; | ||
} | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
} | ||
|
||
message List { | ||
Type type = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message Map { | ||
|
||
repeated KeyValue key_values = 1; | ||
PhysicalType physical_type = 2; | ||
bool dictionary_encoded = 3; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
UTF8_ORDERED_KEYS = 1; | ||
} | ||
Variation variation = 2; | ||
|
||
message KeyValue { | ||
Type key = 1; | ||
|
@@ -275,22 +137,17 @@ message Type { | |
|
||
message TimestampMicroTZ { | ||
string timezone = 1; | ||
|
||
PhysicalType physical_type = 2; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
Variation variation = 2; | ||
} | ||
|
||
message TimestampTZ { | ||
string timezone = 1; | ||
Variation variation = 2; | ||
} | ||
|
||
PhysicalType physical_type = 2; | ||
|
||
enum PhysicalType { | ||
SYSTEM_DEFAULT = 0; | ||
} | ||
message Variation { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A possible more natural way of modeling this in protobuf is to use have different physical type systems define extensions on either the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't really see much benefit to the extensions approach in this case. Generally, it is an interesting question whether to expose some objects with extensions but I generally feel like any kind of pure black box that an extension would provide would weaken the spec. In the pattern here the spec still has information about how the extended functionality works, even if declared beyond the core project. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extensions provide the ability to more strongly model variations. There would still be two classes:
I was thinking have one extension for each system type. e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't really seem like extensions provide a stronger model. For example, when trying to decide whether a particular function can be bound to a particular type:
I also haven't seen extensions successfully in a project in this way (it seems like they are only a bucket for arbitrary as opposed to structured information). |
||
int32 organization = 1; | ||
string name = 2; | ||
} | ||
|
||
message UserDefined { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
types: | ||
- string: | ||
- name: dict4 | ||
description: a four-byte dictionary encoded string | ||
functions: inherits | ||
- name: bigoffset | ||
description: The arrow large string representation of strings, still restricted to the default string size defined in Substrait. | ||
functions: separate | ||
- struct: | ||
- name: avro | ||
description: an avro encoded struct | ||
functions: separate | ||
- name: cstruct | ||
description: a cstruct representation of the struct | ||
functions: separate | ||
- name: dict2 | ||
description: a 2-byte dictionary encoded string. | ||
functions: inherit |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
arrange: | ||
- simple_logical_types.md | ||
- compound_logical_types.md | ||
- physical_types.md | ||
- type_variations.md | ||
- user_defined_types.md |
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Type Variations | ||
|
||
Since Substrait is designed to work in both logical and physical contexts, there is a need to support extended attributes in the physical context. Different consumers may have multiple ways to present the same logical type. For example, an engine might support dictionary encoding a string or using either a row-wise or columnar representation of a struct. As such, there is the facility for specification users to express additional type variations for each logical type. These variations are expected to have the same logical properties as the canonical variation and are defined for each organization. The key properties of these variations are: | ||
|
||
| Property | Description | | ||
| ----------------- | ------------------------------------------------------------ | | ||
| Parent Type | The parent type this variation belongs to. Variations can only be expressed for simple types and wild-carded compound types (e.g. i8 or varchar(*)). | | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm trying to avoid using logical since it is overloaded in relational algebra. Maybe "base" instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated to base. lmk your thoughts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Base works for me |
||
| Name | The name used to reference this type. Should be unique within type variations for this parent type within an organization. | | ||
| Description | A human description of the purpose of this type variation | | ||
| Function Behavior | **Inherits** or **Independent**: Whether this variation supports functions using the canonical variation or whether functions should be resolved independently. For example if one has the function `add(i8,i8)` defined and then defines an i8 variation, can the i8 variation field be bound to the base `add` operation (inherits) or does a specialized version of `add` need to be defined specifically for this type variation (independent). Defaults to inherits. | | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If "function behavior" is In other words. I see two possible interpretations (neither of which are very appealing)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Imagine a situation where you have a physical implementation function that only works for dictionary encoded strings (but does not work for the "system default" strings). Both values are logically strings but only a subset of operations may be available for one physical representation. That is the purpose of independent. If you're using inherits, it means that a function defined for the base type would be available for the variation. To me, part of having physical variations is that there will likely be some operations that only work against one of the variations. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right. I agree those situations may exist. I just don't think Substrait should concern itself with defining which situations are valid or invalid. Here are two concrete examples:
Is it an error, in Substrait's eyes, if an engine fails to support arithmetic functions on dictionary numbers? Is it an error if an engine chooses to allow field extraction on the system default representation? I don't think Substrait should care. |
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm no protobuf expert but can
variation
be made optional? What is the equivalent ofSYSTEM_DEFAULT
from before?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In proto 3, all items are optional. So system default would be an undefined variation (at least that was my thinking).