substrait-io · jacques-n · Sep 14, 2021 · Sep 11, 2021 · Sep 11, 2021 · Sep 14, 2021
@@ -30,242 +30,104 @@ message Type {
     }
 
     message I8 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
-    }
-
-    message U8 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message I16 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
-    }
-
-    message U16 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message I32 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
-    }
-
-    message U32 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message I64 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
-    }
-
-    message U64 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
-    }
-
-    message FP16 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message FP32 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message FP64 {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message String {
-        PhysicalType physical_type = 1;
-        bool dictionary_encoded = 2;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-            ARROW_LARGE_STRING = 1;
-        }
+        Variation variation = 1;
     }
 
     message Binary {
-        PhysicalType physical_type = 1;
-        bool dictionary_encoded = 2;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-            ARROW_LARGE_BINARY = 1;
-        }
+        Variation variation = 1;
     }
 
     message Timestamp {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message Date {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message Time {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message IntervalYear {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 1;
     }
 
     message IntervalDay {
-        PhysicalType physical_type = 1;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-            ARROW_MONTH_DAY_NANO = 1;
-        }
+        Variation variation = 1;
     }
 
     // Start compound types.
     message FixedChar {
         int32 length = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message VarChar {
         int32 length = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message FixedBinary {
         int32 length = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message Decimal {
         int32 scale = 1;
         int32 precision = 2;
-        PhysicalType physical_type = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-            ARROW_128 = 1;
-        }
+        Variation variation = 3;
     }
 
     message Struct {
         repeated Type types = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message NamedStruct {
 
         repeated Pair pairs = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
+        Variation variation = 2;
 
         message Pair {
             string name = 1;
             Type type = 2;
         }
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
     }
 
     message List {
         Type type = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message Map {
 
         repeated KeyValue key_values = 1;
-        PhysicalType physical_type = 2;
-        bool dictionary_encoded = 3;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-            UTF8_ORDERED_KEYS = 1;
-        }
+        Variation variation = 2;
 
         message KeyValue {
             Type key = 1;
@@ -275,22 +137,17 @@ message Type {
 
     message TimestampMicroTZ {
         string timezone = 1;
-
-        PhysicalType physical_type = 2;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+        Variation variation = 2;
     }
 
     message TimestampTZ {
         string timezone = 1;
+        Variation variation = 2;
+    }
 
-        PhysicalType physical_type = 2;
-
-        enum PhysicalType {
-            SYSTEM_DEFAULT = 0;
-        }
+    message Variation { // Identifies a type variation; referenced by the `variation` field of the type messages in this hunk.
+        int32 organization = 1; // Organization that defines the variation. NOTE(review): how this number maps to an organization is not shown here - confirm.
+        string name = 2; // Name of the variation; presumably matches a `name` entry in the variations YAML (e.g. `dict4`) - confirm.
     }
 
     message UserDefined {

@@ -0,0 +1,18 @@
+types:
+- string:
+  - name: dict4
+    description: a four-byte dictionary encoded string
+    functions: inherits
+  - name: bigoffset
+    description: The arrow large string representation of strings, still restricted to the default string size defined in Substrait.
+    functions: separate
+- struct:
+  - name: avro
+    description: an avro encoded struct
+    functions: separate
+  - name: cstruct
+    description: a cstruct representation of the struct
+    functions: separate
+  - name: dict2
+    description: a 2-byte dictionary encoded string.
+    functions: inherits
@@ -19,9 +19,9 @@ Once all portions of the specification have been moved to commit (or eliminated)
 
 | Priority | Status | Section                                                      | Description                                                  |
 | -------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
-| 1        | sketch | [Simple Logical Types](/types/simple_logical_types)          | A way to describe the set of basic types that will be operated on within a plan. Only includes simple types such as integers and doubles (nothing configurable or compound). |
-|          | sketch | [Compound Logical Types](/types/compound_logical_types)      | Expression of types that go beyond simple scalar values. Key concepts here include: configurable types such as fixed length and numeric types as well as compound types such as structs, maps, lists, etc. |
-|          | sketch | [Physical Types](/types/physical_types)                      | Physical extensions to logical types.                        |
+| 1        | sketch | [Simple Types](/types/simple_logical_types)                  | A way to describe the set of basic types that will be operated on within a plan. Only includes simple types such as integers and doubles (nothing configurable or compound). |
+|          | sketch | [Compound Types](/types/compound_logical_types)              | Expression of types that go beyond simple scalar values. Key concepts here include: configurable types such as fixed length and numeric types as well as compound types such as structs, maps, lists, etc. |
+|          | sketch | [Type Variations](/types/type_variations)                    | Physical variations to base types.                           |
 |          | sketch | [User Defined Types](/types/user_defined_types)              | Extensions that can be defined for specific IR producers/consumers. |
 | 2        | sketch | [Field References](/expressions/field_references)            | Expressions to identify which portions of a record should be |
 | 3        | sketch | [Scalar Functions](/expressions/scalar_functions)            | Description of how functions are specified. Concepts include arguments, variadic functions, output type derivation, etc. |

@@ -1,5 +1,5 @@
 arrange:
   - simple_logical_types.md
   - compound_logical_types.md
-  - physical_types.md
+  - type_variations.md
   - user_defined_types.md
@@ -1,4 +1,4 @@
-# Logical Compound Types
+# Compound Types
 
 Compound types include any type that is configurable including complex types as well as configurable scalar types.
 

@@ -1,4 +1,4 @@
-# Simple Logical Types
+# Simple Types
 
 Substrait tries to cover the most common types used in data manipulation. Simple types are those that don't support any form of configuration. For simplicity, any generic type that has only a small number of discrete implementations is declared directly (as opposed to via configuration).
 

@@ -0,0 +1,11 @@
+# Type Variations
+
+Since Substrait is designed to work in both logical and physical contexts, there is a need to support extended attributes in the physical context. Different consumers may have multiple ways to represent the same logical type. For example, an engine might support dictionary encoding a string, or using either a row-wise or a columnar representation of a struct. As such, Substrait provides a facility for specification users to express additional type variations for each logical type. These variations are expected to have the same logical properties as the canonical variation and are defined per organization. The key properties of these variations are:
+
+| Property          | Description                                                  |
+| ----------------- | ------------------------------------------------------------ |
+| Base Type         | The base type this variation belongs to. Variations can only be expressed for simple types and wild-carded compound types (e.g. i8 or varchar(*)). |
+| Name              | The name used to reference this type variation. Should be unique among the type variations of the same base type within an organization. |
+| Description       | A human-readable description of the purpose of this type variation. |
+| Function Behavior | **Inherits** or **Independent**: Whether this variation supports functions using the canonical variation or whether functions should be resolved independently. For example, if one has the function `add(i8,i8)` defined and then defines an i8 variation, can the i8 variation field be bound to the base `add` operation (inherits), or does a specialized version of `add` need to be defined specifically for this type variation (independent)? Defaults to inherits. |
+