diff --git a/extensions/functions_datetime.yaml b/extensions/functions_datetime.yaml index b33960906..0d575b5dd 100644 --- a/extensions/functions_datetime.yaml +++ b/extensions/functions_datetime.yaml @@ -9,7 +9,7 @@ scalar_functions: * ISO_YEAR Return the ISO 8601 week-numbering year. First week of an ISO year has the majority (4 or more) of its days in January. * US_YEAR Return the US epidemiological year. First week of US epidemiological year has the majority (4 or more) - of its days in January. Last week of US epidemiological year has the year's last Wednesday in it. US + of its days in January. Last week of US epidemiological year has the year's last Wednesday in it. US epidemiological week starts on Sunday. * QUARTER Return the number of the quarter within the year. January 1 through March 31 map to the first quarter, April 1 through June 30 map to the second quarter, etc. @@ -32,6 +32,7 @@ scalar_functions: * SECOND Return the second (0-59). * MILLISECOND Return number of milliseconds since the last full second. * MICROSECOND Return number of microseconds since the last full millisecond. + * NANOSECOND Return number of nanoseconds since the last full microsecond. * SUBSECOND Return number of microseconds since the last full second of the given timestamp. * UNIX_TIME Return number of seconds that have elapsed since 1970-01-01 00:00:00 UTC, ignoring leap seconds. * TIMEZONE_OFFSET Return number of seconds of timezone offset to UTC. @@ -57,7 +58,7 @@ scalar_functions: * MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, and US_WEEK return values in range 0-52 The indexing option must be specified when the component is QUARTER, MONTH, DAY, DAY_OF_YEAR, - MONDAY_DAY_OF_WEEK, SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, or US_WEEK. The + MONDAY_DAY_OF_WEEK, SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, or US_WEEK. 
The indexing option cannot be specified when the component is YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, SUBSECOND, UNIX_TIME, or TIMEZONE_OFFSET. @@ -76,6 +77,17 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: i64 + - args: + - name: component + options: [ YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, + MILLISECOND, MICROSECOND, NANOSECOND, SUBSECOND, UNIX_TIME, TIMEZONE_OFFSET ] + description: The part of the value to extract. + - name: x + value: precision_timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. + value: string + return: i64 - args: - name: component options: [ YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, @@ -84,6 +96,14 @@ scalar_functions: - name: x value: timestamp return: i64 + - args: + - name: component + options: [ YEAR, ISO_YEAR, US_YEAR, HOUR, MINUTE, SECOND, + MILLISECOND, MICROSECOND, NANOSECOND, SUBSECOND, UNIX_TIME ] + description: The part of the value to extract. + - name: x + value: precision_timestamp + return: i64 - args: - name: component options: [ YEAR, ISO_YEAR, US_YEAR, UNIX_TIME ] @@ -112,6 +132,20 @@ scalar_functions: description: Timezone string from IANA tzdb. value: string return: i64 + - args: + - name: component + options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, + SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK ] + description: The part of the value to extract. + - name: indexing + options: [ ONE, ZERO ] + description: Start counting from 1 or 0. + - name: x + value: precision_timestamp_tz + - name: timezone + description: Timezone string from IANA tzdb. 
+ value: string + return: i64 - args: - name: component options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, @@ -123,6 +157,17 @@ scalar_functions: - name: x value: timestamp return: i64 + - args: + - name: component + options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, + SUNDAY_DAY_OF_WEEK, MONDAY_WEEK, SUNDAY_WEEK, ISO_WEEK, US_WEEK ] + description: The part of the value to extract. + - name: indexing + options: [ ONE, ZERO ] + description: Start counting from 1 or 0. + - name: x + value: precision_timestamp + return: i64 - args: - name: component options: [ QUARTER, MONTH, DAY, DAY_OF_YEAR, MONDAY_DAY_OF_WEEK, diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index 9fec02a9f..242c72869 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -796,7 +796,8 @@ message Expression { string string = 12; bytes binary = 13; // Timestamp in units of microseconds since the UNIX epoch. - int64 timestamp = 14; + // Deprecated in favor of `precision_timestamp` + int64 timestamp = 14 [deprecated = true]; // Date in units of days since the UNIX epoch. int32 date = 16; // Time in units of microseconds past midnight @@ -807,10 +808,15 @@ message Expression { VarChar var_char = 22; bytes fixed_binary = 23; Decimal decimal = 24; + // If the precision is 6 or less then this is the microseconds since the UNIX epoch + // If the precision is more than 6 then this is the nanoseconds since the UNIX epoch + uint64 precision_timestamp = 34; + uint64 precision_timestamp_tz = 35; Struct struct = 25; Map map = 26; // Timestamp in units of microseconds since the UNIX epoch. 
- int64 timestamp_tz = 27; + // Deprecated in favor of `precision_timestamp_tz` + int64 timestamp_tz = 27 [deprecated = true]; bytes uuid = 28; Type null = 29; // a typed null literal List list = 30; diff --git a/proto/substrait/parameterized_types.proto b/proto/substrait/parameterized_types.proto index db0669354..51d9c0d68 100644 --- a/proto/substrait/parameterized_types.proto +++ b/proto/substrait/parameterized_types.proto @@ -21,18 +21,22 @@ message ParameterizedType { Type.FP64 fp64 = 11; Type.String string = 12; Type.Binary binary = 13; - Type.Timestamp timestamp = 14; + // Deprecated in favor of `ParameterizedPrecisionTimestamp precision_timestamp` + Type.Timestamp timestamp = 14 [deprecated = true]; Type.Date date = 16; Type.Time time = 17; Type.IntervalYear interval_year = 19; Type.IntervalDay interval_day = 20; - Type.TimestampTZ timestamp_tz = 29; + // Deprecated in favor of `ParameterizedPrecisionTimestampTZ precision_timestamp_tz` + Type.TimestampTZ timestamp_tz = 29 [deprecated = true]; Type.UUID uuid = 32; ParameterizedFixedChar fixed_char = 21; ParameterizedVarChar varchar = 22; ParameterizedFixedBinary fixed_binary = 23; ParameterizedDecimal decimal = 24; + ParameterizedPrecisionTimestamp precision_timestamp = 34; + ParameterizedPrecisionTimestampTZ precision_timestamp_tz = 35; ParameterizedStruct struct = 25; ParameterizedList list = 27; @@ -88,6 +92,18 @@ message ParameterizedType { Type.Nullability nullability = 4; } + message ParameterizedPrecisionTimestamp { + IntegerOption precision = 1; + uint32 variation_pointer = 2; + Type.Nullability nullability = 3; + } + + message ParameterizedPrecisionTimestampTZ { + IntegerOption precision = 1; + uint32 variation_pointer = 2; + Type.Nullability nullability = 3; + } + message ParameterizedStruct { repeated ParameterizedType types = 1; uint32 variation_pointer = 2; diff --git a/proto/substrait/type.proto b/proto/substrait/type.proto index a7f1c665d..02993c400 100644 --- a/proto/substrait/type.proto +++ 
b/proto/substrait/type.proto @@ -21,18 +21,22 @@ message Type { FP64 fp64 = 11; String string = 12; Binary binary = 13; - Timestamp timestamp = 14; + // Deprecated in favor of `PrecisionTimestamp precision_timestamp` + Timestamp timestamp = 14 [deprecated = true]; Date date = 16; Time time = 17; IntervalYear interval_year = 19; IntervalDay interval_day = 20; - TimestampTZ timestamp_tz = 29; + // Deprecated in favor of `PrecisionTimestampTZ precision_timestamp_tz` + TimestampTZ timestamp_tz = 29 [deprecated = true]; UUID uuid = 32; FixedChar fixed_char = 21; VarChar varchar = 22; FixedBinary fixed_binary = 23; Decimal decimal = 24; + PrecisionTimestamp precision_timestamp = 33; + PrecisionTimestampTZ precision_timestamp_tz = 34; Struct struct = 25; List list = 27; @@ -159,6 +163,20 @@ message Type { Nullability nullability = 4; } + message PrecisionTimestamp { + // Defaults to 6 + int32 precision = 1; + uint32 type_variation_reference = 2; + Nullability nullability = 3; + } + + message PrecisionTimestampTZ { + // Defaults to 6 + int32 precision = 1; + uint32 type_variation_reference = 2; + Nullability nullability = 3; + } + message Struct { repeated Type types = 1; uint32 type_variation_reference = 2; diff --git a/proto/substrait/type_expressions.proto b/proto/substrait/type_expressions.proto index 4be4aab47..6b59121d9 100644 --- a/proto/substrait/type_expressions.proto +++ b/proto/substrait/type_expressions.proto @@ -21,18 +21,22 @@ message DerivationExpression { Type.FP64 fp64 = 11; Type.String string = 12; Type.Binary binary = 13; - Type.Timestamp timestamp = 14; + // Deprecated in favor of `ExpressionPrecisionTimestamp precision_timestamp` + Type.Timestamp timestamp = 14 [deprecated = true]; Type.Date date = 16; Type.Time time = 17; Type.IntervalYear interval_year = 19; Type.IntervalDay interval_day = 20; - Type.TimestampTZ timestamp_tz = 29; + // Deprecated in favor of `ExpressionPrecisionTimestampTZ precision_timestamp_tz` + Type.TimestampTZ timestamp_tz = 29 
[deprecated = true]; Type.UUID uuid = 32; ExpressionFixedChar fixed_char = 21; ExpressionVarChar varchar = 22; ExpressionFixedBinary fixed_binary = 23; ExpressionDecimal decimal = 24; + ExpressionPrecisionTimestamp precision_timestamp = 40; + ExpressionPrecisionTimestampTZ precision_timestamp_tz = 41; ExpressionStruct struct = 25; ExpressionList list = 27; @@ -80,6 +84,18 @@ message DerivationExpression { Type.Nullability nullability = 4; } + message ExpressionPrecisionTimestamp { + DerivationExpression precision = 1; + uint32 variation_pointer = 2; + Type.Nullability nullability = 3; + } + + message ExpressionPrecisionTimestampTZ { + DerivationExpression precision = 1; + uint32 variation_pointer = 2; + Type.Nullability nullability = 3; + } + message ExpressionStruct { repeated DerivationExpression types = 1; uint32 variation_pointer = 2; diff --git a/site/docs/extensions/index.md b/site/docs/extensions/index.md index 0d4ae606a..b35dc68db 100644 --- a/site/docs/extensions/index.md +++ b/site/docs/extensions/index.md @@ -58,34 +58,36 @@ Rather than using a full data type representation, the input argument types (`sh Every compound function signature must be unique. If two function implementations in a YAML file would generate the same compound function signature, then the YAML file is invalid and behavior is undefined. -| Argument Type | Signature Name | -| -------------------------- | -------------- | -| Required Enumeration | req | -| i8 | i8 | -| i16 | i16 | -| i32 | i32 | -| i64 | i64 | -| fp32 | fp32 | -| fp64 | fp64 | -| string | str | -| binary | vbin | -| boolean | bool | -| timestamp | ts | -| timestamp_tz | tstz | -| date | date | -| time | time | -| interval_year | iyear | -| interval_day | iday | -| uuid | uuid | -| fixedchar<N> | fchar | -| varchar<N> | vchar | -| fixedbinary<N> | fbin | -| decimal<P,S> | dec | -| struct<T1,T2,...,TN> | struct | -| list<T> | list | -| map<K,V> | map | -| any[\d]? 
| any | -| user defined type | u!name | +| Argument Type | Signature Name | +|---------------------------------|----------------| +| Required Enumeration | req | +| i8 | i8 | +| i16 | i16 | +| i32 | i32 | +| i64 | i64 | +| fp32 | fp32 | +| fp64 | fp64 | +| string | str | +| binary | vbin | +| boolean | bool | +| timestamp | ts | +| timestamp_tz | tstz | +| date | date | +| time | time | +| interval_year | iyear | +| interval_day | iday | +| uuid | uuid | +| fixedchar<N> | fchar | +| varchar<N> | vchar | +| fixedbinary<N> | fbin | +| decimal<P,S> | dec | +| precision_timestamp<P> | pts | +| precision_timestamp_tz<P> | ptstz | +| struct<T1,T2,...,TN> | struct | +| list<T> | list | +| map<K,V> | map | +| any[\d]? | any | +| user defined type | u!name | #### Examples diff --git a/site/docs/types/type_classes.md b/site/docs/types/type_classes.md index bb5f8a51f..10b2e732e 100644 --- a/site/docs/types/type_classes.md +++ b/site/docs/types/type_classes.md @@ -19,8 +19,8 @@ Simple type classes are those that don't support any form of configuration. For | fp64 | An 8-byte double-precision floating point number with the same range and precision as defined for the [IEEE 754 64-bit floating-point format](https://standards.ieee.org/ieee/754/6210/). | `double` | string | A unicode string of text, [0..2,147,483,647] UTF-8 bytes in length. | `string` | binary | A binary value, [0..2,147,483,647] bytes in length. | `binary` -| timestamp | A naive timestamp within [1000-01-01 00:00:00.000000..9999-12-31 23:59:59.999999], with microsecond precision. Does not include timezone information and can thus not be unambiguously mapped to a moment on the timeline without context. Similar to naive datetime in Python. | `int64` microseconds since 1970-01-01 00:00:00.000000 (in an unspecified timezone) -| timestamp_tz | A timezone-aware timestamp within [1000-01-01 00:00:00.000000 UTC..9999-12-31 23:59:59.999999 UTC], with microsecond precision. Similar to aware datetime in Python. 
| `int64` microseconds since 1970-01-01 00:00:00.000000 UTC +| timestamp | A naive timestamp with microsecond precision. Does not include timezone information and can thus not be unambiguously mapped to a moment on the timeline without context. Similar to naive datetime in Python. | `int64` microseconds since 1970-01-01 00:00:00.000000 (in an unspecified timezone) +| timestamp_tz | A timezone-aware timestamp with microsecond precision. Similar to aware datetime in Python. | `int64` microseconds since 1970-01-01 00:00:00.000000 UTC | date | A date within [1000-01-01..9999-12-31]. | `int32` days since `1970-01-01` | time | A time since the beginning of any day. Range of [0..86,399,999,999] microseconds; leap seconds need not be supported. | `int64` microseconds past midnight | interval_year | Interval year to month. Supports a range of [-10,000..10,000] years with month precision (= [-120,000..120,000] months). Usually stored as separate integers for years and months, but only the total number of months is significant, i.e. `1y 0m` is considered equal to `0y 12m` or `1001y -12000m`. | `int32` years and `int32` months, with the added constraint that each component can never independently specify more than 10,000 years, even if the components have opposite signs (e.g. `-10000y 200000m` is **not** allowed) @@ -31,16 +31,18 @@ Simple type classes are those that don't support any form of configuration. For Compound type classes are type classes that need to be configured by means of a parameter pack. -| Type Name | Description | Protobuf representation for literals -| ---------------------------- | ------------------------------------------------------------ | ------------------------------------------------ -| FIXEDCHAR<L> | A fixed-length unicode string of L characters. L must be within [1..2,147,483,647]. | L-character `string` -| VARCHAR<L> | A unicode string of at most L characters.L must be within [1..2,147,483,647]. 
| `string` with at most L characters -| FIXEDBINARY<L> | A binary string of L bytes. When casting, values shorter than L are padded with zeros, and values longer than L are right-trimmed. | L-byte `bytes` -| DECIMAL<P, S> | A fixed-precision decimal value having precision (P, number of digits) <= 38 and scale (S, number of fractional digits) 0 <= S <= P. | 16-byte `bytes` representing a little-endian 128-bit integer, to be divided by 10^S to get the decimal value -| STRUCT<T1,...,Tn> | A list of types in a defined order. | `repeated Literal`, types matching T1..Tn -| NSTRUCT<N:T1,...,N:Tn> | **Pseudo-type**: A struct that maps unique names to value types. Each name is a UTF-8-encoded string. Each value can have a distinct type. Note that NSTRUCT is actually a pseudo-type, because Substrait's core type system is based entirely on ordinal positions, not named fields. Nonetheless, when working with systems outside Substrait, names are important. | n/a -| LIST<T> | A list of values of type T. The list can be between [0..2,147,483,647] values in length. | `repeated Literal`, all types matching T -| MAP<K, V> | An unordered list of type K keys with type V values. Keys may be repeated. While the key type could be nullable, keys may not be null. | `repeated KeyValue` (in turn two `Literal`s), all key types matching K and all value types matching V +| Type Name | Description | Protobuf representation for literals +|-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------------------------------------------------ +| FIXEDCHAR<L> | A fixed-length unicode string of L characters. L must be within [1..2,147,483,647]. 
| L-character `string` +| VARCHAR<L> | A unicode string of at most L characters. L must be within [1..2,147,483,647]. | `string` with at most L characters +| FIXEDBINARY<L> | A binary string of L bytes. When casting, values shorter than L are padded with zeros, and values longer than L are right-trimmed. | L-byte `bytes` +| DECIMAL<P, S> | A fixed-precision decimal value having precision (P, number of digits) <= 38 and scale (S, number of fractional digits) 0 <= S <= P. | 16-byte `bytes` representing a little-endian 128-bit integer, to be divided by 10^S to get the decimal value +| STRUCT<T1,...,Tn> | A list of types in a defined order. | `repeated Literal`, types matching T1..Tn +| NSTRUCT<N:T1,...,N:Tn> | **Pseudo-type**: A struct that maps unique names to value types. Each name is a UTF-8-encoded string. Each value can have a distinct type. Note that NSTRUCT is actually a pseudo-type, because Substrait's core type system is based entirely on ordinal positions, not named fields. Nonetheless, when working with systems outside Substrait, names are important. | n/a +| LIST<T> | A list of values of type T. The list can be between [0..2,147,483,647] values in length. | `repeated Literal`, all types matching T +| MAP<K, V> | An unordered list of type K keys with type V values. Keys may be repeated. While the key type could be nullable, keys may not be null. | `repeated KeyValue` (in turn two `Literal`s), all key types matching K and all value types matching V +| PRECISIONTIMESTAMP<P> | A timestamp with fractional second precision (P, number of digits) 0 <= P <= 9. Does not include timezone information and can thus not be unambiguously mapped to a moment on the timeline without context. Similar to naive datetime in Python. | `uint64` microseconds (P <= 6) or nanoseconds (P > 6) since 1970-01-01 00:00:00.000000000 (in an unspecified timezone) +| PRECISIONTIMESTAMPTZ<P> | A timezone-aware timestamp, with fractional second precision (P, number of digits) 0 <= P <= 9. 
Similar to aware datetime in Python. | `uint64` microseconds (P <= 6) or nanoseconds (P > 6) since 1970-01-01 00:00:00.000000000 UTC ## User-Defined Types