diff --git a/e2e_test/batch/types/interval.slt.part b/e2e_test/batch/types/interval.slt.part index 16c688707485a..73bd7189941bd 100644 --- a/e2e_test/batch/types/interval.slt.part +++ b/e2e_test/batch/types/interval.slt.part @@ -74,20 +74,20 @@ SELECT INTERVAL '3 mins' * 1.5; 00:04:30 # https://github.com/risingwavelabs/risingwave/issues/3873 -query TTTTT +query T select distinct * from (values (interval '1' month), (interval '30' day)) as t; ---- -30 days +1 mon -query TTTTT +query T select distinct * from (values (interval '30' day), (interval '1' month)) as t; ---- -30 days +1 mon -query TTTTT +query T select distinct * from (values (interval '720' hour), (interval '1' month)) as t; ---- -30 days +1 mon query TTTTTT select interval '1 year 1 month 1 day 1'; @@ -119,3 +119,44 @@ query T select '1 mons 1 days 00:00:00.0000001'::INTERVAL; ---- 1 mon 1 day + +# Tests moved from regress tests due to not matching exactly. + +# In mixed sign intervals, PostgreSQL displays positive sign after negative +# (e.g. `-1 mons +1 day`) while we display it without sign +# (e.g. `-1 mons 1 day`). +# But the main purpose of this test case is ordering of large values. + +statement ok +CREATE TABLE INTERVAL_TBL_OF (f1 interval); + +statement ok +INSERT INTO INTERVAL_TBL_OF (f1) VALUES + ('2147483647 days 2147483647 months'), + ('2147483647 days -2147483648 months'), + ('1 year'), + ('-2147483648 days 2147483647 months'), + ('-2147483648 days -2147483648 months'); + +statement ok +FLUSH; + +query TT +SELECT r1.*, r2.* + FROM INTERVAL_TBL_OF r1, INTERVAL_TBL_OF r2 + WHERE r1.f1 > r2.f1 + ORDER BY r1.f1, r2.f1; +---- +-178956970 years -8 mons 2147483647 days -178956970 years -8 mons -2147483648 days +1 year -178956970 years -8 mons -2147483648 days +1 year -178956970 years -8 mons 2147483647 days +178956970 years 7 mons -2147483648 days -178956970 years -8 mons -2147483648 days +178956970 years 7 mons -2147483648 days -178956970 years -8 mons 2147483647 days +178956970 years 7 mons -2147483648 days 1 year +178956970 years 7 mons 2147483647 days -178956970 years -8 mons -2147483648 days +178956970 years 7 mons 2147483647 days -178956970 years -8 mons 2147483647 days +178956970 years 7 mons 2147483647 days 1 year +178956970 years 7 mons 2147483647 days 178956970 years 7 mons -2147483648 days + +statement ok +DROP TABLE INTERVAL_TBL_OF; diff --git a/src/common/src/hash/key.rs b/src/common/src/hash/key.rs index 95a6d6c58189d..2762fcbbe512c 100644 --- a/src/common/src/hash/key.rs +++ b/src/common/src/hash/key.rs @@ -39,8 +39,8 @@ use crate::collection::estimate_size::EstimateSize; use crate::hash::VirtualNode; use crate::row::{OwnedRow, RowDeserializer}; use crate::types::{ - DataType, Decimal, IntervalUnit, NaiveDateTimeWrapper, NaiveDateWrapper, NaiveTimeWrapper, - OrderedF32, OrderedF64, ScalarRef, + DataType, Decimal, NaiveDateTimeWrapper, NaiveDateWrapper, NaiveTimeWrapper, OrderedF32, + OrderedF64, ScalarRef, }; use crate::util::hash_util::Crc32FastBuilder; use crate::util::iter_util::ZipEqFast; @@ -357,29 +357,6 @@ impl HashKeySerDe<'_> for Decimal { } } -impl HashKeySerDe<'_> for IntervalUnit { - type S = [u8; 16]; - - fn serialize(mut self) -> Self::S { - self.justify_interval(); - let mut ret = [0; 16]; - ret[0..4].copy_from_slice(&self.get_months().to_ne_bytes()); - ret[4..8].copy_from_slice(&self.get_days().to_ne_bytes()); - ret[8..16].copy_from_slice(&self.get_usecs().to_ne_bytes()); - - ret - } - - fn deserialize(source: &mut R) -> Self { - let value = Self::read_fixed_size_bytes::(source); - IntervalUnit::from_month_day_usec( - i32::from_ne_bytes(value[0..4].try_into().unwrap()), - i32::from_ne_bytes(value[4..8].try_into().unwrap()), - i64::from_ne_bytes(value[8..16].try_into().unwrap()), - ) - } -} - impl<'a> HashKeySerDe<'a> for &'a str { type S = Vec; diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 153eadc9076ab..0081dadb173c7 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -81,25 +81,6 @@ impl IntervalUnit { self.usecs.rem_euclid(USECS_PER_DAY) as u64 } - /// Justify interval, convert 1 month to 30 days and 86400 s to 1 day. - /// If day is positive, complement the seconds negative value. - /// These rules only use in interval comparison. - pub fn justify_interval(&mut self) { - #[expect(deprecated)] - let total_usecs = self.as_usecs_i64(); - *self = Self { - months: 0, - days: (total_usecs / USECS_PER_DAY) as i32, - usecs: total_usecs % USECS_PER_DAY, - } - } - - pub fn justified(&self) -> Self { - let mut interval = *self; - interval.justify_interval(); - interval - } - #[deprecated] fn from_total_usecs(usecs: i64) -> Self { let mut remaining_usecs = usecs; @@ -485,20 +466,165 @@ impl std::fmt::Debug for IntervalUnitDisplay<'_> { } } -/// Loss of information during the process due to `justify`. Only intended for memcomparable -/// encoding. +/// +/// +/// Do NOT make this `pub` as the assumption of 1 month = 30 days and 1 day = 24 hours does not +/// always hold in other places. +/// +/// Given this equality definition in PostgreSQL, different interval values can be considered equal, +/// forming equivalence classes. For example: +/// * '-45 days' == '-1 months -15 days' == '1 months -75 days' +/// * '-2147483646 months -210 days' == '-2147483648 months -150 days' == '-2075900865 months +/// -2147483640 days' +/// +/// To hash and memcompare them, we need to pick a representative for each equivalence class, and +/// then map all values from the same equivalence class to the same representative. There are 3 +/// choices (may be more): +/// (a) an `i128` of total `usecs`, with `months` and `days` transformed into `usecs`; +/// (b) the justified interval, as defined by PostgreSQL `justify_interval`; +/// (c) the alternate representative interval that maximizes `abs` of smaller units; +/// +/// For simplicity we will assume there are only `months` and `days` and ignore `usecs` below. +/// +/// The justified interval is more human friendly. It requires all units to have the same sign, and +/// that `0 <= abs(usecs) < USECS_PER_DAY && 0 <= abs(days) < 30`. However, it may overflow. In the +/// 2 examples above, '-1 months -15 days' is the justified interval of the first equivalence class, +/// but there is no justified interval in the second one. It would be '-2147483653 months' but this +/// overflows `i32`. A lot of bits are wasted in a justified interval because `days` is using +/// `i32` for `-29..=29` only. +/// +/// The alternate representative interval aims to avoid this overflow. It still requires all units +/// to have the same sign, but maximizes `abs` of smaller unit rather than limit it to `29`. The +/// alternate representative of the 2 examples above are '-45 days' and '-2075900865 months +/// -2147483640 days'. The alternate representative interval always exists. +/// +/// For serialize, we could use any of 3, with a workaround of using (i33, i6, i38) rather than +/// (i32, i32, i64) to avoid overflow of the justified interval. We chose the `usecs: i128` option. +/// +/// For deserialize, we attempt justified interval first and fallback to alternate. This could give +/// human friendly results in common cases and still guarantee no overflow, as long as the bytes +/// were serialized properly. +/// +/// Note the alternate representative interval does not exist in PostgreSQL as they do not +/// deserialize from `IntervalCmpValue`. +#[derive(PartialEq, Eq, Hash, PartialOrd, Ord)] +struct IntervalCmpValue(i128); + +impl From for IntervalCmpValue { + fn from(value: IntervalUnit) -> Self { + let days = (value.days as i64) + 30i64 * (value.months as i64); + let usecs = (value.usecs as i128) + (USECS_PER_DAY as i128) * (days as i128); + Self(usecs) + } +} + +impl Ord for IntervalUnit { + fn cmp(&self, other: &Self) -> Ordering { + IntervalCmpValue::from(*self).cmp(&(*other).into()) + } +} + +impl PartialOrd for IntervalUnit { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for IntervalUnit { + fn eq(&self, other: &Self) -> bool { + self.cmp(other).is_eq() + } +} + +impl Eq for IntervalUnit {} + +impl Hash for IntervalUnit { + fn hash(&self, state: &mut H) { + IntervalCmpValue::from(*self).hash(state); + } +} + +/// Loss of information during the process due to `IntervalCmpValue`. Only intended for +/// memcomparable encoding. impl Serialize for IntervalUnit { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, { - let IntervalUnit { - months, - days, - usecs, - } = self.justified(); - // serialize the `IntervalUnit` as a tuple - (months, days, usecs).serialize(serializer) + let cmp_value = IntervalCmpValue::from(*self); + // split i128 as (i64, u64), which is equivalent + ( + (cmp_value.0 >> 64) as i64, + cmp_value.0 as u64, // truncate to get the lower part + ) + .serialize(serializer) + } +} + +impl IntervalCmpValue { + /// Recover the justified interval from this equivalence class, if it exists. + fn as_justified(&self) -> Option { + let usecs = (self.0 % (USECS_PER_DAY as i128)) as i64; + let remaining_days = self.0 / (USECS_PER_DAY as i128); + let days = (remaining_days % 30) as i32; + let months = (remaining_days / 30).try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + + /// Recover the alternate representative interval from this equivalence class. + /// It always exists unless the encoding is invalid. See [`IntervalCmpValue`] for details. + fn as_alternate(&self) -> Option { + match self.0.cmp(&0) { + Ordering::Equal => Some(IntervalUnit::from_month_day_usec(0, 0, 0)), + Ordering::Greater => { + let remaining_usecs = self.0; + let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; + let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + // `usecs` is now smaller than `USECS_PER_DAY` but has 64 bits. + // How much more days (multiples of `USECS_PER_DAY`) can it hold before overflowing + // i64::MAX? + // It should also not exceed `remaining_days` to bring it from positive to negative. + // When `remaining_days` is larger than `i64::MAX`, just limit by `i64::MAX` (no-op) + let extra_days = ((i64::MAX - usecs) / USECS_PER_DAY) + .min(remaining_days.try_into().unwrap_or(i64::MAX)); + // The lhs of `min` ensures `extra_days * USECS_PER_DAY <= i64::MAX - usecs` + usecs += extra_days * USECS_PER_DAY; + // The rhs of `min` ensures `extra_days <= remaining_days` + remaining_days -= extra_days as i128; + + // Similar above + let mut days = (remaining_days % 30) as i32; + let mut remaining_months = remaining_days / 30; + let extra_months = + ((i32::MAX - days) / 30).min(remaining_months.try_into().unwrap_or(i32::MAX)); + days += extra_months * 30; + remaining_months -= extra_months as i128; + + let months = remaining_months.try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + Ordering::Less => { + let remaining_usecs = self.0; + let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; + let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + // The negative case. Borrow negative `extra_days` to make `usecs` as close to + // `i64::MIN` as possible. + let extra_days = ((i64::MIN - usecs) / USECS_PER_DAY) + .max(remaining_days.try_into().unwrap_or(i64::MIN)); + usecs += extra_days * USECS_PER_DAY; + remaining_days -= extra_days as i128; + + let mut days = (remaining_days % 30) as i32; + let mut remaining_months = remaining_days / 30; + let extra_months = + ((i32::MIN - days) / 30).max(remaining_months.try_into().unwrap_or(i32::MIN)); + days += extra_months * 30; + remaining_months -= extra_months as i128; + + let months = remaining_months.try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + } } } @@ -507,15 +633,36 @@ impl<'de> Deserialize<'de> for IntervalUnit { where D: serde::Deserializer<'de>, { - let (months, days, usecs) = <(i32, i32, i64)>::deserialize(deserializer)?; - Ok(Self { - months, - days, - usecs, + let (hi, lo) = <(i64, u64)>::deserialize(deserializer)?; + let cmp_value = IntervalCmpValue(((hi as i128) << 64) | (lo as i128)); + let interval = cmp_value + .as_justified() + .or_else(|| cmp_value.as_alternate()); + interval.ok_or_else(|| { + use serde::de::Error as _; + D::Error::custom("memcomparable deserialize interval overflow") }) } } +impl crate::hash::HashKeySerDe<'_> for IntervalUnit { + type S = [u8; 16]; + + fn serialize(self) -> Self::S { + let cmp_value = IntervalCmpValue::from(self); + cmp_value.0.to_ne_bytes() + } + + fn deserialize(source: &mut R) -> Self { + let value = Self::read_fixed_size_bytes::(source); + let cmp_value = IntervalCmpValue(i128::from_ne_bytes(value)); + cmp_value + .as_justified() + .or_else(|| cmp_value.as_alternate()) + .expect("HashKey deserialize interval overflow") + } +} + /// Duplicated logic only used by `HopWindow`. See #8452. #[expect(clippy::from_over_into)] impl Into for IntervalUnit { @@ -565,43 +712,6 @@ impl Add for IntervalUnit { } } -impl PartialOrd for IntervalUnit { - fn partial_cmp(&self, other: &Self) -> Option { - if self.eq(other) { - Some(Ordering::Equal) - } else { - let diff = *self - *other; - let days = diff.months as i64 * 30 + diff.days as i64; - Some((days * USECS_PER_DAY + diff.usecs).cmp(&0)) - } - } -} - -impl Hash for IntervalUnit { - fn hash(&self, state: &mut H) { - let interval = self.justified(); - interval.months.hash(state); - interval.usecs.hash(state); - interval.days.hash(state); - } -} - -impl PartialEq for IntervalUnit { - fn eq(&self, other: &Self) -> bool { - let interval = self.justified(); - let other = other.justified(); - interval.days == other.days && interval.usecs == other.usecs - } -} - -impl Eq for IntervalUnit {} - -impl Ord for IntervalUnit { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap() - } -} - impl CheckedNeg for IntervalUnit { fn checked_neg(&self) -> Option { let months = self.months.checked_neg()?; @@ -668,10 +778,7 @@ impl Zero for IntervalUnit { impl IsNegative for IntervalUnit { fn is_negative(&self) -> bool { - let i = self.justified(); - i.months < 0 - || (i.months == 0 && i.days < 0) - || (i.months == 0 && i.days == 0 && i.usecs < 0) + self < &Self::from_month_day_usec(0, 0, 0) } } @@ -1299,4 +1406,95 @@ mod tests { assert_eq!(lhs.cmp(&rhs), order) } } + + #[test] + fn test_deserialize_justify() { + let cases = [ + ( + (0, 0, USECS_PER_MONTH * 2 + USECS_PER_DAY * 3 + 4), + Some((2, 3, 4i64, "2 mons 3 days 00:00:00.000004")), + ), + ((i32::MIN, i32::MIN, i64::MIN), None), + ((i32::MAX, i32::MAX, i64::MAX), None), + ( + (0, i32::MIN, i64::MIN), + Some(( + -75141187, + -29, + -14454775808, + "-6261765 years -7 mons -29 days -04:00:54.775808", + )), + ), + ( + (i32::MIN, -60, i64::MAX), + Some(( + -2143925250, + -8, + -71945224193, + "-178660437 years -6 mons -8 days -19:59:05.224193", + )), + ), + ]; + for ((lhs_months, lhs_days, lhs_usecs), rhs) in cases { + let input = IntervalUnit::from_month_day_usec(lhs_months, lhs_days, lhs_usecs); + let actual_deserialize = IntervalCmpValue::from(input).as_justified(); + + match rhs { + None => { + assert_eq!(actual_deserialize, None); + } + Some((rhs_months, rhs_days, rhs_usecs, rhs_str)) => { + // We should test individual fields rather than using custom `Eq` + assert_eq!(actual_deserialize.unwrap().get_months(), rhs_months); + assert_eq!(actual_deserialize.unwrap().get_days(), rhs_days); + assert_eq!(actual_deserialize.unwrap().get_usecs(), rhs_usecs); + assert_eq!(actual_deserialize.unwrap().to_string(), rhs_str); + } + } + } + + // A false positive overflow that is buggy in PostgreSQL 15.2. + let input = IntervalUnit::from_month_day_usec(i32::MIN, -30, 1); + let actual_deserialize = IntervalCmpValue::from(input).as_justified(); + // It has a justified interval within range, and can be obtained by our deserialization. + assert_eq!(actual_deserialize.unwrap().get_months(), i32::MIN); + assert_eq!(actual_deserialize.unwrap().get_days(), -29); + assert_eq!(actual_deserialize.unwrap().get_usecs(), -USECS_PER_DAY + 1); + } + + #[test] + fn test_deserialize_alternate() { + let cases = [ + (0, 0, USECS_PER_MONTH * 2 + USECS_PER_DAY * 3 + 4), + (i32::MIN, i32::MIN, i64::MIN), + (i32::MAX, i32::MAX, i64::MAX), + (0, i32::MIN, i64::MIN), + (i32::MIN, -60, i64::MAX), + ]; + for (months, days, usecs) in cases { + let input = IntervalUnit::from_month_day_usec(months, days, usecs); + + let mut serializer = memcomparable::Serializer::new(vec![]); + input.serialize(&mut serializer).unwrap(); + let buf = serializer.into_inner(); + let mut deserializer = memcomparable::Deserializer::new(&buf[..]); + let actual = IntervalUnit::deserialize(&mut deserializer).unwrap(); + + // The IntervalUnit we get back can be a different one, but they should be equal. + assert_eq!(actual, input); + } + + // Decoding invalid value + let mut serializer = memcomparable::Serializer::new(vec![]); + (i64::MAX, u64::MAX).serialize(&mut serializer).unwrap(); + let buf = serializer.into_inner(); + let mut deserializer = memcomparable::Deserializer::new(&buf[..]); + assert!(IntervalUnit::deserialize(&mut deserializer).is_err()); + + let buf = i128::MIN.to_ne_bytes(); + std::panic::catch_unwind(|| { + ::deserialize(&mut &buf[..]) + }) + .unwrap_err(); + } }