From 7dc820892bbcf01fb6db6d990e48d8f1861a8a9e Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 07:21:34 +0000 Subject: [PATCH 1/8] add hash_scalar Signed-off-by: Bugen Zhao --- src/common/src/array/struct_array.rs | 1 + src/common/src/types/mod.rs | 44 +++++++++++++--------------- src/common/src/types/scalar_impl.rs | 40 +++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/src/common/src/array/struct_array.rs b/src/common/src/array/struct_array.rs index 68295738179f..7409000009df 100644 --- a/src/common/src/array/struct_array.rs +++ b/src/common/src/array/struct_array.rs @@ -356,6 +356,7 @@ pub enum StructRef<'a> { ValueRef { val: &'a StructValue }, } +#[macro_export] macro_rules! iter_fields_ref { ($self:ident, $it:ident, { $($body:tt)* }) => { match $self { diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index 3048c86bfe3f..447af23cad10 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -374,6 +374,8 @@ pub trait ScalarRef<'a>: /// Convert `ScalarRef` to an owned scalar. fn to_owned_scalar(&self) -> Self::ScalarType; + + fn hash_scalar(&self, state: &mut H); } /// `for_all_scalar_variants` includes all variants of our scalar types. If you added a new scalar @@ -688,34 +690,30 @@ macro_rules! impl_scalar_impl_ref_conversion { for_all_scalar_variants! { impl_scalar_impl_ref_conversion } /// Should behave the same as [`crate::array::Array::hash_at`] for non-null items. -#[expect(clippy::derive_hash_xor_eq)] -impl Hash for ScalarImpl { - fn hash(&self, state: &mut H) { - macro_rules! impl_all_hash { - ($({ $variant_type:ty, $scalar_type:ident } ),*) => { +macro_rules! scalar_impl_hash { + ($( { $variant_name:ident, $suffix_name:ident, $scalar:ty, $scalar_ref:ty } ),*) => { + #[expect(clippy::derive_hash_xor_eq)] + impl Hash for ScalarRefImpl<'_> { + fn hash(&self, state: &mut H) { match self { - // Primitive types - $( Self::$scalar_type(inner) => { - NativeType::hash_wrapper(inner, state); - }, )* - Self::Interval(interval) => interval.hash(state), - Self::NaiveDate(naivedate) => naivedate.hash(state), - Self::NaiveTime(naivetime) => naivetime.hash(state), - Self::NaiveDateTime(naivedatetime) => naivedatetime.hash(state), - - // Manually implemented - Self::Bool(b) => b.hash(state), - Self::Utf8(s) => state.write(s.as_bytes()), - Self::Decimal(decimal) => decimal.normalize().hash(state), - Self::Struct(v) => v.hash(state), // TODO: check if this is consistent with `StructArray::hash_at` - Self::List(v) => v.hash(state), // TODO: check if this is consistent with `ListArray::hash_at` + $( Self::$variant_name(inner) => inner.hash_scalar(state), )* } - }; + } } - for_all_native_types! { impl_all_hash } - } + + #[expect(clippy::derive_hash_xor_eq)] + impl Hash for ScalarImpl { + fn hash(&self, state: &mut H) { + match self { + $( Self::$variant_name(inner) => inner.as_scalar_ref().hash_scalar(state), )* + } + } + } + }; } +for_all_scalar_variants! { scalar_impl_hash } + /// Feeds the raw scalar of `datum` to the given `state`, which should behave the same as /// [`crate::array::Array::hash_at`]. NULL value will be carefully handled. /// diff --git a/src/common/src/types/scalar_impl.rs b/src/common/src/types/scalar_impl.rs index 076a7fd2c6f0..e442c4d1cff9 100644 --- a/src/common/src/types/scalar_impl.rs +++ b/src/common/src/types/scalar_impl.rs @@ -47,6 +47,10 @@ macro_rules! impl_all_native_scalar { fn to_owned_scalar(&self) -> Self { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } )* }; @@ -102,6 +106,10 @@ impl<'a> ScalarRef<'a> for &'a str { fn to_owned_scalar(&self) -> String { self.to_string() } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } impl ScalarPartialOrd for Decimal { @@ -149,6 +157,10 @@ impl<'a> ScalarRef<'a> for bool { fn to_owned_scalar(&self) -> bool { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `Decimal`. @@ -171,6 +183,10 @@ impl<'a> ScalarRef<'a> for Decimal { fn to_owned_scalar(&self) -> Decimal { *self } + + fn hash_scalar(&self, state: &mut H) { + self.normalize().hash(state) + } } /// Implement `Scalar` for `IntervalUnit`. @@ -193,6 +209,10 @@ impl<'a> ScalarRef<'a> for IntervalUnit { fn to_owned_scalar(&self) -> IntervalUnit { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `NaiveDateWrapper`. @@ -215,6 +235,10 @@ impl<'a> ScalarRef<'a> for NaiveDateWrapper { fn to_owned_scalar(&self) -> NaiveDateWrapper { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `NaiveDateTimeWrapper`. @@ -237,6 +261,10 @@ impl<'a> ScalarRef<'a> for NaiveDateTimeWrapper { fn to_owned_scalar(&self) -> NaiveDateTimeWrapper { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `NaiveTimeWrapper`. @@ -259,6 +287,10 @@ impl<'a> ScalarRef<'a> for NaiveTimeWrapper { fn to_owned_scalar(&self) -> NaiveTimeWrapper { *self } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `StructValue`. @@ -273,6 +305,10 @@ impl<'a> ScalarRef<'a> for StructRef<'a> { .collect(); StructValue::new(fields) } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } /// Implement `Scalar` for `ListValue`. @@ -287,6 +323,10 @@ impl<'a> ScalarRef<'a> for ListRef<'a> { .collect(); ListValue::new(fields) } + + fn hash_scalar(&self, state: &mut H) { + self.hash(state) + } } impl ScalarImpl { From d1934bc7c241af288f7aead3ad207fe927dbc0c4 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 07:24:22 +0000 Subject: [PATCH 2/8] remove hash wrapper native Signed-off-by: Bugen Zhao --- src/common/src/array/primitive_array.rs | 12 ++------ src/common/src/types/native_type.rs | 38 ------------------------- 2 files changed, 3 insertions(+), 47 deletions(-) diff --git a/src/common/src/array/primitive_array.rs b/src/common/src/array/primitive_array.rs index 039e47fa7851..0b172f114f9d 100644 --- a/src/common/src/array/primitive_array.rs +++ b/src/common/src/array/primitive_array.rs @@ -53,7 +53,9 @@ where // item methods fn to_protobuf(self, output: &mut T) -> ArrayResult; - fn hash_wrapper(&self, state: &mut H); + fn hash_wrapper(&self, state: &mut H) { + ScalarRef::hash_scalar(self, state) + } } macro_rules! impl_array_methods { @@ -96,10 +98,6 @@ macro_rules! impl_primitive_for_native_types { fn to_protobuf(self, output: &mut T) -> ArrayResult { NativeType::to_protobuf(self, output) } - - fn hash_wrapper(&self, state: &mut H) { - NativeType::hash_wrapper(self, state) - } } )* } @@ -117,10 +115,6 @@ macro_rules! impl_primitive_for_others { fn to_protobuf(self, output: &mut T) -> ArrayResult { <$scalar_type>::to_protobuf(self, output) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state) - } } )* } diff --git a/src/common/src/types/native_type.rs b/src/common/src/types/native_type.rs index 5b2bcd84b128..bbf8dc47a2c7 100644 --- a/src/common/src/types/native_type.rs +++ b/src/common/src/types/native_type.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::fmt::Debug; -use std::hash::{Hash, Hasher}; use std::io::Write; use super::{OrderedF32, OrderedF64}; @@ -23,95 +22,58 @@ pub trait NativeType: PartialOrd + PartialEq + Debug + Copy + Send + Sync + Sized + Default + 'static { fn to_protobuf(self, output: &mut T) -> ArrayResult; - fn hash_wrapper(&self, state: &mut H); } impl NativeType for i16 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for i32 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for i64 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for OrderedF32 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - state.write_i32(self.0 as i32); - } } impl NativeType for OrderedF64 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - state.write_i64(self.0 as i64); - } } impl NativeType for u8 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for u16 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for u32 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } impl NativeType for u64 { fn to_protobuf(self, output: &mut T) -> ArrayResult { output.write(&self.to_be_bytes()).map_err(Into::into) } - - fn hash_wrapper(&self, state: &mut H) { - self.hash(state); - } } From 24eb5add4295a9b8b0ec5dc4e8596a84f0269abc Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 07:38:23 +0000 Subject: [PATCH 3/8] remove some hash_at Signed-off-by: Bugen Zhao --- src/common/src/array/bool_array.rs | 17 ++++------------- src/common/src/array/decimal_array.rs | 14 +++----------- src/common/src/array/mod.rs | 11 +++++++++-- src/common/src/array/primitive_array.rs | 15 +-------------- src/common/src/array/utf8_array.rs | 16 ++++------------ 5 files changed, 21 insertions(+), 52 deletions(-) diff --git a/src/common/src/array/bool_array.rs b/src/common/src/array/bool_array.rs index 655d7589da78..c4946262b5fa 100644 --- a/src/common/src/array/bool_array.rs +++ b/src/common/src/array/bool_array.rs @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::hash::{Hash, Hasher}; - use risingwave_pb::data::{Array as ProstArray, ArrayType}; -use super::{Array, ArrayBuilder, ArrayIterator, ArrayMeta, NULL_VAL_FOR_HASH}; +use super::{Array, ArrayBuilder, ArrayIterator, ArrayMeta}; use crate::array::ArrayBuilderImpl; use crate::buffer::{Bitmap, BitmapBuilder}; @@ -101,15 +99,6 @@ impl Array for BoolArray { self.bitmap = bitmap; } - #[inline(always)] - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - self.data.is_set(idx).hash(state); - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { let array_builder = BoolArrayBuilder::new(capacity); ArrayBuilderImpl::Bool(array_builder) @@ -170,10 +159,12 @@ impl ArrayBuilder for BoolArrayBuilder { #[cfg(test)] mod tests { + use std::hash::Hash; + use itertools::Itertools; use super::*; - use crate::array::read_bool_array; + use crate::array::{read_bool_array, NULL_VAL_FOR_HASH}; fn helper_test_builder(data: Vec>) -> BoolArray { let mut builder = BoolArrayBuilder::new(data.len()); diff --git a/src/common/src/array/decimal_array.rs b/src/common/src/array/decimal_array.rs index 5c2d94df9e4a..2bf0fd21819f 100644 --- a/src/common/src/array/decimal_array.rs +++ b/src/common/src/array/decimal_array.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::hash::{Hash, Hasher}; use std::mem::size_of; use itertools::Itertools; @@ -20,7 +19,7 @@ use risingwave_pb::common::buffer::CompressionType; use risingwave_pb::common::Buffer; use risingwave_pb::data::{Array as ProstArray, ArrayType}; -use super::{Array, ArrayBuilder, ArrayIterator, NULL_VAL_FOR_HASH}; +use super::{Array, ArrayBuilder, ArrayIterator}; use crate::array::{ArrayBuilderImpl, ArrayMeta}; use crate::buffer::{Bitmap, BitmapBuilder}; use crate::types::Decimal; @@ -118,15 +117,6 @@ impl Array for DecimalArray { self.bitmap = bitmap; } - #[inline(always)] - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - self.data[idx].normalize().hash(state); - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { let array_builder = DecimalArrayBuilder::new(capacity); ArrayBuilderImpl::Decimal(array_builder) @@ -184,12 +174,14 @@ impl ArrayBuilder for DecimalArrayBuilder { #[cfg(test)] mod tests { + use std::hash::Hash; use std::str::FromStr; use itertools::Itertools; use num_traits::FromPrimitive; use super::*; + use crate::array::NULL_VAL_FOR_HASH; #[test] fn test_decimal_builder() { diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index 13991b4bad3b..e581bb6c9454 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -34,7 +34,7 @@ mod utf8_array; mod value_reader; use std::convert::From; -use std::hash::Hasher; +use std::hash::{Hash, Hasher}; use std::sync::Arc; pub use bool_array::{BoolArray, BoolArrayBuilder}; @@ -197,7 +197,14 @@ pub trait Array: std::fmt::Debug + Send + Sync + Sized + 'static + Into(&self, idx: usize, state: &mut H); + #[inline(always)] + fn hash_at(&self, idx: usize, state: &mut H) { + if let Some(value) = self.value_at(idx) { + value.hash_scalar(state); + } else { + NULL_VAL_FOR_HASH.hash(state); + } + } fn hash_vec(&self, hashers: &mut [H]) { assert_eq!(hashers.len(), self.len()); diff --git a/src/common/src/array/primitive_array.rs b/src/common/src/array/primitive_array.rs index 0b172f114f9d..c136eb120215 100644 --- a/src/common/src/array/primitive_array.rs +++ b/src/common/src/array/primitive_array.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::fmt::Debug; -use std::hash::{Hash, Hasher}; use std::io::Write; use std::mem::size_of; @@ -21,7 +20,7 @@ use risingwave_pb::common::buffer::CompressionType; use risingwave_pb::common::Buffer; use risingwave_pb::data::{Array as ProstArray, ArrayType}; -use super::{Array, ArrayBuilder, ArrayIterator, ArrayResult, NULL_VAL_FOR_HASH}; +use super::{Array, ArrayBuilder, ArrayIterator, ArrayResult}; use crate::array::{ArrayBuilderImpl, ArrayImpl, ArrayMeta}; use crate::buffer::{Bitmap, BitmapBuilder}; use crate::for_all_native_types; @@ -53,9 +52,6 @@ where // item methods fn to_protobuf(self, output: &mut T) -> ArrayResult; - fn hash_wrapper(&self, state: &mut H) { - ScalarRef::hash_scalar(self, state) - } } macro_rules! impl_array_methods { @@ -211,15 +207,6 @@ impl Array for PrimitiveArray { self.bitmap = bitmap; } - #[inline(always)] - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - self.data[idx].hash_wrapper(state); - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { T::create_array_builder(capacity) } diff --git a/src/common/src/array/utf8_array.rs b/src/common/src/array/utf8_array.rs index 0d7e935a8697..64e9bacecbd4 100644 --- a/src/common/src/array/utf8_array.rs +++ b/src/common/src/array/utf8_array.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::hash::{Hash, Hasher}; use std::iter; use std::mem::size_of; @@ -21,7 +20,7 @@ use risingwave_pb::common::buffer::CompressionType; use risingwave_pb::common::Buffer; use risingwave_pb::data::{Array as ProstArray, ArrayType}; -use super::{Array, ArrayBuilder, ArrayIterator, ArrayMeta, ArrayResult, NULL_VAL_FOR_HASH}; +use super::{Array, ArrayBuilder, ArrayIterator, ArrayMeta, ArrayResult}; use crate::array::ArrayBuilderImpl; use crate::buffer::{Bitmap, BitmapBuilder}; @@ -121,16 +120,6 @@ impl Array for Utf8Array { self.bitmap = bitmap; } - #[inline(always)] - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - let data_slice = &self.data[self.offset[idx]..self.offset[idx + 1]]; - state.write(data_slice); - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { let array_builder = Utf8ArrayBuilder::new(capacity); ArrayBuilderImpl::Utf8(array_builder) @@ -320,9 +309,12 @@ impl BytesGuard { #[cfg(test)] mod tests { + use std::hash::Hash; + use itertools::Itertools; use super::*; + use crate::array::NULL_VAL_FOR_HASH; use crate::error::Result; #[test] From e8c63b7b40f87bc2cfbf2ee571acdd18fd15711e Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 08:16:19 +0000 Subject: [PATCH 4/8] fix list / struct hash Signed-off-by: Bugen Zhao --- src/common/src/array/list_array.rs | 30 ++++++++++------------------ src/common/src/array/mod.rs | 1 + src/common/src/array/struct_array.rs | 28 +++++++++----------------- src/common/src/types/mod.rs | 24 ++++++++++++++++------ src/common/src/types/scalar_impl.rs | 4 ++-- 5 files changed, 40 insertions(+), 47 deletions(-) diff --git a/src/common/src/array/list_array.rs b/src/common/src/array/list_array.rs index 47bf843dcfdf..3b376aa8f109 100644 --- a/src/common/src/array/list_array.rs +++ b/src/common/src/array/list_array.rs @@ -15,7 +15,7 @@ use core::fmt; use std::cmp::Ordering; use std::fmt::{Debug, Display}; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; use bytes::{Buf, BufMut}; use itertools::EitherOrBoth::{Both, Left, Right}; @@ -24,13 +24,12 @@ use risingwave_pb::data::{Array as ProstArray, ArrayType as ProstArrayType, List use serde::{Deserializer, Serializer}; use super::{ - Array, ArrayBuilder, ArrayBuilderImpl, ArrayImpl, ArrayIterator, ArrayMeta, ArrayResult, - RowRef, NULL_VAL_FOR_HASH, + Array, ArrayBuilder, ArrayBuilderImpl, ArrayImpl, ArrayIterator, ArrayMeta, ArrayResult, RowRef, }; use crate::buffer::{Bitmap, BitmapBuilder}; use crate::types::{ - deserialize_datum_from, display_datum_ref, serialize_datum_ref_into, to_datum_ref, DataType, - Datum, DatumRef, Scalar, ScalarRefImpl, + deserialize_datum_from, display_datum_ref, hash_datum_ref, serialize_datum_ref_into, + to_datum_ref, DataType, Datum, DatumRef, Scalar, ScalarRefImpl, }; /// This is a naive implementation of list array. @@ -211,14 +210,6 @@ impl Array for ListArray { self.bitmap = bitmap; } - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - self.value.hash_at(idx, state) - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { let array_builder = ListArrayBuilder::with_meta( capacity, @@ -444,14 +435,13 @@ impl<'a> ListRef<'a> { }); serializer.serialize_bytes(&inner_serializer.into_inner()) } -} -impl Hash for ListRef<'_> { - fn hash(&self, state: &mut H) { - match self { - ListRef::Indexed { arr, idx } => arr.hash_at(*idx, state), - ListRef::ValueRef { val } => val.hash(state), - } + pub fn hash_scalar_inner(&self, state: &mut H) { + iter_elems_ref!(self, it, { + for datum_ref in it { + hash_datum_ref(datum_ref, state); + } + }) } } diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index e581bb6c9454..a8fdea6a6ab0 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -197,6 +197,7 @@ pub trait Array: std::fmt::Debug + Send + Sync + Sized + 'static + Into(&self, idx: usize, state: &mut H) { if let Some(value) = self.value_at(idx) { diff --git a/src/common/src/array/struct_array.rs b/src/common/src/array/struct_array.rs index 7409000009df..50022acd04bc 100644 --- a/src/common/src/array/struct_array.rs +++ b/src/common/src/array/struct_array.rs @@ -15,7 +15,7 @@ use core::fmt; use std::cmp::Ordering; use std::fmt::{Debug, Display}; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; use std::sync::Arc; use bytes::{Buf, BufMut}; @@ -24,13 +24,12 @@ use risingwave_pb::data::{Array as ProstArray, ArrayType as ProstArrayType, Stru use super::{ Array, ArrayBuilder, ArrayBuilderImpl, ArrayImpl, ArrayIterator, ArrayMeta, ArrayResult, - NULL_VAL_FOR_HASH, }; use crate::array::ArrayRef; use crate::buffer::{Bitmap, BitmapBuilder}; use crate::types::{ - deserialize_datum_from, display_datum_ref, serialize_datum_ref_into, to_datum_ref, DataType, - Datum, DatumRef, Scalar, ScalarRefImpl, + deserialize_datum_from, display_datum_ref, hash_datum_ref, serialize_datum_ref_into, + to_datum_ref, DataType, Datum, DatumRef, Scalar, ScalarRefImpl, }; #[derive(Debug)] @@ -209,14 +208,6 @@ impl Array for StructArray { self.bitmap = bitmap; } - fn hash_at(&self, idx: usize, state: &mut H) { - if !self.is_null(idx) { - self.children.iter().for_each(|a| a.hash_at(idx, state)) - } else { - NULL_VAL_FOR_HASH.hash(state); - } - } - fn create_builder(&self, capacity: usize) -> ArrayBuilderImpl { let array_builder = StructArrayBuilder::with_meta( capacity, @@ -388,14 +379,13 @@ impl<'a> StructRef<'a> { Ok(()) }) } -} -impl Hash for StructRef<'_> { - fn hash(&self, state: &mut H) { - match self { - StructRef::Indexed { arr, idx } => arr.hash_at(*idx, state), - StructRef::ValueRef { val } => val.hash(state), - } + pub fn hash_scalar_inner(&self, state: &mut H) { + iter_fields_ref!(self, it, { + for datum_ref in it { + hash_datum_ref(datum_ref, state); + } + }) } } diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index 447af23cad10..95d3b3e5932f 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -375,6 +375,7 @@ pub trait ScalarRef<'a>: /// Convert `ScalarRef` to an owned scalar. fn to_owned_scalar(&self) -> Self::ScalarType; + /// A wrapped hash function to get the hash value for this scaler. fn hash_scalar(&self, state: &mut H); } @@ -429,7 +430,10 @@ macro_rules! scalar_impl_enum { for_all_scalar_variants! { scalar_impl_enum } -/// Implement `PartialOrd` and `Ord` for `ScalarImpl` and `ScalarRefImpl` with macro. +/// Implement [`PartialOrd`] and [`Ord`] for [`ScalarImpl`] and [`ScalarRefImpl`]. +/// +/// Scalars of different types are not comparable. For this case, `partial_cmp` returns `None` and +/// `cmp` will panic. macro_rules! scalar_impl_partial_ord { ($( { $variant_name:ident, $suffix_name:ident, $scalar:ty, $scalar_ref:ty } ),*) => { impl PartialOrd for ScalarImpl { @@ -689,7 +693,9 @@ macro_rules! impl_scalar_impl_ref_conversion { for_all_scalar_variants! { impl_scalar_impl_ref_conversion } -/// Should behave the same as [`crate::array::Array::hash_at`] for non-null items. +/// Implement [`Hash`] for [`ScalarImpl`] and [`ScalarRefImpl`] with `hash_scalar`. +/// +/// Should behave the same as [`crate::array::Array::hash_at`]. macro_rules! scalar_impl_hash { ($( { $variant_name:ident, $suffix_name:ident, $scalar:ty, $scalar_ref:ty } ),*) => { #[expect(clippy::derive_hash_xor_eq)] @@ -716,11 +722,17 @@ for_all_scalar_variants! { scalar_impl_hash } /// Feeds the raw scalar of `datum` to the given `state`, which should behave the same as /// [`crate::array::Array::hash_at`]. NULL value will be carefully handled. -/// -/// Caveats: this relies on the above implementation of [`Hash`]. +#[inline(always)] pub fn hash_datum(datum: &Datum, state: &mut impl std::hash::Hasher) { - match datum { - Some(scalar) => scalar.hash(state), + hash_datum_ref(to_datum_ref(datum), state) +} + +/// Feeds the raw scalar reference of `datum_ref` to the given `state`, which should behave the same +/// as [`crate::array::Array::hash_at`]. NULL value will be carefully handled. +#[inline(always)] +pub fn hash_datum_ref(datum_ref: DatumRef<'_>, state: &mut impl std::hash::Hasher) { + match datum_ref { + Some(scalar_ref) => scalar_ref.hash(state), None => NULL_VAL_FOR_HASH.hash(state), } } diff --git a/src/common/src/types/scalar_impl.rs b/src/common/src/types/scalar_impl.rs index e442c4d1cff9..aaec201dccec 100644 --- a/src/common/src/types/scalar_impl.rs +++ b/src/common/src/types/scalar_impl.rs @@ -307,7 +307,7 @@ impl<'a> ScalarRef<'a> for StructRef<'a> { } fn hash_scalar(&self, state: &mut H) { - self.hash(state) + self.hash_scalar_inner(state) } } @@ -325,7 +325,7 @@ impl<'a> ScalarRef<'a> for ListRef<'a> { } fn hash_scalar(&self, state: &mut H) { - self.hash(state) + self.hash_scalar_inner(state) } } From 6a82ae5559e9b93e2282842cde72d7fe8217e04b Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 08:24:31 +0000 Subject: [PATCH 5/8] use const Signed-off-by: Bugen Zhao --- src/common/src/array/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/src/array/mod.rs b/src/common/src/array/mod.rs index a8fdea6a6ab0..1b6d276ff38d 100644 --- a/src/common/src/array/mod.rs +++ b/src/common/src/array/mod.rs @@ -74,7 +74,7 @@ pub type F64ArrayBuilder = PrimitiveArrayBuilder; pub type F32ArrayBuilder = PrimitiveArrayBuilder; /// The hash source for `None` values when hashing an item. -pub(crate) static NULL_VAL_FOR_HASH: u32 = 0xfffffff0; +pub(crate) const NULL_VAL_FOR_HASH: u32 = 0xfffffff0; /// A trait over all array builders. /// @@ -200,6 +200,8 @@ pub trait Array: std::fmt::Debug + Send + Sync + Sized + 'static + Into(&self, idx: usize, state: &mut H) { + // We use a default implementation for all arrays for now, as retrieving the reference + // should be lightweight. if let Some(value) = self.value_at(idx) { value.hash_scalar(state); } else { From 184f22cf4f7c7a9f4b015cf51311e390d68ac731 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 11:58:44 +0000 Subject: [PATCH 6/8] add unit test Signed-off-by: Bugen Zhao --- src/common/src/types/mod.rs | 103 +++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index 95d3b3e5932f..0d5497b226dc 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -698,7 +698,6 @@ for_all_scalar_variants! { impl_scalar_impl_ref_conversion } /// Should behave the same as [`crate::array::Array::hash_at`]. macro_rules! scalar_impl_hash { ($( { $variant_name:ident, $suffix_name:ident, $scalar:ty, $scalar_ref:ty } ),*) => { - #[expect(clippy::derive_hash_xor_eq)] impl Hash for ScalarRefImpl<'_> { fn hash(&self, state: &mut H) { match self { @@ -941,12 +940,16 @@ pub fn literal_type_match(data_type: &DataType, literal: Option<&ScalarImpl>) -> #[cfg(test)] mod tests { + use std::hash::{BuildHasher, Hasher}; use std::ops::Neg; + use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; use itertools::Itertools; use rand::thread_rng; + use strum::IntoEnumIterator; use super::*; + use crate::util::hash_util::Crc32FastBuilder; fn serialize_datum_not_null_into_vec(data: i64) -> Vec { let mut serializer = memcomparable::Serializer::new(vec![]); @@ -970,7 +973,7 @@ mod tests { } #[test] - fn test_issue_2057_ordered_float_memcomparable() { + fn test_issue_legacy_2057_ordered_float_memcomparable() { use num_traits::*; use rand::seq::SliceRandom; @@ -1038,4 +1041,100 @@ mod tests { ); assert_eq!(format!("{}", d), "struct".to_string()); } + + #[test] + fn test_hash_implementation() { + fn test(datum: Datum, data_type: DataType) { + let mut builder = data_type.create_array_builder(6); + for _ in 0..3 { + builder.append_null(); + builder.append_datum(&datum); + } + let array = builder.finish(); + + let hash_from_array = { + let mut state = Crc32FastBuilder.build_hasher(); + array.hash_at(3, &mut state); + state.finish() + }; + + let hash_from_datum = { + let mut state = Crc32FastBuilder.build_hasher(); + hash_datum(&datum, &mut state); + state.finish() + }; + + let hash_from_datum_ref = { + let mut state = Crc32FastBuilder.build_hasher(); + hash_datum_ref(to_datum_ref(&datum), &mut state); + state.finish() + }; + + assert_eq!(hash_from_array, hash_from_datum); + assert_eq!(hash_from_datum, hash_from_datum_ref); + } + + for name in DataTypeName::iter() { + let (scalar, data_type) = match name { + DataTypeName::Boolean => (ScalarImpl::Bool(true), DataType::Boolean), + DataTypeName::Int16 => (ScalarImpl::Int16(233), DataType::Int16), + DataTypeName::Int32 => (ScalarImpl::Int32(233333), DataType::Int32), + DataTypeName::Int64 => (ScalarImpl::Int64(233333333333), DataType::Int64), + DataTypeName::Float32 => (ScalarImpl::Float32(23.33.into()), DataType::Float32), + DataTypeName::Float64 => ( + ScalarImpl::Float64(23.333333333333.into()), + DataType::Float64, + ), + DataTypeName::Decimal => ( + ScalarImpl::Decimal("233.33".parse().unwrap()), + DataType::Decimal, + ), + DataTypeName::Date => ( + ScalarImpl::NaiveDate(NaiveDateWrapper(NaiveDate::from_ymd(2333, 3, 3))), + DataType::Date, + ), + DataTypeName::Varchar => (ScalarImpl::Utf8("233".to_string()), DataType::Varchar), + DataTypeName::Time => ( + ScalarImpl::NaiveTime(NaiveTimeWrapper(NaiveTime::from_hms(2, 3, 3))), + DataType::Time, + ), + DataTypeName::Timestamp => ( + ScalarImpl::NaiveDateTime(NaiveDateTimeWrapper(NaiveDateTime::from_timestamp( + 23333333, 2333, + ))), + DataType::Timestamp, + ), + DataTypeName::Timestampz => (ScalarImpl::Int64(233333333), DataType::Timestampz), + DataTypeName::Interval => ( + ScalarImpl::Interval(IntervalUnit::new(2, 3, 3333)), + DataType::Interval, + ), + DataTypeName::Struct => ( + ScalarImpl::Struct(StructValue::new(vec![ + ScalarImpl::Int64(233).into(), + ScalarImpl::Float64(23.33.into()).into(), + ])), + DataType::Struct( + StructType::new(vec![ + (DataType::Int64, "a".to_string()), + (DataType::Float64, "b".to_string()), + ]) + .into(), + ), + ), + DataTypeName::List => ( + ScalarImpl::List(ListValue::new(vec![ + ScalarImpl::Int64(233).into(), + ScalarImpl::Int64(2333).into(), + ])), + DataType::List { + datatype: Box::new(DataType::Int64), + }, + ), + }; + + test(Some(scalar), data_type.clone()); + test(None, data_type); + } + } } From 89ae46a08cb88bf75ea6c829e0be998a707d81aa Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 12:09:44 +0000 Subject: [PATCH 7/8] add fixme Signed-off-by: Bugen Zhao --- src/common/src/types/mod.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index f83126ede155..f1d16bc054f6 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -722,14 +722,22 @@ macro_rules! scalar_impl_hash { for_all_scalar_variants! { scalar_impl_hash } /// Feeds the raw scalar of `datum` to the given `state`, which should behave the same as -/// [`crate::array::Array::hash_at`]. NULL value will be carefully handled. +/// [`crate::array::Array::hash_at`], where NULL value will be carefully handled. +/// +/// **FIXME**: the result of this function might be different from [`std::hash::Hash`] due to the +/// type alias of `Datum = Option<_>`, we should manually implement [`std::hash::Hash`] for +/// [`Datum`] in the future when it becomes a newtype. #[inline(always)] pub fn hash_datum(datum: &Datum, state: &mut impl std::hash::Hasher) { hash_datum_ref(to_datum_ref(datum), state) } /// Feeds the raw scalar reference of `datum_ref` to the given `state`, which should behave the same -/// as [`crate::array::Array::hash_at`]. NULL value will be carefully handled. +/// as [`crate::array::Array::hash_at`], where NULL value will be carefully handled. +/// +/// **FIXME**: the result of this function might be different from [`std::hash::Hash`] due to the +/// type alias of `DatumRef = Option<_>`, we should manually implement [`std::hash::Hash`] for +/// [`DatumRef`] in the future when it becomes a newtype. #[inline(always)] pub fn hash_datum_ref(datum_ref: DatumRef<'_>, state: &mut impl std::hash::Hasher) { match datum_ref { @@ -1044,6 +1052,8 @@ mod tests { #[test] fn test_hash_implementation() { fn test(datum: Datum, data_type: DataType) { + assert!(literal_type_match(&data_type, datum.as_ref())); + let mut builder = data_type.create_array_builder(6); for _ in 0..3 { builder.append_null(); From 2b822ba47456f98544047a83eff74a4485b987f7 Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Fri, 4 Nov 2022 14:01:07 +0000 Subject: [PATCH 8/8] add issue Signed-off-by: Bugen Zhao --- src/common/src/types/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/common/src/types/mod.rs b/src/common/src/types/mod.rs index f1d16bc054f6..742c792b8daf 100644 --- a/src/common/src/types/mod.rs +++ b/src/common/src/types/mod.rs @@ -503,7 +503,7 @@ pub fn serialize_datum_ref_not_null_into( .serialize(serializer) } -// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. +// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. (#477) // TODO: specify `NULL FIRST` or `NULL LAST`. pub fn serialize_datum_into( datum: &Datum, @@ -519,7 +519,7 @@ pub fn serialize_datum_into( Ok(()) } -// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. +// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. (#477) pub fn serialize_datum_not_null_into( datum: &Datum, serializer: &mut memcomparable::Serializer, @@ -530,7 +530,7 @@ pub fn serialize_datum_not_null_into( .serialize(serializer) } -// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. +// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. (#477) pub fn deserialize_datum_from( ty: &DataType, deserializer: &mut memcomparable::Deserializer, @@ -543,7 +543,7 @@ pub fn deserialize_datum_from( } } -// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. +// TODO(MrCroxx): turn Datum into a struct, and impl ser/de as its member functions. (#477) pub fn deserialize_datum_not_null_from( ty: &DataType, deserializer: &mut memcomparable::Deserializer, @@ -726,7 +726,7 @@ for_all_scalar_variants! { scalar_impl_hash } /// /// **FIXME**: the result of this function might be different from [`std::hash::Hash`] due to the /// type alias of `Datum = Option<_>`, we should manually implement [`std::hash::Hash`] for -/// [`Datum`] in the future when it becomes a newtype. +/// [`Datum`] in the future when it becomes a newtype. (#477) #[inline(always)] pub fn hash_datum(datum: &Datum, state: &mut impl std::hash::Hasher) { hash_datum_ref(to_datum_ref(datum), state) @@ -737,7 +737,7 @@ pub fn hash_datum(datum: &Datum, state: &mut impl std::hash::Hasher) { /// /// **FIXME**: the result of this function might be different from [`std::hash::Hash`] due to the /// type alias of `DatumRef = Option<_>`, we should manually implement [`std::hash::Hash`] for -/// [`DatumRef`] in the future when it becomes a newtype. +/// [`DatumRef`] in the future when it becomes a newtype. (#477) #[inline(always)] pub fn hash_datum_ref(datum_ref: DatumRef<'_>, state: &mut impl std::hash::Hasher) { match datum_ref {