From 5b0d0f7a5b1c536bc596d026d0016c421fc824a3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 12 Nov 2024 17:13:07 -0500 Subject: [PATCH 1/3] feat: kleene_or, kleene_and, and restore SQL semantics I noticed that our row filters disagreed with DuckDB's semantics which is SQL semantics, in particular `where true or null` _keeps_ the row (we filtered it). --- pyvortex/src/expr.rs | 6 +- .../src/array/bool/compute/boolean.rs | 38 +++-- vortex-array/src/array/constant/compute.rs | 62 ++++++-- vortex-array/src/compute/boolean.rs | 150 ++++++++++++++---- vortex-array/src/compute/mod.rs | 20 ++- vortex-array/src/variants.rs | 12 +- vortex-expr/src/binary.rs | 6 +- vortex-serde/src/file/read/mask.rs | 26 ++- vortex-serde/src/file/tests.rs | 9 +- 9 files changed, 262 insertions(+), 67 deletions(-) diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs index c586e3e64..41fb36153 100644 --- a/pyvortex/src/expr.rs +++ b/pyvortex/src/expr.rs @@ -109,11 +109,13 @@ use crate::dtype::PyDType; /// -- is_valid: all not null /// -- child 0 type: int64 /// [ -/// 25 +/// 25, +/// null /// ] /// -- child 1 type: string_view /// [ -/// "Joseph" +/// "Joseph", +/// "Angela" /// ] #[pyclass(name = "Expr", module = "vortex")] pub struct PyExpr { diff --git a/vortex-array/src/array/bool/compute/boolean.rs b/vortex-array/src/array/bool/compute/boolean.rs index baf103599..e5b4979dc 100644 --- a/vortex-array/src/array/bool/compute/boolean.rs +++ b/vortex-array/src/array/bool/compute/boolean.rs @@ -1,5 +1,7 @@ use arrow_arith::boolean; use arrow_array::cast::AsArray as _; +use arrow_array::{Array as _, BooleanArray}; +use arrow_schema::ArrowError; use vortex_error::VortexResult; use crate::array::BoolArray; @@ -7,30 +9,40 @@ use crate::arrow::FromArrowArray as _; use crate::compute::{AndFn, OrFn}; use crate::{Array, IntoCanonical}; -impl OrFn for BoolArray { - fn or(&self, array: &Array) -> VortexResult { +impl BoolArray { + /// Lift an Arrow binary boolean kernel function to Vortex arrays. + fn lift_arrow(&self, arrow_fun: F, other: &Array) -> VortexResult + where + F: FnOnce(&BooleanArray, &BooleanArray) -> Result, + { let lhs = self.clone().into_canonical()?.into_arrow()?; let lhs = lhs.as_boolean(); - let rhs = array.clone().into_canonical()?.into_arrow()?; + let rhs = other.clone().into_canonical()?.into_arrow()?; let rhs = rhs.as_boolean(); - let array = boolean::or(lhs, rhs)?; + let array = arrow_fun(lhs, rhs)?; - Ok(Array::from_arrow(&array, true)) + Ok(Array::from_arrow(&array, array.is_nullable())) } } -impl AndFn for BoolArray { - fn and(&self, array: &Array) -> VortexResult { - let lhs = self.clone().into_canonical()?.into_arrow()?; - let lhs = lhs.as_boolean(); +impl OrFn for BoolArray { + fn or(&self, array: &Array) -> VortexResult { + self.lift_arrow(boolean::or, array) + } - let rhs = array.clone().into_canonical()?.into_arrow()?; - let rhs = rhs.as_boolean(); + fn or_kleene(&self, array: &Array) -> VortexResult { + self.lift_arrow(boolean::or_kleene, array) + } +} - let array = boolean::and(lhs, rhs)?; +impl AndFn for BoolArray { + fn and(&self, array: &Array) -> VortexResult { + self.lift_arrow(boolean::and, array) + } - Ok(Array::from_arrow(&array, true)) + fn and_kleene(&self, array: &Array) -> VortexResult { + self.lift_arrow(boolean::and_kleene, array) } } diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index 27853632f..71a5d6f6d 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -119,32 +119,66 @@ impl MaybeCompareFn for ConstantArray { } } +fn arrow_and(left: Option, right: Option) -> Option { + left.zip(right).map(|(l, r)| l & r) +} + +fn kleene_and(left: Option, right: Option) -> Option { + match (left, right) { + (Some(false), _) => Some(false), + (_, Some(false)) => Some(false), + (None, _) => None, + (_, None) => None, + (Some(l), Some(r)) => Some(l & r), + } +} + +fn arrow_or(left: Option, right: Option) -> Option { + left.zip(right).map(|(l, r)| l | r) +} + +fn kleene_or(left: Option, right: Option) -> Option { + match (left, right) { + (Some(true), _) => Some(true), + (_, Some(true)) => Some(true), + (None, _) => None, + (_, None) => None, + (Some(l), Some(r)) => Some(l | r), + } +} + impl AndFn for ConstantArray { fn and(&self, array: &Array) -> VortexResult { - constant_array_bool_impl( - self, - array, - |(l, r)| l & r, - |other, this| other.with_dyn(|other| other.and().map(|other| other.and(this))), - ) + constant_array_bool_impl(self, array, arrow_and, |other, this| { + other.with_dyn(|other| other.and().map(|other| other.and(this))) + }) + } + + fn and_kleene(&self, array: &Array) -> VortexResult { + constant_array_bool_impl(self, array, kleene_and, |other, this| { + other.with_dyn(|other| other.and_kleene().map(|other| other.and_kleene(this))) + }) } } impl OrFn for ConstantArray { fn or(&self, array: &Array) -> VortexResult { - constant_array_bool_impl( - self, - array, - |(l, r)| l | r, - |other, this| other.with_dyn(|other| other.or().map(|other| other.or(this))), - ) + constant_array_bool_impl(self, array, arrow_or, |other, this| { + other.with_dyn(|other| other.or().map(|other| other.or(this))) + }) + } + + fn or_kleene(&self, array: &Array) -> VortexResult { + constant_array_bool_impl(self, array, kleene_or, |other, this| { + other.with_dyn(|other| other.or_kleene().map(|other| other.or_kleene(this))) + }) } } fn constant_array_bool_impl( constant_array: &ConstantArray, other: &Array, - bool_op: impl Fn((bool, bool)) -> bool, + bool_op: impl Fn(Option, Option) -> Option, fallback_fn: impl Fn(&Array, &Array) -> Option>, ) -> VortexResult { // If the right side is constant @@ -152,7 +186,7 @@ fn constant_array_bool_impl( let lhs = constant_array.scalar_value().as_bool()?; let rhs = scalar_at(other, 0)?.value().as_bool()?; - let scalar = match lhs.zip(rhs).map(bool_op) { + let scalar = match bool_op(lhs, rhs) { Some(b) => Scalar::bool(b, Nullability::Nullable), None => Scalar::null(constant_array.dtype().as_nullable()), }; diff --git a/vortex-array/src/compute/boolean.rs b/vortex-array/src/compute/boolean.rs index 8bf23511e..b25a20bc8 100644 --- a/vortex-array/src/compute/boolean.rs +++ b/vortex-array/src/compute/boolean.rs @@ -1,16 +1,101 @@ use vortex_error::{vortex_bail, VortexResult}; +use crate::array::BoolArray; use crate::{Array, ArrayDType, IntoArrayVariant}; - pub trait AndFn { + /// Point-wise logical _and_ between two Boolean arrays. + /// + /// This method uses Arrow-style null propagation rather than the Kleene logic semantics. + /// + /// # Examples + /// + /// ``` + /// use vortex_array::Array; + /// use vortex_array::compute::and; + /// use vortex_array::IntoCanonical; + /// use vortex_array::accessor::ArrayAccessor; + /// let a = Array::from(vec![Some(true), Some(true), Some(true), None, None, None, Some(false), Some(false), Some(false)]); + /// let b = Array::from(vec![Some(true), None, Some(false), Some(true), None, Some(false), Some(true), None, Some(false)]); + /// let result = and(a, b)?.into_canonical()?.into_bool()?; + /// let result_vec = result.with_iterator(|it| it.map(|x| x.cloned()).collect::>())?; + /// assert_eq!(result_vec, vec![Some(true), None, Some(false), None, None, None, Some(false), None, Some(false)]); + /// # use vortex_error::VortexError; + /// # Ok::<(), VortexError>(()) + /// ``` fn and(&self, array: &Array) -> VortexResult; + + /// Point-wise Kleene logical _and_ between two Boolean arrays. + /// + /// # Examples + /// + /// ``` + /// use vortex_array::Array; + /// use vortex_array::compute::and_kleene; + /// use vortex_array::IntoCanonical; + /// use vortex_array::accessor::ArrayAccessor; + /// let a = Array::from(vec![Some(true), Some(true), Some(true), None, None, None, Some(false), Some(false), Some(false)]); + /// let b = Array::from(vec![Some(true), None, Some(false), Some(true), None, Some(false), Some(true), None, Some(false)]); + /// let result = and_kleene(a, b)?.into_canonical()?.into_bool()?; + /// let result_vec = result.with_iterator(|it| it.map(|x| x.cloned()).collect::>())?; + /// assert_eq!(result_vec, vec![Some(true), None, Some(false), None, None, Some(false), Some(false), Some(false), Some(false)]); + /// # use vortex_error::VortexError; + /// # Ok::<(), VortexError>(()) + /// ``` + fn and_kleene(&self, array: &Array) -> VortexResult; } pub trait OrFn { + /// Point-wise logical _or_ between two Boolean arrays. + /// + /// This method uses Arrow-style null propagation rather than the Kleene logic semantics. + /// + /// # Examples + /// + /// ``` + /// use vortex_array::Array; + /// use vortex_array::compute::or; + /// use vortex_array::IntoCanonical; + /// use vortex_array::accessor::ArrayAccessor; + /// let a = Array::from(vec![Some(true), Some(true), Some(true), None, None, None, Some(false), Some(false), Some(false)]); + /// let b = Array::from(vec![Some(true), None, Some(false), Some(true), None, Some(false), Some(true), None, Some(false)]); + /// let result = or(a, b)?.into_canonical()?.into_bool()?; + /// let result_vec = result.with_iterator(|it| it.map(|x| x.cloned()).collect::>())?; + /// assert_eq!(result_vec, vec![Some(true), None, Some(true), None, None, None, Some(true), None, Some(false)]); + /// # use vortex_error::VortexError; + /// # Ok::<(), VortexError>(()) + /// ``` fn or(&self, array: &Array) -> VortexResult; + + /// Point-wise Kleene logical _or_ between two Boolean arrays. + /// + /// # Examples + /// + /// ``` + /// use vortex_array::Array; + /// use vortex_array::compute::or_kleene; + /// use vortex_array::IntoCanonical; + /// use vortex_array::accessor::ArrayAccessor; + /// let a = Array::from(vec![Some(true), Some(true), Some(true), None, None, None, Some(false), Some(false), Some(false)]); + /// let b = Array::from(vec![Some(true), None, Some(false), Some(true), None, Some(false), Some(true), None, Some(false)]); + /// let result = or_kleene(a, b)?.into_canonical()?.into_bool()?; + /// let result_vec = result.with_iterator(|it| it.map(|x| x.cloned()).collect::>())?; + /// assert_eq!(result_vec, vec![Some(true), Some(true), Some(true), Some(true), None, None, Some(true), None, Some(false)]); + /// # use vortex_error::VortexError; + /// # Ok::<(), VortexError>(()) + /// ``` + fn or_kleene(&self, array: &Array) -> VortexResult; } -pub fn and(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult { +fn lift_boolean_operator( + lhs: impl AsRef, + rhs: impl AsRef, + trait_fun: F, + bool_array_fun: G, +) -> VortexResult +where + F: Fn(&Array, &Array) -> Option>, + G: FnOnce(BoolArray, &Array) -> VortexResult, +{ let lhs = lhs.as_ref(); let rhs = rhs.as_ref(); @@ -22,44 +107,55 @@ pub fn and(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult, rhs: impl AsRef) -> VortexResult { - let lhs = lhs.as_ref(); - let rhs = rhs.as_ref(); - - if lhs.len() != rhs.len() { - vortex_bail!("Boolean operations aren't supported on arrays of different lengths") - } - - if !lhs.dtype().is_boolean() || !rhs.dtype().is_boolean() { - vortex_bail!("Boolean operations are only supported on boolean arrays") - } - - if let Some(selection) = lhs.with_dyn(|lhs| lhs.or().map(|lhs| lhs.or(rhs))) { - return selection; - } +pub fn and(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult { + lift_boolean_operator( + lhs, + rhs, + |lhs, rhs| lhs.with_dyn(|lhs| lhs.and().map(|lhs| lhs.and(rhs))), + |lhs, rhs| lhs.and(rhs), + ) +} - if let Some(selection) = rhs.with_dyn(|rhs| rhs.or().map(|rhs| rhs.or(lhs))) { - return selection; - } +pub fn and_kleene(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult { + lift_boolean_operator( + lhs, + rhs, + |lhs, rhs| lhs.with_dyn(|lhs| lhs.and_kleene().map(|lhs| lhs.and_kleene(rhs))), + |lhs, rhs| lhs.and_kleene(rhs), + ) +} - // If neither side implements `OrFn`, we try to expand the left-hand side into a `BoolArray`, which we know does implement it, and call into that implementation. - let lhs = lhs.clone().into_bool()?; +pub fn or(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult { + lift_boolean_operator( + lhs, + rhs, + |lhs, rhs| lhs.with_dyn(|lhs| lhs.or().map(|lhs| lhs.or(rhs))), + |lhs, rhs| lhs.or(rhs), + ) +} - lhs.or(rhs) +pub fn or_kleene(lhs: impl AsRef, rhs: impl AsRef) -> VortexResult { + lift_boolean_operator( + lhs, + rhs, + |lhs, rhs| lhs.with_dyn(|lhs| lhs.or_kleene().map(|lhs| lhs.or_kleene(rhs))), + |lhs, rhs| lhs.or_kleene(rhs), + ) } #[cfg(test)] diff --git a/vortex-array/src/compute/mod.rs b/vortex-array/src/compute/mod.rs index aa9b8043a..922b75a8d 100644 --- a/vortex-array/src/compute/mod.rs +++ b/vortex-array/src/compute/mod.rs @@ -7,7 +7,7 @@ //! implementations of these operators, else we will decode, and perform the equivalent operator //! from Arrow. -pub use boolean::{and, or, AndFn, OrFn}; +pub use boolean::{and, and_kleene, or, or_kleene, AndFn, OrFn}; pub use compare::{compare, scalar_cmp, CompareFn, MaybeCompareFn, Operator}; pub use filter::{filter, FilterFn}; pub use search_sorted::*; @@ -93,17 +93,31 @@ pub trait ArrayCompute { None } - /// Perform a boolean AND operation over two arrays + /// Perform an Arrow-style boolean AND operation over two arrays /// /// See: [AndFn]. fn and(&self) -> Option<&dyn AndFn> { None } - /// Perform a boolean OR operation over two arrays + /// Perform a Kleene-style boolean AND operation over two arrays + /// + /// See: [AndFn]. + fn and_kleene(&self) -> Option<&dyn AndFn> { + None + } + + /// Perform an Arrow-style boolean OR operation over two arrays /// /// See: [OrFn]. fn or(&self) -> Option<&dyn OrFn> { None } + + /// Perform a Kleene-style boolean OR operation over two arrays + /// + /// See: [OrFn]. + fn or_kleene(&self) -> Option<&dyn OrFn> { + None + } } diff --git a/vortex-array/src/variants.rs b/vortex-array/src/variants.rs index 82e8ff91b..89ae1d80a 100644 --- a/vortex-array/src/variants.rs +++ b/vortex-array/src/variants.rs @@ -98,10 +98,20 @@ pub trait BoolArrayTrait: ArrayTrait { /// An iterator over the sorted indices of set values in the underlying boolean array /// good to array with low number of set values. + /// + /// # Warning + /// + /// The set-ness of invalid positions is undefined and not necessarily consistent within a given + /// iterator. fn maybe_null_indices_iter<'a>(&'a self) -> Box + 'a>; - /// An iterator over the sorted disjoint contiguous range set values in the underlying boolean + /// An iterator over the sorted disjoint contiguous range of set values in the underlying boolean /// array good for arrays with only long runs of set values. + /// + /// # Warning + /// + /// The set-ness of invalid positions is undefined and not necessarily consistent within a given + /// iterator. fn maybe_null_slices_iter<'a>(&'a self) -> Box + 'a>; // Other possible iterators include: diff --git a/vortex-expr/src/binary.rs b/vortex-expr/src/binary.rs index 471a5de18..cd0f21798 100644 --- a/vortex-expr/src/binary.rs +++ b/vortex-expr/src/binary.rs @@ -2,7 +2,7 @@ use std::any::Any; use std::sync::Arc; use vortex_array::aliases::hash_set::HashSet; -use vortex_array::compute::{and, compare, or, Operator as ArrayOperator}; +use vortex_array::compute::{and_kleene, compare, or_kleene, Operator as ArrayOperator}; use vortex_array::Array; use vortex_dtype::field::Field; use vortex_error::VortexResult; @@ -50,8 +50,8 @@ impl VortexExpr for BinaryExpr { Operator::Lte => compare(lhs, rhs, ArrayOperator::Lte), Operator::Gt => compare(lhs, rhs, ArrayOperator::Gt), Operator::Gte => compare(lhs, rhs, ArrayOperator::Gte), - Operator::And => and(lhs, rhs), - Operator::Or => or(lhs, rhs), + Operator::And => and_kleene(lhs, rhs), + Operator::Or => or_kleene(lhs, rhs), } } diff --git a/vortex-serde/src/file/read/mask.rs b/vortex-serde/src/file/read/mask.rs index 88fc4c418..8bfae07e9 100644 --- a/vortex-serde/src/file/read/mask.rs +++ b/vortex-serde/src/file/read/mask.rs @@ -5,7 +5,7 @@ use arrow_buffer::{BooleanBuffer, MutableBuffer}; use croaring::Bitmap; use vortex_array::array::{BoolArray, PrimitiveArray}; use vortex_array::compute::{filter, slice, take}; -use vortex_array::validity::Validity; +use vortex_array::validity::{LogicalValidity, Validity}; use vortex_array::{iterate_integer_array, Array, IntoArray}; use vortex_dtype::PType; use vortex_error::{vortex_bail, vortex_err, VortexResult}; @@ -43,6 +43,11 @@ impl RowMask { unsafe { RowMask::new_unchecked(Bitmap::from_range(0..(end - begin) as u32), begin, end) } } + /// Construct a RowMask which is invalid everywhere in the given range. + pub fn new_invalid_between(begin: usize, end: usize) -> Self { + unsafe { RowMask::new_unchecked(Bitmap::new(), begin, end) } + } + /// Construct a RowMask from given bitmap and begin. /// /// # Safety @@ -56,6 +61,25 @@ impl RowMask { /// /// True-valued positions are kept by the returned mask. pub fn from_mask_array(array: &Array, begin: usize, end: usize) -> VortexResult { + match array.with_dyn(|a| a.logical_validity()) { + LogicalValidity::AllValid(_) => { + Self::from_mask_array_ignoring_validity(array, begin, end) + } + LogicalValidity::AllInvalid(_) => Ok(Self::new_invalid_between(begin, end)), + LogicalValidity::Array(validity) => { + let mut bits = Self::from_mask_array_ignoring_validity(array, begin, end)?; + let validity = Self::from_mask_array_ignoring_validity(&validity, begin, end)?; + bits.and_inplace(&validity)?; + Ok(bits) + } + } + } + + fn from_mask_array_ignoring_validity( + array: &Array, + begin: usize, + end: usize, + ) -> VortexResult { array.with_dyn(|a| { a.as_bool_array() .ok_or_else(|| vortex_err!("Must be a bool array")) diff --git a/vortex-serde/src/file/tests.rs b/vortex-serde/src/file/tests.rs index c625ed161..bef69a3e6 100644 --- a/vortex-serde/src/file/tests.rs +++ b/vortex-serde/src/file/tests.rs @@ -465,14 +465,17 @@ async fn filter_or() { .map(|s| unsafe { String::from_utf8_unchecked(s.to_vec()) }) .collect::>()) .unwrap(), - vec!["Joseph".to_string()] + vec!["Joseph".to_string(), "Angela".to_string()] ); let ages = result[0] .with_dyn(|a| a.as_struct_array_unchecked().field(1)) .unwrap(); assert_eq!( - ages.into_primitive().unwrap().maybe_null_slice::(), - vec![25] + ages.into_primitive() + .unwrap() + .with_iterator(|iter| iter.map(|x| x.cloned()).collect::>()) + .unwrap(), + vec![Some(25), None] ); } From 9757fb0cfbdaf6d26951f30aab9a0044da8ba527 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 13 Nov 2024 11:38:30 -0500 Subject: [PATCH 2/3] update row filtering description --- pyvortex/src/expr.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs index 41fb36153..4579fdece 100644 --- a/pyvortex/src/expr.rs +++ b/pyvortex/src/expr.rs @@ -99,8 +99,8 @@ use crate::dtype::PyDType; /// ] /// /// Read rows whose name is `Angela` or whose age is between 20 and 30, inclusive. Notice that the -/// Angela row is excluded because its age is null. The entire row filtering expression therefore -/// evaluates to null which is interpreted as false: +/// Angela row is included even though its age is null. Under SQL / Kleene semantics, `true or +/// null` is `true`. /// /// >>> name = vortex.expr.column("name") /// >>> e = vortex.io.read_path("a.vortex", row_filter = (name == "Angela") | ((age >= 20) & (age <= 30))) From ac20399e438feaeba64946a06e8daecd46ca266d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 13 Nov 2024 12:34:20 -0500 Subject: [PATCH 3/3] s/arrow_// --- vortex-array/src/array/constant/compute.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vortex-array/src/array/constant/compute.rs b/vortex-array/src/array/constant/compute.rs index 71a5d6f6d..ffaf497e2 100644 --- a/vortex-array/src/array/constant/compute.rs +++ b/vortex-array/src/array/constant/compute.rs @@ -119,7 +119,7 @@ impl MaybeCompareFn for ConstantArray { } } -fn arrow_and(left: Option, right: Option) -> Option { +fn and(left: Option, right: Option) -> Option { left.zip(right).map(|(l, r)| l & r) } @@ -133,7 +133,7 @@ fn kleene_and(left: Option, right: Option) -> Option { } } -fn arrow_or(left: Option, right: Option) -> Option { +fn or(left: Option, right: Option) -> Option { left.zip(right).map(|(l, r)| l | r) } @@ -149,7 +149,7 @@ fn kleene_or(left: Option, right: Option) -> Option { impl AndFn for ConstantArray { fn and(&self, array: &Array) -> VortexResult { - constant_array_bool_impl(self, array, arrow_and, |other, this| { + constant_array_bool_impl(self, array, and, |other, this| { other.with_dyn(|other| other.and().map(|other| other.and(this))) }) } @@ -163,7 +163,7 @@ impl AndFn for ConstantArray { impl OrFn for ConstantArray { fn or(&self, array: &Array) -> VortexResult { - constant_array_bool_impl(self, array, arrow_or, |other, this| { + constant_array_bool_impl(self, array, or, |other, this| { other.with_dyn(|other| other.or().map(|other| other.or(this))) }) }