diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index f31bc1c785b9..dc4cbe6834c4 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -569,6 +569,20 @@ impl StringViewArray { pub fn to_binary_view(self) -> BinaryViewArray { unsafe { BinaryViewArray::new_unchecked(self.views, self.buffers, self.nulls) } } + + /// Returns true if all data within this array is ASCII + pub fn is_ascii(&self) -> bool { + // Alternative (but incorrect): directly check the underlying buffers + // (1) Our string view might be sparse, i.e., a subset of the buffers, + // so even if the buffer is not ascii, we can still be ascii. + // (2) It is quite difficult to know the range of each buffer (unlike StringArray) + // This means that this operation is quite expensive, shall we cache the result? + // i.e. track `is_ascii` in the builder. + self.iter().all(|v| match v { + Some(v) => v.is_ascii(), + None => true, + }) + } } impl From> for StringViewArray { diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 6f6dfe03133d..49831092ffcd 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -20,6 +20,7 @@ use arrow_array::cast::AsArray; use arrow_array::*; use arrow_schema::*; use arrow_select::take::take; +use iterator::ArrayIter; use std::sync::Arc; #[derive(Debug)] @@ -126,24 +127,66 @@ fn like_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) -> Result apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v), + (Utf8, Utf8) => { + apply::<&GenericStringArray>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + } (LargeUtf8, LargeUtf8) => { - apply::(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) + apply::<&GenericStringArray>(op, l.as_string(), l_s, l_v, r.as_string(), r_s, r_v) } + (Utf8View, Utf8View) => apply::<&StringViewArray>( + op, + l.as_string_view(), + l_s, + l_v, + r.as_string_view(), + r_s, + r_v, + ), (l_t, r_t) => Err(ArrowError::InvalidArgumentError(format!( "Invalid string operation: {l_t} {op} {r_t}" ))), } } -fn apply( +/// A trait for Arrow String Arrays, currently three types are supported: +/// - `StringArray` +/// - `LargeStringArray` +/// - `StringViewArray` +/// +/// This trait helps to abstract over the different types of string arrays +/// so that we don't need to duplicate the implementation for each type. +trait StringArrayType<'a>: ArrayAccessor + Sized { + fn is_ascii(&self) -> bool; + fn iter(&self) -> ArrayIter; +} + +impl<'a, O: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray { + fn is_ascii(&self) -> bool { + GenericStringArray::::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + GenericStringArray::::iter(self) + } +} +impl<'a> StringArrayType<'a> for &'a StringViewArray { + fn is_ascii(&self) -> bool { + StringViewArray::is_ascii(self) + } + + fn iter(&self) -> ArrayIter { + StringViewArray::iter(self) + } +} + +fn apply<'a, T: StringArrayType<'a> + 'a>( op: Op, - l: &GenericStringArray, + l: T, l_s: bool, - l_v: Option<&dyn AnyDictionaryArray>, - r: &GenericStringArray, + l_v: Option<&'a dyn AnyDictionaryArray>, + r: T, r_s: bool, - r_v: Option<&dyn AnyDictionaryArray>, + r_v: Option<&'a dyn AnyDictionaryArray>, ) -> Result { let l_len = l_v.map(|l| l.len()).unwrap_or(l.len()); if r_s { @@ -155,7 +198,7 @@ fn apply( if r.is_null(idx) { return Ok(BooleanArray::new_null(l_len)); } - op_scalar(op, l, l_v, r.value(idx)) + op_scalar::(op, l, l_v, r.value(idx)) } else { match (l_s, l_v, r_v) { (true, None, None) => { @@ -187,9 +230,9 @@ fn apply( } #[inline(never)] -fn op_scalar( +fn op_scalar<'a, T: StringArrayType<'a>>( op: Op, - l: &GenericStringArray, + l: T, l_v: Option<&dyn AnyDictionaryArray>, r: &str, ) -> Result { @@ -207,8 +250,8 @@ fn op_scalar( }) } -fn vectored_iter<'a, O: OffsetSizeTrait>( - a: &'a GenericStringArray, +fn vectored_iter<'a, T: StringArrayType<'a> + 'a>( + a: T, a_v: &'a dyn AnyDictionaryArray, ) -> impl Iterator> + 'a { let nulls = a_v.nulls(); @@ -373,24 +416,33 @@ mod tests { use super::*; use arrow_array::types::Int8Type; + /// Applying `op(left, right)`, both sides are arrays + /// The macro tests four types of array implementations: + /// - `StringArray` + /// - `LargeStringArray` + /// - `StringViewArray` + /// - `DictionaryArray` macro_rules! test_utf8 { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] fn $test_name() { let expected = BooleanArray::from($expected); + let left = StringArray::from($left); let right = StringArray::from($right); let res = $op(&left, &right).unwrap(); assert_eq!(res, expected); - } - }; - } - macro_rules! test_dict_utf8 { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let expected = BooleanArray::from($expected); + let left = LargeStringArray::from($left); + let right = LargeStringArray::from($right); + let res = $op(&left, &right).unwrap(); + assert_eq!(res, expected); + + let left = StringViewArray::from($left); + let right = StringViewArray::from($right); + let res = $op(&left, &right).unwrap(); + assert_eq!(res, expected); + let left: DictionaryArray = $left.into_iter().collect(); let right: DictionaryArray = $right.into_iter().collect(); let res = $op(&left, &right).unwrap(); @@ -399,6 +451,12 @@ mod tests { }; } + /// Applying `op(left, right)`, left side is array, right side is scalar + /// The macro tests four types of array implementations: + /// - `StringArray` + /// - `LargeStringArray` + /// - `StringViewArray` + /// - `DictionaryArray` macro_rules! test_utf8_scalar { ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { #[test] @@ -406,351 +464,420 @@ mod tests { let expected = BooleanArray::from($expected); let left = StringArray::from($left); - let res = $op(&left, $right).unwrap(); + let right = StringArray::from_iter_values([$right]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); assert_eq!(res, expected); let left = LargeStringArray::from($left); - let res = $op(&left, $right).unwrap(); + let right = LargeStringArray::from_iter_values([$right]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); + assert_eq!(res, expected); + + let left = StringViewArray::from($left); + let right = StringViewArray::from_iter_values([$right]); + let res = $op(&left, &Scalar::new(&right)).unwrap(); + assert_eq!(res, expected); + + let left: DictionaryArray = $left.into_iter().collect(); + let right: DictionaryArray = [$right].into_iter().collect(); + let res = $op(&left, &Scalar::new(&right)).unwrap(); assert_eq!(res, expected); } }; - ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { - test_utf8_scalar!($test_name, $left, $right, $op, $expected); - test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected); - }; } test_utf8!( test_utf8_array_like, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_utf8, - vec![true, true, true, false, false, true, false, false] - ); - - test_dict_utf8!( - test_utf8_array_like_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow", "arrow"], + vec![ + "arrow", + "arrow_long_string_more than 12 bytes", + "arrow", + "arrow", + "arrow", + "arrows", + "arrow", + "arrow" + ], vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_", ".*"], - like_dyn, + like, vec![true, true, true, false, false, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_testing, - test_utf8_array_like_scalar_dyn_escape_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], + vec![ + "varchar(255)", + "int(255)longer than 12 bytes", + "varchar", + "int" + ], "%(%)%", - like_utf8_scalar, - like_utf8_scalar_dyn, + like, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex, - test_utf8_array_like_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", - like_utf8_scalar, - like_utf8_scalar_dyn, + like, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex_dot, - test_utf8_array_like_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", - like_utf8_scalar, - like_utf8_scalar_dyn, + like, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar, - test_utf8_array_like_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], + vec![ + "arrow", + "parquet", + "datafusion", + "flight", + "long string arrow test 12 bytes" + ], "%ar%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] + like, + vec![true, true, false, false, true] ); test_utf8_scalar!( test_utf8_array_like_scalar_start, - test_utf8_array_like_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, true, false] + like, + vec![true, false, true, false, true] ); // Replicates `test_utf8_array_like_scalar_start` `test_utf8_array_like_scalar_dyn_start` to // demonstrate that `SQL STARTSWITH` works as expected. test_utf8_scalar!( test_utf8_array_starts_with_scalar_start, - test_utf8_array_starts_with_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow", - starts_with_utf8_scalar, - starts_with_utf8_scalar_dyn, - vec![true, false, true, false] + starts_with, + vec![true, false, true, false, true] ); test_utf8_scalar!( test_utf8_array_like_scalar_end, - test_utf8_array_like_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "%arrow", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, true, false, false] + like, + vec![true, true, false, false, false] ); // Replicates `test_utf8_array_like_scalar_end` `test_utf8_array_like_scalar_dyn_end` to // demonstrate that `SQL ENDSWITH` works as expected. test_utf8_scalar!( test_utf8_array_ends_with_scalar_end, - test_utf8_array_ends_with_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow", - ends_with_utf8_scalar, - ends_with_utf8_scalar_dyn, - vec![true, true, false, false] + ends_with, + vec![true, true, false, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_equals, - test_utf8_array_like_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false, false, false] + like, + vec![true, false, false, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_one, - test_utf8_array_like_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], + vec![ + "arrow", + "arrows", + "parrow", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow_", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![false, true, false, false] + like, + vec![false, true, false, false, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape, - test_utf8_scalar_like_dyn_escape, - vec!["a%", "a\\x"], + vec!["a%", "a\\x", "arrow long string longer than 12 bytes"], "a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] + like, + vec![true, false, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape_contains, - test_utf8_scalar_like_dyn_escape_contains, - vec!["ba%", "ba\\x"], + vec!["ba%", "ba\\x", "arrow long string longer than 12 bytes"], "%a\\%", - like_utf8_scalar, - like_utf8_scalar_dyn, - vec![true, false] + like, + vec![true, false, false] ); test_utf8!( test_utf8_scalar_ilike_regex, vec!["%%%"], vec![r"\%_\%"], - ilike_utf8, - vec![true] - ); - - test_dict_utf8!( - test_utf8_scalar_ilike_regex_dict, - vec!["%%%"], - vec![r"\%_\%"], - ilike_dyn, + ilike, vec![true] ); test_utf8!( test_utf8_array_nlike, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nlike_dict, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], + vec![ + "arrow", + "arrow", + "arrow long string longer than 12 bytes", + "arrow", + "arrow", + "arrows", + "arrow" + ], vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_dyn, + nlike, vec![false, false, false, true, true, false, true] ); test_utf8_scalar!( test_utf8_array_nlike_escape_testing, - test_utf8_array_nlike_escape_dyn_testing_dyn, - vec!["varchar(255)", "int(255)", "varchar", "int"], + vec![ + "varchar(255)", + "int(255) arrow long string longer than 12 bytes", + "varchar", + "int" + ], "%(%)%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, + nlike, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex, - test_utf8_array_nlike_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, + nlike, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex_dot, - test_utf8_array_nlike_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, + nlike, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar, - test_utf8_array_nlike_scalar_dyn, - vec!["arrow", "parquet", "datafusion", "flight"], + vec![ + "arrow", + "parquet", + "datafusion", + "flight", + "arrow long string longer than 12 bytes" + ], "%ar%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] + nlike, + vec![false, false, true, true, false] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_start, - test_utf8_array_nlike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow%", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, false, true] + nlike, + vec![false, true, false, true, false] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_end, - test_utf8_array_nlike_scalar_dyn_end, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "%arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, false, true, true] + nlike, + vec![false, false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_equals, - test_utf8_array_nlike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![false, true, true, true] + nlike, + vec![false, true, true, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_one, - test_utf8_array_nlike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], + vec![ + "arrow", + "arrows", + "parrow", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow_", - nlike_utf8_scalar, - nlike_utf8_scalar_dyn, - vec![true, false, true, true] + nlike, + vec![true, false, true, true, true] ); test_utf8!( test_utf8_array_ilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_utf8, - vec![true, true, true, false, false, true, false] - ); - - test_dict_utf8!( - test_utf8_array_ilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec![ + "arrow", + "arrow", + "ARROW long string longer than 12 bytes", + "arrow", + "ARROW", + "ARROWS", + "arROw" + ], vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - ilike_dyn, + ilike, vec![true, true, true, false, false, true, false] ); test_utf8_scalar!( ilike_utf8_scalar_escape_testing, - ilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], + vec![ + "varchar(255)", + "int(255) long string longer than 12 bytes", + "varchar", + "int" + ], "%(%)%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, + ilike, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar, - test_utf8_array_ilike_dyn_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], + vec![ + "arrow", + "parquet", + "datafusion", + "flight", + "arrow long string longer than 12 bytes" + ], "%AR%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] + ilike, + vec![true, true, false, false, true] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_start, - test_utf8_array_ilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], + vec![ + "arrow", + "parrow", + "arrows", + "ARR", + "arrow long string longer than 12 bytes" + ], "aRRow%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, true, false] + ilike, + vec![true, false, true, false, true] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_end, - test_utf8_array_ilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], + vec![ + "ArroW", + "parrow", + "ARRowS", + "arr", + "arrow long string longer than 12 bytes" + ], "%arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, true, false, false] + ilike, + vec![true, true, false, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_equals, - test_utf8_array_ilike_scalar_dyn_equals, - vec!["arrow", "parrow", "arrows", "arr"], + vec![ + "arrow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "Arrow", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![true, false, false, false] + ilike, + vec![true, false, false, false, false] ); // We only implement loose matching test_utf8_scalar!( test_utf8_array_ilike_unicode, - test_utf8_array_ilike_unicode_dyn, - vec!["FFkoß", "FFkoSS", "FFkoss", "FFkoS", "FFkos", "ffkoSS", "ffkoß", "FFKoSS"], + vec![ + "FFkoß", + "FFkoSS", + "FFkoss", + "FFkoS", + "FFkos", + "ffkoSS", + "ffkoß", + "FFKoSS", + "longer than 12 bytes FFKoSS" + ], "FFkoSS", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true] + ilike, + vec![false, true, true, false, false, false, false, true, false] ); test_utf8_scalar!( test_utf8_array_ilike_unicode_starts, - test_utf8_array_ilike_unicode_start_dyn, vec![ "FFkoßsdlkdf", "FFkoSSsdlkdf", @@ -761,16 +888,15 @@ mod tests { "ffkoß", "FfkosSsdfd", "FFKoSS", + "longer than 12 bytes FFKoSS", ], "FFkoSS%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true, true] + ilike, + vec![false, true, true, false, false, false, false, true, true, false] ); test_utf8_scalar!( test_utf8_array_ilike_unicode_ends, - test_utf8_array_ilike_unicode_ends_dyn, vec![ "sdlkdfFFkoß", "sdlkdfFFkoSS", @@ -781,16 +907,15 @@ mod tests { "ffkoß", "h😃klFfkosS", "FFKoSS", + "longer than 12 bytes FFKoSS", ], "%FFkoSS", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true, true] + ilike, + vec![false, true, true, false, false, false, false, true, true, true] ); test_utf8_scalar!( test_utf8_array_ilike_unicode_contains, - test_utf8_array_ilike_unicode_contains_dyn, vec![ "sdlkdfFkoßsdfs", "sdlkdfFkoSSdggs", @@ -802,11 +927,11 @@ mod tests { "😃sadlksffkosSsh😃klF", "😱slgffkosSsh😃klF", "FFKoSS", + "longer than 12 bytes FFKoSS", ], "%FFkoSS%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true, true, true] + ilike, + vec![false, true, true, false, false, false, false, true, true, true, true] ); // Replicates `test_utf8_array_ilike_unicode_contains` and @@ -816,7 +941,6 @@ mod tests { // NOTE: 5 of the values were changed because the original used a case insensitive `ilike`. test_utf8_scalar!( test_utf8_array_contains_unicode_contains, - test_utf8_array_contains_unicode_contains_dyn, vec![ "sdlkdfFkoßsdfs", "sdlkdFFkoSSdggs", // Original was case insensitive "sdlkdfFkoSSdggs" @@ -828,16 +952,15 @@ mod tests { "😃sadlksFFkoSSsh😃klF", // Original was case insensitive "😃sadlksffkosSsh😃klF" "😱slgFFkoSSsh😃klF", // Original was case insensitive "😱slgffkosSsh😃klF" "FFkoSS", // "FFKoSS" + "longer than 12 bytes FFKoSS", ], "FFkoSS", - contains_utf8_scalar, - contains_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true, true, true] + contains, + vec![false, true, true, false, false, false, false, true, true, true, false] ); test_utf8_scalar!( test_utf8_array_ilike_unicode_complex, - test_utf8_array_ilike_unicode_complex_dyn, vec![ "sdlkdfFooßsdfs", "sdlkdfFooSSdggs", @@ -849,97 +972,124 @@ mod tests { "😃sadlksffofsSsh😃klF", "😱slgffoesSsh😃klF", "FFKoSS", + "longer than 12 bytes FFKoSS", ], "%FF__SS%", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, true, false, false, false, false, true, true, true] + ilike, + vec![false, true, true, false, false, false, false, true, true, true, true] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_one, - test_utf8_array_ilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], + vec![ + "arrow", + "arrows", + "parrow", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow_", - ilike_utf8_scalar, - ilike_utf8_scalar_dyn, - vec![false, true, false, false] + ilike, + vec![false, true, false, false, false] ); test_utf8!( test_utf8_array_nilike, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], - vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_utf8, - vec![false, false, false, true, true, false, true] - ); - - test_dict_utf8!( - test_utf8_array_nilike_dict, - vec!["arrow", "arrow", "ARROW", "arrow", "ARROW", "ARROWS", "arROw"], + vec![ + "arrow", + "arrow", + "ARROW longer than 12 bytes string", + "arrow", + "ARROW", + "ARROWS", + "arROw" + ], vec!["arrow", "ar%", "%ro%", "foo", "ar%r", "arrow_", "arrow_"], - nilike_dyn, + nilike, vec![false, false, false, true, true, false, true] ); test_utf8_scalar!( nilike_utf8_scalar_escape_testing, - nilike_utf8_scalar_escape_dyn_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], + vec![ + "varchar(255)", + "int(255) longer than 12 bytes string", + "varchar", + "int" + ], "%(%)%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, + nilike, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar, - test_utf8_array_nilike_dyn_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], + vec![ + "arrow", + "parquet", + "datafusion", + "flight", + "arrow long string longer than 12 bytes" + ], "%AR%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] + nilike, + vec![false, false, true, true, false] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_start, - test_utf8_array_nilike_scalar_dyn_start, - vec!["arrow", "parrow", "arrows", "ARR"], + vec![ + "arrow", + "parrow", + "arrows", + "ARR", + "arrow long string longer than 12 bytes" + ], "aRRow%", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, false, true] + nilike, + vec![false, true, false, true, false] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_end, - test_utf8_array_nilike_scalar_dyn_end, - vec!["ArroW", "parrow", "ARRowS", "arr"], + vec![ + "ArroW", + "parrow", + "ARRowS", + "arr", + "arrow long string longer than 12 bytes" + ], "%arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, false, true, true] + nilike, + vec![false, false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_equals, - test_utf8_array_nilike_scalar_dyn_equals, - vec!["arRow", "parrow", "arrows", "arr"], + vec![ + "arRow", + "parrow", + "arrows", + "arr", + "arrow long string longer than 12 bytes" + ], "Arrow", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![false, true, true, true] + nilike, + vec![false, true, true, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_one, - test_utf8_array_nilike_scalar_dyn_one, - vec!["arrow", "arrows", "parrow", "arr"], + vec![ + "arrow", + "arrows", + "parrow", + "arr", + "arrow long string longer than 12 bytes" + ], "arrow_", - nilike_utf8_scalar, - nilike_utf8_scalar_dyn, - vec![true, false, true, true] + nilike, + vec![true, false, true, true, true] ); #[test] diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 54ecf42e3684..01e3710a6d0a 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{BooleanArray, GenericStringArray, OffsetSizeTrait}; +use arrow_array::{ArrayAccessor, BooleanArray}; use arrow_schema::ArrowError; use memchr::memchr2; use regex::{Regex, RegexBuilder}; @@ -95,11 +95,10 @@ impl<'a> Predicate<'a> { /// /// If `negate` is true the result of the predicate will be negated #[inline(never)] - pub fn evaluate_array( - &self, - array: &GenericStringArray, - negate: bool, - ) -> BooleanArray { + pub fn evaluate_array<'i, T>(&self, array: T, negate: bool) -> BooleanArray + where + T: ArrayAccessor, + { match self { Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| { (haystack.len() == v.len() && haystack == *v) != negate