Skip to content

Commit

Permalink
ARROW-11349: [Rust] Add from_iter_values to create arrays from (non n…
Browse files Browse the repository at this point in the history
…ull) values

The idea of this PR is to have a function `from_iter_values` that (just like `from_iter`) creates an array based on an iterator, but from `T` instead of `Option<T>`.

I have seen some places in DataFusion (especially `to_array_of_size`) where an `Array` is generated from a `Vec` of items, which could be replaced by this.
The other iterator-based constructors incur memory and time overhead from creating and manipulating the null buffer (and, when going through a `Vec`, from allocating and dropping the `Vec`).

Closes apache#9293 from Dandandan/array_iter_non_null

Authored-by: Heres, Daniel <danielheres@gmail.com>
Signed-off-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
  • Loading branch information
Dandandan authored and michalursa committed Jun 13, 2021
1 parent a93d5b7 commit 77c23ee
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 0 deletions.
27 changes: 27 additions & 0 deletions rust/arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,21 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
let offset = i + self.offset();
unsafe { *self.raw_values.as_ptr().add(offset) }
}

/// Creates a PrimitiveArray based on an iterator of values without nulls
pub fn from_iter_values<I: IntoIterator<Item = T::Native>>(iter: I) -> Self {
let val_buf: Buffer = iter.into_iter().collect();
let data = ArrayData::new(
T::DATA_TYPE,
val_buf.len() / mem::size_of::<<T as ArrowPrimitiveType>::Native>(),
None,
None,
0,
vec![val_buf],
vec![],
);
PrimitiveArray::from(Arc::new(data))
}
}

impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
Expand Down Expand Up @@ -820,6 +835,18 @@ mod tests {
}
}

#[test]
fn test_primitive_from_iter_values() {
    // `from_iter_values` over 0..10 must yield ten non-null values in order.
    let array = PrimitiveArray::<Int32Type>::from_iter_values(0..10);

    assert_eq!(10, array.len());
    assert_eq!(0, array.null_count());

    for (index, expected) in (0..10i32).enumerate() {
        assert_eq!(expected, array.value(index));
    }
}

#[test]
#[should_panic(expected = "PrimitiveArray data should contain a single buffer only \
(values buffer)")]
Expand Down
40 changes: 40 additions & 0 deletions rust/arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,36 @@ impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
pub(crate) fn from_opt_vec(v: Vec<Option<&str>>) -> Self {
v.into_iter().collect()
}

/// Creates a `GenericStringArray` based on an iterator of values without nulls.
///
/// The iterator's `size_hint` is used only to pre-size the offsets buffer. The
/// length of the resulting array is the number of items the iterator actually
/// yields, so iterators whose upper bound is inexact (for example a `filter`)
/// or absent are handled correctly.
///
/// # Panics
///
/// Panics if the byte length of a value cannot be converted to `OffsetSize`.
pub fn from_iter_values<Ptr, I: IntoIterator<Item = Ptr>>(iter: I) -> Self
where
    Ptr: AsRef<str>,
{
    let iter = iter.into_iter();
    // Capacity hint only: prefer the upper bound, fall back to the lower bound
    // when the iterator does not report one.
    let (lower, upper) = iter.size_hint();
    let capacity_hint = upper.unwrap_or(lower);

    let mut offsets =
        MutableBuffer::new((capacity_hint + 1) * std::mem::size_of::<OffsetSize>());
    let mut values = MutableBuffer::new(0);

    // The offsets buffer always starts with a zero offset.
    let mut length_so_far = OffsetSize::zero();
    offsets.push(length_so_far);

    // Count the items really produced; `size_hint` is not guaranteed to be exact.
    let mut actual_len = 0usize;
    for item in iter {
        let s = item.as_ref();
        length_so_far += OffsetSize::from_usize(s.len())
            .expect("string length does not fit in the offset type");
        offsets.push(length_so_far);
        values.extend_from_slice(s.as_bytes());
        actual_len += 1;
    }

    let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
        .len(actual_len)
        .add_buffer(offsets.into())
        .add_buffer(values.into())
        .build();
    Self::from(array_data)
}
}

impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator<Option<Ptr>>
Expand Down Expand Up @@ -411,6 +441,7 @@ mod tests {
);
}

#[test]
fn test_string_array_from_iter() {
let data = vec![Some("hello"), None, Some("arrow")];
// from Vec<Option<&str>>
Expand All @@ -424,4 +455,13 @@ mod tests {
assert_eq!(array1, array2);
assert_eq!(array2, array3);
}

#[test]
fn test_string_array_from_iter_values() {
    // Each input string must be readable back at its original position.
    let words = vec!["hello", "hello2"];
    let array = StringArray::from_iter_values(words.iter());

    for (index, expected) in words.iter().enumerate() {
        assert_eq!(array.value(index), *expected);
    }
}
}

0 comments on commit 77c23ee

Please sign in to comment.