From 7ac17b06634cd5381b1e2ef290e79a0710bfaefa Mon Sep 17 00:00:00 2001 From: Folyd Date: Thu, 19 Oct 2023 22:47:53 +0800 Subject: [PATCH 1/8] Add `Field::remove()`, `Schema::remove_field()`, and `RecordBatch::remove_column()` APIs --- arrow-array/src/record_batch.rs | 41 +++++++++++++++++++++++++++++++++ arrow-schema/src/fields.rs | 23 ++++++++++++++++++ arrow-schema/src/schema.rs | 21 +++++++++++++++++ 3 files changed, 85 insertions(+) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index 1f3e1df847a8..bb8e92085eef 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -20,6 +20,7 @@ use crate::{new_empty_array, Array, ArrayRef, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaBuilder, SchemaRef}; +use std::mem; use std::ops::Index; use std::sync::Arc; @@ -327,6 +328,46 @@ impl RecordBatch { &self.columns[..] } + /// Remove column by index and return it. + /// + /// Return `Some(ArrayRef)` if the column is removed, otherwise return `None. + /// - Return `None` if the `index` is out of bounds + /// - Return `None` if the `index` is in bounds but the schema is shared (i.e. ref count > 1) + /// + /// ``` + /// use std::sync::Arc; + /// use arrow_array::{BooleanArray, Int32Array, RecordBatch}; + /// use arrow_schema::{DataType, Field, Schema}; + /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); + /// let bool_array = BooleanArray::from(vec![true, false, false, true, true]); + /// let schema = Schema::new(vec![ + /// Field::new("id", DataType::Int32, false), + /// Field::new("bool", DataType::Boolean, false), + /// ]); + /// + /// let mut batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array), Arc::new(bool_array)]).unwrap(); + /// + /// let removed_column = batch.remove_column(0).unwrap(); + /// assert_eq!(removed_column.as_any().downcast_ref::().unwrap(), &Int32Array::from(vec![1, 2, 3, 4, 5])); + /// assert_eq!(batch.num_columns(), 1); + /// ``` + pub fn remove_column(&mut self, index: usize) -> Option { + if index < self.num_columns() { + let new_schema = mem::replace(&mut self.schema, Arc::new(Schema::empty())); + if Arc::strong_count(&new_schema) == 1 { + let mut schema = Arc::::into_inner(new_schema).unwrap(); + schema.fields.remove(index); + self.schema = Arc::new(schema); + return Some(self.columns.remove(index)); + } else { + self.schema = new_schema; + return None; + } + } + + None + } + /// Return a new RecordBatch where each column is sliced /// according to `offset` and `length` /// diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 368ecabbf3ef..8aec0e50d562 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -98,6 +98,29 @@ impl Fields { .zip(other.iter()) .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b)) } + + /// Remove a field by index and reture it. + /// ``` + /// use arrow_schema::{DataType, Field, Fields}; + /// let mut fields = Fields::from(vec![ + /// Field::new("a", DataType::Boolean, false), + /// Field::new("b", DataType::Int8, false), + /// Field::new("c", DataType::Utf8, false), + /// ]); + /// assert_eq!(fields.len(), 3); + /// assert_eq!(fields.remove(1).unwrap(), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(fields.len(), 2); + /// ``` + pub fn remove(&mut self, index: usize) -> Option { + if index >= self.len() { + return None; + } + + let mut new_fields = self.0.iter().cloned().collect::>(); + let field = new_fields.remove(index); + self.0 = Arc::from(new_fields); + Some(field) + } } impl Default for Fields { diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 43bbffd06523..6665ba0fa99e 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -332,6 +332,27 @@ impl Schema { .iter() .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } + + /// Remove field by name and return it. + /// + /// ``` + /// use arrow_schema::{DataType, Field, Schema}; + /// let mut schema = Schema::new(vec![ + /// Field::new("a", DataType::Boolean, false), + /// Field::new("b", DataType::Int8, false), + /// Field::new("c", DataType::Utf8, false), + /// ]); + /// assert_eq!(schema.fields.len(), 3); + /// assert_eq!(schema.remove_field("b").unwrap(), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(schema.fields.len(), 2); + /// ``` + pub fn remove_field(&mut self, name: &str) -> Option { + if let Some((idx, _)) = self.fields.find(name) { + self.fields.remove(idx) + } else { + None + } + } } impl fmt::Display for Schema { From a079299dea2424f126622ecde73c7645520a6ad6 Mon Sep 17 00:00:00 2001 From: Folyd Date: Fri, 20 Oct 2023 22:05:02 +0800 Subject: [PATCH 2/8] Update arrow-schema/src/fields.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/fields.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 8aec0e50d562..9cb56b95dda3 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -99,7 +99,7 @@ impl Fields { .all(|(a, b)| Arc::ptr_eq(a, b) || a.contains(b)) } - /// Remove a field by index and reture it. + /// Remove a field by index and return it. /// ``` /// use arrow_schema::{DataType, Field, Fields}; /// let mut fields = Fields::from(vec![ From d59e48871b1c7cf8298376ed894404e5725a6471 Mon Sep 17 00:00:00 2001 From: Folyd Date: Sat, 21 Oct 2023 10:22:19 +0800 Subject: [PATCH 3/8] Update arrow-schema/src/schema.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 6665ba0fa99e..ea6c7a4e2c89 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -346,7 +346,7 @@ impl Schema { /// assert_eq!(schema.remove_field("b").unwrap(), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(schema.fields.len(), 2); /// ``` - pub fn remove_field(&mut self, name: &str) -> Option { + pub fn remove_field_with_name(&mut self, name: &str) -> Option { if let Some((idx, _)) = self.fields.find(name) { self.fields.remove(idx) } else { From cbb745732a1a23114f6dd096631c3a4b084e078b Mon Sep 17 00:00:00 2001 From: Folyd Date: Sat, 21 Oct 2023 10:39:57 +0800 Subject: [PATCH 4/8] Fix docs testing --- arrow-schema/src/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index ea6c7a4e2c89..ad047e88f1c6 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -343,7 +343,7 @@ impl Schema { /// Field::new("c", DataType::Utf8, false), /// ]); /// assert_eq!(schema.fields.len(), 3); - /// assert_eq!(schema.remove_field("b").unwrap(), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(schema.remove_field_with_name("b").unwrap(), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(schema.fields.len(), 2); /// ``` pub fn remove_field_with_name(&mut self, name: &str) -> Option { From d023cfe08b2c285bdcff4ee1812c162bc282449e Mon Sep 17 00:00:00 2001 From: Folyd Date: Sat, 21 Oct 2023 10:52:17 +0800 Subject: [PATCH 5/8] Use `SchemaBuilder` to build the new `Schema` --- arrow-array/src/record_batch.rs | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index bb8e92085eef..cb799f4b1e7e 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -20,7 +20,6 @@ use crate::{new_empty_array, Array, ArrayRef, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaBuilder, SchemaRef}; -use std::mem; use std::ops::Index; use std::sync::Arc; @@ -330,9 +329,13 @@ impl RecordBatch { /// Remove column by index and return it. /// - /// Return `Some(ArrayRef)` if the column is removed, otherwise return `None. - /// - Return `None` if the `index` is out of bounds - /// - Return `None` if the `index` is in bounds but the schema is shared (i.e. ref count > 1) + /// Return the `ArrayRef` if the column is removed. + /// + /// # Panics + /// + /// Panics if `index` is outside of `0..num_columns`. + /// + /// # Example /// /// ``` /// use std::sync::Arc; @@ -347,25 +350,15 @@ impl RecordBatch { /// /// let mut batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array), Arc::new(bool_array)]).unwrap(); /// - /// let removed_column = batch.remove_column(0).unwrap(); + /// let removed_column = batch.remove_column(0); /// assert_eq!(removed_column.as_any().downcast_ref::().unwrap(), &Int32Array::from(vec![1, 2, 3, 4, 5])); /// assert_eq!(batch.num_columns(), 1); /// ``` - pub fn remove_column(&mut self, index: usize) -> Option { - if index < self.num_columns() { - let new_schema = mem::replace(&mut self.schema, Arc::new(Schema::empty())); - if Arc::strong_count(&new_schema) == 1 { - let mut schema = Arc::::into_inner(new_schema).unwrap(); - schema.fields.remove(index); - self.schema = Arc::new(schema); - return Some(self.columns.remove(index)); - } else { - self.schema = new_schema; - return None; - } - } - - None + pub fn remove_column(&mut self, index: usize) -> ArrayRef { + let mut builder = SchemaBuilder::from(self.schema.fields()); + builder.remove(index); + self.schema = Arc::new(builder.finish()); + self.columns.remove(index) } /// Return a new RecordBatch where each column is sliced From f423ee1c551b3683d15ee14aa28b43fdc2bfc48e Mon Sep 17 00:00:00 2001 From: Folyd Date: Sat, 21 Oct 2023 11:04:14 +0800 Subject: [PATCH 6/8] Recommend `SchemaBuilder` --- arrow-array/src/record_batch.rs | 2 +- arrow-schema/src/schema.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index cb799f4b1e7e..4e859fdfe7ea 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -333,7 +333,7 @@ impl RecordBatch { /// /// # Panics /// - /// Panics if `index` is outside of `0..num_columns`. + /// Panics if `index`` out of bounds. /// /// # Example /// diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index ad047e88f1c6..c06498961b2a 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -333,7 +333,8 @@ impl Schema { .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } - /// Remove field by name and return it. + /// Remove field by name and return it. Recommend to use [`SchemaBuilder`] + /// if you are looking to remove multiple columns, as this will save allocations. /// /// ``` /// use arrow_schema::{DataType, Field, Schema}; From e8feee3f47225d52964157dc86e767e05b83fa30 Mon Sep 17 00:00:00 2001 From: Folyd Date: Thu, 26 Oct 2023 19:38:57 +0800 Subject: [PATCH 7/8] Apply review suggestions --- arrow-schema/src/fields.rs | 24 +++++++++++++----------- arrow-schema/src/schema.rs | 16 +++++++++------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 9cb56b95dda3..70cb1968e9a4 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::{ArrowError, Field, FieldRef}; +use crate::{ArrowError, Field, FieldRef, SchemaBuilder}; use std::ops::Deref; use std::sync::Arc; @@ -100,6 +100,12 @@ impl Fields { } /// Remove a field by index and return it. + /// + /// # Panic + /// + /// Panics if `index` is out of bounds. + /// + /// # Example /// ``` /// use arrow_schema::{DataType, Field, Fields}; /// let mut fields = Fields::from(vec![ @@ -108,18 +114,14 @@ impl Fields { /// Field::new("c", DataType::Utf8, false), /// ]); /// assert_eq!(fields.len(), 3); - /// assert_eq!(fields.remove(1).unwrap(), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(fields.remove(1), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(fields.len(), 2); /// ``` - pub fn remove(&mut self, index: usize) -> Option { - if index >= self.len() { - return None; - } - - let mut new_fields = self.0.iter().cloned().collect::>(); - let field = new_fields.remove(index); - self.0 = Arc::from(new_fields); - Some(field) + pub fn remove(&mut self, index: usize) -> FieldRef { + let mut builder = SchemaBuilder::from(Fields::from(&*self.0)); + let field = builder.remove(index); + *self = builder.finish().fields; + field } } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index c06498961b2a..e30256524868 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -336,6 +336,12 @@ impl Schema { /// Remove field by name and return it. Recommend to use [`SchemaBuilder`] /// if you are looking to remove multiple columns, as this will save allocations. /// + /// # Panic + /// + /// Panics if `index` is out of bounds. + /// + /// # Example + /// /// ``` /// use arrow_schema::{DataType, Field, Schema}; /// let mut schema = Schema::new(vec![ @@ -344,15 +350,11 @@ impl Schema { /// Field::new("c", DataType::Utf8, false), /// ]); /// assert_eq!(schema.fields.len(), 3); - /// assert_eq!(schema.remove_field_with_name("b").unwrap(), Field::new("b", DataType::Int8, false).into()); + /// assert_eq!(schema.remove(1), Field::new("b", DataType::Int8, false).into()); /// assert_eq!(schema.fields.len(), 2); /// ``` - pub fn remove_field_with_name(&mut self, name: &str) -> Option { - if let Some((idx, _)) = self.fields.find(name) { - self.fields.remove(idx) - } else { - None - } + pub fn remove(&mut self, index: usize) -> FieldRef { + self.fields.remove(index) } } From be0fd6b222f11356ee026c8afd70a0cd9df80c17 Mon Sep 17 00:00:00 2001 From: Folyd Date: Thu, 26 Oct 2023 22:19:46 +0800 Subject: [PATCH 8/8] Update arrow-schema/src/schema.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index e30256524868..b39a5a159812 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -333,7 +333,7 @@ impl Schema { .all(|(k, v1)| self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()) } - /// Remove field by name and return it. Recommend to use [`SchemaBuilder`] + /// Remove field by index and return it. Recommend to use [`SchemaBuilder`] /// if you are looking to remove multiple columns, as this will save allocations. /// /// # Panic