fix: Fix unpivot on empty df (#18179)

pola-rs · Aug 14, 2024 · 8111493 · 8111493
1 parent ac30a25
commit 8111493
Show file tree

Hide file tree

Showing 20 changed files with 315 additions and 240 deletions.
diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs
@@ -1,12 +1,10 @@
-use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
 use arrow::offset::OffsetsBuffer;
 use rayon::prelude::*;
 use smartstring::alias::String as SmartString;
 
 use crate::chunked_array::ops::explode::offsets_to_indexes;
 use crate::prelude::*;
 use crate::series::IsSorted;
-use crate::utils::try_get_supertype;
 use crate::POOL;
 
 fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
@@ -185,175 +183,6 @@ impl DataFrame {
         let columns = self.select_series(columns)?;
         self.explode_impl(columns)
     }
-
-    ///
-    /// Unpivot a `DataFrame` from wide to long format.
-    ///
-    /// # Example
-    ///
-    /// # Arguments
-    ///
-    /// * `on` - String slice that represent the columns to use as value variables.
-    /// * `index` - String slice that represent the columns to use as id variables.
-    ///
-    /// If `on` is empty all columns that are not in `index` will be used.
-    ///
-    /// ```ignore
-    /// # use polars_core::prelude::*;
-    /// let df = df!("A" => &["a", "b", "a"],
-    ///              "B" => &[1, 3, 5],
-    ///              "C" => &[10, 11, 12],
-    ///              "D" => &[2, 4, 6]
-    ///     )?;
-    ///
-    /// let unpivoted = df.unpivot(&["A", "B"], &["C", "D"])?;
-    /// println!("{:?}", df);
-    /// println!("{:?}", unpivoted);
-    /// # Ok::<(), PolarsError>(())
-    /// ```
-    /// Outputs:
-    /// ```text
-    ///  +-----+-----+-----+-----+
-    ///  | A   | B   | C   | D   |
-    ///  | --- | --- | --- | --- |
-    ///  | str | i32 | i32 | i32 |
-    ///  +=====+=====+=====+=====+
-    ///  | "a" | 1   | 10  | 2   |
-    ///  +-----+-----+-----+-----+
-    ///  | "b" | 3   | 11  | 4   |
-    ///  +-----+-----+-----+-----+
-    ///  | "a" | 5   | 12  | 6   |
-    ///  +-----+-----+-----+-----+
-    ///
-    ///  +-----+-----+----------+-------+
-    ///  | A   | B   | variable | value |
-    ///  | --- | --- | ---      | ---   |
-    ///  | str | i32 | str      | i32   |
-    ///  +=====+=====+==========+=======+
-    ///  | "a" | 1   | "C"      | 10    |
-    ///  +-----+-----+----------+-------+
-    ///  | "b" | 3   | "C"      | 11    |
-    ///  +-----+-----+----------+-------+
-    ///  | "a" | 5   | "C"      | 12    |
-    ///  +-----+-----+----------+-------+
-    ///  | "a" | 1   | "D"      | 2     |
-    ///  +-----+-----+----------+-------+
-    ///  | "b" | 3   | "D"      | 4     |
-    ///  +-----+-----+----------+-------+
-    ///  | "a" | 5   | "D"      | 6     |
-    ///  +-----+-----+----------+-------+
-    /// ```
-    pub fn unpivot<I, J>(&self, on: I, index: J) -> PolarsResult<Self>
-    where
-        I: IntoVec<SmartString>,
-        J: IntoVec<SmartString>,
-    {
-        let index = index.into_vec();
-        let on = on.into_vec();
-        self.unpivot2(UnpivotArgsIR {
-            on,
-            index,
-            ..Default::default()
-        })
-    }
-
-    /// Similar to unpivot, but without generics. This may be easier if you want to pass
-    /// an empty `index` or empty `on`.
-    pub fn unpivot2(&self, args: UnpivotArgsIR) -> PolarsResult<Self> {
-        let index = args.index;
-        let mut on = args.on;
-
-        let variable_name = args.variable_name.as_deref().unwrap_or("variable");
-        let value_name = args.value_name.as_deref().unwrap_or("value");
-
-        let len = self.height();
-
-        // if value vars is empty we take all columns that are not in id_vars.
-        if on.is_empty() {
-            // return empty frame if there are no columns available to use as value vars
-            if index.len() == self.width() {
-                let variable_col = Series::new_empty(variable_name, &DataType::String);
-                let value_col = Series::new_empty(variable_name, &DataType::Null);
-
-                let mut out = self.select(index).unwrap().clear().columns;
-                out.push(variable_col);
-                out.push(value_col);
-
-                return Ok(unsafe { DataFrame::new_no_checks(out) });
-            }
-
-            let index_set = PlHashSet::from_iter(index.iter().map(|s| s.as_str()));
-            on = self
-                .get_columns()
-                .iter()
-                .filter_map(|s| {
-                    if index_set.contains(s.name()) {
-                        None
-                    } else {
-                        Some(s.name().into())
-                    }
-                })
-                .collect();
-        }
-
-        // values will all be placed in single column, so we must find their supertype
-        let schema = self.schema();
-        let mut iter = on
-            .iter()
-            .map(|v| schema.get(v).ok_or_else(|| polars_err!(col_not_found = v)));
-        let mut st = iter.next().unwrap()?.clone();
-        for dt in iter {
-            st = try_get_supertype(&st, dt?)?;
-        }
-
-        // The column name of the variable that is unpivoted
-        let mut variable_col = MutablePlString::with_capacity(len * on.len() + 1);
-        // prepare ids
-        let ids_ = self.select_with_schema_unchecked(index, &schema)?;
-        let mut ids = ids_.clone();
-        if ids.width() > 0 {
-            for _ in 0..on.len() - 1 {
-                ids.vstack_mut_unchecked(&ids_)
-            }
-        }
-        ids.as_single_chunk_par();
-        drop(ids_);
-
-        let mut values = Vec::with_capacity(on.len());
-
-        for value_column_name in &on {
-            variable_col.extend_constant(len, Some(value_column_name.as_str()));
-            // ensure we go via the schema so we are O(1)
-            // self.column() is linear
-            // together with this loop that would make it O^2 over `on`
-            let (pos, _name, _dtype) = schema.try_get_full(value_column_name)?;
-            let col = &self.columns[pos];
-            let value_col = col.cast(&st).map_err(
-                |_| polars_err!(InvalidOperation: "'unpivot' not supported for dtype: {}", col.dtype()),
-            )?;
-            values.extend_from_slice(value_col.chunks())
-        }
-        let values_arr = concatenate_owned_unchecked(&values)?;
-        // SAFETY:
-        // The give dtype is correct
-        let values =
-            unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) };
-
-        let variable_col = variable_col.as_box();
-        // SAFETY:
-        // The given dtype is correct
-        let variables = unsafe {
-            Series::from_chunks_and_dtype_unchecked(
-                variable_name,
-                vec![variable_col],
-                &DataType::String,
-            )
-        };
-
-        ids.hstack_mut(&[variables, values])?;
-
-        Ok(ids)
-    }
 }
 
 #[cfg(test)]
@@ -432,55 +261,4 @@ mod test {
 
         Ok(())
     }
-
-    #[test]
-    #[cfg_attr(miri, ignore)]
-    fn test_unpivot() -> PolarsResult<()> {
-        let df = df!("A" => &["a", "b", "a"],
-         "B" => &[1, 3, 5],
-         "C" => &[10, 11, 12],
-         "D" => &[2, 4, 6]
-        )
-        .unwrap();
-
-        let unpivoted = df.unpivot(["C", "D"], ["A", "B"])?;
-        assert_eq!(
-            Vec::from(unpivoted.column("value")?.i32()?),
-            &[Some(10), Some(11), Some(12), Some(2), Some(4), Some(6)]
-        );
-
-        let args = UnpivotArgsIR {
-            on: vec![],
-            index: vec![],
-            ..Default::default()
-        };
-
-        let unpivoted = df.unpivot2(args).unwrap();
-        let value = unpivoted.column("value")?;
-        // String because of supertype
-        let value = value.str()?;
-        let value = value.into_no_null_iter().collect::<Vec<_>>();
-        assert_eq!(
-            value,
-            &["a", "b", "a", "1", "3", "5", "10", "11", "12", "2", "4", "6"]
-        );
-
-        let args = UnpivotArgsIR {
-            on: vec![],
-            index: vec!["A".into()],
-            ..Default::default()
-        };
-
-        let unpivoted = df.unpivot2(args).unwrap();
-        let value = unpivoted.column("value")?;
-        let value = value.i32()?;
-        let value = value.into_no_null_iter().collect::<Vec<_>>();
-        assert_eq!(value, &[1, 3, 5, 10, 11, 12, 2, 4, 6]);
-        let variable = unpivoted.column("variable")?;
-        let variable = variable.str()?;
-        let variable = variable.into_no_null_iter().collect::<Vec<_>>();
-        assert_eq!(variable, &["B", "B", "B", "C", "C", "C", "D", "D", "D"]);
-        assert!(unpivoted.column("A").is_ok());
-        Ok(())
-    }
 }
diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs
@@ -575,6 +575,11 @@ impl DataFrame {
         &mut self.columns
     }
 
+    /// Take ownership of the underlying columns vec.
+    pub fn take_columns(self) -> Vec<Series> {
+        self.columns
+    }
+
     /// Iterator over the columns as [`Series`].
     ///
     /// # Example
@@ -926,8 +931,13 @@ impl DataFrame {
         Ok(self)
     }
 
-    /// Does not check if schema is correct
-    pub(crate) fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
+    /// Concatenate a [`DataFrame`] to this [`DataFrame`]
+    ///
+    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks`].
+    ///
+    /// # Panics
+    /// Panics if the schema's don't match.
+    pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
         self.columns
             .iter_mut()
             .zip(other.columns.iter())

diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml
@@ -217,7 +217,7 @@ arg_where = ["polars-plan/arg_where"]
 search_sorted = ["polars-plan/search_sorted"]
 merge_sorted = ["polars-plan/merge_sorted"]
 meta = ["polars-plan/meta"]
-pivot = ["polars-core/rows", "polars-ops/pivot"]
+pivot = ["polars-core/rows", "polars-ops/pivot", "polars-plan/pivot"]
 top_k = ["polars-plan/top_k"]
 semi_anti_join = ["polars-plan/semi_anti_join"]
 cse = ["polars-plan/cse", "polars-mem-engine/cse"]

diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs
@@ -1652,6 +1652,7 @@ impl LazyFrame {
     /// Unpivot the DataFrame from wide to long format.
     ///
     /// See [`UnpivotArgsIR`] for information on how to unpivot a DataFrame.
+    #[cfg(feature = "pivot")]
     pub fn unpivot(self, args: UnpivotArgsDSL) -> LazyFrame {
         let opt_state = self.get_opt_state();
         let lp = self.get_plan_builder().unpivot(args).build();

diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs
@@ -46,6 +46,7 @@ fn test_lazy_alias() {
 }
 
 #[test]
+#[cfg(feature = "pivot")]
 fn test_lazy_unpivot() {
     let df = get_df();
 

diff --git a/crates/polars-ops/src/frame/pivot/mod.rs b/crates/polars-ops/src/frame/pivot/mod.rs
@@ -1,4 +1,5 @@
 mod positioning;
+mod unpivot;
 
 use std::borrow::Cow;
 
@@ -7,6 +8,7 @@ use polars_core::frame::group_by::expr::PhysicalAggExpr;
 use polars_core::prelude::*;
 use polars_core::utils::_split_offsets;
 use polars_core::{downcast_as_macro_arg_physical, POOL};
+pub use unpivot::UnpivotDF;
 
 const HASHMAP_INIT_SIZE: usize = 512;