Skip to content

Commit

Permalink
fix: Fix unpivot on empty df (#18179)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Aug 14, 2024
1 parent ac30a25 commit 8111493
Show file tree
Hide file tree
Showing 20 changed files with 315 additions and 240 deletions.
222 changes: 0 additions & 222 deletions crates/polars-core/src/frame/explode.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
use arrow::offset::OffsetsBuffer;
use rayon::prelude::*;
use smartstring::alias::String as SmartString;

use crate::chunked_array::ops::explode::offsets_to_indexes;
use crate::prelude::*;
use crate::series::IsSorted;
use crate::utils::try_get_supertype;
use crate::POOL;

fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
Expand Down Expand Up @@ -185,175 +183,6 @@ impl DataFrame {
let columns = self.select_series(columns)?;
self.explode_impl(columns)
}

///
/// Unpivot a `DataFrame` from wide to long format.
///
/// # Example
///
/// # Arguments
///
/// * `on` - String slice that represent the columns to use as value variables.
/// * `index` - String slice that represent the columns to use as id variables.
///
/// If `on` is empty all columns that are not in `index` will be used.
///
/// ```ignore
/// # use polars_core::prelude::*;
/// let df = df!("A" => &["a", "b", "a"],
/// "B" => &[1, 3, 5],
/// "C" => &[10, 11, 12],
/// "D" => &[2, 4, 6]
/// )?;
///
/// let unpivoted = df.unpivot(&["A", "B"], &["C", "D"])?;
/// println!("{:?}", df);
/// println!("{:?}", unpivoted);
/// # Ok::<(), PolarsError>(())
/// ```
/// Outputs:
/// ```text
/// +-----+-----+-----+-----+
/// | A | B | C | D |
/// | --- | --- | --- | --- |
/// | str | i32 | i32 | i32 |
/// +=====+=====+=====+=====+
/// | "a" | 1 | 10 | 2 |
/// +-----+-----+-----+-----+
/// | "b" | 3 | 11 | 4 |
/// +-----+-----+-----+-----+
/// | "a" | 5 | 12 | 6 |
/// +-----+-----+-----+-----+
///
/// +-----+-----+----------+-------+
/// | A | B | variable | value |
/// | --- | --- | --- | --- |
/// | str | i32 | str | i32 |
/// +=====+=====+==========+=======+
/// | "a" | 1 | "C" | 10 |
/// +-----+-----+----------+-------+
/// | "b" | 3 | "C" | 11 |
/// +-----+-----+----------+-------+
/// | "a" | 5 | "C" | 12 |
/// +-----+-----+----------+-------+
/// | "a" | 1 | "D" | 2 |
/// +-----+-----+----------+-------+
/// | "b" | 3 | "D" | 4 |
/// +-----+-----+----------+-------+
/// | "a" | 5 | "D" | 6 |
/// +-----+-----+----------+-------+
/// ```
pub fn unpivot<I, J>(&self, on: I, index: J) -> PolarsResult<Self>
where
I: IntoVec<SmartString>,
J: IntoVec<SmartString>,
{
let index = index.into_vec();
let on = on.into_vec();
self.unpivot2(UnpivotArgsIR {
on,
index,
..Default::default()
})
}

/// Similar to unpivot, but without generics. This may be easier if you want to pass
/// an empty `index` or empty `on`.
pub fn unpivot2(&self, args: UnpivotArgsIR) -> PolarsResult<Self> {
let index = args.index;
let mut on = args.on;

let variable_name = args.variable_name.as_deref().unwrap_or("variable");
let value_name = args.value_name.as_deref().unwrap_or("value");

let len = self.height();

// if value vars is empty we take all columns that are not in id_vars.
if on.is_empty() {
// return empty frame if there are no columns available to use as value vars
if index.len() == self.width() {
let variable_col = Series::new_empty(variable_name, &DataType::String);
let value_col = Series::new_empty(variable_name, &DataType::Null);

let mut out = self.select(index).unwrap().clear().columns;
out.push(variable_col);
out.push(value_col);

return Ok(unsafe { DataFrame::new_no_checks(out) });
}

let index_set = PlHashSet::from_iter(index.iter().map(|s| s.as_str()));
on = self
.get_columns()
.iter()
.filter_map(|s| {
if index_set.contains(s.name()) {
None
} else {
Some(s.name().into())
}
})
.collect();
}

// values will all be placed in single column, so we must find their supertype
let schema = self.schema();
let mut iter = on
.iter()
.map(|v| schema.get(v).ok_or_else(|| polars_err!(col_not_found = v)));
let mut st = iter.next().unwrap()?.clone();
for dt in iter {
st = try_get_supertype(&st, dt?)?;
}

// The column name of the variable that is unpivoted
let mut variable_col = MutablePlString::with_capacity(len * on.len() + 1);
// prepare ids
let ids_ = self.select_with_schema_unchecked(index, &schema)?;
let mut ids = ids_.clone();
if ids.width() > 0 {
for _ in 0..on.len() - 1 {
ids.vstack_mut_unchecked(&ids_)
}
}
ids.as_single_chunk_par();
drop(ids_);

let mut values = Vec::with_capacity(on.len());

for value_column_name in &on {
variable_col.extend_constant(len, Some(value_column_name.as_str()));
// ensure we go via the schema so we are O(1)
// self.column() is linear
// together with this loop that would make it O^2 over `on`
let (pos, _name, _dtype) = schema.try_get_full(value_column_name)?;
let col = &self.columns[pos];
let value_col = col.cast(&st).map_err(
|_| polars_err!(InvalidOperation: "'unpivot' not supported for dtype: {}", col.dtype()),
)?;
values.extend_from_slice(value_col.chunks())
}
let values_arr = concatenate_owned_unchecked(&values)?;
// SAFETY:
// The give dtype is correct
let values =
unsafe { Series::from_chunks_and_dtype_unchecked(value_name, vec![values_arr], &st) };

let variable_col = variable_col.as_box();
// SAFETY:
// The given dtype is correct
let variables = unsafe {
Series::from_chunks_and_dtype_unchecked(
variable_name,
vec![variable_col],
&DataType::String,
)
};

ids.hstack_mut(&[variables, values])?;

Ok(ids)
}
}

#[cfg(test)]
Expand Down Expand Up @@ -432,55 +261,4 @@ mod test {

Ok(())
}

#[test]
#[cfg_attr(miri, ignore)]
fn test_unpivot() -> PolarsResult<()> {
let df = df!("A" => &["a", "b", "a"],
"B" => &[1, 3, 5],
"C" => &[10, 11, 12],
"D" => &[2, 4, 6]
)
.unwrap();

let unpivoted = df.unpivot(["C", "D"], ["A", "B"])?;
assert_eq!(
Vec::from(unpivoted.column("value")?.i32()?),
&[Some(10), Some(11), Some(12), Some(2), Some(4), Some(6)]
);

let args = UnpivotArgsIR {
on: vec![],
index: vec![],
..Default::default()
};

let unpivoted = df.unpivot2(args).unwrap();
let value = unpivoted.column("value")?;
// String because of supertype
let value = value.str()?;
let value = value.into_no_null_iter().collect::<Vec<_>>();
assert_eq!(
value,
&["a", "b", "a", "1", "3", "5", "10", "11", "12", "2", "4", "6"]
);

let args = UnpivotArgsIR {
on: vec![],
index: vec!["A".into()],
..Default::default()
};

let unpivoted = df.unpivot2(args).unwrap();
let value = unpivoted.column("value")?;
let value = value.i32()?;
let value = value.into_no_null_iter().collect::<Vec<_>>();
assert_eq!(value, &[1, 3, 5, 10, 11, 12, 2, 4, 6]);
let variable = unpivoted.column("variable")?;
let variable = variable.str()?;
let variable = variable.into_no_null_iter().collect::<Vec<_>>();
assert_eq!(variable, &["B", "B", "B", "C", "C", "C", "D", "D", "D"]);
assert!(unpivoted.column("A").is_ok());
Ok(())
}
}
14 changes: 12 additions & 2 deletions crates/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,11 @@ impl DataFrame {
&mut self.columns
}

/// Take ownership of the underlying columns vec.
pub fn take_columns(self) -> Vec<Series> {
self.columns
}

/// Iterator over the columns as [`Series`].
///
/// # Example
Expand Down Expand Up @@ -926,8 +931,13 @@ impl DataFrame {
Ok(self)
}

/// Does not check if schema is correct
pub(crate) fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
/// Concatenate a [`DataFrame`] to this [`DataFrame`]
///
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks`].
///
/// # Panics
/// Panics if the schema's don't match.
pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
self.columns
.iter_mut()
.zip(other.columns.iter())
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ arg_where = ["polars-plan/arg_where"]
search_sorted = ["polars-plan/search_sorted"]
merge_sorted = ["polars-plan/merge_sorted"]
meta = ["polars-plan/meta"]
pivot = ["polars-core/rows", "polars-ops/pivot"]
pivot = ["polars-core/rows", "polars-ops/pivot", "polars-plan/pivot"]
top_k = ["polars-plan/top_k"]
semi_anti_join = ["polars-plan/semi_anti_join"]
cse = ["polars-plan/cse", "polars-mem-engine/cse"]
Expand Down
1 change: 1 addition & 0 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,7 @@ impl LazyFrame {
/// Unpivot the DataFrame from wide to long format.
///
/// See [`UnpivotArgsIR`] for information on how to unpivot a DataFrame.
#[cfg(feature = "pivot")]
pub fn unpivot(self, args: UnpivotArgsDSL) -> LazyFrame {
let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().unpivot(args).build();
Expand Down
1 change: 1 addition & 0 deletions crates/polars-lazy/src/tests/queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ fn test_lazy_alias() {
}

#[test]
#[cfg(feature = "pivot")]
fn test_lazy_unpivot() {
let df = get_df();

Expand Down
2 changes: 2 additions & 0 deletions crates/polars-ops/src/frame/pivot/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod positioning;
mod unpivot;

use std::borrow::Cow;

Expand All @@ -7,6 +8,7 @@ use polars_core::frame::group_by::expr::PhysicalAggExpr;
use polars_core::prelude::*;
use polars_core::utils::_split_offsets;
use polars_core::{downcast_as_macro_arg_physical, POOL};
pub use unpivot::UnpivotDF;

const HASHMAP_INIT_SIZE: usize = 512;

Expand Down
Loading

0 comments on commit 8111493

Please sign in to comment.