diff --git a/crates/polars-core/src/chunked_array/builder/binary_offset.rs b/crates/polars-core/src/chunked_array/builder/binary_offset.rs deleted file mode 100644 index c7d084294528c..0000000000000 --- a/crates/polars-core/src/chunked_array/builder/binary_offset.rs +++ /dev/null @@ -1,51 +0,0 @@ -use super::*; - -pub struct BinaryOffsetChunkedBuilder { - pub(crate) chunk_builder: MutableBinaryArray, - pub(crate) field: FieldRef, -} - -impl Clone for BinaryOffsetChunkedBuilder { - fn clone(&self) -> Self { - Self { - chunk_builder: self.chunk_builder.clone(), - field: self.field.clone(), - } - } -} - -impl BinaryOffsetChunkedBuilder { - /// Create a new [`BinaryOffsetChunkedBuilder`] - /// - /// # Arguments - /// - /// * `capacity` - Number of string elements in the final array. - pub fn new(name: PlSmallStr, capacity: usize) -> Self { - Self { - chunk_builder: MutableBinaryArray::with_capacity(capacity), - field: Arc::new(Field::new(name, DataType::BinaryOffset)), - } - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: &[u8]) { - self.chunk_builder.push(Some(v)); - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) { - self.chunk_builder.push_null() - } - - #[inline] - pub fn append_option(&mut self, opt: Option<&[u8]>) { - self.chunk_builder.push(opt); - } - - pub fn finish(mut self) -> BinaryOffsetChunked { - let arr = self.chunk_builder.as_box(); - ChunkedArray::new_with_compute_len(self.field, vec![arr]) - } -} diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index e818254d581b1..539586c2193e9 100644 --- a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -1,4 +1,3 @@ -mod binary_offset; mod boolean; #[cfg(feature = "dtype-array")] pub mod fixed_size_list; @@ -11,7 +10,6 @@ use std::sync::Arc; use arrow::array::*; use arrow::bitmap::Bitmap; -pub use binary_offset::*; pub use boolean::*; #[cfg(feature = "dtype-array")] pub(crate) use fixed_size_list::*; diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 81adc0d7f5d26..43ca425796df2 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -693,6 +693,16 @@ impl<'a> AnyValue<'a> { None => AnyValue::Null, } } + + pub fn idx(&self) -> IdxSize { + match self { + #[cfg(not(feature = "bigidx"))] + Self::UInt32(v) => *v, + #[cfg(feature = "bigidx")] + Self::UInt64(v) => *v, + _ => panic!("expected index type found {self:?}"), + } + } } impl From> for DataType { diff --git a/crates/polars-core/src/frame/column.rs b/crates/polars-core/src/frame/column.rs deleted file mode 100644 index 8f5b87d9c9da2..0000000000000 --- a/crates/polars-core/src/frame/column.rs +++ /dev/null @@ -1,1187 +0,0 @@ -use std::borrow::Cow; -use std::ops::{Add, Div, Mul, Rem, Sub}; -use std::sync::OnceLock; - -use num_traits::{Num, NumCast}; -use polars_error::PolarsResult; -use polars_utils::pl_str::PlSmallStr; - -use crate::chunked_array::metadata::MetadataFlags; -use crate::prelude::*; -use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; -use crate::utils::Container; - -/// A column within a [`DataFrame`]. -/// -/// This is lazily initialized to a [`Series`] with methods like -/// [`as_materialized_series`][Column::as_materialized_series] and -/// [`take_materialized_series`][Column::take_materialized_series]. -/// -/// Currently, there are two ways to represent a [`Column`]. -/// 1. A [`Series`] of values -/// 2. A [`ScalarColumn`] that repeats a single [`Scalar`] -#[derive(Debug, Clone)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -#[cfg_attr(feature = "serde", serde(from = "Series"))] -#[cfg_attr(feature = "serde", serde(into = "_SerdeSeries"))] -pub enum Column { - Series(Series), - Scalar(ScalarColumn), -} - -/// A [`Column`] that consists of a repeated [`Scalar`] -/// -/// This is lazily materialized into a [`Series`]. -#[derive(Debug, Clone)] -pub struct ScalarColumn { - name: PlSmallStr, - value: Scalar, - length: usize, - - // invariants: - // materialized.name() == name - // materialized.len() == length - // materialized.dtype() == value.dtype - // materialized[i] == value, for all 0 <= i < length - /// A lazily materialized [`Series`] variant of this [`ScalarColumn`] - materialized: OnceLock, -} - -/// Convert `Self` into a [`Column`] -pub trait IntoColumn: Sized { - fn into_column(self) -> Column; -} - -impl Column { - #[inline] - pub fn new(name: PlSmallStr, values: T) -> Self - where - Phantom: ?Sized, - Series: NamedFrom, - { - Self::Series(NamedFrom::new(name, values)) - } - - #[inline] - pub fn new_empty(name: PlSmallStr, dtype: &DataType) -> Self { - Self::new_scalar(name, Scalar::new(dtype.clone(), AnyValue::Null), 0) - } - - #[inline] - pub fn new_scalar(name: PlSmallStr, value: Scalar, length: usize) -> Self { - Self::Scalar(ScalarColumn::new(name, value, length)) - } - - // # Materialize - /// Get a reference to a [`Series`] for this [`Column`] - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. - #[inline] - pub fn as_materialized_series(&self) -> &Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => s.as_materialized_series(), - } - } - /// Turn [`Column`] into a [`Column::Series`]. - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. - #[inline] - pub fn into_materialized_series(&mut self) -> &mut Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => { - *self = Column::Series(s.to_series()); - let Column::Series(s) = self else { - unreachable!(); - }; - s - }, - } - } - /// Take [`Series`] from a [`Column`] - /// - /// This may need to materialize the [`Series`] on the first invocation for a specific column. - #[inline] - pub fn take_materialized_series(self) -> Series { - match self { - Column::Series(s) => s, - Column::Scalar(s) => s.take_materialized_series(), - } - } - - #[inline] - pub fn dtype(&self) -> &DataType { - match self { - Column::Series(s) => s.dtype(), - Column::Scalar(s) => s.value.dtype(), - } - } - - #[inline] - pub fn field(&self) -> Cow { - match self { - Column::Series(s) => s.field(), - Column::Scalar(s) => match s.materialized.get() { - None => Cow::Owned(Field::new(s.name.clone(), s.value.dtype().clone())), - Some(s) => s.field(), - }, - } - } - - #[inline] - pub fn name(&self) -> &PlSmallStr { - match self { - Column::Series(s) => s.name(), - Column::Scalar(s) => &s.name, - } - } - - #[inline] - pub fn len(&self) -> usize { - match self { - Column::Series(s) => s.len(), - Column::Scalar(s) => s.length, - } - } - - #[inline] - pub fn with_name(mut self, name: PlSmallStr) -> Column { - self.rename(name); - self - } - - #[inline] - pub fn rename(&mut self, name: PlSmallStr) { - match self { - Column::Series(s) => _ = s.rename(name), - Column::Scalar(s) => { - if let Some(series) = s.materialized.get_mut() { - series.rename(name.clone()); - } - - s.name = name; - }, - } - } - - // # Downcasting - #[inline] - pub fn as_series(&self) -> Option<&Series> { - match self { - Column::Series(s) => Some(s), - Column::Scalar(_) => None, - } - } - #[inline] - pub fn as_scalar_column(&self) -> Option<&ScalarColumn> { - match self { - Column::Series(_) => None, - Column::Scalar(s) => Some(s), - } - } - - // # To Chunked Arrays - pub fn bool(&self) -> PolarsResult<&BooleanChunked> { - // @scalar-opt - self.as_materialized_series().bool() - } - pub fn i8(&self) -> PolarsResult<&Int8Chunked> { - // @scalar-opt - self.as_materialized_series().i8() - } - pub fn i16(&self) -> PolarsResult<&Int16Chunked> { - // @scalar-opt - self.as_materialized_series().i16() - } - pub fn i32(&self) -> PolarsResult<&Int32Chunked> { - // @scalar-opt - self.as_materialized_series().i32() - } - pub fn i64(&self) -> PolarsResult<&Int64Chunked> { - // @scalar-opt - self.as_materialized_series().i64() - } - pub fn u8(&self) -> PolarsResult<&UInt8Chunked> { - // @scalar-opt - self.as_materialized_series().u8() - } - pub fn u16(&self) -> PolarsResult<&UInt16Chunked> { - // @scalar-opt - self.as_materialized_series().u16() - } - pub fn u32(&self) -> PolarsResult<&UInt32Chunked> { - // @scalar-opt - self.as_materialized_series().u32() - } - pub fn u64(&self) -> PolarsResult<&UInt64Chunked> { - // @scalar-opt - self.as_materialized_series().u64() - } - pub fn f32(&self) -> PolarsResult<&Float32Chunked> { - // @scalar-opt - self.as_materialized_series().f32() - } - pub fn f64(&self) -> PolarsResult<&Float64Chunked> { - // @scalar-opt - self.as_materialized_series().f64() - } - pub fn str(&self) -> PolarsResult<&StringChunked> { - // @scalar-opt - self.as_materialized_series().str() - } - pub fn list(&self) -> PolarsResult<&ListChunked> { - // @scalar-opt - self.as_materialized_series().list() - } - pub fn binary(&self) -> PolarsResult<&BinaryChunked> { - // @scalar-opt - self.as_materialized_series().binary() - } - pub fn idx(&self) -> PolarsResult<&IdxCa> { - // @scalar-opt - self.as_materialized_series().idx() - } - #[cfg(feature = "dtype-datetime")] - pub fn datetime(&self) -> PolarsResult<&DatetimeChunked> { - // @scalar-opt - self.as_materialized_series().datetime() - } - #[cfg(feature = "dtype-struct")] - pub fn struct_(&self) -> PolarsResult<&StructChunked> { - // @scalar-opt - self.as_materialized_series().struct_() - } - #[cfg(feature = "dtype-decimal")] - pub fn decimal(&self) -> PolarsResult<&DecimalChunked> { - // @scalar-opt - self.as_materialized_series().decimal() - } - #[cfg(feature = "dtype-array")] - pub fn array(&self) -> PolarsResult<&ArrayChunked> { - // @scalar-opt - self.as_materialized_series().array() - } - #[cfg(feature = "dtype-categorical")] - pub fn categorical(&self) -> PolarsResult<&CategoricalChunked> { - self.as_materialized_series().categorical() - } - - // # Casting - pub fn strict_cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .strict_cast(dtype) - .map(Column::from) - } - pub fn cast(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().cast(dtype).map(Column::from) - } - /// # Safety - /// - /// This can lead to invalid memory access in downstream code. - pub unsafe fn cast_unchecked(&self, dtype: &DataType) -> PolarsResult { - // @scalar-opt - unsafe { self.as_materialized_series().cast_unchecked(dtype) }.map(Column::from) - } - - pub fn clear(&self) -> Self { - match self { - Column::Series(s) => s.clear().into(), - Column::Scalar(s) => Self::new_scalar(s.name.clone(), s.value.clone(), 0), - } - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - match self { - Column::Series(s) => s.shrink_to_fit(), - Column::Scalar(_) => {}, - } - } - - #[inline] - pub fn new_from_index(&self, index: usize, length: usize) -> Self { - // @scalar-opt - Self::Series(self.as_materialized_series().new_from_index(index, length)) - } - - pub fn has_nulls(&self) -> bool { - // @scalar-opt - self.as_materialized_series().has_nulls() - } - - pub fn is_not_null(&self) -> ChunkedArray { - // @scalar-opt - self.as_materialized_series().is_not_null() - } - - pub fn to_physical_repr(&self) -> Column { - // @scalar-opt - self.as_materialized_series() - .to_physical_repr() - .into_owned() - .into() - } - - pub fn head(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().head(length).into() - } - - pub fn tail(&self, length: Option) -> Column { - // @scalar-opt - self.as_materialized_series().tail(length).into() - } - - pub fn slice(&self, offset: i64, length: usize) -> Column { - // @scalar-opt - self.as_materialized_series().slice(offset, length).into() - } - - pub fn split_at(&self, offset: i64) -> (Column, Column) { - // @scalar-opt - let (l, r) = self.as_materialized_series().split_at(offset); - (l.into(), r.into()) - } - - pub fn null_count(&self) -> usize { - // @scalar-opt - self.as_materialized_series().null_count() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_min(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_min(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_max(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_max(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_mean(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_mean(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_sum(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_sum(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_first(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_first(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_last(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_last(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_n_unique(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_n_unique(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - pub unsafe fn agg_quantile( - &self, - groups: &GroupsProxy, - quantile: f64, - interpol: QuantileInterpolOptions, - ) -> Self { - // @scalar-opt - unsafe { - self.as_materialized_series() - .agg_quantile(groups, quantile, interpol) - } - .into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_median(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_median(groups) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_var(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_var(groups, ddof) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub(crate) unsafe fn agg_std(&self, groups: &GroupsProxy, ddof: u8) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_std(groups, ddof) }.into() - } - - /// # Safety - /// - /// Does no bounds checks, groups must be correct. - #[cfg(feature = "algorithm_group_by")] - pub unsafe fn agg_list(&self, groups: &GroupsProxy) -> Self { - // @scalar-opt - unsafe { self.as_materialized_series().agg_list(groups) }.into() - } - - pub fn full_null(name: PlSmallStr, size: usize, dtype: &DataType) -> Column { - // @scalar-opt - Series::full_null(name, size, dtype).into() - } - - pub fn is_empty(&self) -> bool { - // @scalar-opt - self.as_materialized_series().is_empty() - } - - pub fn reverse(&self) -> Column { - // @scalar-opt - self.as_materialized_series().reverse().into() - } - - pub fn equals(&self, right: &Column) -> bool { - // @scalar-opt - self.as_materialized_series() - .equals(right.as_materialized_series()) - } - - pub fn equals_missing(&self, right: &Column) -> bool { - // @scalar-opt - self.as_materialized_series() - .equals_missing(right.as_materialized_series()) - } - - pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - // @scalar-opt - match self { - Column::Series(s) => s.set_sorted_flag(sorted), - Column::Scalar(_) => {}, - } - } - - pub fn get_flags(&self) -> MetadataFlags { - match self { - Column::Series(s) => s.get_flags(), - // @scalar-opt - Column::Scalar(_) => MetadataFlags::empty(), - } - } - - pub fn get_data_ptr(&self) -> usize { - // @scalar-opt - self.as_materialized_series().get_data_ptr() - } - - pub fn vec_hash(&self, build_hasher: PlRandomState, buf: &mut Vec) -> PolarsResult<()> { - // @scalar-opt? - self.as_materialized_series().vec_hash(build_hasher, buf) - } - - pub fn vec_hash_combine( - &self, - build_hasher: PlRandomState, - hashes: &mut [u64], - ) -> PolarsResult<()> { - // @scalar-opt? - self.as_materialized_series() - .vec_hash_combine(build_hasher, hashes) - } - - /// # Safety - /// - /// Indexes need to be in bounds. - pub(crate) unsafe fn equal_element( - &self, - idx_self: usize, - idx_other: usize, - other: &Column, - ) -> bool { - // @scalar-opt - unsafe { - self.as_materialized_series().equal_element( - idx_self, - idx_other, - other.as_materialized_series(), - ) - } - } - - pub fn append(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.into_materialized_series() - .append(other.as_materialized_series())?; - Ok(self) - } - - pub fn arg_sort(&self, options: SortOptions) -> IdxCa { - // @scalar-opt - self.as_materialized_series().arg_sort(options) - } - - pub fn bit_repr(&self) -> Option { - // @scalar-opt - self.as_materialized_series().bit_repr() - } - - pub fn into_frame(self) -> DataFrame { - // SAFETY: A single-column dataframe cannot have length mismatches or duplicate names - unsafe { DataFrame::new_no_checks(vec![self]) } - } - - pub fn unique_stable(&self) -> PolarsResult { - // @scalar-opt? - self.as_materialized_series() - .unique_stable() - .map(Column::from) - } - - pub fn extend(&mut self, other: &Column) -> PolarsResult<&mut Self> { - // @scalar-opt - self.into_materialized_series() - .extend(other.as_materialized_series())?; - Ok(self) - } - - pub fn rechunk(&self) -> Column { - match self { - Column::Series(s) => s.rechunk().into(), - Column::Scalar(_) => self.clone(), - } - } - - pub fn explode(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().explode().map(Column::from) - } - - pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .fill_null(strategy) - .map(Column::from) - } - - pub fn divide(&self, rhs: &Column) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .divide(rhs.as_materialized_series()) - .map(Column::from) - } - - pub fn shift(&self, periods: i64) -> Column { - // @scalar-opt - self.as_materialized_series().shift(periods).into() - } - - #[cfg(feature = "zip_with")] - pub fn zip_with_same_type( - &self, - mask: &ChunkedArray, - other: &Column, - ) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .zip_with_same_type(mask, other.as_materialized_series()) - .map(Column::from) - } - - pub fn drop_nulls(&self) -> Column { - // @scalar-opt - self.as_materialized_series().drop_nulls().into() - } - - pub fn is_sorted_flag(&self) -> IsSorted { - // @scalar-opt - self.as_materialized_series().is_sorted_flag() - } - - pub fn get(&self, index: usize) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().get(index) - } - - pub fn unique(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().unique().map(Column::from) - } - - pub fn reshape_list(&self, dimensions: &[i64]) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .reshape_list(dimensions) - .map(Self::from) - } - - #[cfg(feature = "dtype-array")] - pub fn reshape_array(&self, dimensions: &[i64]) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .reshape_array(dimensions) - .map(Self::from) - } - - pub fn sort(&self, sort_options: SortOptions) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .sort(sort_options) - .map(Self::from) - } - - pub fn filter(&self, filter: &ChunkedArray) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().filter(filter).map(Self::from) - } - - #[cfg(feature = "random")] - pub fn shuffle(&self, seed: Option) -> Self { - // @scalar-opt - self.as_materialized_series().shuffle(seed).into() - } - - #[cfg(feature = "random")] - pub fn sample_frac( - &self, - frac: f64, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> PolarsResult { - self.as_materialized_series() - .sample_frac(frac, with_replacement, shuffle, seed) - .map(Self::from) - } - - #[cfg(feature = "random")] - pub fn sample_n( - &self, - n: usize, - with_replacement: bool, - shuffle: bool, - seed: Option, - ) -> PolarsResult { - self.as_materialized_series() - .sample_n(n, with_replacement, shuffle, seed) - .map(Self::from) - } - - pub fn gather_every(&self, n: usize, offset: usize) -> Column { - // @scalar-opt - self.as_materialized_series().gather_every(n, offset).into() - } - - pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .extend_constant(value, n) - .map(Self::from) - } - - pub fn is_null(&self) -> BooleanChunked { - // @scalar-opt - self.as_materialized_series().is_null() - } - - #[cfg(feature = "zip_with")] - pub fn zip_with(&self, mask: &BooleanChunked, other: &Self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series() - .zip_with(mask, other.as_materialized_series()) - .map(Self::from) - } - - pub fn is_finite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_finite() - } - - pub fn is_infinite(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_infinite() - } - - pub fn is_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_nan() - } - - pub fn is_not_nan(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().is_not_nan() - } - - #[cfg(feature = "dtype-date")] - pub fn date(&self) -> PolarsResult<&DateChunked> { - // @scalar-opt - self.as_materialized_series().date() - } - - #[cfg(feature = "dtype-duration")] - pub fn duration(&self) -> PolarsResult<&DurationChunked> { - // @scalar-opt - self.as_materialized_series().duration() - } - - pub fn wrapping_trunc_div_scalar(&self, rhs: T) -> Self - where - T: Num + NumCast, - { - // @scalar-opt - self.as_materialized_series() - .wrapping_trunc_div_scalar(rhs) - .into() - } - - pub fn product(&self) -> PolarsResult { - // @scalar-opt - self.as_materialized_series().product() - } - - pub fn binary_offset(&self) -> PolarsResult<&BinaryOffsetChunked> { - // @scalar-opt - self.as_materialized_series().binary_offset() - } - - pub fn phys_iter(&self) -> SeriesPhysIter<'_> { - // @scalar-opt - self.as_materialized_series().phys_iter() - } - - /// # Safety - /// - /// Does not perform bounds check on `index` - pub unsafe fn get_unchecked(&self, index: usize) -> AnyValue { - // @scalar-opt - self.as_materialized_series().get_unchecked(index) - } - - #[cfg(feature = "object")] - pub fn get_object( - &self, - index: usize, - ) -> Option<&dyn crate::chunked_array::object::PolarsObjectSafe> { - self.as_materialized_series().get_object(index) - } - - pub fn bitand(&self, rhs: &Self) -> PolarsResult { - self.as_materialized_series() - .bitand(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl ChunkCompare<&Column> for Column { - type Item = PolarsResult; - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for equality. - #[inline] - fn equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .equal_missing(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .not_equal(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking for inequality. - #[inline] - fn not_equal_missing(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .not_equal_missing(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self > rhs. - #[inline] - fn gt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .gt(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self >= rhs. - #[inline] - fn gt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .gt_eq(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self < rhs. - #[inline] - fn lt(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .lt(rhs.as_materialized_series()) - } - - /// Create a boolean mask by checking if self <= rhs. - #[inline] - fn lt_eq(&self, rhs: &Column) -> PolarsResult { - self.as_materialized_series() - .lt_eq(rhs.as_materialized_series()) - } -} - -impl Default for Column { - fn default() -> Self { - // @scalar-opt - Column::Series(Series::default()) - } -} - -impl PartialEq for Column { - fn eq(&self, other: &Self) -> bool { - // @scalar-opt - self.as_materialized_series() - .eq(other.as_materialized_series()) - } -} - -impl From for Column { - #[inline] - fn from(value: Series) -> Self { - Self::Series(value) - } -} - -impl From for Column { - #[inline] - fn from(value: ScalarColumn) -> Self { - Self::Scalar(value) - } -} - -impl Add for Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Add for &Column { - type Output = PolarsResult; - - fn add(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .add(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column { - type Output = PolarsResult; - - fn sub(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .sub(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Mul for &Column { - type Output = PolarsResult; - - fn mul(self, rhs: Self) -> Self::Output { - // @scalar-opt - self.as_materialized_series() - .mul(rhs.as_materialized_series()) - .map(Column::from) - } -} - -impl Sub for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Sub for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn sub(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().sub(rhs).into() - } -} - -impl Add for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Add for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn add(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().add(rhs).into() - } -} - -impl Div for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Div for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn div(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().div(rhs).into() - } -} - -impl Mul for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Mul for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn mul(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().mul(rhs).into() - } -} - -impl Rem for &Column -where - T: Num + NumCast, -{ - type Output = Column; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().rem(rhs).into() - } -} - -impl Rem for Column -where - T: Num + NumCast, -{ - type Output = Self; - - fn rem(self, rhs: T) -> Self::Output { - // @scalar-opt - self.as_materialized_series().rem(rhs).into() - } -} - -impl ScalarColumn { - #[inline] - pub fn new(name: PlSmallStr, value: Scalar, length: usize) -> Self { - Self { - name, - value, - length, - - materialized: OnceLock::new(), - } - } - - fn _to_series(name: PlSmallStr, value: Scalar, length: usize) -> Series { - let series = if length == 0 { - Series::new_empty(name, value.dtype()) - } else { - // @TODO: There is probably a better way to do this. - value.into_series(name).new_from_index(0, length) - }; - - debug_assert_eq!(series.len(), length); - - series - } - - pub fn to_series(&self) -> Series { - Self::_to_series(self.name.clone(), self.value.clone(), self.length) - } - - /// Get the [`ScalarColumn`] as [`Series`] - /// - /// This needs to materialize upon the first call. Afterwards, this is cached. - pub fn as_materialized_series(&self) -> &Series { - self.materialized.get_or_init(|| self.to_series()) - } - - /// Take the [`ScalarColumn`] and materialize as a [`Series`] if not already done. - pub fn take_materialized_series(self) -> Series { - self.materialized - .into_inner() - .unwrap_or_else(|| Self::_to_series(self.name, self.value, self.length)) - } -} - -impl IntoColumn for T { - #[inline] - fn into_column(self) -> Column { - let series = self.into_series(); - - if series.len() == 1 { - // SAFETY: We just did the bounds check - let value = unsafe { series.get_unchecked(0) }; - - if let Ok(value) = value.into_static() { - let value = Scalar::new(series.dtype().clone(), value); - let mut col = ScalarColumn::new(series.name().clone(), value, 1); - col.materialized = OnceLock::from(series); - return Column::Scalar(col); - } - } - - Column::Series(series) - } -} - -impl IntoColumn for Column { - #[inline(always)] - fn into_column(self) -> Column { - self - } -} - -impl IntoColumn for ScalarColumn { - #[inline(always)] - fn into_column(self) -> Column { - self.into() - } -} - -/// We don't want to serialize the scalar columns. So this helps pretend that columns are always -/// initialized without implementing From for Series. -/// -/// Those casts should be explicit. -#[derive(Clone)] -#[cfg_attr(feature = "serde", derive(serde::Serialize))] -#[cfg_attr(feature = "serde", serde(into = "Series"))] -struct _SerdeSeries(Series); - -impl From for _SerdeSeries { - #[inline] - fn from(value: Column) -> Self { - Self(value.take_materialized_series()) - } -} - -impl From<_SerdeSeries> for Series { - #[inline] - fn from(value: _SerdeSeries) -> Self { - value.0 - } -} diff --git a/crates/polars-core/src/lib.rs b/crates/polars-core/src/lib.rs index 117f462619dc7..a7e74b2304106 100644 --- a/crates/polars-core/src/lib.rs +++ b/crates/polars-core/src/lib.rs @@ -69,3 +69,8 @@ pub static POOL: Lazy = Lazy::new(|| polars_utils::was // utility for the tests to ensure a single thread can execute pub static SINGLE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + +/// Default length for a `.head()` call +pub(crate) const HEAD_DEFAULT_LENGTH: usize = 10; +/// Default length for a `.tail()` call +pub(crate) const TAIL_DEFAULT_LENGTH: usize = 10; diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index 65336b100fa1d..aaa4bc753443a 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -2,7 +2,6 @@ use std::fmt::Write; use arrow::bitmap::MutableBitmap; -use crate::chunked_array::builder::BinaryOffsetChunkedBuilder; #[cfg(feature = "dtype-categorical")] use crate::chunked_array::cast::CastOptions; #[cfg(feature = "object")] @@ -164,7 +163,6 @@ impl Series { #[cfg(feature = "object")] DataType::Object(_, registry) => any_values_to_object(values, registry)?, DataType::Null => Series::new_null(PlSmallStr::EMPTY, values.len()), - DataType::BinaryOffset => any_values_to_binary_offset(values, strict)?.into_series(), dt => { polars_bail!( InvalidOperation: @@ -346,43 +344,6 @@ fn any_values_to_binary(values: &[AnyValue], strict: bool) -> PolarsResult PolarsResult { - fn any_values_to_binary_offset_strict( - values: &[AnyValue], - ) -> PolarsResult { - let mut builder = BinaryOffsetChunkedBuilder::new(PlSmallStr::EMPTY, values.len()); - for av in values { - match av { - AnyValue::Binary(s) => builder.append_value(s), - AnyValue::BinaryOwned(s) => builder.append_value(s.as_slice()), - AnyValue::Null => builder.append_null(), - av => return Err(invalid_value_error(&DataType::Binary, av)), - } - } - Ok(builder.finish()) - } - fn any_values_to_binary_offset_nonstrict(values: &[AnyValue]) -> BinaryOffsetChunked { - values - .iter() - .map(|av| match av { - AnyValue::Binary(b) => Some(*b), - AnyValue::BinaryOwned(b) => Some(&**b), - AnyValue::String(s) => Some(s.as_bytes()), - AnyValue::StringOwned(s) => Some(s.as_str().as_bytes()), - _ => None, - }) - .collect_trusted() - } - if strict { - any_values_to_binary_offset_strict(values) - } else { - Ok(any_values_to_binary_offset_nonstrict(values)) - } -} - #[cfg(feature = "dtype-date")] fn any_values_to_date(values: &[AnyValue], strict: bool) -> PolarsResult { let mut builder = PrimitiveChunkedBuilder::::new(PlSmallStr::EMPTY, values.len()); diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 2a18d7c7a6e92..cb4a9bb84030d 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -1,6 +1,7 @@ //! Type agnostic columnar data structure. pub use crate::prelude::ChunkCompare; use crate::prelude::*; +use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH}; pub mod amortized_iter; mod any_value; @@ -817,18 +818,14 @@ impl Series { } /// Get the head of the Series. pub fn head(&self, length: Option) -> Series { - match length { - Some(len) => self.slice(0, std::cmp::min(len, self.len())), - None => self.slice(0, std::cmp::min(10, self.len())), - } + let len = length.unwrap_or(HEAD_DEFAULT_LENGTH); + self.slice(0, std::cmp::min(len, self.len())) } /// Get the tail of the Series. pub fn tail(&self, length: Option) -> Series { - let len = match length { - Some(len) => std::cmp::min(len, self.len()), - None => std::cmp::min(10, self.len()), - }; + let len = length.unwrap_or(TAIL_DEFAULT_LENGTH); + let len = std::cmp::min(len, self.len()); self.slice(-(len as i64), len) } diff --git a/crates/polars-mem-engine/src/executors/group_by_rolling.rs b/crates/polars-mem-engine/src/executors/group_by_rolling.rs index 50ad9da7fef27..3e84740ea92dd 100644 --- a/crates/polars-mem-engine/src/executors/group_by_rolling.rs +++ b/crates/polars-mem-engine/src/executors/group_by_rolling.rs @@ -21,10 +21,7 @@ unsafe fn update_keys(keys: &mut [Column], groups: &GroupsProxy) { // can be empty, but we still want to know the first value // of that group for key in keys.iter_mut() { - *key = key - .as_materialized_series() - .take_unchecked_from_slice(first) - .into_column(); + *key = key.take_slice_unchecked(first); } }, GroupsProxy::Slice { groups, .. } => { @@ -33,10 +30,7 @@ unsafe fn update_keys(keys: &mut [Column], groups: &GroupsProxy) { .iter() .map(|[first, _len]| *first) .collect_ca(PlSmallStr::EMPTY); - *key = key - .as_materialized_series() - .take_unchecked(&indices) - .into_column(); + *key = key.take_unchecked(&indices); } }, } diff --git a/crates/polars-ops/src/frame/join/asof/groups.rs b/crates/polars-ops/src/frame/join/asof/groups.rs index 6a7ccb3b76f80..9332b10e392b8 100644 --- a/crates/polars-ops/src/frame/join/asof/groups.rs +++ b/crates/polars-ops/src/frame/join/asof/groups.rs @@ -32,8 +32,8 @@ pub(crate) unsafe fn compare_df_rows2( join_nulls: bool, ) -> bool { for (l, r) in left.get_columns().iter().zip(right.get_columns()) { - let l = l.as_materialized_series().get_unchecked(left_idx); - let r = r.as_materialized_series().get_unchecked(right_idx); + let l = l.get_unchecked(left_idx); + let r = r.get_unchecked(right_idx); if !l.eq_missing(&r, join_nulls) { return false; } diff --git a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs index a1ecc4a12d026..4dafb71643bf9 100644 --- a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs +++ b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs @@ -72,6 +72,7 @@ pub(super) fn shift_and_fill(args: &[Column]) -> PolarsResult { let ca = s.str()?; let fill_value = match fill_value { AnyValue::String(v) => Some(v), + AnyValue::StringOwned(ref v) => Some(v.as_str()), AnyValue::Null => None, v => polars_bail!(ComputeError: "fill value '{}' is not supported", v), }; diff --git a/crates/polars-utils/src/index.rs b/crates/polars-utils/src/index.rs index 1ca29d3947273..f21ba1b392840 100644 --- a/crates/polars-utils/src/index.rs +++ b/crates/polars-utils/src/index.rs @@ -127,18 +127,11 @@ impl Indexable for &[T] { pub fn check_bounds(idx: &[IdxSize], len: IdxSize) -> PolarsResult<()> { // We iterate in large uninterrupted chunks to help auto-vectorization. - let mut in_bounds = true; - for chunk in idx.chunks(1024) { - for i in chunk { - if *i >= len { - in_bounds = false; - } - } - if !in_bounds { - break; - } - } - polars_ensure!(in_bounds, OutOfBounds: "indices are out of bounds"); + let Some(max_idx) = idx.iter().copied().max() else { + return Ok(()); + }; + + polars_ensure!(max_idx < len, OutOfBounds: "indices are out of bounds"); Ok(()) }