Skip to content

Commit

Permalink
refactor: Deprecate ddof parameter for correlation coefficient (#20197)
Browse files Browse the repository at this point in the history
  • Loading branch information
flowlight0 authored Dec 7, 2024
1 parent c92612a commit 8abc550
Show file tree
Hide file tree
Showing 10 changed files with 47 additions and 51 deletions.
3 changes: 1 addition & 2 deletions crates/polars-compute/src/var_cov.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,7 @@ impl PearsonState {
self.mean_y = new_mean_y;
}

pub fn finalize(&self, _ddof: u8) -> f64 {
// The division by sample_weight - ddof on both sides cancels out.
pub fn finalize(&self) -> f64 {
let denom = (self.dp_xx * self.dp_yy).sqrt();
if denom == 0.0 {
f64::NAN
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-lazy/src/tests/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
.lazy()
.group_by_stable([col("uid")])
// a double aggregation expression.
.agg([pearson_corr(col("day"), col("cumcases"), 1).alias("pearson_corr")])
.agg([pearson_corr(col("day"), col("cumcases")).alias("pearson_corr")])
.collect()?;
let s = out.column("pearson_corr")?.f64()?;
assert!((s.get(0).unwrap() - 0.997176).abs() < 0.000001);
Expand All @@ -25,7 +25,7 @@ fn test_pearson_corr() -> PolarsResult<()> {
.lazy()
.group_by_stable([col("uid")])
// a double aggregation expression.
.agg([pearson_corr(col("day"), col("cumcases"), 1)
.agg([pearson_corr(col("day"), col("cumcases"))
.pow(2.0)
.alias("pearson_corr")])
.collect()
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-ops/src/chunked_array/cov.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ where
}

/// Compute the pearson correlation between two columns.
pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>, ddof: u8) -> Option<f64>
pub fn pearson_corr<T>(a: &ChunkedArray<T>, b: &ChunkedArray<T>) -> Option<f64>
where
T: PolarsNumericType,
T::Native: AsPrimitive<f64>,
Expand All @@ -30,5 +30,5 @@ where
for (a, b) in a.downcast_iter().zip(b.downcast_iter()) {
out.combine(&polars_compute::var_cov::pearson_corr(a, b))
}
Some(out.finalize(ddof))
Some(out.finalize())
}
32 changes: 15 additions & 17 deletions crates/polars-plan/src/dsl/function_expr/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub enum CorrelationMethod {
Pearson,
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
SpearmanRank(bool),
Covariance,
Covariance(u8),
}

impl Display for CorrelationMethod {
Expand All @@ -19,20 +19,18 @@ impl Display for CorrelationMethod {
Pearson => "pearson",
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
SpearmanRank(_) => "spearman_rank",
Covariance => return write!(f, "covariance"),
Covariance(_) => return write!(f, "covariance"),
};
write!(f, "{}_correlation", s)
}
}

pub(super) fn corr(s: &[Column], ddof: u8, method: CorrelationMethod) -> PolarsResult<Column> {
pub(super) fn corr(s: &[Column], method: CorrelationMethod) -> PolarsResult<Column> {
match method {
CorrelationMethod::Pearson => pearson_corr(s, ddof),
CorrelationMethod::Pearson => pearson_corr(s),
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
CorrelationMethod::SpearmanRank(propagate_nans) => {
spearman_rank_corr(s, ddof, propagate_nans)
},
CorrelationMethod::Covariance => covariance(s, ddof),
CorrelationMethod::SpearmanRank(propagate_nans) => spearman_rank_corr(s, propagate_nans),
CorrelationMethod::Covariance(ddof) => covariance(s, ddof),
}
}

Expand Down Expand Up @@ -61,32 +59,32 @@ fn covariance(s: &[Column], ddof: u8) -> PolarsResult<Column> {
Ok(Column::new(name, &[ret]))
}

fn pearson_corr(s: &[Column], ddof: u8) -> PolarsResult<Column> {
fn pearson_corr(s: &[Column]) -> PolarsResult<Column> {
let a = &s[0];
let b = &s[1];
let name = PlSmallStr::from_static("pearson_corr");

use polars_ops::chunked_array::cov::pearson_corr;
let ret = match a.dtype() {
DataType::Float32 => {
let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap(), ddof).map(|v| v as f32);
let ret = pearson_corr(a.f32().unwrap(), b.f32().unwrap()).map(|v| v as f32);
return Ok(Column::new(name.clone(), &[ret]));
},
DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof),
DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap(), ddof),
DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap(), ddof),
DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap(), ddof),
DataType::Float64 => pearson_corr(a.f64().unwrap(), b.f64().unwrap()),
DataType::Int32 => pearson_corr(a.i32().unwrap(), b.i32().unwrap()),
DataType::Int64 => pearson_corr(a.i64().unwrap(), b.i64().unwrap()),
DataType::UInt32 => pearson_corr(a.u32().unwrap(), b.u32().unwrap()),
_ => {
let a = a.cast(&DataType::Float64)?;
let b = b.cast(&DataType::Float64)?;
pearson_corr(a.f64().unwrap(), b.f64().unwrap(), ddof)
pearson_corr(a.f64().unwrap(), b.f64().unwrap())
},
};
Ok(Column::new(name, &[ret]))
}

#[cfg(all(feature = "rank", feature = "propagate_nans"))]
fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsResult<Column> {
fn spearman_rank_corr(s: &[Column], propagate_nans: bool) -> PolarsResult<Column> {
use polars_core::utils::coalesce_nulls_columns;
use polars_ops::chunked_array::nan_propagating_aggregate::nan_max_s;
let a = &s[0];
Expand Down Expand Up @@ -134,5 +132,5 @@ fn spearman_rank_corr(s: &[Column], ddof: u8, propagate_nans: bool) -> PolarsRes
)
.into();

pearson_corr(&[a_rank, b_rank], ddof)
pearson_corr(&[a_rank, b_rank])
}
3 changes: 1 addition & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ pub enum FunctionExpr {
#[cfg(feature = "cov")]
Correlation {
method: correlation::CorrelationMethod,
ddof: u8,
},
#[cfg(feature = "peaks")]
PeakMin,
Expand Down Expand Up @@ -1104,7 +1103,7 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn ColumnsUdf>> {
Fused(op) => map_as_slice!(fused::fused, op),
ConcatExpr(rechunk) => map_as_slice!(concat::concat_expr, rechunk),
#[cfg(feature = "cov")]
Correlation { method, ddof } => map_as_slice!(correlation::corr, ddof, method),
Correlation { method } => map_as_slice!(correlation::corr, method),
#[cfg(feature = "peaks")]
PeakMin => map!(peaks::peak_min),
#[cfg(feature = "peaks")]
Expand Down
15 changes: 3 additions & 12 deletions crates/polars-plan/src/dsl/functions/correlation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ use super::*;
pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::Covariance,
ddof,
method: CorrelationMethod::Covariance(ddof),
};
Expr::Function {
input,
Expand All @@ -20,15 +19,10 @@ pub fn cov(a: Expr, b: Expr, ddof: u8) -> Expr {
}

/// Compute the pearson correlation between two columns.
///
/// # Arguments
/// * ddof
/// Delta degrees of freedom
pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
pub fn pearson_corr(a: Expr, b: Expr) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::Pearson,
ddof,
};
Expr::Function {
input,
Expand All @@ -45,18 +39,15 @@ pub fn pearson_corr(a: Expr, b: Expr, ddof: u8) -> Expr {
/// Compute the spearman rank correlation between two columns.
/// Missing data will be excluded from the computation.
/// # Arguments
/// * ddof
/// Delta degrees of freedom
/// * propagate_nans
/// If `true` any `NaN` encountered will lead to `NaN` in the output.
/// If to `false` then `NaN` are regarded as larger than any finite number
/// and thus lead to the highest rank.
#[cfg(all(feature = "rank", feature = "propagate_nans"))]
pub fn spearman_rank_corr(a: Expr, b: Expr, ddof: u8, propagate_nans: bool) -> Expr {
pub fn spearman_rank_corr(a: Expr, b: Expr, propagate_nans: bool) -> Expr {
let input = vec![a, b];
let function = FunctionExpr::Correlation {
method: CorrelationMethod::SpearmanRank(propagate_nans),
ddof,
};
Expr::Function {
input,
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-python/src/functions/lazy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,8 @@ pub fn map_mul(
}

#[pyfunction]
pub fn pearson_corr(a: PyExpr, b: PyExpr, ddof: u8) -> PyExpr {
dsl::pearson_corr(a.inner, b.inner, ddof).into()
pub fn pearson_corr(a: PyExpr, b: PyExpr) -> PyExpr {
dsl::pearson_corr(a.inner, b.inner).into()
}

#[pyfunction]
Expand Down Expand Up @@ -537,10 +537,10 @@ pub fn repeat(value: PyExpr, n: PyExpr, dtype: Option<Wrap<DataType>>) -> PyResu
}

#[pyfunction]
pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, ddof: u8, propagate_nans: bool) -> PyExpr {
pub fn spearman_rank_corr(a: PyExpr, b: PyExpr, propagate_nans: bool) -> PyExpr {
#[cfg(feature = "propagate_nans")]
{
dsl::spearman_rank_corr(a.inner, b.inner, ddof, propagate_nans).into()
dsl::spearman_rank_corr(a.inner, b.inner, propagate_nans).into()
}
#[cfg(not(feature = "propagate_nans"))]
{
Expand Down
19 changes: 13 additions & 6 deletions py-polars/polars/functions/lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ def corr(
b: IntoExpr,
*,
method: CorrelationMethod = "pearson",
ddof: int = 1,
ddof: int | None = None,
propagate_nans: bool = False,
) -> Expr:
"""
Expand All @@ -794,9 +794,10 @@ def corr(
b
Column name or Expression.
ddof
"Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
where N represents the number of elements.
By default ddof is 1.
Has no effect, do not use.
.. deprecated:: 1.17.0
method : {'pearson', 'spearman'}
Correlation method.
propagate_nans
Expand Down Expand Up @@ -844,13 +845,19 @@ def corr(
│ 0.5 │
└─────┘
"""
if ddof is not None:
issue_deprecation_warning(
"The `ddof` parameter has no effect. Do not use it.",
version="1.17.0",
)

a = parse_into_expression(a)
b = parse_into_expression(b)

if method == "pearson":
return wrap_expr(plr.pearson_corr(a, b, ddof))
return wrap_expr(plr.pearson_corr(a, b))
elif method == "spearman":
return wrap_expr(plr.spearman_rank_corr(a, b, ddof, propagate_nans))
return wrap_expr(plr.spearman_rank_corr(a, b, propagate_nans))
else:
msg = f"method must be one of {{'pearson', 'spearman'}}, got {method!r}"
raise ValueError(msg)
Expand Down
8 changes: 5 additions & 3 deletions py-polars/tests/unit/lazyframe/test_lazyframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ def test_spearman_corr_ties() -> None:
df = pl.DataFrame({"a": [1, 1, 1, 2, 3, 7, 4], "b": [4, 3, 2, 2, 4, 3, 1]})

result = df.select(
pl.corr("a", "b", method="spearman", ddof=0).alias("a1"),
pl.corr("a", "b", method="spearman").alias("a1"),
pl.corr(pl.col("a").rank("min"), pl.col("b").rank("min")).alias("a2"),
pl.corr(pl.col("a").rank(), pl.col("b").rank()).alias("a3"),
)
Expand All @@ -972,7 +972,9 @@ def test_pearson_corr() -> None:
out = (
ldf.group_by("era", maintain_order=True).agg(
pl.corr(
pl.col("prediction"), pl.col("target"), method="pearson", ddof=0
pl.col("prediction"),
pl.col("target"),
method="pearson",
).alias("c"),
)
).collect()["c"]
Expand All @@ -981,7 +983,7 @@ def test_pearson_corr() -> None:
# we can also pass in column names directly
out = (
ldf.group_by("era", maintain_order=True).agg(
pl.corr("prediction", "target", method="pearson", ddof=0).alias("c"),
pl.corr("prediction", "target", method="pearson").alias("c"),
)
).collect()["c"]
assert out.to_list() == pytest.approx([0.6546536707079772, -5.477514993831792e-1])
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/operations/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_corr() -> None:

def test_corr_nan() -> None:
df = pl.DataFrame({"a": [1.0, 1.0], "b": [1.0, 2.0]})
assert str(df.select(pl.corr("a", "b", ddof=1))[0, 0]) == "nan"
assert str(df.select(pl.corr("a", "b"))[0, 0]) == "nan"


def test_median_quantile_duration() -> None:
Expand Down

0 comments on commit 8abc550

Please sign in to comment.