diff --git a/CHANGELOG.md b/CHANGELOG.md index 6689b24a..db574d1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,15 +6,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## Added - implemented row protection [#215](https://github.com/Qrlew/qrlew/issues/215) +## [0.5.4] - 2023-12-05 +- implemented `STD` and `VAR` aggregations in the dp rewriting [#205](https://github.com/Qrlew/qrlew/issues/205) +- `Expr::filter_by_function`: if the filtered datatype cannot be determined, keep the original data [#209](https://github.com/Qrlew/qrlew/issues/209) +## Added +- implemented Public -> Synthetic rewriting rule for table [#209](https://github.com/Qrlew/qrlew/issues/209) ## [0.5.3] - 2023-12-02 ## Changed - some cleaning in the translation of Expr -> ast::Expr [#204](https://github.com/Qrlew/qrlew/issues/204) -- `Expr::filter_by_function`: if the filtered datatype cannot be determined, keep the original data [#209](https://github.com/Qrlew/qrlew/issues/209) -## Added -- implemented Public -> Synthetic rewritting rule for table [#209](https://github.com/Qrlew/qrlew/issues/209) - implemented `BETWEEN`, `IS TRUE`, `IS FALSE`, `IS NULL`, `CHOOSE`, `LIKE` and `ILIKE` [#203](https://github.com/Qrlew/qrlew/issues/203) - implemented `DAYNAME`, `FROM_UNIXTIME`, `UNIX_TIMESTAMP`, `DATETIME_DIFF`, `QUARTER` and `DATE_FORMAT` [#202](https://github.com/Qrlew/qrlew/issues/202) - implemented `CURRENT_DATE`, `CURRENT_TIME`, `CURRENT_TIMESTAMP` and `EXTRACT(datepart FROM col)` [#200](https://github.com/Qrlew/qrlew/issues/200) diff --git a/Cargo.toml b/Cargo.toml index 5265275d..3d258c42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] authors = ["Nicolas Grislain "] name = "qrlew" -version = "0.5.3" +version = "0.5.4" edition = "2021" description = "Sarus Qrlew Engine" documentation = "https://docs.rs/qrlew" diff --git 
a/src/differential_privacy/aggregates.rs b/src/differential_privacy/aggregates.rs index 86becd7c..d08b4d8b 100644 --- a/src/differential_privacy/aggregates.rs +++ b/src/differential_privacy/aggregates.rs @@ -150,6 +150,8 @@ impl PUPRelation { let one_col = format!("_ONE_{}", col_name); let sum_col = format!("_SUM_{}", col_name); let count_col = format!("_COUNT_{}", col_name); + let square_col = format!("_SQUARE_{}", col_name); + let sum_square_col = format!("_SUM_{}", square_col); match aggregate.aggregate() { aggregate::Aggregate::First => { assert!(group_by_names.contains(&col_name.as_str())); @@ -179,8 +181,56 @@ impl PUPRelation { sums.push((sum_col.clone(), col_name)); output_b = output_b.with((name, Expr::col(sum_col))); } - aggregate::Aggregate::Std => todo!(), - aggregate::Aggregate::Var => todo!(), + aggregate::Aggregate::Std => { + input_b = input_b + .with((col_name.as_str(), Expr::col(col_name.as_str()))) + .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) + .with((one_col.as_str(), Expr::val(1.))); + sums.push((count_col.clone(), one_col)); + sums.push((sum_col.clone(), col_name)); + sums.push((sum_square_col.clone(), square_col)); + output_b = output_b.with(( + name, + Expr::sqrt(Expr::greatest( + Expr::val(0.), + Expr::minus( /* std = sqrt(max(0, E[X^2] - (E[X])^2)); each mean divides by max(1, count) */ + Expr::divide( + Expr::col(sum_square_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col.clone())), + ), + Expr::pow(Expr::divide( + Expr::col(sum_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col)), + ), Expr::val(2)), + ) + )) + )) + } + aggregate::Aggregate::Var => { + input_b = input_b + .with((col_name.as_str(), Expr::col(col_name.as_str()))) + .with((square_col.as_str(), Expr::pow(Expr::col(col_name.as_str()), Expr::val(2)))) + .with((one_col.as_str(), Expr::val(1.))); + sums.push((count_col.clone(), one_col)); + sums.push((sum_col.clone(), col_name)); + sums.push((sum_square_col.clone(), square_col)); + output_b = output_b.with(( + name, + Expr::greatest( + Expr::val(0.), + Expr::minus( /* var = max(0, E[X^2] - (E[X])^2); each mean divides by max(1, count) */ + 
Expr::divide( + Expr::col(sum_square_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col.clone())), + ), + Expr::pow(Expr::divide( + Expr::col(sum_col), + Expr::greatest(Expr::val(1.), Expr::col(count_col)), + ), Expr::val(2)), + ) + ) + )) + } _ => (), } (input_b, sums, output_b) @@ -531,6 +581,8 @@ mod tests { ("count_price".to_string(), AggregateColumn::count("price")), ("sum_price".to_string(), AggregateColumn::sum("price")), ("avg_price".to_string(), AggregateColumn::mean("price")), + ("var_price".to_string(), AggregateColumn::var("price")), + ("std_price".to_string(), AggregateColumn::std("price")), ], vec![], pup_table.deref().clone().into(), @@ -542,17 +594,19 @@ mod tests { .differentially_private_aggregates(epsilon, delta) .unwrap(); dp_relation.display_dot().unwrap(); - assert_eq!(dp_relation.schema().len(), 3); + assert_eq!(dp_relation.schema().len(), 5); + println!("data_type = {}", dp_relation.data_type()); assert!(dp_relation .data_type() .is_subset_of(&DataType::structured(vec![ ("count_price", DataType::float()), ("sum_price", DataType::float()), ("avg_price", DataType::float()), + ("var_price", DataType::float_min(0.)), + ("std_price", DataType::float_min(0.)), ]))); - let query: &str = &ast::Query::from(&relation).to_string(); - println!("{query}"); + println!("\n{query}"); _ = database.query(query).unwrap(); } @@ -590,6 +644,8 @@ mod tests { ("count_price".to_string(), AggregateColumn::count("price")), ("sum_price".to_string(), AggregateColumn::sum("price")), ("avg_price".to_string(), AggregateColumn::mean("price")), + ("var_price".to_string(), AggregateColumn::var("price")), + ("std_price".to_string(), AggregateColumn::std("price")), ], vec![expr!(item)], pup_table.deref().clone().into(), @@ -601,13 +657,15 @@ mod tests { .differentially_private_aggregates(epsilon, delta) .unwrap(); dp_relation.display_dot().unwrap(); - assert_eq!(dp_relation.schema().len(), 3); + assert_eq!(dp_relation.schema().len(), 5); assert!(dp_relation .data_type() 
.is_subset_of(&DataType::structured(vec![ ("count_price", DataType::float()), ("sum_price", DataType::float()), ("avg_price", DataType::float()), + ("var_price", DataType::float_min(0.)), + ("std_price", DataType::float_min(0.)), ]))); let query: &str = &ast::Query::from(&relation).to_string(); @@ -996,6 +1054,9 @@ mod tests { .with(("sum_distinct_a", AggregateColumn::sum_distinct("a"))) .with(("count_b", AggregateColumn::count("b"))) .with(("count_distinct_b", AggregateColumn::count_distinct("b"))) + .with(("avg_distinct_b", AggregateColumn::mean_distinct("b"))) + .with(("var_distinct_b", AggregateColumn::var_distinct("b"))) + .with(("std_distinct_b", AggregateColumn::std_distinct("b"))) .build(); let dp_relation = reduce.differentially_private_aggregates(epsilon.clone(), delta.clone()).unwrap(); //dp_relation.relation().display_dot().unwrap(); @@ -1006,6 +1067,9 @@ mod tests { ("sum_distinct_a", DataType::float_interval(-2000., 2000.)), ("count_b", DataType::float_interval(0., 1000.)), ("count_distinct_b", DataType::float_interval(0., 1000.)), + ("avg_distinct_b", DataType::float_interval(0., 10000.)), + ("var_distinct_b", DataType::float_interval(0., 100000.)), + ("std_distinct_b", DataType::float_interval(0., 316.22776601683796)), ]) ); @@ -1017,6 +1081,9 @@ mod tests { .with(("count_b", AggregateColumn::count("b"))) .with(("count_distinct_b", AggregateColumn::count_distinct("b"))) .with(("my_c", AggregateColumn::first("c"))) + .with(("avg_distinct_b", AggregateColumn::mean_distinct("b"))) + .with(("var_distinct_b", AggregateColumn::var_distinct("b"))) + .with(("std_distinct_b", AggregateColumn::std_distinct("b"))) .group_by(expr!(c)) .build(); let dp_relation = reduce.differentially_private_aggregates(epsilon.clone(), delta.clone()).unwrap(); @@ -1029,6 +1096,9 @@ mod tests { ("count_b", DataType::float_interval(0., 1000.)), ("count_distinct_b", DataType::float_interval(0., 1000.)), ("my_c", DataType::float_interval(10., 20.)), + ("avg_distinct_b", 
DataType::float_interval(0., 10000.)), + ("var_distinct_b", DataType::float_interval(0., 100000.)), + ("std_distinct_b", DataType::float_interval(0., 316.22776601683796)), ]) ); }