diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fec9395..b87ad198 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [0.9.24] - 2024-11-27 +## [0.9.26] - 2024-12-05 +### Fixed +- Limit the privacy unit contribution to at most max_privacy_unit_groups when applying tau-thresholding. + +## [0.9.25] - 2024-11-27 ### Added - Add attributes to data_type::Id diff --git a/Cargo.toml b/Cargo.toml index 2b9e1b40..76d25acd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] authors = ["Nicolas Grislain "] name = "qrlew" -version = "0.9.25" +version = "0.9.26" edition = "2021" description = "Sarus Qrlew Engine" documentation = "https://docs.rs/qrlew" diff --git a/src/differential_privacy/dp_parameters.rs b/src/differential_privacy/dp_parameters.rs index 75b1ed20..de623413 100644 --- a/src/differential_privacy/dp_parameters.rs +++ b/src/differential_privacy/dp_parameters.rs @@ -13,6 +13,9 @@ pub struct DpParameters { pub privacy_unit_max_multiplicity: f64, /// The max_multiplicity in terms of the dataset size pub privacy_unit_max_multiplicity_share: f64, + /// The maximum number of groups a privacy unit can contribute to. + /// This is the Cu parameter in the Wilson et al. paper. + pub max_privacy_unit_groups: u64, } impl DpParameters { @@ -22,6 +25,7 @@ impl DpParameters { tau_thresholding_share: f64, privacy_unit_max_multiplicity: f64, privacy_unit_max_multiplicity_share: f64, + max_privacy_unit_groups: u64, ) -> DpParameters { DpParameters { epsilon, @@ -29,12 +33,13 @@ impl DpParameters { tau_thresholding_share, privacy_unit_max_multiplicity, privacy_unit_max_multiplicity_share, + max_privacy_unit_groups, } } pub fn from_epsilon_delta(epsilon: f64, delta: f64) -> DpParameters { // These default values are underestimating the bounds - DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1) + DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1, 1) } pub fn with_tau_thresholding_share(self, tau_thresholding_share: f64) -> DpParameters { @@ -63,6 +68,12 @@ impl DpParameters { ..self } } + pub fn with_max_privacy_unit_groups(self, max_privacy_unit_groups: u64) -> DpParameters { + DpParameters { + max_privacy_unit_groups, + ..self + } + } } impl DpParameters { diff --git a/src/differential_privacy/group_by.rs b/src/differential_privacy/group_by.rs index f3d6b4f5..fb26ff2e 100644 --- a/src/differential_privacy/group_by.rs +++ b/src/differential_privacy/group_by.rs @@ -2,19 +2,24 @@ use super::Error; use crate::{ builder::{Ready, With, WithIterator}, differential_privacy::{dp_event, DpEvent, DpRelation, Result}, - expr::{aggregate, Expr}, + expr::Expr, namer::{self, name_from_content}, privacy_unit_tracking::{PrivacyUnit, PupRelation}, relation::{Join, Reduce, Relation, Variant as _}, }; -pub const COUNT_DISTINCT_PE_ID: &str = "_COUNT_DISTINCT_PE_ID_"; +pub const COUNT_DISTINCT_PID: &str = "_COUNT_DISTINCT_PID_"; impl Reduce { /// Returns a `DPRelation` whose: /// - `relation` outputs all the DP values of the `self` grouping keys /// - `dp_event` stores the invoked DP mechanisms - pub fn differentially_private_group_by(&self, epsilon: f64, delta: f64) -> Result<DpRelation> { + pub fn differentially_private_group_by( + &self, + epsilon: f64, + delta: f64, + max_privacy_unit_groups: u64, + ) -> Result<DpRelation> { if self.group_by().is_empty() { Err(Error::GroupingKeysError(format!("No grouping keys"))) } else { @@ -35,7 +40,7 @@ impl Reduce { )) .input(self.input().clone()) .build(); - 
PupRelation::try_from(relation)?.dp_values(epsilon, delta) + PupRelation::try_from(relation)?.dp_values(epsilon, delta, max_privacy_unit_groups) } } } @@ -45,46 +50,75 @@ impl PupRelation { /// - `relation` outputs the (epsilon, delta)-DP values /// (found by tau-thresholding) of the fields of the current `Relation` /// - `dp_event` stores the invoked DP mechanisms - fn tau_thresholding_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> { + fn tau_thresholding_values( + self, + epsilon: f64, + delta: f64, + max_privacy_unit_groups: u64, + ) -> Result<DpRelation> { + // Limits the PU contribution to at most max_privacy_unit_groups random groups, + // counts distinct PUs per group, + // and applies tau-thresholding. if epsilon == 0. || delta == 0. { return Err(Error::BudgetError(format!( "Not enough budget for tau-thresholding. Got: (espilon, delta) = ({epsilon}, {delta})" ))); } - // compute COUNT (DISTINCT PrivacyUnit::privacy_unit()) GROUP BY columns - let columns: Vec<String> = self + // Build a reduce grouping by columns and the PU + let columns: Vec<&str> = self .schema() .iter() - .cloned() .filter_map(|f| { if f.name() == self.privacy_unit() || f.name() == self.privacy_unit_weight() { None } else { - Some(f.name().to_string()) + Some(f.name()) } }) .collect(); - let columns: Vec<&str> = columns.iter().map(|s| s.as_str()).collect(); - let aggregates = vec![(COUNT_DISTINCT_PE_ID, aggregate::Aggregate::Count)]; - let peid = self.privacy_unit().to_string(); - let rel = - Relation::from(self).distinct_aggregates(peid.as_ref(), columns.clone(), aggregates); + let columns_and_pu: Vec<_> = columns + .iter() + .cloned() + .chain(std::iter::once(self.privacy_unit())) + .collect(); + let red = Relation::from(self.clone()).unique(&columns_and_pu); + + let rel_with_limited_pu_contributions = + red.limit_col_contributions(self.privacy_unit(), max_privacy_unit_groups); + + let mut columns_aggs: Vec<(&str, Expr)> = vec![( + COUNT_DISTINCT_PID, + Expr::count(Expr::col(self.privacy_unit())), + )]; + let mut columns_groups: Vec<Expr> = vec![]; + columns.into_iter().for_each(|c| { + let col = Expr::col(c); + columns_aggs.push((c, Expr::first(col.clone()))); + columns_groups.push(col); + }); + + // Count distinct PUs. 
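+ // The input is already unique over (grouping columns, privacy unit), so a plain COUNT of the PU per group equals a COUNT DISTINCT.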
+ let rel: Relation = Relation::reduce() + .with_iter(columns_aggs) + .group_by_iter(columns_groups) + .input(rel_with_limited_pu_contributions) + .build(); // Apply noise let name_sigmas = vec![( - COUNT_DISTINCT_PE_ID, - dp_event::gaussian_noise(epsilon, delta, 1.), + COUNT_DISTINCT_PID, + dp_event::gaussian_noise(epsilon, delta, max_privacy_unit_groups as f64), )]; let rel = rel.add_gaussian_noise(&name_sigmas); - // Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PE_ID) > tau` - let tau = dp_event::gaussian_tau(epsilon, delta, 1.0); - let filter_column = [(COUNT_DISTINCT_PE_ID, (Some(tau.into()), None, vec![]))] + // Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PID) > tau` + let tau = dp_event::gaussian_tau(epsilon, delta, max_privacy_unit_groups as f64); + let filter_column = [(COUNT_DISTINCT_PID, (Some(tau.into()), None, vec![]))] .into_iter() .collect(); let relation = rel .filter_columns(filter_column) - .filter_fields(|f| columns.contains(&f)); + .filter_fields(|f| columns_and_pu.contains(&f)); Ok(DpRelation::new( relation, DpEvent::epsilon_delta(epsilon, delta), @@ -99,7 +133,12 @@ impl PupRelation { /// - Using the propagated public values of the grouping columns when they exist /// - Applying tau-thresholding mechanism with the (epsilon, delta) privacy parameters for t /// he columns that do not have public values - pub fn dp_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> { + pub fn dp_values( + self, + epsilon: f64, + delta: f64, + max_privacy_unit_groups: u64, + ) -> Result<DpRelation> { // TODO this code is super-ugly rewrite it let public_columns: Vec<String> = self .schema() @@ -116,7 +155,7 @@ if public_columns.is_empty() { let name = namer::name_from_content("FILTER_", &self.name()); self.with_name(name)? - .tau_thresholding_values(epsilon, delta) + .tau_thresholding_values(epsilon, delta, max_privacy_unit_groups) } else if all_columns_are_public { Ok(DpRelation::new( self.with_public_values(&public_columns)?, @@ -127,7 +166,7 @@ .clone() .with_name(namer::name_from_content("FILTER_", &self.name()))? .filter_fields(|f| !public_columns.contains(&f.to_string()))? - .tau_thresholding_values(epsilon, delta)? + .tau_thresholding_values(epsilon, delta, max_privacy_unit_groups)? .into(); let relation = self .with_public_values(&public_columns)? 
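// NOTE (editor's sketch, not part of this diff): the scaling change in tau_thresholding_values above is that a single privacy unit may now contribute to up to max_privacy_unit_groups groups, so that bound is passed as the sensitivity to both the noise calibration and the threshold, assuming the dp_event helpers keep the signatures shown in this diff:
//
//     let sensitivity = max_privacy_unit_groups as f64;
//     let sigma = dp_event::gaussian_noise(epsilon, delta, sensitivity);
//     let tau = dp_event::gaussian_tau(epsilon, delta, sensitivity);
//
// A larger cap therefore means more noise on COUNT(DISTINCT PID) and a higher release threshold, in exchange for keeping more of each unit's groups.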
@@ -237,10 +276,10 @@ mod tests { let (rel, pq) = pup_table .clone() - .tau_thresholding_values(1., 0.003) + .tau_thresholding_values(1., 0.003, 1) .unwrap() .into(); - //rel.display_dot(); + rel.display_dot().unwrap(); assert_eq!( rel.data_type(), DataType::structured([ @@ -273,7 +312,7 @@ mod tests { ) .build(); let pup_table = PupRelation(table); - let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into(); + let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into(); matches!(rel, Relation::Join(_)); assert_eq!( rel.data_type(), @@ -305,7 +344,7 @@ mod tests { ) .build(); let pup_table = PupRelation(table); - let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into(); + let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into(); assert!(matches!(rel, Relation::Map(_))); assert_eq!( rel.data_type(), @@ -336,7 +375,7 @@ mod tests { ) .build(); let pup_table = PupRelation(table); - let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into(); + let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into(); assert!(matches!(rel, Relation::Join(_))); assert!(matches!(rel.inputs()[0], &Relation::Values(_))); assert!(matches!(rel.inputs()[1], &Relation::Map(_))); @@ -379,7 +418,7 @@ mod tests { .with(("sum_a".to_string(), AggregateColumn::sum("a"))) .input(table.clone()) .build(); - let dp_reduce = reduce.differentially_private_group_by(epsilon, delta); + let dp_reduce = reduce.differentially_private_group_by(epsilon, delta, 1); assert!(dp_reduce.is_err()); // With GROUPBY. Only one column with possible values @@ -390,7 +429,7 @@ mod tests { .input(table.clone()) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -408,7 +447,7 @@ mod tests { .input(table.clone()) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta)); @@ -429,7 +468,7 @@ mod tests { .input(table.clone()) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta)); @@ -488,7 +527,7 @@ mod tests { .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -529,7 +568,7 @@ mod tests { .input(input) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -613,7 +652,7 @@ mod tests { .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -675,7 +714,7 @@ mod tests { .input(input) .build(); let (dp_relation, _) = reduce - .differentially_private_group_by(1., 1e-2) + .differentially_private_group_by(1., 1e-2, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -740,7 +779,7 @@ mod tests { .input(table.clone()) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + 
.differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); @@ -757,7 +796,7 @@ mod tests { .input(table.clone()) .build(); let (dp_relation, dp_event) = reduce - .differentially_private_group_by(epsilon, delta) + .differentially_private_group_by(epsilon, delta, 1) .unwrap() .into(); dp_relation.display_dot().unwrap(); diff --git a/src/differential_privacy/mod.rs b/src/differential_privacy/mod.rs index a185855a..2cb9a908 100644 --- a/src/differential_privacy/mod.rs +++ b/src/differential_privacy/mod.rs @@ -135,6 +135,7 @@ impl Reduce { .differentially_private_group_by( parameters.epsilon * parameters.tau_thresholding_share, parameters.delta * parameters.tau_thresholding_share, + parameters.max_privacy_unit_groups, )? .into(); let input_relation_with_privacy_tracked_group_by = self @@ -364,7 +365,8 @@ mod tests { .unwrap() .deref() .clone(); - let parameters = DpParameters::from_epsilon_delta(1., 1e-3); + let parameters = + DpParameters::from_epsilon_delta(100., 1e-3).with_max_privacy_unit_groups(10); // privacy track the inputs let privacy_unit_tracking = PrivacyUnitTracking::from(( @@ -409,7 +411,8 @@ mod tests { let query: &str = &ast::Query::from(&dp_relation).to_string(); println!("{query}"); - _ = database.query(query).unwrap(); + let res = database.query(query).unwrap(); + println!("\n{:?}", res); } #[test] diff --git a/src/relation/dot.rs b/src/relation/dot.rs index 5a09f5ce..888b5ddb 100644 --- a/src/relation/dot.rs +++ b/src/relation/dot.rs @@ -253,7 +253,11 @@ impl<'a, T: Clone + fmt::Display, V: Visitor<'a, T>> dot::Labeller<'a, Node<'a, | JoinOperator::LeftOuter(expr) | JoinOperator::RightOuter(expr) | JoinOperator::FullOuter(expr) => { - format!("
{} ON {}", join.operator.to_string(), expr) + format!( + "
{} ON {}", + join.operator.to_string(), + dot::escape_html(&expr.to_string()) + ) } JoinOperator::Cross => format!("
{}", join.operator.to_string()), }; diff --git a/src/relation/rewriting.rs b/src/relation/rewriting.rs index 3ff0a620..f3f79258 100644 --- a/src/relation/rewriting.rs +++ b/src/relation/rewriting.rs @@ -721,7 +721,7 @@ impl Relation { } /// Returns a Relation whose fields have unique values - fn unique(self, columns: &[&str]) -> Relation { + pub fn unique(self, columns: &[&str]) -> Relation { let named_columns: Vec<(&str, Expr)> = columns.iter().copied().map(|c| (c, Expr::col(c))).collect(); @@ -736,6 +736,69 @@ impl Relation { .build() } + /// It limits the contribution of a column to max_contributions randomly. + pub fn limit_col_contributions(self, column: &str, max_contributions: u64) -> Relation { + let random_col: &str = "_RANDOM_"; + let contribution_index_col: &str = "_CONTRIBUTION_INDEX_"; + + let columns: Vec<&str> = self.schema().iter().map(|f| f.name()).collect(); + + let left = self.clone().with_field(random_col, Expr::random(0)); + let right = left.clone(); + + let join_names: Hierarchy = left + .schema() + .iter() + .flat_map(|f| { + [ + ([Join::left_name(), f.name()], f.name().to_string()), + ( + [Join::right_name(), f.name()], + format!("r_{}", f.name().to_string()), + ), + ] + }) + .collect(); + + let joined: Relation = Relation::join() + .left(left) + .right(right) + .on(Expr::eq( + Expr::qcol(Join::left_name(), column), + Expr::qcol(Join::right_name(), column), + )) + .and(Expr::lt_eq( + Expr::qcol(Join::left_name(), random_col), + Expr::qcol(Join::right_name(), random_col), + )) + .names(join_names) + .build(); + + // Build the reduce with a row number (contribution_index_col) + let mut aggregates: Vec<(&str, Expr)> = + vec![(contribution_index_col, Expr::count(Expr::col(random_col)))]; + let mut groups: Vec = vec![]; + columns.iter().for_each(|c| { + let col = Expr::col(*c); + aggregates.push((c, Expr::first(col.clone()))); + groups.push(col); + }); + let red: Relation = Relation::reduce() + .with_iter(aggregates) + .group_by_iter(groups) + .input(joined) + .build(); + + Relation::map() + .with_iter(columns.into_iter().map(|f| (f, Expr::col(f)))) + .filter(Expr::lt_eq( + Expr::col(contribution_index_col), + Expr::val(max_contributions as f64), + )) + .input(red) + .build() + } + /// Returns a `Relation` whose output fields correspond to the `aggregates` /// grouped by the expressions in `grouping_exprs`. /// If `grouping_exprs` is not empty, we order by the grouping expressions. 
@@ -898,6 +961,8 @@ impl With<(&str, Expr)> for Relation { #[cfg(test)] mod tests { + use data_type::value; + use super::*; use crate::{ ast, @@ -2104,4 +2169,74 @@ mod tests { assert_eq!(red.group_by.len(), relation.schema().len()) } } + + #[test] + fn test_limit_col_contribution() { + let mut database = postgresql::test_database(); + let relations = database.relations(); + + // Build a relation from a simple query + let relation = Relation::try_from( + parse("SELECT id, city FROM user_table") + .unwrap() + .with(&relations), ) + .unwrap(); + relation.display_dot().unwrap(); + + // Compute id contribution of the relation + let aggregates: Vec<(&str, Expr)> = vec![ + ("group_count", Expr::count(Expr::col("city"))), + ("id", Expr::first(Expr::col("id"))), + ]; + let groups: Vec<Expr> = vec![Expr::col("id")]; + let red: Relation = Relation::reduce() + .with_iter(aggregates) + .group_by_iter(groups) + .input(relation.clone()) + .build(); + let max_contr: Relation = Relation::reduce() + .with_iter(vec![( + "max_group_count", + Expr::max(Expr::col("group_count")), + )]) + .input(red) + .build(); + + let query = &ast::Query::from(&max_contr); + let res = database.query(&query.to_string()).unwrap(); + let val = value::Value::integer(1); + assert!(res[0][0] != val); + + let with_limited_contr = relation.limit_col_contributions("id", 1); + with_limited_contr.display_dot().unwrap(); + let query = &ast::Query::from(&with_limited_contr); + + println!("\n{}", query); + _ = database.query(&query.to_string()).unwrap(); + + // Compute id contribution of the with_limited_contr relation + let aggregates: Vec<(&str, Expr)> = vec![ + ("group_count", Expr::count(Expr::col("city"))), + ("id", Expr::first(Expr::col("id"))), + ]; + let groups: Vec<Expr> = vec![Expr::col("id")]; + let red: Relation = Relation::reduce() + .with_iter(aggregates) + .group_by_iter(groups) + .input(with_limited_contr) + .build(); + let max_contr: Relation = Relation::reduce() + .with_iter(vec![( + "max_group_count", + Expr::max(Expr::col("group_count")), + )]) + .input(red) + .build(); + + let query = &ast::Query::from(&max_contr); + let res = database.query(&query.to_string()).unwrap(); + let val = value::Value::integer(1); + assert!(res[0][0] == val) + }
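// NOTE (editor's sketch, not part of this diff): how the new knob is expected to be set
// from user code, based on the builder methods added in dp_parameters.rs above (the values
// are illustrative; from_epsilon_delta defaults max_privacy_unit_groups to 1):
//
//     let parameters = DpParameters::from_epsilon_delta(1., 1e-3)
//         .with_max_privacy_unit_groups(10);
//
// This mirrors the updated test in src/differential_privacy/mod.rs, which builds its
// parameters the same way before rewriting the Reduce into its DP equivalent.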