Skip to content

Commit

Permalink
Merge pull request #297 from Qrlew/limit_user_contributions_gb
Browse files Browse the repository at this point in the history
Limit user contributions in a group by
  • Loading branch information
ngrislain authored Dec 10, 2024
2 parents bd2f1b9 + de32fd4 commit 5f485a1
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 45 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.9.24] - 2024-11-27
## [0.9.26] - 2024-12-05
### Fixed
- Limit each privacy unit's contribution to at most `max_privacy_unit_groups` groups when applying tau-thresholding.

## [0.9.25] - 2024-11-27
### Added
- Add attributes to data_type::Id

Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
authors = ["Nicolas Grislain <ng@sarus.tech>"]
name = "qrlew"
version = "0.9.25"
version = "0.9.26"
edition = "2021"
description = "Sarus Qrlew Engine"
documentation = "https://docs.rs/qrlew"
Expand Down
13 changes: 12 additions & 1 deletion src/differential_privacy/dp_parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ pub struct DpParameters {
pub privacy_unit_max_multiplicity: f64,
/// The max_multiplicity in terms of the dataset size
pub privacy_unit_max_multiplicity_share: f64,
/// The maximum number of groups a privacy unit can contribute to.
/// This is the Cu parameter in the Wilson paper.
pub max_privacy_unit_groups: u64,
}

impl DpParameters {
Expand All @@ -22,19 +25,21 @@ impl DpParameters {
tau_thresholding_share: f64,
privacy_unit_max_multiplicity: f64,
privacy_unit_max_multiplicity_share: f64,
max_privacy_unit_groups: u64,
) -> DpParameters {
DpParameters {
epsilon,
delta,
tau_thresholding_share,
privacy_unit_max_multiplicity,
privacy_unit_max_multiplicity_share,
max_privacy_unit_groups,
}
}

pub fn from_epsilon_delta(epsilon: f64, delta: f64) -> DpParameters {
// These default values are underestimating the bounds
DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1)
DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1, 1)
}

pub fn with_tau_thresholding_share(self, tau_thresholding_share: f64) -> DpParameters {
Expand Down Expand Up @@ -63,6 +68,12 @@ impl DpParameters {
..self
}
}
/// Returns these parameters with `max_privacy_unit_groups` replaced by the given value.
///
/// `max_privacy_unit_groups` is the maximum number of groups a privacy unit can
/// contribute to. Consumes `self`; every other field is carried over unchanged.
pub fn with_max_privacy_unit_groups(self, max_privacy_unit_groups: u64) -> DpParameters {
DpParameters {
max_privacy_unit_groups,
// Struct-update syntax: keep all remaining fields from `self`.
..self
}
}
}

impl DpParameters {
Expand Down
115 changes: 77 additions & 38 deletions src/differential_privacy/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@ use super::Error;
use crate::{
builder::{Ready, With, WithIterator},
differential_privacy::{dp_event, DpEvent, DpRelation, Result},
expr::{aggregate, Expr},
expr::Expr,
namer::{self, name_from_content},
privacy_unit_tracking::{PrivacyUnit, PupRelation},
relation::{Join, Reduce, Relation, Variant as _},
};

pub const COUNT_DISTINCT_PE_ID: &str = "_COUNT_DISTINCT_PE_ID_";
pub const COUNT_DISTINCT_PID: &str = "_COUNT_DISTINCT_PID_";

impl Reduce {
/// Returns a `DPRelation` whose:
/// - `relation` outputs all the DP values of the `self` grouping keys
/// - `dp_event` stores the invoked DP mechanisms
pub fn differentially_private_group_by(&self, epsilon: f64, delta: f64) -> Result<DpRelation> {
pub fn differentially_private_group_by(
&self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
if self.group_by().is_empty() {
Err(Error::GroupingKeysError(format!("No grouping keys")))
} else {
Expand All @@ -35,7 +40,7 @@ impl Reduce {
))
.input(self.input().clone())
.build();
PupRelation::try_from(relation)?.dp_values(epsilon, delta)
PupRelation::try_from(relation)?.dp_values(epsilon, delta, max_privacy_unit_groups)
}
}
}
Expand All @@ -45,46 +50,75 @@ impl PupRelation {
/// - `relation` outputs the (epsilon, delta)-DP values
/// (found by tau-thresholding) of the fields of the current `Relation`
/// - `dp_event` stores the invoked DP mechanisms
fn tau_thresholding_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> {
fn tau_thresholding_values(
self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
// It limits the PU contribution to at most max_privacy_unit_groups random groups
// It counts distinct PUs
// It applies tau-thresholding
if epsilon == 0. || delta == 0. {
return Err(Error::BudgetError(format!(
"Not enough budget for tau-thresholding. Got: (espilon, delta) = ({epsilon}, {delta})"
)));
}
// compute COUNT (DISTINCT PrivacyUnit::privacy_unit()) GROUP BY columns
let columns: Vec<String> = self
// Build a reduce grouping by columns and the PU
let columns: Vec<&str> = self
.schema()
.iter()
.cloned()
.filter_map(|f| {
if f.name() == self.privacy_unit() || f.name() == self.privacy_unit_weight() {
None
} else {
Some(f.name().to_string())
Some(f.name())
}
})
.collect();
let columns: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
let aggregates = vec![(COUNT_DISTINCT_PE_ID, aggregate::Aggregate::Count)];
let peid = self.privacy_unit().to_string();
let rel =
Relation::from(self).distinct_aggregates(peid.as_ref(), columns.clone(), aggregates);
let columns_and_pu: Vec<_> = columns
.iter()
.cloned()
.chain(std::iter::once(self.privacy_unit()))
.collect();
let red = Relation::from(self.clone()).unique(&columns_and_pu);

let rel_with_limited_pu_contributions =
red.limit_col_contributions(self.privacy_unit(), max_privacy_unit_groups);

let mut columns_aggs: Vec<(&str, Expr)> = vec![(
COUNT_DISTINCT_PID,
Expr::count(Expr::col(self.privacy_unit())),
)];
let mut columns_groups: Vec<Expr> = vec![];
columns.into_iter().for_each(|c| {
let col = Expr::col(c);
columns_aggs.push((c, Expr::first(col.clone())));
columns_groups.push(col);
});

// Count distinct PUs.
let rel: Relation = Relation::reduce()
.with_iter(columns_aggs)
.group_by_iter(columns_groups)
.input(rel_with_limited_pu_contributions)
.build();

// Apply noise
let name_sigmas = vec![(
COUNT_DISTINCT_PE_ID,
dp_event::gaussian_noise(epsilon, delta, 1.),
COUNT_DISTINCT_PID,
dp_event::gaussian_noise(epsilon, delta, max_privacy_unit_groups as f64),
)];
let rel = rel.add_gaussian_noise(&name_sigmas);

// Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PE_ID) > tau`
let tau = dp_event::gaussian_tau(epsilon, delta, 1.0);
let filter_column = [(COUNT_DISTINCT_PE_ID, (Some(tau.into()), None, vec![]))]
// Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PID) > tau`
let tau = dp_event::gaussian_tau(epsilon, delta, max_privacy_unit_groups as f64);
let filter_column = [(COUNT_DISTINCT_PID, (Some(tau.into()), None, vec![]))]
.into_iter()
.collect();
let relation = rel
.filter_columns(filter_column)
.filter_fields(|f| columns.contains(&f));
.filter_fields(|f| columns_and_pu.contains(&f));
Ok(DpRelation::new(
relation,
DpEvent::epsilon_delta(epsilon, delta),
Expand All @@ -99,7 +133,12 @@ impl PupRelation {
/// - Using the propagated public values of the grouping columns when they exist
/// - Applying tau-thresholding mechanism with the (epsilon, delta) privacy parameters for
///   the columns that do not have public values
pub fn dp_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> {
pub fn dp_values(
self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
// TODO: this code is super-ugly; rewrite it.
let public_columns: Vec<String> = self
.schema()
Expand All @@ -116,7 +155,7 @@ impl PupRelation {
if public_columns.is_empty() {
let name = namer::name_from_content("FILTER_", &self.name());
self.with_name(name)?
.tau_thresholding_values(epsilon, delta)
.tau_thresholding_values(epsilon, delta, max_privacy_unit_groups)
} else if all_columns_are_public {
Ok(DpRelation::new(
self.with_public_values(&public_columns)?,
Expand All @@ -127,7 +166,7 @@ impl PupRelation {
.clone()
.with_name(namer::name_from_content("FILTER_", &self.name()))?
.filter_fields(|f| !public_columns.contains(&f.to_string()))?
.tau_thresholding_values(epsilon, delta)?
.tau_thresholding_values(epsilon, delta, max_privacy_unit_groups)?
.into();
let relation = self
.with_public_values(&public_columns)?
Expand Down Expand Up @@ -237,10 +276,10 @@ mod tests {

let (rel, pq) = pup_table
.clone()
.tau_thresholding_values(1., 0.003)
.tau_thresholding_values(1., 0.003, 1)
.unwrap()
.into();
//rel.display_dot();
rel.display_dot().unwrap();
assert_eq!(
rel.data_type(),
DataType::structured([
Expand Down Expand Up @@ -273,7 +312,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
matches!(rel, Relation::Join(_));
assert_eq!(
rel.data_type(),
Expand Down Expand Up @@ -305,7 +344,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
assert!(matches!(rel, Relation::Map(_)));
assert_eq!(
rel.data_type(),
Expand Down Expand Up @@ -336,7 +375,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
assert!(matches!(rel, Relation::Join(_)));
assert!(matches!(rel.inputs()[0], &Relation::Values(_)));
assert!(matches!(rel.inputs()[1], &Relation::Map(_)));
Expand Down Expand Up @@ -379,7 +418,7 @@ mod tests {
.with(("sum_a".to_string(), AggregateColumn::sum("a")))
.input(table.clone())
.build();
let dp_reduce = reduce.differentially_private_group_by(epsilon, delta);
let dp_reduce = reduce.differentially_private_group_by(epsilon, delta, 1);
assert!(dp_reduce.is_err());

// With GROUPBY. Only one column with possible values
Expand All @@ -390,7 +429,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand All @@ -408,7 +447,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta));
Expand All @@ -429,7 +468,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta));
Expand Down Expand Up @@ -488,7 +527,7 @@ mod tests {
.build();

let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -529,7 +568,7 @@ mod tests {
.input(input)
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -613,7 +652,7 @@ mod tests {
.build();

let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -675,7 +714,7 @@ mod tests {
.input(input)
.build();
let (dp_relation, _) = reduce
.differentially_private_group_by(1., 1e-2)
.differentially_private_group_by(1., 1e-2, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -740,7 +779,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand All @@ -757,7 +796,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down
7 changes: 5 additions & 2 deletions src/differential_privacy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ impl Reduce {
.differentially_private_group_by(
parameters.epsilon * parameters.tau_thresholding_share,
parameters.delta * parameters.tau_thresholding_share,
parameters.max_privacy_unit_groups,
)?
.into();
let input_relation_with_privacy_tracked_group_by = self
Expand Down Expand Up @@ -364,7 +365,8 @@ mod tests {
.unwrap()
.deref()
.clone();
let parameters = DpParameters::from_epsilon_delta(1., 1e-3);
let parameters =
DpParameters::from_epsilon_delta(100., 1e-3).with_max_privacy_unit_groups(10);

// privacy track the inputs
let privacy_unit_tracking = PrivacyUnitTracking::from((
Expand Down Expand Up @@ -409,7 +411,8 @@ mod tests {

let query: &str = &ast::Query::from(&dp_relation).to_string();
println!("{query}");
_ = database.query(query).unwrap();
let res = database.query(query).unwrap();
println!("\n{:?}", res);
}

#[test]
Expand Down
6 changes: 5 additions & 1 deletion src/relation/dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,11 @@ impl<'a, T: Clone + fmt::Display, V: Visitor<'a, T>> dot::Labeller<'a, Node<'a,
| JoinOperator::LeftOuter(expr)
| JoinOperator::RightOuter(expr)
| JoinOperator::FullOuter(expr) => {
format!("<br/>{} ON {}", join.operator.to_string(), expr)
format!(
"<br/>{} ON {}",
join.operator.to_string(),
dot::escape_html(&expr.to_string())
)
}
JoinOperator::Cross => format!("<br/>{}", join.operator.to_string()),
};
Expand Down
Loading

0 comments on commit 5f485a1

Please sign in to comment.