Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit user contributions in a group by #297

Merged
merged 13 commits into from
Dec 10, 2024
Merged
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.9.24] - 2024-11-27
## [0.9.26] - 2024-12-05
### Fixed
- Limit each privacy unit's contribution to at most `max_privacy_unit_groups` groups when applying tau-thresholding.

## [0.9.25] - 2024-11-27
### Added
- Add attributes to data_type::Id

Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
authors = ["Nicolas Grislain <ng@sarus.tech>"]
name = "qrlew"
version = "0.9.25"
version = "0.9.26"
edition = "2021"
description = "Sarus Qrlew Engine"
documentation = "https://docs.rs/qrlew"
Expand Down
13 changes: 12 additions & 1 deletion src/differential_privacy/dp_parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ pub struct DpParameters {
pub privacy_unit_max_multiplicity: f64,
/// The max_multiplicity in terms of the dataset size
pub privacy_unit_max_multiplicity_share: f64,
/// The maximum number of groups a privacy unit can contribute to.
/// This is the `Cu` parameter in the Wilson paper.
pub max_privacy_unit_groups: u64,
}

impl DpParameters {
Expand All @@ -22,19 +25,21 @@ impl DpParameters {
tau_thresholding_share: f64,
privacy_unit_max_multiplicity: f64,
privacy_unit_max_multiplicity_share: f64,
max_privacy_unit_groups: u64,
) -> DpParameters {
DpParameters {
epsilon,
delta,
tau_thresholding_share,
privacy_unit_max_multiplicity,
privacy_unit_max_multiplicity_share,
max_privacy_unit_groups,
}
}

pub fn from_epsilon_delta(epsilon: f64, delta: f64) -> DpParameters {
// These default values are underestimating the bounds
DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1)
DpParameters::new(epsilon, delta, 0.5, 100.0, 0.1, 1)
}

pub fn with_tau_thresholding_share(self, tau_thresholding_share: f64) -> DpParameters {
Expand Down Expand Up @@ -63,6 +68,12 @@ impl DpParameters {
..self
}
}
pub fn with_max_privacy_unit_groups(self, max_privacy_unit_groups: u64) -> DpParameters {
DpParameters {
max_privacy_unit_groups,
..self
}
}
}

impl DpParameters {
Expand Down
115 changes: 77 additions & 38 deletions src/differential_privacy/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@ use super::Error;
use crate::{
builder::{Ready, With, WithIterator},
differential_privacy::{dp_event, DpEvent, DpRelation, Result},
expr::{aggregate, Expr},
expr::Expr,
namer::{self, name_from_content},
privacy_unit_tracking::{PrivacyUnit, PupRelation},
relation::{Join, Reduce, Relation, Variant as _},
};

pub const COUNT_DISTINCT_PE_ID: &str = "_COUNT_DISTINCT_PE_ID_";
pub const COUNT_DISTINCT_PID: &str = "_COUNT_DISTINCT_PID_";

impl Reduce {
/// Returns a `DPRelation` whose:
/// - `relation` outputs all the DP values of the `self` grouping keys
/// - `dp_event` stores the invoked DP mechanisms
pub fn differentially_private_group_by(&self, epsilon: f64, delta: f64) -> Result<DpRelation> {
pub fn differentially_private_group_by(
&self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
if self.group_by().is_empty() {
Err(Error::GroupingKeysError(format!("No grouping keys")))
} else {
Expand All @@ -35,7 +40,7 @@ impl Reduce {
))
.input(self.input().clone())
.build();
PupRelation::try_from(relation)?.dp_values(epsilon, delta)
PupRelation::try_from(relation)?.dp_values(epsilon, delta, max_privacy_unit_groups)
}
}
}
Expand All @@ -45,46 +50,75 @@ impl PupRelation {
/// - `relation` outputs the (epsilon, delta)-DP values
/// (found by tau-thresholding) of the fields of the current `Relation`
/// - `dp_event` stores the invoked DP mechanisms
fn tau_thresholding_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> {
fn tau_thresholding_values(
self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
// This method proceeds in three steps:
// 1. Limit each PU's contribution to at most `max_privacy_unit_groups` random groups.
// 2. Count the distinct PUs in each group.
// 3. Apply tau-thresholding to the noisy counts.
if epsilon == 0. || delta == 0. {
return Err(Error::BudgetError(format!(
"Not enough budget for tau-thresholding. Got: (espilon, delta) = ({epsilon}, {delta})"
)));
}
// compute COUNT (DISTINCT PrivacyUnit::privacy_unit()) GROUP BY columns
let columns: Vec<String> = self
// Build a reduce grouping by columns and the PU
let columns: Vec<&str> = self
.schema()
.iter()
.cloned()
.filter_map(|f| {
if f.name() == self.privacy_unit() || f.name() == self.privacy_unit_weight() {
None
} else {
Some(f.name().to_string())
Some(f.name())
}
})
.collect();
let columns: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
let aggregates = vec![(COUNT_DISTINCT_PE_ID, aggregate::Aggregate::Count)];
let peid = self.privacy_unit().to_string();
let rel =
Relation::from(self).distinct_aggregates(peid.as_ref(), columns.clone(), aggregates);
let columns_and_pu: Vec<_> = columns
.iter()
.cloned()
.chain(std::iter::once(self.privacy_unit()))
.collect();
let red = Relation::from(self.clone()).unique(&columns_and_pu);

let rel_with_limited_pu_contributions =
red.limit_col_contributions(self.privacy_unit(), max_privacy_unit_groups);

let mut columns_aggs: Vec<(&str, Expr)> = vec![(
COUNT_DISTINCT_PID,
Expr::count(Expr::col(self.privacy_unit())),
)];
let mut columns_groups: Vec<Expr> = vec![];
columns.into_iter().for_each(|c| {
let col = Expr::col(c);
columns_aggs.push((c, Expr::first(col.clone())));
columns_groups.push(col);
});

// Count distinct PUs.
let rel: Relation = Relation::reduce()
.with_iter(columns_aggs)
.group_by_iter(columns_groups)
.input(rel_with_limited_pu_contributions)
.build();

// Apply noise
let name_sigmas = vec![(
COUNT_DISTINCT_PE_ID,
dp_event::gaussian_noise(epsilon, delta, 1.),
COUNT_DISTINCT_PID,
dp_event::gaussian_noise(epsilon, delta, max_privacy_unit_groups as f64),
)];
let rel = rel.add_gaussian_noise(&name_sigmas);

// Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PE_ID) > tau`
let tau = dp_event::gaussian_tau(epsilon, delta, 1.0);
let filter_column = [(COUNT_DISTINCT_PE_ID, (Some(tau.into()), None, vec![]))]
// Returns a `Relation::Map` with the right field names and with `COUNT(DISTINCT PID) > tau`
let tau = dp_event::gaussian_tau(epsilon, delta, max_privacy_unit_groups as f64);
let filter_column = [(COUNT_DISTINCT_PID, (Some(tau.into()), None, vec![]))]
.into_iter()
.collect();
let relation = rel
.filter_columns(filter_column)
.filter_fields(|f| columns.contains(&f));
.filter_fields(|f| columns_and_pu.contains(&f));
Ok(DpRelation::new(
relation,
DpEvent::epsilon_delta(epsilon, delta),
Expand All @@ -99,7 +133,12 @@ impl PupRelation {
/// - Using the propagated public values of the grouping columns when they exist
/// - Applying tau-thresholding mechanism with the (epsilon, delta) privacy parameters for
///   the columns that do not have public values
pub fn dp_values(self, epsilon: f64, delta: f64) -> Result<DpRelation> {
pub fn dp_values(
self,
epsilon: f64,
delta: f64,
max_privacy_unit_groups: u64,
) -> Result<DpRelation> {
// TODO this code is super-ugly rewrite it
let public_columns: Vec<String> = self
.schema()
Expand All @@ -116,7 +155,7 @@ impl PupRelation {
if public_columns.is_empty() {
let name = namer::name_from_content("FILTER_", &self.name());
self.with_name(name)?
.tau_thresholding_values(epsilon, delta)
.tau_thresholding_values(epsilon, delta, max_privacy_unit_groups)
} else if all_columns_are_public {
Ok(DpRelation::new(
self.with_public_values(&public_columns)?,
Expand All @@ -127,7 +166,7 @@ impl PupRelation {
.clone()
.with_name(namer::name_from_content("FILTER_", &self.name()))?
.filter_fields(|f| !public_columns.contains(&f.to_string()))?
.tau_thresholding_values(epsilon, delta)?
.tau_thresholding_values(epsilon, delta, max_privacy_unit_groups)?
.into();
let relation = self
.with_public_values(&public_columns)?
Expand Down Expand Up @@ -237,10 +276,10 @@ mod tests {

let (rel, pq) = pup_table
.clone()
.tau_thresholding_values(1., 0.003)
.tau_thresholding_values(1., 0.003, 1)
.unwrap()
.into();
//rel.display_dot();
rel.display_dot().unwrap();
assert_eq!(
rel.data_type(),
DataType::structured([
Expand Down Expand Up @@ -273,7 +312,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
matches!(rel, Relation::Join(_));
assert_eq!(
rel.data_type(),
Expand Down Expand Up @@ -305,7 +344,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
assert!(matches!(rel, Relation::Map(_)));
assert_eq!(
rel.data_type(),
Expand Down Expand Up @@ -336,7 +375,7 @@ mod tests {
)
.build();
let pup_table = PupRelation(table);
let (rel, pq) = pup_table.dp_values(1., 0.003).unwrap().into();
let (rel, pq) = pup_table.dp_values(1., 0.003, 1).unwrap().into();
assert!(matches!(rel, Relation::Join(_)));
assert!(matches!(rel.inputs()[0], &Relation::Values(_)));
assert!(matches!(rel.inputs()[1], &Relation::Map(_)));
Expand Down Expand Up @@ -379,7 +418,7 @@ mod tests {
.with(("sum_a".to_string(), AggregateColumn::sum("a")))
.input(table.clone())
.build();
let dp_reduce = reduce.differentially_private_group_by(epsilon, delta);
let dp_reduce = reduce.differentially_private_group_by(epsilon, delta, 1);
assert!(dp_reduce.is_err());

// With GROUPBY. Only one column with possible values
Expand All @@ -390,7 +429,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand All @@ -408,7 +447,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta));
Expand All @@ -429,7 +468,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
assert_eq!(dp_event, DpEvent::epsilon_delta(epsilon, delta));
Expand Down Expand Up @@ -488,7 +527,7 @@ mod tests {
.build();

let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -529,7 +568,7 @@ mod tests {
.input(input)
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -613,7 +652,7 @@ mod tests {
.build();

let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -675,7 +714,7 @@ mod tests {
.input(input)
.build();
let (dp_relation, _) = reduce
.differentially_private_group_by(1., 1e-2)
.differentially_private_group_by(1., 1e-2, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down Expand Up @@ -740,7 +779,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand All @@ -757,7 +796,7 @@ mod tests {
.input(table.clone())
.build();
let (dp_relation, dp_event) = reduce
.differentially_private_group_by(epsilon, delta)
.differentially_private_group_by(epsilon, delta, 1)
.unwrap()
.into();
dp_relation.display_dot().unwrap();
Expand Down
7 changes: 5 additions & 2 deletions src/differential_privacy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ impl Reduce {
.differentially_private_group_by(
parameters.epsilon * parameters.tau_thresholding_share,
parameters.delta * parameters.tau_thresholding_share,
parameters.max_privacy_unit_groups,
)?
.into();
let input_relation_with_privacy_tracked_group_by = self
Expand Down Expand Up @@ -364,7 +365,8 @@ mod tests {
.unwrap()
.deref()
.clone();
let parameters = DpParameters::from_epsilon_delta(1., 1e-3);
let parameters =
DpParameters::from_epsilon_delta(100., 1e-3).with_max_privacy_unit_groups(10);

// privacy track the inputs
let privacy_unit_tracking = PrivacyUnitTracking::from((
Expand Down Expand Up @@ -409,7 +411,8 @@ mod tests {

let query: &str = &ast::Query::from(&dp_relation).to_string();
println!("{query}");
_ = database.query(query).unwrap();
let res = database.query(query).unwrap();
println!("\n{:?}", res);
}

#[test]
Expand Down
6 changes: 5 additions & 1 deletion src/relation/dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,11 @@ impl<'a, T: Clone + fmt::Display, V: Visitor<'a, T>> dot::Labeller<'a, Node<'a,
| JoinOperator::LeftOuter(expr)
| JoinOperator::RightOuter(expr)
| JoinOperator::FullOuter(expr) => {
format!("<br/>{} ON {}", join.operator.to_string(), expr)
format!(
"<br/>{} ON {}",
join.operator.to_string(),
dot::escape_html(&expr.to_string())
)
}
JoinOperator::Cross => format!("<br/>{}", join.operator.to_string()),
};
Expand Down
Loading
Loading