Skip to content

Commit

Permalink
Merge pull request #222 from Qrlew/impl_specific_aggs
Browse files Browse the repository at this point in the history
Fix bug when using the same column in the GROUP BY and an Aggregate function
  • Loading branch information
ngrislain authored Dec 15, 2023
2 parents 8a4bb34 + beef547 commit 7db78a3
Show file tree
Hide file tree
Showing 8 changed files with 370 additions and 123 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Fixed
- Fixed bug when using the same column in the GROUP BY and an Aggregate function [#222](https://github.com/Qrlew/qrlew/issues/222)
- Natural joins [#221](https://github.com/Qrlew/qrlew/issues/221)
- When the clipping factor is zero, multiply by zero instead of dividing by 1 / clipping_factor [#218](https://github.com/Qrlew/qrlew/issues/218)

Expand Down
157 changes: 139 additions & 18 deletions src/differential_privacy/aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl PUPRelation {
/// The budget is equally split among the sums.
fn differentially_private_sums(
self,
sums: Vec<&str>,
sums: Vec<(&str, &str)>,
group_by_names: Vec<&str>,
epsilon: f64,
delta: f64,
Expand All @@ -82,8 +82,9 @@ impl PUPRelation {

let input_values_bound = sums
.iter()
.map(|c| {
.map(|(s, c)| {
(
*s,
*c,
self.schema()[*c]
.data_type()
Expand All @@ -92,18 +93,19 @@ impl PUPRelation {
)
})
.collect::<Vec<_>>();

// Clip the relation
let clipped_relation = self.deref().clone().l2_clipped_sums(
self.privacy_unit(),
group_by_names,
input_values_bound.clone(),
);

let input_values_bound = input_values_bound
.iter()
.map(|(s, _, f)| (*s, *f))
.collect::<Vec<_>>();
let (dp_clipped_relation, private_query) = clipped_relation
.gaussian_mechanisms(epsilon, delta, input_values_bound)
.into();

Ok(DPRelation::new(dp_clipped_relation, private_query))
}

Expand Down Expand Up @@ -241,21 +243,16 @@ impl PUPRelation {
.differentially_private_sums(
named_sums
.iter() // Convert &str to String
.map(|(_, s)| s.as_str())
.collect::<Vec<&str>>(),
.map(|(s1, s2)| (s1.as_str(), s2.as_str()))
.collect::<Vec<_>>(),
group_by_names,
epsilon,
delta,
)?
.into();
let names: HashMap<String, String> =
named_sums.into_iter().map(|(s1, s2)| (s2, s1)).collect();
let dp_relation = output_builder
.input(
dp_relation
.rename_fields(|n, _| names.get(n).unwrap_or(&n.to_string()).to_string()),
)
.build();
.input(dp_relation)
.build();
Ok(DPRelation::new(dp_relation, private_query))
}
}
Expand Down Expand Up @@ -481,7 +478,7 @@ mod tests {

let dp_relation = PUPRelation::try_from(reduce.input().clone())
.unwrap()
.differentially_private_sums(vec!["price"], vec![], epsilon, delta)
.differentially_private_sums(vec![("sum_price", "price")], vec![], epsilon, delta)
.unwrap();
dp_relation.display_dot().unwrap();
matches!(dp_relation.schema()[0].data_type(), DataType::Float(_));
Expand Down Expand Up @@ -527,11 +524,11 @@ mod tests {
pup_table.deref().clone().into(),
);
let relation = Relation::from(reduce.clone());
relation.display_dot().unwrap();
//relation.display_dot().unwrap();

let dp_relation = PUPRelation::try_from(reduce.input().clone())
.unwrap()
.differentially_private_sums(vec!["price"], vec!["item"], epsilon, delta)
.differentially_private_sums(vec![("sum_price", "price")], vec!["item"], epsilon, delta)
.unwrap();
dp_relation.display_dot().unwrap();
assert_eq!(dp_relation.schema().len(), 2);
Expand All @@ -545,6 +542,55 @@ mod tests {
_ = database.query(query).unwrap();
}

#[test]
fn test_differentially_private_sums_group_by_aggregate() {
    // Source table: bounded integer columns `a`, `b` and `c`, plus the
    // privacy unit column and its weight column.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();
    let (epsilon, delta) = (1., 1e-3);

    // Computes the DP sum of `a` grouped by `a` on the input of the given
    // reduce and renders the result. The test has no explicit assertion:
    // it passes as long as none of the `unwrap` calls panics.
    let run_dp_sums = |reduce: &Reduce| {
        let pup_relation = PUPRelation::try_from(reduce.input().clone()).unwrap();
        let dp_relation = pup_relation
            .differentially_private_sums(vec![("sum_a", "a")], vec!["a"], epsilon, delta)
            .unwrap();
        dp_relation.display_dot().unwrap();
    };

    // The GROUP BY key and the aggregate argument are the same column,
    // with the key supplied as an expression.
    let reduce_from_expr: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        .input(table.clone())
        .build();
    run_dp_sums(&reduce_from_expr);

    // Same query, with the key supplied through `with_group_by_column`.
    let reduce_from_column: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    run_dp_sums(&reduce_from_column);
}

#[test]
fn test_differentially_private_aggregates() {
let mut database = postgresql::test_database();
Expand Down Expand Up @@ -593,7 +639,6 @@ mod tests {
.unwrap();
dp_relation.display_dot().unwrap();
assert_eq!(dp_relation.schema().len(), 5);
println!("data_type = {}", dp_relation.data_type());
assert!(dp_relation
.data_type()
.is_subset_of(&DataType::structured(vec![
Expand Down Expand Up @@ -1101,4 +1146,80 @@ mod tests {
);
}

#[test]
fn test_differentially_private_group_by_aggregate() {
    // Source table: bounded integer columns `a`, `b` and `c`, plus the
    // privacy unit column and its weight column.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();
    // Floats are `Copy`, so `epsilon` and `delta` are passed by value below
    // without any `.clone()`.
    let (epsilon, delta) = (1., 1e-3);

    // The GROUP BY key and the aggregate argument are the same column (`a`).
    // Here the key is declared with `with_group_by_column`, and the expected
    // output type holds both `sum_a` and `a`.
    let reduce: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    let (dp_relation, private_query) = reduce
        .differentially_private_aggregates(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    // NOTE(review): the expected sensitivity 10. presumably comes from the
    // upper bound of `a` (1..=10) -- confirm against the clipping logic.
    assert_eq!(
        private_query,
        PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 10.)
    );
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([
            ("sum_a", DataType::float_interval(0., 1000.)),
            ("a", DataType::integer_range(1..=10)),
        ])
    );

    // Same aggregate, but the key is given with `group_by(expr!(a))`: the
    // expected output type then holds only `sum_a`.
    let reduce: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        .input(table.clone())
        .build();
    let (dp_relation, private_query) = reduce
        .differentially_private_aggregates(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    assert_eq!(
        private_query,
        PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 10.)
    );
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([("sum_a", DataType::float_interval(0., 1000.))])
    );
}
}
59 changes: 59 additions & 0 deletions src/differential_privacy/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -707,4 +707,63 @@ mod tests {
println!("{:?}", city_keys);
assert_eq!(city_keys, correct_keys);
}

#[test]
fn test_differentially_private_group_by_spefic_aggregate() {
    // Source table: bounded integer columns `a`, `b` and `c`, plus the
    // privacy unit column and its weight column.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();
    let (epsilon, delta) = (1., 1e-3);

    // Applies the DP GROUP BY to the given reduce and checks that it
    // reports an (epsilon, delta) private query and outputs only the
    // grouping column `a` with its original integer range.
    let check_group_by = |reduce: Reduce| {
        let (dp_relation, private_query) = reduce
            .differentially_private_group_by(epsilon, delta)
            .unwrap()
            .into();
        dp_relation.display_dot().unwrap();
        assert_eq!(private_query, PrivateQuery::EpsilonDelta(epsilon, delta));
        assert_eq!(
            dp_relation.data_type(),
            DataType::structured([("a", DataType::integer_range(1..=10))])
        );
    };

    // The GROUP BY key and the aggregate argument are the same column,
    // with the key supplied as an expression.
    let reduce_from_expr: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        .input(table.clone())
        .build();
    check_group_by(reduce_from_expr);

    // Same query, with the key supplied through `with_group_by_column`.
    let reduce_from_column: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    check_group_by(reduce_from_column);
}
}
1 change: 0 additions & 1 deletion src/relation/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,6 @@ impl Ready<Reduce> for ReduceBuilder<WithInput> {
),
None => self.input.0,
};
input.display_dot().unwrap();
// Check that the First aggregate columns are in the GROUP BY
reduce.named_aggregates.iter()
.filter_map(|(_, agg)| matches!(agg.aggregate(), expr::aggregate::Aggregate::First).then_some(agg.column()))
Expand Down
17 changes: 16 additions & 1 deletion src/relation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1684,7 +1684,7 @@ mod tests {
.with(("my_sum", Expr::sum(Expr::col("b"))))
.with(("my_a", Expr::first(expr!(3 * a))))
.group_by(expr!(3 * a))
.input(table)
.input(table.clone())
.build();
assert_eq!(
reduce.data_type(),
Expand All @@ -1693,6 +1693,21 @@ mod tests {
("my_a", DataType::integer_interval(0, 30)),
])
);

// GROUP BY and aggregates have the same argument
let reduce: Relation = Relation::reduce()
.with(("my_sum", Expr::sum(Expr::col("a"))))
.with(("my_a", Expr::first(expr!(a))))
.group_by(expr!(a))
.input(table)
.build();
assert_eq!(
reduce.data_type(),
DataType::structured([
("my_sum", DataType::integer_interval(0, 1000)),
("my_a", DataType::integer_interval(0, 10)),
])
);
}

#[test]
Expand Down
Loading

0 comments on commit 7db78a3

Please sign in to comment.