Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug when using the same column in the GROUP BY and an Aggregate function #222

Merged
merged 5 commits into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Fixed
- Fixed bug when using the same column in the GROUP BY and an Aggregate function [#222](https://github.com/Qrlew/qrlew/issues/222)
- Natural joins [#221](https://github.com/Qrlew/qrlew/issues/221)
- When the clipping factor is zero, multiply by zero instead of dividing by 1 / clipping_factor [#218](https://github.com/Qrlew/qrlew/issues/218)

Expand Down
157 changes: 139 additions & 18 deletions src/differential_privacy/aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl PUPRelation {
/// The budget is split equally among the sums.
fn differentially_private_sums(
self,
sums: Vec<&str>,
sums: Vec<(&str, &str)>,
group_by_names: Vec<&str>,
epsilon: f64,
delta: f64,
Expand All @@ -82,8 +82,9 @@ impl PUPRelation {

let input_values_bound = sums
.iter()
.map(|c| {
.map(|(s, c)| {
(
*s,
*c,
self.schema()[*c]
.data_type()
Expand All @@ -92,18 +93,19 @@ impl PUPRelation {
)
})
.collect::<Vec<_>>();

// Clip the relation
let clipped_relation = self.deref().clone().l2_clipped_sums(
self.privacy_unit(),
group_by_names,
input_values_bound.clone(),
);

let input_values_bound = input_values_bound
.iter()
.map(|(s, _, f)| (*s, *f))
.collect::<Vec<_>>();
let (dp_clipped_relation, private_query) = clipped_relation
.gaussian_mechanisms(epsilon, delta, input_values_bound)
.into();

Ok(DPRelation::new(dp_clipped_relation, private_query))
}

Expand Down Expand Up @@ -241,21 +243,16 @@ impl PUPRelation {
.differentially_private_sums(
named_sums
.iter() // Convert &str to String
.map(|(_, s)| s.as_str())
.collect::<Vec<&str>>(),
.map(|(s1, s2)| (s1.as_str(), s2.as_str()))
.collect::<Vec<_>>(),
group_by_names,
epsilon,
delta,
)?
.into();
let names: HashMap<String, String> =
named_sums.into_iter().map(|(s1, s2)| (s2, s1)).collect();
let dp_relation = output_builder
.input(
dp_relation
.rename_fields(|n, _| names.get(n).unwrap_or(&n.to_string()).to_string()),
)
.build();
.input(dp_relation)
.build();
Ok(DPRelation::new(dp_relation, private_query))
}
}
Expand Down Expand Up @@ -481,7 +478,7 @@ mod tests {

let dp_relation = PUPRelation::try_from(reduce.input().clone())
.unwrap()
.differentially_private_sums(vec!["price"], vec![], epsilon, delta)
.differentially_private_sums(vec![("sum_price", "price")], vec![], epsilon, delta)
.unwrap();
dp_relation.display_dot().unwrap();
matches!(dp_relation.schema()[0].data_type(), DataType::Float(_));
Expand Down Expand Up @@ -527,11 +524,11 @@ mod tests {
pup_table.deref().clone().into(),
);
let relation = Relation::from(reduce.clone());
relation.display_dot().unwrap();
//relation.display_dot().unwrap();

let dp_relation = PUPRelation::try_from(reduce.input().clone())
.unwrap()
.differentially_private_sums(vec!["price"], vec!["item"], epsilon, delta)
.differentially_private_sums(vec![("sum_price", "price")], vec!["item"], epsilon, delta)
.unwrap();
dp_relation.display_dot().unwrap();
assert_eq!(dp_relation.schema().len(), 2);
Expand All @@ -545,6 +542,55 @@ mod tests {
_ = database.query(query).unwrap();
}

/// Exercises `differentially_private_sums` when the summed column (`a`) is also the
/// GROUP BY column, once for each way of declaring the grouping key on the `Reduce`.
///
/// The test makes no assertion on the result: it only requires that every `unwrap`
/// (conversion to `PUPRelation`, the DP rewriting, and `display_dot`) succeeds.
#[test]
fn test_differentially_private_sums_group_by_aggregate() {
    let (epsilon, delta) = (1., 1e-3);

    // Input table: three integer columns plus the privacy unit id and weight columns.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();

    // Case 1: the grouping key is given as an expression on the aggregated column.
    let reduce_with_expr: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        .input(table.clone())
        .build();
    let pup_relation = PUPRelation::try_from(reduce_with_expr.input().clone()).unwrap();
    let dp_relation = pup_relation
        .differentially_private_sums(vec![("sum_a", "a")], vec!["a"], epsilon, delta)
        .unwrap();
    dp_relation.display_dot().unwrap();

    // Case 2: the grouping key is given by column name.
    let reduce_with_column: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    let pup_relation = PUPRelation::try_from(reduce_with_column.input().clone()).unwrap();
    let dp_relation = pup_relation
        .differentially_private_sums(vec![("sum_a", "a")], vec!["a"], epsilon, delta)
        .unwrap();
    dp_relation.display_dot().unwrap();
}

#[test]
fn test_differentially_private_aggregates() {
let mut database = postgresql::test_database();
Expand Down Expand Up @@ -593,7 +639,6 @@ mod tests {
.unwrap();
dp_relation.display_dot().unwrap();
assert_eq!(dp_relation.schema().len(), 5);
println!("data_type = {}", dp_relation.data_type());
assert!(dp_relation
.data_type()
.is_subset_of(&DataType::structured(vec![
Expand Down Expand Up @@ -1101,4 +1146,80 @@ mod tests {
);
}

/// Checks `differentially_private_aggregates` on a `Reduce` whose SUM argument (`a`)
/// is also its GROUP BY column, for both ways of declaring the grouping key.
///
/// In both cases the expected private query is the Gaussian mechanism built from
/// `(epsilon, delta)` with sensitivity `10.`, and `sum_a` is expected to be typed as a
/// float in `[0, 1000]`.
#[test]
fn test_differentially_private_group_by_aggregate() {
    // Input table: three integer columns plus the privacy unit id and weight columns.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();
    // Floats are `Copy`, so `epsilon` and `delta` are passed by value without cloning.
    let (epsilon, delta) = (1., 1e-3);

    // Grouping key declared with `with_group_by_column`: the expected output holds
    // both the noisy sum and the grouping column `a`.
    let reduce: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    let (dp_relation, private_query) = reduce
        .differentially_private_aggregates(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    assert_eq!(
        private_query,
        PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 10.)
    );
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([
            ("sum_a", DataType::float_interval(0., 1000.)),
            ("a", DataType::integer_range(1..=10)),
        ])
    );

    // Grouping key declared with `group_by(expr!(a))`: the expected output holds
    // only the noisy sum.
    let reduce: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        // Last use of `table`, so it is moved instead of cloned.
        .input(table)
        .build();
    let (dp_relation, private_query) = reduce
        .differentially_private_aggregates(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    assert_eq!(
        private_query,
        PrivateQuery::gaussian_from_epsilon_delta_sensitivity(epsilon, delta, 10.)
    );
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([("sum_a", DataType::float_interval(0., 1000.))])
    );
}
}
59 changes: 59 additions & 0 deletions src/differential_privacy/group_by.rs
Original file line number Diff line number Diff line change
Expand Up @@ -707,4 +707,63 @@ mod tests {
println!("{:?}", city_keys);
assert_eq!(city_keys, correct_keys);
}

/// Checks `differentially_private_group_by` on a `Reduce` whose SUM argument (`a`) is
/// also its GROUP BY column, for both ways of declaring the grouping key.
///
/// Both variants are expected to yield `PrivateQuery::EpsilonDelta(epsilon, delta)` and
/// a relation holding only the grouping column `a`, typed as an integer in `1..=10`.
///
/// NOTE(review): `spefic` in the test name is presumably a typo for `specific`; the
/// name is left unchanged here.
#[test]
fn test_differentially_private_group_by_spefic_aggregate() {
    let (epsilon, delta) = (1., 1e-3);

    // Input table: three integer columns plus the privacy unit id and weight columns.
    let table: Relation = Relation::table()
        .name("table")
        .schema(
            Schema::builder()
                .with(("a", DataType::integer_range(1..=10)))
                .with(("b", DataType::integer_values([1, 2, 5, 6, 7, 8])))
                .with(("c", DataType::integer_range(5..=20)))
                .with((
                    PrivacyUnit::privacy_unit(),
                    DataType::integer_range(1..=100),
                ))
                .with((
                    PrivacyUnit::privacy_unit_weight(),
                    DataType::float_interval(0., 1.),
                ))
                .build(),
        )
        .size(100)
        .build();

    // Case 1: the grouping key is given as an expression on the aggregated column.
    let reduce_with_expr: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .group_by(expr!(a))
        .input(table.clone())
        .build();
    let (dp_relation, private_query) = reduce_with_expr
        .differentially_private_group_by(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    assert_eq!(private_query, PrivateQuery::EpsilonDelta(epsilon, delta));
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([("a", DataType::integer_range(1..=10))])
    );

    // Case 2: the grouping key is given by column name.
    let reduce_with_column: Reduce = Relation::reduce()
        .name("reduce_relation")
        .with(("sum_a".to_string(), AggregateColumn::sum("a")))
        .with_group_by_column("a")
        .input(table.clone())
        .build();
    let (dp_relation, private_query) = reduce_with_column
        .differentially_private_group_by(epsilon, delta)
        .unwrap()
        .into();
    dp_relation.display_dot().unwrap();
    assert_eq!(private_query, PrivateQuery::EpsilonDelta(epsilon, delta));
    assert_eq!(
        dp_relation.data_type(),
        DataType::structured([("a", DataType::integer_range(1..=10))])
    );
}
}
1 change: 0 additions & 1 deletion src/relation/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,6 @@ impl Ready<Reduce> for ReduceBuilder<WithInput> {
),
None => self.input.0,
};
input.display_dot().unwrap();
// Check that the First aggregate columns are in the GROUP BY
reduce.named_aggregates.iter()
.filter_map(|(_, agg)| matches!(agg.aggregate(), expr::aggregate::Aggregate::First).then_some(agg.column()))
Expand Down
17 changes: 16 additions & 1 deletion src/relation/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1684,7 +1684,7 @@ mod tests {
.with(("my_sum", Expr::sum(Expr::col("b"))))
.with(("my_a", Expr::first(expr!(3 * a))))
.group_by(expr!(3 * a))
.input(table)
.input(table.clone())
.build();
assert_eq!(
reduce.data_type(),
Expand All @@ -1693,6 +1693,21 @@ mod tests {
("my_a", DataType::integer_interval(0, 30)),
])
);

// GROUP BY and aggregates have the same argument
let reduce: Relation = Relation::reduce()
.with(("my_sum", Expr::sum(Expr::col("a"))))
.with(("my_a", Expr::first(expr!(a))))
.group_by(expr!(a))
.input(table)
.build();
assert_eq!(
reduce.data_type(),
DataType::structured([
("my_sum", DataType::integer_interval(0, 1000)),
("my_a", DataType::integer_interval(0, 10)),
])
);
}

#[test]
Expand Down
Loading