Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add logic for finding max_sum_per_partition candidates #484

Merged
merged 9 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions analysis/cross_partition_combiners.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ def _sum_metrics_to_data_dropped(
sum_metrics: metrics.SumMetrics, partition_keep_probability: float,
dp_metric: pipeline_dp.Metric) -> metrics.DataDropInfo:
"""Finds Data drop information from per-partition metrics."""
# TODO(dvadym): implement for Sum
assert dp_metric != pipeline_dp.Metrics.SUM, "Cross-partition metrics are not implemented for SUM"

# This function attributes the dropped data to the different reasons
# for which it was dropped.

# 1. linf/l0 contribution bounding
# Contribution bounding errors are negative, negate to keep data dropped
# to be positive.
linf_dropped = -sum_metrics.clipping_to_max_error # not correct for SUM
linf_dropped = -sum_metrics.clipping_to_max_error
if dp_metric == pipeline_dp.Metrics.SUM:
linf_dropped += sum_metrics.clipping_to_min_error
RamSaw marked this conversation as resolved.
Show resolved Hide resolved
l0_dropped = -sum_metrics.expected_l0_bounding_error

# 2. Partition selection (in case of private partition selection).
Expand Down
107 changes: 77 additions & 30 deletions analysis/parameter_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from numbers import Number

import pipeline_dp
from pipeline_dp import pipeline_backend
Expand All @@ -23,12 +24,10 @@

import dataclasses
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, List, Optional, Tuple, Union, Sequence
from enum import Enum
import numpy as np

from pipeline_dp import private_contribution_bounds


class MinimizingFunction(Enum):
ABSOLUTE_ERROR = 'absolute_error'
Expand Down Expand Up @@ -130,47 +129,82 @@ def _find_candidate_parameters(
"""
calculate_l0_param = parameters_to_tune.max_partitions_contributed
generate_linf = metric == pipeline_dp.Metrics.COUNT
generate_max_sum_per_partition = metric == pipeline_dp.Metrics.SUM
calculate_linf_param = (parameters_to_tune.max_contributions_per_partition
RamSaw marked this conversation as resolved.
Show resolved Hide resolved
and generate_linf)
calculate_sum_per_partition_param = (
parameters_to_tune.max_sum_per_partition and
generate_max_sum_per_partition)
l0_bounds = linf_bounds = None
max_sum_per_partition_bounds = min_sum_per_partition_bounds = None

if calculate_sum_per_partition_param:
assert not parameters_to_tune.min_sum_per_partition, "Tuning of min_sum_per_partition is not supported yet"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please put this check in _check_tune_args

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added, wdyt? afaiu options.parameters_to_tune.min_sum_per_partition should never be true because we don't support it in any cases.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agree that tuning min_sum_per_partition should be false for now

assert hist.linf_sum_contributions_histogram.bins[
0].lower >= 0, "max_sum_per_partition should not contain negative sums because min_sum_per_partition tuning is not supported yet and therefore tuning for max_sum_per_partition works only when linf_sum_contributions_histogram does not contain negative sums"
RamSaw marked this conversation as resolved.
Show resolved Hide resolved

if calculate_l0_param and calculate_linf_param:
max_candidates_per_parameter = int(math.sqrt(max_candidates))
l0_candidates = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram, max_candidates_per_parameter)
linf_candidates = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram, max_candidates_per_parameter)
l0_bounds, linf_bounds = [], []

# if linf or l0 has fewer candidates than requested then we can add more
# candidates for the other parameter.
if (len(linf_candidates) < max_candidates_per_parameter and
len(l0_candidates) == max_candidates_per_parameter):
l0_candidates = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram,
int(max_candidates / len(linf_candidates)))
elif (len(l0_candidates) < max_candidates_per_parameter and
len(linf_candidates) == max_candidates_per_parameter):
linf_candidates = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram,
int(max_candidates / len(l0_candidates)))

for l0 in l0_candidates:
for linf in linf_candidates:
l0_bounds.append(l0)
linf_bounds.append(linf)
l0_bounds, linf_bounds = _find_candidates_parameters_in_2d_grid(
hist.l0_contributions_histogram, hist.linf_contributions_histogram,
_find_candidates_constant_relative_step,
_find_candidates_constant_relative_step, max_candidates)
elif calculate_l0_param and calculate_sum_per_partition_param:
l0_bounds, max_sum_per_partition_bounds = _find_candidates_parameters_in_2d_grid(
hist.l0_contributions_histogram,
hist.linf_sum_contributions_histogram,
_find_candidates_constant_relative_step,
_find_candidates_bins_max_values_subsample, max_candidates)
min_sum_per_partition_bounds = [0] * len(max_sum_per_partition_bounds)
elif calculate_l0_param:
l0_bounds = _find_candidates_constant_relative_step(
hist.l0_contributions_histogram, max_candidates)
elif calculate_linf_param:
linf_bounds = _find_candidates_constant_relative_step(
hist.linf_contributions_histogram, max_candidates)
elif calculate_sum_per_partition_param:
max_sum_per_partition_bounds = _find_candidates_bins_max_values_subsample(
hist.linf_sum_contributions_histogram, max_candidates)
min_sum_per_partition_bounds = [0] * len(max_sum_per_partition_bounds)
else:
assert False, "Nothing to tune."

return analysis.MultiParameterConfiguration(
max_partitions_contributed=l0_bounds,
max_contributions_per_partition=linf_bounds)
max_contributions_per_partition=linf_bounds,
min_sum_per_partition=min_sum_per_partition_bounds,
max_sum_per_partition=max_sum_per_partition_bounds)


def _find_candidates_parameters_in_2d_grid(
RamSaw marked this conversation as resolved.
Show resolved Hide resolved
hist1: histograms.Histogram, hist2: histograms.Histogram,
find_candidates_func1: Callable[[histograms.Histogram, int],
Sequence[Number]],
find_candidates_func2: Callable[[histograms.Histogram, int],
Sequence[Number]], max_candidates: int):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: please add docstring for this function (it has pretty complicated API)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added, wdyt?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGTM, thanks

max_candidates_per_parameter = int(math.sqrt(max_candidates))
param1_candidates = find_candidates_func1(hist1,
max_candidates_per_parameter)
param2_candidates = find_candidates_func2(hist2,
max_candidates_per_parameter)
param1_bounds, param2_bounds = [], []

# if param1 or param2 has fewer candidates than requested then we can add
# more candidates for the other parameter.
if (len(param2_candidates) < max_candidates_per_parameter and
len(param1_candidates) == max_candidates_per_parameter):
param1_candidates = find_candidates_func1(
hist1, int(max_candidates / len(param2_candidates)))
elif (len(param1_candidates) < max_candidates_per_parameter and
len(param2_candidates) == max_candidates_per_parameter):
param2_candidates = find_candidates_func2(
hist2, int(max_candidates / len(param1_candidates)))

for param1 in param1_candidates:
for param2 in param2_candidates:
param1_bounds.append(param1)
param2_bounds.append(param2)

return param1_bounds, param2_bounds


def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
Expand Down Expand Up @@ -204,6 +238,18 @@ def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
return candidates


def _find_candidates_bins_max_values_subsample(
histogram: histograms.Histogram, max_candidates: int) -> List[float]:
"""Takes max values of histogram bins with constant step between each other.
"""
RamSaw marked this conversation as resolved.
Show resolved Hide resolved
max_candidates = min(max_candidates, len(histogram.bins))
ids = np.round(np.linspace(0, len(histogram.bins) - 1,
num=max_candidates)).astype(int)
bin_maximums = np.fromiter(map(lambda bin: bin.max, histogram.bins),
dtype=np.float)
return bin_maximums[ids].tolist()


def tune(col,
backend: pipeline_backend.PipelineBackend,
contribution_histograms: histograms.DatasetHistograms,
Expand Down Expand Up @@ -324,10 +370,11 @@ def _check_tune_args(options: TuneOptions, is_public_partitions: bool):
f"Tuning supports only one metric, but {metrics} given.")
else: # len(metrics) == 1
if metrics[0] not in [
pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT,
pipeline_dp.Metrics.SUM
]:
raise ValueError(
f"Tuning is supported only for Count and Privacy id count, but {metrics[0]} given."
f"Tuning is supported only for Count, Privacy id count and Sum, but {metrics[0]} given."
)

if options.function_to_minimize != MinimizingFunction.ABSOLUTE_ERROR:
Expand Down
43 changes: 34 additions & 9 deletions analysis/tests/cross_partition_combiners_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,18 @@ def _get_sum_metrics(sum=10.0):

class PerPartitionToCrossPartitionMetrics(parameterized.TestCase):

@parameterized.parameters(1, 0.25)
def test_metric_utility_count(self, keep_prob: float):
@parameterized.product(
metric=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM],
keep_prob=[1, 0.25])
def test_metric_utility(self, metric: pipeline_dp.Metric, keep_prob: float):
input = _get_sum_metrics()
output: metrics.MetricUtility = cross_partition_combiners._sum_metrics_to_metric_utility(
input,
pipeline_dp.Metrics.COUNT,
metric,
partition_keep_probability=keep_prob,
partition_weight=keep_prob)

self.assertEqual(output.metric, pipeline_dp.Metrics.COUNT)
self.assertEqual(output.metric, metric)
self.assertEqual(output.noise_kind, input.noise_kind)
self.assertEqual(output.noise_std, input.std_noise)

Expand Down Expand Up @@ -106,10 +108,14 @@ def test_per_partition_to_cross_partition_utility(
per_partition_utility = metrics.PerPartitionMetrics(
partition_selection_probability_to_keep=0.2,
raw_statistics=metrics.RawStatistics(privacy_id_count=10, count=15),
metric_errors=[_get_sum_metrics(),
_get_sum_metrics()])
metric_errors=[
_get_sum_metrics(),
_get_sum_metrics(),
_get_sum_metrics()
])
dp_metrics = [
pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT
pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT,
pipeline_dp.Metrics.SUM
]
cross_partition_combiners._per_partition_to_utility_report(
per_partition_utility,
Expand All @@ -123,7 +129,7 @@ def test_per_partition_to_cross_partition_utility(
mock_create_for_public_partitions.assert_not_called()
mock_create_for_private_partitions.assert_called_once_with(0.2)

self.assertEqual(mock_sum_metrics_to_metric_utility.call_count, 2)
self.assertEqual(mock_sum_metrics_to_metric_utility.call_count, 3)

@patch(
"analysis.cross_partition_combiners._partition_metrics_public_partitions"
Expand All @@ -150,7 +156,7 @@ def test_per_partition_to_cross_partition_utility_only_partition_selection(
mock_create_for_public_partitions.assert_not_called()
mock_create_for_private_partitions.assert_called_once_with(0.5)

def test_sum_metrics_to_data_dropped(self):
def test_sum_metrics_to_data_dropped_count(self):
input = _get_sum_metrics()
output = cross_partition_combiners._sum_metrics_to_data_dropped(
input,
Expand All @@ -160,6 +166,25 @@ def test_sum_metrics_to_data_dropped(self):
output,
metrics.DataDropInfo(l0=2.0, linf=5.0, partition_selection=1.5))

def test_sum_metrics_to_data_dropped_sum(self):
    # Per-partition metrics for a SUM aggregation where both clipping
    # directions contribute error: min-clipping +3.0, max-clipping -5.0.
    input = metrics.SumMetrics(aggregation=pipeline_dp.Metrics.SUM,
                               sum=12,
                               clipping_to_min_error=3.0,
                               clipping_to_max_error=-5.0,
                               expected_l0_bounding_error=-2.0,
                               std_l0_bounding_error=3.0,
                               std_noise=4.0,
                               noise_kind=pipeline_dp.NoiseKind.LAPLACE)

    output = cross_partition_combiners._sum_metrics_to_data_dropped(
        input,
        partition_keep_probability=0.5,
        dp_metric=pipeline_dp.Metrics.SUM)

    # Expected values:
    #   l0 = -expected_l0_bounding_error = 2.0.
    #   linf = -clipping_to_max_error + clipping_to_min_error = 5.0 + 3.0
    #     = 8.0 (for SUM the min-clipping error is also counted as dropped).
    # partition_selection = 1.0 presumably reflects the remaining data lost
    # to the 0.5 keep probability — TODO confirm against the (not fully
    # visible) formula in _sum_metrics_to_data_dropped.
    self.assertEqual(
        output,
        metrics.DataDropInfo(l0=2.0, linf=8.0, partition_selection=1.0))

def test_sum_metrics_to_data_dropped_public_partition(self):
input = _get_sum_metrics()
output = cross_partition_combiners._sum_metrics_to_data_dropped(
Expand Down
Loading
Loading