From 6e922adf1bb53aae7b0bae0e07fd97963c0cbd4c Mon Sep 17 00:00:00 2001 From: Mikhail Pravilov Date: Tue, 29 Aug 2023 10:34:15 +0200 Subject: [PATCH] Add logic for finding `max_sum_per_partition` candidates (#484) --- analysis/cross_partition_combiners.py | 4 +- analysis/parameter_tuning.py | 146 ++++++++--- .../tests/cross_partition_combiners_test.py | 98 ++++++-- analysis/tests/parameter_tuning_test.py | 229 +++++++++++++++--- 4 files changed, 386 insertions(+), 91 deletions(-) diff --git a/analysis/cross_partition_combiners.py b/analysis/cross_partition_combiners.py index c7ff7e7e..aed48733 100644 --- a/analysis/cross_partition_combiners.py +++ b/analysis/cross_partition_combiners.py @@ -25,8 +25,6 @@ def _sum_metrics_to_data_dropped( sum_metrics: metrics.SumMetrics, partition_keep_probability: float, dp_metric: pipeline_dp.Metric) -> metrics.DataDropInfo: """Finds Data drop information from per-partition metrics.""" - # TODO(dvadym): implement for Sum - assert dp_metric != pipeline_dp.Metrics.SUM, "Cross-partition metrics are not implemented for SUM" # This function attributed the data that is dropped, to different reasons # how they are dropped. @@ -34,7 +32,7 @@ def _sum_metrics_to_data_dropped( # 1. linf/l0 contribution bounding # Contribution bounding errors are negative, negate to keep data dropped # to be positive. - linf_dropped = -sum_metrics.clipping_to_max_error # not correct for SUM + linf_dropped = sum_metrics.clipping_to_min_error - sum_metrics.clipping_to_max_error l0_dropped = -sum_metrics.expected_l0_bounding_error # 2. Partition selection (in case of private partition selection). diff --git a/analysis/parameter_tuning.py b/analysis/parameter_tuning.py index cc726e59..fc77b289 100644 --- a/analysis/parameter_tuning.py +++ b/analysis/parameter_tuning.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import math +from numbers import Number import pipeline_dp from pipeline_dp import pipeline_backend @@ -23,12 +25,10 @@ import dataclasses from dataclasses import dataclass -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union, Sequence from enum import Enum import numpy as np -from pipeline_dp import private_contribution_bounds - class MinimizingFunction(Enum): ABSOLUTE_ERROR = 'absolute_error' @@ -117,7 +117,7 @@ def _find_candidate_parameters( parameters_to_tune: ParametersToTune, metric: Optional[pipeline_dp.Metric], max_candidates: int) -> analysis.MultiParameterConfiguration: - """Finds candidates for l0 and/or l_inf parameters. + """Finds candidates for l0, l_inf and max_sum_per_partition_bounds parameters. Args: hist: dataset contribution histogram. @@ -129,48 +129,106 @@ def _find_candidate_parameters( heuristically chosen value, better to adjust it for your use-case. 
""" calculate_l0_param = parameters_to_tune.max_partitions_contributed - generate_linf = metric == pipeline_dp.Metrics.COUNT - calculate_linf_param = (parameters_to_tune.max_contributions_per_partition - and generate_linf) + generate_linf_count = metric == pipeline_dp.Metrics.COUNT + generate_max_sum_per_partition = metric == pipeline_dp.Metrics.SUM + calculate_linf_count = (parameters_to_tune.max_contributions_per_partition + and generate_linf_count) + calculate_sum_per_partition_param = ( + parameters_to_tune.max_sum_per_partition and + generate_max_sum_per_partition) l0_bounds = linf_bounds = None + max_sum_per_partition_bounds = min_sum_per_partition_bounds = None + + if calculate_sum_per_partition_param: + if hist.linf_sum_contributions_histogram.bins[0].lower >= 0: + logging.warning( + "max_sum_per_partition should not contain negative sums because min_sum_per_partition tuning is not supported yet and therefore tuning for max_sum_per_partition works only when linf_sum_contributions_histogram does not negative sums" + ) - if calculate_l0_param and calculate_linf_param: - max_candidates_per_parameter = int(math.sqrt(max_candidates)) - l0_candidates = _find_candidates_constant_relative_step( - hist.l0_contributions_histogram, max_candidates_per_parameter) - linf_candidates = _find_candidates_constant_relative_step( - hist.linf_contributions_histogram, max_candidates_per_parameter) - l0_bounds, linf_bounds = [], [] - - # if linf or l0 has fewer candidates than requested then we can add more - # candidates for the other parameter. - if (len(linf_candidates) < max_candidates_per_parameter and - len(l0_candidates) == max_candidates_per_parameter): - l0_candidates = _find_candidates_constant_relative_step( - hist.l0_contributions_histogram, - int(max_candidates / len(linf_candidates))) - elif (len(l0_candidates) < max_candidates_per_parameter and - len(linf_candidates) == max_candidates_per_parameter): - linf_candidates = _find_candidates_constant_relative_step( - hist.linf_contributions_histogram, - int(max_candidates / len(l0_candidates))) - - for l0 in l0_candidates: - for linf in linf_candidates: - l0_bounds.append(l0) - linf_bounds.append(linf) + if calculate_l0_param and calculate_linf_count: + l0_bounds, linf_bounds = _find_candidates_parameters_in_2d_grid( + hist.l0_contributions_histogram, hist.linf_contributions_histogram, + _find_candidates_constant_relative_step, + _find_candidates_constant_relative_step, max_candidates) + elif calculate_l0_param and calculate_sum_per_partition_param: + l0_bounds, max_sum_per_partition_bounds = _find_candidates_parameters_in_2d_grid( + hist.l0_contributions_histogram, + hist.linf_sum_contributions_histogram, + _find_candidates_constant_relative_step, + _find_candidates_bins_max_values_subsample, max_candidates) + min_sum_per_partition_bounds = [0] * len(max_sum_per_partition_bounds) elif calculate_l0_param: l0_bounds = _find_candidates_constant_relative_step( hist.l0_contributions_histogram, max_candidates) - elif calculate_linf_param: + elif calculate_linf_count: linf_bounds = _find_candidates_constant_relative_step( hist.linf_contributions_histogram, max_candidates) + elif calculate_sum_per_partition_param: + max_sum_per_partition_bounds = _find_candidates_bins_max_values_subsample( + hist.linf_sum_contributions_histogram, max_candidates) + min_sum_per_partition_bounds = [0] * len(max_sum_per_partition_bounds) else: assert False, "Nothing to tune." 
     return analysis.MultiParameterConfiguration(
         max_partitions_contributed=l0_bounds,
-        max_contributions_per_partition=linf_bounds)
+        max_contributions_per_partition=linf_bounds,
+        min_sum_per_partition=min_sum_per_partition_bounds,
+        max_sum_per_partition=max_sum_per_partition_bounds)
+
+
+def _find_candidates_parameters_in_2d_grid(
+        hist1: histograms.Histogram, hist2: histograms.Histogram,
+        find_candidates_func1: Callable[[histograms.Histogram, int],
+                                        Sequence[Number]],
+        find_candidates_func2: Callable[[histograms.Histogram, int],
+                                        Sequence[Number]],
+        max_candidates: int) -> Tuple[Sequence[Number], Sequence[Number]]:
+    """Finds candidates for 2 parameters.
+
+    When 2 parameters are tuned, their candidates form a 2-dimensional grid.
+    If one parameter has fewer than sqrt(max_candidates) candidates, more
+    candidates can be generated for the other parameter. This function
+    implements that logic.
+
+    Args:
+        hist1: histogram of the distribution of the first parameter.
+        hist2: histogram of the distribution of the second parameter.
+        find_candidates_func1: function that, given hist1 and a maximum number
+          of candidates, finds the candidates for the first parameter.
+        find_candidates_func2: function that, given hist2 and a maximum number
+          of candidates, finds the candidates for the second parameter.
+        max_candidates: maximum number of candidates to produce.
+    Returns:
+        Two sequences of equal length, at most max_candidates long, whose
+        i-th elements form the i-th candidate pair for parameters 1 and 2.
+    """
+
+    max_candidates_per_parameter = int(math.sqrt(max_candidates))
+    param1_candidates = find_candidates_func1(hist1,
+                                              max_candidates_per_parameter)
+    param2_candidates = find_candidates_func2(hist2,
+                                              max_candidates_per_parameter)
+    param1_bounds, param2_bounds = [], []
+
+    # If param1 or param2 has fewer candidates than requested, we can add
+    # more candidates for the other parameter.
+    if (len(param2_candidates) < max_candidates_per_parameter and
+            len(param1_candidates) == max_candidates_per_parameter):
+        param1_candidates = find_candidates_func1(
+            hist1, int(max_candidates / len(param2_candidates)))
+    elif (len(param1_candidates) < max_candidates_per_parameter and
+          len(param2_candidates) == max_candidates_per_parameter):
+        param2_candidates = find_candidates_func2(
+            hist2, int(max_candidates / len(param1_candidates)))
+
+    for param1 in param1_candidates:
+        for param2 in param2_candidates:
+            param1_bounds.append(param1)
+            param2_bounds.append(param2)
+
+    return param1_bounds, param2_bounds
 
 
 def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
@@ -204,6 +262,17 @@ def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
     return candidates
 
 
+def _find_candidates_bins_max_values_subsample(
+        histogram: histograms.Histogram, max_candidates: int) -> List[float]:
+    """Returns max values of histogram bins, subsampled at a constant index step."""
+    max_candidates = min(max_candidates, len(histogram.bins))
+    ids = np.round(np.linspace(0, len(histogram.bins) - 1,
+                               num=max_candidates)).astype(int)
+    bin_maximums = np.fromiter(map(lambda bin: bin.max, histogram.bins),
+                               dtype=float)
+    return bin_maximums[ids].tolist()
+
+
 def tune(col,
          backend: pipeline_backend.PipelineBackend,
          contribution_histograms: histograms.DatasetHistograms,
@@ -324,12 +393,17 @@ def _check_tune_args(options: TuneOptions, is_public_partitions: bool):
             f"Tuning supports only one metric, but {metrics} given.")
     else:  # len(metrics) == 1
         if metrics[0] not in [
-                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
+                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT,
+                pipeline_dp.Metrics.SUM
         ]:
             raise ValueError(
-                f"Tuning is supported only for Count and Privacy id count, but {metrics[0]} given."
+                f"Tuning is supported only for Count, Privacy id count and Sum, but {metrics[0]} given."
) + if options.parameters_to_tune.min_sum_per_partition: + raise ValueError( + "Tuning of min_sum_per_partition is not supported yet.") + if options.function_to_minimize != MinimizingFunction.ABSOLUTE_ERROR: raise NotImplementedError( f"Only {MinimizingFunction.ABSOLUTE_ERROR} is implemented.") diff --git a/analysis/tests/cross_partition_combiners_test.py b/analysis/tests/cross_partition_combiners_test.py index 74593e39..abe88fa4 100644 --- a/analysis/tests/cross_partition_combiners_test.py +++ b/analysis/tests/cross_partition_combiners_test.py @@ -24,10 +24,10 @@ import pipeline_dp -def _get_sum_metrics(sum=10.0): - return metrics.SumMetrics(aggregation=pipeline_dp.Metrics.SUM, +def _get_default_sum_metrics(metric=pipeline_dp.Metrics.COUNT, sum=10.0): + return metrics.SumMetrics(aggregation=metric, sum=sum, - clipping_to_min_error=3.0, + clipping_to_min_error=0.0, clipping_to_max_error=-5.0, expected_l0_bounding_error=-2.0, std_l0_bounding_error=3.0, @@ -37,38 +37,53 @@ def _get_sum_metrics(sum=10.0): class PerPartitionToCrossPartitionMetrics(parameterized.TestCase): - @parameterized.parameters(1, 0.25) - def test_metric_utility_count(self, keep_prob: float): - input = _get_sum_metrics() + @parameterized.product( + metric=[pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.SUM], + keep_prob=[1, 0.25]) + def test_metric_utility(self, metric: pipeline_dp.Metric, keep_prob: float): + input = metrics.SumMetrics(aggregation=metric, + sum=10.0, + clipping_to_min_error=0.0, + clipping_to_max_error=-5.0, + expected_l0_bounding_error=-2.0, + std_l0_bounding_error=3.0, + std_noise=4.0, + noise_kind=pipeline_dp.NoiseKind.LAPLACE) output: metrics.MetricUtility = cross_partition_combiners._sum_metrics_to_metric_utility( input, - pipeline_dp.Metrics.COUNT, + metric, partition_keep_probability=keep_prob, partition_weight=keep_prob) - self.assertEqual(output.metric, pipeline_dp.Metrics.COUNT) + self.assertEqual(output.metric, metric) self.assertEqual(output.noise_kind, input.noise_kind) self.assertEqual(output.noise_std, input.std_noise) # Check absolute_error. abs_error: metrics.ValueErrors = output.absolute_error - self.assertEqual(abs_error.mean, -4 * keep_prob) - self.assertEqual(abs_error.variance, 25 * keep_prob) + # (expected_l0_bounding_error + clipping_to_min_error + + # clipping_to_max_error) = -7 + self.assertEqual(abs_error.mean, (-2 + 0 + (-5)) * keep_prob) + # (std_l0_bounding_error**2 + std_noise**2) = 25 + self.assertEqual(abs_error.variance, (3**2 + 4**2) * keep_prob) + # sqrt(mean ** 2 + variance) self.assertAlmostEqual(abs_error.rmse, - math.sqrt(4 * 4 + 25) * keep_prob, + math.sqrt(7**2 + 25) * keep_prob, delta=1e-12) bounding_errors = abs_error.bounding_errors - self.assertEqual(bounding_errors.l0, - metrics.MeanVariance(-2.0 * keep_prob, 9 * keep_prob)) - self.assertEqual(bounding_errors.linf_min, 3.0 * keep_prob) + self.assertEqual( + bounding_errors.l0, + metrics.MeanVariance(-2.0 * keep_prob, (3**2) * keep_prob)) + self.assertEqual(bounding_errors.linf_min, 0.0 * keep_prob) self.assertEqual(bounding_errors.linf_max, -5.0 * keep_prob) # Check relative_error. 
expected_rel_error = abs_error.to_relative(input.sum) self.assertEqual(output.relative_error, expected_rel_error) + # mean / sum = -7 / 10 self.assertAlmostEqual(output.relative_error.mean, - -0.4 * keep_prob, + -0.7 * keep_prob, delta=1e-12) @parameterized.parameters(False, True) @@ -106,10 +121,15 @@ def test_per_partition_to_cross_partition_utility( per_partition_utility = metrics.PerPartitionMetrics( partition_selection_probability_to_keep=0.2, raw_statistics=metrics.RawStatistics(privacy_id_count=10, count=15), - metric_errors=[_get_sum_metrics(), - _get_sum_metrics()]) + metric_errors=[ + _get_default_sum_metrics( + metric=pipeline_dp.Metrics.PRIVACY_ID_COUNT), + _get_default_sum_metrics(metric=pipeline_dp.Metrics.COUNT), + _get_default_sum_metrics(metric=pipeline_dp.Metrics.SUM) + ]) dp_metrics = [ - pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT + pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT, + pipeline_dp.Metrics.SUM ] cross_partition_combiners._per_partition_to_utility_report( per_partition_utility, @@ -123,7 +143,7 @@ def test_per_partition_to_cross_partition_utility( mock_create_for_public_partitions.assert_not_called() mock_create_for_private_partitions.assert_called_once_with(0.2) - self.assertEqual(mock_sum_metrics_to_metric_utility.call_count, 2) + self.assertEqual(mock_sum_metrics_to_metric_utility.call_count, 3) @patch( "analysis.cross_partition_combiners._partition_metrics_public_partitions" @@ -150,8 +170,15 @@ def test_per_partition_to_cross_partition_utility_only_partition_selection( mock_create_for_public_partitions.assert_not_called() mock_create_for_private_partitions.assert_called_once_with(0.5) - def test_sum_metrics_to_data_dropped(self): - input = _get_sum_metrics() + def test_sum_metrics_to_data_dropped_count(self): + input = metrics.SumMetrics(aggregation=pipeline_dp.Metrics.COUNT, + sum=10.0, + clipping_to_min_error=0.0, + clipping_to_max_error=-5.0, + expected_l0_bounding_error=-2.0, + std_l0_bounding_error=3.0, + std_noise=4.0, + noise_kind=pipeline_dp.NoiseKind.LAPLACE) output = cross_partition_combiners._sum_metrics_to_data_dropped( input, partition_keep_probability=0.5, @@ -160,8 +187,27 @@ def test_sum_metrics_to_data_dropped(self): output, metrics.DataDropInfo(l0=2.0, linf=5.0, partition_selection=1.5)) + def test_sum_metrics_to_data_dropped_sum(self): + input = metrics.SumMetrics(aggregation=pipeline_dp.Metrics.SUM, + sum=12, + clipping_to_min_error=3.0, + clipping_to_max_error=-5.0, + expected_l0_bounding_error=-2.0, + std_l0_bounding_error=3.0, + std_noise=4.0, + noise_kind=pipeline_dp.NoiseKind.LAPLACE) + + output = cross_partition_combiners._sum_metrics_to_data_dropped( + input, + partition_keep_probability=0.5, + dp_metric=pipeline_dp.Metrics.SUM) + + self.assertEqual( + output, + metrics.DataDropInfo(l0=2.0, linf=8.0, partition_selection=1.0)) + def test_sum_metrics_to_data_dropped_public_partition(self): - input = _get_sum_metrics() + input = _get_default_sum_metrics(metric=pipeline_dp.Metrics.COUNT) output = cross_partition_combiners._sum_metrics_to_data_dropped( input, partition_keep_probability=1.0, @@ -313,7 +359,7 @@ def test_create_report_wo_mocks(self): per_partition_metrics = metrics.PerPartitionMetrics( partition_selection_probability_to_keep=prob_keep, raw_statistics=metrics.RawStatistics(privacy_id_count=3, count=9), - metric_errors=[_get_sum_metrics(sum=10.0)]) + metric_errors=[_get_default_sum_metrics(sum=10.0)]) sum_actual, utility_report, weight = combiner.create_accumulator( 
per_partition_metrics) self.assertEqual(sum_actual, (10.0,)) @@ -328,7 +374,7 @@ def test_create_report_partition_size_is_used_as_weight_wo_mocks(self): per_partition_metrics = metrics.PerPartitionMetrics( partition_selection_probability_to_keep=0.2, raw_statistics=metrics.RawStatistics(privacy_id_count=3, count=9), - metric_errors=[_get_sum_metrics(sum=5.0)]) + metric_errors=[_get_default_sum_metrics(sum=5.0)]) _, _, weight = combiner.create_accumulator(per_partition_metrics) self.assertEqual(weight, 5.0) @@ -346,7 +392,9 @@ def test_create_report_with_mocks(self, per_partition_metrics = metrics.PerPartitionMetrics( partition_selection_probability_to_keep=prob_keep, raw_statistics=metrics.RawStatistics(privacy_id_count=3, count=9), - metric_errors=[_get_sum_metrics()]) + metric_errors=[ + _get_default_sum_metrics(metric=pipeline_dp.Metrics.COUNT) + ]) combiner.create_accumulator(per_partition_metrics) mock_per_partition_to_utility_report.assert_called_once_with( per_partition_metrics, dp_metrics, public_partitions, prob_keep) diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py index c9b54530..eaa7aaf5 100644 --- a/analysis/tests/parameter_tuning_test.py +++ b/analysis/tests/parameter_tuning_test.py @@ -24,27 +24,38 @@ from analysis import parameter_tuning from pipeline_dp.dataset_histograms import histograms from pipeline_dp.dataset_histograms import computing_histograms +from pipeline_dp.dataset_histograms.histograms import FrequencyBin -def _get_aggregate_params(): - # Limit contributions to 1 per partition, contribution error will be half of the count. +def _get_aggregate_params(metrics: List[pipeline_dp.Metric]): return pipeline_dp.AggregateParams( noise_kind=pipeline_dp.NoiseKind.GAUSSIAN, - metrics=[pipeline_dp.Metrics.COUNT], - max_partitions_contributed=1, - max_contributions_per_partition=1) - - -def _get_tune_options(): + metrics=metrics, + max_partitions_contributed=1, # does not matter + max_contributions_per_partition=1, # does not matter + min_value=0, # does not matter + max_value=1) # does not matter + + +def _get_tune_options( + metrics: List[pipeline_dp.Metric], + parameters_to_tune: parameter_tuning.ParametersToTune = parameter_tuning. 
+    ParametersToTune(max_partitions_contributed=True,
+                     max_contributions_per_partition=True)
+) -> parameter_tuning.TuneOptions:
     return parameter_tuning.TuneOptions(
         epsilon=1,
         delta=1e-10,
-        aggregate_params=_get_aggregate_params(),
+        aggregate_params=_get_aggregate_params(metrics),
         function_to_minimize=parameter_tuning.MinimizingFunction.ABSOLUTE_ERROR,
-        parameters_to_tune=parameter_tuning.ParametersToTune(True, True),
+        parameters_to_tune=parameters_to_tune,
         number_of_parameter_candidates=3)
 
 
+def _frequency_bin(max_value: float = 0.0, lower: float = 0.0) -> FrequencyBin:
+    return FrequencyBin(max=max_value, lower=lower, count=None, sum=None)
+
+
 class ParameterTuning(parameterized.TestCase):
 
     def test_find_candidate_parameters_maximum_number_of_candidates_is_respected_when_both_parameters_needs_to_be_tuned(
@@ -156,8 +167,8 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
             max_candidates=5,
             # ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4]
             expected_candidates=[1, 6, 32, 178, 1000]))
-    def test_find_candidate_parameters(self, max_value, max_candidates,
-                                       expected_candidates):
+    def test_find_candidate_parameters_count(self, max_value, max_candidates,
+                                             expected_candidates):
         mock_l0_histogram = histograms.Histogram(None, None)
 
         mock_l0_histogram.max_value = mock.Mock(return_value=max_value)
@@ -176,6 +187,106 @@ def test_find_candidate_parameters(self, max_value, max_candidates,
         self.assertEqual(expected_candidates,
                          candidates.max_partitions_contributed)
 
+    @parameterized.named_parameters(
+        dict(testcase_name='bin_max_values=[1], returns [1]',
+             bins=[_frequency_bin(max_value=1)],
+             max_candidates=1000,
+             expected_candidates=[1]),
+        dict(testcase_name='max_candidates=1, returns max value of the first'
+             ' bin',
+             bins=[
+                 _frequency_bin(max_value=0.1),
+                 _frequency_bin(max_value=0.2),
+                 _frequency_bin(max_value=0.3)
+             ],
+             max_candidates=1,
+             expected_candidates=[0.1]),
+        dict(testcase_name='max_candidates=2, returns max values of the first'
+             ' and last bin',
+             bins=[
+                 _frequency_bin(max_value=0.1),
+                 _frequency_bin(max_value=0.2),
+                 _frequency_bin(max_value=0.3)
+             ],
+             max_candidates=2,
+             expected_candidates=[0.1, 0.3]),
+        dict(testcase_name='max_candidates is equal to number of bins, returns'
+             ' all bin max values as candidates',
+             bins=[
+                 _frequency_bin(max_value=0.1),
+                 _frequency_bin(max_value=0.2),
+                 _frequency_bin(max_value=0.3)
+             ],
+             max_candidates=3,
+             expected_candidates=[0.1, 0.2, 0.3]),
+        dict(testcase_name='max_candidates is larger than number of bins,'
+             ' returns all bin max values as candidates',
+             bins=[
+                 _frequency_bin(max_value=0.1),
+                 _frequency_bin(max_value=0.2),
+                 _frequency_bin(max_value=0.3)
+             ],
+             max_candidates=100,
+             expected_candidates=[0.1, 0.2, 0.3]),
+        dict(
+            testcase_name='max_candidates is smaller than number of bins,'
+            ' returns uniformly distributed subsample of bin'
+            ' max values',
+            bins=[_frequency_bin(max_value=i) for i in range(10)],
+            max_candidates=5,
+            # Takes bin indices with step (10 - 1) / (5 - 1) = 2.25, i.e.
+            # [0, 2.25, 4.5, 6.75, 9], then rounds it, i.e.
we get + # [0, 2, 4, 7, 9] indices of bins to take, they equal to max + # values of these bins + expected_candidates=[0, 2, 4, 7, 9]), + ) + def test_find_candidate_parameters_sum(self, bins, max_candidates, + expected_candidates): + mock_linf_sum_contributions_histogram = histograms.Histogram(None, bins) + mock_histograms = histograms.DatasetHistograms( + None, None, None, mock_linf_sum_contributions_histogram, None, None) + parameters_to_tune = parameter_tuning.ParametersToTune( + max_partitions_contributed=False, + min_sum_per_partition=False, + max_sum_per_partition=True) + + candidates = parameter_tuning._find_candidate_parameters( + mock_histograms, + parameters_to_tune, + pipeline_dp.Metrics.SUM, + max_candidates=max_candidates) + + self.assertEqual(expected_candidates, candidates.max_sum_per_partition) + self.assertEqual([0] * len(expected_candidates), + candidates.min_sum_per_partition) + + def test_find_candidate_parameters_both_l0_and_linf_sum_to_be_tuned(self): + mock_l0_histogram = histograms.Histogram(None, None) + mock_l0_histogram.max_value = mock.Mock(return_value=6) + mock_linf_sum_contributions_histogram = histograms.Histogram( + None, [ + _frequency_bin(max_value=1), + _frequency_bin(max_value=2), + _frequency_bin(max_value=3) + ]) + + mock_histograms = histograms.DatasetHistograms( + mock_l0_histogram, None, None, + mock_linf_sum_contributions_histogram, None, None) + parameters_to_tune = parameter_tuning.ParametersToTune( + max_partitions_contributed=True, + min_sum_per_partition=False, + max_sum_per_partition=True) + + candidates = parameter_tuning._find_candidate_parameters( + mock_histograms, + parameters_to_tune, + pipeline_dp.Metrics.SUM, + max_candidates=5) + self.assertEqual([1, 1, 6, 6], candidates.max_partitions_contributed) + self.assertEqual([1, 3, 1, 3], candidates.max_sum_per_partition) + self.assertEqual([0, 0, 0, 0], candidates.min_sum_per_partition) + @parameterized.named_parameters( dict( testcase_name='COUNT', @@ -187,6 +298,11 @@ def test_find_candidate_parameters(self, max_value, max_candidates, metric=pipeline_dp.Metrics.PRIVACY_ID_COUNT, expected_generate_linf=False, ), + dict( + testcase_name='SUM', + metric=pipeline_dp.Metrics.SUM, + expected_generate_linf=False, + ), dict( testcase_name='No metric (select partition)', metric=None, @@ -241,7 +357,11 @@ def test_tune_count(self): computing_histograms.compute_dataset_histograms( input, data_extractors, pipeline_dp.LocalBackend()))[0] - tune_options = _get_tune_options() + tune_options = _get_tune_options( + [pipeline_dp.Metrics.COUNT], + parameter_tuning.ParametersToTune( + max_partitions_contributed=True, + max_contributions_per_partition=True)) # Act. result = parameter_tuning.tune(input, pipeline_dp.LocalBackend(), @@ -265,6 +385,48 @@ def test_tune_count(self): self.assertEqual(utility_reports[0].metric_errors[0].metric, pipeline_dp.Metrics.COUNT) + def test_tune_sum(self): + # Arrange. + # Generate dataset, with 10 privacy units, each of them contribute to + # the same partition with value equal to its id. 
+ input = [(i, f"pk0", i) for i in range(10)] + public_partitions = [f"pk{i}" for i in range(10)] + data_extractors = pipeline_dp.DataExtractors( + privacy_id_extractor=lambda x: x[0], + partition_extractor=lambda x: x[1], + value_extractor=lambda x: x[2]) + + contribution_histograms = list( + computing_histograms.compute_dataset_histograms( + input, data_extractors, pipeline_dp.LocalBackend()))[0] + + tune_options = _get_tune_options([pipeline_dp.Metrics.SUM], + parameter_tuning.ParametersToTune( + max_partitions_contributed=True, + max_sum_per_partition=True)) + + # Act. + result = parameter_tuning.tune(input, pipeline_dp.LocalBackend(), + contribution_histograms, tune_options, + data_extractors, public_partitions) + + # Assert. + tune_result, per_partition_utility_analysis = result + per_partition_utility_analysis = list(per_partition_utility_analysis) + self.assertLen(per_partition_utility_analysis, 10) + + tune_result = list(tune_result)[0] + + self.assertEqual(tune_options, tune_result.options) + self.assertEqual(contribution_histograms, + tune_result.contribution_histograms) + utility_reports = tune_result.utility_reports + self.assertLen(utility_reports, 1) + self.assertIsInstance(utility_reports[0], metrics.UtilityReport) + self.assertLen(utility_reports[0].metric_errors, 1) + self.assertEqual(utility_reports[0].metric_errors[0].metric, + pipeline_dp.Metrics.SUM) + def test_select_partitions(self): # Arrange. # Generate dataset, with 10 privacy units, 5 of them contribute to @@ -279,9 +441,7 @@ def test_select_partitions(self): computing_histograms.compute_dataset_histograms( input, data_extractors, pipeline_dp.LocalBackend()))[0] - tune_options = _get_tune_options() - # Setting metrics to empty list makes running only partition selectoin. - tune_options.aggregate_params.metrics = [] + tune_options = _get_tune_options(metrics=[]) # Act. result = parameter_tuning.tune(input, pipeline_dp.LocalBackend(), @@ -320,10 +480,9 @@ def test_tune_privacy_id_count(self): computing_histograms.compute_dataset_histograms( input, data_extractors, pipeline_dp.LocalBackend()))[0] - tune_options = _get_tune_options() - tune_options.aggregate_params.metrics = [ - pipeline_dp.Metrics.PRIVACY_ID_COUNT - ] + tune_options = _get_tune_options( + [pipeline_dp.Metrics.PRIVACY_ID_COUNT], + parameter_tuning.ParametersToTune(max_partitions_contributed=True)) # Act. 
result, _ = parameter_tuning.tune(input, pipeline_dp.LocalBackend(), @@ -354,17 +513,16 @@ def test_tune_privacy_id_count(self): pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT ], is_public_partitions=True), - dict( - testcase_name="Mean is not supported", - error_msg="Tuning is supported only for Count and Privacy id count", - metrics=[pipeline_dp.Metrics.MEAN], - is_public_partitions=False), + dict(testcase_name="Mean is not supported", + error_msg= + "Tuning is supported only for Count, Privacy id count and Sum", + metrics=[pipeline_dp.Metrics.MEAN], + is_public_partitions=False), ) def test_tune_params_validation(self, error_msg, metrics: List[pipeline_dp.Metric], is_public_partitions: bool): - tune_options = _get_tune_options() - tune_options.aggregate_params.metrics = metrics + tune_options = _get_tune_options(metrics) contribution_histograms = histograms.DatasetHistograms( None, None, None, None, None, None) data_extractors = pipeline_dp.DataExtractors( @@ -375,6 +533,23 @@ def test_tune_params_validation(self, error_msg, contribution_histograms, tune_options, data_extractors, public_partitions) + def test_tune_min_sum_per_partition_is_not_supported(self): + tune_options = _get_tune_options([pipeline_dp.Metrics.SUM], + parameter_tuning.ParametersToTune( + max_partitions_contributed=True, + min_sum_per_partition=True, + max_sum_per_partition=True)) + contribution_histograms = histograms.DatasetHistograms( + None, None, None, None, None, None) + data_extractors = pipeline_dp.DataExtractors( + privacy_id_extractor=lambda _: 0, partition_extractor=lambda _: 0) + with self.assertRaisesRegex( + ValueError, + "Tuning of min_sum_per_partition is not supported yet."): + parameter_tuning.tune(input, pipeline_dp.LocalBackend(), + contribution_histograms, tune_options, + data_extractors) + if __name__ == '__main__': absltest.main()
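
The candidate-generation logic this patch adds can be sanity-checked outside the pipeline. Below is a minimal standalone sketch, not part of the patch, that mirrors _find_candidates_bins_max_values_subsample and _find_candidates_parameters_in_2d_grid; the helper names subsample_bin_maxima and pair_in_2d_grid, and the plain Python lists standing in for histograms.Histogram objects, are illustrative assumptions rather than PipelineDP API.

    # Standalone sketch of the candidate generation added in this patch.
    # Hypothetical inputs: bin maxima of linf_sum_contributions_histogram and a
    # precomputed list of l0 candidates; not the real Histogram objects.
    import math
    import numpy as np


    def subsample_bin_maxima(bin_maxima, max_candidates):
        """Picks at most max_candidates bin max values at evenly spaced indices."""
        max_candidates = min(max_candidates, len(bin_maxima))
        ids = np.round(np.linspace(0, len(bin_maxima) - 1,
                                   num=max_candidates)).astype(int)
        return [bin_maxima[i] for i in ids]


    def pair_in_2d_grid(candidates1, candidates2):
        """Forms the cross product of two candidate lists as two parallel lists."""
        bounds1, bounds2 = [], []
        for c1 in candidates1:
            for c2 in candidates2:
                bounds1.append(c1)
                bounds2.append(c2)
        return bounds1, bounds2


    if __name__ == "__main__":
        max_candidates = 5
        per_param = int(math.sqrt(max_candidates))  # 2 candidates per parameter
        l0_candidates = [1, 6]  # e.g. from _find_candidates_constant_relative_step
        sum_candidates = subsample_bin_maxima([0.1, 0.2, 0.3, 0.4], per_param)
        l0_bounds, max_sum_bounds = pair_in_2d_grid(l0_candidates, sum_candidates)
        # min_sum_per_partition is pinned to 0 because its tuning is not supported.
        min_sum_bounds = [0] * len(max_sum_bounds)
        print(l0_bounds)       # [1, 1, 6, 6]
        print(max_sum_bounds)  # [0.1, 0.4, 0.1, 0.4]
        print(min_sum_bounds)  # [0, 0, 0, 0]

The printed shape matches what test_find_candidate_parameters_both_l0_and_linf_sum_to_be_tuned asserts: every l0 candidate is paired with every max_sum_per_partition candidate, and min_sum_per_partition stays 0 for each pair.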