From 70c8c7cbf8959af7f7990a4df45be9cc35da4201 Mon Sep 17 00:00:00 2001 From: Differential Privacy Team Date: Wed, 24 Apr 2024 08:00:56 -0700 Subject: [PATCH] DP Auditorium divergence testers update Privacy on Beam: * Bump golang.org/x/net from 0.22.0 to 0.23.0 DP Auditorium: * Unify RenyiPropertyTester under new divergence tester class * Add interface for divergence based testers * Add example of testing PipelineDP mean mechanism in IPython * Upgrade Histogram tester * Update dependencies DP Accounting: * Increment patch version of DP accounting library for PyPi release Change-Id: I3a513cf7d7c7e144b11c778f792f723dff53132f GitOrigin-RevId: 1833e65df37c76d756d3a62ed6ba99112c0a5dd0 --- privacy-on-beam/go.mod | 2 +- privacy-on-beam/go.sum | 4 +- privacy-on-beam/privacy_on_beam_deps.bzl | 4 +- python/dp_accounting/VERSION | 2 +- .../configs/property_tester_config.py | 9 + .../pipelinedp_mean_mechanism_example.ipynb | 235 +++++++++++++++ .../examples/run_mean_mechanism_example.ipynb | 8 +- .../mechanisms/pipeline_dp/aggregation.py | 86 ++++++ .../pipeline_dp/aggregation_test.py | 89 ++++++ .../dp_auditorium/testers/BUILD.bazel | 28 +- .../dp_auditorium/testers/__init__.py | 1 - .../testers/divergence_tester.py | 123 ++++++++ .../testers/divergence_tester_test.py | 122 ++++++++ .../dp_auditorium/testers/histogram_tester.py | 36 ++- .../testers/histogram_tester_test.py | 33 ++- .../testers/hockey_stick_tester.py | 268 +++++------------- .../testers/hockey_stick_tester_test.py | 101 +++---- .../testers/property_tester_utils.py | 2 +- .../testers/property_tester_utils_test.py | 16 +- .../dp_auditorium/testers/renyi_tester.py | 251 ++++------------ .../testers/renyi_tester_test.py | 73 +++-- python/dp_auditorium/requirements.in | 2 + python/dp_auditorium/requirements.txt | 6 +- 23 files changed, 997 insertions(+), 504 deletions(-) create mode 100644 python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb create mode 100644 python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation.py create mode 100644 python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation_test.py create mode 100644 python/dp_auditorium/dp_auditorium/testers/divergence_tester.py create mode 100644 python/dp_auditorium/dp_auditorium/testers/divergence_tester_test.py diff --git a/privacy-on-beam/go.mod b/privacy-on-beam/go.mod index f011fd9e..68342a4d 100644 --- a/privacy-on-beam/go.mod +++ b/privacy-on-beam/go.mod @@ -58,7 +58,7 @@ require ( golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect golang.org/x/image v0.15.0 // indirect golang.org/x/mod v0.16.0 // indirect - golang.org/x/net v0.22.0 // indirect + golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.18.0 // indirect golang.org/x/sync v0.6.0 // indirect golang.org/x/sys v0.18.0 // indirect diff --git a/privacy-on-beam/go.sum b/privacy-on-beam/go.sum index 65ec79ed..cb896870 100644 --- a/privacy-on-beam/go.sum +++ b/privacy-on-beam/go.sum @@ -223,8 +223,8 @@ golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwY golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.22.0 h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc= -golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= 
+golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
+golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.18.0 h1:09qnuIAgzdx1XplqJvW6CQqMCtGZykZWcXzPMPUusvI=
 golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi8=
diff --git a/privacy-on-beam/privacy_on_beam_deps.bzl b/privacy-on-beam/privacy_on_beam_deps.bzl
index 8a125cca..64592fdb 100644
--- a/privacy-on-beam/privacy_on_beam_deps.bzl
+++ b/privacy-on-beam/privacy_on_beam_deps.bzl
@@ -2001,8 +2001,8 @@ def privacy_on_beam_deps():
     go_repository(
         name = "org_golang_x_net",
         importpath = "golang.org/x/net",
-        sum = "h1:9sGLhx7iRIHEiX0oAJ3MRZMUCElJgy7Br1nO+AMN3Tc=",
-        version = "v0.22.0",
+        sum = "h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=",
+        version = "v0.23.0",
     )
     go_repository(
         name = "org_golang_x_oauth2",
diff --git a/python/dp_accounting/VERSION b/python/dp_accounting/VERSION
index 9673a438..de476221 100644
--- a/python/dp_accounting/VERSION
+++ b/python/dp_accounting/VERSION
@@ -1,2 +1,2 @@
 """ Version of the current release of DP Accounting """
-0.4.3
+0.4.4
diff --git a/python/dp_auditorium/dp_auditorium/configs/property_tester_config.py b/python/dp_auditorium/dp_auditorium/configs/property_tester_config.py
index d2d9bbb5..143e4d7c 100644
--- a/python/dp_auditorium/dp_auditorium/configs/property_tester_config.py
+++ b/python/dp_auditorium/dp_auditorium/configs/property_tester_config.py
@@ -66,10 +66,14 @@ class HockeyStickPropertyTesterConfig:
   Attributes:
     training_config: Required training parameters.
     approximate_dp: Approximate DP privacy parameters to be tested.
+    evaluation_batch_size: Batch size for computing accuracy of classifier
+      distinguishing two distributions for Hockey Stick divergence. See
+      `HockeyStickPropertyTester` class for details.
   """

   training_config: TrainingConfig
   approximate_dp: privacy_property.ApproximateDp
+  evaluation_batch_size: int = 1000


 @dataclasses.dataclass
@@ -83,6 +87,10 @@ class HistogramPropertyTesterConfig:
     min_value: Lower end value for the histogram.
     max_value: Upper end value for the histogram.
     approximate_dp: Approximate DP privacy parameters to be tested.
+    use_original_tester: Whether to use the original version of the tester due
+      to Gilbert and McMillan (2018), or a new version developed for
+      DP-Auditorium. The new version generally improves over the original
+      version, but the original version is retained for comparison purposes.
   """

   test_discrete_mechanism: bool
@@ -90,6 +98,7 @@
   min_value: float
   max_value: float
   approximate_dp: privacy_property.ApproximateDp
+  use_original_tester: bool = False


 class Kernel(enum.Enum):
diff --git a/python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb b/python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb
new file mode 100644
index 00000000..df140e51
--- /dev/null
+++ b/python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb
@@ -0,0 +1,235 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "I0Z7vNS_ybbU"
+      },
+      "source": [
+        "This Colab notebook uses DP-Auditorium to test differentially private mechanisms computing aggregate statistics using PipelineDP."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wUtLsXpF9q4D"
+      },
+      "source": [
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/differential-privacy/blob/main/python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "  \u003ctd\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/google/differential-privacy/blob/main/python/dp_auditorium/dp_auditorium/examples/pipelinedp_mean_mechanism_example.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "  \u003c/td\u003e\n",
+        "\u003c/table\u003e\n",
+        "\n",
+        "\u003cbr\u003e\n",
+        "\u003cbr\u003e"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "WPLSKwjEHfXI"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Install and import dp_auditorium and all necessary libraries.\n",
+        "!pip install google-vizier equinox pipeline_dp\n",
+        "!git clone https://github.com/google/differential-privacy.git\n",
+        "import sys\n",
+        "sys.path.append('differential-privacy/python/dp_auditorium')\n",
+        "\n",
+        "from dp_auditorium import privacy_test_runner\n",
+        "from dp_auditorium.generators import pipeline_dp_vizier_dataset_generator\n",
+        "from dp_auditorium.configs import dataset_generator_config\n",
+        "from dp_auditorium.configs import privacy_property\n",
+        "from dp_auditorium.configs import privacy_test_runner_config\n",
+        "from dp_auditorium.configs import property_tester_config\n",
+        "from dp_auditorium.mechanisms.pipeline_dp import aggregation as pipeline_dp_mechanism\n",
+        "from dp_auditorium.testers import hockey_stick_tester\n",
+        "\n",
+        "import pipeline_dp\n",
+        "import tensorflow as tf\n",
+        "tf.compat.v1.enable_eager_execution()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "dr5A5W7Aq2SO"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Example of testing PipelineDP mean mechanism\n",
+        "import time\n",
+        "\n",
+        "def pipeline_dp_mean_mechanism_report(\n",
+        "    epsilon: float,\n",
+        "    delta: float,\n",
+        "    seed: int,\n",
+        "    max_number_partitions: int = 10,\n",
+        ") -\u003e privacy_test_runner_config.PrivacyTestRunnerResults:\n",
+        "  \"\"\"Runs the example code for a mean mechanism.\n",
+        "\n",
+        "  Args:\n",
+        "    epsilon: standard approximate DP parameter.\n",
+        "    delta: standard approximate DP parameter.\n",
+        "    seed: seed to initialize the random number generator.\n",
+        "    max_number_partitions: maximum number of partitions that can be used by\n",
+        "      the dataset generator.\n",
+        "\n",
+        "  Returns:\n",
+        "    The result of the example code as PrivacyTestRunnerResults.\n",
+        "  \"\"\"\n",
+        "  tf.random.set_seed(seed)\n",
+        "\n",
+        "  # Specify a config for computing with PipelineDP Mean aggregation, namely\n",
+        "  # computing the mean per partition, i.e.
 in SQL terms, the DP version of\n",
+        "  #   SELECT partition_key, mean(value)\n",
+        "  #   GROUP BY partition_key\n",
+        "  # is computed.\n",
+        "  # See https://pipelinedp.io/key-definitions/ for more details on PipelineDP terminology.\n",
+        "  mech_config = pipeline_dp.AggregateParams(\n",
+        "      metrics=[pipeline_dp.Metrics.MEAN],\n",
+        "      # Laplace noise is used to ensure DP.\n",
+        "      noise_kind=pipeline_dp.NoiseKind.LAPLACE,\n",
+        "      # Set contribution bounds:\n",
+        "\n",
+        "      # 1. If a privacy unit contributes to more than 1 partition, then\n",
+        "      # PipelineDP will randomly choose 1 partition and drop the\n",
+        "      # contributions to the others.\n",
+        "      max_partitions_contributed=1,\n",
+        "      # 2. If a privacy unit contributes more than 1 time to some\n",
+        "      # partition, then PipelineDP will randomly choose 1 contribution and\n",
+        "      # drop the other contributions.\n",
+        "      max_contributions_per_partition=1,\n",
+        "\n",
+        "      # 3. Each contribution will be clipped to [-1, 1].\n",
+        "      min_value=-1.0,\n",
+        "      max_value=1.0)\n",
+        "\n",
+        "  # Initialize the mechanism.\n",
+        "  public_partitions = list(range(max_number_partitions))\n",
+        "  mechanism = pipeline_dp_mechanism.AggregationMechanism(mech_config,\n",
+        "      privacy_property.ApproximateDp(\n",
+        "          epsilon=epsilon,\n",
+        "          delta=delta,\n",
+        "      ), public_partitions)\n",
+        "\n",
+        "  # Configuration for a Hockey-Stick property tester. Given arrays s1 and s2\n",
+        "  # with samples from two distributions, it estimates the hockey-stick\n",
+        "  # divergence between the underlying distributions. It checks if the estimated\n",
+        "  # divergence is bounded by delta.\n",
+        "  tester_config = property_tester_config.HockeyStickPropertyTesterConfig(\n",
+        "      training_config=hockey_stick_tester.make_default_hs_training_config(),\n",
+        "      approximate_dp=privacy_property.ApproximateDp(\n",
+        "          epsilon=epsilon,\n",
+        "          delta=delta,\n",
+        "      ),\n",
+        "  )\n",
+        "\n",
+        "  # Initialize a classifier model for the Hockey-Stick property tester.\n",
+        "  # This classifier will learn to distinguish between samples of the mechanism\n",
+        "  # on adjacent datasets. Its accuracy level should be controlled by the privacy\n",
+        "  # guarantee.\n",
+        "  base_model = hockey_stick_tester.make_default_hs_base_model()\n",
+        "  # Initialize a property tester.\n",
+        "  property_tester = hockey_stick_tester.HockeyStickPropertyTester(\n",
+        "      config=tester_config,\n",
+        "      base_model=base_model,\n",
+        "  )\n",
+        "\n",
+        "  # Configuration for the dataset generator. It generates neighboring datasets\n",
+        "  # under the add/remove definition. A unique study name prevents using cached\n",
+        "  # results from previous runs.\n",
+        "  generator_config = dataset_generator_config.VizierDatasetGeneratorConfig(\n",
+        "      study_name=str(time.time()),\n",
+        "      study_owner=\"owner\",\n",
+        "      num_vizier_parameters=2,\n",
+        "      data_type=dataset_generator_config.DataType.DATA_TYPE_FLOAT,\n",
+        "      min_value=-1.0,\n",
+        "      max_value=1.0,\n",
+        "      search_algorithm=\"RANDOM_SEARCH\",\n",
+        "      metric_name=\"hockey_stick_divergence\",\n",
+        "  )\n",
+        "\n",
+        "  # The dataset generator will generate datasets with at most\n",
+        "  # max_number_partitions partitions and at most 10 privacy units.\n",
+        "  # The same partitions are used as public_partitions and as partitions in the\n",
+        "  # dataset, so the mechanism will not drop any partitions. We do not check\n",
+        "  # partition selection.
We focus only on checking noise.\n", + " pipeline_dp_generator_config = pipeline_dp_vizier_dataset_generator.PipelineDpDatasetGeneratorConfig(\n", + " max_num_privacy_ids=10, max_num_partitions=max_number_partitions)\n", + "\n", + " # Initialize the dataset generator.\n", + " dataset_generator = pipeline_dp_vizier_dataset_generator.PipelineDpDatasetGenerator(\n", + " generator_config, pipeline_dp_generator_config)\n", + "\n", + " # Configuration for the test runner.\n", + " # The test runner coordinates how the test is evaluated. It receives a\n", + " # dataset generator, a property tester and a configuration (see base class for\n", + " # details on these parameters), and runs privacy tests using the property\n", + " # tester on datasets generated by the dataset generator.\n", + " test_runner_config = privacy_test_runner_config.PrivacyTestRunnerConfig(\n", + " property_tester=privacy_test_runner_config.PropertyTester.HOCKEY_STICK_TESTER,\n", + " max_num_trials=10,\n", + " failure_probability=0.05,\n", + " num_samples=10_000,\n", + " # Apply a hyperbolic tangent function to the output of the mechanism\n", + " post_processing=privacy_test_runner_config.PostProcessing.TANH,\n", + " )\n", + " # Initialize the test runner.\n", + " test_runner = privacy_test_runner.PrivacyTestRunner(\n", + " config=test_runner_config,\n", + " dataset_generator=dataset_generator,\n", + " property_tester=property_tester,\n", + " )\n", + "\n", + " return test_runner.test_privacy(mechanism, \"pipeline_dp-mean-mechanism\")\n", + "\n", + "\n", + "EPSILON = 1.0\n", + "DELTA = 1e-5\n", + "SEED = 1\n", + "\n", + "# The results indicate whether a privacy violation was identified within the\n", + "# designated number of trials defined in the configuration. In the absence of a\n", + "# violation, a message is returned indicating that the limit of the number of\n", + "# trials has been reached. 
For reference, all computed divergences across all\n",
+        "# trials are also reported.\n",
+        "results = pipeline_dp_mean_mechanism_report(EPSILON, DELTA, SEED)\n",
+        "print(f\" \\nResults: \\n{results}\")\n",
+        "if results.found_privacy_violation is not None:\n",
+        "  print(\"Privacy violations found!\")\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1QyFD_doucyHewiRMtxGvFxNrFlgbCqQa",
+          "timestamp": 1708693099970
+        },
+        {
+          "file_id": "1pBgTlH19OwJ3diUYf3m3QaZcVNQGeB8B",
+          "timestamp": 1708692052606
+        }
+      ]
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.ipynb b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.ipynb
index 7e6ad5ce..27d74260 100644
--- a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.ipynb
+++ b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.ipynb
@@ -81,8 +81,8 @@
 "  \"\"\"Runs the example code for a mean mechanism.\n",
 "\n",
 "  Args:\n",
-"    epsilon: standard DP parmaeter.\n",
-"    delta: standard DP parameter.\n",
+"    epsilon: standard approximate DP parameter.\n",
+"    delta: standard approximate DP parameter.\n",
 "    seed: seed to initialize the random number generator.\n",
 "    generator_factory: factory to create a generator; to be replaced in tests\n",
 "\n",
@@ -182,10 +182,6 @@
 ],
 "metadata": {
 "colab": {
-    "last_runtime": {
-      "build_target": "//learning/vizier/service/colab:notebook",
-      "kind": "private"
-    },
 "private_outputs": true,
 "provenance": [
 {
diff --git a/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation.py b/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation.py
new file mode 100644
index 00000000..834d5164
--- /dev/null
+++ b/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation.py
@@ -0,0 +1,86 @@
+# Copyright 2024 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pipeline-DP library mechanisms."""
+
+
+import itertools
+
+import numpy as np
+import pipeline_dp
+
+from dp_auditorium import interfaces
+from dp_auditorium.configs import privacy_property
+
+
+class AggregationMechanism(interfaces.Mechanism):
+  """Pipeline DP mechanism wrapper for privacy auditing."""
+
+  def __init__(
+      self,
+      config: pipeline_dp.AggregateParams,
+      tested_privacy_property: privacy_property.ApproximateDp,
+      public_partitions: list[int] | None,
+  ):
+    self._epsilon = tested_privacy_property.epsilon
+    self._delta = tested_privacy_property.delta
+    self._config = config
+    self._public_partitions = public_partitions
+
+  def _compute_aggregations(self, data: list[float]) -> list[float]:
+    """Returns one sample of a DP aggregation using the pipeline_dp library.
+
+    Args:
+      data: One-dimensional array with scalars corresponding to different
+        records.
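+
+    Returns:
+      A list with one value per metric per partition; metrics across distinct
+      partitions are flattened into a single one-dimensional list.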
+ """ + budget_accountant = pipeline_dp.NaiveBudgetAccountant( + self._epsilon, self._delta + ) + dp_engine = pipeline_dp.DPEngine( + budget_accountant, pipeline_dp.LocalBackend() + ) + + data_extractors = pipeline_dp.DataExtractors( + partition_extractor=lambda x: x[0], + privacy_id_extractor=lambda x: x[1], + value_extractor=lambda x: x[2], + ) + + result = dp_engine.aggregate( + data, + self._config, + data_extractors, + public_partitions=self._public_partitions, + ) + budget_accountant.compute_budgets() + + # result is an iterator where each item is a tuple + # `(`partition_id`, MetricsTuple)`. We drop partition_id and concatenate all + # metrics' values. + values = [row[1] for row in result] + + # The output of this wrapper is designed for `interfaces.PropertyTester` + # which receives arrays of samples where each sample is a one-dimensional + # array. The specific metric defining each entry does not affect the privacy + # test result, so for each sample we flatten all metrics across distinct + # partitions. + return [x for x in itertools.chain(*values)] + + def __call__(self, data: np.ndarray, num_samples: int) -> np.ndarray: + """Returns an array of samples of a DP aggregation using pipeline_dp.""" + result = [] + data = list(data) # PipelineDP works now for list only. + for _ in range(num_samples): + result.append(self._compute_aggregations(data)) + return np.array(result) diff --git a/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation_test.py b/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation_test.py new file mode 100644 index 00000000..3303cf91 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/mechanisms/pipeline_dp/aggregation_test.py @@ -0,0 +1,89 @@ +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for the mean mechanism with Pipeline DP.""" +from absl.testing import absltest +from absl.testing import parameterized +import numpy as np +import pipeline_dp + +from dp_auditorium.configs import privacy_property +from dp_auditorium.mechanisms.pipeline_dp import aggregation + + +class AggregationMechanismTest(parameterized.TestCase): + + @parameterized.product( + metrics=[ + [pipeline_dp.Metrics.MEAN], + [pipeline_dp.Metrics.PERCENTILE(0.1)], + [pipeline_dp.Metrics.MEAN, pipeline_dp.Metrics.SUM], + ], + num_samples=[1, 2, 3], + public_partitions=[[1], [1, 2, 3]], + delta=[0.0, 0.5], + ) + def test_pipeline_dp_mechanism( + self, metrics, num_samples, public_partitions, delta + ): + """Tests that the output of the mechanism has the expected shape. + + Correctness of the implementation and returned values will be verified using + DP-Auditorium testers. Here we only verify the mechanism wrapper works as + expected. + + Args: + metrics: aggregations to be tested. + num_samples: Number of samples to draw from the mechanism. + public_partitions: List with ids of public partitions. + delta: Privacy parameter. + """ + # Stub data to test the mechanism. 
The first column represents a partition + # id, the second column represents the user id, and the third the + # corresponding value. + data = np.array([ + [1, 1, 1.0], + [1, 1, 1.5], + [2, 1, 3.1], + [1, 2, 1.0], + [2, 2, 1.0], + [1, 3, 1.7], + [3, 3, 2.0], + ]) + epsilon = 10000 + tested_privacy_property = privacy_property.ApproximateDp( + epsilon=epsilon, delta=delta + ) + config = pipeline_dp.AggregateParams( + metrics=metrics, + min_value=0.01, + max_value=1.0, + max_partitions_contributed=2, + max_contributions_per_partition=1, + contribution_bounds_already_enforced=False, + ) + aggregation_mechanism = aggregation.AggregationMechanism( + config=config, + tested_privacy_property=tested_privacy_property, + public_partitions=public_partitions, + ) + + result = aggregation_mechanism(data, num_samples=num_samples) + self.assertEqual( + result.shape, (num_samples, len(public_partitions)* len(metrics)) + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/python/dp_auditorium/dp_auditorium/testers/BUILD.bazel b/python/dp_auditorium/dp_auditorium/testers/BUILD.bazel index 78b6ebca..4e4da9a9 100644 --- a/python/dp_auditorium/dp_auditorium/testers/BUILD.bazel +++ b/python/dp_auditorium/dp_auditorium/testers/BUILD.bazel @@ -35,8 +35,8 @@ py_library( name = "hockey_stick_tester", srcs = ["hockey_stick_tester.py"], deps = [ + ":divergence_tester", ":property_tester_utils", - "//dp_auditorium:interfaces", "//dp_auditorium/configs", requirement("absl-py"), requirement("numpy"), @@ -83,8 +83,8 @@ py_library( name = "renyi_tester", srcs = ["renyi_tester.py"], deps = [ + ":divergence_tester", ":property_tester_utils", - "//dp_auditorium:interfaces", "//dp_auditorium/configs", requirement("absl-py"), requirement("numpy"), @@ -128,6 +128,30 @@ py_test( ], ) +py_library( + name = "divergence_tester", + srcs = ["divergence_tester.py"], + deps = [ + ":property_tester_utils", + "//dp_auditorium:interfaces", + requirement("numpy"), + requirement("tensorflow"), + requirement("typing_extensions"), + ], +) + +py_test( + name = "divergence_tester_test", + srcs = ["divergence_tester_test.py"], + deps = [ + ":divergence_tester", + "//dp_auditorium/configs", + requirement("absl-py"), + requirement("numpy"), + requirement("tensorflow"), + ], +) + py_library( name = "property_tester_utils", srcs = ["property_tester_utils.py"], diff --git a/python/dp_auditorium/dp_auditorium/testers/__init__.py b/python/dp_auditorium/dp_auditorium/testers/__init__.py index e94f69e6..88b2d1b6 100644 --- a/python/dp_auditorium/dp_auditorium/testers/__init__.py +++ b/python/dp_auditorium/dp_auditorium/testers/__init__.py @@ -15,7 +15,6 @@ """Differential Privacy Property Testers.""" from dp_auditorium.testers.histogram_tester import HistogramTester -from dp_auditorium.testers.hockey_stick_tester import HockeyStickDivergenceTrainingOptions from dp_auditorium.testers.hockey_stick_tester import HockeyStickPropertyTester from dp_auditorium.testers.mmd_tester import MMDPropertyTester from dp_auditorium.testers.renyi_tester import RenyiModel diff --git a/python/dp_auditorium/dp_auditorium/testers/divergence_tester.py b/python/dp_auditorium/dp_auditorium/testers/divergence_tester.py new file mode 100644 index 00000000..c82a3b6f --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/testers/divergence_tester.py @@ -0,0 +1,123 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Divergence-based property tester base class for the dp-auditorium library."""
+
+import abc
+from typing import Any, final
+
+import numpy as np
+import tensorflow as tf
+from typing_extensions import override
+
+from dp_auditorium import interfaces
+from dp_auditorium.testers import property_tester_utils
+
+
+class DivergencePropertyTester(interfaces.PropertyTester, abc.ABC):
+  """PropertyTester that estimates divergences by optimizing a parametrized model."""
+
+  def __init__(self, config: Any, base_model: tf.keras.Model):
+    """Initializes the instance.
+
+    Args:
+      config: Configuration for initializing property tester.
+      base_model: A Keras model that discriminates between samples generated by
+        a mechanism run on two different datasets.
+    """
+
+  @property
+  @abc.abstractmethod
+  def _test_threshold(self) -> float:
+    """Threshold above which a divergence estimate fails the privacy test."""
+
+  @abc.abstractmethod
+  def _get_optimized_divergence_estimation_model(
+      self,
+      samples_first_distribution: np.ndarray,
+      samples_second_distribution: np.ndarray,
+  ) -> tf.keras.Model:
+    """Fits model weights that maximize a given divergence on provided samples.
+
+    This method optimizes parameters of a base model using samples from two
+    distributions, maximizing a lower bound on the divergence between those
+    distributions.
+
+    Args:
+      samples_first_distribution: Array with training samples from first
+        distribution.
+      samples_second_distribution: Array with training samples from second
+        distribution.
+
+    Returns:
+      A tf.keras.Model fitted on the input samples to maximize a given
+      divergence estimator.
+    """
+
+  @abc.abstractmethod
+  def _compute_divergence_on_samples(
+      self,
+      model: tf.keras.Model,
+      samples_first_distribution: np.ndarray,
+      samples_second_distribution: np.ndarray,
+      failure_probability: float,
+  ) -> float:
+    """Estimates a lower bound on the divergence from the given samples.
+
+    Args:
+      model: Model used to estimate the divergence on test samples.
+      samples_first_distribution: Array with samples from first distribution.
+      samples_second_distribution: Array with samples from second distribution.
+      failure_probability: Probability of test failure.
+
+    Returns:
+      Estimated divergence.
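+      With probability at least 1 - failure_probability, the returned value
+      lower bounds the true divergence between the two underlying
+      distributions.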
+ """ + + @override + @final + def estimate_lower_bound( + self, + samples_first_distribution: np.ndarray, + samples_second_distribution: np.ndarray, + failure_probability: float, + ) -> float: + samples1_train, samples1_test = ( + property_tester_utils.split_train_test_samples( + samples_first_distribution + ) + ) + samples2_train, samples2_test = ( + property_tester_utils.split_train_test_samples( + samples_second_distribution + ) + ) + + model = self._get_optimized_divergence_estimation_model( + samples1_train, + samples2_train, + ) + + divergence_test = self._compute_divergence_on_samples( + model, + samples1_test, + samples2_test, + failure_probability, + ) + + return divergence_test + + @override + @final + def reject_property(self, lower_bound: float) -> bool: + return lower_bound > self._test_threshold diff --git a/python/dp_auditorium/dp_auditorium/testers/divergence_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/divergence_tester_test.py new file mode 100644 index 00000000..3dd8e4e4 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/testers/divergence_tester_test.py @@ -0,0 +1,122 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for DP-Auditorium interfaces.""" + +import dataclasses + +from absl.testing import absltest +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from dp_auditorium.configs import privacy_property +from dp_auditorium.testers import divergence_tester + + +@dataclasses.dataclass +class StubDivergencePropertyTesterConfig: + """Configuration for stub divergence property tester.""" + + estimated_divergence: float + test_threshold: float + + +class StubDivergencePropertyTester(divergence_tester.DivergencePropertyTester): + + def __init__( + self, + config: StubDivergencePropertyTesterConfig, + base_model: tf.keras.Model, + ): + self._config_test_threshold = config.test_threshold + self._estimated_divergence = config.estimated_divergence + self._base_model = base_model + + @property + def _test_threshold(self) -> float: + return self._config_test_threshold + + @property + def privacy_property(self) -> privacy_property.PrivacyProperty: + return privacy_property.PureDp(epsilon=0.1) + + def _get_optimized_divergence_estimation_model( + self, + samples_first_distribution: np.ndarray, + samples_second_distribution: np.ndarray, + ) -> tf.keras.Model: + return self._base_model + + def _compute_divergence_on_samples( + self, + model: tf.keras.Model, + samples_first_distribution: np.ndarray, + samples_second_distribution: np.ndarray, + failure_probability: float, + ) -> float: + del ( + model, + samples_first_distribution, + samples_second_distribution, + failure_probability, + ) + return self._estimated_divergence + + +class DivergencePropertyTesterTest(parameterized.TestCase): + + def setUp(self): + super().setUp() + self.base_model = tf.keras.Sequential([ + tf.keras.Input(shape=(3,)), + tf.keras.layers.Dense(1, activation="relu"), + ]) + + @parameterized.parameters(0.3, 1.1, 2.5) 
+ def test_divergence_property_estimates_lower_bound_returns_expected_divergence( + self, divergence + ): + samples1 = np.ones((100, 1)) + samples2 = np.ones((100, 1)) + + config = StubDivergencePropertyTesterConfig( + estimated_divergence=divergence, test_threshold=0.1 + ) + divergence_property_tester = StubDivergencePropertyTester( + config, self.base_model + ) + + estimated_divergence = divergence_property_tester.estimate_lower_bound( + samples1, samples2, failure_probability=0.1 + ) + self.assertAlmostEqual(estimated_divergence, divergence) + + @parameterized.product(lower_bound=[0.1, 0.5], threshold=[0.0, 0.5, 1.0]) + def test_divergence_property_tester_rejects_property( + self, lower_bound, threshold + ): + # In this test the estimated divergence parameter will be unused and we can + # set a dummy value. + config = StubDivergencePropertyTesterConfig( + estimated_divergence=0.314, test_threshold=threshold + ) + divergence_property_tester = StubDivergencePropertyTester( + config, self.base_model + ) + result = divergence_property_tester.reject_property(lower_bound) + self.assertEqual(lower_bound > threshold, result) + + +if __name__ == "__main__": + absltest.main() diff --git a/python/dp_auditorium/dp_auditorium/testers/histogram_tester.py b/python/dp_auditorium/dp_auditorium/testers/histogram_tester.py index c1691d61..83b4fc00 100644 --- a/python/dp_auditorium/dp_auditorium/testers/histogram_tester.py +++ b/python/dp_auditorium/dp_auditorium/testers/histogram_tester.py @@ -139,6 +139,7 @@ def __init__( ) self._epsilon = config.approximate_dp.epsilon self._delta = config.approximate_dp.delta + self._use_original_tester = config.use_original_tester self._histogram_size = config.histogram_size self._approximate_dp = config.approximate_dp @@ -148,18 +149,33 @@ def privacy_property(self) -> privacy_property.PrivacyProperty: return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp) def _get_error_tolerance( - self, num_samples: float, failure_probability: float + self, + num_samples: float, + probabilities1: np.ndarray, + probabilities2: np.ndarray, + failure_probability: float ) -> float: """Gets error tolerance for Histogram property tester.""" - term_1 = ( - 2 - * (1 + np.exp(self._epsilon)) - * np.sqrt(self._histogram_size / num_samples) - ) + if self._use_original_tester: + term_1 = ( + 2.0 + * (1.0 + np.exp(self._epsilon)) + * np.sqrt(self._histogram_size / num_samples) + ) + else: + term_1a = ( + 2.0 / np.sqrt(num_samples) + * sum(np.sqrt(probabilities1)) + ) + term_1b = ( + 2.0 * np.exp(self._epsilon) / np.sqrt(num_samples) + * sum(np.sqrt(probabilities2)) + ) + term_1 = term_1a + term_1b term_2 = ( - 6 - * (1 + np.exp(self._epsilon)) - * np.sqrt(np.log(4 / failure_probability) / (2 * num_samples)) + 6.0 + * (1.0 + np.exp(self._epsilon)) + * np.sqrt(np.log(4.0 / failure_probability) / (2.0 * num_samples)) ) return term_1 + term_2 @@ -177,7 +193,7 @@ def estimate_lower_bound( per_outcome_delta = probabilities1 - np.exp(self._epsilon) * probabilities2 estimated_delta = np.sum(per_outcome_delta[per_outcome_delta > 0]) error_tolerance = self._get_error_tolerance( - num_samples, failure_probability + num_samples, probabilities1, probabilities2, failure_probability ) return estimated_delta - error_tolerance diff --git a/python/dp_auditorium/dp_auditorium/testers/histogram_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/histogram_tester_test.py index 342823ae..4a223c1e 100644 --- a/python/dp_auditorium/dp_auditorium/testers/histogram_tester_test.py +++ 
b/python/dp_auditorium/dp_auditorium/testers/histogram_tester_test.py @@ -42,11 +42,28 @@ def test_get_error_tolerance( test_discrete_mechanism=False, histogram_size=histogram_size, ) - tester = histogram_tester.HistogramTester(config) - result = tester._get_error_tolerance(num_samples, failure_probability) - self.assertAllClose(result, expected_error_tolerance, rtol=1e-2) - - def test_estimate_lower_bound(self): + probabilities1 = np.random.dirichlet(alpha=np.ones(histogram_size)) + probabilities2 = np.random.dirichlet(alpha=np.ones(histogram_size)) + with self.subTest(use_original_tester=True): + config.use_original_tester = True + tester = histogram_tester.HistogramTester(config) + result = tester._get_error_tolerance(num_samples, + probabilities1, + probabilities2, + failure_probability) + self.assertAllClose(result, expected_error_tolerance, rtol=1e-2) + with self.subTest(use_original_tester=False): + config.use_original_tester = False + tester = histogram_tester.HistogramTester(config) + result = tester._get_error_tolerance(num_samples, + probabilities1, + probabilities2, + failure_probability) + # New tester always has smaller error tolerance than original tester. + self.assertLess(result, expected_error_tolerance) + + @parameterized.parameters(True, False) + def test_estimate_lower_bound(self, use_original_tester): """Verifies estimate of delta lower bound. Verifies that correct delta is calculated for a mechanism that @@ -56,6 +73,8 @@ def test_estimate_lower_bound(self): num_samples = 100 samples1 = np.zeros(num_samples) samples2 = np.ones(num_samples) + probabilities1 = np.array([1, 0]) + probabilities2 = np.array([0, 1]) # Initialize tester. config = property_tester_config.HistogramPropertyTesterConfig( @@ -65,12 +84,13 @@ def test_estimate_lower_bound(self): test_discrete_mechanism=False, histogram_size=2, ) + config.use_original_tester = use_original_tester tester = histogram_tester.HistogramTester(config) # Estimate delta. failure_probability = 0.1 expected_delta = 1.0 - tester._get_error_tolerance( - num_samples, failure_probability + num_samples, probabilities1, probabilities2, failure_probability ) estimated_delta = tester.estimate_lower_bound( samples1, samples2, failure_probability @@ -85,6 +105,7 @@ def test_reject_property(self): max_value=1, min_value=0, test_discrete_mechanism=False, + use_original_tester=True, histogram_size=2, ) tester = histogram_tester.HistogramTester(config) diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py index dfdbbf75..a8b6bbc7 100644 --- a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py +++ b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py @@ -18,39 +18,19 @@ using its dual formulation as the weighted accuracy of a classifier. """ -import dataclasses from typing import Tuple from absl import logging import numpy as np import tensorflow as tf +from typing_extensions import override -from dp_auditorium import interfaces from dp_auditorium.configs import privacy_property from dp_auditorium.configs import property_tester_config +from dp_auditorium.testers import divergence_tester from dp_auditorium.testers import property_tester_utils -@dataclasses.dataclass(frozen=False) -class HockeyStickDivergenceTrainingOptions: - """Training options for the HockeyStickDivergenceTester. - - Attributes: - num_epochs: Number of epochs to run the training pipeline. - learning_rate: Learning rate for Adam optimizer. 
-    batch_size: Batch size to use during training.
-    failure_probability: Probability of returning a false positive. That is, the
-      divergence suggests the mechanism is not private even though it is.
-    verbose: Integer to pass to keras to decide the verbosity of the training
-      process.
-  """
-
-  num_epochs: int
-  learning_rate: float
-  batch_size: int
-  verbose: int
-
-
 def make_default_hs_training_config() -> property_tester_config.TrainingConfig:
   return property_tester_config.TrainingConfig(
       training_epochs=2,
@@ -69,20 +49,9 @@ def make_default_hs_base_model() -> tf.keras.Model:
   ])
 
 
-def make_training_options_from_config(
-    training_config: property_tester_config.TrainingConfig,
-):
-  return HockeyStickDivergenceTrainingOptions(
-      num_epochs=training_config.training_epochs,
-      learning_rate=training_config.optimizer_learning_rate,
-      batch_size=training_config.batch_size,
-      verbose=training_config.verbose,
-  )
-
-
 # Helper functions and classes for the HockeyStickDivergenceTester
 def _get_accuracy_confidence_bound(
-    n_samples: int, confidence: float = 0.95
+    range_bound: float, n_samples: int, confidence: float = 0.95,
 ) -> float:
   r"""Returns a confidence bound on the estimate of P(h(X) = y).
 
@@ -90,6 +59,7 @@
   \frac{1}{n} \sum_{i=1}^n {h(X_i) = Y_i}.
 
   Args:
+    range_bound: A bound on the length of the range of estimated values.
     n_samples: Number of samples used in the estimate.
     confidence: The level of confidence we want the estimate to have.
 
@@ -97,10 +67,10 @@
     The one-sided confidence error around the estimate.
   """
   delta = 1.0 - confidence
-  return np.sqrt(np.log(1.0 / delta) / 2.0 / n_samples)
+  return range_bound * np.sqrt(np.log(1.0 / delta) / 2.0 / n_samples)
 
 
-class HockeyStickPropertyTester(interfaces.PropertyTester):
+class HockeyStickPropertyTester(divergence_tester.DivergencePropertyTester):
   r"""Uses a model to estimate divergence between the outputs of a mechanism.
 
   Specifically, given two neighboring datasets D_0, D_1 and epsilon. Generates
@@ -109,7 +79,11 @@ class HockeyStickPropertyTester(interfaces.PropertyTester):
   tries to distinguish between "positive" and "negative" examples. A mechanism
   is (epsilon,delta) DP if and only if the accuracy of a classifier in this
   dataset is less than (e^epsilon + delta) / (1 + e^epsilon). The hockey stick
-  divergence corresponds to \delta.
+  divergence corresponds to delta.
+
+  NOTE: This property tester overrides any user-specified value of
+  config.training_config.model_output_coordinate_bound with 1.0
+  for the sake of validity and efficiency.
 
   Attributes:
     _base_model: A keras model that discriminates between samples generated by a
@@ -117,9 +91,6 @@ class HockeyStickPropertyTester(interfaces.PropertyTester):
       class must return logits.
     _epsilon: The epsilon in the (epsilon, delta) guarantee the mechanism is
       supposed to satisfy.
-    _delta: The delta in the (epsilon,delta) guarantee the mechanism is supposed
-      to satisfy.
-    _has_called_fit: Boolean that verifies if model has been trained.
   """
 
   def __init__(
@@ -138,40 +109,35 @@ def __init__(
     property_tester_utils.validate_approximate_dp_property(
         config.approximate_dp
     )
+    # This constant defines the maximum output value of a `base_model` and is
+    # used to get confidence intervals for the lower bound of the divergence.
+    # We set it here to 1.0 given that the tester optimizes for a binary
+    # classification task.
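+    # With this bound set to 1.0, the confidence term computed by
+    # `_get_accuracy_confidence_bound` reduces to
+    # sqrt(log(1 / failure_probability) / (2 * n)), the one-sided Hoeffding
+    # bound for an empirical accuracy taking values in [0, 1].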
+    logging.info(
+        "Overwriting `model_output_coordinate_bound`; the validity and"
+        " efficiency of the test are optimized for"
+        " `model_output_coordinate_bound=1.0`"
+    )
+    config.training_config.model_output_coordinate_bound = 1.0
+    property_tester_utils.validate_training_config(config.training_config)
+    self._model_coordinate_bound = (
+        config.training_config.model_output_coordinate_bound
+    )
     self._base_model = base_model
     self._epsilon = config.approximate_dp.epsilon
     self._delta = config.approximate_dp.delta
     self._approximate_dp = config.approximate_dp
-    self._has_called_fit = False
-    self._training_options = make_training_options_from_config(
-        config.training_config
-    )
-    self.initialize(self._training_options)
+    self._training_options = config.training_config
+    self._evaluation_batch_size = config.evaluation_batch_size
+
+  @property
+  def _test_threshold(self) -> float:
+    return self._delta
 
   @property
   def privacy_property(self) -> privacy_property.PrivacyProperty:
     """The privacy guarantee that the tester is being used to test for."""
     return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp)
 
-  def initialize(self, training_options: HockeyStickDivergenceTrainingOptions):
-    """Compiles internal model.
-
-    Auxiliary function to use as a standalone tester while changing training
-    options of the tester across different runs.
-
-    Args:
-      training_options: Training options for keras optimization.
-    """
-
-    if self._has_called_fit:
-      self._base_model = tf.keras.models.clone_model(self._base_model)
-    self._base_model.compile(
-        optimizer=tf.keras.optimizers.Adam(training_options.learning_rate),
-        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
-        metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.0),
-    )
-    self._training_options = training_options
-
   def _generate_inputs_to_model(
       self,
       samples1: np.ndarray,
@@ -195,7 +161,7 @@
     Raises:
       ValueError if the ranks of sample1 and sample 2 are not equal.
     """
-    sample_cutoff_fraction = 1.0 / (np.exp(self._epsilon) + 1)
+    sample_cutoff_fraction = 1.0 / (np.exp(self._epsilon) + 1.0)
 
     if len(samples1.shape) != len(samples2.shape):
       raise ValueError(f"""Mechanism outputs on dataset 1 and dataset 2 should
@@ -215,146 +181,60 @@
     labels = np.concatenate([labels_1, labels_2], axis=0)
     return features, labels
 
-  def _fit(
+  @override
+  def _get_optimized_divergence_estimation_model(
       self,
-      samples1: np.ndarray,
-      samples2: np.ndarray,
-      batch_size: int,
-      epochs: int,
-      verbose: int,
+      samples_first_distribution: np.ndarray,
+      samples_second_distribution: np.ndarray,
   ):
-    """Fits the underlying model on the labeled output of a mechansim.
-
-    Args:
-      samples1: Samples from one distribution
-      samples2: Samples from the other distribution
-      batch_size: Batch size to use in the training process.
-      epochs: Number of epochs to train for.
-      verbose: Option passed to keras trainer.
- """ - self.initialize(self._training_options) - features, labels = self._generate_inputs_to_model(samples1, samples2) - self._base_model.fit( + model = tf.keras.models.clone_model(self._base_model) + model.compile( + optimizer=tf.keras.optimizers.Adam( + self._training_options.optimizer_learning_rate + ), + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.0), + ) + features, labels = self._generate_inputs_to_model( + samples_first_distribution, + samples_second_distribution, + ) + model.fit( features, labels, shuffle=True, - epochs=epochs, - batch_size=batch_size, - verbose=verbose, - ) - self._has_called_fit = True - - def _get_accuracy_and_divergence_estimate( - self, - samples1, - samples2, - failure_probability: float, - ) -> Tuple[float, float]: - """Returns the accuracy of a trained classifier. - - Args: - samples1: Samples from one distribution - samples2: Samples from the other distribution - failure_probability: Probability of having a false positive. I.e. the test - suggests that this is a privacy violation when in reality it is not. - - Returns: - The accuracy of the classifier (adjusted with confidence) and the - associated hockey stick divergence (this corresponds to the delta of the - mechanism). - - Raises: - AttributeError if called before calling fit(). - """ - if not self._has_called_fit: - raise AttributeError( - "Estimator should be trained with fit() before getting accuracy" - ) - features, labels = self._generate_inputs_to_model(samples1, samples2) - - accuracy = self._base_model.evaluate(features, labels, batch_size=1000)[1] - test_sample_size = samples1.shape[0] - accuracy -= _get_accuracy_confidence_bound( - test_sample_size, 1 - failure_probability - ) - hs_divergence = accuracy * (1 + np.exp(self._epsilon)) - np.exp( - self._epsilon - ) - return accuracy, hs_divergence - - def _estimate_discriminative_accuracy_and_hs_divergence_of_mechanism( - self, samples_1, samples_2, failure_probability: float - ) -> Tuple[float, float]: - """End to end estimation of accuracy and divergence. - - Args: - samples_1: Samples from one distribution - samples_2: Samples from the other distribution - failure_probability: The probability of the test asserting that the the - accuracy estimated by the method is lower than the returned value. - - Returns: - Accuracy and hockey stick divergence of the mechanism on datasets. - """ - train_samples1, test_samples_1 = ( - property_tester_utils.split_train_test_samples( - samples_1 - ) - ) - train_samples2, test_samples_2 = ( - property_tester_utils.split_train_test_samples( - samples_2 - ) - ) - self._fit( - train_samples1, - train_samples2, - epochs=self._training_options.num_epochs, + epochs=self._training_options.training_epochs, batch_size=self._training_options.batch_size, verbose=self._training_options.verbose, ) - logging.info("Evaluating model") - return self._get_accuracy_and_divergence_estimate( - test_samples_1, - test_samples_2, - failure_probability=failure_probability, - ) - def estimate_lower_bound( + return model + + @override + def _compute_divergence_on_samples( self, - samples_1: np.ndarray, - samples_2: np.ndarray, + model: tf.keras.Model, + samples_first_distribution: np.ndarray, + samples_second_distribution: np.ndarray, failure_probability: float, ) -> float: - """Returns a lower bound on the hockey stick divergence between the samples. - - Args: - samples_1: First set of samples. 
- samples_2: Second set of samples - failure_probability: The probability that the returned value is not in - fact a lower bound on the divergence between the distributions that - generated these samples. - - Returns: - Estimated lower bound on the divergence between two distributions - represented by samples_1 and samples_2. - """ - accuracy, divergence = ( - self._estimate_discriminative_accuracy_and_hs_divergence_of_mechanism( - samples_1, samples_2, failure_probability - ) + features, labels = self._generate_inputs_to_model( + samples_first_distribution, samples_second_distribution ) - logging.info("Accuracy: %f, Divergence: %f", accuracy, divergence) - return divergence - - def reject_property(self, lower_bound: float) -> bool: - """Tests whether a mechanism is epsilon-delta private. - Args: - lower_bound: Divergence obtained from estimate_divergence - - Returns: - True if the esitmated lower bound on the divergence is above the - expected delta parameter of a mechanism. - """ - return lower_bound > self._delta + accuracy = model.evaluate( + features, labels, batch_size=self._evaluation_batch_size + )[1] + test_sample_size = min( + samples_first_distribution.shape[0], + samples_second_distribution.shape[0], + ) + accuracy -= _get_accuracy_confidence_bound( + self._model_coordinate_bound, + test_sample_size, + 1.0 - failure_probability, + ) + hs_divergence = accuracy * (1.0 + np.exp(self._epsilon)) - np.exp( + self._epsilon + ) + return hs_divergence diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py index 00b2a3b3..3403e03f 100644 --- a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py +++ b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py @@ -16,13 +16,14 @@ from absl.testing import absltest import numpy as np from tensorflow import keras -from dp_auditorium.configs import privacy_property as privacy_property +from dp_auditorium.configs import privacy_property from dp_auditorium.configs import property_tester_config as config from dp_auditorium.testers import hockey_stick_tester as hst _SEED = 123456 _RNG = np.random.default_rng(seed=_SEED) +_ESTIMATION_RANGE_BOUND = 1.0 class HockeyStickDivergenceTest(absltest.TestCase): @@ -30,7 +31,9 @@ class HockeyStickDivergenceTest(absltest.TestCase): def test_confidence_bound(self): n_samples = 100 n_experiments = 1000 - cb = hst._get_accuracy_confidence_bound(n_samples, confidence=0.95) + cb = hst._get_accuracy_confidence_bound( + _ESTIMATION_RANGE_BOUND, n_samples, confidence=0.95 + ) # Generate 1000 sums of bernoulli random variables. 
sample = _RNG.binomial(n_samples, 0.3, n_experiments) errors = np.abs(sample / n_samples - 0.3) @@ -52,7 +55,7 @@ def test_get_model_inputs(self): training_config = hst.make_default_hs_training_config() hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.001), - training_config=training_config, + training_config=training_config, evaluation_batch_size=1000, ) div_estimator = hst.HockeyStickPropertyTester( config=hs_config, base_model=model @@ -81,7 +84,7 @@ def test_get_model_inputs_higher_dim(self): training_config = hst.make_default_hs_training_config() hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.001), - training_config=training_config, + training_config=training_config, evaluation_batch_size=1000, ) div_estimator = hst.HockeyStickPropertyTester( config=hs_config, base_model=model @@ -124,35 +127,37 @@ def test_get_model_inputs_higher_dim(self): labels[features_sort_ix, ...], expected_labels ) - def test_get_accuracy_and_divergence(self): + def test_compute_divergence_on_samples(self): data1 = np.array([1]) data2 = np.array([-1]) - samples1 = self.dummy_mechanism(data1, 2000) - samples2 = self.dummy_mechanism(data2, 2000) + samples1_train = self.dummy_mechanism(data1, 2000) + samples2_train = self.dummy_mechanism(data2, 2000) + + samples1_test = self.dummy_mechanism(data1, 2000) + samples2_test = self.dummy_mechanism(data2, 2000) model = keras.Sequential([keras.layers.Dense(1, use_bias=False)]) training_options = hst.make_default_hs_training_config() training_options.training_epochs = 1000 hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=0.5, delta=0.1), - training_config=training_options, + training_config=training_options, evaluation_batch_size=1000, ) div_estimator = hst.HockeyStickPropertyTester( config=hs_config, base_model=model ) - accuracy, div = ( - div_estimator._estimate_discriminative_accuracy_and_hs_divergence_of_mechanism( # pylint: disable=line-too-long - samples1, samples2, 0.05 - ) + model = div_estimator._get_optimized_divergence_estimation_model( + samples1_train, samples2_train + ) + div = div_estimator._compute_divergence_on_samples( + model, samples1_test, samples2_test, 0.05 ) - expected_accuracy = 0.99 - hst._get_accuracy_confidence_bound(1000) - self.assertGreater(accuracy, expected_accuracy) self.assertGreater(div, 0.5) def laplace_mechanism(self, x, n_samples): return _RNG.laplace(0, 1.0, n_samples) + x - def test_get_accuracy_and_divergence_private_mechanism(self): + def test_compute_divergence_private_mechanism(self): data1 = np.array([1]) data2 = np.array([0]) model = keras.Sequential([keras.layers.Dense(1)]) @@ -162,56 +167,38 @@ def test_get_accuracy_and_divergence_private_mechanism(self): training_config.training_epochs = 1000 hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.0), - training_config=training_config, + training_config=training_config, evaluation_batch_size=1000, ) div_estimator = hst.HockeyStickPropertyTester( config=hs_config, base_model=model ) - - accuracy, div = ( - div_estimator._estimate_discriminative_accuracy_and_hs_divergence_of_mechanism( # pylint: disable=line-too-long - samples1, samples2, 0.05 - ) - ) - expected_accuracy = np.exp(1) / (1 + np.exp(1)) - self.assertLess(accuracy, expected_accuracy) - self.assertLess(div, 0.0) - - def test_fails_to_evaluate_when_not_fitted(self): - 
model = keras.Sequential([keras.layers.Dense(1)]) - training_options = hst.make_default_hs_training_config() - hs_config = config.HockeyStickPropertyTesterConfig( - approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.0), - training_config=training_options, + model = div_estimator._get_optimized_divergence_estimation_model( + samples1[:1000], samples2[:1000] ) - div_estimator = hst.HockeyStickPropertyTester( - config=hs_config, base_model=model + div = div_estimator._compute_divergence_on_samples( + model, samples1, samples2, 0.05 ) - with self.assertRaises(AttributeError) as context: - div_estimator._get_accuracy_and_divergence_estimate( - np.array([0]), np.array([1]), 0.05 - ) - self.assertIn("should be trained", str(context.exception)) + self.assertLess(div, 0.0) - def bad_mechanism(self, data, n_samples): + def mechanism_with_different_output_shapes(self, data, n_samples): if data[0] == 0: return np.ones((n_samples, 2)) if data[0] == 1: return np.ones((n_samples, 3, 4)) - def test_fails_on_bad_mechanism(self): + def test_fails_on_mechanism_with_different_output_shapes(self): model = keras.Sequential([keras.layers.Dense(1)]) training_options = hst.make_default_hs_training_config() hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.1), - training_config=training_options, + training_config=training_options, evaluation_batch_size=1000, ) div_estimator = hst.HockeyStickPropertyTester( config=hs_config, base_model=model ) - samples1 = self.bad_mechanism(np.array([0]), 100) - samples2 = self.bad_mechanism(np.array([1]), 100) + samples1 = self.mechanism_with_different_output_shapes(np.array([0]), 100) + samples2 = self.mechanism_with_different_output_shapes(np.array([1]), 100) with self.assertRaises(ValueError) as context: div_estimator._generate_inputs_to_model(samples1, samples2) self.assertIn("rank", str(context.exception)) @@ -233,7 +220,7 @@ def test_hockey_stick_privacy_tester(self): training_options.training_epochs = 1000 hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=epsilon, delta=delta), - training_config=training_options, + training_config=training_options, evaluation_batch_size=1000, ) hsdt = hst.HockeyStickPropertyTester(config=hs_config, base_model=model) @@ -244,10 +231,11 @@ def test_hockey_stick_privacy_tester(self): divergence = hsdt.estimate_lower_bound( samples1, samples2, failure_probability ) - - self.assertLess(divergence, 0.0) + with self.subTest("divergence_less_than_threshold"): + self.assertLess(divergence, 0.0) found_privacy_violation = hsdt.reject_property(divergence) - self.assertFalse(found_privacy_violation) + with self.subTest("found_privacy_violation"): + self.assertFalse(found_privacy_violation) def test_hockey_stick_non_private_mechanism(self): model = keras.Sequential([keras.layers.Dense(1, use_bias=True)]) @@ -257,22 +245,24 @@ def test_hockey_stick_non_private_mechanism(self): training_options.training_epochs = 1000 hs_config = config.HockeyStickPropertyTesterConfig( approximate_dp=self.make_privacy_property(epsilon=epsilon, delta=delta), - training_config=training_options, + training_config=training_options, evaluation_batch_size=1000, ) hsdt = hst.HockeyStickPropertyTester(config=hs_config, base_model=model) samples1 = self.non_private_mechanism_for_testing(np.array([0, 1]), 200) samples2 = self.non_private_mechanism_for_testing(np.array([0]), 200) divergence = hsdt.estimate_lower_bound(samples1, samples2, 0.05) + with 
+      self.assertLess(0.5, divergence)
-    self.assertLess(0.5, divergence)
     found_privacy_violation = hsdt.reject_property(divergence)
-    self.assertTrue(found_privacy_violation)
+    with self.subTest("found_privacy_violation"):
+      self.assertTrue(found_privacy_violation)

   def test_assert_privacy_violation(self):
     training_options = hst.make_default_hs_training_config()
     hs_config = config.HockeyStickPropertyTesterConfig(
         approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.1),
-        training_config=training_options,
+        training_config=training_options, evaluation_batch_size=1000,
     )
     hsdt = hst.HockeyStickPropertyTester(
         config=hs_config,
@@ -286,14 +276,15 @@ def test_privacy_property(self):
     training_options = hst.make_default_hs_training_config()
     hs_config = config.HockeyStickPropertyTesterConfig(
         approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.1),
-        training_config=training_options,
+        training_config=training_options, evaluation_batch_size=1000,
     )
     hs_tester = hst.HockeyStickPropertyTester(
-        config=hs_config,
-        base_model=hst.make_default_hs_base_model())
+        config=hs_config, base_model=hst.make_default_hs_base_model()
+    )
     self.assertEqual(
         hs_config.approximate_dp, hs_tester.privacy_property.approximate_dp
     )

+
 if __name__ == "__main__":
   absltest.main()
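Note: the refactor above splits the old single accuracy-plus-divergence helper into two hooks, one that fits the discriminator on training samples and one that certifies a divergence lower bound on held-out samples. A minimal end-to-end sketch of the public API, mirroring the Laplace test above. The `ApproximateDp(epsilon=..., delta=...)` constructor is an assumption (the tests build this value through a local helper not shown here); everything else appears in the patch.

import numpy as np

from dp_auditorium.configs import privacy_property
from dp_auditorium.configs import property_tester_config as config
from dp_auditorium.testers import hockey_stick_tester as hst

rng = np.random.default_rng(0)

def laplace_mechanism(x, n_samples):
  # Scale-1 Laplace noise on a sensitivity-1 statistic, i.e. epsilon = 1.
  return rng.laplace(0.0, 1.0, n_samples) + x

training_options = hst.make_default_hs_training_config()
hs_config = config.HockeyStickPropertyTesterConfig(
    # Assumed constructor; adjust to however the configs module builds it.
    approximate_dp=privacy_property.ApproximateDp(epsilon=1.0, delta=0.01),
    training_config=training_options,
    evaluation_batch_size=1000,
)
tester = hst.HockeyStickPropertyTester(
    config=hs_config, base_model=hst.make_default_hs_base_model()
)
samples1 = laplace_mechanism(np.array([1]), 2000)
samples2 = laplace_mechanism(np.array([0]), 2000)
lower_bound = tester.estimate_lower_bound(samples1, samples2, 0.05)
print('privacy violation found:', tester.reject_property(lower_bound))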
diff --git a/python/dp_auditorium/dp_auditorium/testers/property_tester_utils.py b/python/dp_auditorium/dp_auditorium/testers/property_tester_utils.py
index b5b230af..dbe283fb 100644
--- a/python/dp_auditorium/dp_auditorium/testers/property_tester_utils.py
+++ b/python/dp_auditorium/dp_auditorium/testers/property_tester_utils.py
@@ -48,7 +48,7 @@ def split_train_test_samples(
   return samples[0 : n // 2, ...], samples[n // 2 :, ...]


-def validate_training_params(
+def validate_training_config(
     training_config: property_tester_config.TrainingConfig,
 ):
-  """Returns true if the training_params parameters are valid."""
+  """Validates `training_config`, raising a ValueError on invalid fields."""
diff --git a/python/dp_auditorium/dp_auditorium/testers/property_tester_utils_test.py b/python/dp_auditorium/dp_auditorium/testers/property_tester_utils_test.py
index b50467e6..60ed9500 100644
--- a/python/dp_auditorium/dp_auditorium/testers/property_tester_utils_test.py
+++ b/python/dp_auditorium/dp_auditorium/testers/property_tester_utils_test.py
@@ -34,30 +34,30 @@ def setUp(self):
     )

   @parameterized.parameters(-0.1, 0.0)
-  def test_validate_training_params_wrong_lr(self, learning_rate):
+  def test_validate_training_config_wrong_lr(self, learning_rate):
     self.training_config.optimizer_learning_rate = learning_rate
     with self.assertRaises(ValueError):
-      _ = property_tester_utils.validate_training_params(self.training_config)
+      _ = property_tester_utils.validate_training_config(self.training_config)

   @parameterized.parameters(-10, 0)
-  def test_validate_training_params_wrong_training_epochs(
+  def test_validate_training_config_wrong_training_epochs(
       self, training_epochs
   ):
     self.training_config.training_epochs = training_epochs
     with self.assertRaises(ValueError):
-      _ = property_tester_utils.validate_training_params(self.training_config)
+      _ = property_tester_utils.validate_training_config(self.training_config)

   @parameterized.parameters(-5, 0)
-  def test_validate_training_params_wrong_batch_size(self, batch_size):
+  def test_validate_training_config_wrong_batch_size(self, batch_size):
     self.training_config.batch_size = batch_size
     with self.assertRaises(ValueError):
-      _ = property_tester_utils.validate_training_params(self.training_config)
+      _ = property_tester_utils.validate_training_config(self.training_config)

   @parameterized.parameters(-0.1, 0.0)
-  def test_validate_training_params_wrong_bound(self, bound):
+  def test_validate_training_config_wrong_bound(self, bound):
     self.training_config.model_output_coordinate_bound = bound
     with self.assertRaises(ValueError):
-      _ = property_tester_utils.validate_training_params(self.training_config)
+      _ = property_tester_utils.validate_training_config(self.training_config)

   def test_split_train_test_samples_raises_exception(self):
     with self.assertRaises(ValueError):
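Note: the renamed validator keeps its contract, rejecting any non-positive field with a ValueError. A small usage sketch; the TrainingConfig keyword names follow the fields exercised in the tests above, but the exact constructor signature is an assumption.

from dp_auditorium.configs import property_tester_config
from dp_auditorium.testers import property_tester_utils

training_config = property_tester_config.TrainingConfig(
    training_epochs=10,
    optimizer_learning_rate=0.01,
    batch_size=100,
    model_output_coordinate_bound=1.0,
)
# Passes silently; any non-positive value above raises ValueError.
property_tester_utils.validate_training_config(training_config)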
diff --git a/python/dp_auditorium/dp_auditorium/testers/renyi_tester.py b/python/dp_auditorium/dp_auditorium/testers/renyi_tester.py
index 56b560a1..126fc5ae 100644
--- a/python/dp_auditorium/dp_auditorium/testers/renyi_tester.py
+++ b/python/dp_auditorium/dp_auditorium/testers/renyi_tester.py
@@ -16,103 +16,24 @@
 Functions to estimate Renyi divergence between samples of two distributions.
 """

-from typing import Dict, Optional, Union
+from typing import Dict

 import numpy as np
 import tensorflow as tf
 from typing_extensions import override

-from dp_auditorium import interfaces
 from dp_auditorium.configs import privacy_property
 from dp_auditorium.configs import property_tester_config
+from dp_auditorium.testers import divergence_tester
 from dp_auditorium.testers import property_tester_utils


-def _renyi_model_parameters_initializer(
-    config: property_tester_config.RenyiPropertyTesterConfig,
-    base_model: Optional[tf.keras.Model] = None,
-) -> dict[str, Union[float, int, None, tf.keras.Model]]:
-  """Initializes attributes for RenyiPropertyTester.
-
-  This function processes `config` to extract privacy parameters
-  and initialize the model parametrizing the Renyi divergence approximation. See
-  section 4.1. of https://arxiv.org/pdf/2307.05608.pdf for more details.
-
-  Args:
-    config: A RenyiPropertyTester configuration.
-    base_model: A keras model to use to parametrize the variational formulation
-      of the Renyi divergence.
-
-  Returns:
-    A dictionary with relevant attributes to initialize a RenyiPropertyTester.
-    The dictionary contains (1) a value `alpha` for the order of the Renyi
-    divergence being estimated, (2) the `test_threshold`, and (3) a `base_model`
-    keras model parametrizing the function space to estimate the Renyi
-    divergence.
-
-  Raises:
-    ValueError if the config sets two different alpha parameters
-    when testing Renyi DP or if the privacy property is different than pure or
-    Renyi DP.
-  """
-
-  if config.privacy_property.renyi_dp is not None:
-    privacy_type = 'renyi_dp'
-    alpha = config.privacy_property.renyi_dp.alpha
-    epsilon = config.privacy_property.renyi_dp.epsilon
-    if config.alpha != alpha:
-      raise ValueError(
-          'Alpha parameter for Renyi DP should be specified in'
-          ' privacy_tester_config.privacy_property. It was specified in'
-          ' config.alpha which is only used for Pure DP tests.'
-      )
-  elif config.privacy_property.pure_dp is not None:
-    privacy_type = 'pure_dp'
-    epsilon = config.privacy_property.pure_dp.epsilon
-    alpha = config.alpha
-  else:
-    raise ValueError(
-        'The specified privacy_property is not supported by RenyiTester.'
-    )
-
-  model_output_coordinate_bound = (
-      config.training_config.model_output_coordinate_bound
-  )
-
-  def scaled_tanh(x):
-    return model_output_coordinate_bound * tf.keras.activations.tanh(x)
-
-  if base_model is None:
-    base_model = tf.keras.models.Sequential([
-        tf.keras.layers.Dense(100, activation=scaled_tanh),
-        tf.keras.layers.Dense(100, activation=scaled_tanh),
-        tf.keras.layers.Dense(1),
-    ])
-
-  base_model.add(tf.keras.layers.Activation(scaled_tanh))
-
-  if privacy_type == 'renyi_dp':
-    threshold = epsilon
-  else:
-    threshold = min(epsilon, 2 * alpha * epsilon**2)
-  return {
-      'alpha': alpha,
-      'test_threshold': threshold,
-      'base_model': base_model,
-  }
-
-
-def _compute_error_from_gamma(gamma: float) -> float:
-  """Returns additive error from convenience variable gamma.
-
-  To estimate number of samples we allow for a multiplicative error `gamma` from
-  chernoff bound in https://arxiv.org/abs/2307.05608. This function converts the
-  multiplicative error to additive error.
-
-  Args:
-    gamma: Multiplicative error.
-  """
-  return np.log((1 + gamma) / (1 - gamma))
+def make_default_renyi_base_model() -> tf.keras.Model:
+  return tf.keras.models.Sequential([
+      tf.keras.layers.Dense(100, activation=tf.keras.activations.tanh),
+      tf.keras.layers.Dense(100, activation=tf.keras.activations.tanh),
+      tf.keras.layers.Dense(1),
+  ])


 def _compute_error_from_samples(
@@ -143,7 +64,8 @@ def _compute_error_from_samples(
       / num_samples
   )
   gamma = max(error_1, error_2)
-  return _compute_error_from_gamma(gamma)
+  error_from_gamma = np.log((1 + gamma) / (1 - gamma))
+  return error_from_gamma


 class RenyiModel(tf.keras.Model):
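Note: inlining `_compute_error_from_gamma` keeps the conversion next to its only caller. The math, for readers of the diff: if each empirical mean is within a multiplicative (1 +/- gamma) band of its true value (the Chernoff-bound guarantee from https://arxiv.org/abs/2307.05608), then the log of a ratio of such means is off by at most an additive log((1 + gamma) / (1 - gamma)). A standalone sketch of that identity:

import numpy as np

def additive_error_from_gamma(gamma: float) -> float:
  # Worst case: numerator overestimated by (1 + gamma), denominator
  # underestimated by (1 - gamma), so the log-ratio shifts by this much.
  return np.log((1 + gamma) / (1 - gamma))

# Small gamma gives roughly 2 * gamma of additive error; the error vanishes
# as gamma -> 0 and blows up as gamma -> 1, which is why tighter bounds
# need more test samples.
assert np.isclose(additive_error_from_gamma(0.01), 0.02, atol=1e-4)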
@@ -164,7 +86,7 @@ def train_step(
     trainable_vars = self.nn_model.trainable_variables
     d_loss = tape.gradient(loss, trainable_vars)
     self.optimizer.apply_gradients(zip(d_loss, trainable_vars))
-    return {'renyi': divergence}
+    return {'divergence': divergence}

   def call(
       self, data: tuple[np.ndarray, np.ndarray], training: bool = None
@@ -196,7 +118,7 @@ def call(
     return divergence


-class RenyiPropertyTester(interfaces.PropertyTester):
+class RenyiPropertyTester(divergence_tester.DivergencePropertyTester):
   """Renyi tester main class.

   RenyiTester computes a lower bound for the Renyi divergence using Algorithm 2
@@ -209,40 +131,56 @@ class RenyiPropertyTester(interfaces.PropertyTester):
   def __init__(
       self,
       config: property_tester_config.RenyiPropertyTesterConfig,
-      base_model: Optional[tf.keras.Model] = None,
+      base_model: tf.keras.Model,
   ):
     # Get privacy parameters
     if config.privacy_property.renyi_dp is not None:
       property_tester_utils.validate_renyi_dp_property(
           config.privacy_property.renyi_dp
       )
+      privacy_type = 'renyi_dp'
+      epsilon = config.privacy_property.renyi_dp.epsilon
+      alpha = config.privacy_property.renyi_dp.alpha
+      if config.alpha != alpha:
+        raise ValueError(
+            'Alpha parameter for Renyi DP should be specified in'
+            ' privacy_tester_config.privacy_property. It was specified in'
+            ' config.alpha which is only used for Pure DP tests.'
+        )
     elif config.privacy_property.pure_dp is not None:
       property_tester_utils.validate_pure_dp_property(
           config.privacy_property.pure_dp
       )
+      privacy_type = 'pure_dp'
+      epsilon = config.privacy_property.pure_dp.epsilon
+      alpha = config.alpha
     else:
       raise ValueError(
           'The specified privacy_property is not supported by'
           ' RenyiPropertyTester.'
       )
-    property_tester_utils.validate_training_params(config.training_config)
-    params = _renyi_model_parameters_initializer(
-        config=config,
-        base_model=base_model,
-    )
+    property_tester_utils.validate_training_config(config.training_config)
+
+    if privacy_type == 'renyi_dp':
+      self._initial_test_threshold = epsilon
+    else:
+      self._initial_test_threshold = min(epsilon, 2 * alpha * epsilon**2)

-    # Privacy test parameters.
     self._tested_property = config.privacy_property
+    self._alpha = alpha
+
+    self._training_config = config.training_config
+
     self._model_output_coordinate_bound = (
         config.training_config.model_output_coordinate_bound
     )
-    self._alpha = params['alpha']
-    self._test_threshold = params['test_threshold']

-    # Optimization parameters.
-    self._training_config = config.training_config
+    def scaled_tanh(x):
+      return self._model_output_coordinate_bound * tf.keras.activations.tanh(x)

-    self._renyi_model = RenyiModel(params['base_model'], self._alpha)
+    base_model.add(tf.keras.layers.Activation(scaled_tanh))
+
+    self._renyi_model = RenyiModel(base_model, self._alpha)
     self._renyi_model.compile(
         optimizer=tf.keras.optimizers.Adam(
             config.training_config.optimizer_learning_rate
@@ -250,11 +188,15 @@ def __init__(
     )
     self._divergence_train = []

+  @property
+  def _test_threshold(self) -> float:
+    return self._initial_test_threshold
+
   @property
   def privacy_property(self) -> privacy_property.PrivacyProperty:
     return self._tested_property

-  def _reinitialize_nn_model(self):
+  def _reset_model_weights(self):
     for layer in self._renyi_model.nn_model.layers:
       if hasattr(layer, 'kernel'):
         if layer.kernel is not None and hasattr(layer, 'kernel_initializer'):
@@ -263,77 +205,34 @@
       if layer.bias is not None and hasattr(layer, 'bias_initializer'):
         layer.bias.assign(layer.bias_initializer(tf.shape(layer.bias)))

-  def _optimize_renyi_divergence(
+  @override
+  def _get_optimized_divergence_estimation_model(
       self,
       samples_first_distribution: np.ndarray,
       samples_second_distribution: np.ndarray,
-      verbose: int = 0,
-  ) -> tf.Tensor:
-    """Renyi divergence computation.
-
-    Args:
-      samples_first_distribution: one dimensional array with samples.
-      samples_second_distribution: one dimensional arrays with samples and same
-        shape as as p.
-      verbose: whether to print training evolution, for details see
-        `tf.keras.mode.fit`.
-
-    Returns:
-      Estimated Renyi divergence on train samples.
-    """
-    self._reinitialize_nn_model()
+  ) -> tf.keras.Model:
+    self._reset_model_weights()
     self._renyi_model.fit(
         samples_first_distribution,
         samples_second_distribution,
         batch_size=self._training_config.batch_size,
        epochs=self._training_config.training_epochs,
-        verbose=verbose,
+        verbose=self._training_config.verbose,
     )
-    train_renyi = self._renyi_model.history.history['renyi'][-1]
-
-    return train_renyi
+    return self._renyi_model

-  def estimate_divergence_from_samples(
+  @override
+  def _compute_divergence_on_samples(
       self,
-      samples_1_train: np.ndarray,
-      samples_2_train: np.ndarray,
-      samples_1_test: np.ndarray,
-      samples_2_test: np.ndarray,
+      model: tf.keras.Model,
+      samples1_test: np.ndarray,
+      samples2_test: np.ndarray,
       failure_probability: float,
-      verbose: int,
-  ) -> tuple[tf.Tensor, tf.Tensor]:
-    """Estimates Renyi divergence from samples.
-
-    This method estimates the Renyi divergence beween two distributions. First
-    it optimizes over a function space determined by the RenyiModel and
-    then uses the learned function to estimate the Renyi divergence over test
-    samples.
-
-    Args:
-      samples_1_train: Samples from the first distribution used to find a
-        suitable set of parameters for `renyi_model`.
-      samples_2_train: Samples from the second distribution used to find a
-        suitable set of parameters for `renyi_model`.
-      samples_1_test: Samples from the first distribution used to estimate
-        divergence.
-      samples_2_test: Samples from the second distribution used to estimate
-        divergence.
-      failure_probability: P
-      verbose: integer passed to `fit` method for logging.
-
-    Returns:
-      A tuple where the first element is the train divergence and the second is
-      the estimated divergence lower bound.
-    """
-    # Find suitable model parameters.
-    divergence_train = self._optimize_renyi_divergence(
-        samples_1_train, samples_2_train, verbose=verbose
-    )
-
-    divergence_test = self._renyi_model((samples_1_test, samples_2_test))
+  ) -> float:
+    divergence_test = model((samples1_test, samples2_test))

     # Calculate lower end of confidence interval.
-    num_samples = samples_1_test.shape[0]
+    num_samples = min(samples1_test.shape[0], samples2_test.shape[0])
     error = _compute_error_from_samples(
         num_samples=num_samples,
         failure_probability=failure_probability,
@@ -342,34 +241,4 @@
     )
     divergence_test_lower_bound = divergence_test - error

-    return divergence_train, divergence_test_lower_bound
-
-  @override
-  def estimate_lower_bound(
-      self,
-      samples1: np.ndarray,
-      samples2: np.ndarray,
-      failure_probability: float,
-  ) -> float:
-    samples1_train, samples1_test = (
-        property_tester_utils.split_train_test_samples(samples1)
-    )
-    samples2_train, samples2_test = (
-        property_tester_utils.split_train_test_samples(samples2)
-    )
-
-    divergence_train, divergence_test = self.estimate_divergence_from_samples(
-        samples1_train,
-        samples2_train,
-        samples1_test,
-        samples2_test,
-        failure_probability,
-        verbose=0,
-    )
-    self._divergence_train.append(divergence_train)
-
-    return divergence_test.numpy()
-
-  @override
-  def reject_property(self, lower_bound: float) -> bool:
-    return lower_bound > self._test_threshold
+    return divergence_test_lower_bound
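Note: with this change `RenyiPropertyTester` only supplies the two hooks; the train/test split and thresholding move to the shared `DivergencePropertyTester` base class added in this patch (divergence_tester.py, not shown in this excerpt). A sketch of the template flow as reconstructed from the code deleted above; the actual base-class implementation may differ in details.

import numpy as np
from dp_auditorium.testers import property_tester_utils

def estimate_lower_bound_flow(tester, samples1, samples2, failure_probability):
  # Half of each sample set trains the divergence model; the held-out half
  # produces the high-confidence lower bound.
  s1_train, s1_test = property_tester_utils.split_train_test_samples(samples1)
  s2_train, s2_test = property_tester_utils.split_train_test_samples(samples2)
  model = tester._get_optimized_divergence_estimation_model(s1_train, s2_train)
  return float(
      tester._compute_divergence_on_samples(
          model, s1_test, s2_test, failure_probability
      )
  )

def reject_property_flow(tester, lower_bound):
  # Flag a violation when the certified lower bound exceeds the divergence
  # implied by the claimed guarantee; for pure DP audited through Renyi
  # divergence of order alpha, __init__ sets the threshold to
  # min(epsilon, 2 * alpha * epsilon**2).
  return lower_bound > tester._test_threshold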
diff --git a/python/dp_auditorium/dp_auditorium/testers/renyi_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/renyi_tester_test.py
index 74f7029d..6b7d079a 100644
--- a/python/dp_auditorium/dp_auditorium/testers/renyi_tester_test.py
+++ b/python/dp_auditorium/dp_auditorium/testers/renyi_tester_test.py
@@ -118,7 +118,8 @@ def setUp(self):
         alpha=3.0,
     )
     self.renyi_tester = renyi_tester.RenyiPropertyTester(
-        self.renyi_tester_config
+        self.renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
     )

   @parameterized.parameters(1.1, 1.5)
@@ -136,10 +137,17 @@ def test_returns_lower_bound_gaussian(self, alpha: float):
     x_test = self.rng.normal(0, sigma, (num_samples, 1))
     y_test = self.rng.normal(mu, sigma, (num_samples, 1))
     self.renyi_tester_config.alpha = alpha
-    tester = renyi_tester.RenyiPropertyTester(self.renyi_tester_config)
+    tester = renyi_tester.RenyiPropertyTester(
+        self.renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
+    )

-    _, divergence_test = tester.estimate_divergence_from_samples(
-        x, y, x_test, y_test, failure_probability=0.1, verbose=0
+    model = tester._get_optimized_divergence_estimation_model(x, y)
+    divergence_test = tester._compute_divergence_on_samples(
+        model,
+        x_test,
+        y_test,
+        failure_probability=0.1,
     )
     logging.info('Result divergence test: %.3f', divergence_test)
     logging.info('Expected divergence: %.3f', expected_divergence)
@@ -165,10 +173,18 @@ def test_returns_lower_bound_uniform(self, alpha: float):
     x_test = self.rng.uniform(low_1, high_1, (num_samples, 1))
     y_test = self.rng.uniform(low_2, high_2, (num_samples, 1))
     self.renyi_tester_config.alpha = alpha
-    tester = renyi_tester.RenyiPropertyTester(self.renyi_tester_config)
-    _, divergence_test = tester.estimate_divergence_from_samples(
-        x, y, x_test, y_test, failure_probability=0.1, verbose=0
+    tester = renyi_tester.RenyiPropertyTester(
+        self.renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
+    )
+
+    model = tester._get_optimized_divergence_estimation_model(x, y)
+    divergence_test = tester._compute_divergence_on_samples(
+        model,
+        x_test,
+        y_test,
+        failure_probability=0.1,
     )
     logging.info('Result divergence test: %.3f', divergence_test)
     logging.info('Expected divergence: %.3f', expected_divergence)
@@ -198,12 +214,18 @@ def test_returns_lower_bound_exponential(self, alpha: float):
     y_test = self.rng.exponential(lambda_2, (num_samples, 1))

     self.renyi_tester_config.alpha = alpha
-    tester = renyi_tester.RenyiPropertyTester(self.renyi_tester_config)
-
-    _, divergence_test = tester.estimate_divergence_from_samples(
-        x, y, x_test, y_test, failure_probability=0.1, verbose=0
+    tester = renyi_tester.RenyiPropertyTester(
+        self.renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
     )

+    model = tester._get_optimized_divergence_estimation_model(x, y)
+    divergence_test = tester._compute_divergence_on_samples(
+        model,
+        x_test,
+        y_test,
+        failure_probability=0.1,
+    )
     logging.info('Result divergence test: %.3f', divergence_test)
     logging.info('Expected divergence: %.3f', expected_divergence)
     self.assertLess(divergence_test, expected_divergence)
@@ -228,10 +250,17 @@ def test_returns_lower_bound_laplace(self, alpha: float):
     x_test = self.rng.laplace(mu_1, scale_1, (num_samples, 1))
     y_test = self.rng.laplace(mu_2, scale_2, (num_samples, 1))
     self.renyi_tester_config.alpha = alpha
-    tester = renyi_tester.RenyiPropertyTester(self.renyi_tester_config)
+    tester = renyi_tester.RenyiPropertyTester(
+        self.renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
+    )

-    _, divergence_test = tester.estimate_divergence_from_samples(
-        x, y, x_test, y_test, failure_probability=0.1, verbose=0
+    model = tester._get_optimized_divergence_estimation_model(x, y)
+    divergence_test = tester._compute_divergence_on_samples(
+        model,
+        x_test,
+        y_test,
+        failure_probability=0.1,
     )
     logging.info('Result divergence test: %.3f', divergence_test)
     logging.info('Expected divergence: %.3f', expected_divergence)
@@ -261,13 +290,10 @@ def test_renyi_model_parameters_initializer_wrong_property(self):
         training_config=self.training_config,
         privacy_property=approx_dp_privacy_property,
     )
-
-    with self.assertRaisesRegex(
-        ValueError,
-        'The specified privacy_property is not supported by RenyiTester.',
-    ):
-      _ = renyi_tester._renyi_model_parameters_initializer(
+    with self.assertRaises(ValueError):
+      _ = renyi_tester.RenyiPropertyTester(
           config=renyi_tester_config,
+          base_model=renyi_tester.make_default_renyi_base_model(),
       )

   @parameterized.parameters(
@@ -301,11 +327,12 @@ def test_renyi_model_parameters_initializer_sets_params(
         training_config=self.training_config,
         privacy_property=tested_property,
     )
-    params = renyi_tester._renyi_model_parameters_initializer(
+    tester = renyi_tester.RenyiPropertyTester(
         config=renyi_tester_config,
+        base_model=renyi_tester.make_default_renyi_base_model(),
     )
-    self.assertAlmostEqual(params['test_threshold'], threshold, places=6)
-    self.assertAlmostEqual(params['alpha'], alpha, places=6)
+    self.assertAlmostEqual(tester._test_threshold, threshold, places=6)
+    self.assertAlmostEqual(tester._alpha, alpha, places=6)

   def test_computes_error_from_samples(self):
     alpha = 2
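Note: the distributional tests above compare the certified lower bound against closed-form Rényi divergences. For the Gaussian case the standard identity is D_alpha(N(mu1, sigma^2) || N(mu2, sigma^2)) = alpha * (mu1 - mu2)^2 / (2 * sigma^2), presumably what `expected_divergence` encodes in test_returns_lower_bound_gaussian (its computation is outside this excerpt). A small check of that value:

def renyi_divergence_equal_variance_gaussians(
    mu_1: float, mu_2: float, sigma: float, alpha: float
) -> float:
  # Closed form for Renyi divergence between equal-variance Gaussians.
  return alpha * (mu_1 - mu_2) ** 2 / (2 * sigma**2)

# Means 1 apart, sigma = 1, alpha = 1.5: true divergence is 0.75, and the
# estimated lower bound must stay below it for the test to pass.
assert renyi_divergence_equal_variance_gaussians(0.0, 1.0, 1.0, 1.5) == 0.75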
diff --git a/python/dp_auditorium/requirements.in b/python/dp_auditorium/requirements.in
index ea1d3510..81c07f35 100644
--- a/python/dp_auditorium/requirements.in
+++ b/python/dp_auditorium/requirements.in
@@ -25,3 +25,5 @@ googleapis-common-protos~=1.56.4
 pysqlite3~=0.5.2  # Used for multi-platform compatibility.
 tensorflow-io-gcs-filesystem~=0.36.0
+pipeline-dp
+python-dp
\ No newline at end of file
diff --git a/python/dp_auditorium/requirements.txt b/python/dp_auditorium/requirements.txt
index aeaddde9..faa18bee 100644
--- a/python/dp_auditorium/requirements.txt
+++ b/python/dp_auditorium/requirements.txt
@@ -78,7 +78,7 @@ grpcio-tools==1.60.0
     # via google-vizier
 h5py==3.10.0
     # via tensorflow
-idna==3.6
+idna==3.7
     # via requests
 importlib-metadata==7.0.1
     # via -r requirements.in
@@ -159,6 +159,8 @@ orbax-checkpoint==0.4.4
     # via flax
 packaging==22.0
     # via tensorflow
+pipeline-dp
+    # via -r requirements.in
 portpicker==1.6.0
     # via google-vizier
 protobuf==4.23.4
@@ -176,6 +178,8 @@ pyasn1==0.5.1
     # via
     #   pyasn1-modules
     #   rsa
+python-dp
+    # via -r requirements.in
 pyasn1-modules==0.3.0
     # via google-auth
 pygments==2.17.2