From 12c1279bea5c080c63770ab41c645c6e82076043 Mon Sep 17 00:00:00 2001 From: Differential Privacy Team Date: Tue, 13 Feb 2024 10:36:48 -0800 Subject: [PATCH] Smaller fixes for DP Auditorium * Refactor DP Auditorium example * Fix dependencies * Fix README PiperOrigin-RevId: 606329824 Change-Id: I425c7d754416664f710f1e46db338d4241d145ef GitOrigin-RevId: f952488cad35fad5b15fb1306d4b62e5e6a79102 --- README.md | 2 + python/dp_auditorium/README.md | 40 +++++---- .../dp_auditorium/examples/BUILD.bazel | 25 ++++++ .../examples/run_mean_mechanism_example.py | 88 +++++++++++-------- .../run_mean_mechanism_example_test.py | 51 +++++++++++ .../testers/hockey_stick_tester.py | 18 ++-- .../testers/hockey_stick_tester_test.py | 12 +++ python/dp_auditorium/requirements.in | 36 ++++---- python/dp_auditorium/requirements.txt | 13 ++- python/dp_auditorium/setup.py | 3 +- 10 files changed, 209 insertions(+), 79 deletions(-) create mode 100644 python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py diff --git a/README.md b/README.md index 83b21fe3..2aa64621 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ private statistics over datasets. It contains the following tools. tracking privacy budget. * A [command line interface](examples/zetasql) for running differentially private SQL queries with [ZetaSQL](https://github.com/google/zetasql). +* [DP Auditorium](python/dp_auditorium) is a library for auditing differential + privacy guarantees. To get started on generating differentially private data, we recommend you follow the [Privacy on Beam codelab](https://codelabs.developers.google.com/codelabs/privacy-on-beam/). 
diff --git a/python/dp_auditorium/README.md b/python/dp_auditorium/README.md index 0a604f47..4a61b9e7 100644 --- a/python/dp_auditorium/README.md +++ b/python/dp_auditorium/README.md @@ -30,35 +30,39 @@ Details on the signature of the above objects, can be found in the An illustrative example is available in `examples/run_mean_mechanism_example.py`. This binary defines a Hockey-Stick divergence test as the property tester and a dataset generator that employs a -Gaussian process bandit (using OSS Vizier) for suggesting datasets. -Subsequently, the runner is instantiated with these two objects to conduct a -test on a non-private mean mechanism. +random search (using [Vizier](https://github.com/google/vizier)) for suggesting +datasets. Subsequently, the runner is instantiated with these two objects to +conduct a test on a non-private mean mechanism. There are two ways to run this, either via Bazel or after installing the library -using `setup.py`. +using `pip install`. Before install, please ensure that your machine has the +`sqlite3` library installed, e.g., `sudo apt-get install libsqlite3-dev` on Ubuntu +machines. -### Run with bazel +### Run with Bazelisk For the first option, you need to have -[Bazel installed](https://docs.bazel.build/versions/main/install.html). -Once that is done, run: +[Bazelisk installed](https://github.com/bazelbuild/bazelisk). Once that is done, +run: ``` -bazel build dp_auditorium:all -bazel run dp_auditorium/examples:run_mean_mechanism_example +bazelisk build dp_auditorium:all +bazelisk run dp_auditorium/examples:run_mean_mechanism_example ``` -### Run via setup.py +### Run via pip -For the second option, you will need the -[setuptools package](https://pypi.org/project/setuptools/) installed. -To ensure this, you may run +For the second option, you will need the [setuptools +package](https://pypi.org/project/setuptools/) installed. To ensure this, you +may run `pip install --upgrade setuptools`. 
Then, to demonstrate our example, +run: ``` -pip install --upgrade setuptools +python -m pip install . +python dp_auditorium/examples/run_mean_mechanism_example.py ``` -Then, to demonstrate our example, run: +Some macOS users may run into issues linking SQLite3 with Python during the +installation of DP Auditorium. In these cases, users may have to pass the +relevant compiler options to `pip` directly. For example, ``` -python setup.py install -python dp_auditorium/examples/run_mean_mechanism_example.py +CFLAGS=-Wno-error=implicit-function-declaration pip install . ``` - diff --git a/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel b/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel index e5f7b1b4..3f901e31 100644 --- a/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel +++ b/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel @@ -29,6 +29,31 @@ py_binary( "//dp_auditorium/mechanisms:mean", "//dp_auditorium/testers:hockey_stick_tester", requirement("absl-py"), + requirement("jax"), + requirement("jaxopt"), requirement("numpy"), + requirement("tensorflow"), + requirement("tensorflow_probability"), + ], +) + +py_test( + name = "run_mean_mechanism_example_test", + srcs = ["run_mean_mechanism_example_test.py"], + deps = [ + ":run_mean_mechanism_example", + "//dp_auditorium/generators:vizier_dataset_generator", + requirement("absl-py"), + requirement("google-vizier"), + requirement("numpy"), + # Needed to fix strange dependency bugs in Vizier. Order is important! 
+ requirement("equinox"), + requirement("flax"), + requirement("googleapis-common-protos"), + requirement("jax"), + requirement("jaxlib"), + requirement("pysqlite3"), + "//dp_auditorium:interfaces", + "//dp_auditorium/configs", ], ) diff --git a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py index ddc8829e..89068130 100644 --- a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py +++ b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py @@ -14,12 +14,13 @@ """Example binary running privacy tests for a DP mean mechanism.""" from collections.abc import Sequence -import os +import time +from typing import Callable from absl import app from absl import flags -from absl import logging import numpy as np +import tensorflow as tf from dp_auditorium import privacy_test_runner from dp_auditorium.configs import dataset_generator_config @@ -32,28 +33,51 @@ from dp_auditorium.testers import hockey_stick_tester -_OUTPUT_DIR = flags.DEFINE_string("output_dir", "", "Output directory") -_EXPERIMENT_NAME = flags.DEFINE_string("experiment_name", "", "Experiment name") -_EPSILON = flags.DEFINE_float("epsilon", 0.01, "Privacy parameter") +_EPSILON = flags.DEFINE_float("epsilon", 1.0, "Privacy parameter") _DELTA = flags.DEFINE_float("delta", 0.0, "Privacy parameter") _SEED = flags.DEFINE_integer( "seed", 0, "Seed to initialize random numbers generator." 
) -def main(argv: Sequence[str]) -> None: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") +def default_generator_factory( + config: dataset_generator_config.VizierDatasetGeneratorConfig, +) -> vizier_dataset_generator.VizierScalarDataAddRemoveGenerator: + return vizier_dataset_generator.VizierScalarDataAddRemoveGenerator( + config=config + ) + - rng = np.random.default_rng(seed=_SEED.value) +def mean_mechanism_report( + epsilon: float, + delta: float, + seed: int, + generator_factory: Callable[ + [dataset_generator_config.VizierDatasetGeneratorConfig], + vizier_dataset_generator.VizierScalarDataAddRemoveGenerator, + ] = default_generator_factory, +) -> privacy_test_runner_config.PrivacyTestRunnerResults: + """Runs the example code for a mean mechanism. + + Args: + epsilon: standard DP parameter. + delta: standard DP parameter. + seed: seed to initialize the random number generator. + generator_factory: factory to create a generator; to be replaced in tests + + Returns: + The result of the example code as PrivacyTestRunnerResults. + """ + rng = np.random.default_rng(seed=seed) + tf.random.set_seed(seed) # Configuration for a non-private mean mechanism that uses the true number of - # points to calculate the average. + # points to calculate the average and the scale of the noise. 
mech_config = mechanism_config.MeanMechanismConfig( - epsilon=_EPSILON.value, - delta=_DELTA.value, + epsilon=epsilon, + delta=delta, use_noised_counts_for_calculating_mean=False, - use_noised_counts_for_calculating_noise_scale=True, + use_noised_counts_for_calculating_noise_scale=False, min_value=0.0, max_value=1.0, ) @@ -67,8 +91,8 @@ def main(argv: Sequence[str]) -> None: tester_config = property_tester_config.HockeyStickPropertyTesterConfig( training_config=hockey_stick_tester.make_default_hs_training_config(), approximate_dp=privacy_property.ApproximateDp( - epsilon=_EPSILON.value, - delta=_DELTA.value, + epsilon=epsilon, + delta=delta, ), ) # Initialize a classifier model for the Hockey-Stick property tester. @@ -80,28 +104,25 @@ def main(argv: Sequence[str]) -> None: ) # Configuration for dataset generator. It generates neighboring datasets under - # the add/remove definition. + # the add/remove definition. Unique study name prevents using cached results + # from previous runs. generator_config = dataset_generator_config.VizierDatasetGeneratorConfig( - study_name="non-private-mean-hockey-stick-test", + study_name=str(time.time()), study_owner="owner", num_vizier_parameters=2, data_type=dataset_generator_config.DataType.DATA_TYPE_FLOAT, min_value=-1.0, max_value=1.0, - search_algorithm="GAUSSIAN_PROCESS_BANDIT", + search_algorithm="RANDOM_SEARCH", metric_name="hockey_stick_divergence", ) # Initialize the dataset generator. - dataset_generator = ( - vizier_dataset_generator.VizierScalarDataAddRemoveGenerator( - config=generator_config, - ) - ) + dataset_generator = generator_factory(generator_config) # Configuration for the test runner. 
test_runner_config = privacy_test_runner_config.PrivacyTestRunnerConfig( property_tester=privacy_test_runner_config.PropertyTester.HOCKEY_STICK_TESTER, - max_num_trials=5, + max_num_trials=10, failure_probability=0.05, num_samples=10_000, # Apply a hyperbolic tangent function to the output of the mechanism @@ -114,22 +135,17 @@ def main(argv: Sequence[str]) -> None: property_tester=property_tester, ) - results = test_runner.test_privacy(mechanism, "non-private-mean-mechanism") + return test_runner.test_privacy(mechanism, "non-private-mean-mechanism") - logging.info("\nResults: \n") - logging.info(results) - # Write results to the output directory. - if not os.path.exists(_OUTPUT_DIR.value): - os.makedirs(_OUTPUT_DIR.value) - logging.info("Directory %s created successfully", _OUTPUT_DIR.value) - else: - print(f"Directory '{_OUTPUT_DIR.value}' already exists") +def main(argv: Sequence[str]) -> None: + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") - output_file_name = os.path.join(_OUTPUT_DIR.value, _EXPERIMENT_NAME.value) - with open(output_file_name, mode="w") as f: - f.write(str(results)) + results = mean_mechanism_report(_EPSILON.value, _DELTA.value, _SEED.value) + print(" \nResults: \n") + print(results) if __name__ == "__main__": app.run(main) diff --git a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py new file mode 100644 index 00000000..5054c3f9 --- /dev/null +++ b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py @@ -0,0 +1,51 @@ +# Copyright 2024 Google LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from absl.testing import absltest +import numpy as np +from vizier.service import clients + +from dp_auditorium import interfaces +from dp_auditorium.configs import dataset_generator_config +from dp_auditorium.examples.run_mean_mechanism_example import mean_mechanism_report +from dp_auditorium.generators import vizier_dataset_generator + + +class StubVizierGenerator( + vizier_dataset_generator.VizierScalarDataAddRemoveGenerator +): + + def get_neighboring_datasets_from_vizier_params( + self, vizier_params: np.ndarray + ) -> interfaces.NeighboringDatasetsType: + return np.ones(2), np.ones(2) + + +class RunMeanMechanismExampleTest(absltest.TestCase): + + def test_generates_result(self): + clients.environment_variables.servicer_use_sql_ram() + output = mean_mechanism_report( + 0.1, + 0.1, + 1, + lambda config: vizier_dataset_generator.VizierScalarDataAddRemoveGenerator( + config=config + ), + ) + self.assertNotEmpty(str(output)) + + +if __name__ == "__main__": + absltest.main() diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py index faf27a43..dfdbbf75 100644 --- a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py +++ b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py @@ -26,7 +26,8 @@ import tensorflow as tf from dp_auditorium import interfaces -from dp_auditorium.configs import property_tester_config as config +from dp_auditorium.configs import privacy_property +from dp_auditorium.configs import property_tester_config from 
dp_auditorium.testers import property_tester_utils @@ -50,8 +51,8 @@ class HockeyStickDivergenceTrainingOptions: verbose: int -def make_default_hs_training_config() -> config.TrainingConfig: - return config.TrainingConfig( +def make_default_hs_training_config() -> property_tester_config.TrainingConfig: + return property_tester_config.TrainingConfig( training_epochs=2, optimizer_learning_rate=1e-2, batch_size=100, @@ -69,7 +70,7 @@ def make_default_hs_base_model() -> tf.keras.Model: def make_training_options_from_config( - training_config: config.TrainingConfig, + training_config: property_tester_config.TrainingConfig, ): return HockeyStickDivergenceTrainingOptions( num_epochs=training_config.training_epochs, @@ -123,7 +124,7 @@ class must return logits. def __init__( self, - config: config.HockeyStickPropertyTesterConfig, + config: property_tester_config.HockeyStickPropertyTesterConfig, base_model: tf.keras.Model, ): """Initializes the instance. @@ -140,12 +141,18 @@ def __init__( self._base_model = base_model self._epsilon = config.approximate_dp.epsilon self._delta = config.approximate_dp.delta + self._approximate_dp = config.approximate_dp self._has_called_fit = False self._training_options = make_training_options_from_config( config.training_config ) self.initialize(self._training_options) + @property + def privacy_property(self) -> privacy_property.PrivacyProperty: + """The privacy guarantee that the tester is being used to test for.""" + return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp) + def initialize(self, training_options: HockeyStickDivergenceTrainingOptions): """Compiles internal model. @@ -225,6 +232,7 @@ def _fit( epochs: Number of epochs to train for. verbose: Option passed to keras trainer. 
""" + self.initialize(self._training_options) features, labels = self._generate_inputs_to_model(samples1, samples2) self._base_model.fit( features, diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py index 42eb1037..00b2a3b3 100644 --- a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py +++ b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py @@ -282,6 +282,18 @@ def test_assert_privacy_violation(self): self.assertTrue(hsdt.reject_property(0.11)) self.assertFalse(hsdt.reject_property(0.09)) + def test_privacy_property(self): + training_options = hst.make_default_hs_training_config() + hs_config = config.HockeyStickPropertyTesterConfig( + approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.1), + training_config=training_options, + ) + hs_tester = hst.HockeyStickPropertyTester( + config=hs_config, + base_model=hst.make_default_hs_base_model()) + self.assertEqual( + hs_config.approximate_dp, hs_tester.privacy_property.approximate_dp + ) if __name__ == "__main__": absltest.main() diff --git a/python/dp_auditorium/requirements.in b/python/dp_auditorium/requirements.in index 0723d3fc..ea1d3510 100644 --- a/python/dp_auditorium/requirements.in +++ b/python/dp_auditorium/requirements.in @@ -2,22 +2,26 @@ # command `python3 -m piptools compile` from this directory, and modify the # last line in the generated `requirements.txt` file to `setuptools~=69.0.3` # in order to include `setuptools` as one of the package requirements. 
-absl-py<=1.0.0 -etils<=1.5.2 +absl-py~=1.0.0 +etils~=1.5.2 # v0.1.7 uses `dataclass.kw_only` which is unsupported in Python 3.9 -google-vizier<0.1.7 -importlib-metadata<=7.0.1 -numpy<=1.26.3 -scipy<=1.11.4 -tensorflow<=2.15.0 -tensorflow-probability<=0.23.0 -typing-extensions<=4.9.0 +google-vizier~=0.1.6 +importlib-metadata~=7.0.1 +numpy~=1.26.3 +scipy~=1.11.4 +tensorflow~=2.15.0 +tensorflow-probability~=0.23.0 +typing-extensions~=4.9.0 # Required to run on older Ubuntu machines. -urllib3 < 2.0.0 +urllib3~=1.26.18 # Used only for google-vizier compatibility. -equinox<=0.11.3 -flax<=0.8.0 -jax<=0.4.23 -jaxlib<=0.4.23 -google-api-core<=2.15.0 -pysqlite3<=0.5.2 +equinox~=0.11.3 +flax~=0.8.0 +jax~=0.4.23 +jaxlib~=0.4.23 +jaxopt~=0.8.3 +google-api-core~=2.15.0 +googleapis-common-protos~=1.56.4 +pysqlite3~=0.5.2 +# Used for multi-platform compatibility. +tensorflow-io-gcs-filesystem~=0.36.0 diff --git a/python/dp_auditorium/requirements.txt b/python/dp_auditorium/requirements.txt index 7f9c4d82..aeaddde9 100644 --- a/python/dp_auditorium/requirements.txt +++ b/python/dp_auditorium/requirements.txt @@ -63,6 +63,7 @@ google-vizier==0.1.6 # via -r requirements.in googleapis-common-protos==1.56.4 # via + # -r requirements.in # google-api-core # google-vizier greenlet==3.0.3 @@ -89,14 +90,18 @@ jax==0.4.23 # chex # equinox # flax + # jaxopt # optax # orbax-checkpoint jaxlib==0.4.23 # via # -r requirements.in # chex + # jaxopt # optax # orbax-checkpoint +jaxopt==0.8.3 + # via -r requirements.in jaxtyping==0.2.25 # via equinox keras==2.15.0 @@ -131,6 +136,7 @@ numpy==1.26.3 # h5py # jax # jaxlib + # jaxopt # jaxtyping # ml-dtypes # opt-einsum @@ -196,6 +202,7 @@ scipy==1.11.4 # -r requirements.in # jax # jaxlib + # jaxopt six==1.16.0 # via # absl-py @@ -214,8 +221,10 @@ tensorflow==2.15.0 # via -r requirements.in tensorflow-estimator==2.15.0 # via tensorflow -tensorflow-io-gcs-filesystem==0.35.0 - # via tensorflow +tensorflow-io-gcs-filesystem==0.36.0 + # via + # -r 
requirements.in + # tensorflow tensorflow-probability==0.23.0 # via -r requirements.in tensorstore==0.1.45 diff --git a/python/dp_auditorium/setup.py b/python/dp_auditorium/setup.py index 7e971f35..5e786bc8 100644 --- a/python/dp_auditorium/setup.py +++ b/python/dp_auditorium/setup.py @@ -59,9 +59,8 @@ def _read_description(path): return open(os.path.join(here, path)).read().split("## Examples")[0] - setuptools.setup( - name="dp-testing", + name="dp-auditorium", version=_get_version(), author="Google Differential Privacy Team", author_email="dp-open-source@google.com",