Smaller fixes for DP Auditorium

* Refactor DP Auditorium example
* Fix dependencies
* Fix README

PiperOrigin-RevId: 606329824
Change-Id: I425c7d754416664f710f1e46db338d4241d145ef
GitOrigin-RevId: f952488cad35fad5b15fb1306d4b62e5e6a79102
google · Feb 13, 2024 · 12c1279 · 12c1279
1 parent 4ca99c0
commit 12c1279
Show file tree

Hide file tree

Showing 10 changed files with 209 additions and 79 deletions.
diff --git a/README.md b/README.md
@@ -18,6 +18,8 @@ private statistics over datasets. It contains the following tools.
   tracking privacy budget.
 * A [command line interface](examples/zetasql) for running differentially
   private SQL queries with [ZetaSQL](https://github.com/google/zetasql).
+* [DP Auditorium](python/dp_auditorium) is a library for auditing differential
+  privacy guarantees.
 
 To get started on generating differentially private data, we recommend you follow
 the [Privacy on Beam codelab](https://codelabs.developers.google.com/codelabs/privacy-on-beam/).

diff --git a/python/dp_auditorium/README.md b/python/dp_auditorium/README.md
@@ -30,35 +30,39 @@ Details on the signature of the above objects, can be found in the
 An illustrative example is available in
 `examples/run_mean_mechanism_example.py`. This binary defines a Hockey-Stick
 divergence test as the property tester and a dataset generator that employs a
-Gaussian process bandit (using OSS Vizier) for suggesting datasets.
-Subsequently, the runner is instantiated with these two objects to conduct a
-test on a non-private mean mechanism.
+random search (using [Vizier](https://github.com/google/vizier)) for suggesting
+datasets. Subsequently, the runner is instantiated with these two objects to
+conduct a test on a non-private mean mechanism.
 
 There are two ways to run this, either via Bazel or after installing the library
-using `setup.py`.
+using `pip install`. Before installing, please ensure that your machine has the
+`sqlite3` library installed, e.g., `sudo apt-get install libsqlite3-dev` on Ubuntu
+machines.
 
-### Run with bazel
+### Run with Bazelisk
 
 For the first option, you need to have
-[Bazel installed](https://docs.bazel.build/versions/main/install.html).
-Once that is done, run:
+[Bazelisk installed](https://github.com/bazelbuild/bazelisk). Once that is done,
+run:
 
 ```
-bazel build dp_auditorium:all
-bazel run dp_auditorium/examples:run_mean_mechanism_example
+bazelisk build dp_auditorium:all
+bazelisk run dp_auditorium/examples:run_mean_mechanism_example
 ```
 
-### Run via setup.py
+### Run via pip
 
-For the second option, you will need the
-[setuptools package](https://pypi.org/project/setuptools/) installed.
-To ensure this, you may run
+For the second option, you will need the [setuptools
+package](https://pypi.org/project/setuptools/) installed. To ensure this, you
+may run `pip install --upgrade setuptools`. Then, to demonstrate our example,
+run:
 ```
-pip install --upgrade setuptools
+python -m pip install .
+python dp_auditorium/examples/run_mean_mechanism_example.py
 ```
-Then, to demonstrate our example, run:
+Some macOS users may run into issues linking SQLite3 with Python during the
+installation of DP Auditorium. In these cases, users may have to pass the
+relevant C compiler flags to `pip` via `CFLAGS`. For example,
 ```
-python setup.py install
-python dp_auditorium/examples/run_mean_mechanism_example.py
+CFLAGS=-Wno-error=implicit-function-declaration pip install .
 ```
-
diff --git a/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel b/python/dp_auditorium/dp_auditorium/examples/BUILD.bazel
@@ -29,6 +29,31 @@ py_binary(
         "//dp_auditorium/mechanisms:mean",
         "//dp_auditorium/testers:hockey_stick_tester",
         requirement("absl-py"),
+        requirement("jax"),
+        requirement("jaxopt"),
         requirement("numpy"),
+        requirement("tensorflow"),
+        requirement("tensorflow_probability"),
+    ],
+)
+
+py_test(
+    name = "run_mean_mechanism_example_test",
+    srcs = ["run_mean_mechanism_example_test.py"],
+    deps = [
+        ":run_mean_mechanism_example",
+        "//dp_auditorium/generators:vizier_dataset_generator",
+        requirement("absl-py"),
+        requirement("google-vizier"),
+        requirement("numpy"),
+        # Needed to fix strange dependency bugs in Vizier. Order is important!
+        requirement("equinox"),
+        requirement("flax"),
+        requirement("googleapis-common-protos"),
+        requirement("jax"),
+        requirement("jaxlib"),
+        requirement("pysqlite3"),
+        "//dp_auditorium:interfaces",
+        "//dp_auditorium/configs",
     ],
 )
diff --git a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example.py
@@ -14,12 +14,13 @@
 """Example binary running privacy tests for a DP mean mechanism."""
 
 from collections.abc import Sequence
-import os
+import time
+from typing import Callable
 
 from absl import app
 from absl import flags
-from absl import logging
 import numpy as np
+import tensorflow as tf
 
 from dp_auditorium import privacy_test_runner
 from dp_auditorium.configs import dataset_generator_config
@@ -32,28 +33,51 @@
 from dp_auditorium.testers import hockey_stick_tester
 
 
-_OUTPUT_DIR = flags.DEFINE_string("output_dir", "", "Output directory")
-_EXPERIMENT_NAME = flags.DEFINE_string("experiment_name", "", "Experiment name")
-_EPSILON = flags.DEFINE_float("epsilon", 0.01, "Privacy parameter")
+_EPSILON = flags.DEFINE_float("epsilon", 1.0, "Privacy parameter")
 _DELTA = flags.DEFINE_float("delta", 0.0, "Privacy parameter")
 _SEED = flags.DEFINE_integer(
     "seed", 0, "Seed to initialize random numbers generator."
 )
 
 
-def main(argv: Sequence[str]) -> None:
-  if len(argv) > 1:
-    raise app.UsageError("Too many command-line arguments.")
+def default_generator_factory(
+    config: dataset_generator_config.VizierDatasetGeneratorConfig,
+) -> vizier_dataset_generator.VizierScalarDataAddRemoveGenerator:
+  return vizier_dataset_generator.VizierScalarDataAddRemoveGenerator(
+      config=config
+  )
+
 
-  rng = np.random.default_rng(seed=_SEED.value)
+def mean_mechanism_report(
+    epsilon: float,
+    delta: float,
+    seed: int,
+    generator_factory: Callable[
+        [dataset_generator_config.VizierDatasetGeneratorConfig],
+        vizier_dataset_generator.VizierScalarDataAddRemoveGenerator,
+    ] = default_generator_factory,
+) -> privacy_test_runner_config.PrivacyTestRunnerResults:
+  """Runs the example code for a mean mechanism.
+
+  Args:
+    epsilon: standard DP parameter.
+    delta: standard DP parameter.
+    seed: seed to initialize the random number generator.
+    generator_factory: factory to create a generator; to be replaced in tests
+
+  Returns:
+    The result of the example code as PrivacyTestRunnerResults.
+  """
+  rng = np.random.default_rng(seed=seed)
+  tf.random.set_seed(seed)
 
   # Configuration for a non-private mean mechanism that uses the true number of
-  # points to calculate the average.
+  # points to calculate the average and the scale of the noise.
   mech_config = mechanism_config.MeanMechanismConfig(
-      epsilon=_EPSILON.value,
-      delta=_DELTA.value,
+      epsilon=epsilon,
+      delta=delta,
       use_noised_counts_for_calculating_mean=False,
-      use_noised_counts_for_calculating_noise_scale=True,
+      use_noised_counts_for_calculating_noise_scale=False,
       min_value=0.0,
       max_value=1.0,
   )
@@ -67,8 +91,8 @@ def main(argv: Sequence[str]) -> None:
   tester_config = property_tester_config.HockeyStickPropertyTesterConfig(
       training_config=hockey_stick_tester.make_default_hs_training_config(),
       approximate_dp=privacy_property.ApproximateDp(
-          epsilon=_EPSILON.value,
-          delta=_DELTA.value,
+          epsilon=epsilon,
+          delta=delta,
       ),
   )
   # Initialize a classifier model for the Hockey-Stick property tester.
@@ -80,28 +104,25 @@ def main(argv: Sequence[str]) -> None:
   )
 
   # Configuration for dataset generator. It generates neighboring datasets under
-  # the add/remove definition.
+  # the add/remove definition. Unique study name prevents using cached results
+  # from previous runs.
   generator_config = dataset_generator_config.VizierDatasetGeneratorConfig(
-      study_name="non-private-mean-hockey-stick-test",
+      study_name=str(time.time()),
       study_owner="owner",
       num_vizier_parameters=2,
       data_type=dataset_generator_config.DataType.DATA_TYPE_FLOAT,
       min_value=-1.0,
       max_value=1.0,
-      search_algorithm="GAUSSIAN_PROCESS_BANDIT",
+      search_algorithm="RANDOM_SEARCH",
       metric_name="hockey_stick_divergence",
   )
   # Initialize the dataset generator.
-  dataset_generator = (
-      vizier_dataset_generator.VizierScalarDataAddRemoveGenerator(
-          config=generator_config,
-      )
-  )
+  dataset_generator = generator_factory(generator_config)
 
   # Configuration for the test runner.
   test_runner_config = privacy_test_runner_config.PrivacyTestRunnerConfig(
       property_tester=privacy_test_runner_config.PropertyTester.HOCKEY_STICK_TESTER,
-      max_num_trials=5,
+      max_num_trials=10,
       failure_probability=0.05,
       num_samples=10_000,
       # Apply a hyperbolic tangent function to the output of the mechanism
@@ -114,22 +135,17 @@ def main(argv: Sequence[str]) -> None:
       property_tester=property_tester,
   )
 
-  results = test_runner.test_privacy(mechanism, "non-private-mean-mechanism")
+  return test_runner.test_privacy(mechanism, "non-private-mean-mechanism")
 
-  logging.info("\nResults: \n")
-  logging.info(results)
 
-  # Write results to the output directory.
-  if not os.path.exists(_OUTPUT_DIR.value):
-    os.makedirs(_OUTPUT_DIR.value)
-    logging.info("Directory %s created successfully", _OUTPUT_DIR.value)
-  else:
-    print(f"Directory '{_OUTPUT_DIR.value}' already exists")
+def main(argv: Sequence[str]) -> None:
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
 
-  output_file_name = os.path.join(_OUTPUT_DIR.value, _EXPERIMENT_NAME.value)
-  with open(output_file_name, mode="w") as f:
-    f.write(str(results))
+  results = mean_mechanism_report(_EPSILON.value, _DELTA.value, _SEED.value)
 
+  print(" \nResults: \n")
+  print(results)
 
 if __name__ == "__main__":
   app.run(main)
diff --git a/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py b/python/dp_auditorium/dp_auditorium/examples/run_mean_mechanism_example_test.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from absl.testing import absltest
+import numpy as np
+from vizier.service import clients
+
+from dp_auditorium import interfaces
+from dp_auditorium.configs import dataset_generator_config
+from dp_auditorium.examples.run_mean_mechanism_example import mean_mechanism_report
+from dp_auditorium.generators import vizier_dataset_generator
+
+
+class StubVizierGenerator(
+    vizier_dataset_generator.VizierScalarDataAddRemoveGenerator
+):
+
+  def get_neighboring_datasets_from_vizier_params(
+      self, vizier_params: np.ndarray
+  ) -> interfaces.NeighboringDatasetsType:
+    return np.ones(2), np.ones(2)
+
+
+class RunMeanMechanismExampleTest(absltest.TestCase):
+
+  def test_generates_result(self):
+    clients.environment_variables.servicer_use_sql_ram()
+    output = mean_mechanism_report(
+        0.1,
+        0.1,
+        1,
+        lambda config: vizier_dataset_generator.VizierScalarDataAddRemoveGenerator(
+            config=config
+        ),
+    )
+    self.assertNotEmpty(str(output))
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester.py
@@ -26,7 +26,8 @@
 import tensorflow as tf
 
 from dp_auditorium import interfaces
-from dp_auditorium.configs import property_tester_config as config
+from dp_auditorium.configs import privacy_property
+from dp_auditorium.configs import property_tester_config
 from dp_auditorium.testers import property_tester_utils
 
 
@@ -50,8 +51,8 @@ class HockeyStickDivergenceTrainingOptions:
   verbose: int
 
 
-def make_default_hs_training_config() -> config.TrainingConfig:
-  return config.TrainingConfig(
+def make_default_hs_training_config() -> property_tester_config.TrainingConfig:
+  return property_tester_config.TrainingConfig(
       training_epochs=2,
       optimizer_learning_rate=1e-2,
       batch_size=100,
@@ -69,7 +70,7 @@ def make_default_hs_base_model() -> tf.keras.Model:
 
 
 def make_training_options_from_config(
-    training_config: config.TrainingConfig,
+    training_config: property_tester_config.TrainingConfig,
 ):
   return HockeyStickDivergenceTrainingOptions(
       num_epochs=training_config.training_epochs,
@@ -123,7 +124,7 @@ class must return logits.
 
   def __init__(
       self,
-      config: config.HockeyStickPropertyTesterConfig,
+      config: property_tester_config.HockeyStickPropertyTesterConfig,
       base_model: tf.keras.Model,
   ):
     """Initializes the instance.
@@ -140,12 +141,18 @@ def __init__(
     self._base_model = base_model
     self._epsilon = config.approximate_dp.epsilon
     self._delta = config.approximate_dp.delta
+    self._approximate_dp = config.approximate_dp
     self._has_called_fit = False
     self._training_options = make_training_options_from_config(
         config.training_config
     )
     self.initialize(self._training_options)
 
+  @property
+  def privacy_property(self) -> privacy_property.PrivacyProperty:
+    """The privacy guarantee that the tester is being used to test for."""
+    return privacy_property.PrivacyProperty(approximate_dp=self._approximate_dp)
+
   def initialize(self, training_options: HockeyStickDivergenceTrainingOptions):
     """Compiles internal model.
 
@@ -225,6 +232,7 @@ def _fit(
       epochs: Number of epochs to train for.
       verbose: Option passed to keras trainer.
     """
+    self.initialize(self._training_options)
     features, labels = self._generate_inputs_to_model(samples1, samples2)
     self._base_model.fit(
         features,

diff --git a/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py b/python/dp_auditorium/dp_auditorium/testers/hockey_stick_tester_test.py
@@ -282,6 +282,18 @@ def test_assert_privacy_violation(self):
     self.assertTrue(hsdt.reject_property(0.11))
     self.assertFalse(hsdt.reject_property(0.09))
 
+  def test_privacy_property(self):
+    training_options = hst.make_default_hs_training_config()
+    hs_config = config.HockeyStickPropertyTesterConfig(
+        approximate_dp=self.make_privacy_property(epsilon=1.0, delta=0.1),
+        training_config=training_options,
+    )
+    hs_tester = hst.HockeyStickPropertyTester(
+        config=hs_config,
+        base_model=hst.make_default_hs_base_model())
+    self.assertEqual(
+        hs_config.approximate_dp, hs_tester.privacy_property.approximate_dp
+    )
 
 if __name__ == "__main__":
   absltest.main()