Contribution Bounding - per partition and cross partition bounding - continued from #26 (pull request #32)

Merged: 26 commits, May 27, 2021
1 change: 1 addition & 0 deletions pipeline_dp/__init__.py
@@ -3,5 +3,6 @@
from pipeline_dp.aggregate_params import Metrics
from pipeline_dp.dp_engine import DataExtractors
from pipeline_dp.dp_engine import DPEngine
from pipeline_dp.pipeline_operations import LocalPipelineOperations
from pipeline_dp.pipeline_operations import BeamOperations
from pipeline_dp.pipeline_operations import SparkRDDOperations
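For reference, these package-level exports let the engine be constructed directly from the pipeline_dp namespace. A minimal sketch, mirroring the construction used in the tests below (the epsilon/delta values are arbitrary):

```python
import pipeline_dp

# Build a DPEngine backed by the local (in-memory) pipeline operations.
budget_accountant = pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10)
ops = pipeline_dp.LocalPipelineOperations()
engine = pipeline_dp.DPEngine(budget_accountant, ops)
```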
44 changes: 43 additions & 1 deletion pipeline_dp/dp_engine.py
@@ -8,10 +8,10 @@
from pipeline_dp.pipeline_operations import PipelineOperations
from pipeline_dp.report_generator import ReportGenerator


@dataclass
class DataExtractors:
"""Data extractors

A set of functions that, given an input, return the privacy id, partition key,
and value.
"""
@@ -49,3 +49,45 @@ def aggregate(self, col, params: AggregateParams,
# TODO: implement aggregate().
# It returns input for now, just to ensure that an example works.
return col

def _bound_cross_partition_contributions(self, col,
Collaborator: Could you please rename to _bound_contributions (sorry, I realized that the name I provided is not correct).

Contributor Author: Done.

max_partitions_contributed: int,
max_contributions_per_partition: int,
aggregator_fn):
"""
Bounds the contributions by privacy_id within and across partitions.
Collaborator: Please add "." after this phrase and then after each args description and return description.

Contributor Author: Done.

Args:
col: collection, with types of each element: (privacy_id,
partition_key, value)
max_partitions_contributed: maximum number of partitions that one
privacy id can contribute to
max_contributions_per_partition: maximum number of records that one
privacy id can contribute to one partition
aggregator_fn: function that takes a list of values and returns an
aggregator object which handles all aggregation logic.

return: collection with elements ((privacy_id, partition_key),
aggregator)
"""
# per partition-contribution bounding with bounding of each contribution
col = self._ops.map_tuple(col, lambda pid, pk, v: ((pid, pk), v),
"To (privacy_id, partition_key), value))")
col = self._ops.sample_fixed_per_key(col,
max_contributions_per_partition,
"Sample per (privacy_id, partition_key)")
# ((privacy_id, partition_key), [value])
col = self._ops.map(col, lambda pid_pk: (pid_pk[0], aggregator_fn(
Collaborator: Please apply aggregator_fn after per-partition contribution bounding; no need to do any aggregation after cross-partition bounding.

Clarification: eventually we need to aggregate per pk, and we do it in 2 steps:
1. Aggregate per (pid, pk) (after per-partition contribution bounding, this PR).
2. Aggregate per pk (after cross-partition contribution bounding), but that's outside of the scope of this task.

Aggregation in step 2 is slightly more complicated, because in step 1 we can assume that the data per key fits in memory (since we've done bounding), but in step 2 we can't assume that, so a group by key is needed.

Contributor Author: That makes sense! Thanks! Moved it to after per-partition bounding.

Collaborator: Thanks!
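To illustrate the two-step plan described in the clarification above, a rough sketch follows (purely illustrative, not part of this PR's diff; merge_aggregators is a hypothetical helper and the stage names are made up):

```python
# Step 1 (this PR): after sampling per (privacy_id, partition_key), all values
# for a key fit in memory, so aggregation is a simple per-key map.
col = self._ops.map_values(col, aggregator_fn,
                           "Aggregate per (privacy_id, partition_key)")

# Step 2 (outside the scope of this task): after cross-partition bounding,
# aggregating per partition_key needs a group-by-key, because one partition
# key can collect contributions from many privacy ids and the data per key
# can no longer be assumed to fit in memory.
# col = self._ops.map_tuple(col, lambda pid_pk, agg: (pid_pk[1], agg), "Rekey to pk")
# col = self._ops.group_by_key(col, "Group by partition key")
# col = self._ops.map_values(col, merge_aggregators, "Merge aggregators per pk")
```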

Collaborator: Nit (optional style improvement): using map_values() instead of map() is simpler.

Contributor Author: Agreed! Done.

pid_pk[1])), "Apply aggregate_fn after per partition bounding")
# ((privacy_id, partition_key), aggregator)
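For reference, the map_values() variant suggested in the nit above would look roughly like this (a sketch; the exact stage name in the final commits may differ):

```python
# Equivalent to the map() call above: map_values() keeps the
# (privacy_id, partition_key) key and applies aggregator_fn to the value only.
col = self._ops.map_values(col, aggregator_fn,
                           "Apply aggregator_fn after per-partition bounding")
# ((privacy_id, partition_key), aggregator)
```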

# Cross partition bounding
col = self._ops.map_tuple(col, lambda pid_pk, v: (pid_pk[0],
(pid_pk[1], v)),
"To (privacy_id, (partition_key, aggregator))")
Collaborator: Please update the stage name to "Rekey to ...".

Clarification: the operation of changing keys is needed very often, and we usually use the name "Rekey" for it.

Contributor Author: Done.

col = self._ops.sample_fixed_per_key(col, max_partitions_contributed,
"Sample per privacy_id")
# (privacy_id, [(partition_key, aggregator)])
return self._ops.flat_map(col, lambda pid: [((pid[0], pk_v[0]), pk_v[1])
Collaborator: Nit: the lambda in this line is pretty complicated; maybe try a local function instead of a lambda. And please use lazy evaluation (there are 2 options: "yield" or a "(...)" generator expression).

Contributor Author: Done.

for pk_v in pid[1]],
"Unnest")

33 changes: 28 additions & 5 deletions pipeline_dp/pipeline_operations.py
@@ -1,6 +1,8 @@
"""Adapters for working with pipeline frameworks."""

import random
import collections
import numpy as np

import abc
import apache_beam as beam
@@ -16,6 +18,10 @@ class PipelineOperations(abc.ABC):
def map(self, col, fn, stage_name: str):
pass

@abc.abstractmethod
def flat_map(self, col, fn, stage_name: str):
pass

@abc.abstractmethod
def map_tuple(self, col, fn, stage_name: str):
pass
@@ -55,8 +61,11 @@ class BeamOperations(PipelineOperations):
def map(self, col, fn, stage_name: str):
return col | stage_name >> beam.Map(fn)

def flat_map(self, col, fn, stage_name: str):
return col | stage_name >> beam.FlatMap(fn)

def map_tuple(self, col, fn, stage_name: str):
return col | stage_name >> beam.MapTuple(fn)
return col | stage_name >> beam.Map(lambda x: fn(*x))

def map_values(self, col, fn, stage_name: str):
return col | stage_name >> beam.MapTuple(lambda k, v: (k, fn(v)))
@@ -152,8 +161,11 @@ class LocalPipelineOperations(PipelineOperations):
def map(self, col, fn, stage_name: typing.Optional[str] = None):
return map(fn, col)

def map_tuple(self, col, fn, stage_name: typing.Optional[str] = None):
return (fn(k, v) for k, v in col)
def flat_map(self, col, fn, stage_name: str):
return (x for el in col for x in fn(el))

def map_tuple(self, col, fn, stage_name: str = None):
return map(lambda x: fn(*x), col)

def map_values(self, col, fn, stage_name: typing.Optional[str] = None):
return ((k, fn(v)) for k, v in col)
@@ -176,8 +188,19 @@ def keys(self, col, stage_name: str):
def values(self, col, stage_name: typing.Optional[str] = None):
return (v for k, v in col)

def sample_fixed_per_key(self, col, n: int, stage_name: str):
pass
def sample_fixed_per_key(self, col, n: int,
Collaborator: Please add a test for this function.

Contributor Author: Done.

Collaborator: Thanks!
stage_name: typing.Optional[str] = None):
def sample_fixed_per_key_generator():
for item in self.group_by_key(col):
key = item[0]
values = item[1]
if len(values) > n:
sampled_indices = np.random.choice(range(len(values)), n,
replace=False)
values = [values[i] for i in sampled_indices]
yield key, values

return sample_fixed_per_key_generator()

def count_per_element(self, col, stage_name: typing.Optional[str] = None):
yield from collections.Counter(col).items()
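For reference, a minimal test of LocalPipelineOperations.sample_fixed_per_key along the lines requested in the review could look like this (a sketch with made-up class and test names; the test actually added in the final commits may differ):

```python
import unittest
import pipeline_dp

class LocalSampleFixedPerKeyTest(unittest.TestCase):
    def test_sample_fixed_per_key_caps_values_per_key(self):
        ops = pipeline_dp.LocalPipelineOperations()
        data = [("pid1", 1), ("pid1", 2), ("pid1", 3), ("pid2", 4)]
        result = dict(ops.sample_fixed_per_key(data, 2, "Sample per key"))
        # "pid1" has 3 values, so exactly 2 survive (sampled without replacement);
        # "pid2" has fewer than 2 values and is left untouched.
        self.assertEqual(len(result["pid1"]), 2)
        self.assertTrue(set(result["pid1"]).issubset({1, 2, 3}))
        self.assertEqual(list(result["pid2"]), [4])
```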
129 changes: 118 additions & 11 deletions tests/dp_engine_test.py
@@ -1,34 +1,141 @@
"""DPEngine Test"""

import collections
import numpy as np
import unittest

from pipeline_dp.aggregate_params import AggregateParams, Metrics
from pipeline_dp.dp_engine import DPEngine
import pipeline_dp

"""DPEngine Test"""


class dp_engineTest(unittest.TestCase):
aggregator_fn = lambda input_values: (len(input_values),
np.sum(input_values),
np.sum(np.square(input_values)))
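This aggregator_fn returns the tuple (count, sum, sum of squares) for a list of values; for example, the "pk1" values [1, 2] map to (2, 3, 5) and the "pk2" values [3, 4] map to (2, 7, 25), which is where the expected tuples in the tests below come from.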

def test_contribution_bounding_empty_col(self):
input_col = []
max_partitions_contributed = 2
max_contributions_per_partition = 2

dp_engine = pipeline_dp.DPEngine(
pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
pipeline_dp.LocalPipelineOperations())
bound_result = list(dp_engine._bound_cross_partition_contributions(
input_col,
max_partitions_contributed=max_partitions_contributed,
max_contributions_per_partition=max_contributions_per_partition,
aggregator_fn=dp_engineTest.aggregator_fn))

self.assertFalse(bound_result)

def test_contribution_bounding_bound_input_nothing_dropped(self):
input_col = [("pid1", 'pk1', 1),
("pid1", 'pk1', 2),
("pid1", 'pk2', 3),
("pid1", 'pk2', 4)]
max_partitions_contributed = 2
max_contributions_per_partition = 2

dp_engine = pipeline_dp.DPEngine(
pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
pipeline_dp.LocalPipelineOperations())
bound_result = list(dp_engine._bound_cross_partition_contributions(
input_col,
max_partitions_contributed=max_partitions_contributed,
max_contributions_per_partition=max_contributions_per_partition,
aggregator_fn=dp_engineTest.aggregator_fn))

expected_result = [(('pid1', 'pk2'), (2, 7, 25)),
(('pid1', 'pk1'), (2, 3, 5))]
self.assertEqual(set(expected_result), set(bound_result))

def test_contribution_bounding_per_partition_bounding_applied(self):
input_col = [("pid1", 'pk1', 1),
("pid1", 'pk1', 2),
("pid1", 'pk2', 3),
("pid1", 'pk2', 4),
("pid1", 'pk2', 5),
("pid2", 'pk2', 6)]
max_partitions_contributed = 5
max_contributions_per_partition = 2

dp_engine = pipeline_dp.DPEngine(
pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
pipeline_dp.LocalPipelineOperations())
bound_result = list(dp_engine._bound_cross_partition_contributions(
input_col,
max_partitions_contributed=max_partitions_contributed,
max_contributions_per_partition=max_contributions_per_partition,
aggregator_fn=dp_engineTest.aggregator_fn))

self.assertEqual(len(bound_result), 3)
# Check contributions per partitions
self.assertTrue(all(map(
lambda op_val: op_val[1][0] <= max_contributions_per_partition,
bound_result)))

def test_contribution_bounding_cross_partition_bounding_applied(self):
input_col = [("pid1", 'pk1', 1),
("pid1", 'pk1', 2),
("pid1", 'pk2', 3),
("pid1", 'pk2', 4),
("pid1", 'pk2', 5),
("pid1", 'pk3', 6),
("pid1", 'pk4', 7),
("pid2", 'pk4', 8)]
max_partitions_contributed = 3
max_contributions_per_partition = 5

dp_engine = pipeline_dp.DPEngine(
pipeline_dp.BudgetAccountant(epsilon=1, delta=1e-10),
pipeline_dp.LocalPipelineOperations())
bound_result = list(dp_engine._bound_cross_partition_contributions(
input_col,
max_partitions_contributed=max_partitions_contributed,
max_contributions_per_partition=max_contributions_per_partition,
aggregator_fn=dp_engineTest.aggregator_fn))

self.assertEqual(len(bound_result), 4)
# Check contributions per partitions
self.assertTrue(all(map(
lambda op_val: op_val[1][0] <= max_contributions_per_partition,
bound_result)))
# Check cross partition contributions
dict_of_pid_to_pk = collections.defaultdict(lambda: [])
for key, _ in bound_result:
dict_of_pid_to_pk[key[0]].append(key[1])
self.assertEqual(len(dict_of_pid_to_pk), 2)
self.assertTrue(
all(map(lambda key: len(
dict_of_pid_to_pk[key]) <= max_partitions_contributed,
dict_of_pid_to_pk)))

class DPEngineTest(unittest.TestCase):
def test_aggregate_none(self):
self.assertIsNone(DPEngine(None, None).aggregate(None, None, None))
self.assertIsNone(pipeline_dp.DPEngine(None, None).aggregate(None, None,
None))

def test_aggregate_report(self):
params1 = AggregateParams(
params1 = pipeline_dp.AggregateParams(
max_partitions_contributed=3,
max_contributions_per_partition=2,
low=1,
high=5,
metrics=[Metrics.PRIVACY_ID_COUNT, Metrics.COUNT, Metrics.MEAN],
metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT, pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.MEAN],
)
params2 = AggregateParams(
params2 = pipeline_dp.AggregateParams(
max_partitions_contributed=1,
max_contributions_per_partition=3,
low=2,
high=10,
metrics=[Metrics.VAR, Metrics.SUM, Metrics.MEAN],
public_partitions = list(range(1,40)),
metrics=[pipeline_dp.Metrics.VAR, pipeline_dp.Metrics.SUM, pipeline_dp.Metrics.MEAN],
public_partitions = list(range(1, 40)),
)
engine = DPEngine(None, None)
engine = pipeline_dp.DPEngine(None, None)
engine.aggregate(None, params1, None)
engine.aggregate(None, params2, None)
self.assertEqual(len(engine._report_generators), 2) # pylint: disable=protected-access


if __name__ == '__main__':
unittest.main()