facebookresearch · min-xu-ai · Apr 28, 2021 · Apr 27, 2021 · Apr 27, 2021 · Apr 27, 2021
diff --git a/fairscale/utils/reduce_scatter_bucketer.py b/fairscale/utils/reduce_scatter_bucketer.py
@@ -166,6 +166,10 @@ def _get_shard_size(self, element_size: int, num_shards: int) -> int:
         return int(bucket_size // num_shards)
 
     def _get_bucket(self, tensor: Tensor, group: ProcessGroup) -> Bucket:
+        # TODO (Min): the `group` in the key below is hashed by object identity, not by its
+        #     contents. That means if FSDP instances are initialized with different process
+        #     group objects, even when the group members are in fact the same, we end up
+        #     creating different buckets here.
         key = (tensor.dtype, tensor.device, group)
         if key not in self.buckets:
             # buckets are divided into world_size pieces, bucket.data shaped (world_size, shard_size)