Better default values for StreamingDataset args #479

Merged 12 commits on Oct 27, 2023
4 changes: 3 additions & 1 deletion streaming/base/batching/per_stream.py
@@ -63,7 +63,9 @@ def generate_work_per_stream_batching(dataset: StreamingDataset, world: World, e
 # same as the ratio of the stream's samples to overall samples.
 # This ensures that the overall training shuffle block size is still approximately
 # equal to what is set by the user, and allows for reasoning about cache_limit as well.
-assert isinstance(dataset.shuffle_block_size, int)
+if not isinstance(dataset.shuffle_block_size, int):
+    raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
+                    f'Got {dataset.shuffle_block_size} instead.')
 shuffle_block_portion = int(dataset.shuffle_block_size * stream.proportion)
 stream_shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units,
                              dataset.num_canonical_nodes, dataset.shuffle_seed, epoch,
4 changes: 3 additions & 1 deletion streaming/base/batching/random.py
@@ -57,7 +57,9 @@ def generate_work_random_batching(dataset: StreamingDataset, world: World, epoch

 # If we need to shuffle, shuffle in a node-aware and *underlying* shard-aware way.
 if dataset.shuffle:
-    assert isinstance(dataset.shuffle_block_size, int)
+    if not isinstance(dataset.shuffle_block_size, int):
+        raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
+                        f'Got {dataset.shuffle_block_size} instead.')
     shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units, dataset.num_canonical_nodes,
                           dataset.shuffle_seed, epoch, dataset.shuffle_block_size)
     big_ids = np.where(big_ids != -1, shuffle[big_ids], -1)
4 changes: 3 additions & 1 deletion streaming/base/batching/stratified.py
@@ -74,7 +74,9 @@ def generate_work_stratified_batching(dataset: StreamingDataset, world: World, e
 # same as the ratio of the stream's samples to overall samples.
 # This ensures that the overall training shuffle block size is still approximately
 # equal to what is set by the user, and allows for reasoning about cache_limit as well.
-assert isinstance(dataset.shuffle_block_size, int)
+if not isinstance(dataset.shuffle_block_size, int):
+    raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
+                    f'Got {dataset.shuffle_block_size} instead.')
 shuffle_block_portion = int(dataset.shuffle_block_size * stream.proportion)
 stream_shuffle = get_shuffle(dataset.shuffle_algo, shuffle_units,
                              dataset.num_canonical_nodes, dataset.shuffle_seed, epoch,
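All three files make the same change: the bare assert on dataset.shuffle_block_size is replaced by an explicit isinstance check that raises a TypeError naming the offending value. Below is a rough, self-contained sketch of that pattern; the check_shuffle_block_size helper and the example values are hypothetical and for illustration only, with only the shuffle_block_size attribute and the error message taken from the diff.

# Hypothetical, minimal sketch of the assert-to-TypeError swap shown in the hunks above.
# check_shuffle_block_size is not part of the streaming library.

def check_shuffle_block_size(shuffle_block_size) -> int:
    """Return the value if it is an int, else raise a descriptive TypeError."""
    if not isinstance(shuffle_block_size, int):
        raise TypeError(f'Dataset `shuffle_block_size` must be an integer. ' +
                        f'Got {shuffle_block_size} instead.')
    return shuffle_block_size


if __name__ == '__main__':
    print(check_shuffle_block_size(1 << 18))  # OK: prints 262144

    try:
        # A float (say, parsed from a YAML config) now fails loudly with the value in
        # the message, rather than via a bare AssertionError. Unlike assert, the check
        # also survives running Python with the -O flag, which strips assert statements.
        check_shuffle_block_size(262144.0)
    except TypeError as err:
        print(err)  # Dataset `shuffle_block_size` must be an integer. Got 262144.0 instead.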