From 337e7e83a59cfaa0f6d26e9b4322d3ebaf987a5c Mon Sep 17 00:00:00 2001
From: Karan Jariwala <karankjariwala@gmail.com>
Date: Thu, 22 Sep 2022 13:53:39 -0700
Subject: [PATCH 1/3] Added pydocstyle and docformatter in pre-commit config

---
 .pre-commit-config.yaml                       | 16 ++++
 docs/source/conf.py                           |  8 +-
 docs/source/doctest_fixtures.py               |  5 +-
 pyproject.toml                                |  5 ++
 setup.py                                      |  5 +-
 streaming/__init__.py                         |  2 +
 streaming/base/__init__.py                    |  2 +
 streaming/base/compression/__init__.py        |  2 +
 streaming/base/compression/bench.py           |  8 +-
 streaming/base/compression/compression.py     |  2 +
 streaming/base/compression/plot.py            |  8 +-
 streaming/base/dataset.py                     |  3 +-
 streaming/base/distributed.py                 | 30 ++++++-
 streaming/base/download.py                    |  2 +
 streaming/base/format/__init__.py             | 12 +++
 streaming/base/format/base/__init__.py        |  2 +
 streaming/base/format/base/reader.py          |  2 +
 streaming/base/format/base/writer.py          |  2 +
 streaming/base/format/json/README.md          |  2 +
 streaming/base/format/json/__init__.py        |  2 +
 streaming/base/format/json/encodings.py       |  2 +
 streaming/base/format/json/reader.py          |  2 +
 streaming/base/format/json/writer.py          |  4 +-
 streaming/base/format/mds/README.md           |  2 +
 streaming/base/format/mds/__init__.py         |  2 +
 streaming/base/format/mds/encodings.py        |  2 +
 streaming/base/format/mds/reader.py           |  2 +
 streaming/base/format/mds/writer.py           |  2 +
 streaming/base/format/xsv/README.md           |  2 +
 streaming/base/format/xsv/__init__.py         |  2 +
 streaming/base/format/xsv/encodings.py        |  2 +
 streaming/base/format/xsv/reader.py           |  2 +
 streaming/base/format/xsv/writer.py           |  8 +-
 streaming/base/hashing/__init__.py            |  2 +
 streaming/base/hashing/bench.py               |  8 +-
 streaming/base/hashing/hashing.py             |  2 +
 streaming/base/hashing/plot.py                |  8 +-
 streaming/base/index.py                       |  7 +-
 streaming/base/local.py                       | 21 +++++
 streaming/base/util.py                        |  4 +-
 streaming/text/__init__.py                    |  2 +
 streaming/text/c4.py                          |  8 +-
 streaming/text/convert/__init__.py            |  2 +
 streaming/text/convert/c4.py                  | 64 ++++++++++++---
 streaming/vision/__init__.py                  |  2 +
 streaming/vision/ade20k.py                    |  8 +-
 streaming/vision/base.py                      |  5 +-
 streaming/vision/cifar10.py                   |  8 +-
 streaming/vision/coco.py                      | 17 +++-
 streaming/vision/convert/__init__.py          |  2 +
 streaming/vision/convert/ade20k.py            | 21 ++---
 streaming/vision/convert/base.py              |  2 +
 streaming/vision/convert/cifar10.py           | 67 ++++++++++++---
 streaming/vision/convert/coco.py              | 12 +--
 .../convert/{image_folder.py => imagenet.py}  | 81 +++++++++++++++----
 streaming/vision/imagenet.py                  |  8 +-
 tests/test_streaming.py                       |  4 +-
 tests/test_streaming_remote.py                |  1 -
 58 files changed, 422 insertions(+), 98 deletions(-)
 rename streaming/vision/convert/{image_folder.py => imagenet.py} (61%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e27daabd7..644cdd6eb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -68,3 +68,19 @@ repos:
         pass_filenames: false
         args: [--warnings]
         additional_dependencies: ["pyright@1.1.256"]
+  - repo: https://github.com/myint/docformatter
+    rev: v1.5.0
+    hooks:
+      - id: docformatter
+        args: [--in-place, --wrap-summaries=99, --wrap-descriptions=99]
+  - repo: https://github.com/PyCQA/pydocstyle
+    hooks:
+      - id: pydocstyle
+        name: pydocstyle
+        entry: pydocstyle
+        language: python
+        types: [python]
+        exclude: '(tests|.ci|.github)'
+        additional_dependencies:
+          - "toml"
+    rev: 6.1.1
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6388224dd..f4bdc3aba 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -371,9 +371,7 @@ def _auto_rst_for_module(module: types.ModuleType, exclude_members: List[Any]) -
 
 
 def _modules_to_rst() -> List[types.ModuleType]:
-    """Return the list of modules for which to generate API reference rst
-    files."""
-
+    """Return the list of modules for which to generate API reference rst files."""
     document_modules: List[types.Module] = [
         streaming,
     ]
@@ -389,8 +387,8 @@ def _modules_to_rst() -> List[types.ModuleType]:
 def _generate_rst_files_for_modules() -> None:
     """Generate .rst files for each module to include in the API reference.
 
-    These files contain the module docstring followed by tables listing
-    all the functions, classes, etc.
+    These files contain the module docstring followed by tables listing all the functions, classes,
+    etc.
     """
     docs_dir = os.path.abspath(os.path.dirname(__file__))
     module_rst_save_dir = os.path.join(docs_dir, 'api_reference')
diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py
index a997373c2..b7ff8c4fe 100644
--- a/docs/source/doctest_fixtures.py
+++ b/docs/source/doctest_fixtures.py
@@ -6,9 +6,8 @@
 
 """Fixtures available in doctests.
 
-The script is run before any doctests are executed, so all imports and
-variables are available in any doctest. The output of this setup script
-does not show up in the documentation.
+The script is run before any doctests are executed, so all imports and variables are available in
+any doctest. The output of this setup script does not show up in the documentation.
 """
 import os
 import sys
diff --git a/pyproject.toml b/pyproject.toml
index a0192d86f..ea3db9337 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -475,3 +475,8 @@ ignore_patterns = [
     "wandb/**/*.py",
     "build/**/*.py",
 ]
+
+[tool.pydocstyle]
+convention="google"
+add_ignore="D102,D105,D107,D401"
+add_select="D400,D404"
diff --git a/setup.py b/setup.py
index f91b08bb7..ebb83a02d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,7 @@
-# Copyright 2022 MosaicML. All Rights Reserved.
+# Copyright 2022 MosaicML Streaming authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Streaming package setup."""
 
 import os
 
diff --git a/streaming/__init__.py b/streaming/__init__.py
index 52191ad6e..4c9ab3693 100644
--- a/streaming/__init__.py
+++ b/streaming/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""MosaicML Streaming Datasets for cloud-native model training."""
+
 from streaming._version import __version__
 from streaming.base import (CSVWriter, Dataset, JSONWriter, LocalDataset, MDSWriter, TSVWriter,
                             XSVWriter)
diff --git a/streaming/base/__init__.py b/streaming/base/__init__.py
index 29281609a..e791cfcf3 100644
--- a/streaming/base/__init__.py
+++ b/streaming/base/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""MosaicML Streaming Datasets for cloud-native model training."""
+
 from streaming.base.dataset import Dataset
 from streaming.base.format import CSVWriter, JSONWriter, MDSWriter, TSVWriter, XSVWriter
 from streaming.base.local import LocalDataset
diff --git a/streaming/base/compression/__init__.py b/streaming/base/compression/__init__.py
index ab1ea64f2..09c6533c0 100644
--- a/streaming/base/compression/__init__.py
+++ b/streaming/base/compression/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""A collection of compression algorithms."""
+
 from streaming.base.compression.compression import (compress, decompress,
                                                     get_compression_extension, get_compressions,
                                                     is_compression)
diff --git a/streaming/base/compression/bench.py b/streaming/base/compression/bench.py
index f02c81e96..335653087 100644
--- a/streaming/base/compression/bench.py
+++ b/streaming/base/compression/bench.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Script to benchmark compression algorithms."""
+
 from argparse import ArgumentParser, Namespace
 from time import time
 from typing import Iterator
@@ -11,10 +13,10 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
     Args:
-        Namespace: Commandline arguments.
+        Namespace: command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument('--data', type=str, required=True)
@@ -48,7 +50,7 @@ def main(args: Namespace) -> None:
     """Benchmark compression algorithms.
 
     Args:
-        args (Namespace): Commandline flags.
+        args (Namespace): command-line flags.
     """
     data = open(args.data, 'rb').read()
     for algo in sorted(get_compressions()):
diff --git a/streaming/base/compression/compression.py b/streaming/base/compression/compression.py
index 26064494e..4f4423400 100644
--- a/streaming/base/compression/compression.py
+++ b/streaming/base/compression/compression.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""List of Compression and Decompression algorithms."""
+
 import bz2
 import gzip
 from abc import ABC, abstractmethod
diff --git a/streaming/base/compression/plot.py b/streaming/base/compression/plot.py
index 7a097e2db..19f8065c8 100644
--- a/streaming/base/compression/plot.py
+++ b/streaming/base/compression/plot.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Utility and helper functions to plot compression information."""
+
 from argparse import ArgumentParser, Namespace
 from collections import defaultdict
 from dataclasses import dataclass
@@ -72,10 +74,10 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
     Args:
-        Namespace: Commandline arguments.
+        Namespace: command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument('--data', type=str, required=True)
@@ -252,7 +254,7 @@ def main(args: Namespace) -> None:
     """Plot info about compression.
 
     Args:
-        args (Namespace): Commandline arguments.
+        args (Namespace): command-line arguments.
     """
     data = load(args.data, args.min_dec_size)
     plot_compression_rates(data, algo2color, args.dpi, args.font_size, args.line_width,
diff --git a/streaming/base/dataset.py b/streaming/base/dataset.py
index 1834e5cab..e9e5603ec 100644
--- a/streaming/base/dataset.py
+++ b/streaming/base/dataset.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""The :class:`Dataset` class, used for building streaming iterable datasets."""
+
 import json
 import os
 from enum import IntEnum
@@ -292,7 +294,6 @@ def _preload_shard(self, shard: int, partition: Partition) -> bool:
         Returns:
             bool: Whether shard is present.
         """
-
         assert shard in partition.shards
         reader = self.shards[shard]
         for raw_info, zip_info in reader.file_pairs:
diff --git a/streaming/base/distributed.py b/streaming/base/distributed.py
index 0e4f7628b..7d6ece165 100644
--- a/streaming/base/distributed.py
+++ b/streaming/base/distributed.py
@@ -1,22 +1,44 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Helper methods to get the distributed attributes."""
+
 import os
 
 __all__ = ['get_global_rank', 'get_local_rank', 'get_local_world_size', 'get_world_size']
 
 
-def get_global_rank():
+def get_global_rank() -> int:
+    """Returns the global rank of the current process, which is on ``[0; WORLD_SIZE - 1]``.
+
+    Returns:
+        int: The global rank.
+    """
     return int(os.environ.get('RANK', 0))
 
 
-def get_world_size():
+def get_world_size() -> int:
+    """Returns the world size, which is the number of processes participating in this training run.
+
+    Returns:
+        int: The world size.
+    """
     return int(os.environ.get('WORLD_SIZE', 1))
 
 
-def get_local_rank():
+def get_local_rank() -> int:
+    """Returns the local rank for the current process, which is on ``[0; LOCAL_WORLD_SIZE - 1]``.
+
+    Returns:
+        int: The local rank.
+    """
     return int(os.environ.get('LOCAL_RANK', 0))
 
 
-def get_local_world_size():
+def get_local_world_size() -> int:
+    """Returns the local world size, which is the number of processes for the current node.
+
+    Returns:
+        int: The local world size.
+    """
     return int(os.environ.get('LOCAL_WORLD_SIZE', 1))
diff --git a/streaming/base/download.py b/streaming/base/download.py
index 86af8d174..54243b5e9 100644
--- a/streaming/base/download.py
+++ b/streaming/base/download.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Download handling for :class:`Dataset`."""
+
 import os
 import shutil
 import urllib.parse
diff --git a/streaming/base/format/__init__.py b/streaming/base/format/__init__.py
index a483e6dff..7210b0219 100644
--- a/streaming/base/format/__init__.py
+++ b/streaming/base/format/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Individual dataset writer for every format."""
+
 from typing import Any, Dict, Optional
 
 from streaming.base.format.base.reader import Reader
@@ -19,6 +21,16 @@
 
 
 def reader_from_json(dirname: str, split: Optional[str], obj: Dict[str, Any]) -> Reader:
+    """Initialize the reader from JSON object.
+
+    Args:
+        dirname (str): Local directory containing shards.
+        split (str, optional): Which dataset split to use, if any.
+        obj (Dict[str, Any]): JSON object to load.
+
+    Returns:
+        Reader: Loaded Reader of `format` type.
+    """
     assert obj['version'] == 2
     cls = _readers[obj['format']]
     return cls.from_json(dirname, split, obj)
diff --git a/streaming/base/format/base/__init__.py b/streaming/base/format/base/__init__.py
index 636fee966..7e551a3c8 100644
--- a/streaming/base/format/base/__init__.py
+++ b/streaming/base/format/base/__init__.py
@@ -1,2 +1,4 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
+
+"""Base module for dataset reader and writer."""
diff --git a/streaming/base/format/base/reader.py b/streaming/base/format/base/reader.py
index 2d2d1df65..c851ff8ce 100644
--- a/streaming/base/format/base/reader.py
+++ b/streaming/base/format/base/reader.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Read and decode sample from shards."""
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Dict, Iterator, List, Optional
diff --git a/streaming/base/format/base/writer.py b/streaming/base/format/base/writer.py
index 94ef315f9..e3f7b8710 100644
--- a/streaming/base/format/base/writer.py
+++ b/streaming/base/format/base/writer.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Convert a list of samples into format files that can be read as a :class:`Dataset`."""
+
 import json
 import os
 from abc import ABC, abstractmethod
diff --git a/streaming/base/format/json/README.md b/streaming/base/format/json/README.md
index f41c9ed71..13cd1fd99 100644
--- a/streaming/base/format/json/README.md
+++ b/streaming/base/format/json/README.md
@@ -1,5 +1,6 @@
 Example:
 
+```json
     {
       "columns": {
         "number": "int",
@@ -48,3 +49,4 @@ Example:
         }
       }
     }
+```
diff --git a/streaming/base/format/json/__init__.py b/streaming/base/format/json/__init__.py
index 869e6deed..10a569a69 100644
--- a/streaming/base/format/json/__init__.py
+++ b/streaming/base/format/json/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Module to write and read the dataset in JSON format."""
+
 from streaming.base.format.json.reader import JSONReader
 from streaming.base.format.json.writer import JSONWriter
 
diff --git a/streaming/base/format/json/encodings.py b/streaming/base/format/json/encodings.py
index 2c07359de..4834cdd98 100644
--- a/streaming/base/format/json/encodings.py
+++ b/streaming/base/format/json/encodings.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Check whether sample encoding is of supported JSON types."""
+
 from abc import ABC, abstractmethod
 from typing import Any
 
diff --git a/streaming/base/format/json/reader.py b/streaming/base/format/json/reader.py
index d6eae6417..4ac4409c9 100644
--- a/streaming/base/format/json/reader.py
+++ b/streaming/base/format/json/reader.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+""":class:`JSONReader` reads samples from binary ``.json`` files that were written out by :class:`JSONWriter`."""
+
 import json
 import os
 from copy import deepcopy
diff --git a/streaming/base/format/json/writer.py b/streaming/base/format/json/writer.py
index d575daf3d..217a123ae 100644
--- a/streaming/base/format/json/writer.py
+++ b/streaming/base/format/json/writer.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+""":class:`JSONWriter` converts a list of samples into binary ``.json`` files that can be read by :class:`JSONReader`."""
+
 import json
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -11,7 +13,7 @@
 
 
 class JSONWriter(SplitWriter):
-    """Writes a streaming JSON dataset.
+    r"""Writes a streaming JSON dataset.
 
     Args:
         dirname (str): Local dataset directory.
diff --git a/streaming/base/format/mds/README.md b/streaming/base/format/mds/README.md
index 11ad219e9..7bf04983b 100644
--- a/streaming/base/format/mds/README.md
+++ b/streaming/base/format/mds/README.md
@@ -1,5 +1,6 @@
 Example:
 
+```json
     {
       "column_encodings": [
         "int",
@@ -39,3 +40,4 @@ Example:
         }
       }
     }
+```
diff --git a/streaming/base/format/mds/__init__.py b/streaming/base/format/mds/__init__.py
index 2272ba3ef..084c5fd71 100644
--- a/streaming/base/format/mds/__init__.py
+++ b/streaming/base/format/mds/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Module to write and read the dataset in MDS format."""
+
 from streaming.base.format.mds.reader import MDSReader
 from streaming.base.format.mds.writer import MDSWriter
 
diff --git a/streaming/base/format/mds/encodings.py b/streaming/base/format/mds/encodings.py
index 93eebba92..176bc1d6b 100644
--- a/streaming/base/format/mds/encodings.py
+++ b/streaming/base/format/mds/encodings.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Encode and Decode samples in a supported MDS format."""
+
 import json
 import pickle
 from abc import ABC, abstractmethod
diff --git a/streaming/base/format/mds/reader.py b/streaming/base/format/mds/reader.py
index 021405837..698bc2979 100644
--- a/streaming/base/format/mds/reader.py
+++ b/streaming/base/format/mds/reader.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+""":class:`MDSReader` reads samples from binary ``.mds`` files that were written out by :class:`MDSWriter`."""
+
 import os
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
diff --git a/streaming/base/format/mds/writer.py b/streaming/base/format/mds/writer.py
index 1884738e4..ec97d776d 100644
--- a/streaming/base/format/mds/writer.py
+++ b/streaming/base/format/mds/writer.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+""":class:`MDSWriter` converts a list of samples into binary ``.mds`` files that can be read by :class:`MDSReader`."""
+
 import json
 from typing import Any, Dict, List, Optional
 
diff --git a/streaming/base/format/xsv/README.md b/streaming/base/format/xsv/README.md
index 296822b78..242a5ef9e 100644
--- a/streaming/base/format/xsv/README.md
+++ b/streaming/base/format/xsv/README.md
@@ -1,5 +1,6 @@
 Example:
 
+```json
     {
       "column_encodings": [
         "int",
@@ -53,3 +54,4 @@ Example:
         }
       }
     }
+```
diff --git a/streaming/base/format/xsv/__init__.py b/streaming/base/format/xsv/__init__.py
index d71949a04..5d7f691f7 100644
--- a/streaming/base/format/xsv/__init__.py
+++ b/streaming/base/format/xsv/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Module to write and read the dataset in Tabular format."""
+
 from streaming.base.format.xsv.reader import CSVReader, TSVReader, XSVReader
 from streaming.base.format.xsv.writer import CSVWriter, TSVWriter, XSVWriter
 
diff --git a/streaming/base/format/xsv/encodings.py b/streaming/base/format/xsv/encodings.py
index da936250d..f8fb94cab 100644
--- a/streaming/base/format/xsv/encodings.py
+++ b/streaming/base/format/xsv/encodings.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Encode and Decode samples in a supported Tabular format."""
+
 from abc import ABC, abstractmethod
 from typing import Any
 
diff --git a/streaming/base/format/xsv/reader.py b/streaming/base/format/xsv/reader.py
index 8a08c6e8b..d53a797ab 100644
--- a/streaming/base/format/xsv/reader.py
+++ b/streaming/base/format/xsv/reader.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Read and decode samples from tabular format files such as XSV, CSV, and TSV."""
+
 import os
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
diff --git a/streaming/base/format/xsv/writer.py b/streaming/base/format/xsv/writer.py
index 47deb1e06..05e8c95b8 100644
--- a/streaming/base/format/xsv/writer.py
+++ b/streaming/base/format/xsv/writer.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Convert a list of samples into tabular data format files such as XSV, CSV, and TSV."""
+
 import json
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -11,7 +13,7 @@
 
 
 class XSVWriter(SplitWriter):
-    """Writes a streaming XSV dataset.
+    r"""Writes a streaming XSV dataset.
 
     Args:
         dirname (str): Local dataset directory.
@@ -107,7 +109,7 @@ def encode_split_shard(self) -> Tuple[bytes, bytes]:
 
 
 class CSVWriter(XSVWriter):
-    """Writes a streaming CSV dataset.
+    r"""Writes a streaming CSV dataset.
 
     Args:
         dirname (str): Local dataset directory.
@@ -147,7 +149,7 @@ def get_config(self) -> Dict[str, Any]:
 
 
 class TSVWriter(XSVWriter):
-    """Writes a streaming TSV dataset.
+    r"""Writes a streaming TSV dataset.
 
     Args:
         dirname (str): Local dataset directory.
diff --git a/streaming/base/hashing/__init__.py b/streaming/base/hashing/__init__.py
index 501d6a82b..362426685 100644
--- a/streaming/base/hashing/__init__.py
+++ b/streaming/base/hashing/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Hashing algorithms."""
+
 from streaming.base.hashing.hashing import get_hash, get_hashes, is_hash
 
 __all__ = ['get_hash', 'get_hashes', 'is_hash']
diff --git a/streaming/base/hashing/bench.py b/streaming/base/hashing/bench.py
index 8c95e8407..9354e857a 100644
--- a/streaming/base/hashing/bench.py
+++ b/streaming/base/hashing/bench.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Script to benchmark various hashing algorithms."""
+
 from argparse import ArgumentParser, Namespace
 from time import time
 from typing import Iterator
@@ -11,10 +13,10 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
     Args:
-        Namespace: Commandline arguments.
+        Namespace: command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument('--data', type=str, required=True)
@@ -48,7 +50,7 @@ def main(args: Namespace) -> None:
     """Benchmark hash algorithms.
 
     Args:
-        args (Namespace): Commandline flags.
+        args (Namespace): command-line flags.
     """
     data = open(args.data, 'rb').read()
     for algo in sorted(get_hashes()):
diff --git a/streaming/base/hashing/hashing.py b/streaming/base/hashing/hashing.py
index 32401d0b7..fe33443ef 100644
--- a/streaming/base/hashing/hashing.py
+++ b/streaming/base/hashing/hashing.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Setter and Getter for supported hashing algorithms."""
+
 import hashlib
 from typing import Any, Callable, Dict, Set
 
diff --git a/streaming/base/hashing/plot.py b/streaming/base/hashing/plot.py
index a6f630453..f82a958e8 100644
--- a/streaming/base/hashing/plot.py
+++ b/streaming/base/hashing/plot.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Utility and helper functions to plot hashing information."""
+
 from argparse import ArgumentParser, Namespace
 from collections import defaultdict
 from typing import Dict, List, Optional, Tuple
@@ -30,10 +32,10 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
     Args:
-        Namespace: Commandline arguments.
+        Namespace: command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument('--data', type=str, required=True)
@@ -112,7 +114,7 @@ def main(args: Namespace) -> None:
     """Plot info about hashing.
 
     Args:
-        args (Namespace): Commandline arguments.
+        args (Namespace): command-line arguments.
     """
     data = load(args.data)
     plot_hash_rates(data, algo2color, args.dpi, args.font_size, args.line_width, args.hash_rates)
diff --git a/streaming/base/index.py b/streaming/base/index.py
index 39f8490b7..a8902ec75 100644
--- a/streaming/base/index.py
+++ b/streaming/base/index.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Helper methods to get the shard attributes."""
+
 from math import ceil
 from typing import List, Optional, Tuple
 
@@ -41,9 +43,8 @@ def __init__(self, shards: List[int], shards_to_download: List[int], min_sample_
 class Index(object):
     """An index of sample ranges (corresponding to shards).
 
-    Enables (a) finding the shard for a given sample, (b) getting the
-    per-device dataset size, and (c) getting this device/worker's sample
-    range of the dataset.
+    Enables (a) finding the shard for a given sample, (b) getting the per-device dataset size, and
+    (c) getting this device/worker's sample range of the dataset.
     """
 
     def __init__(self, samples_per_shard: List[int], batch_size: Optional[int] = None) -> None:
diff --git a/streaming/base/local.py b/streaming/base/local.py
index 882b84567..ad232099e 100644
--- a/streaming/base/local.py
+++ b/streaming/base/local.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Local Dataset."""
+
 import json
 import os
 from typing import Any, Dict, Optional
@@ -12,6 +14,12 @@
 
 
 class LocalDataset(Dataset):
+    """A dataset that resides locally on the machine.
+
+    Args:
+        dirname (str): Local dataset directory where the dataset is present.
+        split (str, optional): Which dataset split to use, if any. Defaults to ``None``.
+    """
 
     def __init__(self, dirname: str, split: Optional[str] = None):
         split = split or ''
@@ -32,9 +40,22 @@ def __init__(self, dirname: str, split: Optional[str] = None):
         self.index = Index(shard_sizes)
 
     def __len__(self) -> int:
+        """Get the length as an IterableDataset.
+
+        Returns:
+            int: Dataset length.
+        """
         return self.index.total_samples
 
     def __getitem__(self, idx: int) -> Dict[str, Any]:
+        """Get sample by global index.
+
+        Args:
+            idx (int): Sample index.
+
+        Returns:
+            Dict[str, Any]: Column name with sample data.
+        """
         shard_idx, idx_in_shard = self.index.find_sample(idx)
         shard = self.shards[shard_idx]
         return shard[idx_in_shard]
diff --git a/streaming/base/util.py b/streaming/base/util.py
index 3b6d5a57a..a523bd4a9 100644
--- a/streaming/base/util.py
+++ b/streaming/base/util.py
@@ -1,11 +1,13 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Utility and helper functions for datasets."""
+
 from typing import List
 
 
 def get_list_arg(text: str) -> List[str]:
-    """Pass a list as a commandline flag.
+    """Pass a list as a command-line flag.
 
     Args:
         text (str): Text to split.
diff --git a/streaming/text/__init__.py b/streaming/text/__init__.py
index ea6a79b3c..91a442c4e 100644
--- a/streaming/text/__init__.py
+++ b/streaming/text/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Natively supported NLP datasets."""
+
 from streaming.text.c4 import C4 as C4
 
 __all__ = ['C4']
diff --git a/streaming/text/c4.py b/streaming/text/c4.py
index 9a9457449..8387cf9e6 100644
--- a/streaming/text/c4.py
+++ b/streaming/text/c4.py
@@ -1,6 +1,12 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""C4 (Colossal Cleaned Common Crawl) dataset.
+
+This dataset is a colossal, cleaned version of Common Crawl's web crawl corpus and it is based on
+the `Common Crawl <https://commoncrawl.org>`_ dataset.
+"""
+
 from typing import Any, Dict, Optional
 
 from transformers.models.auto.tokenization_auto import AutoTokenizer
@@ -9,7 +15,7 @@
 
 
 class C4(Dataset):
-    """C4 (Colossal Cleaned Common Crawl) dataset.
+    """Implementation of the C4 (Colossal Cleaned Common Crawl) dataset using streaming Dataset.
 
     Args:
         tokenizer_name (str): The name of the HuggingFace tokenizer to use to tokenize samples.
diff --git a/streaming/text/convert/__init__.py b/streaming/text/convert/__init__.py
index 636fee966..361a845c8 100644
--- a/streaming/text/convert/__init__.py
+++ b/streaming/text/convert/__init__.py
@@ -1,2 +1,4 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
+
+"""Data conversion scripts for Natural Language Processing."""
diff --git a/streaming/text/convert/c4.py b/streaming/text/convert/c4.py
index b59940f5d..43b28244e 100644
--- a/streaming/text/convert/c4.py
+++ b/streaming/text/convert/c4.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""C4 (Colossal Cleaned Common Crawl) streaming dataset conversion scripts."""
+
 import os
 from argparse import ArgumentParser, Namespace
 from typing import Any, Dict, Iterator
@@ -15,19 +17,55 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
-    Args:
-        Namespace: Commandline arguments.
+    Returns:
+        Namespace: Command-line arguments.
     """
     args = ArgumentParser()
-    args.add_argument('--out_root', type=str, required=True)
-    args.add_argument('--compression', type=str, default='zstd:7')
-    args.add_argument('--hashes', type=str, default='sha1,xxh64')
-    args.add_argument('--limit', type=int, default=1 << 27)
-    args.add_argument('--batch_size', type=int, default=512)
-    args.add_argument('--progbar', type=int, default=1)
-    args.add_argument('--leave', type=int, default=0)
+    args.add_argument(
+        '--out_root',
+        type=str,
+        required=True,
+        help='Directory path to store the output dataset',
+    )
+    args.add_argument(
+        '--compression',
+        type=str,
+        default='zstd:7',
+        help='Compression algorithm to use. Default: zstd:7',
+    )
+    args.add_argument(
+        '--hashes',
+        type=str,
+        default='sha1,xxh64',
+        help='Hashing algorithms to apply to shard files. Default: sha1,xxh64',
+    )
+    args.add_argument(
+        '--size_limit',
+        type=int,
+        default=1 << 27,
+        help='Shard size limit, after which point to start a new shard. Default: 1 << 27',
+    )
+    args.add_argument(
+        '--batch_size',
+        type=int,
+        default=512,
+        help='DataLoader batch size. Default: 512',
+    )
+    args.add_argument(
+        '--progbar',
+        type=int,
+        default=1,
+        help='tqdm progress bar. Default: 1 (True)',
+    )
+    args.add_argument(
+        '--leave',
+        type=int,
+        default=0,
+        help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' +
+        '(False)',
+    )
     return args.parse_args()
 
 
@@ -72,6 +110,7 @@ def each(dataset: Dataset, num_workers: int, batch_size: int) -> Iterator[Dict[s
         dataset (Dataset): A HuggingFace Dataset locally downloaded.
         num_workers (int): DataLoader number of workers.
         batch_size (int): DataLoader batch size.
+
     Returns:
         Iterator[Dict[str, Any]]: Sample dicts.
     """
@@ -89,9 +128,10 @@ def each(dataset: Dataset, num_workers: int, batch_size: int) -> Iterator[Dict[s
 
 
 def main(args: Namespace) -> None:
-    """Main: create streaming CIFAR10 dataset.
+    """Main: create streaming C4 dataset.
+
     Args:
-        args (Namespace): Commandline arguments.
+        args (Namespace): command-line arguments.
     """
     splits = [
         ('train', 'train', 364868892, 64),
@@ -102,7 +142,7 @@ def main(args: Namespace) -> None:
     for old_split, new_split, num_samples, num_workers in splits:
         dataset = get(old_split)
         split_dir = os.path.join(args.out_root, new_split)
-        with MDSWriter(split_dir, fields, args.compression, hashes, args.limit) as out:
+        with MDSWriter(split_dir, fields, args.compression, hashes, args.size_limit) as out:
             samples = each(dataset, num_workers, args.batch_size)  # pyright: ignore
             if args.progbar:
                 samples = tqdm(samples, total=num_samples, leave=args.leave)
diff --git a/streaming/vision/__init__.py b/streaming/vision/__init__.py
index 0852fb45f..2b25df288 100644
--- a/streaming/vision/__init__.py
+++ b/streaming/vision/__init__.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Natively supported CV datasets."""
+
 from streaming.vision.ade20k import ADE20k as ADE20K
 from streaming.vision.cifar10 import CIFAR10 as CIFAR10
 from streaming.vision.coco import COCO as COCO
diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py
index af69827d3..40d05fcee 100644
--- a/streaming/vision/ade20k.py
+++ b/streaming/vision/ade20k.py
@@ -1,6 +1,12 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""ADE20K Semantic segmentation and scene parsing dataset.
+
+Please refer to the `ADE20K dataset <https://groups.csail.mit.edu/vision/datasets/ADE20K/>`_ for
+more details about this dataset.
+"""
+
 from typing import Any, Callable, Optional, Tuple
 
 from streaming.base import Dataset
@@ -57,7 +63,7 @@ def __getitem__(self, idx: int) -> Tuple[Any, Any]:
             idx (int): Sample index.
 
         Returns:
-            Any: Sample data.
+            Tuple[Any, Any]: Sample data and label.
         """
         obj = super().__getitem__(idx)
         x = obj['x']
diff --git a/streaming/vision/base.py b/streaming/vision/base.py
index 1e0f7124a..1960af117 100644
--- a/streaming/vision/base.py
+++ b/streaming/vision/base.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Base Class for making Computer Vision datasets which are compatible with :class:`Dataset`."""
+
 from typing import Any, Callable, Optional, Tuple
 
 from torchvision.transforms.functional import to_tensor
@@ -11,8 +13,7 @@
 
 
 class StandardTransform(object):
-    """Individual input and output transforms called jointly, following
-    torchvision.
+    """Individual input and output transforms called jointly, following torchvision.
 
     Args:
         transform (Callable, optional): Input transform. Defaults to ``None``.
diff --git a/streaming/vision/cifar10.py b/streaming/vision/cifar10.py
index 0b87babec..9a9a0cd3a 100644
--- a/streaming/vision/cifar10.py
+++ b/streaming/vision/cifar10.py
@@ -1,11 +1,17 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""CIFAR-10 classification streaming dataset.
+
+It is one of the most widely used datasets for machine learning research. Please refer to the
+`CIFAR-10 Dataset <https://www.cs.toronto.edu/~kriz/cifar.html>`_ for more details.
+"""
+
 from streaming.vision.base import ImageClassDataset
 
 
 class CIFAR10(ImageClassDataset):
-    """Streaming CIFAR10.
+    """Implementation of the CIFAR-10 dataset using streaming Dataset.
 
     Args:
         local (str): Local filesystem directory where dataset is cached during operation.
diff --git a/streaming/vision/coco.py b/streaming/vision/coco.py
index 2b3932f95..db3ae3aac 100644
--- a/streaming/vision/coco.py
+++ b/streaming/vision/coco.py
@@ -1,14 +1,19 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""COCO (Common Objects in Context) dataset.
+
+COCO is a large-scale object detection, segmentation, and captioning dataset. Please refer to the
+`COCO dataset <https://cocodataset.org>`_ for more details.
+"""
+
 from typing import Any, Callable, Optional
 
 from streaming.base import Dataset
 
 
 class COCO(Dataset):
-    """
-    Implementation of the COCO dataset using streaming Dataset.
+    """Implementation of the COCO dataset using streaming Dataset.
 
     Args:
         local (str): Local filesystem directory where dataset is cached during operation.
@@ -51,6 +56,14 @@ def __init__(self,
         self.transform = transform
 
     def __getitem__(self, idx: int) -> Any:
+        """Get sample by global index, blocking to load its shard if missing.
+
+        Args:
+            idx (int): Sample index.
+
+        Returns:
+            Any: Sample data.
+        """
         x = super().__getitem__(idx)
         img = x['img'].convert('RGB')
         img_id = x['img_id']
diff --git a/streaming/vision/convert/__init__.py b/streaming/vision/convert/__init__.py
index 636fee966..3e01af96a 100644
--- a/streaming/vision/convert/__init__.py
+++ b/streaming/vision/convert/__init__.py
@@ -1,2 +1,4 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
+
+"""Data conversion scripts for Computer Vision."""
diff --git a/streaming/vision/convert/ade20k.py b/streaming/vision/convert/ade20k.py
index 99f771ac6..9265021bf 100644
--- a/streaming/vision/convert/ade20k.py
+++ b/streaming/vision/convert/ade20k.py
@@ -16,23 +16,23 @@
 
 
 def parse_args() -> Namespace:
-    """Parse command line arguments.
+    """Parse command-line arguments.
 
-    Args:
-        Namespace: Command line arguments.
+    Returns:
+        Namespace: Command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument(
         '--in_root',
         type=str,
         required=True,
-        help='Location of the input dataset',
+        help='Directory path of the input dataset',
     )
     args.add_argument(
         '--out_root',
         type=str,
         required=True,
-        help='Location to store the output dataset',
+        help='Directory path to store the output dataset',
     )
     args.add_argument(
         '--splits',
@@ -53,23 +53,23 @@ def parse_args() -> Namespace:
         help='Hashing algorithms to apply to shard files. Default: sha1,xxh64',
     )
     args.add_argument(
-        '--limit',
+        '--size_limit',
         type=int,
         default=1 << 22,
-        help='Shard size limit, after which point to start a new shard. Default: 4194304',
+        help='Shard size limit, after which point to start a new shard. Default: 1 << 22',
     )
     args.add_argument(
         '--progbar',
         type=int,
         default=1,
-        help='tqdm progress bar. Default: 1 (Act as True)',
+        help='tqdm progress bar. Default: 1 (True)',
     )
     args.add_argument(
         '--leave',
         type=int,
         default=0,
         help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' +
-        '(Act as False)',
+        '(False)',
     )
     return args.parse_args()
 
@@ -136,7 +136,7 @@ def main(args: Namespace) -> None:
     """Main: create streaming ADE20K dataset.
 
     Args:
-        args (Namespace): Command line arguments.
+        args (Namespace): command-line arguments.
     """
     fields = {'uid': 'bytes', 'x': 'jpeg', 'y': 'png'}
 
@@ -156,7 +156,8 @@ def main(args: Namespace) -> None:
         if args.progbar:
             samples = tqdm(samples, leave=args.leave)
 
-        with MDSWriter(split_images_out_dir, fields, args.compression, hashes, args.limit) as out:
+        with MDSWriter(split_images_out_dir, fields, args.compression, hashes,
+                       args.size_limit) as out:
             for sample in each(samples):
                 out.write(sample)
 
diff --git a/streaming/vision/convert/base.py b/streaming/vision/convert/base.py
index 0e94cea50..77b25d4b9 100644
--- a/streaming/vision/convert/base.py
+++ b/streaming/vision/convert/base.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""Utility and helper functions to convert CV datasets."""
+
 import os
 from typing import List, Optional
 
diff --git a/streaming/vision/convert/cifar10.py b/streaming/vision/convert/cifar10.py
index 1677e87f8..2de4a6603 100644
--- a/streaming/vision/convert/cifar10.py
+++ b/streaming/vision/convert/cifar10.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""CIFAR10 streaming dataset conversion scripts."""
+
 from argparse import ArgumentParser, Namespace
 
 from torchvision.datasets import CIFAR10
@@ -10,20 +12,61 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
-    Args:
-        Namespace: Commandline arguments.
+    Returns:
+        Namespace: Command-line arguments.
     """
     args = ArgumentParser()
-    args.add_argument('--in_root', type=str, required=True)
-    args.add_argument('--out_root', type=str, required=True)
-    args.add_argument('--splits', type=str, default='train,val')
-    args.add_argument('--compression', type=str, default='')
-    args.add_argument('--hashes', type=str, default='sha1,xxh64')
-    args.add_argument('--limit', type=int, default=1 << 20)
-    args.add_argument('--progbar', type=int, default=1)
-    args.add_argument('--leave', type=int, default=0)
+    args.add_argument(
+        '--in_root',
+        type=str,
+        required=True,
+        help='Directory path of the input dataset',
+    )
+    args.add_argument(
+        '--out_root',
+        type=str,
+        required=True,
+        help='Directory path to store the output dataset',
+    )
+    args.add_argument(
+        '--splits',
+        type=str,
+        default='train,val',
+        help='Split to use. Default: train,val',
+    )
+    args.add_argument(
+        '--compression',
+        type=str,
+        default='',
+        help='Compression algorithm to use. Default: None',
+    )
+    args.add_argument(
+        '--hashes',
+        type=str,
+        default='sha1,xxh64',
+        help='Hashing algorithms to apply to shard files. Default: sha1,xxh64',
+    )
+    args.add_argument(
+        '--size_limit',
+        type=int,
+        default=1 << 20,
+        help='Shard size limit, after which point to start a new shard. Default: 1 << 20',
+    )
+    args.add_argument(
+        '--progbar',
+        type=int,
+        default=1,
+        help='tqdm progress bar. Default: 1 (True)',
+    )
+    args.add_argument(
+        '--leave',
+        type=int,
+        default=0,
+        help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' +
+        '(False)',
+    )
     return args.parse_args()
 
 
@@ -31,14 +74,14 @@ def main(args: Namespace) -> None:
     """Main: create streaming CIFAR10 dataset.
 
     Args:
-        args (Namespace): Commandline arguments.
+        args (Namespace): command-line arguments.
     """
     splits = get_list_arg(args.splits)
     hashes = get_list_arg(args.hashes)
     for split in splits:
         dataset = CIFAR10(root=args.in_root, train=(split == 'train'), download=True)
         convert_image_class_dataset(dataset, args.out_root, split, args.compression, hashes,
-                                    args.limit, args.progbar, args.leave, 'pil')
+                                    args.size_limit, args.progbar, args.leave, 'pil')
 
 
 if __name__ == '__main__':
diff --git a/streaming/vision/convert/coco.py b/streaming/vision/convert/coco.py
index d9e4ea2ac..c4a9e5fc2 100644
--- a/streaming/vision/convert/coco.py
+++ b/streaming/vision/convert/coco.py
@@ -19,10 +19,10 @@
 
 
 def parse_args() -> Namespace:
-    """Parse command line arguments.
+    """Parse command-line arguments.
 
-    Args:
-        Namespace: Command line arguments.
+    Returns:
+        Namespace: Command-line arguments.
     """
     args = ArgumentParser()
     args.add_argument(
@@ -56,10 +56,10 @@ def parse_args() -> Namespace:
         help='Hashing algorithms to apply to shard files. Default: sha1,xxh64',
     )
     args.add_argument(
-        '--limit',
+        '--size_limit',
         type=int,
         default=1 << 25,
-        help='Shard size limit, after which point to start a new shard. Default: 33554432',
+        help='Shard size limit, after which point to start a new shard. Default: 1 << 25',
     )
     args.add_argument(
         '--progbar',
@@ -197,7 +197,7 @@ def main(args: Namespace) -> None:
     """Main: create COCO streaming dataset.
 
     Args:
-        args (Namespace): Command line arguments.
+        args (Namespace): command-line arguments.
     """
     fields = {
         'img': 'jpeg',
@@ -235,7 +235,7 @@ def main(args: Namespace) -> None:
         else:
             dataset = each(dataset, shuffle)
 
-        with MDSWriter(split_out_dir, fields, args.compression, hashes, args.limit) as out:
+        with MDSWriter(split_out_dir, fields, args.compression, hashes, args.size_limit) as out:
             for sample in dataset:
                 out.write(sample)
 
diff --git a/streaming/vision/convert/image_folder.py b/streaming/vision/convert/imagenet.py
similarity index 61%
rename from streaming/vision/convert/image_folder.py
rename to streaming/vision/convert/imagenet.py
index 743a09d88..4e75475c9 100644
--- a/streaming/vision/convert/image_folder.py
+++ b/streaming/vision/convert/imagenet.py
@@ -1,6 +1,8 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""ImageNet streaming dataset conversion scripts."""
+
 import os
 from argparse import ArgumentParser, Namespace
 from glob import glob
@@ -15,22 +17,73 @@
 
 
 def parse_args() -> Namespace:
-    """Parse commandline arguments.
+    """Parse command-line arguments.
 
-    Args:
-        Namespace: Commandline arguments.
+    Returns:
+        Namespace: Command-line arguments.
     """
     args = ArgumentParser()
-    args.add_argument('--in_root', type=str, required=True)
-    args.add_argument('--out_root', type=str, required=True)
-    args.add_argument('--splits', type=str, default='train,val')
-    args.add_argument('--compression', type=str, default='')
-    args.add_argument('--hashes', type=str, default='sha1,xxh64')
-    args.add_argument('--size_limit', type=int, default=1 << 26)
-    args.add_argument('--progbar', type=int, default=1)
-    args.add_argument('--leave', type=int, default=0)
-    args.add_argument('--validate', type=int, default=1)
-    args.add_argument('--extensions', type=str, default='jpeg')
+    args.add_argument(
+        '--in_root',
+        type=str,
+        required=True,
+        help='Directory path of the input dataset',
+    )
+    args.add_argument(
+        '--out_root',
+        type=str,
+        required=True,
+        help='Directory path to store the output dataset',
+    )
+    args.add_argument(
+        '--splits',
+        type=str,
+        default='train,val',
+        help='Split to use. Default: train,val',
+    )
+    args.add_argument(
+        '--compression',
+        type=str,
+        default='',
+        help='Compression algorithm to use. Default: None',
+    )
+    args.add_argument(
+        '--hashes',
+        type=str,
+        default='sha1,xxh64',
+        help='Hashing algorithms to apply to shard files. Default: sha1,xxh64',
+    )
+    args.add_argument(
+        '--size_limit',
+        type=int,
+        default=1 << 26,
+        help='Shard size limit, after which point to start a new shard. Default: 1 << 26',
+    )
+    args.add_argument(
+        '--progbar',
+        type=int,
+        default=1,
+        help='tqdm progress bar. Default: 1 (True)',
+    )
+    args.add_argument(
+        '--leave',
+        type=int,
+        default=0,
+        help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' +
+        '(False)',
+    )
+    args.add_argument(
+        '--validate',
+        type=int,
+        default=1,
+        help='Validate that it is an Image. Default: 1 (True)',
+    )
+    args.add_argument(
+        '--extensions',
+        type=str,
+        default='jpeg',
+        help='Validate filename extensions. Default: jpeg',
+    )
     return args.parse_args()
 
 
@@ -75,10 +128,10 @@ def get_classes(filenames: List[str],
 
 
 def main(args: Namespace) -> None:
-    """Main: create streaming CIFAR10 dataset.
+    """Main: create streaming ImageNet dataset.
 
     Args:
-        args (Namespace): Commandline arguments.
+        args (Namespace): command-line arguments.
     """
     splits = get_list_arg(args.splits)
     columns = {'i': 'int', 'x': 'jpeg', 'y': 'int'}
diff --git a/streaming/vision/imagenet.py b/streaming/vision/imagenet.py
index c60fad1e4..4436d08ef 100644
--- a/streaming/vision/imagenet.py
+++ b/streaming/vision/imagenet.py
@@ -1,11 +1,17 @@
 # Copyright 2022 MosaicML Streaming authors
 # SPDX-License-Identifier: Apache-2.0
 
+"""ImageNet classification streaming dataset.
+
+The most widely used dataset for Image Classification algorithms. Please refer to the `ImageNet
+2012 Classification Dataset <http://image-net.org/>`_ for more details.
+"""
+
 from streaming.vision.base import ImageClassDataset
 
 
 class ImageNet(ImageClassDataset):
-    """Streaming ImageNet.
+    """Implementation of the ImageNet dataset using streaming Dataset.
 
     Args:
         local (str): Local filesystem directory where dataset is cached during operation.
diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index b7ef7185e..4ac0a1e06 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -347,7 +347,9 @@ def test_dataloader_single_device(remote_local: Tuple[str, str], batch_size: int
 
 def check_for_diff_files(dir: dircmp, compression_ext: Union[None, str]):
     """Check recursively for different files in a dircmp object.
-    Local directory also has the uncompressed files, ignore it during file comparison."""
+
+    The local directory also contains the uncompressed files; ignore them during file comparison.
+    """
     if compression_ext:
         for file in dir.diff_files:
             assert not file.endswith(compression_ext)
diff --git a/tests/test_streaming_remote.py b/tests/test_streaming_remote.py
index 714fa4bda..21d050b57 100644
--- a/tests/test_streaming_remote.py
+++ b/tests/test_streaming_remote.py
@@ -104,7 +104,6 @@ def get_dataset(name: str,
 ])
 @pytest.mark.parametrize('split', ['val'])
 def test_streaming_remote_dataset(tmp_path: pathlib.Path, name: str, split: str) -> None:
-
     # Build StreamingDataset
     build_start = time.time()
     expected_samples, dataset = get_dataset(name=name,

From b71489a0c5527816068aaa4833b66d8e89297972 Mon Sep 17 00:00:00 2001
From: Karan Jariwala <karankjariwala@gmail.com>
Date: Thu, 22 Sep 2022 15:03:19 -0700
Subject: [PATCH 2/3] Skip streaming base and version file in doc section

---
 docs/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index f4bdc3aba..7768dbd42 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -375,7 +375,7 @@ def _modules_to_rst() -> List[types.ModuleType]:
     document_modules: List[types.Module] = [
         streaming,
     ]
-    exclude_modules: List[types.Module] = []
+    exclude_modules: List[types.Module] = [streaming.base, streaming._version]
     for name in streaming.__dict__:
         obj = streaming.__dict__[name]
         if isinstance(obj, types.ModuleType) and obj not in exclude_modules:

From b9e4a2d410dea27e5b08949e7ddca132ee29444d Mon Sep 17 00:00:00 2001
From: Karan Jariwala <karankjariwala@gmail.com>
Date: Sat, 24 Sep 2022 18:32:27 -0700
Subject: [PATCH 3/3] Add text and vision to a doc

---
 docs/source/conf.py          | 6 ++----
 streaming/__init__.py        | 5 ++++-
 streaming/vision/__init__.py | 2 +-
 streaming/vision/ade20k.py   | 4 ++--
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7768dbd42..4dd52612e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -16,15 +16,13 @@
 import ast
 import importlib
 import inspect
-import json
 import os
 import shutil
 import sys
 import tempfile
-import textwrap
 import types
 import warnings
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Tuple, Type
 
 import sphinx.application
 import sphinx.ext.autodoc
@@ -375,7 +373,7 @@ def _modules_to_rst() -> List[types.ModuleType]:
     document_modules: List[types.Module] = [
         streaming,
     ]
-    exclude_modules: List[types.Module] = [streaming.base, streaming._version]
+    exclude_modules: List[types.Module] = [streaming._version]
     for name in streaming.__dict__:
         obj = streaming.__dict__[name]
         if isinstance(obj, types.ModuleType) and obj not in exclude_modules:
diff --git a/streaming/__init__.py b/streaming/__init__.py
index 4c9ab3693..aa4e721d7 100644
--- a/streaming/__init__.py
+++ b/streaming/__init__.py
@@ -3,10 +3,13 @@
 
 """MosaicML Streaming Datasets for cloud-native model training."""
 
+import streaming.text as text
+import streaming.vision as vision
 from streaming._version import __version__
 from streaming.base import (CSVWriter, Dataset, JSONWriter, LocalDataset, MDSWriter, TSVWriter,
                             XSVWriter)
 
 __all__ = [
-    'Dataset', 'CSVWriter', 'JSONWriter', 'MDSWriter', 'TSVWriter', 'XSVWriter', 'LocalDataset'
+    'Dataset', 'CSVWriter', 'JSONWriter', 'MDSWriter', 'TSVWriter', 'XSVWriter', 'LocalDataset',
+    'vision', 'text'
 ]
diff --git a/streaming/vision/__init__.py b/streaming/vision/__init__.py
index 2b25df288..89b7d3aa6 100644
--- a/streaming/vision/__init__.py
+++ b/streaming/vision/__init__.py
@@ -3,7 +3,7 @@
 
 """Natively supported CV datasets."""
 
-from streaming.vision.ade20k import ADE20k as ADE20K
+from streaming.vision.ade20k import ADE20K as ADE20K
 from streaming.vision.cifar10 import CIFAR10 as CIFAR10
 from streaming.vision.coco import COCO as COCO
 from streaming.vision.imagenet import ImageNet as ImageNet
diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py
index 40d05fcee..76ed22e86 100644
--- a/streaming/vision/ade20k.py
+++ b/streaming/vision/ade20k.py
@@ -12,8 +12,8 @@
 from streaming.base import Dataset
 
 
-class ADE20k(Dataset):
-    """Implementation of the ADE20k dataset using streaming Dataset.
+class ADE20K(Dataset):
+    """Implementation of the ADE20K dataset using streaming Dataset.
 
     Args:
         local (str): Local dataset directory where shards are cached by split.