From 337e7e83a59cfaa0f6d26e9b4322d3ebaf987a5c Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Thu, 22 Sep 2022 13:53:39 -0700 Subject: [PATCH 1/3] Added pydocstyle and docformatter in pre-commit config --- .pre-commit-config.yaml | 16 ++++ docs/source/conf.py | 8 +- docs/source/doctest_fixtures.py | 5 +- pyproject.toml | 5 ++ setup.py | 5 +- streaming/__init__.py | 2 + streaming/base/__init__.py | 2 + streaming/base/compression/__init__.py | 2 + streaming/base/compression/bench.py | 8 +- streaming/base/compression/compression.py | 2 + streaming/base/compression/plot.py | 8 +- streaming/base/dataset.py | 3 +- streaming/base/distributed.py | 30 ++++++- streaming/base/download.py | 2 + streaming/base/format/__init__.py | 12 +++ streaming/base/format/base/__init__.py | 2 + streaming/base/format/base/reader.py | 2 + streaming/base/format/base/writer.py | 2 + streaming/base/format/json/README.md | 2 + streaming/base/format/json/__init__.py | 2 + streaming/base/format/json/encodings.py | 2 + streaming/base/format/json/reader.py | 2 + streaming/base/format/json/writer.py | 4 +- streaming/base/format/mds/README.md | 2 + streaming/base/format/mds/__init__.py | 2 + streaming/base/format/mds/encodings.py | 2 + streaming/base/format/mds/reader.py | 2 + streaming/base/format/mds/writer.py | 2 + streaming/base/format/xsv/README.md | 2 + streaming/base/format/xsv/__init__.py | 2 + streaming/base/format/xsv/encodings.py | 2 + streaming/base/format/xsv/reader.py | 2 + streaming/base/format/xsv/writer.py | 8 +- streaming/base/hashing/__init__.py | 2 + streaming/base/hashing/bench.py | 8 +- streaming/base/hashing/hashing.py | 2 + streaming/base/hashing/plot.py | 8 +- streaming/base/index.py | 7 +- streaming/base/local.py | 21 +++++ streaming/base/util.py | 4 +- streaming/text/__init__.py | 2 + streaming/text/c4.py | 8 +- streaming/text/convert/__init__.py | 2 + streaming/text/convert/c4.py | 64 ++++++++++++--- streaming/vision/__init__.py | 2 + streaming/vision/ade20k.py | 8 +- 
streaming/vision/base.py | 5 +- streaming/vision/cifar10.py | 8 +- streaming/vision/coco.py | 17 +++- streaming/vision/convert/__init__.py | 2 + streaming/vision/convert/ade20k.py | 21 ++--- streaming/vision/convert/base.py | 2 + streaming/vision/convert/cifar10.py | 67 ++++++++++++--- streaming/vision/convert/coco.py | 12 +-- .../convert/{image_folder.py => imagenet.py} | 81 +++++++++++++++---- streaming/vision/imagenet.py | 8 +- tests/test_streaming.py | 4 +- tests/test_streaming_remote.py | 1 - 58 files changed, 422 insertions(+), 98 deletions(-) rename streaming/vision/convert/{image_folder.py => imagenet.py} (61%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e27daabd7..644cdd6eb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -68,3 +68,19 @@ repos: pass_filenames: false args: [--warnings] additional_dependencies: ["pyright@1.1.256"] + - repo: https://github.com/myint/docformatter + rev: v1.5.0 + hooks: + - id: docformatter + args: [--in-place, --wrap-summaries=99, --wrap-descriptions=99] + - repo: https://github.com/PyCQA/pydocstyle + hooks: + - id: pydocstyle + name: pydocstyle + entry: pydocstyle + language: python + types: [python] + exclude: '(tests|.ci|.github)' + additional_dependencies: + - "toml" + rev: 6.1.1 diff --git a/docs/source/conf.py b/docs/source/conf.py index 6388224dd..f4bdc3aba 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -371,9 +371,7 @@ def _auto_rst_for_module(module: types.ModuleType, exclude_members: List[Any]) - def _modules_to_rst() -> List[types.ModuleType]: - """Return the list of modules for which to generate API reference rst - files.""" - + """Return the list of modules for which to generate API reference rst files.""" document_modules: List[types.Module] = [ streaming, ] @@ -389,8 +387,8 @@ def _modules_to_rst() -> List[types.ModuleType]: def _generate_rst_files_for_modules() -> None: """Generate .rst files for each module to include in the API reference. 
- These files contain the module docstring followed by tables listing - all the functions, classes, etc. + These files contain the module docstring followed by tables listing all the functions, classes, + etc. """ docs_dir = os.path.abspath(os.path.dirname(__file__)) module_rst_save_dir = os.path.join(docs_dir, 'api_reference') diff --git a/docs/source/doctest_fixtures.py b/docs/source/doctest_fixtures.py index a997373c2..b7ff8c4fe 100644 --- a/docs/source/doctest_fixtures.py +++ b/docs/source/doctest_fixtures.py @@ -6,9 +6,8 @@ """Fixtures available in doctests. -The script is run before any doctests are executed, so all imports and -variables are available in any doctest. The output of this setup script -does not show up in the documentation. +The script is run before any doctests are executed, so all imports and variables are available in +any doctest. The output of this setup script does not show up in the documentation. """ import os import sys diff --git a/pyproject.toml b/pyproject.toml index a0192d86f..ea3db9337 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -475,3 +475,8 @@ ignore_patterns = [ "wandb/**/*.py", "build/**/*.py", ] + +[tool.pydocstyle] +convention="google" +add_ignore="D102,D105,D107,D401" +add_select="D400,D404" diff --git a/setup.py b/setup.py index f91b08bb7..ebb83a02d 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,7 @@ -# Copyright 2022 MosaicML. All Rights Reserved. 
+# Copyright 2022 MosaicML Streaming authors +# SPDX-License-Identifier: Apache-2.0 + +"""Streaming package setup.""" import os diff --git a/streaming/__init__.py b/streaming/__init__.py index 52191ad6e..4c9ab3693 100644 --- a/streaming/__init__.py +++ b/streaming/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""MosaicML Streaming Datasets for cloud-native model training.""" + from streaming._version import __version__ from streaming.base import (CSVWriter, Dataset, JSONWriter, LocalDataset, MDSWriter, TSVWriter, XSVWriter) diff --git a/streaming/base/__init__.py b/streaming/base/__init__.py index 29281609a..e791cfcf3 100644 --- a/streaming/base/__init__.py +++ b/streaming/base/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""MosaicML Streaming Datasets for cloud-native model training.""" + from streaming.base.dataset import Dataset from streaming.base.format import CSVWriter, JSONWriter, MDSWriter, TSVWriter, XSVWriter from streaming.base.local import LocalDataset diff --git a/streaming/base/compression/__init__.py b/streaming/base/compression/__init__.py index ab1ea64f2..09c6533c0 100644 --- a/streaming/base/compression/__init__.py +++ b/streaming/base/compression/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""A collection of compression algorithms.""" + from streaming.base.compression.compression import (compress, decompress, get_compression_extension, get_compressions, is_compression) diff --git a/streaming/base/compression/bench.py b/streaming/base/compression/bench.py index f02c81e96..335653087 100644 --- a/streaming/base/compression/bench.py +++ b/streaming/base/compression/bench.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Script to benchmark compression algorithms.""" + from argparse import 
ArgumentParser, Namespace from time import time from typing import Iterator @@ -11,10 +13,10 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument('--data', type=str, required=True) @@ -48,7 +50,7 @@ def main(args: Namespace) -> None: """Benchmark compression algorithms. Args: - args (Namespace): Commandline flags. + args (Namespace): command-line flags. """ data = open(args.data, 'rb').read() for algo in sorted(get_compressions()): diff --git a/streaming/base/compression/compression.py b/streaming/base/compression/compression.py index 26064494e..4f4423400 100644 --- a/streaming/base/compression/compression.py +++ b/streaming/base/compression/compression.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""List of Compression and Decompression algorithms.""" + import bz2 import gzip from abc import ABC, abstractmethod diff --git a/streaming/base/compression/plot.py b/streaming/base/compression/plot.py index 7a097e2db..19f8065c8 100644 --- a/streaming/base/compression/plot.py +++ b/streaming/base/compression/plot.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Utility and helper functions to plot compression information.""" + from argparse import ArgumentParser, Namespace from collections import defaultdict from dataclasses import dataclass @@ -72,10 +74,10 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument('--data', type=str, required=True) @@ -252,7 +254,7 @@ def main(args: Namespace) -> None: """Plot info about compression. Args: - args (Namespace): Commandline arguments. 
+ args (Namespace): command-line arguments. """ data = load(args.data, args.min_dec_size) plot_compression_rates(data, algo2color, args.dpi, args.font_size, args.line_width, diff --git a/streaming/base/dataset.py b/streaming/base/dataset.py index 1834e5cab..e9e5603ec 100644 --- a/streaming/base/dataset.py +++ b/streaming/base/dataset.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""The :class:`Dataset` class, used for building streaming iterable datasets.""" + import json import os from enum import IntEnum @@ -292,7 +294,6 @@ def _preload_shard(self, shard: int, partition: Partition) -> bool: Returns: bool: Whether shard is present. """ - assert shard in partition.shards reader = self.shards[shard] for raw_info, zip_info in reader.file_pairs: diff --git a/streaming/base/distributed.py b/streaming/base/distributed.py index 0e4f7628b..7d6ece165 100644 --- a/streaming/base/distributed.py +++ b/streaming/base/distributed.py @@ -1,22 +1,44 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Helper methods to get the distributed attributes.""" + import os __all__ = ['get_global_rank', 'get_local_rank', 'get_local_world_size', 'get_world_size'] -def get_global_rank(): +def get_global_rank() -> int: + """Returns the global rank of the current process, which is on ``[0; WORLD_SIZE - 1]``. + + Returns: + int: The global rank. + """ return int(os.environ.get('RANK', 0)) -def get_world_size(): +def get_world_size() -> int: + """Returns the world size, which is the number of processes participating in this training run. + + Returns: + int: The world size. + """ return int(os.environ.get('WORLD_SIZE', 1)) -def get_local_rank(): +def get_local_rank() -> int: + """Returns the local rank for the current process, which is on ``[0; LOCAL_WORLD_SIZE - 1]``. + + Returns: + int: The local rank. 
+ """ return int(os.environ.get('LOCAL_RANK', 0)) -def get_local_world_size(): +def get_local_world_size() -> int: + """Returns the local world size, which is the number of processes for the current node. + + Returns: + int: The local world size. + """ return int(os.environ.get('LOCAL_WORLD_SIZE', 1)) diff --git a/streaming/base/download.py b/streaming/base/download.py index 86af8d174..54243b5e9 100644 --- a/streaming/base/download.py +++ b/streaming/base/download.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Download handling for :class:`Dataset`.""" + import os import shutil import urllib.parse diff --git a/streaming/base/format/__init__.py b/streaming/base/format/__init__.py index a483e6dff..7210b0219 100644 --- a/streaming/base/format/__init__.py +++ b/streaming/base/format/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Individual dataset writer for every format.""" + from typing import Any, Dict, Optional from streaming.base.format.base.reader import Reader @@ -19,6 +21,16 @@ def reader_from_json(dirname: str, split: Optional[str], obj: Dict[str, Any]) -> Reader: + """Initialize the reader from JSON object. + + Args: + dirname (str): Local directory containing shards. + split (str, optional): Which dataset split to use, if any. + obj (Dict[str, Any]): JSON object to load. 
+ + Returns: + Reader: Loaded Reader of `format` type + """ assert obj['version'] == 2 cls = _readers[obj['format']] return cls.from_json(dirname, split, obj) diff --git a/streaming/base/format/base/__init__.py b/streaming/base/format/base/__init__.py index 636fee966..7e551a3c8 100644 --- a/streaming/base/format/base/__init__.py +++ b/streaming/base/format/base/__init__.py @@ -1,2 +1,4 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 + +"""Base module for dataset reader and writer.""" diff --git a/streaming/base/format/base/reader.py b/streaming/base/format/base/reader.py index 2d2d1df65..c851ff8ce 100644 --- a/streaming/base/format/base/reader.py +++ b/streaming/base/format/base/reader.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Read and decode samples from shards.""" + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any, Dict, Iterator, List, Optional diff --git a/streaming/base/format/base/writer.py b/streaming/base/format/base/writer.py index 94ef315f9..e3f7b8710 100644 --- a/streaming/base/format/base/writer.py +++ b/streaming/base/format/base/writer.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Convert a list of samples into format files that can be read as a :class:`Dataset`.""" + import json import os from abc import ABC, abstractmethod diff --git a/streaming/base/format/json/README.md b/streaming/base/format/json/README.md index f41c9ed71..13cd1fd99 100644 --- a/streaming/base/format/json/README.md +++ b/streaming/base/format/json/README.md @@ -1,5 +1,6 @@ Example: +```json { "columns": { "number": "int", @@ -48,3 +49,4 @@ Example: } } } +``` diff --git a/streaming/base/format/json/__init__.py b/streaming/base/format/json/__init__.py index 869e6deed..10a569a69 100644 --- a/streaming/base/format/json/__init__.py +++ b/streaming/base/format/json/__init__.py @@ 
-1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Module to write and read the dataset in JSON format.""" + from streaming.base.format.json.reader import JSONReader from streaming.base.format.json.writer import JSONWriter diff --git a/streaming/base/format/json/encodings.py b/streaming/base/format/json/encodings.py index 2c07359de..4834cdd98 100644 --- a/streaming/base/format/json/encodings.py +++ b/streaming/base/format/json/encodings.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Check whether sample encoding is of supported JSON types.""" + from abc import ABC, abstractmethod from typing import Any diff --git a/streaming/base/format/json/reader.py b/streaming/base/format/json/reader.py index d6eae6417..4ac4409c9 100644 --- a/streaming/base/format/json/reader.py +++ b/streaming/base/format/json/reader.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +""":class:`JSONReader` reads samples from binary ``.json`` files that were written out by :class:`JSONWriter`.""" + import json import os from copy import deepcopy diff --git a/streaming/base/format/json/writer.py b/streaming/base/format/json/writer.py index d575daf3d..217a123ae 100644 --- a/streaming/base/format/json/writer.py +++ b/streaming/base/format/json/writer.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +""":class:`JSONWriter` converts a list of samples into binary ``.json`` files that can be read as a :class:`JSONReader`.""" + import json from typing import Any, Dict, List, Optional, Tuple @@ -11,7 +13,7 @@ class JSONWriter(SplitWriter): - """Writes a streaming JSON dataset. + r"""Writes a streaming JSON dataset. Args: dirname (str): Local dataset directory. 
diff --git a/streaming/base/format/mds/README.md b/streaming/base/format/mds/README.md index 11ad219e9..7bf04983b 100644 --- a/streaming/base/format/mds/README.md +++ b/streaming/base/format/mds/README.md @@ -1,5 +1,6 @@ Example: +```json { "column_encodings": [ "int", @@ -39,3 +40,4 @@ Example: } } } +``` diff --git a/streaming/base/format/mds/__init__.py b/streaming/base/format/mds/__init__.py index 2272ba3ef..084c5fd71 100644 --- a/streaming/base/format/mds/__init__.py +++ b/streaming/base/format/mds/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Module to write and read the dataset in MDS format.""" + from streaming.base.format.mds.reader import MDSReader from streaming.base.format.mds.writer import MDSWriter diff --git a/streaming/base/format/mds/encodings.py b/streaming/base/format/mds/encodings.py index 93eebba92..176bc1d6b 100644 --- a/streaming/base/format/mds/encodings.py +++ b/streaming/base/format/mds/encodings.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Encode and Decode samples in a supported MDS format.""" + import json import pickle from abc import ABC, abstractmethod diff --git a/streaming/base/format/mds/reader.py b/streaming/base/format/mds/reader.py index 021405837..698bc2979 100644 --- a/streaming/base/format/mds/reader.py +++ b/streaming/base/format/mds/reader.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +""":class:`MDSReader` reads samples from binary ``.mds`` files that were written out by :class:`MDSWriter`.""" + import os from copy import deepcopy from typing import Any, Dict, List, Optional diff --git a/streaming/base/format/mds/writer.py b/streaming/base/format/mds/writer.py index 1884738e4..ec97d776d 100644 --- a/streaming/base/format/mds/writer.py +++ b/streaming/base/format/mds/writer.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming 
authors # SPDX-License-Identifier: Apache-2.0 +""":class:`MDSWriter` converts a list of samples into binary `.mds` files that can be read as a :class:`MDSReader`.""" + import json from typing import Any, Dict, List, Optional diff --git a/streaming/base/format/xsv/README.md b/streaming/base/format/xsv/README.md index 296822b78..242a5ef9e 100644 --- a/streaming/base/format/xsv/README.md +++ b/streaming/base/format/xsv/README.md @@ -1,5 +1,6 @@ Example: +```json { "column_encodings": [ "int", @@ -53,3 +54,4 @@ Example: } } } +``` diff --git a/streaming/base/format/xsv/__init__.py b/streaming/base/format/xsv/__init__.py index d71949a04..5d7f691f7 100644 --- a/streaming/base/format/xsv/__init__.py +++ b/streaming/base/format/xsv/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Module to write and read the dataset in Tabular format.""" + from streaming.base.format.xsv.reader import CSVReader, TSVReader, XSVReader from streaming.base.format.xsv.writer import CSVWriter, TSVWriter, XSVWriter diff --git a/streaming/base/format/xsv/encodings.py b/streaming/base/format/xsv/encodings.py index da936250d..f8fb94cab 100644 --- a/streaming/base/format/xsv/encodings.py +++ b/streaming/base/format/xsv/encodings.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Encode and Decode samples in a supported Tabular format.""" + from abc import ABC, abstractmethod from typing import Any diff --git a/streaming/base/format/xsv/reader.py b/streaming/base/format/xsv/reader.py index 8a08c6e8b..d53a797ab 100644 --- a/streaming/base/format/xsv/reader.py +++ b/streaming/base/format/xsv/reader.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Read and decode samples from tabular format files such as XSV, CSV, and TSV.""" + import os from copy import deepcopy from typing import Any, Dict, List, Optional diff --git 
a/streaming/base/format/xsv/writer.py b/streaming/base/format/xsv/writer.py index 47deb1e06..05e8c95b8 100644 --- a/streaming/base/format/xsv/writer.py +++ b/streaming/base/format/xsv/writer.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Convert a list of samples into tabular data format files such as XSV, CSV, and TSV.""" + import json from typing import Any, Dict, List, Optional, Tuple @@ -11,7 +13,7 @@ class XSVWriter(SplitWriter): - """Writes a streaming XSV dataset. + r"""Writes a streaming XSV dataset. Args: dirname (str): Local dataset directory. @@ -107,7 +109,7 @@ def encode_split_shard(self) -> Tuple[bytes, bytes]: class CSVWriter(XSVWriter): - """Writes a streaming CSV dataset. + r"""Writes a streaming CSV dataset. Args: dirname (str): Local dataset directory. @@ -147,7 +149,7 @@ def get_config(self) -> Dict[str, Any]: class TSVWriter(XSVWriter): - """Writes a streaming TSV dataset. + r"""Writes a streaming TSV dataset. Args: dirname (str): Local dataset directory. 
diff --git a/streaming/base/hashing/__init__.py b/streaming/base/hashing/__init__.py index 501d6a82b..362426685 100644 --- a/streaming/base/hashing/__init__.py +++ b/streaming/base/hashing/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Hashing algorithms.""" + from streaming.base.hashing.hashing import get_hash, get_hashes, is_hash __all__ = ['get_hash', 'get_hashes', 'is_hash'] diff --git a/streaming/base/hashing/bench.py b/streaming/base/hashing/bench.py index 8c95e8407..9354e857a 100644 --- a/streaming/base/hashing/bench.py +++ b/streaming/base/hashing/bench.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Script to benchmark various hashing algorithms.""" + from argparse import ArgumentParser, Namespace from time import time from typing import Iterator @@ -11,10 +13,10 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument('--data', type=str, required=True) @@ -48,7 +50,7 @@ def main(args: Namespace) -> None: """Benchmark hash algorithms. Args: - args (Namespace): Commandline flags. + args (Namespace): command-line flags. 
""" data = open(args.data, 'rb').read() for algo in sorted(get_hashes()): diff --git a/streaming/base/hashing/hashing.py b/streaming/base/hashing/hashing.py index 32401d0b7..fe33443ef 100644 --- a/streaming/base/hashing/hashing.py +++ b/streaming/base/hashing/hashing.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Setter and Getter for support hashing algorithms.""" + import hashlib from typing import Any, Callable, Dict, Set diff --git a/streaming/base/hashing/plot.py b/streaming/base/hashing/plot.py index a6f630453..f82a958e8 100644 --- a/streaming/base/hashing/plot.py +++ b/streaming/base/hashing/plot.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Utility and helper functions to plot hashing information.""" + from argparse import ArgumentParser, Namespace from collections import defaultdict from typing import Dict, List, Optional, Tuple @@ -30,10 +32,10 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument('--data', type=str, required=True) @@ -112,7 +114,7 @@ def main(args: Namespace) -> None: """Plot info about hashing. Args: - args (Namespace): Commandline arguments. + args (Namespace): command-line arguments. 
""" data = load(args.data) plot_hash_rates(data, algo2color, args.dpi, args.font_size, args.line_width, args.hash_rates) diff --git a/streaming/base/index.py b/streaming/base/index.py index 39f8490b7..a8902ec75 100644 --- a/streaming/base/index.py +++ b/streaming/base/index.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Helper methods to get the shard attributes.""" + from math import ceil from typing import List, Optional, Tuple @@ -41,9 +43,8 @@ def __init__(self, shards: List[int], shards_to_download: List[int], min_sample_ class Index(object): """An index of sample ranges (corresponding to shards). - Enables (a) finding the shard for a given sample, (b) getting the - per-device dataset size, and (c) getting this device/worker's sample - range of the dataset. + Enables (a) finding the shard for a given sample, (b) getting the per-device dataset size, and + (c) getting this device/worker's sample range of the dataset. """ def __init__(self, samples_per_shard: List[int], batch_size: Optional[int] = None) -> None: diff --git a/streaming/base/local.py b/streaming/base/local.py index 882b84567..ad232099e 100644 --- a/streaming/base/local.py +++ b/streaming/base/local.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Local Dataset.""" + import json import os from typing import Any, Dict, Optional @@ -12,6 +14,12 @@ class LocalDataset(Dataset): + """The dataset resides locally in a machine. + + Args: + dirname (str): Local dataset directory where the dataset is present. + split (str, optional): Which dataset split to use, if any. Defaults to ``None``. + """ def __init__(self, dirname: str, split: Optional[str] = None): split = split or '' @@ -32,9 +40,22 @@ def __init__(self, dirname: str, split: Optional[str] = None): self.index = Index(shard_sizes) def __len__(self) -> int: + """Get the length as an IterableDataset. + + Returns: + int: Dataset length. 
+ """ return self.index.total_samples def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get sample by global index. + + Args: + idx (int): Sample index. + + Returns: + Dict[str, Any]: Column name with sample data. + """ shard_idx, idx_in_shard = self.index.find_sample(idx) shard = self.shards[shard_idx] return shard[idx_in_shard] diff --git a/streaming/base/util.py b/streaming/base/util.py index 3b6d5a57a..a523bd4a9 100644 --- a/streaming/base/util.py +++ b/streaming/base/util.py @@ -1,11 +1,13 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Utility and helper functions for datasets.""" + from typing import List def get_list_arg(text: str) -> List[str]: - """Pass a list as a commandline flag. + """Pass a list as a command-line flag. Args: text (str): Text to split. diff --git a/streaming/text/__init__.py b/streaming/text/__init__.py index ea6a79b3c..91a442c4e 100644 --- a/streaming/text/__init__.py +++ b/streaming/text/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Natively supported NLP datasets.""" + from streaming.text.c4 import C4 as C4 __all__ = ['C4'] diff --git a/streaming/text/c4.py b/streaming/text/c4.py index 9a9457449..8387cf9e6 100644 --- a/streaming/text/c4.py +++ b/streaming/text/c4.py @@ -1,6 +1,12 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""C4 (Colossal Cleaned Common Crawl) dataset. + +This dataset is a colossal, cleaned version of Common Crawl's web crawl corpus and it is based on +the `Common Crawl `_ dataset. +""" + from typing import Any, Dict, Optional from transformers.models.auto.tokenization_auto import AutoTokenizer @@ -9,7 +15,7 @@ class C4(Dataset): - """C4 (Colossal Cleaned Common Crawl) dataset. + """Implementation of the C4 (Colossal Cleaned Common Crawl) dataset using streaming Dataset. 
Args: tokenizer_name (str): The name of the HuggingFace tokenizer to use to tokenize samples. diff --git a/streaming/text/convert/__init__.py b/streaming/text/convert/__init__.py index 636fee966..361a845c8 100644 --- a/streaming/text/convert/__init__.py +++ b/streaming/text/convert/__init__.py @@ -1,2 +1,4 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 + +"""Data conversion scripts for Natural Language Processing.""" diff --git a/streaming/text/convert/c4.py b/streaming/text/convert/c4.py index b59940f5d..43b28244e 100644 --- a/streaming/text/convert/c4.py +++ b/streaming/text/convert/c4.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""C4 (Colossal Cleaned Common Crawl) streaming dataset conversion scripts.""" + import os from argparse import ArgumentParser, Namespace from typing import Any, Dict, Iterator @@ -15,19 +17,55 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() - args.add_argument('--out_root', type=str, required=True) - args.add_argument('--compression', type=str, default='zstd:7') - args.add_argument('--hashes', type=str, default='sha1,xxh64') - args.add_argument('--limit', type=int, default=1 << 27) - args.add_argument('--batch_size', type=int, default=512) - args.add_argument('--progbar', type=int, default=1) - args.add_argument('--leave', type=int, default=0) + args.add_argument( + '--out_root', + type=str, + required=True, + help='Directory path to store the output dataset', + ) + args.add_argument( + '--compression', + type=str, + default='zstd:7', + help='Compression algorithm to use. Default: zstd:7', + ) + args.add_argument( + '--hashes', + type=str, + default='sha1,xxh64', + help='Hashing algorithms to apply to shard files. 
Default: sha1,xxh64', + ) + args.add_argument( + '--size_limit', + type=int, + default=1 << 27, + help='Shard size limit, after which point to start a new shard. Default: 1 << 27', + ) + args.add_argument( + '--batch_size', + type=int, + default=512, + help='DataLoader batch size. Default: 512', + ) + args.add_argument( + '--progbar', + type=int, + default=1, + help='tqdm progress bar. Default: 1 (True)', + ) + args.add_argument( + '--leave', + type=int, + default=0, + help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' + + '(False)', + ) return args.parse_args() @@ -72,6 +110,7 @@ def each(dataset: Dataset, num_workers: int, batch_size: int) -> Iterator[Dict[s dataset (Dataset): A HuggingFace Dataset locally downloaded. num_workers (int): DataLoader number of workers. batch_size (int): DataLoader batch size. + Returns: Iterator[Dict[str, Any]]: Sample dicts. """ @@ -89,9 +128,10 @@ def each(dataset: Dataset, num_workers: int, batch_size: int) -> Iterator[Dict[s def main(args: Namespace) -> None: - """Main: create streaming CIFAR10 dataset. + """Main: create streaming C4 dataset. + Args: - args (Namespace): Commandline arguments. + args (Namespace): command-line arguments. 
""" splits = [ ('train', 'train', 364868892, 64), @@ -102,7 +142,7 @@ def main(args: Namespace) -> None: for old_split, new_split, num_samples, num_workers in splits: dataset = get(old_split) split_dir = os.path.join(args.out_root, new_split) - with MDSWriter(split_dir, fields, args.compression, hashes, args.limit) as out: + with MDSWriter(split_dir, fields, args.compression, hashes, args.size_limit) as out: samples = each(dataset, num_workers, args.batch_size) # pyright: ignore if args.progbar: samples = tqdm(samples, total=num_samples, leave=args.leave) diff --git a/streaming/vision/__init__.py b/streaming/vision/__init__.py index 0852fb45f..2b25df288 100644 --- a/streaming/vision/__init__.py +++ b/streaming/vision/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Natively supported CV datasets.""" + from streaming.vision.ade20k import ADE20k as ADE20K from streaming.vision.cifar10 import CIFAR10 as CIFAR10 from streaming.vision.coco import COCO as COCO diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py index af69827d3..40d05fcee 100644 --- a/streaming/vision/ade20k.py +++ b/streaming/vision/ade20k.py @@ -1,6 +1,12 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""ADE20K Semantic segmentation and scene parsing dataset. + +Please refer to the `ADE20K dataset `_ for more details about this +dataset. +""" + from typing import Any, Callable, Optional, Tuple from streaming.base import Dataset @@ -57,7 +63,7 @@ def __getitem__(self, idx: int) -> Tuple[Any, Any]: idx (int): Sample index. Returns: - Any: Sample data. + Tuple[Any, Any]: Sample data and label. 
""" obj = super().__getitem__(idx) x = obj['x'] diff --git a/streaming/vision/base.py b/streaming/vision/base.py index 1e0f7124a..1960af117 100644 --- a/streaming/vision/base.py +++ b/streaming/vision/base.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Base Class for making Computer Vision datasets which are compatible with :class:`Dataset`.""" + from typing import Any, Callable, Optional, Tuple from torchvision.transforms.functional import to_tensor @@ -11,8 +13,7 @@ class StandardTransform(object): - """Individual input and output transforms called jointly, following - torchvision. + """Individual input and output transforms called jointly, following torchvision. Args: transform (Callable, optional): Input transform. Defaults to ``None``. diff --git a/streaming/vision/cifar10.py b/streaming/vision/cifar10.py index 0b87babec..9a9a0cd3a 100644 --- a/streaming/vision/cifar10.py +++ b/streaming/vision/cifar10.py @@ -1,11 +1,17 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""CIFAR-10 classification streaming dataset. + +It is one of the most widely used datasets for machine learning research. Please refer to the +`CIFAR-10 Dataset `_ for more details. +""" + from streaming.vision.base import ImageClassDataset class CIFAR10(ImageClassDataset): - """Streaming CIFAR10. + """Implementation of the CIFAR-10 dataset using streaming Dataset. Args: local (str): Local filesystem directory where dataset is cached during operation. diff --git a/streaming/vision/coco.py b/streaming/vision/coco.py index 2b3932f95..db3ae3aac 100644 --- a/streaming/vision/coco.py +++ b/streaming/vision/coco.py @@ -1,14 +1,19 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""COCO (Common Objects in Context) dataset. + +COCO is a large-scale object detection, segmentation, and captioning dataset. Please refer to the +`COCO dataset `_ for more details. 
+""" + from typing import Any, Callable, Optional from streaming.base import Dataset class COCO(Dataset): - """ - Implementation of the COCO dataset using streaming Dataset. + """Implementation of the COCO dataset using streaming Dataset. Args: local (str): Local filesystem directory where dataset is cached during operation. @@ -51,6 +56,14 @@ def __init__(self, self.transform = transform def __getitem__(self, idx: int) -> Any: + """Get sample by global index, blocking to load its shard if missing. + + Args: + idx (int): Sample index. + + Returns: + Any: Sample data. + """ x = super().__getitem__(idx) img = x['img'].convert('RGB') img_id = x['img_id'] diff --git a/streaming/vision/convert/__init__.py b/streaming/vision/convert/__init__.py index 636fee966..3e01af96a 100644 --- a/streaming/vision/convert/__init__.py +++ b/streaming/vision/convert/__init__.py @@ -1,2 +1,4 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 + +"""Data conversion scripts for Computer Vision.""" diff --git a/streaming/vision/convert/ade20k.py b/streaming/vision/convert/ade20k.py index 99f771ac6..9265021bf 100644 --- a/streaming/vision/convert/ade20k.py +++ b/streaming/vision/convert/ade20k.py @@ -16,23 +16,23 @@ def parse_args() -> Namespace: - """Parse command line arguments. + """Parse command-line arguments. Args: - Namespace: Command line arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument( '--in_root', type=str, required=True, - help='Location of the input dataset', + help='Directory path of the input dataset', ) args.add_argument( '--out_root', type=str, required=True, - help='Location to store the output dataset', + help='Directory path to store the output dataset', ) args.add_argument( '--splits', @@ -53,23 +53,23 @@ def parse_args() -> Namespace: help='Hashing algorithms to apply to shard files. 
Default: sha1,xxh64', ) args.add_argument( - '--limit', + '--size_limit', type=int, default=1 << 22, - help='Shard size limit, after which point to start a new shard. Default: 4194304', + help='Shard size limit, after which point to start a new shard. Default: 1 << 22', ) args.add_argument( '--progbar', type=int, default=1, - help='tqdm progress bar. Default: 1 (Act as True)', + help='tqdm progress bar. Default: 1 (True)', ) args.add_argument( '--leave', type=int, default=0, help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' + - '(Act as False)', + '(False)', ) return args.parse_args() @@ -136,7 +136,7 @@ def main(args: Namespace) -> None: """Main: create streaming ADE20K dataset. Args: - args (Namespace): Command line arguments. + args (Namespace): command-line arguments. """ fields = {'uid': 'bytes', 'x': 'jpeg', 'y': 'png'} @@ -156,7 +156,8 @@ def main(args: Namespace) -> None: if args.progbar: samples = tqdm(samples, leave=args.leave) - with MDSWriter(split_images_out_dir, fields, args.compression, hashes, args.limit) as out: + with MDSWriter(split_images_out_dir, fields, args.compression, hashes, + args.size_limit) as out: for sample in each(samples): out.write(sample) diff --git a/streaming/vision/convert/base.py b/streaming/vision/convert/base.py index 0e94cea50..77b25d4b9 100644 --- a/streaming/vision/convert/base.py +++ b/streaming/vision/convert/base.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""Utility and helper functions to convert CV datasets.""" + import os from typing import List, Optional diff --git a/streaming/vision/convert/cifar10.py b/streaming/vision/convert/cifar10.py index 1677e87f8..2de4a6603 100644 --- a/streaming/vision/convert/cifar10.py +++ b/streaming/vision/convert/cifar10.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""CIFAR10 streaming dataset conversion scripts.""" + from argparse 
import ArgumentParser, Namespace from torchvision.datasets import CIFAR10 @@ -10,20 +12,61 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. """ args = ArgumentParser() - args.add_argument('--in_root', type=str, required=True) - args.add_argument('--out_root', type=str, required=True) - args.add_argument('--splits', type=str, default='train,val') - args.add_argument('--compression', type=str, default='') - args.add_argument('--hashes', type=str, default='sha1,xxh64') - args.add_argument('--limit', type=int, default=1 << 20) - args.add_argument('--progbar', type=int, default=1) - args.add_argument('--leave', type=int, default=0) + args.add_argument( + '--in_root', + type=str, + required=True, + help='Directory path of the input dataset', + ) + args.add_argument( + '--out_root', + type=str, + required=True, + help='Directory path to store the output dataset', + ) + args.add_argument( + '--splits', + type=str, + default='train,val', + help='Split to use. Default: train,val', + ) + args.add_argument( + '--compression', + type=str, + default='', + help='Compression algorithm to use. Default: None', + ) + args.add_argument( + '--hashes', + type=str, + default='sha1,xxh64', + help='Hashing algorithms to apply to shard files. Default: sha1,xxh64', + ) + args.add_argument( + '--size_limit', + type=int, + default=1 << 20, + help='Shard size limit, after which point to start a new shard. Default: 1 << 20', + ) + args.add_argument( + '--progbar', + type=int, + default=1, + help='tqdm progress bar. Default: 1 (True)', + ) + args.add_argument( + '--leave', + type=int, + default=0, + help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' + + '(False)', + ) return args.parse_args() @@ -31,14 +74,14 @@ def main(args: Namespace) -> None: """Main: create streaming CIFAR10 dataset. 
Args: - args (Namespace): Commandline arguments. + args (Namespace): command-line arguments. """ splits = get_list_arg(args.splits) hashes = get_list_arg(args.hashes) for split in splits: dataset = CIFAR10(root=args.in_root, train=(split == 'train'), download=True) convert_image_class_dataset(dataset, args.out_root, split, args.compression, hashes, - args.limit, args.progbar, args.leave, 'pil') + args.size_limit, args.progbar, args.leave, 'pil') if __name__ == '__main__': diff --git a/streaming/vision/convert/coco.py b/streaming/vision/convert/coco.py index d9e4ea2ac..c4a9e5fc2 100644 --- a/streaming/vision/convert/coco.py +++ b/streaming/vision/convert/coco.py @@ -19,10 +19,10 @@ def parse_args() -> Namespace: - """Parse command line arguments. + """Parse command-line arguments. Args: - Namespace: Command line arguments. + Namespace: command-line arguments. """ args = ArgumentParser() args.add_argument( @@ -56,10 +56,10 @@ def parse_args() -> Namespace: help='Hashing algorithms to apply to shard files. Default: sha1,xxh64', ) args.add_argument( - '--limit', + '--size_limit', type=int, default=1 << 25, - help='Shard size limit, after which point to start a new shard. Default: 33554432', + help='Shard size limit, after which point to start a new shard. Default: 1 << 25', ) args.add_argument( '--progbar', @@ -197,7 +197,7 @@ def main(args: Namespace) -> None: """Main: create COCO streaming dataset. Args: - args (Namespace): Command line arguments. + args (Namespace): command-line arguments. 
""" fields = { 'img': 'jpeg', @@ -235,7 +235,7 @@ def main(args: Namespace) -> None: else: dataset = each(dataset, shuffle) - with MDSWriter(split_out_dir, fields, args.compression, hashes, args.limit) as out: + with MDSWriter(split_out_dir, fields, args.compression, hashes, args.size_limit) as out: for sample in dataset: out.write(sample) diff --git a/streaming/vision/convert/image_folder.py b/streaming/vision/convert/imagenet.py similarity index 61% rename from streaming/vision/convert/image_folder.py rename to streaming/vision/convert/imagenet.py index 743a09d88..4e75475c9 100644 --- a/streaming/vision/convert/image_folder.py +++ b/streaming/vision/convert/imagenet.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""ImageNet streaming dataset conversion scripts.""" + import os from argparse import ArgumentParser, Namespace from glob import glob @@ -15,22 +17,73 @@ def parse_args() -> Namespace: - """Parse commandline arguments. + """Parse command-line arguments. Args: - Namespace: Commandline arguments. + Namespace: command-line arguments. 
""" args = ArgumentParser() - args.add_argument('--in_root', type=str, required=True) - args.add_argument('--out_root', type=str, required=True) - args.add_argument('--splits', type=str, default='train,val') - args.add_argument('--compression', type=str, default='') - args.add_argument('--hashes', type=str, default='sha1,xxh64') - args.add_argument('--size_limit', type=int, default=1 << 26) - args.add_argument('--progbar', type=int, default=1) - args.add_argument('--leave', type=int, default=0) - args.add_argument('--validate', type=int, default=1) - args.add_argument('--extensions', type=str, default='jpeg') + args.add_argument( + '--in_root', + type=str, + required=True, + help='Directory path of the input dataset', + ) + args.add_argument( + '--out_root', + type=str, + required=True, + help='Directory path to store the output dataset', + ) + args.add_argument( + '--splits', + type=str, + default='train,val', + help='Split to use. Default: train,val', + ) + args.add_argument( + '--compression', + type=str, + default='', + help='Compression algorithm to use. Default: None', + ) + args.add_argument( + '--hashes', + type=str, + default='sha1,xxh64', + help='Hashing algorithms to apply to shard files. Default: sha1,xxh64', + ) + args.add_argument( + '--size_limit', + type=int, + default=1 << 26, + help='Shard size limit, after which point to start a new shard. Default: 1 << 26', + ) + args.add_argument( + '--progbar', + type=int, + default=1, + help='tqdm progress bar. Default: 1 (True)', + ) + args.add_argument( + '--leave', + type=int, + default=0, + help='Keeps all traces of the progressbar upon termination of iteration. Default: 0 ' + + '(False)', + ) + args.add_argument( + '--validate', + type=int, + default=1, + help='Validate that it is an Image. Default: 1 (True)', + ) + args.add_argument( + '--extensions', + type=str, + default='jpeg', + help='Validate filename extensions. 
Default: jpeg', + ) return args.parse_args() @@ -75,10 +128,10 @@ def get_classes(filenames: List[str], def main(args: Namespace) -> None: - """Main: create streaming CIFAR10 dataset. + """Main: create streaming ImageNet dataset. Args: - args (Namespace): Commandline arguments. + args (Namespace): command-line arguments. """ splits = get_list_arg(args.splits) columns = {'i': 'int', 'x': 'jpeg', 'y': 'int'} diff --git a/streaming/vision/imagenet.py b/streaming/vision/imagenet.py index c60fad1e4..4436d08ef 100644 --- a/streaming/vision/imagenet.py +++ b/streaming/vision/imagenet.py @@ -1,11 +1,17 @@ # Copyright 2022 MosaicML Streaming authors # SPDX-License-Identifier: Apache-2.0 +"""ImageNet classification streaming dataset. + +The most widely used dataset for Image Classification algorithms. Please refer to the `ImageNet +2012 Classification Dataset `_ for more details. +""" + from streaming.vision.base import ImageClassDataset class ImageNet(ImageClassDataset): - """Streaming ImageNet. + """Implementation of the ImageNet dataset using streaming Dataset. Args: local (str): Local filesystem directory where dataset is cached during operation. diff --git a/tests/test_streaming.py b/tests/test_streaming.py index b7ef7185e..4ac0a1e06 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -347,7 +347,9 @@ def test_dataloader_single_device(remote_local: Tuple[str, str], batch_size: int def check_for_diff_files(dir: dircmp, compression_ext: Union[None, str]): """Check recursively for different files in a dircmp object. - Local directory also has the uncompressed files, ignore it during file comparison.""" + + Local directory also has the uncompressed files, ignore it during file comparison. 
+ """ if compression_ext: for file in dir.diff_files: assert not file.endswith(compression_ext) diff --git a/tests/test_streaming_remote.py b/tests/test_streaming_remote.py index 714fa4bda..21d050b57 100644 --- a/tests/test_streaming_remote.py +++ b/tests/test_streaming_remote.py @@ -104,7 +104,6 @@ def get_dataset(name: str, ]) @pytest.mark.parametrize('split', ['val']) def test_streaming_remote_dataset(tmp_path: pathlib.Path, name: str, split: str) -> None: - # Build StreamingDataset build_start = time.time() expected_samples, dataset = get_dataset(name=name, From b71489a0c5527816068aaa4833b66d8e89297972 Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Thu, 22 Sep 2022 15:03:19 -0700 Subject: [PATCH 2/3] Skip streaming base and version file in doc section --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index f4bdc3aba..7768dbd42 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -375,7 +375,7 @@ def _modules_to_rst() -> List[types.ModuleType]: document_modules: List[types.Module] = [ streaming, ] - exclude_modules: List[types.Module] = [] + exclude_modules: List[types.Module] = [streaming.base, streaming._version] for name in streaming.__dict__: obj = streaming.__dict__[name] if isinstance(obj, types.ModuleType) and obj not in exclude_modules: From b9e4a2d410dea27e5b08949e7ddca132ee29444d Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Sat, 24 Sep 2022 18:32:27 -0700 Subject: [PATCH 3/3] Add text and vision to a doc --- docs/source/conf.py | 6 ++---- streaming/__init__.py | 5 ++++- streaming/vision/__init__.py | 2 +- streaming/vision/ade20k.py | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7768dbd42..4dd52612e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,15 +16,13 @@ import ast import importlib import inspect -import json import os import shutil import sys import tempfile 
-import textwrap import types import warnings -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Tuple, Type import sphinx.application import sphinx.ext.autodoc @@ -375,7 +373,7 @@ def _modules_to_rst() -> List[types.ModuleType]: document_modules: List[types.Module] = [ streaming, ] - exclude_modules: List[types.Module] = [streaming.base, streaming._version] + exclude_modules: List[types.Module] = [streaming._version] for name in streaming.__dict__: obj = streaming.__dict__[name] if isinstance(obj, types.ModuleType) and obj not in exclude_modules: diff --git a/streaming/__init__.py b/streaming/__init__.py index 4c9ab3693..aa4e721d7 100644 --- a/streaming/__init__.py +++ b/streaming/__init__.py @@ -3,10 +3,13 @@ """MosaicML Streaming Datasets for cloud-native model training.""" +import streaming.text as text +import streaming.vision as vision from streaming._version import __version__ from streaming.base import (CSVWriter, Dataset, JSONWriter, LocalDataset, MDSWriter, TSVWriter, XSVWriter) __all__ = [ - 'Dataset', 'CSVWriter', 'JSONWriter', 'MDSWriter', 'TSVWriter', 'XSVWriter', 'LocalDataset' + 'Dataset', 'CSVWriter', 'JSONWriter', 'MDSWriter', 'TSVWriter', 'XSVWriter', 'LocalDataset', + 'vision', 'text' ] diff --git a/streaming/vision/__init__.py b/streaming/vision/__init__.py index 2b25df288..89b7d3aa6 100644 --- a/streaming/vision/__init__.py +++ b/streaming/vision/__init__.py @@ -3,7 +3,7 @@ """Natively supported CV datasets.""" -from streaming.vision.ade20k import ADE20k as ADE20K +from streaming.vision.ade20k import ADE20K as ADE20K from streaming.vision.cifar10 import CIFAR10 as CIFAR10 from streaming.vision.coco import COCO as COCO from streaming.vision.imagenet import ImageNet as ImageNet diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py index 40d05fcee..76ed22e86 100644 --- a/streaming/vision/ade20k.py +++ b/streaming/vision/ade20k.py @@ -12,8 +12,8 @@ from streaming.base import 
Dataset -class ADE20k(Dataset): - """Implementation of the ADE20k dataset using streaming Dataset. +class ADE20K(Dataset): + """Implementation of the ADE20K dataset using streaming Dataset. Args: local (str): Local dataset directory where shards are cached by split.