Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added pydocstyle and docformatter in pre-commit config #20

Merged
merged 4 commits into from
Sep 26, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,19 @@ repos:
pass_filenames: false
args: [--warnings]
additional_dependencies: ["pyright@1.1.256"]
- repo: https://github.com/myint/docformatter
rev: v1.5.0
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=99, --wrap-descriptions=99]
- repo: https://github.com/PyCQA/pydocstyle
hooks:
- id: pydocstyle
name: pydocstyle
entry: pydocstyle
language: python
types: [python]
exclude: '(tests|.ci|.github)'
additional_dependencies:
- "toml"
rev: 6.1.1
10 changes: 4 additions & 6 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,13 +371,11 @@ def _auto_rst_for_module(module: types.ModuleType, exclude_members: List[Any]) -


def _modules_to_rst() -> List[types.ModuleType]:
"""Return the list of modules for which to generate API reference rst
files."""

"""Return the list of modules for which to generate API reference rst files."""
document_modules: List[types.Module] = [
streaming,
]
exclude_modules: List[types.Module] = []
exclude_modules: List[types.Module] = [streaming.base, streaming._version]
karan6181 marked this conversation as resolved.
Show resolved Hide resolved
for name in streaming.__dict__:
obj = streaming.__dict__[name]
if isinstance(obj, types.ModuleType) and obj not in exclude_modules:
Expand All @@ -389,8 +387,8 @@ def _modules_to_rst() -> List[types.ModuleType]:
def _generate_rst_files_for_modules() -> None:
"""Generate .rst files for each module to include in the API reference.

These files contain the module docstring followed by tables listing
all the functions, classes, etc.
These files contain the module docstring followed by tables listing all the functions, classes,
etc.
"""
docs_dir = os.path.abspath(os.path.dirname(__file__))
module_rst_save_dir = os.path.join(docs_dir, 'api_reference')
Expand Down
5 changes: 2 additions & 3 deletions docs/source/doctest_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@

"""Fixtures available in doctests.

The script is run before any doctests are executed, so all imports and
variables are available in any doctest. The output of this setup script
does not show up in the documentation.
The script is run before any doctests are executed, so all imports and variables are available in
any doctest. The output of this setup script does not show up in the documentation.
"""
import os
import sys
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,8 @@ ignore_patterns = [
"wandb/**/*.py",
"build/**/*.py",
]

[tool.pydocstyle]
convention="google"
add_ignore="D102,D105,D107,D401"
add_select="D400,D404"
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Copyright 2022 MosaicML. All Rights Reserved.
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Streaming package setup."""

import os

Expand Down
2 changes: 2 additions & 0 deletions streaming/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""MosaicML Streaming Datasets for cloud-native model training."""

from streaming._version import __version__
from streaming.base import (CSVWriter, Dataset, JSONWriter, LocalDataset, MDSWriter, TSVWriter,
XSVWriter)
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""MosaicML Streaming Datasets for cloud-native model training."""

from streaming.base.dataset import Dataset
from streaming.base.format import CSVWriter, JSONWriter, MDSWriter, TSVWriter, XSVWriter
from streaming.base.local import LocalDataset
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/compression/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""A collection of compression algorithms."""

from streaming.base.compression.compression import (compress, decompress,
get_compression_extension, get_compressions,
is_compression)
Expand Down
8 changes: 5 additions & 3 deletions streaming/base/compression/bench.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Script to benchmark compression algorithms."""

from argparse import ArgumentParser, Namespace
from time import time
from typing import Iterator
Expand All @@ -11,10 +13,10 @@


def parse_args() -> Namespace:
"""Parse commandline arguments.
"""Parse command-line arguments.

Returns:
Namespace: Commandline arguments.
Namespace: command-line arguments.
"""
args = ArgumentParser()
args.add_argument('--data', type=str, required=True)
Expand Down Expand Up @@ -48,7 +50,7 @@ def main(args: Namespace) -> None:
"""Benchmark compression algorithms.

Args:
args (Namespace): Commandline flags.
args (Namespace): command-line flags.
"""
data = open(args.data, 'rb').read()
for algo in sorted(get_compressions()):
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/compression/compression.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""List of Compression and Decompression algorithms."""

import bz2
import gzip
from abc import ABC, abstractmethod
Expand Down
8 changes: 5 additions & 3 deletions streaming/base/compression/plot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Utility and helper functions to plot compression information."""

from argparse import ArgumentParser, Namespace
from collections import defaultdict
from dataclasses import dataclass
Expand Down Expand Up @@ -72,10 +74,10 @@


def parse_args() -> Namespace:
"""Parse commandline arguments.
"""Parse command-line arguments.

Returns:
Namespace: Commandline arguments.
Namespace: command-line arguments.
"""
args = ArgumentParser()
args.add_argument('--data', type=str, required=True)
Expand Down Expand Up @@ -252,7 +254,7 @@ def main(args: Namespace) -> None:
"""Plot info about compression.

Args:
args (Namespace): Commandline arguments.
args (Namespace): command-line arguments.
"""
data = load(args.data, args.min_dec_size)
plot_compression_rates(data, algo2color, args.dpi, args.font_size, args.line_width,
Expand Down
3 changes: 2 additions & 1 deletion streaming/base/dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""The :class:`Dataset` class, used for building streaming iterable datasets."""

import json
import os
from enum import IntEnum
Expand Down Expand Up @@ -292,7 +294,6 @@ def _preload_shard(self, shard: int, partition: Partition) -> bool:
Returns:
bool: Whether shard is present.
"""

assert shard in partition.shards
reader = self.shards[shard]
for raw_info, zip_info in reader.file_pairs:
Expand Down
30 changes: 26 additions & 4 deletions streaming/base/distributed.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Helper methods to get the distributed attributes."""

import os

__all__ = ['get_global_rank', 'get_local_rank', 'get_local_world_size', 'get_world_size']


def get_global_rank():
def get_global_rank() -> int:
"""Returns the global rank of the current process, which is on ``[0; WORLD_SIZE - 1]``.

Returns:
int: The global rank.
"""
return int(os.environ.get('RANK', 0))


def get_world_size():
def get_world_size() -> int:
"""Returns the world size, which is the number of processes participating in this training run.

Returns:
int: The world size.
"""
return int(os.environ.get('WORLD_SIZE', 1))


def get_local_rank():
def get_local_rank() -> int:
"""Returns the local rank for the current process, which is on ``[0; LOCAL_WORLD_SIZE - 1]``.

Returns:
int: The local rank.
"""
return int(os.environ.get('LOCAL_RANK', 0))


def get_local_world_size():
def get_local_world_size() -> int:
"""Returns the local world size, which is the number of processes for the current node.

Returns:
int: The local world size.
"""
return int(os.environ.get('LOCAL_WORLD_SIZE', 1))
2 changes: 2 additions & 0 deletions streaming/base/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Download handling for :class:`Dataset`."""

import os
import shutil
import urllib.parse
Expand Down
12 changes: 12 additions & 0 deletions streaming/base/format/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Individual dataset writer for every format."""

from typing import Any, Dict, Optional

from streaming.base.format.base.reader import Reader
Expand All @@ -19,6 +21,16 @@


def reader_from_json(dirname: str, split: Optional[str], obj: Dict[str, Any]) -> Reader:
"""Initialize the reader from JSON object.

Args:
dirname (str): Local directory containing shards.
split (str, optional): Which dataset split to use, if any.
obj (Dict[str, Any]): JSON object to load.

Returns:
Reader: Loaded Reader of ``format`` type.
"""
assert obj['version'] == 2
cls = _readers[obj['format']]
return cls.from_json(dirname, split, obj)
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Base module for dataset reader and writer."""
2 changes: 2 additions & 0 deletions streaming/base/format/base/reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Read and decode sample from shards."""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/base/writer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Convert a list of samples into format files that can be read as a :class:`Dataset`."""

import json
import os
from abc import ABC, abstractmethod
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/json/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Example:

```json
{
"columns": {
"number": "int",
Expand Down Expand Up @@ -48,3 +49,4 @@ Example:
}
}
}
```
2 changes: 2 additions & 0 deletions streaming/base/format/json/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Module to write and read the dataset in JSON format."""

from streaming.base.format.json.reader import JSONReader
from streaming.base.format.json.writer import JSONWriter

Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/json/encodings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Check whether sample encoding is of supported JSON types."""

from abc import ABC, abstractmethod
from typing import Any

Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/json/reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

""":class:`JSONReader` reads samples from binary ``.json`` files that were written out by :class:`JSONWriter`."""

import json
import os
from copy import deepcopy
Expand Down
4 changes: 3 additions & 1 deletion streaming/base/format/json/writer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

""":class:`JSONWriter` converts a list of samples into binary ``.json`` files that can be read as a :class:`JSONReader`."""

import json
from typing import Any, Dict, List, Optional, Tuple

Expand All @@ -11,7 +13,7 @@


class JSONWriter(SplitWriter):
"""Writes a streaming JSON dataset.
r"""Writes a streaming JSON dataset.

Args:
dirname (str): Local dataset directory.
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/mds/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
Example:

```json
{
"column_encodings": [
"int",
Expand Down Expand Up @@ -39,3 +40,4 @@ Example:
}
}
}
```
2 changes: 2 additions & 0 deletions streaming/base/format/mds/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Module to write and read the dataset in MDS format."""

from streaming.base.format.mds.reader import MDSReader
from streaming.base.format.mds.writer import MDSWriter

Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/mds/encodings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Encode and Decode samples in a supported MDS format."""

import json
import pickle
from abc import ABC, abstractmethod
Expand Down
2 changes: 2 additions & 0 deletions streaming/base/format/mds/reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright 2022 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

""":class:`MDSReader` reads samples from binary ``.mds`` files that were written out by :class:`MDSWriter`."""

import os
from copy import deepcopy
from typing import Any, Dict, List, Optional
Expand Down
Loading