Merge pull request #138 from facebookresearch/dataset-deprecations
Begin deprecation of existing Dataset API
ChrisCummins authored Mar 20, 2021
2 parents 14331af + 07b5311 commit c5c448c
Showing 15 changed files with 112 additions and 59 deletions.
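The rename is mechanical for downstream code: the Dataset NamedTuple becomes LegacyDataset, and the module-level helpers activate, deactivate, and delete now warn on use. A minimal migration sketch, assuming compiler_gym == 0.1.4 with the llvm-v0 environment; the dataset name "cBench-v0" is illustrative, not confirmed by this diff:

# Migration sketch (assumes compiler_gym == 0.1.4).
# Before this commit:
#     from compiler_gym.datasets import Dataset
# After it, the same NamedTuple is exported under a deprecation-era name:
import gym

import compiler_gym  # noqa: F401  (registers the CompilerGym environments)
from compiler_gym.datasets import LegacyDataset, require

env = gym.make("llvm-v0")
# require() still downloads and activates a dataset on first use.
# "cBench-v0" is an illustrative name; see env.available_datasets.
require(env, "cBench-v0")
env.close()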
10 changes: 8 additions & 2 deletions compiler_gym/bin/datasets.py
@@ -139,7 +139,13 @@
import humanize
from absl import app, flags

from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require
from compiler_gym.datasets.dataset import (
LegacyDataset,
activate,
deactivate,
delete,
require,
)
from compiler_gym.util.flags.env_from_flags import env_from_flags
from compiler_gym.util.tabulate import tabulate

@@ -183,7 +189,7 @@ def enumerate_directory(name: str, path: Path):
for path in path.iterdir():
if not path.is_file() or not path.name.endswith(".json"):
continue
dataset = Dataset.from_json_file(path)
dataset = LegacyDataset.from_json_file(path)
rows.append(
(dataset.name, dataset.license, dataset.file_count, dataset.size_bytes)
)
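The listing code above leans on LegacyDataset.from_json_file; a small round-trip sketch of the JSON metadata handling. Only fields visible in this diff are passed, the NamedTuple may define more with defaults, and all values are invented:

# Round-trip sketch for the metadata files enumerated above; values invented.
import tempfile
from pathlib import Path

from compiler_gym.datasets import LegacyDataset

ds = LegacyDataset(
    name="example-v0",
    url="https://example.com/example-v0.tar.bz2",
    license="MIT",
    file_count=1,
    size_bytes=1024,
)
with tempfile.TemporaryDirectory() as tmp:
    # to_json_file() writes the metadata; from_json_file() reads it back.
    path = ds.to_json_file(Path(tmp) / "example-v0.json")
    assert LegacyDataset.from_json_file(path).name == "example-v0"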
10 changes: 8 additions & 2 deletions compiler_gym/datasets/__init__.py
@@ -3,6 +3,12 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Manage datasets of benchmarks."""
from compiler_gym.datasets.dataset import Dataset, activate, deactivate, delete, require
from compiler_gym.datasets.dataset import (
LegacyDataset,
activate,
deactivate,
delete,
require,
)

__all__ = ["Dataset", "require", "activate", "deactivate", "delete"]
__all__ = ["LegacyDataset", "require", "activate", "deactivate", "delete"]
57 changes: 44 additions & 13 deletions compiler_gym/datasets/dataset.py
@@ -12,12 +12,20 @@
from typing import List, NamedTuple, Optional, Union

import fasteners
from deprecated.sphinx import deprecated

from compiler_gym.util.download import download


class Dataset(NamedTuple):
"""A collection of benchmarks for use by an environment."""
class LegacyDataset(NamedTuple):
"""A collection of benchmarks for use by an environment.
.. deprecated:: 0.1.4
The next release of CompilerGym will introduce a new API for describing
datasets with extended functionality. See
`here <https://github.com/facebookresearch/CompilerGym/issues/45>`_ for
more information.
"""

name: str
"""The name of the dataset."""
@@ -57,11 +65,11 @@ def deprecated(self) -> bool:
return bool(self.deprecated_since)

@classmethod
def from_json_file(cls, path: Path) -> "Dataset":
def from_json_file(cls, path: Path) -> "LegacyDataset":
"""Construct a dataset form a JSON metadata file.
:param path: Path of the JSON metadata.
:return: A Dataset instance.
:return: A LegacyDataset instance.
"""
try:
with open(str(path), "rb") as f:
@@ -85,6 +93,13 @@ def to_json_file(self, path: Path) -> Path:
return path


@deprecated(
version="0.1.4",
reason=(
"Activating datasets will be removed in v0.1.5. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def activate(env, name: str) -> bool:
"""Move a directory from the inactive to active benchmark directory.
@@ -107,6 +122,13 @@ def activate(env, name: str) -> bool:
return True


@deprecated(
version="0.1.4",
reason=(
"Deleting datasets will be removed in v0.1.5. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def delete(env, name: str) -> bool:
"""Delete a directory in the inactive benchmark directory.
@@ -127,6 +149,13 @@ def delete(env, name: str) -> bool:
return deleted


@deprecated(
version="0.1.4",
reason=(
"Deactivating datasets will be removed in v0.1.5. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def deactivate(env, name: str) -> bool:
"""Move a directory from active to the inactive benchmark directory.
@@ -145,7 +174,7 @@ def deactivate(env, name: str) -> bool:
return True


def require(env, dataset: Union[str, Dataset]) -> bool:
def require(env, dataset: Union[str, LegacyDataset]) -> bool:
"""Require that the given dataset is available to the environment.
This will download and activate the dataset if it is not already installed.
@@ -160,12 +189,14 @@ def require(env, dataset: Union[str, Dataset]) -> bool:
:param env: The environment that this dataset is required for.
:param dataset: The name of the dataset to download, the URL of the dataset,
or a :class:`Dataset` instance.
or a :class:`LegacyDataset` instance.
:return: :code:`True` if the dataset was downloaded, or :code:`False` if the
dataset was already available.
"""

def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Dataset:
def download_and_unpack_archive(
url: str, sha256: Optional[str] = None
) -> LegacyDataset:
json_files_before = {
f
for f in env.inactive_datasets_site_path.iterdir()
@@ -182,9 +213,9 @@ def download_and_unpack_archive(url: str, sha256: Optional[str] = None) -> Dataset:
new_json = json_files_after - json_files_before
if not len(new_json):
raise OSError(f"Downloaded dataset {url} contains no metadata JSON file")
return Dataset.from_json_file(list(new_json)[0])
return LegacyDataset.from_json_file(list(new_json)[0])

def unpack_local_archive(path: Path) -> Dataset:
def unpack_local_archive(path: Path) -> LegacyDataset:
if not path.is_file():
raise FileNotFoundError(f"File not found: {path}")
json_files_before = {
@@ -202,21 +233,21 @@ def unpack_local_archive(path: Path) -> Dataset:
new_json = json_files_after - json_files_before
if not len(new_json):
raise OSError(f"Downloaded dataset {url} contains no metadata JSON file")
return Dataset.from_json_file(list(new_json)[0])
return LegacyDataset.from_json_file(list(new_json)[0])

with fasteners.InterProcessLock(env.datasets_site_path / "LOCK"):
# Resolve the name and URL of the dataset.
sha256 = None
if isinstance(dataset, Dataset):
if isinstance(dataset, LegacyDataset):
name, url = dataset.name, dataset.url
elif isinstance(dataset, str):
# Check if we have already downloaded the dataset.
if "://" in dataset:
name, url = None, dataset
dataset: Optional[Dataset] = None
dataset: Optional[LegacyDataset] = None
else:
try:
dataset: Optional[Dataset] = env.available_datasets[dataset]
dataset: Optional[LegacyDataset] = env.available_datasets[dataset]
except KeyError:
raise ValueError(f"Dataset not found: {dataset}")
name, url, sha256 = dataset.name, dataset.url, dataset.sha256
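The decorators above come from the third-party Deprecated package (pip install Deprecated). Its deprecated.sphinx variant does two things: it raises a DeprecationWarning when the wrapped callable runs, and it appends a Sphinx ".. deprecated::" note to the docstring, mirroring the hand-written note in the LegacyDataset docstring above. A standalone sketch with an invented function:

# Standalone sketch of the deprecation pattern; frobnicate() is invented.
import warnings

from deprecated.sphinx import deprecated


@deprecated(
    version="0.1.4",
    reason="frobnicate will be removed in v0.1.5.",
)
def frobnicate() -> bool:
    """Do the thing."""
    return True


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    frobnicate()

# The call raised a DeprecationWarning...
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
# ...and the docstring gained a machine-readable deprecation note.
assert ".. deprecated:: 0.1.4" in frobnicate.__doc__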
23 changes: 12 additions & 11 deletions compiler_gym/envs/compiler_env.py
@@ -21,7 +21,7 @@
from gym.spaces import Space

from compiler_gym.compiler_env_state import CompilerEnvState
from compiler_gym.datasets.dataset import Dataset, require
from compiler_gym.datasets.dataset import LegacyDataset, require
from compiler_gym.service import (
CompilerGymServiceConnection,
ConnectionOpts,
@@ -132,9 +132,9 @@ class CompilerEnv(gym.Env):
to store benchmarks.
:vartype datasets_site_path: Optional[Path]
:ivar available_datasets: A mapping from dataset name to :class:`Dataset`
:ivar available_datasets: A mapping from dataset name to :class:`LegacyDataset`
objects that are available to download.
:vartype available_datasets: Dict[str, Dataset]
:vartype available_datasets: Dict[str, LegacyDataset]
:ivar observation: A view of the available observation spaces that permits
on-demand computation of observations.
@@ -223,7 +223,7 @@ def __init__(
self._service_endpoint: Union[str, Path] = service
self._connection_settings = connection_settings or ConnectionOpts()
self.datasets_site_path: Optional[Path] = None
self.available_datasets: Dict[str, Dataset] = {}
self.available_datasets: Dict[str, LegacyDataset] = {}

self.action_space_name = action_space

@@ -891,7 +891,7 @@ def _reward_view_type(self):
"""
return RewardView

def require_datasets(self, datasets: List[Union[str, Dataset]]) -> bool:
def require_datasets(self, datasets: List[Union[str, LegacyDataset]]) -> bool:
"""Require that the given datasets are available to the environment.
Example usage:
@@ -907,10 +907,11 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> bool:
:param datasets: A list of datasets to require. Each dataset is the name
of an available dataset, the URL of a dataset to download, or a
:class:`Dataset` instance.
:class:`LegacyDataset` instance.
:return: :code:`True` if one or more datasets were downloaded, or
:code:`False` if all datasets were already available.
"""
self.logger.debug("Requiring datasets: %s", datasets)
dataset_installed = False
for dataset in datasets:
dataset_installed |= require(self, dataset)
@@ -927,14 +928,14 @@ def require_datasets(self, datasets: List[Union[str, Dataset]]) -> bool:
self.make_manifest_file()
return dataset_installed

def require_dataset(self, dataset: Union[str, Dataset]) -> bool:
def require_dataset(self, dataset: Union[str, LegacyDataset]) -> bool:
"""Require that the given dataset is available to the environment.
Alias for
:meth:`env.require_datasets([dataset]) <compiler_gym.envs.CompilerEnv.require_datasets>`.
:param dataset: The name of the dataset to download, the URL of the dataset, or a
:class:`Dataset` instance.
:class:`LegacyDataset` instance.
:return: :code:`True` if the dataset was downloaded, or :code:`False` if
the dataset was already available.
"""
@@ -964,7 +965,7 @@ def make_manifest_file(self) -> Path:
)
return manifest_path

def register_dataset(self, dataset: Dataset) -> bool:
def register_dataset(self, dataset: LegacyDataset) -> bool:
"""Register a new dataset.
After registering, the dataset name may be used by
Expand All @@ -973,13 +974,13 @@ def register_dataset(self, dataset: Dataset) -> bool:
Example usage:
>>> my_dataset = Dataset(name="my-dataset-v0", ...)
>>> my_dataset = LegacyDataset(name="my-dataset-v0", ...)
>>> env = gym.make("llvm-v0")
>>> env.register_dataset(my_dataset)
>>> env.require_dataset("my-dataset-v0")
>>> env.benchmark = "my-dataset-v0/1"
:param dataset: A :class:`Dataset` instance describing the new dataset.
:param dataset: A :class:`LegacyDataset` instance describing the new dataset.
:return: :code:`True` if the dataset was added, else :code:`False`.
:raises ValueError: If a dataset with this name is already registered.
"""
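For completeness, a hedged sketch of the registration flow the docstrings above describe. Only LegacyDataset fields visible in this diff (name, url, license, file_count, size_bytes, sha256) are passed; the real NamedTuple may define further fields with defaults, and the URL and checksum are placeholders:

# Sketch of register_dataset / require_dataset (assumes compiler_gym == 0.1.4).
import gym

import compiler_gym  # noqa: F401  (registers the CompilerGym environments)
from compiler_gym.datasets import LegacyDataset

env = gym.make("llvm-v0")
my_dataset = LegacyDataset(
    name="my-dataset-v0",
    url="https://example.com/my-dataset-v0.tar.bz2",  # placeholder URL
    license="MIT",
    file_count=10,
    size_bytes=2048,
    sha256="0" * 64,  # placeholder checksum
)
env.register_dataset(my_dataset)      # raises ValueError on duplicate names
env.require_dataset("my-dataset-v0")  # downloads + activates on first use
env.benchmark = "my-dataset-v0/1"
env.close()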
6 changes: 3 additions & 3 deletions compiler_gym/envs/llvm/BUILD
@@ -34,8 +34,8 @@ py_library(
)

py_library(
name = "datasets",
srcs = ["datasets.py"],
name = "legacy_datasets",
srcs = ["legacy_datasets.py"],
data = [
"//compiler_gym/third_party/llvm:clang",
"//compiler_gym/third_party/llvm:lli",
@@ -55,7 +55,7 @@ py_library(
],
deps = [
":benchmarks",
":datasets",
":legacy_datasets",
":llvm_rewards",
"//compiler_gym/envs:compiler_env",
"//compiler_gym/spaces",
