From 4e57d8341282ce4e2b313156960345587a3cbf7d Mon Sep 17 00:00:00 2001
From: Chris Cummins
Date: Fri, 23 Apr 2021 15:54:59 +0100
Subject: [PATCH] [llvm] Add a CLgen dataset.

This adds a dataset of 1k OpenCL kernels that were used in the paper:

    Cummins, Chris, Pavlos Petoumenos, Zheng Wang, and Hugh Leather.
    "Synthesizing benchmarks for predictive modeling." In 2017 IEEE/ACM
    International Symposium on Code Generation and Optimization (CGO),
    pp. 86-99. IEEE, 2017.

The OpenCL kernels are compiled on-demand.

Issue #45.
---
 compiler_gym/envs/llvm/datasets/BUILD       |   1 +
 compiler_gym/envs/llvm/datasets/__init__.py |   3 +
 compiler_gym/envs/llvm/datasets/clgen.py    | 206 ++++++++++++++++++++
 tests/llvm/datasets/BUILD                   |  14 ++
 tests/llvm/datasets/clgen_test.py           |  53 +++++
 5 files changed, 277 insertions(+)
 create mode 100644 compiler_gym/envs/llvm/datasets/clgen.py
 create mode 100644 tests/llvm/datasets/clgen_test.py

diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index fd865d8adc..fd2fb7799c 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "datasets",
     srcs = [
         "__init__.py",
+        "clgen.py",
         "csmith.py",
         "llvm_stress.py",
         "poj104.py",
diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 11a01e9e96..31ddd5f52c 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -7,6 +7,7 @@
 from typing import Iterable, Optional
 
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
+from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
 from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
@@ -202,6 +203,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
     site_data_base = site_data_base or site_data_path("llvm-v0")
 
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
+    yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
     yield GitHubDataset(site_data_base=site_data_base, sort_order=0)
     yield LinuxDataset(site_data_base=site_data_base, sort_order=0)
@@ -216,6 +218,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 __all__ = [
     "BlasDataset",
+    "CLgenDataset",
     "CsmithDataset",
     "CsmithBenchmark",
     "get_llvm_datasets",
diff --git a/compiler_gym/envs/llvm/datasets/clgen.py b/compiler_gym/envs/llvm/datasets/clgen.py
new file mode 100644
index 0000000000..3cc1629362
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/clgen.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import io
+import shutil
+import subprocess
+import tarfile
+from pathlib import Path
+from typing import List, Optional
+
+from fasteners import InterProcessLock
+
+from compiler_gym.datasets import Benchmark, BenchmarkInitError, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
+from compiler_gym.util.download import download
+from compiler_gym.util.filesystem import atomic_file_write
+from compiler_gym.util.truncate import truncate
+
+
+class CLgenDataset(TarDatasetWithManifest):
+    """The CLgen dataset contains 1000 synthetically generated OpenCL kernels.
+
+    The dataset is from:
+
+        Cummins, Chris, Pavlos Petoumenos, Zheng Wang, and Hugh Leather.
+        "Synthesizing benchmarks for predictive modeling." In 2017 IEEE/ACM
+        International Symposium on Code Generation and Optimization (CGO),
+        pp. 86-99. IEEE, 2017.
+
+    And is available at:
+
+        https://github.com/ChrisCummins/paper-synthesizing-benchmarks
+
+    Installation
+    ------------
+
+    The CLgen dataset consists of OpenCL kernels that are compiled to LLVM-IR
+    on-demand and cached. The first time each benchmark is used there is an
+    overhead of compiling it from OpenCL to bitcode. This is a one-off cost.
+    Compiling OpenCL to bitcode requires third party headers that are downloaded
+    on the first call to :code:`install()`.
+    """
+
+    def __init__(self, site_data_base: Path, sort_order: int = 0):
+        """Configure the dataset and the locations of the OpenCL headers.
+
+        :param site_data_base: Passed through to the base class constructor.
+        :param sort_order: Passed through to the base class constructor.
+        """
+        super().__init__(
+            name="benchmark://clgen-v0",
+            description="Synthetically generated OpenCL kernels",
+            references={
+                "Paper": "https://chriscummins.cc/pub/2017-cgo.pdf",
+                "Homepage": "https://github.com/ChrisCummins/clgen",
+            },
+            license="GNU General Public License v3.0",
+            site_data_base=site_data_base,
+            manifest_urls=[
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-clgen-v0-manifest.bz2"
+            ],
+            manifest_sha256="d2bbc1da5a24a8cb03b604d1d8e59227b33bdfcd964ebe741ca8339f1c8d65cc",
+            tar_urls=[
+                "https://github.com/ChrisCummins/paper-synthesizing-benchmarks/raw/e45b6dffe9998f612624f05a6c4878ab4bcc84ec/data/clgen-1000.tar.bz2"
+            ],
+            tar_sha256="0bbd1b737f2537305e4db09b2971a5fa848b7c3a978bff6b570f45d1a488a72c",
+            strip_prefix="clgen-1000/kernels",
+            tar_compression="bz2",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+        )
+
+        # In-process cache of the marker file check made by install().
+        self._opencl_installed = False
+        self._opencl_headers_installed_marker = (
+            self._site_data_path / ".opencl-installed"
+        )
+        self.libclc_dir = self.site_data_path / "libclc"
+        self.opencl_h_path = self.site_data_path / "opencl.h"
+
+    def install(self):
+        """Install the dataset, then download the OpenCL headers if needed.
+
+        A marker file is created after both downloads complete. When the
+        marker exists, the downloads are skipped.
+        """
+        super().install()
+
+        if not self._opencl_installed:
+            self._opencl_installed = self._opencl_headers_installed_marker.is_file()
+
+        if self._opencl_installed:
+            return
+
+        with self._tar_lock, InterProcessLock(self._tar_lockfile):
+            # Repeat install check now that we are in the locked region.
+            if self._opencl_headers_installed_marker.is_file():
+                self._opencl_installed = True
+                return
+
+            # Download the libclc headers.
+            shutil.rmtree(self.libclc_dir, ignore_errors=True)
+            self.logger.info("Downloading OpenCL headers")
+            tar_data = io.BytesIO(
+                download(
+                    "https://dl.fbaipublicfiles.com/compiler_gym/libclc-v0.tar.bz2",
+                    sha256="f1c511f2ac12adf98dcc0fbfc4e09d0f755fa403c18f1fb1ffa5547e1fa1a499",
+                )
+            )
+            with tarfile.open(fileobj=tar_data, mode="r:bz2") as arc:
+                arc.extractall(str(self.libclc_dir))
+
+            # Download the OpenCL header.
+            with open(self.opencl_h_path, "wb") as f:
+                f.write(
+                    download(
+                        "https://github.com/ChrisCummins/clgen/raw/463c0adcd8abcf2432b24df0aca594b77a69e9d3/deeplearning/clgen/data/include/opencl.h",
+                        sha256="f95b9f4c8b1d09114e491846d0d41425d24930ac167e024f45dab8071d19f3f7",
+                    )
+                )
+
+            # Create the marker last so that an interrupted download is
+            # retried on the next call.
+            self._opencl_headers_installed_marker.touch()
+            self._opencl_installed = True
+
+    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
+        """Return a benchmark, compiling its OpenCL source if not yet cached.
+
+        :param uri: The URI of the benchmark. If omitted, or if nothing
+            follows the dataset name, a benchmark is selected at random.
+        :return: A benchmark for the bitcode file that carries the OpenCL
+            source file under the name :code:`kernel.cl`.
+        :raises LookupError: If neither a bitcode file nor an OpenCL source
+            file exists for the URI.
+        :raises BenchmarkInitError: If compilation fails or produces no
+            output file.
+        :raises subprocess.TimeoutExpired: If compilation runs for more than
+            300 seconds.
+        """
+        self.install()
+        if uri is None or len(uri) <= len(self.name) + 1:
+            return self._get_benchmark_by_index(self.random.integers(self.size))
+
+        # The absolute path of the file, without an extension.
+        path_stem = self.dataset_root / uri[len(self.name) + 1 :]
+
+        # If the file does not exist, compile it on-demand.
+        bc_path, cl_path = Path(f"{path_stem}.bc"), Path(f"{path_stem}.cl")
+
+        if not bc_path.is_file():
+            if not cl_path.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {cl_path})"
+                )
+
+            # Compile the OpenCL kernel into a bitcode file.
+            with atomic_file_write(bc_path) as tmp_bc_path:
+                compile_command: List[str] = ClangInvocation.from_c_file(
+                    cl_path,
+                    copt=[
+                        "-isystem",
+                        str(self.libclc_dir),
+                        "-include",
+                        str(self.opencl_h_path),
+                        "-target",
+                        "nvptx64-nvidia-nvcl",
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_bc_path)
+                self.logger.debug("Exec %s", compile_command)
+                clang = subprocess.Popen(
+                    compile_command,
+                    stdin=subprocess.PIPE,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                try:
+                    _, stderr = clang.communicate(timeout=300)
+                except subprocess.TimeoutExpired:
+                    # communicate() does not stop the child on timeout. Kill
+                    # and reap it so that clang is not left running.
+                    clang.kill()
+                    clang.communicate()
+                    raise
+
+            # Render the command once so both errors report it the same way.
+            command_str = " ".join(compile_command)
+            if clang.returncode:
+                error = truncate(
+                    stderr.decode("utf-8"), max_lines=20, max_line_len=20000
+                )
+                raise BenchmarkInitError(
+                    f"Compilation job failed!\n"
+                    f"Command: {command_str}\n"
+                    f"Error: {error}"
+                )
+            if not bc_path.is_file():
+                raise BenchmarkInitError(
+                    f"Compilation job failed to produce output file!\nCommand: {command_str}"
+                )
+
+        return BenchmarkWithSource.create(uri, bc_path, "kernel.cl", cl_path)
diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index d3e570e347..fe923406b3 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
 # LICENSE file in the root directory of this source tree.
 load("@rules_python//python:defs.bzl", "py_test")
 
+py_test(
+    name = "clgen_test",
+    timeout = "moderate",
+    srcs = ["clgen_test.py"],
+    shard_count = 8,
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "csmith_test",
     timeout = "long",
diff --git a/tests/llvm/datasets/clgen_test.py b/tests/llvm/datasets/clgen_test.py
new file mode 100644
index 0000000000..b6531c6ecd
--- /dev/null
+++ b/tests/llvm/datasets/clgen_test.py
@@ -0,0 +1,53 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the CLgen dataset."""
+from itertools import islice
+from pathlib import Path
+
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import CLgenDataset
+from tests.pytest_plugins.common import skip_on_ci
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def clgen_dataset() -> CLgenDataset:
+    """Yield the CLgen dataset looked up from a temporary llvm-v0 environment."""
+    env = gym.make("llvm-v0")
+    try:
+        ds = env.datasets["benchmark://clgen-v0"]
+    finally:
+        env.close()
+    yield ds
+
+
+def test_clgen_size(clgen_dataset: CLgenDataset):
+    """The dataset reports 996 benchmarks."""
+    assert clgen_dataset.size == 996
+
+
+@skip_on_ci
+@pytest.mark.parametrize("index", range(250))
+def test_clgen_random_select(
+    env: LlvmEnv, clgen_dataset: CLgenDataset, index: int, tmpwd: Path
+):
+    """The benchmark at each index loads and writes its source as kernel.cl."""
+    uri = next(islice(clgen_dataset.benchmark_uris(), index, None))
+    benchmark = clgen_dataset.benchmark(uri)
+    env.reset(benchmark=benchmark)
+
+    assert benchmark.source
+    benchmark.write_sources_to_directory(tmpwd)
+    assert (tmpwd / "kernel.cl").is_file()
+
+
+if __name__ == "__main__":
+    main()