From c6de6fb7a3ef9d5c9f16543bb27ad9037ede5b72 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 23 Apr 2021 18:20:13 +0100 Subject: [PATCH] [llvm] Add the AnghaBench dataset. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dataset is from: da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM International Symposium on Code Generation and Optimization (CGO), pp. 378-390. IEEE, 2021. Issue #45. --- compiler_gym/envs/llvm/datasets/BUILD | 1 + compiler_gym/envs/llvm/datasets/__init__.py | 3 + compiler_gym/envs/llvm/datasets/anghabench.py | 120 ++++++++++++++++++ tests/llvm/datasets/BUILD | 14 ++ tests/llvm/datasets/anghabench_test.py | 54 ++++++++ 5 files changed, 192 insertions(+) create mode 100644 compiler_gym/envs/llvm/datasets/anghabench.py create mode 100644 tests/llvm/datasets/anghabench_test.py diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD index fd2fb7799c..63a0e4c7e3 100644 --- a/compiler_gym/envs/llvm/datasets/BUILD +++ b/compiler_gym/envs/llvm/datasets/BUILD @@ -8,6 +8,7 @@ py_library( name = "datasets", srcs = [ "__init__.py", + "anghabench.py", "clgen.py", "csmith.py", "llvm_stress.py", diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py index 31ddd5f52c..9b3396204c 100644 --- a/compiler_gym/envs/llvm/datasets/__init__.py +++ b/compiler_gym/envs/llvm/datasets/__init__.py @@ -7,6 +7,7 @@ from typing import Iterable, Optional from compiler_gym.datasets import Dataset, TarDatasetWithManifest +from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, 
CsmithDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
@@ -202,6 +203,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0):
 def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:
     site_data_base = site_data_base or site_data_path("llvm-v0")
 
+    yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0)
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
     yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
@@ -217,6 +219,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 
 __all__ = [
+    "AnghaBenchDataset",
     "BlasDataset",
     "CLgenDataset",
     "CsmithDataset",
diff --git a/compiler_gym/envs/llvm/datasets/anghabench.py b/compiler_gym/envs/llvm/datasets/anghabench.py
new file mode 100644
index 0000000000..66edef8ed2
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/anghabench.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import subprocess
+import sys
+from concurrent.futures import as_completed
+from pathlib import Path
+from typing import Optional
+
+from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
+from compiler_gym.util import thread_pool
+from compiler_gym.util.filesystem import atomic_file_write
+
+
+class AnghaBenchDataset(TarDatasetWithManifest):
+    """A dataset of C programs curated from GitHub source code.
+
+    The dataset is from:
+
+        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
+        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
+        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
+        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
+        International Symposium on Code Generation and Optimization (CGO),
+        pp. 378-390. IEEE, 2021.
+
+    And is available at:
+
+        http://cuda.dcc.ufmg.br/angha/home
+
+    Installation
+    ------------
+
+    The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
+    on-demand and cached. The first time each benchmark is used there is an
+    overhead of compiling it from C to bitcode. This is a one-off cost.
+    """
+
+    def __init__(self, site_data_base: Path, sort_order: int = 0):
+        """Configure the dataset's manifest, source tarball and metadata.
+
+        :param site_data_base: Forwarded unchanged to the parent constructor.
+        :param sort_order: Forwarded unchanged to the parent constructor.
+        :raises KeyError: If ``sys.platform`` is neither ``"darwin"`` nor
+            ``"linux"``; only those two manifests are listed here.
+        """
+        # Pick the manifest URL and its checksum for the current platform.
+        manifest_url, manifest_sha256 = {
+            "darwin": (
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2",
+                "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
+            ),
+            "linux": (
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
+                "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
+            ),
+        }[sys.platform]
+        super().__init__(
+            name="benchmark://anghabench-v0",
+            description="Compile-only C/C++ functions extracted from GitHub",
+            references={
+                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
+                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
+            },
+            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
+            site_data_base=site_data_base,
+            manifest_urls=[manifest_url],
+            manifest_sha256=manifest_sha256,
+            # The tarball is a GitHub archive of one pinned AnghaBench commit;
+            # strip_prefix names the top-level directory of that archive.
+            tar_urls=[
+                "https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz"
+            ],
+            tar_sha256="85d068e4ce44f2581e3355ee7a8f3ccb92568e9f5bd338bc3a918566f3aff42f",
+            strip_prefix="AnghaBench-d8034ac8562b8c978376008f4b33df01b8887b19",
+            tar_compression="gz",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+        )
+
+    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
+        """Return the benchmark for ``uri``, compiling it to bitcode if needed.
+
+        The dataset is installed first. When a specific benchmark is requested
+        and its ``.bc`` file is missing, the file is built from the matching
+        ``.c`` file, so later calls for the same URI skip the compile step.
+
+        :param uri: The URI of the benchmark. If ``None``, or no longer than
+            the dataset name plus one character, a benchmark is selected using
+            an index drawn from ``self.random``.
+        :return: For a specific URI, the result of
+            ``BenchmarkWithSource.create()`` for the bitcode file, with the C
+            file passed under the name ``function.c``.
+        :raises LookupError: If neither the ``.bc`` nor the ``.c`` file for
+            ``uri`` exists under the dataset root.
+        :raises subprocess.CalledProcessError: If the compile command exits
+            with a non-zero status.
+        :raises subprocess.TimeoutExpired: If the compile command runs for
+            longer than 300 seconds.
+        """
+        self.install()
+        if uri is None or len(uri) <= len(self.name) + 1:
+            return self._get_benchmark_by_index(self.random.integers(self.size))
+
+        # The absolute path of the file, without an extension.
+        # The slice drops the dataset name and the one character after it.
+        path_stem = self.dataset_root / uri[len(self.name) + 1 :]
+
+        bitcode_abspath = Path(f"{path_stem}.bc")
+        c_file_abspath = Path(f"{path_stem}.c")
+
+        if not bitcode_abspath.is_file():
+            if not c_file_abspath.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
+                )
+
+            # If the file does not exist, compile it on-demand.
+            # NOTE(review): atomic_file_write presumably moves tmp_path into
+            # place only once the block exits cleanly - confirm in
+            # compiler_gym.util.filesystem.
+            with atomic_file_write(bitcode_abspath) as tmp_path:
+                compile_cmd = ClangInvocation.from_c_file(
+                    c_file_abspath,
+                    copt=[
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_path)
+                subprocess.check_call(compile_cmd, timeout=300)
+
+        return BenchmarkWithSource.create(
+            uri, bitcode_abspath, "function.c", c_file_abspath
+        )
+
+    def compile_all(self):
+        """Compile every benchmark in the dataset, printing progress.
+
+        Submits one ``self.benchmark(uri)`` call per URI to the executor from
+        ``thread_pool.get_thread_pool_executor()`` and waits for each one.
+        ``future.result()`` re-raises any exception raised by a task, so the
+        first failure observed stops the loop.
+        """
+        n = self.size
+        executor = thread_pool.get_thread_pool_executor()
+        # Since the dataset is lazily compiled, simply iterating over the full
+        # set of URIs will compile everything. Do this in parallel.
+        futures = (
+            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
+        )
+        for i, future in enumerate(as_completed(futures), start=1):
+            future.result()
+            # "\r\033[K" returns to the start of the line and erases it, so
+            # each update overwrites the previous one. No newline is printed.
+            print(
+                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
+                flush=True,
+                end="",
+            )
diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index fe923406b3..708146828e 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
 # LICENSE file in the root directory of this source tree.
 load("@rules_python//python:defs.bzl", "py_test")
 
+py_test(
+    name = "anghabench_test",
+    timeout = "long",
+    srcs = ["anghabench_test.py"],
+    shard_count = 8,
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "clgen_test",
     timeout = "moderate",
diff --git a/tests/llvm/datasets/anghabench_test.py b/tests/llvm/datasets/anghabench_test.py
new file mode 100644
index 0000000000..dc9e0b7163
--- /dev/null
+++ b/tests/llvm/datasets/anghabench_test.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the AnghaBench dataset."""
+import sys
+from itertools import islice
+from pathlib import Path
+
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import AnghaBenchDataset
+from tests.pytest_plugins.common import skip_on_ci
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def anghabench_dataset() -> AnghaBenchDataset:
+    """Yield the ``anghabench-v0`` dataset, shared by all tests in the module.
+
+    A temporary ``llvm-v0`` environment is created only to look the dataset
+    up; it is closed again before the dataset is handed to the tests.
+    """
+    env = gym.make("llvm-v0")
+    try:
+        ds = env.datasets["anghabench-v0"]
+    finally:
+        env.close()
+    yield ds
+
+
+def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
+    """Check the dataset size against the count expected for this platform."""
+    # darwin has its own expected count; every other platform shares one.
+    if sys.platform == "darwin":
+        assert anghabench_dataset.size == 1042908
+    else:
+        assert anghabench_dataset.size == 1042976
+
+
+@skip_on_ci
+@pytest.mark.parametrize("index", range(250))
+def test_anghabench_random_select(
+    env: LlvmEnv, anghabench_dataset: AnghaBenchDataset, index: int, tmpwd: Path
+):
+    """Load the ``index``-th benchmark, reset the env to it, write its source.
+
+    Despite the name, selection is by position in ``benchmark_uris()``, not at
+    random. ``env`` and ``tmpwd`` are fixtures, presumably provided by the
+    ``pytest_plugins`` modules listed in this file - confirm there.
+    """
+    # islice() skips the first ``index`` URIs without building a list.
+    uri = next(islice(anghabench_dataset.benchmark_uris(), index, None))
+    benchmark = anghabench_dataset.benchmark(uri)
+    env.reset(benchmark=benchmark)
+
+    assert benchmark.source
+    benchmark.write_sources_to_directory(tmpwd)
+    assert (tmpwd / "function.c").is_file()
+
+
+if __name__ == "__main__":
+    main()