[llvm] Add the AnghaBench dataset.
The dataset is from:

    da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de
    Souza Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira
    Guimaraes, and Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite
    with One Million Compilable C Benchmarks for Code-Size Reduction."
    In 2021 IEEE/ACM International Symposium on Code Generation and
    Optimization (CGO), pp. 378-390. IEEE, 2021.

Issue #45.
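
As a quick usage sketch, mirroring the calls exercised by the new tests (`env.datasets["anghabench-v0"]`, `dataset.benchmark()`, `env.reset(benchmark=...)`) rather than any separately documented API:

```python
import gym

import compiler_gym.envs.llvm  # noqa: F401  Registers the llvm-v0 environment.

env = gym.make("llvm-v0")
try:
    # The dataset added by this commit. The first use downloads the manifest
    # and the AnghaBench sources; individual benchmarks are then compiled to
    # bitcode on demand and cached.
    dataset = env.datasets["anghabench-v0"]
    benchmark = dataset.benchmark()  # No URI given: picks a random program.
    env.reset(benchmark=benchmark)
finally:
    env.close()
```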
ChrisCummins committed Apr 26, 2021
1 parent 72255d9 commit c43019f
Showing 5 changed files with 191 additions and 0 deletions.
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
@@ -8,6 +8,7 @@ py_library(
name = "datasets",
srcs = [
"__init__.py",
"anghabench.py",
"clgen.py",
"csmith.py",
"llvm_stress.py",
3 changes: 3 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
@@ -7,6 +7,7 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
@@ -202,6 +203,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0):
def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:
    site_data_base = site_data_base or site_data_path("llvm-v0")

    yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0)
    yield BlasDataset(site_data_base=site_data_base, sort_order=0)
    yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
    yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
@@ -217,6 +219,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:


__all__ = [
"AnghaBenchDataset",
"BlasDataset",
"CLgenDataset",
"CsmithDataset",
120 changes: 120 additions & 0 deletions compiler_gym/envs/llvm/datasets/anghabench.py
@@ -0,0 +1,120 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import sys
from concurrent.futures import as_completed
from pathlib import Path
from typing import Optional

from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
from compiler_gym.datasets.benchmark import BenchmarkWithSource
from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
from compiler_gym.util import thread_pool
from compiler_gym.util.filesystem import atomic_file_write


class AnghaBenchDataset(TarDatasetWithManifest):
"""A dataset of C programs curated from GitHub source code.
The dataset is from:
da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
International Symposium on Code Generation and Optimization (CGO),
pp. 378-390. IEEE, 2021.
And is available at:
http://cuda.dcc.ufmg.br/angha/home
Installation
------------
The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
on-demand and cached. The first time each benchmark is used there is an
overhead of compiling it from C to bitcode. This is a one-off cost.
"""

def __init__(self, site_data_base: Path, sort_order: int = 0):
manifest_url, manifest_sha256 = {
"darwin": (
"https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2",
"39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
),
"linux": (
"https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
"a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
),
}[sys.platform]
super().__init__(
name="benchmark://anghabench-v0",
description="Compile-only C/C++ functions extracted from GitHub",
references={
"Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
"Homepage": "http://cuda.dcc.ufmg.br/angha/",
},
license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
site_data_base=site_data_base,
manifest_urls=[manifest_url],
manifest_sha256=manifest_sha256,
tar_urls=[
"https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz"
],
tar_sha256="85d068e4ce44f2581e3355ee7a8f3ccb92568e9f5bd338bc3a918566f3aff42f",
strip_prefix="AnghaBench-d8034ac8562b8c978376008f4b33df01b8887b19",
tar_compression="gz",
benchmark_file_suffix=".bc",
sort_order=sort_order,
)

def benchmark(self, uri: Optional[str] = None) -> Benchmark:
self.install()
if uri is None or len(uri) <= len(self.name) + 1:
return self._get_benchmark_by_index(self.random.integers(self.size))

# The absolute path of the file, without an extension.
path_stem = self.dataset_root / uri[len(self.name) + 1 :]

bitcode_abspath = Path(f"{path_stem}.bc")
c_file_abspath = Path(f"{path_stem}.c")

if not bitcode_abspath.is_file():
if not c_file_abspath.is_file():
raise LookupError(
f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
)

# If the file does not exist, compile it on-demand.
with atomic_file_write(bitcode_abspath) as tmp_path:
compile_cmd = ClangInvocation.from_c_file(
c_file_abspath,
copt=[
"-ferror-limit=1", # Stop on first error.
"-w", # No warnings.
],
).command(outpath=tmp_path)
subprocess.check_call(compile_cmd, timeout=300)

return BenchmarkWithSource.create(
uri, bitcode_abspath, "function.c", c_file_abspath
)

def compile_all(self):
n = self.size
executor = thread_pool.get_thread_pool_executor()
# Since the dataset is lazily compiled, simply iterating over the full
# set of URIs will compile everything. Do this in parallel.
futures = (
executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
)
for i, future in enumerate(as_completed(futures), start=1):
future.result()
print(
f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
flush=True,
end="",
)
14 changes: 14 additions & 0 deletions tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
# LICENSE file in the root directory of this source tree.
load("@rules_python//python:defs.bzl", "py_test")

py_test(
name = "anghabench_test",
timeout = "long",
srcs = ["anghabench_test.py"],
shard_count = 8,
deps = [
"//compiler_gym/envs/llvm",
"//compiler_gym/envs/llvm/datasets",
"//tests:test_main",
"//tests/pytest_plugins:common",
"//tests/pytest_plugins:llvm",
],
)

py_test(
name = "clgen_test",
timeout = "moderate",
53 changes: 53 additions & 0 deletions tests/llvm/datasets/anghabench_test.py
@@ -0,0 +1,53 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Tests for the AnghaBench dataset."""
import sys
from pathlib import Path

import gym
import pytest

import compiler_gym.envs.llvm # noqa register environments
from compiler_gym.envs.llvm import LlvmEnv
from compiler_gym.envs.llvm.datasets import AnghaBenchDataset
from tests.pytest_plugins.common import skip_on_ci
from tests.test_main import main

pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]


@pytest.fixture(scope="module")
def anghabench_dataset() -> AnghaBenchDataset:
    env = gym.make("llvm-v0")
    try:
        ds = env.datasets["anghabench-v0"]
    finally:
        env.close()
    yield ds


def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
    if sys.platform == "darwin":
        assert anghabench_dataset.size == 1042908
    else:
        assert anghabench_dataset.size == 1042976


@skip_on_ci
@pytest.mark.parametrize("seed", range(250))
def test_anghabench_random_select(
    env: LlvmEnv, anghabench_dataset: AnghaBenchDataset, seed: int, tmpwd: Path
):
    anghabench_dataset.seed(seed)
    benchmark = anghabench_dataset.benchmark()
    env.reset(benchmark=benchmark)

    assert benchmark.source
    benchmark.write_sources_to_directory(tmpwd)
    assert (tmpwd / "function.c").is_file()


if __name__ == "__main__":
    main()
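
The random-select test relies on each benchmark carrying its original C text, exposed through `benchmark.source` and `write_sources_to_directory()` (which writes it out as `function.c`). A short sketch of pulling the source outside the test harness; the output directory name is illustrative, not taken from the diff:

```python
from pathlib import Path

import gym

import compiler_gym.envs.llvm  # noqa: F401  Registers the llvm-v0 environment.

env = gym.make("llvm-v0")
try:
    benchmark = env.datasets["anghabench-v0"].benchmark()
    print(benchmark.source)  # The extracted C function.
    out_dir = Path("anghabench_sources")  # Illustrative output location.
    out_dir.mkdir(exist_ok=True)
    benchmark.write_sources_to_directory(out_dir)  # Writes function.c into out_dir.
finally:
    env.close()
```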
