[llvm] Add the AnghaBench dataset.
The dataset is from:

    da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de
    Souza Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira
    Guimaraes, and Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite
    with One Million Compilable C Benchmarks for Code-Size Reduction."
    In 2021 IEEE/ACM International Symposium on Code Generation and
    Optimization (CGO), pp. 378-390. IEEE, 2021.

Issue #45.
ChrisCummins committed Apr 26, 2021
1 parent 4e57d83 commit c6de6fb
Showing 5 changed files with 192 additions and 0 deletions.
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
@@ -8,6 +8,7 @@ py_library(
    name = "datasets",
    srcs = [
        "__init__.py",
        "anghabench.py",
        "clgen.py",
        "csmith.py",
        "llvm_stress.py",
3 changes: 3 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
@@ -7,6 +7,7 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
@@ -202,6 +203,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0):
def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:
    site_data_base = site_data_base or site_data_path("llvm-v0")

    yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0)
    yield BlasDataset(site_data_base=site_data_base, sort_order=0)
    yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
    yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
@@ -217,6 +219,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset


__all__ = [
"AnghaBenchDataset",
"BlasDataset",
"CLgenDataset",
"CsmithDataset",
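Once `AnghaBenchDataset` is yielded by `get_llvm_datasets()`, it becomes discoverable through an environment's `datasets` index. The following is a minimal sanity-check sketch (not part of this commit), assuming a working `compiler_gym` installation; the `isinstance` check is only illustrative:

import gym

import compiler_gym.envs.llvm  # noqa: F401  Registers the llvm-v0 environment.
from compiler_gym.envs.llvm.datasets import AnghaBenchDataset

env = gym.make("llvm-v0")
try:
    # Look up the newly registered dataset by name.
    dataset = env.datasets["anghabench-v0"]
    assert isinstance(dataset, AnghaBenchDataset)
    print(dataset.name)  # -> benchmark://anghabench-v0
finally:
    env.close()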
120 changes: 120 additions & 0 deletions compiler_gym/envs/llvm/datasets/anghabench.py
@@ -0,0 +1,120 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import sys
from concurrent.futures import as_completed
from pathlib import Path
from typing import Optional

from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
from compiler_gym.datasets.benchmark import BenchmarkWithSource
from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
from compiler_gym.util import thread_pool
from compiler_gym.util.filesystem import atomic_file_write


class AnghaBenchDataset(TarDatasetWithManifest):
"""A dataset of C programs curated from GitHub source code.
The dataset is from:
da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
International Symposium on Code Generation and Optimization (CGO),
pp. 378-390. IEEE, 2021.
And is available at:
http://cuda.dcc.ufmg.br/angha/home
Installation
------------
The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
on-demand and cached. The first time each benchmark is used there is an
overhead of compiling it from C to bitcode. This is a one-off cost.
"""

    def __init__(self, site_data_base: Path, sort_order: int = 0):
        manifest_url, manifest_sha256 = {
            "darwin": (
                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2",
                "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
            ),
            "linux": (
                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
                "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
            ),
        }[sys.platform]
        super().__init__(
            name="benchmark://anghabench-v0",
            description="Compile-only C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
            },
            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
            site_data_base=site_data_base,
            manifest_urls=[manifest_url],
            manifest_sha256=manifest_sha256,
            tar_urls=[
                "https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz"
            ],
            tar_sha256="85d068e4ce44f2581e3355ee7a8f3ccb92568e9f5bd338bc3a918566f3aff42f",
            strip_prefix="AnghaBench-d8034ac8562b8c978376008f4b33df01b8887b19",
            tar_compression="gz",
            benchmark_file_suffix=".bc",
            sort_order=sort_order,
        )

    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
        self.install()
        if uri is None or len(uri) <= len(self.name) + 1:
            return self._get_benchmark_by_index(self.random.integers(self.size))

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / uri[len(self.name) + 1 :]

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # If the file does not exist, compile it on-demand.
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        return BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

    def compile_all(self):
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        # Since the dataset is lazily compiled, simply iterating over the full
        # set of URIs will compile everything. Do this in parallel.
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )
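To make the lazy-compilation behavior of `benchmark()` concrete, here is a minimal usage sketch (not part of this commit), assuming a working `compiler_gym` installation. The first `benchmark()` call for a given URI pays the C-to-bitcode compilation cost; later calls reuse the cached `.bc` file, and `compile_all()` can pre-compile the full suite in a thread pool.

import gym

import compiler_gym.envs.llvm  # noqa: F401  Registers the llvm-v0 environment.

env = gym.make("llvm-v0")
try:
    dataset = env.datasets["anghabench-v0"]
    dataset.install()  # Fetch the manifest and tarball; benchmark() also calls this.

    # The first benchmark() call for a URI compiles the C function to bitcode
    # and caches the result; subsequent calls hit the cached file.
    uri = next(iter(dataset.benchmark_uris()))
    benchmark = dataset.benchmark(uri)
    env.reset(benchmark=benchmark)
    assert benchmark.source  # The C source is attached via BenchmarkWithSource.
finally:
    env.close()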
14 changes: 14 additions & 0 deletions tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
# LICENSE file in the root directory of this source tree.
load("@rules_python//python:defs.bzl", "py_test")

py_test(
    name = "anghabench_test",
    timeout = "long",
    srcs = ["anghabench_test.py"],
    shard_count = 8,
    deps = [
        "//compiler_gym/envs/llvm",
        "//compiler_gym/envs/llvm/datasets",
        "//tests:test_main",
        "//tests/pytest_plugins:common",
        "//tests/pytest_plugins:llvm",
    ],
)

py_test(
name = "clgen_test",
timeout = "moderate",
54 changes: 54 additions & 0 deletions tests/llvm/datasets/anghabench_test.py
@@ -0,0 +1,54 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Tests for the AnghaBench dataset."""
import sys
from itertools import islice
from pathlib import Path

import gym
import pytest

import compiler_gym.envs.llvm # noqa register environments
from compiler_gym.envs.llvm import LlvmEnv
from compiler_gym.envs.llvm.datasets import AnghaBenchDataset
from tests.pytest_plugins.common import skip_on_ci
from tests.test_main import main

pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]


@pytest.fixture(scope="module")
def anghabench_dataset() -> AnghaBenchDataset:
    env = gym.make("llvm-v0")
    try:
        ds = env.datasets["anghabench-v0"]
    finally:
        env.close()
    yield ds


def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
    if sys.platform == "darwin":
        assert anghabench_dataset.size == 1042908
    else:
        assert anghabench_dataset.size == 1042976


@skip_on_ci
@pytest.mark.parametrize("index", range(250))
def test_anghabench_random_select(
    env: LlvmEnv, anghabench_dataset: AnghaBenchDataset, index: int, tmpwd: Path
):
    uri = next(islice(anghabench_dataset.benchmark_uris(), index, None))
    benchmark = anghabench_dataset.benchmark(uri)
    env.reset(benchmark=benchmark)

    assert benchmark.source
    benchmark.write_sources_to_directory(tmpwd)
    assert (tmpwd / "function.c").is_file()


if __name__ == "__main__":
    main()
