From c6de6fb7a3ef9d5c9f16543bb27ad9037ede5b72 Mon Sep 17 00:00:00 2001
From: Chris Cummins <chrisc.101@gmail.com>
Date: Fri, 23 Apr 2021 18:20:13 +0100
Subject: [PATCH] [llvm] Add the AnghaBench dataset.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dataset is from:

    da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de
    Souza Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira
    Guimaraes, and Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite
    with One Million Compilable C Benchmarks for Code-Size Reduction."
    In 2021 IEEE/ACM International Symposium on Code Generation and
    Optimization (CGO), pp. 378-390. IEEE, 2021.

Issue #45.
---
 compiler_gym/envs/llvm/datasets/BUILD         |   1 +
 compiler_gym/envs/llvm/datasets/__init__.py   |   3 +
 compiler_gym/envs/llvm/datasets/anghabench.py | 120 ++++++++++++++++++
 tests/llvm/datasets/BUILD                     |  14 ++
 tests/llvm/datasets/anghabench_test.py        |  54 ++++++++
 5 files changed, 192 insertions(+)
 create mode 100644 compiler_gym/envs/llvm/datasets/anghabench.py
 create mode 100644 tests/llvm/datasets/anghabench_test.py

diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index fd2fb7799c..63a0e4c7e3 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "datasets",
     srcs = [
         "__init__.py",
+        "anghabench.py",
         "clgen.py",
         "csmith.py",
         "llvm_stress.py",
diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 31ddd5f52c..9b3396204c 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -7,6 +7,7 @@
 from typing import Iterable, Optional
 
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
+from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
 from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
@@ -202,6 +203,7 @@ def __init__(self, site_data_base: Path, sort_order: int = 0):
 def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset]:
     site_data_base = site_data_base or site_data_path("llvm-v0")
 
+    yield AnghaBenchDataset(site_data_base=site_data_base, sort_order=0)
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
     yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
@@ -217,6 +219,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 
 __all__ = [
+    "AnghaBenchDataset",
     "BlasDataset",
     "CLgenDataset",
     "CsmithDataset",
diff --git a/compiler_gym/envs/llvm/datasets/anghabench.py b/compiler_gym/envs/llvm/datasets/anghabench.py
new file mode 100644
index 0000000000..66edef8ed2
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/anghabench.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import subprocess
+import sys
+from concurrent.futures import as_completed
+from pathlib import Path
+from typing import Optional
+
+from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
+from compiler_gym.util import thread_pool
+from compiler_gym.util.filesystem import atomic_file_write
+
+
+class AnghaBenchDataset(TarDatasetWithManifest):
+    """A dataset of C programs curated from GitHub source code.
+
+    The dataset is from:
+
+        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
+        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
+        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
+        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
+        International Symposium on Code Generation and Optimization (CGO),
+        pp. 378-390. IEEE, 2021.
+
+    And is available at:
+
+        http://cuda.dcc.ufmg.br/angha/home
+
+    Installation
+    ------------
+
+    The AnghaBench dataset consists of C functions that are compiled to LLVM
+    bitcode on demand and cached, so the first use of each benchmark pays a
+    one-off compilation cost. Use compile_all() to pay that cost up front.
+    """
+
+    def __init__(self, site_data_base: Path, sort_order: int = 0):
+        """Configure the base class with the manifest, tarball, and metadata."""
+        # Manifests are per-platform; any other sys.platform raises KeyError.
+        manifests = {
+            "darwin": (
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-macos-manifest.bz2",
+                "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
+            ),
+            "linux": (
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
+                "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
+            ),
+        }
+        manifest_url, manifest_sha256 = manifests[sys.platform]
+        super().__init__(
+            name="benchmark://anghabench-v0",
+            description="Compile-only C/C++ functions extracted from GitHub",
+            references={
+                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
+                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
+            },
+            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
+            site_data_base=site_data_base,
+            manifest_urls=[manifest_url],
+            manifest_sha256=manifest_sha256,
+            tar_urls=[
+                "https://github.com/brenocfg/AnghaBench/archive/d8034ac8562b8c978376008f4b33df01b8887b19.tar.gz"
+            ],
+            tar_sha256="85d068e4ce44f2581e3355ee7a8f3ccb92568e9f5bd338bc3a918566f3aff42f",
+            strip_prefix="AnghaBench-d8034ac8562b8c978376008f4b33df01b8887b19",
+            tar_compression="gz",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+        )
+
+    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
+        """Return the benchmark for `uri`, compiling its C source on first use.
+
+        A random benchmark is returned when `uri` is None or too short to name
+        a file inside the dataset. LookupError is raised when neither a
+        bitcode file nor a C file exists for the URI.
+        """
+        self.install()
+        prefix_len = len(self.name) + 1  # Dataset name plus one separator char.
+        if uri is None or len(uri) <= prefix_len:
+            return self._get_benchmark_by_index(self.random.integers(self.size))
+
+        # Extension-less absolute path shared by the C source and its bitcode.
+        stem = self.dataset_root / uri[prefix_len:]
+        bitcode_file = Path(f"{stem}.bc")
+        c_file = Path(f"{stem}.c")
+
+        if not bitcode_file.is_file():
+            if not c_file.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {c_file})"
+                )
+            # No cached bitcode yet: compile the C file. Clang writes to the
+            # temporary path provided by atomic_file_write, not the final path.
+            clang_flags = ["-ferror-limit=1", "-w"]  # One error max, no warnings.
+            with atomic_file_write(bitcode_file) as tmp_path:
+                clang = ClangInvocation.from_c_file(c_file, copt=clang_flags)
+                subprocess.check_call(clang.command(outpath=tmp_path), timeout=300)
+
+        return BenchmarkWithSource.create(uri, bitcode_file, "function.c", c_file)
+
+    def compile_all(self):
+        """Compile every benchmark in the dataset, printing progress to stdout."""
+        n = self.size
+        pool = thread_pool.get_thread_pool_executor()
+        # benchmark() compiles lazily, so requesting every URI compiles the
+        # whole dataset. The requests are submitted to the thread pool executor.
+        jobs = (pool.submit(self.benchmark, uri) for uri in self.benchmark_uris())
+        for i, job in enumerate(as_completed(jobs), start=1):
+            job.result()  # Re-raises any exception from the worker.
+            # "\r\033[K" returns to the line start and erases it, so the counter
+            # is redrawn in place.
+            status = f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)"
+            print(status, flush=True, end="")
diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index fe923406b3..708146828e 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
 # LICENSE file in the root directory of this source tree.
 load("@rules_python//python:defs.bzl", "py_test")
 
+py_test(
+    name = "anghabench_test",
+    timeout = "long",  # Presumably because benchmarks are compiled on first use.
+    srcs = ["anghabench_test.py"],
+    shard_count = 8,  # Spread the parametrized test cases over parallel shards.
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "clgen_test",
     timeout = "moderate",
diff --git a/tests/llvm/datasets/anghabench_test.py b/tests/llvm/datasets/anghabench_test.py
new file mode 100644
index 0000000000..dc9e0b7163
--- /dev/null
+++ b/tests/llvm/datasets/anghabench_test.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the AnghaBench dataset."""
+import sys
+from itertools import islice
+from pathlib import Path
+
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import AnghaBenchDataset
+from tests.pytest_plugins.common import skip_on_ci
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def anghabench_dataset() -> AnghaBenchDataset:
+    llvm_env = gym.make("llvm-v0")  # Only needed to look the dataset up.
+    try:
+        dataset = llvm_env.datasets["anghabench-v0"]
+    finally:
+        llvm_env.close()  # Closed before the dataset is handed to the tests.
+    yield dataset
+
+
+def test_anghabench_size(anghabench_dataset: AnghaBenchDataset):
+    """Check the benchmark count, which is smaller on macOS than elsewhere."""
+    darwin_size, default_size = 1042908, 1042976
+    expected = darwin_size if sys.platform == "darwin" else default_size
+    assert anghabench_dataset.size == expected
+
+
+@skip_on_ci
+@pytest.mark.parametrize("index", range(250))
+def test_anghabench_random_select(
+    env: LlvmEnv, anghabench_dataset: AnghaBenchDataset, index: int, tmpwd: Path
+):
+    """Fetch the index-th benchmark, reset the env onto it, and write its source."""
+    uris = anghabench_dataset.benchmark_uris()
+    selected = anghabench_dataset.benchmark(next(islice(uris, index, None)))
+    env.reset(benchmark=selected)
+    assert selected.source
+    selected.write_sources_to_directory(tmpwd)
+    assert (tmpwd / "function.c").is_file()
+
+
+if __name__ == "__main__":
+    main()