From 4e57d8341282ce4e2b313156960345587a3cbf7d Mon Sep 17 00:00:00 2001
From: Chris Cummins <chrisc.101@gmail.com>
Date: Fri, 23 Apr 2021 15:54:59 +0100
Subject: [PATCH] [llvm] Add a CLgen dataset.

This adds a dataset of 1k OpenCL kernels that were used in the paper:

    Cummins, Chris, Pavlos Petoumenos, Zheng Wang, and Hugh
    Leather. "Synthesizing benchmarks for predictive modeling." In
    2017 IEEE/ACM International Symposium on Code Generation and
    Optimization (CGO), pp. 86-99. IEEE, 2017.

The OpenCL kernels are compiled on-demand.

Issue #45.
---
 compiler_gym/envs/llvm/datasets/BUILD       |   1 +
 compiler_gym/envs/llvm/datasets/__init__.py |   3 +
 compiler_gym/envs/llvm/datasets/clgen.py    | 170 ++++++++++++++++++++
 tests/llvm/datasets/BUILD                   |  14 ++
 tests/llvm/datasets/clgen_test.py           |  50 ++++++
 5 files changed, 238 insertions(+)
 create mode 100644 compiler_gym/envs/llvm/datasets/clgen.py
 create mode 100644 tests/llvm/datasets/clgen_test.py

diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index fd865d8adc..fd2fb7799c 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -8,6 +8,7 @@ py_library(
     name = "datasets",
     srcs = [
         "__init__.py",
+        "clgen.py",
         "csmith.py",
         "llvm_stress.py",
         "poj104.py",
diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 11a01e9e96..31ddd5f52c 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -7,6 +7,7 @@
 from typing import Iterable, Optional
 
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
+from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
 from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
@@ -202,6 +203,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
     site_data_base = site_data_base or site_data_path("llvm-v0")
 
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
+    yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
     yield GitHubDataset(site_data_base=site_data_base, sort_order=0)
     yield LinuxDataset(site_data_base=site_data_base, sort_order=0)
@@ -216,6 +218,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 __all__ = [
     "BlasDataset",
+    "CLgenDataset",
     "CsmithDataset",
     "CsmithBenchmark",
     "get_llvm_datasets",
diff --git a/compiler_gym/envs/llvm/datasets/clgen.py b/compiler_gym/envs/llvm/datasets/clgen.py
new file mode 100644
index 0000000000..3cc1629362
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/clgen.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import io
+import shutil
+import subprocess
+import tarfile
+from pathlib import Path
+from typing import List, Optional
+
+from fasteners import InterProcessLock
+
+from compiler_gym.datasets import Benchmark, BenchmarkInitError, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
+from compiler_gym.util.download import download
+from compiler_gym.util.filesystem import atomic_file_write
+from compiler_gym.util.truncate import truncate
+
+
+class CLgenDataset(TarDatasetWithManifest):
+    """The CLgen dataset contains 1000 synthetically generated OpenCL kernels.
+
+    The dataset is from:
+
+        Cummins, Chris, Pavlos Petoumenos, Zheng Wang, and Hugh Leather.
+        "Synthesizing benchmarks for predictive modeling." In 2017 IEEE/ACM
+        International Symposium on Code Generation and Optimization (CGO),
+        pp. 86-99. IEEE, 2017.
+
+    And is available at:
+
+        https://github.com/ChrisCummins/paper-synthesizing-benchmarks
+
+    Installation
+    ------------
+
+    The CLgen dataset consists of OpenCL kernels that are compiled to LLVM-IR
+    on-demand and cached. The first time each benchmark is used there is an
+    overhead of compiling it from OpenCL to bitcode. This is a one-off cost.
+    Compiling OpenCL to bitcode requires third party headers that are downloaded
+    on the first call to :code:`install()`.
+    """
+
+    def __init__(self, site_data_base: Path, sort_order: int = 0):
+        super().__init__(
+            name="benchmark://clgen-v0",
+            description="Synthetically generated OpenCL kernels",
+            references={
+                "Paper": "https://chriscummins.cc/pub/2017-cgo.pdf",
+                "Homepage": "https://github.com/ChrisCummins/clgen",
+            },
+            license="GNU General Public License v3.0",
+            site_data_base=site_data_base,
+            manifest_urls=[
+                "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-clgen-v0-manifest.bz2"
+            ],
+            manifest_sha256="d2bbc1da5a24a8cb03b604d1d8e59227b33bdfcd964ebe741ca8339f1c8d65cc",
+            tar_urls=[
+                "https://github.com/ChrisCummins/paper-synthesizing-benchmarks/raw/e45b6dffe9998f612624f05a6c4878ab4bcc84ec/data/clgen-1000.tar.bz2"
+            ],
+            tar_sha256="0bbd1b737f2537305e4db09b2971a5fa848b7c3a978bff6b570f45d1a488a72c",
+            strip_prefix="clgen-1000/kernels",
+            tar_compression="bz2",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+        )
+        # Header install state: an in-process flag backed by an on-disk marker file.
+        self._opencl_installed = False
+        self._opencl_headers_installed_marker = (
+            self.site_data_path / ".opencl-installed"
+        )
+        self.libclc_dir = self.site_data_path / "libclc"
+        self.opencl_h_path = self.site_data_path / "opencl.h"
+
+    def install(self):
+        """Install the kernels, then download the OpenCL headers if missing."""
+        super().install()
+
+        if self._opencl_installed or self._opencl_headers_installed_marker.is_file():
+            self._opencl_installed = True
+            return
+
+        with self._tar_lock, InterProcessLock(self._tar_lockfile):
+            # Repeat install check now that we are in the locked region.
+            if self._opencl_headers_installed_marker.is_file():
+                self._opencl_installed = True
+                return
+
+            # Download the libclc headers.
+            shutil.rmtree(self.libclc_dir, ignore_errors=True)
+            self.logger.info("Downloading OpenCL headers")
+            libclc_tar = download(
+                "https://dl.fbaipublicfiles.com/compiler_gym/libclc-v0.tar.bz2",
+                sha256="f1c511f2ac12adf98dcc0fbfc4e09d0f755fa403c18f1fb1ffa5547e1fa1a499",
+            )
+            with tarfile.open(fileobj=io.BytesIO(libclc_tar), mode="r:bz2") as arc:
+                arc.extractall(str(self.libclc_dir))
+
+            # Download the OpenCL header. Fetch before writing so that a failed
+            # download does not leave a truncated opencl.h behind.
+            opencl_h = download(
+                "https://github.com/ChrisCummins/clgen/raw/463c0adcd8abcf2432b24df0aca594b77a69e9d3/deeplearning/clgen/data/include/opencl.h",
+                sha256="f95b9f4c8b1d09114e491846d0d41425d24930ac167e024f45dab8071d19f3f7",
+            )
+            self.opencl_h_path.write_bytes(opencl_h)
+            # Create the marker last so that a failed download is retried.
+            self._opencl_headers_installed_marker.touch()
+            self._opencl_installed = True
+
+    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
+        """Return the named benchmark (or a random one), compiling it on first use.
+
+        :raises LookupError: If uri matches neither a bitcode nor an OpenCL file.
+        :raises BenchmarkInitError: If compiling the OpenCL kernel fails.
+        """
+        self.install()
+        if uri is None or len(uri) <= len(self.name) + 1:
+            return self._get_benchmark_by_index(self.random.integers(self.size))
+
+        # The absolute path of the file, without an extension.
+        path_stem = self.dataset_root / uri[len(self.name) + 1 :]
+        bc_path, cl_path = Path(f"{path_stem}.bc"), Path(f"{path_stem}.cl")
+        # If the bitcode file does not exist, compile it on-demand.
+        if not bc_path.is_file():
+            if not cl_path.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {cl_path})"
+                )
+            # Compile the OpenCL kernel into a bitcode file.
+            with atomic_file_write(bc_path) as tmp_bc_path:
+                compile_command: List[str] = ClangInvocation.from_c_file(
+                    cl_path,
+                    copt=[
+                        "-isystem",
+                        str(self.libclc_dir),
+                        "-include",
+                        str(self.opencl_h_path),
+                        "-target",
+                        "nvptx64-nvidia-nvcl",
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_bc_path)
+                self.logger.debug("Exec %s", compile_command)
+                with subprocess.Popen(
+                    compile_command,
+                    stdin=subprocess.PIPE,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                ) as clang:
+                    try:
+                        _, stderr = clang.communicate(timeout=300)
+                    except subprocess.TimeoutExpired:
+                        clang.kill()  # Don't leak the compiler; re-raise to the caller.
+                        raise
+
+            command_str = " ".join(map(str, compile_command))
+            if clang.returncode:
+                msg = truncate(stderr.decode("utf-8"), max_lines=20, max_line_len=20000)
+                raise BenchmarkInitError(
+                    f"Compilation job failed!\nCommand: {command_str}\nError: {msg}"
+                )
+            if not bc_path.is_file():
+                raise BenchmarkInitError(
+                    f"Compilation job failed to produce output file!\nCommand: {command_str}"
+                )
+
+        return BenchmarkWithSource.create(uri, bc_path, "kernel.cl", cl_path)
diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index d3e570e347..fe923406b3 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -4,6 +4,20 @@
 # LICENSE file in the root directory of this source tree.
 load("@rules_python//python:defs.bzl", "py_test")
 
+py_test(
+    name = "clgen_test",
+    timeout = "moderate",
+    srcs = ["clgen_test.py"],
+    shard_count = 8,  # clgen_test.py parametrizes 250 cases; split them across shards.
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "csmith_test",
     timeout = "long",
diff --git a/tests/llvm/datasets/clgen_test.py b/tests/llvm/datasets/clgen_test.py
new file mode 100644
index 0000000000..b6531c6ecd
--- /dev/null
+++ b/tests/llvm/datasets/clgen_test.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the CLgen dataset."""
+from itertools import islice
+from pathlib import Path
+
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import CLgenDataset
+from tests.pytest_plugins.common import skip_on_ci
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def clgen_dataset() -> CLgenDataset:
+    """Look up the CLgen dataset using a throwaway LLVM environment."""
+    env = gym.make("llvm-v0")
+    try:
+        return env.datasets["benchmark://clgen-v0"]
+    finally:
+        env.close()
+
+
+def test_clgen_size(clgen_dataset: CLgenDataset) -> None:
+    assert clgen_dataset.size == 996  # Pinned count; note it is 996, not a round 1000.
+
+
+@skip_on_ci
+@pytest.mark.parametrize("index", range(250))
+def test_clgen_random_select(
+    env: LlvmEnv, clgen_dataset: CLgenDataset, index: int, tmpwd: Path
+):
+    """Load the index-th benchmark into env and write its source to tmpwd."""
+    uris = islice(clgen_dataset.benchmark_uris(), index, None)
+    benchmark = clgen_dataset.benchmark(next(uris))
+    env.reset(benchmark=benchmark)
+    assert benchmark.source
+    benchmark.write_sources_to_directory(tmpwd)
+    assert (tmpwd / "kernel.cl").is_file()
+
+
+if __name__ == "__main__":
+    main()