From 8dd573ef2eea329f382b3b3477b524156df0aaa1 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 30 Jul 2021 10:33:17 +0000 Subject: [PATCH 1/2] Compile and ship csmith as part of python package. Currently, when the csmith dataset is first used, the source code is downloaded, unpacked and compiled on the local machine. This can cause issues for systems without a standard toolchain for building C code, or for other systems with limitations on running arbitrary code, for example, in Google's Colab notebook environments. This patch adds csmith to the set of compiled C code used to build the CompilerGym package. It adds the generated binaries and headers to the pip wheel so that `pip install compiler_gym` will ship with a working Csmith binary. Fixes #345. --- WORKSPACE | 11 ++ compiler_gym/envs/llvm/datasets/BUILD | 1 + compiler_gym/envs/llvm/datasets/csmith.py | 150 +++++----------------- compiler_gym/third_party/csmith/BUILD | 20 +++ setup.py | 2 + 5 files changed, 68 insertions(+), 116 deletions(-) create mode 100644 compiler_gym/third_party/csmith/BUILD diff --git a/WORKSPACE b/WORKSPACE index abbd8a24e..8335e7f71 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -290,6 +290,17 @@ http_archive( urls = ["https://github.com/pytorch/cpuinfo/archive/63b254577ed77a8004a9be6ac707f3dccc4e1fd9.tar.gz"], ) +# === Csmith === +# https://embed.cs.utah.edu/csmith/ + +http_archive( + name = "csmith", + build_file_content = all_content, + sha256 = "ba871c1e5a05a71ecd1af514fedba30561b16ee80b8dd5ba8f884eaded47009f", + strip_prefix = "csmith-csmith-2.3.0", + urls = ["https://github.com/csmith-project/csmith/archive/refs/tags/csmith-2.3.0.tar.gz"], +) + # === DeepDataFlow === # https://zenodo.org/record/4122437 diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD index b2129ed3c..1238e9317 100644 --- a/compiler_gym/envs/llvm/datasets/BUILD +++ b/compiler_gym/envs/llvm/datasets/BUILD @@ -16,6 +16,7 @@ py_library( "llvm_stress.py", "poj104.py", ], + 
data = ["//compiler_gym/third_party/csmith:all"], visibility = ["//visibility:public"], deps = [ "//compiler_gym/datasets", diff --git a/compiler_gym/envs/llvm/datasets/csmith.py b/compiler_gym/envs/llvm/datasets/csmith.py index 1204439a4..d41819fe4 100644 --- a/compiler_gym/envs/llvm/datasets/csmith.py +++ b/compiler_gym/envs/llvm/datasets/csmith.py @@ -2,35 +2,29 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import io import logging import subprocess -import sys -import tarfile -import tempfile from pathlib import Path -from threading import Lock -from typing import Iterable, List +from typing import Iterable, List, Optional import numpy as np -from fasteners import InterProcessLock from compiler_gym.datasets import Benchmark, BenchmarkSource, Dataset from compiler_gym.datasets.benchmark import BenchmarkInitError, BenchmarkWithSource -from compiler_gym.datasets.dataset import DatasetInitError from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation from compiler_gym.service.proto import BenchmarkDynamicConfig from compiler_gym.util.decorators import memoized_property -from compiler_gym.util.download import download -from compiler_gym.util.runfiles_path import transient_cache_path +from compiler_gym.util.runfiles_path import runfiles_path from compiler_gym.util.shell_format import plural from compiler_gym.util.truncate import truncate # The maximum value for the --seed argument to csmith. UINT_MAX = (2 ** 32) - 1 -# A lock for exclusive access to the Csmith build logic. 
-_CSMITH_BUILD_LOCK = Lock() +_CSMITH_BIN = runfiles_path("compiler_gym/third_party/csmith/csmith/bin/csmith") +_CSMITH_INCLUDES = runfiles_path( + "compiler_gym/third_party/csmith/csmith/include/csmith-2.3.0" +) class CsmithBenchmark(BenchmarkWithSource): @@ -68,31 +62,6 @@ def source(self) -> str: return self._src.decode("utf-8") -class CsmithBuildError(DatasetInitError): - """Error raised if :meth:`CsmithDataset.install() - ` fails.""" - - def __init__(self, failing_stage: str, stdout: str, stderr: str): - install_instructions = { - "linux": "sudo apt install g++ m4", - "darwin": "brew install m4", - }[sys.platform] - - super().__init__( - "\n".join( - [ - f"Failed to build Csmith from source, `{failing_stage}` failed.", - "You may be missing installation dependencies. Install them using:", - f" {install_instructions}", - "See https://github.com/csmith-project/csmith#install-csmith for more details", - f"--- Start `{failing_stage}` logs: ---\n", - stdout, - stderr, - ] - ) - ) - - class CsmithDataset(Dataset): """A dataset which uses Csmith to generate programs. @@ -128,7 +97,28 @@ class CsmithDataset(Dataset): details. """ - def __init__(self, site_data_base: Path, sort_order: int = 0): + def __init__( + self, + site_data_base: Path, + sort_order: int = 0, + csmith_bin: Optional[Path] = None, + csmith_includes: Optional[Path] = None, + ): + """Constructor. + + :param site_data_base: The base path of a directory that will be used to + store installed files. + + :param sort_order: An optional numeric value that should be used to + order this dataset relative to others. Lowest value sorts first. + + :param csmith_bin: The path of the Csmith binary to use. If not + provided, the version of Csmith shipped with CompilerGym is used. + + :param csmith_includes: The path of the Csmith includes directory. If + not provided, the includes of the Csmith shipped with CompilerGym is + used. 
+ """ super().__init__( name="generator://csmith-v0", description="Random conformant C99 programs", @@ -141,93 +131,21 @@ def __init__(self, site_data_base: Path, sort_order: int = 0): sort_order=sort_order, benchmark_class=CsmithBenchmark, ) - self.csmith_path = self.site_data_path / "bin" / "csmith" - csmith_include_dir = self.site_data_path / "include" / "csmith-2.3.0" - - self._installed = False - self._build_lockfile = self.site_data_path / ".build.LOCK" - self._build_markerfile = self.site_data_path / ".built" - + self.csmith_bin_path = csmith_bin or _CSMITH_BIN + self.csmith_includes_path = csmith_includes or _CSMITH_INCLUDES # The command that is used to compile an LLVM-IR bitcode file from a # Csmith input. Reads from stdin, writes to stdout. self.clang_compile_command: List[str] = ClangInvocation.from_c_file( "-", # Read from stdin. copt=[ - "-xc", + "-xc", # The C programming language. "-ferror-limit=1", # Stop on first error. "-w", # No warnings. - f"-I{csmith_include_dir}", # Include the Csmith headers. + f"-I{self.csmith_includes_path}", # Include the Csmith headers. ], ).command( - outpath="-" - ) # Write to stdout. - - @property - def installed(self) -> bool: - # Fast path for repeated checks to 'installed' without a disk op. - if not self._installed: - self._installed = self._build_markerfile.is_file() - return self._installed - - def install(self) -> None: - """Download and build the Csmith binary.""" - super().install() - - if self.installed: - return - - with _CSMITH_BUILD_LOCK, InterProcessLock(self._build_lockfile): - # Repeat the check to see if we have already installed the dataset - # now that we have acquired the lock. 
- if not self.installed: - self.logger.info("Downloading and building Csmith") - self._build_csmith(self.site_data_path, self.logger) - self._build_markerfile.touch() - - @staticmethod - def _build_csmith(install_root: Path, logger: logging.Logger): - """Download, build, and install Csmith to the given directory.""" - tar_data = io.BytesIO( - download( - urls=[ - "https://github.com/csmith-project/csmith/archive/refs/tags/csmith-2.3.0.tar.gz", - ], - sha256="ba871c1e5a05a71ecd1af514fedba30561b16ee80b8dd5ba8f884eaded47009f", - ) + outpath="-" # Write to stdout. ) - # Csmith uses a standard `configure` + `make install` build process. - with tempfile.TemporaryDirectory( - dir=transient_cache_path("."), prefix="csmith-" - ) as d: - with tarfile.open(fileobj=tar_data, mode="r:gz") as arc: - arc.extractall(d) - - # The path of the extracted sources. - src_dir = Path(d) / "csmith-csmith-2.3.0" - - logger.debug("Configuring Csmith at %s", d) - configure = subprocess.Popen( - ["./configure", f"--prefix={install_root}"], - cwd=src_dir, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - stdout, stderr = configure.communicate(timeout=600) - if configure.returncode: - raise CsmithBuildError("./configure", stdout, stderr) - - logger.debug("Installing Csmith to %s", install_root) - make = subprocess.Popen( - ["make", "-j", "install"], - cwd=src_dir, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - stdout, stderr = make.communicate(timeout=600) - if make.returncode: - raise CsmithBuildError("make install", stdout, stderr) @property def size(self) -> float: @@ -271,7 +189,7 @@ def benchmark_from_seed( # assemble a bitcode. 
self.logger.debug("Exec csmith --seed %d", seed) csmith = subprocess.Popen( - [str(self.csmith_path), "--seed", str(seed)], + [str(self.csmith_bin_path), "--seed", str(seed)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) diff --git a/compiler_gym/third_party/csmith/BUILD b/compiler_gym/third_party/csmith/BUILD new file mode 100644 index 000000000..4f26818dc --- /dev/null +++ b/compiler_gym/third_party/csmith/BUILD @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +load("@rules_foreign_cc//tools/build_defs:configure.bzl", "configure_make") + +# Funnel the Csmith sources through a no-op filegroup to fix an issue in +# collecting the generated files for use in `data` attributes of some targets. +# See: https://github.com/bazelbuild/rules_foreign_cc/issues/619 +filegroup( + name = "all", + srcs = [":csmith"], + visibility = ["//visibility:public"], +) + +configure_make( + name = "csmith", + binaries = ["csmith"], + lib_source = "@csmith//:all", +) diff --git a/setup.py b/setup.py index a82db7f96..47f9219e7 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,8 @@ def get_tag(self): "envs/llvm/service/passes/*.txt", "third_party/cbench/benchmarks.txt", "third_party/cbench/cbench-v*/*", + "third_party/csmith/csmith/bin/csmith", + "third_party/csmith/csmith/include/csmith-2.3.0/*.h", "third_party/inst2vec/*.pickle", ] }, From 63acf99e0d5016ead79b9748b094f4c599d485d2 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Fri, 30 Jul 2021 15:18:46 +0100 Subject: [PATCH 2/2] [llvm] Fix csmith build on macOS. 
--- compiler_gym/third_party/csmith/BUILD | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compiler_gym/third_party/csmith/BUILD b/compiler_gym/third_party/csmith/BUILD index 4f26818dc..44fb6b995 100644 --- a/compiler_gym/third_party/csmith/BUILD +++ b/compiler_gym/third_party/csmith/BUILD @@ -16,5 +16,15 @@ filegroup( configure_make( name = "csmith", binaries = ["csmith"], + configure_env_vars = { + # Work around an error with libtool usage on macOS. See: + # https://github.com/bazelbuild/rules_foreign_cc/issues/185 + "AR": "/usr/bin/ar", + # Csmith uses deprecated stdlib functions like std::bind2nd(). + "CXXFLAGS": "-D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES", + }, + # Work around a strange bug where the srand48_deterministic test returns + # true on macOS, although this is only available and needed for OpenBSD. + configure_options = ["ac_cv_func_srand48_deterministic=no"], lib_source = "@csmith//:all", )