From e194a43663482ccdc26c788f1563d293a47abb32 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 27 Jul 2023 17:12:59 -0700 Subject: [PATCH] Move mutable globals into a central structure This is a stab at an alternative to #7717, per discussion there. It doesn't necessarily have all the mutable globals moved here, but at least the ones we discussed. Better? Worse? What do you think? --- Makefile | 3 +- src/CMakeLists.txt | 3 + src/CompilerGlobals.cpp | 43 ++++++++++++++ src/CompilerGlobals.h | 62 ++++++++++++++++++++ src/Function.cpp | 8 +-- src/IROperator.cpp | 9 ++- src/Module.cpp | 4 ++ src/Util.cpp | 75 ++++++------------------ src/Util.h | 7 +++ test/generator/multitarget_aottest.cpp | 32 ++++++++-- test/generator/multitarget_generator.cpp | 4 ++ 11 files changed, 179 insertions(+), 71 deletions(-) create mode 100644 src/CompilerGlobals.cpp create mode 100644 src/CompilerGlobals.h diff --git a/Makefile b/Makefile index 0679590bd8a5..60990926a4e5 100644 --- a/Makefile +++ b/Makefile @@ -490,6 +490,7 @@ SOURCE_FILES = \ CodeGen_WebGPU_Dev.cpp \ CodeGen_X86.cpp \ CompilerLogger.cpp \ + CompilerGlobals.cpp \ CPlusPlusMangle.cpp \ CSE.cpp \ Debug.cpp \ @@ -676,7 +677,7 @@ HEADER_FILES = \ CodeGen_PyTorch.h \ CodeGen_Targets.h \ CodeGen_WebGPU_Dev.h \ - CompilerLogger.h \ + CompilerGlobals.h \ ConciseCasts.h \ CPlusPlusMangle.h \ CSE.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d56bb5288ec5..244a89e7e9a6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,6 +41,8 @@ set(HEADER_FILES CodeGen_Targets.h CodeGen_Vulkan_Dev.h CodeGen_WebGPU_Dev.h + # No: we don't want this in Halide.h + # CompilerGlobals.h CompilerLogger.h ConciseCasts.h CPlusPlusMangle.h @@ -211,6 +213,7 @@ set(SOURCE_FILES CodeGen_WebAssembly.cpp CodeGen_WebGPU_Dev.cpp CodeGen_X86.cpp + CompilerGlobals.cpp CompilerLogger.cpp CPlusPlusMangle.cpp CSE.cpp diff --git a/src/CompilerGlobals.cpp b/src/CompilerGlobals.cpp new file mode 100644 index 000000000000..4fa99028787c --- /dev/null +++ b/src/CompilerGlobals.cpp @@ -0,0 +1,43 @@ +#include "CompilerGlobals.h" + +#include "Util.h" + +namespace Halide { +namespace Internal { + +namespace { + +Globals the_real_globals; + +size_t compute_stack_size() { + std::string stack_size = get_env_variable("HL_COMPILER_STACK_SIZE"); + if (stack_size.empty()) { + return default_compiler_stack_size; + } else { + return std::stoi(stack_size); + } +} + +} // namespace + +Globals &globals() { + return the_real_globals; +} + +Globals::Globals() { + reset(); +} + +void Globals::reset() { + random_float_counter = 0; + random_uint_counter = 0; + random_variable_counter = 0; + for (int i = 0; i < num_unique_name_counters; i++) { + unique_name_counters[i] = 0; + } + tick_stack.clear(); + compiler_stack_size = compute_stack_size(); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/CompilerGlobals.h b/src/CompilerGlobals.h new file mode 100644 index 000000000000..a3d7c6a38701 --- /dev/null +++ b/src/CompilerGlobals.h @@ -0,0 +1,62 @@ +#ifndef HALIDE_COMPILER_GLOBALS_H +#define HALIDE_COMPILER_GLOBALS_H + +/** \file + * Contains a struct that defines all of the Compiler's global state. + */ + +#include +#include +#include + +#include "Util.h" + +namespace Halide { +namespace Internal { + +/** This struct is designed to contain all of the *mutable* global data + * used by the Halide compiler. (Global data that is declared const must + * not go here.) */ +struct Globals { + struct TickStackEntry { + std::chrono::time_point time; + std::string file; + int line; + }; + + // A counter to use in random_float() calls. + std::atomic random_float_counter; + + // A counter to use in random_uint() and random_int() calls. + std::atomic random_uint_counter; + + // A counter to use in tagging random variables. + // Note that this will be reset by Internal::reset_random_counters(). + std::atomic random_variable_counter; + + // Counters used for the unique_name() utilities. + std::atomic unique_name_counters[num_unique_name_counters]; + + // The stack for tic/toc utilities + std::vector tick_stack; + + // The amount of stack space to use for the compiler, in bytes. + size_t compiler_stack_size; + + Globals(); + + void reset(); + +private: + Globals(const Globals ©) = delete; + Globals &operator=(const Globals &) = delete; + Globals(Globals &&) = delete; + Globals &operator=(Globals &&) = delete; +}; + +Globals &globals(); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Function.cpp b/src/Function.cpp index c644d53c2784..02d9cced27a9 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -5,6 +5,7 @@ #include #include "CSE.h" +#include "CompilerGlobals.h" #include "Func.h" #include "Function.h" #include "IR.h" @@ -299,9 +300,6 @@ class FreezeFunctions : public IRGraphVisitor { } }; -// A counter to use in tagging random variables -std::atomic rand_counter{0}; - } // namespace Function::Function(const FunctionPtr &ptr) @@ -542,7 +540,7 @@ void Function::define(const vector &args, vector values) { } // Tag calls to random() with the free vars - int tag = rand_counter++; + int tag = globals().random_variable_counter++; vector free_vars; free_vars.reserve(args.size()); for (const auto &arg : args) { @@ -752,7 +750,7 @@ void Function::define_update(const vector &_args, vector values) { free_vars.emplace_back(RVar(check.reduction_domain, i)); } } - int tag = rand_counter++; + int tag = globals().random_variable_counter++; for (auto &arg : args) { arg = lower_random(arg, free_vars, tag); } diff --git a/src/IROperator.cpp b/src/IROperator.cpp index 1b1d4f816bbc..92f088207f18 100644 --- a/src/IROperator.cpp +++ b/src/IROperator.cpp @@ -6,6 +6,7 @@ #include #include "CSE.h" +#include "CompilerGlobals.h" #include "Debug.h" #include "Func.h" #include "IREquality.h" @@ -2595,8 +2596,7 @@ Expr mod_round_to_zero(Expr x, Expr y) { Expr random_float(Expr seed) { // Random floats get even IDs - static std::atomic counter{0}; - int id = (counter++) * 2; + int id = (globals().random_float_counter++) * 2; std::vector args; if (seed.defined()) { @@ -2615,8 +2615,7 @@ Expr random_float(Expr seed) { Expr random_uint(Expr seed) { // Random ints get odd IDs - static std::atomic counter{0}; - int id = (counter++) * 2 + 1; + int id = (globals().random_uint_counter++) * 2 + 1; std::vector args; if (seed.defined()) { @@ -2632,7 +2631,7 @@ Expr random_uint(Expr seed) { } Expr random_int(Expr seed) { - return cast(random_uint(std::move(seed))); + return reinterpret(random_uint(std::move(seed))); } Expr likely(Expr e) { diff --git a/src/Module.cpp b/src/Module.cpp index bd7ce475b64b..ca726953a9b7 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -9,6 +9,7 @@ #include "CodeGen_C.h" #include "CodeGen_Internal.h" #include "CodeGen_PyTorch.h" +#include "CompilerGlobals.h" #include "CompilerLogger.h" #include "Debug.h" #include "HexagonOffload.h" @@ -813,6 +814,7 @@ void compile_multitarget(const std::string &fn_name, // This would make the filename outputs more symmetrical (ie the same for n=1 as for n>1) // but at the expense of breaking existing users. So for now, we're going to continue // with the legacy treatment below: + globals().reset(); module_factory(fn_name, base_target).compile(output_files); return; } @@ -877,6 +879,8 @@ void compile_multitarget(const std::string &fn_name, // We always produce the runtime separately, so add NoRuntime explicitly. Target sub_fn_target = target.with_feature(Target::NoRuntime); + // Ensure that each subtarget sees the same sequence of random numbers + globals().reset(); { ScopedCompilerLogger activate(compiler_logger_factory, sub_fn_name, sub_fn_target); Module sub_module = module_factory(sub_fn_name, sub_fn_target); diff --git a/src/Util.cpp b/src/Util.cpp index e1d1f3307848..f58332438511 100644 --- a/src/Util.cpp +++ b/src/Util.cpp @@ -5,6 +5,8 @@ #endif #include "Util.h" + +#include "CompilerGlobals.h" #include "Debug.h" #include "Error.h" #include "Introspection.h" @@ -158,23 +160,10 @@ string running_program_name() { } namespace { -// We use 64K of memory to store unique counters for the purpose of -// making names unique. Using less memory increases the likelihood of -// hash collisions. This wouldn't break anything, but makes stmts -// slightly confusing to read because names that are actually unique -// will get suffixes that falsely hint that they are not. - -const int num_unique_name_counters = (1 << 14); - -// We want to init these to zero, but cannot use = {0} because that -// would invoke a (deleted) copy ctor. The default initialization for -// atomics doesn't guarantee any actual initialization. Fortunately -// this is a global, which is always zero-initialized. -std::atomic unique_name_counters[num_unique_name_counters] = {}; int unique_count(size_t h) { h = h & (num_unique_name_counters - 1); - return unique_name_counters[h]++; + return globals().unique_name_counters[h]++; } } // namespace @@ -611,30 +600,18 @@ bool mul_with_overflow(int bits, int64_t a, int64_t b, int64_t *result) { } } -struct TickStackEntry { - std::chrono::time_point time; - string file; - int line; -}; - -namespace { - -vector tick_stack; - -} // namespace - void halide_tic_impl(const char *file, int line) { string f = file; f = split_string(f, "/").back(); - tick_stack.push_back({std::chrono::high_resolution_clock::now(), f, line}); + globals().tick_stack.push_back({std::chrono::high_resolution_clock::now(), f, line}); } void halide_toc_impl(const char *file, int line) { - auto t1 = tick_stack.back(); + auto t1 = globals().tick_stack.back(); auto t2 = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = t2 - t1.time; - tick_stack.pop_back(); - for (size_t i = 0; i < tick_stack.size(); i++) { + globals().tick_stack.pop_back(); + for (size_t i = 0; i < globals().tick_stack.size(); i++) { debug(1) << " "; } string f = file; @@ -702,28 +679,12 @@ void WINAPI generic_fiber_entry_point(LPVOID argument) { } // namespace Internal -namespace { - -struct CompilerStackSize { - CompilerStackSize() { - std::string stack_size = Internal::get_env_variable("HL_COMPILER_STACK_SIZE"); - if (stack_size.empty()) { - size = default_compiler_stack_size; - } else { - size = std::atoi(stack_size.c_str()); - } - } - size_t size; -} stack_size; - -} // namespace - void set_compiler_stack_size(size_t sz) { - stack_size.size = sz; + Internal::globals().compiler_stack_size = sz; } size_t get_compiler_stack_size() { - return stack_size.size; + return Internal::globals().compiler_stack_size; } namespace Internal { @@ -751,7 +712,9 @@ thread_local void *run_with_large_stack_arg = nullptr; #endif void run_with_large_stack(const std::function &action) { - if (stack_size.size == 0) { + const size_t compiler_stack_size = Internal::globals().compiler_stack_size; + + if (compiler_stack_size == 0) { // User has requested no stack swapping action(); return; @@ -765,8 +728,8 @@ void run_with_large_stack(const std::function &action) { GetCurrentThreadStackLimits(&stack_low, &stack_high); ptrdiff_t stack_remaining = (char *)&approx_stack_pos - (char *)stack_low; - if (stack_remaining < stack_size.size) { - debug(1) << "Insufficient stack space (" << stack_remaining << " bytes). Switching to fiber with " << stack_size.size << "-byte stack.\n"; + if (stack_remaining < compiler_stack_size) { + debug(1) << "Insufficient stack space (" << stack_remaining << " bytes). Switching to fiber with " << compiler_stack_size << "-byte stack.\n"; auto was_a_fiber = IsThreadAFiber(); @@ -774,7 +737,7 @@ void run_with_large_stack(const std::function &action) { internal_assert(main_fiber) << "ConvertThreadToFiber failed with code: " << GetLastError() << "\n"; GenericFiberArgs fiber_args{action, main_fiber}; - auto *lower_fiber = CreateFiber(stack_size.size, generic_fiber_entry_point, &fiber_args); + auto *lower_fiber = CreateFiber(compiler_stack_size, generic_fiber_entry_point, &fiber_args); internal_assert(lower_fiber) << "CreateFiber failed with code: " << GetLastError() << "\n"; SwitchToFiber(lower_fiber); @@ -838,17 +801,17 @@ void run_with_large_stack(const std::function &action) { // stack frames - 64k. const size_t guard_band = 64 * 1024; - void *stack = mmap(nullptr, stack_size.size + guard_band, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + void *stack = mmap(nullptr, compiler_stack_size + guard_band, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); internal_assert(stack) << "mmap failed with error " << strerror(errno); - int err = mprotect((char *)stack + stack_size.size, guard_band, PROT_NONE); + int err = mprotect((char *)stack + compiler_stack_size, guard_band, PROT_NONE); internal_assert(err == 0) << "mprotect failed with error " << strerror(errno); err = getcontext(&context); internal_assert(err == 0) << "getcontext failed with error " << strerror(errno); context.uc_stack.ss_sp = stack; - context.uc_stack.ss_size = stack_size.size; + context.uc_stack.ss_size = compiler_stack_size; context.uc_stack.ss_flags = 0; context.uc_link = &calling_context; @@ -858,7 +821,7 @@ void run_with_large_stack(const std::function &action) { err = swapcontext(&calling_context, &context); internal_assert(err == 0) << "swapcontext failed with error " << strerror(errno); - err = munmap(stack, stack_size.size + guard_band); + err = munmap(stack, compiler_stack_size + guard_band); internal_assert(err == 0) << "munmap failed with error " << strerror(errno); #ifdef HALIDE_WITH_EXCEPTIONS diff --git a/src/Util.h b/src/Util.h index 1bc53d5b3691..5d9b58c903a1 100644 --- a/src/Util.h +++ b/src/Util.h @@ -152,6 +152,13 @@ std::string get_env_variable(char const *env_var_name); * If program name cannot be retrieved, function returns an empty string. */ std::string running_program_name(); +// We use 64K of memory to store unique counters for the purpose of +// making names unique. Using less memory increases the likelihood of +// hash collisions. This wouldn't break anything, but makes stmts +// slightly confusing to read because names that are actually unique +// will get suffixes that falsely hint that they are not. +constexpr int num_unique_name_counters = (1 << 14); + /** Generate a unique name starting with the given prefix. It's unique * relative to all other strings returned by unique_name in this * process. diff --git a/test/generator/multitarget_aottest.cpp b/test/generator/multitarget_aottest.cpp index fe161bc1509b..b6e700df6760 100644 --- a/test/generator/multitarget_aottest.cpp +++ b/test/generator/multitarget_aottest.cpp @@ -2,6 +2,7 @@ #include "HalideRuntime.h" #include "multitarget.h" #include +#include #include #include @@ -58,11 +59,13 @@ int my_can_use_target_features(int count, const uint64_t *features) { int main(int argc, char **argv) { const int W = 32, H = 32; Buffer output(W, H); + auto random_float_output = Buffer::make_scalar(); + auto random_int_output = Buffer::make_scalar(); halide_set_error_handler(my_error_handler); halide_set_custom_can_use_target_features(my_can_use_target_features); - if (HalideTest::multitarget(output) != 0) { + if (HalideTest::multitarget(output, random_float_output, random_int_output) != 0) { printf("Error at multitarget\n"); return 1; } @@ -78,11 +81,33 @@ int main(int argc, char **argv) { } } } + printf("Saw %x for no_bounds_query=%d\n", output(0, 0), use_noboundsquery_feature()); + + // We expect the "random" result to be the same for both subtargets. + { + const int32_t expected = -1000221372; + const int32_t actual = random_int_output(); + if (actual != expected) { + printf("Error for random_int_output: expected %d, got %d\n", expected, actual); + return 1; + } + printf("Saw %d for random_int_output() w/ no_bounds_query=%d\n", actual, use_noboundsquery_feature()); + } + + { + const float expected = 0.827175f; + const float actual = random_float_output(); + if (fabs(actual - expected) > 0.000001f) { + printf("Error for random_float_output: expected %f, got %f\n", expected, actual); + return 1; + } + printf("Saw %f for random_float_output() w/ no_bounds_query=%d\n", actual, use_noboundsquery_feature()); + } // halide_can_use_target_features() should be called exactly once, with the // result cached; call this a few more times to verify. for (int i = 0; i < 10; ++i) { - if (HalideTest::multitarget(output) != 0) { + if (HalideTest::multitarget(output, random_float_output, random_int_output) != 0) { printf("Error at multitarget\n"); return 1; } @@ -96,14 +121,13 @@ int main(int argc, char **argv) { // Verify that the multitarget wrapper code propagates nonzero error // results back to the caller properly. Buffer bad_type(W, H); - int result = HalideTest::multitarget(bad_type); + int result = HalideTest::multitarget(bad_type, random_float_output, random_int_output); if (result != halide_error_code_bad_type) { printf("Error: expected to fail with halide_error_code_bad_type (%d) but actually got %d!\n", (int)halide_error_code_bad_type, result); return 1; } } - printf("Saw %x for no_bounds_query=%d\n", output(0, 0), use_noboundsquery_feature()); printf("Success!\n"); return 0; } diff --git a/test/generator/multitarget_generator.cpp b/test/generator/multitarget_generator.cpp index 422bcf58188c..aff0321ffd66 100644 --- a/test/generator/multitarget_generator.cpp +++ b/test/generator/multitarget_generator.cpp @@ -5,6 +5,8 @@ namespace { class Multitarget : public Halide::Generator { public: Output> output{"output"}; + Output> random_float_output{"random_float_output"}; + Output> random_int_output{"random_int_output"}; void generate() { Var x, y; @@ -16,6 +18,8 @@ class Multitarget : public Halide::Generator { } else { output(x, y) = cast((int32_t)0xf00dcafe); } + random_float_output() = Halide::random_float(); + random_int_output() = Halide::random_int(); } };