From 2810646473893e49419148ba4732edee24417477 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 4 Feb 2022 01:47:00 +0000 Subject: [PATCH] Add a DCE barrier builtin In #43852 we noticed that the compiler is getting good enough to completely DCE a number of our benchmarks. We need to add some sort of mechanism to prevent the compiler from doing so. This adds just such an intrinsic. The intrinsic itself doesn't do anything, but it is considered effectful by our optimizer, preventing it from being DCE'd. At the LLVM level, it turns into a volatile store to an alloca (or an llvm.sideeffect if the values passed to the `dcebarrier` do not have any actual LLVM-level representation). The docs for the new intrinsic are as follows: ``` dcebarrier(args...) This function prevents dead-code elimination (DCE) of itself and any arguments passed to it, but is otherwise the lightest barrier possible. In particular, it is not a GC safepoint, does not model an observable heap effect, does not expand to any code itself and may be re-ordered with respect to other side effects (though the total number of executions may not change). A useful model for this function is that it hashes all memory `reachable` from args and escapes this information through some observable side-channel that does not otherwise impact program behavior. Of course that's just a model. The function does nothing and returns `nothing`. This is intended for use in benchmarks that want to guarantee that `args` are actually computed. (Otherwise DCE may see that the result of the benchmark is unused and delete the entire benchmark code). **Note**: `dcebarrier` does not affect constant folding. 
For example, in `dcebarrier(1+1)`, no add instruction needs to be executed at runtime and the code is semantically equivalent to `dcebarrier(2)`. *# Examples function loop() for i = 1:1000 # The compiler must guarantee that there are 1000 program points (in the correct # order) at which the value of `i` is in a register, but has otherwise # total control over the program. dcebarrier(i) end end ``` I believe the volatile store at the LLVM level is actually somewhat stronger than what we want here. Ideally the `dcebarrier` would not end up generating any machine code at all and would also be compatible with optimizations like SROA and vectorization. However, I think this is fine for now. --- base/boot.jl | 2 ++ base/compiler/tfuncs.jl | 3 +++ base/docs/basedocs.jl | 35 +++++++++++++++++++++++++++++++++++ src/builtin_proto.h | 2 ++ src/builtins.c | 6 ++++++ src/codegen.cpp | 26 ++++++++++++++++++++++++++ src/staticdata.c | 5 +++-- test/compiler/codegen.jl | 7 +++++++ 8 files changed, 84 insertions(+), 2 deletions(-) diff --git a/base/boot.jl b/base/boot.jl index abdd7987ce901..f6b95e032ca7b 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -198,6 +198,8 @@ export <:, typeof, isa, typeassert, # method reflection applicable, invoke, + # dcebarrier + dcebarrier, # constants nothing, Main diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl index 177f33bd227f8..80d7806b10277 100644 --- a/base/compiler/tfuncs.jl +++ b/base/compiler/tfuncs.jl @@ -527,6 +527,7 @@ add_tfunc(atomic_pointerset, 3, 3, (a, v, order) -> (@nospecialize; a), 5) add_tfunc(atomic_pointerswap, 3, 3, (a, v, order) -> (@nospecialize; pointer_eltype(a)), 5) add_tfunc(atomic_pointermodify, 4, 4, atomic_pointermodify_tfunc, 5) add_tfunc(atomic_pointerreplace, 5, 5, atomic_pointerreplace_tfunc, 5) +add_tfunc(dcebarrier, 0, INT_INF, (@nospecialize args...)->Nothing, 0) # more accurate typeof_tfunc for vararg tuples abstract only in length function typeof_concrete_vararg(t::DataType) @@ -1695,6 
+1696,8 @@ function _builtin_nothrow(@nospecialize(f), argtypes::Array{Any,1}, @nospecializ return true end return false + elseif f === dcebarrier + return true end return false end diff --git a/base/docs/basedocs.jl b/base/docs/basedocs.jl index 3cbe180233d9c..e5241066368aa 100644 --- a/base/docs/basedocs.jl +++ b/base/docs/basedocs.jl @@ -2897,4 +2897,39 @@ See also [`"`](@ref \") """ kw"\"\"\"" +""" + dcebarrier(args...) + +This function prevents dead-code elimination (DCE) of itself and any arguments +passed to it, but is otherwise the lightest barrier possible. In particular, +it is not a GC safepoint, does not model an observable heap effect, does not expand +to any code itself and may be re-ordered with respect to other side effects +(though the total number of executions may not change). + +A useful model for this function is that it hashes all memory `reachable` from +args and escapes this information through some observable side-channel that does +not otherwise impact program behavior. Of course that's just a model. The +function does nothing and returns `nothing`. + +This is intended for use in benchmarks that want to guarantee that `args` are +actually computed. (Otherwise DCE may see that the result of the benchmark is +unused and delete the entire benchmark code). + +**Note**: `dcebarrier` does not affect constant folding. For example, in + `dcebarrier(1+1)`, no add instruction needs to be executed at runtime and + the code is semantically equivalent to `dcebarrier(2)`. + +# Examples + +function loop() + for i = 1:1000 + # The compiler must guarantee that there are 1000 program points (in the correct + # order) at which the value of `i` is in a register, but has otherwise + # total control over the program. 
+ dcebarrier(i) + end +end +""" +dcebarrier + end diff --git a/src/builtin_proto.h b/src/builtin_proto.h index e0b328e664d6c..2a6676161204c 100644 --- a/src/builtin_proto.h +++ b/src/builtin_proto.h @@ -53,6 +53,7 @@ DECLARE_BUILTIN(typeassert); DECLARE_BUILTIN(_typebody); DECLARE_BUILTIN(typeof); DECLARE_BUILTIN(_typevar); +DECLARE_BUILTIN(dcebarrier); JL_CALLABLE(jl_f_invoke_kwsorter); #ifdef DEFINE_BUILTIN_GLOBALS @@ -65,6 +66,7 @@ JL_CALLABLE(jl_f__abstracttype); JL_CALLABLE(jl_f__primitivetype); JL_CALLABLE(jl_f__setsuper); JL_CALLABLE(jl_f__equiv_typedef); +JL_CALLABLE(jl_f_dcebarrier); #ifdef __cplusplus } diff --git a/src/builtins.c b/src/builtins.c index b5368ad36a164..e72f6da4e1678 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -1472,6 +1472,11 @@ JL_CALLABLE(jl_f__setsuper) return jl_nothing; } +JL_CALLABLE(jl_f_dcebarrier) +{ + return jl_nothing; +} + static int equiv_field_types(jl_value_t *old, jl_value_t *ft) { size_t nf = jl_svec_len(ft); @@ -1834,6 +1839,7 @@ void jl_init_primitives(void) JL_GC_DISABLED add_builtin_func("_setsuper!", jl_f__setsuper); jl_builtin__typebody = add_builtin_func("_typebody!", jl_f__typebody); add_builtin_func("_equiv_typedef", jl_f__equiv_typedef); + jl_builtin_dcebarrier = add_builtin_func("dcebarrier", jl_f_dcebarrier); // builtin types add_builtin("Any", (jl_value_t*)jl_any_type); diff --git a/src/codegen.cpp b/src/codegen.cpp index 2580f693356ea..2ff1ec7eb09da 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -3464,6 +3464,32 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f, return true; } + else if (f == jl_builtin_dcebarrier) { + *ret = mark_julia_const(ctx, jl_nothing); + bool emitted_any_side_effect = false; + for (size_t i = 1; i <= nargs; ++i) { + const jl_cgval_t &obj = argv[i]; + if (obj.V) { + // TODO is this strong enough to constitute a read of any contained + // pointers? 
+ Value *V = obj.V; + if (obj.isboxed) { + V = emit_pointer_from_objref(ctx, V); + } + Value *slotv = emit_static_alloca(ctx, V->getType()); + ctx.builder.CreateStore(V, slotv, true); + emitted_any_side_effect = true; + } + } + if (!emitted_any_side_effect) { + Function *sideeffect_func = Intrinsic::getDeclaration( + ctx.f->getParent(), + Intrinsic::sideeffect); + ctx.builder.CreateCall(sideeffect_func); + } + return true; + } + return false; } diff --git a/src/staticdata.c b/src/staticdata.c index 7427e23d391aa..f4d8be2c9d6de 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -26,7 +26,7 @@ extern "C" { // TODO: put WeakRefs on the weak_refs list during deserialization // TODO: handle finalizers -#define NUM_TAGS 152 +#define NUM_TAGS 153 // An array of references that need to be restored from the sysimg // This is a manually constructed dual of the gvars array, which would be produced by codegen for Julia code, for C. @@ -198,6 +198,7 @@ jl_value_t **const*const get_tags(void) { INSERT_TAG(jl_builtin__expr); INSERT_TAG(jl_builtin_ifelse); INSERT_TAG(jl_builtin__typebody); + INSERT_TAG(jl_builtin_dcebarrier); // All optional tags must be placed at the end, so that we // don't accidentally have a `NULL` in the middle @@ -252,7 +253,7 @@ static const jl_fptr_args_t id_to_fptrs[] = { &jl_f_applicable, &jl_f_invoke, &jl_f_sizeof, &jl_f__expr, &jl_f__typevar, &jl_f_ifelse, &jl_f__structtype, &jl_f__abstracttype, &jl_f__primitivetype, &jl_f__typebody, &jl_f__setsuper, &jl_f__equiv_typedef, &jl_f_opaque_closure_call, - NULL }; + &jl_f_dcebarrier, NULL }; typedef struct { ios_t *s; diff --git a/test/compiler/codegen.jl b/test/compiler/codegen.jl index 7469dc74c8156..3b40c5b2167a7 100644 --- a/test/compiler/codegen.jl +++ b/test/compiler/codegen.jl @@ -711,3 +711,10 @@ end @test !cmp43123(Ref{Function}(+), Ref{Union{typeof(+), typeof(-)}}(-)) @test cmp43123(Function[+], Union{typeof(+), typeof(-)}[+]) @test !cmp43123(Function[+], Union{typeof(+), typeof(-)}[-]) + +# 
Test that dcebarrier survives through to LLVM time +f_dcebarrier_input(x) = dcebarrier(x+1) +f_dcebarrier_const() = dcebarrier(1+1) +@test occursin("store", get_llvm(f_dcebarrier_input, Tuple{Int64}, true, false, false)) +@test !occursin("store", get_llvm(f_dcebarrier_const, Tuple{}, true, false, false)) +@test occursin("llvm.sideeffect", get_llvm(f_dcebarrier_const, Tuple{}, true, false, false))