From 6ca9fabdfc488b0ed16e7a78731d4feaeef45bf6 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 5 Jun 2024 01:42:38 +0000 Subject: [PATCH 1/3] implement "engine" for managing inference/codegen Continuing from previous PRs to making CodeInstance the primary means of tracking compilation, this introduces an "engine" which keeps track externally of whether a particular inference result is in progress and where. At present, this handles unexpected cycles by permitting both threads to work on it. This is likely to be optimal most of the time currently, until we have the ability to do work-stealing of the results. To assist with that, CodeInstance is now primarily allocated by `jl_engine_reserve`, which also tracks that this is being currently inferred. This creates a sort of per-(MI,owner) tuple lock mechanism, which can be used with the double-check pattern to see if inference was completed while waiting on that. The `world` value is not included since that is inferred later, so there is a possibility that a thread waits only to discover that the result was already invalid before it could use it (though this should be unlikely). The process then can notify when it has finished and wants to release the reservation lock on that identity pair. When doing so, it may also provide source code, allowing the process to potentially begin a threadpool to compile that result while the main thread continues with the job of inference. Includes fix for #53434, by ensuring SOURCE_MODE_ABI results in the item going into the global cache. Fixes #53433, as inInference is computed by the engine and protected by a lock, which also fixes #53680. --- base/compiler/typeinfer.jl | 350 ++++++++++++++++++++----------------- base/compiler/types.jl | 37 +--- doc/src/devdocs/locks.md | 32 +--- src/Makefile | 2 +- src/aotcompile.cpp | 2 +- src/codegen.cpp | 2 +- src/engine.cpp | 159 +++++++++++++++++ src/gc-stacks.c | 10 +- src/gc.c | 13 +- src/gc.h | 2 +- src/gf.c | 74 +++++--- src/jl_exported_funcs.inc | 2 - src/jltypes.c | 12 +- src/julia.h | 1 - src/julia_gcext.h | 18 +- src/julia_internal.h | 8 +- src/julia_threads.h | 1 + src/method.c | 1 - src/toplevel.c | 4 +- 19 files changed, 452 insertions(+), 278 deletions(-) create mode 100644 src/engine.cpp diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl index 804989d0c5295..c49a49d2e4bf7 100644 --- a/base/compiler/typeinfer.jl +++ b/base/compiler/typeinfer.jl @@ -4,14 +4,6 @@ const track_newly_inferred = RefValue{Bool}(false) const newly_inferred = CodeInstance[] -# build (and start inferring) the inference frame for the top-level MethodInstance -function typeinf(interp::AbstractInterpreter, result::InferenceResult, cache_mode::Symbol) - frame = InferenceState(result, cache_mode, interp) - frame === nothing && return false - cache_mode === :global && lock_mi_inference(interp, result.linfo) - return typeinf(interp, frame) -end - """ The module `Core.Compiler.Timings` provides a simple implementation of nested timers that can be used to measure the exclusive time spent inferring each method instance that is @@ -230,41 +222,38 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState; result.src = opt = ir_to_codeinf!(opt) end if isdefined(result, :ci) + ci = result.ci inferred_result = nothing relocatability = 0x1 const_flag = is_result_constabi_eligible(result) - if is_cached(caller) || !can_discard_trees - ci = result.ci - if !(is_result_constabi_eligible(result) && can_discard_trees) - inferred_result = 
transform_result_for_cache(interp, result.linfo, result.valid_worlds, result, can_discard_trees) - relocatability = 0x0 - if inferred_result isa CodeInfo - edges = inferred_result.debuginfo - uncompressed = inferred_result - inferred_result = maybe_compress_codeinfo(interp, result.linfo, inferred_result, can_discard_trees) - result.is_src_volatile |= uncompressed !== inferred_result - elseif ci.owner === nothing - # The global cache can only handle objects that codegen understands - inferred_result = nothing - end - if isa(inferred_result, String) - t = @_gc_preserve_begin inferred_result - relocatability = unsafe_load(unsafe_convert(Ptr{UInt8}, inferred_result), Core.sizeof(inferred_result)) - @_gc_preserve_end t - elseif inferred_result === nothing - relocatability = 0x1 - end + if !can_discard_trees || (is_cached(caller) && !const_flag) + inferred_result = transform_result_for_cache(interp, result.linfo, result.valid_worlds, result, can_discard_trees) + relocatability = 0x0 + if inferred_result isa CodeInfo + edges = inferred_result.debuginfo + uncompressed = inferred_result + inferred_result = maybe_compress_codeinfo(interp, result.linfo, inferred_result, can_discard_trees) + result.is_src_volatile |= uncompressed !== inferred_result + elseif ci.owner === nothing + # The global cache can only handle objects that codegen understands + inferred_result = nothing + end + if isa(inferred_result, String) + t = @_gc_preserve_begin inferred_result + relocatability = unsafe_load(unsafe_convert(Ptr{UInt8}, inferred_result), Core.sizeof(inferred_result)) + @_gc_preserve_end t end - # n.b. relocatability = (isa(inferred_result, String) && inferred_result[end]) || inferred_result === nothing end + # n.b. relocatability = isa(inferred_result, String) && inferred_result[end] if !@isdefined edges edges = DebugInfo(result.linfo) end ccall(:jl_update_codeinst, Cvoid, (Any, Any, Int32, UInt, UInt, UInt32, Any, UInt8, Any), - ci, inferred_result, const_flag, - first(result.valid_worlds), last(result.valid_worlds), - encode_effects(result.ipo_effects), result.analysis_results, - relocatability, edges) + ci, inferred_result, const_flag, + first(result.valid_worlds), last(result.valid_worlds), + encode_effects(result.ipo_effects), result.analysis_results, + relocatability, edges) + engine_reject(interp, ci) end return nothing end @@ -293,7 +282,6 @@ function finish_nocycle(::AbstractInterpreter, frame::InferenceState) optimize(frame.interp, opt, frame.result) end finish!(frame.interp, frame) - unlock_mi_inference(frame.interp, frame.linfo) return nothing end @@ -322,7 +310,6 @@ function finish_cycle(::AbstractInterpreter, frames::Vector{InferenceState}) end for caller in frames finish!(caller.interp, caller) - unlock_mi_inference(caller.interp, caller.linfo) end return nothing end @@ -346,46 +333,7 @@ function is_result_constabi_eligible(result::InferenceResult) result_type = result.result return isa(result_type, Const) && is_foldable_nothrow(result.ipo_effects) && is_inlineable_constant(result_type.val) end -function CodeInstance(interp::AbstractInterpreter, result::InferenceResult) - local const_flags::Int32 - result_type = result.result - @assert !(result_type === nothing || result_type isa LimitedAccuracy) - if is_result_constabi_eligible(result) - # use constant calling convention - rettype_const = result_type.val - const_flags = 0x3 - else - if isa(result_type, Const) - rettype_const = result_type.val - const_flags = 0x2 - elseif isa(result_type, PartialOpaque) - rettype_const = result_type - 
const_flags = 0x2 - elseif isconstType(result_type) - rettype_const = result_type.parameters[1] - const_flags = 0x2 - elseif isa(result_type, PartialStruct) - rettype_const = result_type.fields - const_flags = 0x2 - elseif isa(result_type, InterConditional) - rettype_const = result_type - const_flags = 0x2 - elseif isa(result_type, InterMustAlias) - rettype_const = result_type - const_flags = 0x2 - else - rettype_const = nothing - const_flags = 0x00 - end - end - relocatability = 0x0 - owner = cache_owner(interp) - return CodeInstance(result.linfo, owner, - widenconst(result_type), widenconst(result.exc_result), rettype_const, nothing, - const_flags, first(result.valid_worlds), last(result.valid_worlds), - encode_effects(result.ipo_effects), result.analysis_results, - relocatability, nothing) -end + function transform_result_for_cache(interp::AbstractInterpreter, ::MethodInstance, valid_worlds::WorldRange, result::InferenceResult, @@ -424,7 +372,7 @@ function cache_result!(interp::AbstractInterpreter, result::InferenceResult) # check if the existing linfo metadata is also sufficient to describe the current inference result # to decide if it is worth caching this right now mi = result.linfo - cache_results = !already_inferred_quick_test(interp, mi) + cache_results = true cache = WorldView(code_cache(interp), result.valid_worlds) if cache_results && haskey(cache, mi) ci = cache[mi] @@ -438,16 +386,15 @@ function cache_result!(interp::AbstractInterpreter, result::InferenceResult) end if cache_results - code_cache(interp)[mi] = ci = CodeInstance(interp, result) - result.ci = ci + code_cache(interp)[mi] = result.ci if track_newly_inferred[] m = mi.def if isa(m, Method) && m.module != Core - ccall(:jl_push_newly_inferred, Cvoid, (Any,), ci) + ccall(:jl_push_newly_inferred, Cvoid, (Any,), result.ci) end end end - nothing + return cache_results end function cycle_fix_limited(@nospecialize(typ), sv::InferenceState) @@ -600,10 +547,11 @@ function finishinfer!(me::InferenceState, interp::AbstractInterpreter) end end end - me.result.valid_worlds = me.valid_worlds - me.result.result = bestguess - ipo_effects = me.result.ipo_effects = me.ipo_effects = adjust_effects(me) - me.result.exc_result = me.exc_bestguess = refine_exception_type(me.exc_bestguess, ipo_effects) + result = me.result + result.valid_worlds = me.valid_worlds + result.result = bestguess + ipo_effects = result.ipo_effects = me.ipo_effects = adjust_effects(me) + result.exc_result = me.exc_bestguess = refine_exception_type(me.exc_bestguess, ipo_effects) me.src.rettype = widenconst(ignorelimited(bestguess)) me.src.min_world = first(me.valid_worlds) me.src.max_world = last(me.valid_worlds) @@ -611,13 +559,13 @@ function finishinfer!(me::InferenceState, interp::AbstractInterpreter) if limited_ret # a parent may be cached still, but not this intermediate work: # we can throw everything else away now - me.result.src = nothing + result.src = nothing me.cache_mode = CACHE_MODE_NULL set_inlineable!(me.src, false) elseif limited_src # a type result will be cached still, but not this intermediate work: # we can throw everything else away now - me.result.src = nothing + result.src = nothing set_inlineable!(me.src, false) else # annotate fulltree with type information, @@ -628,18 +576,54 @@ function finishinfer!(me::InferenceState, interp::AbstractInterpreter) # disable optimization if we don't use this later (me.cache_mode != CACHE_MODE_NULL || me.parent !== nothing) && # disable optimization if we've already obtained very accurate result - 
!result_is_constabi(interp, me.result, mayopt) + !result_is_constabi(interp, result, mayopt) if doopt - me.result.src = OptimizationState(me, interp) + result.src = OptimizationState(me, interp) else - me.result.src = me.src # for reflection etc. + result.src = me.src # for reflection etc. end end maybe_validate_code(me.linfo, me.src, "inferred") - if is_cached(me) - cache_result!(me.interp, me.result) + # finish populating inference results into the CodeInstance if possible, and maybe cache that globally for use elsewhere + if isdefined(result, :ci) && !limited_ret + result_type = result.result + @assert !(result_type === nothing || result_type isa LimitedAccuracy) + if isa(result_type, Const) + rettype_const = result_type.val + const_flags = is_result_constabi_eligible(result) ? 0x3 : 0x2 + elseif isa(result_type, PartialOpaque) + rettype_const = result_type + const_flags = 0x2 + elseif isconstType(result_type) + rettype_const = result_type.parameters[1] + const_flags = 0x2 + elseif isa(result_type, PartialStruct) + rettype_const = result_type.fields + const_flags = 0x2 + elseif isa(result_type, InterConditional) + rettype_const = result_type + const_flags = 0x2 + elseif isa(result_type, InterMustAlias) + rettype_const = result_type + const_flags = 0x2 + else + rettype_const = nothing + const_flags = 0x00 + end + relocatability = 0x0 + edges = nothing + ccall(:jl_fill_codeinst, Cvoid, (Any, Any, Any, Any, Int32, UInt, UInt, UInt32, Any, Any), + result.ci, widenconst(result_type), widenconst(result.exc_result), rettype_const, const_flags, + first(result.valid_worlds), last(result.valid_worlds), + encode_effects(result.ipo_effects), result.analysis_results, edges) + if is_cached(me) + cached_results = cache_result!(me.interp, me.result) + if !cached_results + me.cache_mode = CACHE_MODE_NULL + end + end end nothing end @@ -879,21 +863,21 @@ end # compute (and cache) an inferred AST and return the current best estimate of the result type function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize(atype), sparams::SimpleVector, caller::AbsIntState) mi = specialize_method(method, atype, sparams)::MethodInstance - codeinst = get(code_cache(interp), mi, nothing) + cache_mode = CACHE_MODE_GLOBAL # cache edge targets globally by default force_inline = is_stmt_inline(get_curr_ssaflag(caller)) - if codeinst isa CodeInstance # return existing rettype if the code is already inferred - inferred = @atomic :monotonic codeinst.inferred - if inferred === nothing && force_inline - # we already inferred this edge before and decided to discard the inferred code, - # nevertheless we re-infer it here again in order to propagate the re-inferred - # source to the inliner as a volatile result - cache_mode = CACHE_MODE_VOLATILE - else - @assert codeinst.def === mi "MethodInstance for cached edge does not match" - return return_cached_result(interp, codeinst, caller) + let codeinst = get(code_cache(interp), mi, nothing) + if codeinst isa CodeInstance # return existing rettype if the code is already inferred + inferred = @atomic :monotonic codeinst.inferred + if inferred === nothing && force_inline + # we already inferred this edge before and decided to discard the inferred code, + # nevertheless we re-infer it here again in order to propagate the re-inferred + # source to the inliner as a volatile result + cache_mode = CACHE_MODE_VOLATILE + else + @assert codeinst.def === mi "MethodInstance for cached edge does not match" + return return_cached_result(interp, codeinst, caller) + end end - else - 
cache_mode = CACHE_MODE_GLOBAL # cache edge targets globally by default end if ccall(:jl_get_module_infer, Cint, (Any,), method.module) == 0 && !generating_output(#=incremental=#false) add_remark!(interp, caller, "Inference is disabled for the target module") @@ -907,14 +891,33 @@ function typeinf_edge(interp::AbstractInterpreter, method::Method, @nospecialize frame = resolve_call_cycle!(interp, mi, caller) end if frame === false - # completely new - lock_mi_inference(interp, mi) + # completely new, but check again after reserving in the engine + if cache_mode == CACHE_MODE_GLOBAL + ci = engine_reserve(interp, mi) + let codeinst = get(code_cache(interp), mi, nothing) + if codeinst isa CodeInstance # return existing rettype if the code is already inferred + engine_reject(interp, ci) + inferred = @atomic :monotonic codeinst.inferred + if inferred === nothing && force_inline + cache_mode = CACHE_MODE_VOLATILE + else + @assert codeinst.def === mi "MethodInstance for cached edge does not match" + return return_cached_result(interp, codeinst, caller) + end + end + end + end result = InferenceResult(mi, typeinf_lattice(interp)) + if cache_mode == CACHE_MODE_GLOBAL + result.ci = ci + end frame = InferenceState(result, cache_mode, interp) # always use the cache for edge targets if frame === nothing add_remark!(interp, caller, "Failed to retrieve source") # can't get the source for this, so we know nothing - unlock_mi_inference(interp, mi) + if cache_mode == CACHE_MODE_GLOBAL + engine_reject(interp, ci) + end return EdgeCallResult(Any, Any, nothing, Effects()) end if is_cached(caller) || frame_parent(caller) !== nothing # don't involve uncached functions in cycle resolution @@ -1133,7 +1136,7 @@ function ci_has_abi(code::CodeInstance) return code.invoke !== C_NULL end -function ci_meets_requirement(code::CodeInstance, source_mode::UInt8, ci_is_cached::Bool) +function ci_meets_requirement(code::CodeInstance, source_mode::UInt8) source_mode == SOURCE_MODE_NOT_REQUIRED && return true source_mode == SOURCE_MODE_ABI && return ci_has_abi(code) source_mode == SOURCE_MODE_FORCE_SOURCE && return ci_has_source(code) @@ -1146,17 +1149,18 @@ _uncompressed_ir(codeinst::CodeInstance, s::String) = # compute (and cache) an inferred AST and return type function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mode::UInt8) start_time = ccall(:jl_typeinf_timing_begin, UInt64, ()) - code = get(code_cache(interp), mi, nothing) - if code isa CodeInstance - # see if this code already exists in the cache - if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(code) - code = codeinstance_for_const_with_code(interp, code) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code - end - if ci_meets_requirement(code, source_mode, true) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code + let code = get(code_cache(interp), mi, nothing) + if code isa CodeInstance + # see if this code already exists in the cache + if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(code) + code = codeinstance_for_const_with_code(interp, code) + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code + end + if ci_meets_requirement(code, source_mode) + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code + end end end def = mi.def @@ -1169,44 +1173,55 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod UInt32(0), nothing, UInt8(0), src.debuginfo) end end - lock_mi_inference(interp, mi) + ci = 
engine_reserve(interp, mi) + # check cache again if it is still new after reserving in the engine + let code = get(code_cache(interp), mi, nothing) + if code isa CodeInstance + # see if this code already exists in the cache + if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(code) + engine_reject(interp, ci) + code = codeinstance_for_const_with_code(interp, code) + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code + end + if ci_meets_requirement(code, source_mode) + engine_reject(interp, ci) + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code + end + end + end result = InferenceResult(mi, typeinf_lattice(interp)) + result.ci = ci frame = InferenceState(result, #=cache_mode=#:global, interp) - frame === nothing && return nothing + if frame === nothing + engine_reject(interp, ci) + return nothing + end typeinf(interp, frame) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - if isdefined(result, :ci) - if ci_meets_requirement(result.ci, source_mode, true) - # Inference result was cacheable and is in global cache. Return it. - return result.ci - elseif use_const_api(result.ci) - code = codeinstance_for_const_with_code(interp, result.ci) - @assert ci_meets_requirement(code, source_mode, false) - return code - end - end - - # Inference result is not cacheable or is was cacheable, but we do not want to - # store the source in the cache, but the caller wanted it anyway (e.g. for reflection). - # We construct a new CodeInstance for it that is not part of the cache hierarchy. - can_discard_trees = source_mode == SOURCE_MODE_NOT_REQUIRED || - is_result_constabi_eligible(result) - code = CodeInstance(interp, result) - if source_mode == SOURCE_MODE_ABI - # XXX: this should be using the CI from the cache, if possible: haskey(cache, mi) && (ci = cache[mi]) - code_cache(interp)[mi] = code - end - result.ci = code - finish!(interp, frame; can_discard_trees) - # If the caller cares about the code and this is constabi, still use our synthesis function - # anyway, because we will have not finished inferring the code inside the CodeInstance once - # we realized it was constabi, but we want reflection to pretend that we did. - if use_const_api(code) && source_mode == SOURCE_MODE_FORCE_SOURCE - return codeinstance_for_const_with_code(interp, code) - end - @assert ci_meets_requirement(code, source_mode, false) - return code + ci = result.ci # reload from result in case it changed + if source_mode == SOURCE_MODE_ABI && frame.cache_mode != CACHE_MODE_GLOBAL + # XXX: jl_type_infer somewhat ambiguously assumes this must be cached, while jl_ci_cache_lookup sort of ambiguously re-caches it + # XXX: this should be using the CI from the cache, if possible instead: haskey(cache, mi) && (ci = cache[mi]) + code_cache(interp)[mi] = ci + end + if source_mode == SOURCE_MODE_FORCE_SOURCE && use_const_api(ci) + # If the caller cares about the code and this is constabi, still use our synthesis function + # anyway, because we will have not finished inferring the code inside the CodeInstance once + # we realized it was constabi, but we want reflection to pretend that we did. + # XXX: the one user of this does not actually want this behavior, but it is required by the flag definition currently + ci = codeinstance_for_const_with_code(interp, ci) + @assert ci_meets_requirement(ci, source_mode) + return ci + end + if !ci_meets_requirement(ci, source_mode) + can_discard_trees = false + finish!(interp, frame; can_discard_trees) # redo finish! 
with the correct can_discard_trees parameter value + @assert ci_meets_requirement(ci, source_mode) + end + return ci end # compute (and cache) an inferred AST and return the inferred return type @@ -1219,15 +1234,32 @@ end typeinf_type(interp::AbstractInterpreter, match::MethodMatch) = typeinf_type(interp, specialize_method(match)) function typeinf_type(interp::AbstractInterpreter, mi::MethodInstance) + # n.b.: this could be replaced with @something(typeinf_ext(interp, mi, SOURCE_MODE_NOT_REQUIRED), return nothing).rettype start_time = ccall(:jl_typeinf_timing_begin, UInt64, ()) - code = get(code_cache(interp), mi, nothing) - if code isa CodeInstance - # see if this rettype already exists in the cache - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code.rettype + let code = get(code_cache(interp), mi, nothing) + if code isa CodeInstance + # see if this rettype already exists in the cache + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code.rettype + end + end + ci = engine_reserve(interp, mi) + let code = get(code_cache(interp), mi, nothing) + if code isa CodeInstance + engine_reject(interp, ci) + # see if this rettype already exists in the cache + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return code.rettype + end end result = InferenceResult(mi, typeinf_lattice(interp)) - typeinf(interp, result, :global) + result.ci = ci + frame = InferenceState(result, #=cache_mode=#:global, interp) + if frame === nothing + engine_reject(interp, ci) + return nothing + end + typeinf(interp, frame) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) is_inferred(result) || return nothing return widenconst(ignorelimited(result.result)) diff --git a/base/compiler/types.jl b/base/compiler/types.jl index a6f5488ef6703..eae7dfb84c6be 100644 --- a/base/compiler/types.jl +++ b/base/compiler/types.jl @@ -89,7 +89,7 @@ mutable struct InferenceResult effects::Effects # if optimization is finished analysis_results::AnalysisResults # AnalysisResults with e.g. 
result::ArgEscapeCache if optimized, otherwise NULL_ANALYSIS_RESULTS is_src_volatile::Bool # `src` has been cached globally as the compressed format already, allowing `src` to be used destructively - ci::CodeInstance # CodeInstance if this result has been added to the cache + ci::CodeInstance # CodeInstance if this result may be added to the cache function InferenceResult(mi::MethodInstance, argtypes::Vector{Any}, overridden_by_const::Union{Nothing,BitVector}) return new(mi, argtypes, overridden_by_const, nothing, nothing, nothing, WorldRange(), Effects(), Effects(), NULL_ANALYSIS_RESULTS, false) @@ -402,36 +402,15 @@ get_inference_world(interp::NativeInterpreter) = interp.world get_inference_cache(interp::NativeInterpreter) = interp.inf_cache cache_owner(interp::NativeInterpreter) = nothing -""" - already_inferred_quick_test(::AbstractInterpreter, ::MethodInstance) +engine_reserve(interp::AbstractInterpreter, mi::MethodInstance) = engine_reserve(mi, cache_owner(interp)) +engine_reserve(mi::MethodInstance, @nospecialize owner) = ccall(:jl_engine_reserve, Any, (Any, Any), mi, owner)::CodeInstance +# engine_fulfill(::AbstractInterpreter, ci::CodeInstance, src::CodeInfo) = ccall(:jl_engine_fulfill, Cvoid, (Any, Any), ci, src) # currently the same as engine_reject, so just use that one +engine_reject(::AbstractInterpreter, ci::CodeInstance) = ccall(:jl_engine_fulfill, Cvoid, (Any, Ptr{Cvoid}), ci, C_NULL) -For the `NativeInterpreter`, we don't need to do an actual cache query to know if something -was already inferred. If we reach this point, but the inference flag has been turned off, -then it's in the cache. This is purely for a performance optimization. -""" -already_inferred_quick_test(interp::NativeInterpreter, mi::MethodInstance) = !mi.inInference -already_inferred_quick_test(interp::AbstractInterpreter, mi::MethodInstance) = false -""" - lock_mi_inference(::AbstractInterpreter, mi::MethodInstance) - -Hint that `mi` is in inference to help accelerate bootstrapping. -This is particularly used by `NativeInterpreter` and helps us limit the amount of wasted -work we might do when inference is working on initially inferring itself by letting us -detect when inference is already in progress and not running a second copy on it. -This creates a data-race, but the entry point into this code from C (`jl_type_infer`) -already includes detection and restriction on recursion, so it is hopefully mostly a -benign problem, since it should really only happen during the first phase of bootstrapping -that we encounter this flag. -""" -lock_mi_inference(::NativeInterpreter, mi::MethodInstance) = (mi.inInference = true; nothing) -lock_mi_inference(::AbstractInterpreter, ::MethodInstance) = return - -""" -See `lock_mi_inference`. 
-""" -unlock_mi_inference(::NativeInterpreter, mi::MethodInstance) = (mi.inInference = false; nothing) -unlock_mi_inference(::AbstractInterpreter, ::MethodInstance) = return +function already_inferred_quick_test end +function lock_mi_inference end +function unlock_mi_inference end """ add_remark!(::AbstractInterpreter, sv::InferenceState, remark) diff --git a/doc/src/devdocs/locks.md b/doc/src/devdocs/locks.md index 50cdd738e3b34..8d6672842c3c8 100644 --- a/doc/src/devdocs/locks.md +++ b/doc/src/devdocs/locks.md @@ -71,19 +71,8 @@ The following is a level 5 lock The following are a level 6 lock, which can only recurse to acquire locks at lower levels: -> * codegen > * jl_modules_mutex -The following is an almost root lock (level end-1), meaning only the root look may be held when -trying to acquire it: - -> * typeinf -> -> > this one is perhaps one of the most tricky ones, since type-inference can be invoked from many -> > points -> > -> > currently the lock is merged with the codegen lock, since they call each other recursively - The following lock synchronizes IO operation. Be aware that doing any I/O (for example, printing warning messages or debug information) while holding any other lock listed above may result in pernicious and hard-to-find deadlocks. BE VERY CAREFUL! @@ -149,7 +138,7 @@ Module serializer : toplevel lock JIT & type-inference : codegen lock -MethodInstance/CodeInstance updates : Method->writelock, codegen lock +MethodInstance/CodeInstance updates : Method->writelock > * These are set at construction and immutable: > * specTypes @@ -157,26 +146,9 @@ MethodInstance/CodeInstance updates : Method->writelock, codegen lock > * def > * owner -> * These are set by `jl_type_infer` (while holding codegen lock): -> * cache -> * rettype -> * inferred - * valid ages - -> * `inInference` flag: -> * optimization to quickly avoid recurring into `jl_type_infer` while it is already running -> * actual state (of setting `inferred`, then `fptr`) is protected by codegen lock - > * Function pointers: -> * these transition once, from `NULL` to a value, while the codegen lock is held -> -> * Code-generator cache (the contents of `functionObjectsDecls`): -> * these can transition multiple times, but only while the codegen lock is held -> * it is valid to use old version of this, or block for new versions of this, so races are benign, -> as long as the code is careful not to reference other data in the method instance (such as `rettype`) -> and assume it is coordinated, unless also holding the codegen lock +> * these transition once, from `NULL` to a value, which is coordinated internal to the JIT > -LLVMContext : codegen lock Method : Method->writelock diff --git a/src/Makefile b/src/Makefile index e6c8ffcc7499d..eb7c9a6135a28 100644 --- a/src/Makefile +++ b/src/Makefile @@ -46,7 +46,7 @@ SRCS := \ simplevector runtime_intrinsics precompile jloptions mtarraylist \ threading scheduler stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ - crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall + crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine RT_LLVMLINK := CG_LLVMLINK := diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 1d7bbd710f26b..1d1ca87b1fc5f 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -307,7 +307,7 @@ jl_code_instance_t *jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_ 
jl_error("Refusing to automatically run type inference with custom cache lookup."); } else { - codeinst = jl_type_infer(mi, world, 0, SOURCE_MODE_ABI); + codeinst = jl_type_infer(mi, world, SOURCE_MODE_ABI); /* Even if this codeinst is ordinarily not cacheable, we need to force * it into the cache here, since it was explicitly requested and is * otherwise not reachable from anywhere in the system image. diff --git a/src/codegen.cpp b/src/codegen.cpp index 2bc40b1b8a26d..1fa10f91d281b 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -9895,7 +9895,7 @@ void jl_compile_workqueue( if (policy != CompilationPolicy::Default && jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { // Codegen lock is held, so SOURCE_MODE_FORCE_SOURCE_UNCACHED is not required - codeinst = jl_type_infer(codeinst->def, jl_atomic_load_relaxed(&codeinst->max_world), 0, SOURCE_MODE_FORCE_SOURCE); + codeinst = jl_type_infer(codeinst->def, jl_atomic_load_relaxed(&codeinst->max_world), SOURCE_MODE_FORCE_SOURCE); } if (codeinst) { orc::ThreadSafeModule result_m = diff --git a/src/engine.cpp b/src/engine.cpp new file mode 100644 index 0000000000000..f88ee5c006798 --- /dev/null +++ b/src/engine.cpp @@ -0,0 +1,159 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include +#include +#include +#include +#include +#include "julia.h" +#include "julia_internal.h" +#include "julia_assert.h" + +using namespace llvm; + +struct ReservationInfo { + int16_t tid = 0; + jl_code_instance_t *ci = nullptr; +}; + +struct InferKey { + jl_method_instance_t *mi = nullptr; + jl_value_t *owner = nullptr; +}; + +template<> struct llvm::DenseMapInfo { + using FirstInfo = DenseMapInfo; + using SecondInfo = DenseMapInfo; + + static inline InferKey getEmptyKey() { + return InferKey{FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()}; + } + + static inline InferKey getTombstoneKey() { + return InferKey{FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()}; + } + + static unsigned getHashValue(const InferKey& PairVal) { + return detail::combineHashValue(FirstInfo::getHashValue(PairVal.mi), + SecondInfo::getHashValue(PairVal.owner)); + } + + static bool isEqual(const InferKey &LHS, const InferKey &RHS) { + return LHS.mi == RHS.mi && LHS.owner == RHS.owner; + } +}; + +static std::mutex engine_lock; +static std::condition_variable engine_wait; +// map from MethodInstance to threadid that owns it currently for inference +static DenseMap Reservations; +// vector of which threads are blocked and which lease they need +static SmallVector Awaiting; // (this could be merged into ptls also) + + +#ifdef __cplusplus +extern "C" { +#endif + +jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner) +{ + jl_task_t *ct = jl_current_task; + ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph + jl_code_instance_t *ci = jl_new_codeinst_uninit(m, owner); // allocate a placeholder + JL_GC_PUSH1(&ci); + InferKey key = {m, owner}; + std::unique_lock lock(engine_lock); + auto tid = jl_atomic_load_relaxed(&ct->tid); + if ((signed)Awaiting.size() < tid + 1) + Awaiting.resize(tid + 1); + while (1) { + auto record = Reservations.find(key); + if (record == Reservations.end()) { + Reservations[key] = ReservationInfo{tid, ci}; + JL_GC_POP(); + return ci; + } + // before waiting, need to run deadlock/cycle detection + // there is a cycle if the thread holding our lease is blocked + // and waiting for (transitively) any lease that is held by this thread + 
auto wait_tid = record->second.tid; + while (1) { + if (wait_tid == tid) { + JL_GC_POP(); + ct->ptls->engine_nqueued--; + return ci; // break the cycle + } + if ((signed)Awaiting.size() <= wait_tid) + break; // no cycle, since it is running (and this should be unreachable) + auto key2 = Awaiting[wait_tid]; + if (key2.mi == nullptr) + break; // no cycle, since it is running + auto record2 = Reservations.find(key2); + if (record2 == Reservations.end()) + break; // no cycle, since it is about to resume + assert(wait_tid != record2->second.tid); + wait_tid = record2->second.tid; + } + int8_t gc_state = jl_gc_safe_enter(ct->ptls); + Awaiting[tid] = key; + engine_wait.wait(lock); + Awaiting[tid] = InferKey{}; + jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint + } +} + +int jl_engine_hasreserved(jl_method_instance_t *m, jl_value_t *owner) +{ + jl_task_t *ct = jl_current_task; + InferKey key = {m, owner}; + std::unique_lock lock(engine_lock); + auto record = Reservations.find(key); + return record != Reservations.end() && record->second.tid == jl_atomic_load_relaxed(&ct->tid); +} + +STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT +{ + return (bits & GC_MARKED) != 0; +} + +void jl_engine_sweep(jl_ptls_t *gc_all_tls_states) +{ + std::unique_lock lock(engine_lock); + bool any = false; + for (auto I = Reservations.begin(); I != Reservations.end(); ++I) { + jl_code_instance_t *ci = I->second.ci; + if (!gc_marked(jl_astaggedvalue(ci)->bits.gc)) { + auto tid = I->second.tid; + Reservations.erase(I); + jl_ptls_t ptls2 = gc_all_tls_states[tid]; + ptls2->engine_nqueued--; + any = true; + } + } + if (any) + engine_wait.notify_all(); +} + +void jl_engine_inhibit_finalizers(void) +{ +} + +void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src) +{ + jl_task_t *ct = jl_current_task; + std::unique_lock lock(engine_lock); + auto record = Reservations.find(InferKey{ci->def, ci->owner}); + if (record == Reservations.end() || record->second.ci != ci) + return; + assert(jl_atomic_load_relaxed(&ct->tid) == record->second.tid); + ct->ptls->engine_nqueued--; // re-enables finalizers, but doesn't immediately try to run them + Reservations.erase(record); + engine_wait.notify_all(); +} + + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 7882cef73bdb6..2a31d3b73f02b 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -55,7 +55,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) +static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { #ifdef JL_USE_GUARD_PAGE size_t guard_size = LLT_ALIGN(jl_guard_size, jl_page_size); @@ -111,7 +111,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } # endif -static void free_stack(void *stkbuf, size_t bufsz) +static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { #ifdef JL_USE_GUARD_PAGE size_t guard_size = LLT_ALIGN(jl_guard_size, jl_page_size); @@ -124,7 +124,7 @@ static void free_stack(void *stkbuf, size_t bufsz) } #endif -JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void) +JL_DLLEXPORT uint32_t jl_get_num_stack_mappings(void) JL_NOTSAFEPOINT { return jl_atomic_load_relaxed(&num_stack_mappings); } @@ -159,7 +159,7 @@ static unsigned select_pool(size_t nb) JL_NOTSAFEPOINT } -static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) +static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, 
bufsz); @@ -238,7 +238,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } -void sweep_stack_pools(void) +void sweep_stack_pools(void) JL_NOTSAFEPOINT { // Stack sweeping algorithm: // // deallocate stacks if we have too many sitting around unused diff --git a/src/gc.c b/src/gc.c index ad9ce0a55b38a..91eb057f30028 100644 --- a/src/gc.c +++ b/src/gc.c @@ -444,7 +444,7 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) if (ct == NULL) ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { + if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0 && ptls->engine_nqueued == 0) { run_finalizers(ct, 0); } } @@ -620,7 +620,7 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) } // explicitly scheduled objects for the sweepfunc callback -static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) +static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) JL_NOTSAFEPOINT { size_t p = 0; for (size_t i = 0; i < objs->len; i++) { @@ -638,7 +638,7 @@ static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) objs->len = p; } -static void gc_sweep_foreign_objs(void) +static void gc_sweep_foreign_objs(void) JL_NOTSAFEPOINT { assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { @@ -1584,8 +1584,11 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa // sweep over all memory that is being used and not in a pool static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { + sweep_stack_pools(); + gc_sweep_foreign_objs(); sweep_malloced_memory(); sweep_big(ptls, sweep_full); + jl_engine_sweep(gc_all_tls_states); } static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT @@ -3662,8 +3665,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); - sweep_stack_pools(); - gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); @@ -3953,7 +3954,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Only disable finalizers on current thread // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). 
- if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { + if (!ptls->finalizers_inhibited && ptls->locks.len == 0 && ptls->engine_nqueued == 0) { JL_TIMING(GC, GC_Finalizers); run_finalizers(ct, 0); } diff --git a/src/gc.h b/src/gc.h index c1fec6e71251e..26f53d2faa471 100644 --- a/src/gc.h +++ b/src/gc.h @@ -521,7 +521,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls); void gc_mark_loop_parallel(jl_ptls_t ptls, int master); void gc_sweep_pool_parallel(jl_ptls_t ptls); void gc_free_pages(void); -void sweep_stack_pools(void); +void sweep_stack_pools(void) JL_NOTSAFEPOINT; void jl_gc_debug_init(void); // GC pages diff --git a/src/gf.c b/src/gf.c index 75c0fd59f906e..5c7d4b77026aa 100644 --- a/src/gf.c +++ b/src/gf.c @@ -340,7 +340,7 @@ jl_datatype_t *jl_mk_builtin_func(jl_datatype_t *dt, const char *name, jl_fptr_a // returns the inferred source, and may cache the result in mi // if successful, also updates the mi argument to describe the validity of this src // if inference doesn't occur (or can't finish), returns NULL instead -jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int force, uint8_t source_mode) +jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, uint8_t source_mode) { if (jl_typeinf_func == NULL) return NULL; @@ -356,7 +356,7 @@ jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int fo jl_code_instance_t *ci = NULL; #ifdef ENABLE_INFERENCE - if (mi->inInference && !force) + if (jl_engine_hasreserved(mi, jl_nothing)) // don't recur on a thread on the same MethodInstance--force it to interpret it until the inference has finished return NULL; JL_TIMING(INFERENCE, INFERENCE); jl_value_t **fargs; @@ -382,7 +382,6 @@ jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int fo ct->ptls->in_pure_callback = 0; size_t last_age = ct->world_age; ct->world_age = jl_typeinf_world; - mi->inInference = 1; // first bit is for reentrant timing, // so adding 1 to the bit above performs // inference reentrancy counter addition. @@ -417,7 +416,6 @@ jl_code_instance_t *jl_type_infer(jl_method_instance_t *mi, size_t world, int fo ct->world_age = last_age; ct->reentrant_timing -= 0b10; ct->ptls->in_pure_callback = last_pure; - mi->inInference = 0; #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif @@ -567,8 +565,6 @@ JL_DLLEXPORT void jl_update_codeinst( uint32_t effects, jl_value_t *analysis_results, uint8_t relocatability, jl_debuginfo_t *edges /* , int absolute_max*/) { - jl_atomic_store_relaxed(&codeinst->min_world, min_world); - jl_atomic_store_relaxed(&codeinst->max_world, max_world); codeinst->relocatability = relocatability; codeinst->analysis_results = analysis_results; jl_gc_wb(codeinst, analysis_results); @@ -581,6 +577,47 @@ JL_DLLEXPORT void jl_update_codeinst( } jl_atomic_store_release(&codeinst->inferred, inferred); jl_gc_wb(codeinst, inferred); + jl_atomic_store_relaxed(&codeinst->min_world, min_world); // XXX: these should be unchanged? 
+ jl_atomic_store_relaxed(&codeinst->max_world, max_world); // since the edges shouldn't change after jl_fill_codeinst +} + +JL_DLLEXPORT void jl_fill_codeinst( + jl_code_instance_t *codeinst, + jl_value_t *rettype, jl_value_t *exctype, + jl_value_t *inferred_const, + int32_t const_flags, size_t min_world, size_t max_world, + uint32_t effects, jl_value_t *analysis_results, + jl_debuginfo_t *edges /* , int absolute_max*/) +{ + assert(min_world <= max_world && "attempting to set invalid world constraints"); + codeinst->rettype = rettype; + jl_gc_wb(codeinst, rettype); + codeinst->exctype = exctype; + jl_gc_wb(codeinst, exctype); + if ((const_flags & 2) != 0) { + codeinst->rettype_const = inferred_const; + jl_gc_wb(codeinst, inferred_const); + } + jl_atomic_store_relaxed(&codeinst->debuginfo, (jl_value_t*)edges == jl_nothing ? NULL : edges); + jl_gc_wb(codeinst, edges); + if ((const_flags & 1) != 0) { + // TODO: may want to follow ordering restrictions here (see jitlayers.cpp) + assert(const_flags & 2); + jl_atomic_store_release(&codeinst->invoke, jl_fptr_const_return); + } + jl_atomic_store_relaxed(&codeinst->ipo_purity_bits, effects); + codeinst->analysis_results = analysis_results; + assert(jl_atomic_load_relaxed(&codeinst->min_world) == 1); + assert(jl_atomic_load_relaxed(&codeinst->max_world) == 0); + jl_atomic_store_release(&codeinst->min_world, min_world); + jl_atomic_store_release(&codeinst->max_world, max_world); +} + +JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_uninit(jl_method_instance_t *mi, jl_value_t *owner) +{ + jl_code_instance_t *codeinst = jl_new_codeinst(mi, owner, NULL, NULL, NULL, NULL, 0, 0, 0, 0, NULL, 0, NULL); + jl_atomic_store_relaxed(&codeinst->min_world, 1); // make temporarily invalid before returning, so that jl_fill_codeinst is valid later + return codeinst; } JL_DLLEXPORT void jl_mi_cache_insert(jl_method_instance_t *mi JL_ROOTING_ARGUMENT, @@ -764,7 +801,7 @@ JL_DLLEXPORT void jl_set_typeinf_func(jl_value_t *f) for (i = 0, l = jl_array_nrows(unspec); i < l; i++) { jl_method_instance_t *mi = (jl_method_instance_t*)jl_array_ptr_ref(unspec, i); if (jl_rettype_inferred_native(mi, world, world) == jl_nothing) - jl_type_infer(mi, world, 1, SOURCE_MODE_NOT_REQUIRED); + jl_type_infer(mi, world, SOURCE_MODE_NOT_REQUIRED); } JL_GC_POP(); } @@ -2636,13 +2673,15 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t int is_recompile = jl_atomic_load_relaxed(&mi->cache) != NULL; // This codeinst hasn't been previously inferred do that now + // jl_type_infer will internally do a cache lookup and jl_engine_reserve call + // to synchronize this across threads if (!codeinst) { // Don't bother inferring toplevel thunks or macros - the performance cost of inference is likely // to significantly exceed the actual runtime. int should_skip_inference = !jl_is_method(mi->def.method) || jl_symbol_name(mi->def.method->name)[0] == '@'; if (!should_skip_inference) { - codeinst = jl_type_infer(mi, world, 0, SOURCE_MODE_ABI); + codeinst = jl_type_infer(mi, world, SOURCE_MODE_ABI); } } @@ -2661,7 +2700,8 @@ jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t if (jl_atomic_load_relaxed(&codeinst->invoke) == NULL) { // Something went wrong. Bail to the fallback path. 
codeinst = NULL; - } else if (did_compile) { + } + else if (did_compile) { record_precompile_statement(mi, compile_time); } JL_GC_POP(); @@ -2939,7 +2979,7 @@ static void _generate_from_hint(jl_method_instance_t *mi, size_t world) { jl_value_t *codeinst = jl_rettype_inferred_native(mi, world, world); if (codeinst == jl_nothing) { - (void)jl_type_infer(mi, world, 1, SOURCE_MODE_NOT_REQUIRED); + (void)jl_type_infer(mi, world, SOURCE_MODE_NOT_REQUIRED); codeinst = jl_rettype_inferred_native(mi, world, world); } if (codeinst != jl_nothing) { @@ -2980,10 +3020,10 @@ JL_DLLEXPORT void jl_compile_method_instance(jl_method_instance_t *mi, jl_tuplet JL_GC_POP(); jl_atomic_store_relaxed(&mi2->precompiled, 1); if (jl_rettype_inferred_native(mi2, world, world) == jl_nothing) - (void)jl_type_infer(mi2, world, 1, SOURCE_MODE_NOT_REQUIRED); + (void)jl_type_infer(mi2, world, SOURCE_MODE_NOT_REQUIRED); if (jl_typeinf_func && jl_atomic_load_relaxed(&mi->def.method->primary_world) <= tworld) { if (jl_rettype_inferred_native(mi2, tworld, tworld) == jl_nothing) - (void)jl_type_infer(mi2, tworld, 1, SOURCE_MODE_NOT_REQUIRED); + (void)jl_type_infer(mi2, tworld, SOURCE_MODE_NOT_REQUIRED); } } } @@ -4182,16 +4222,6 @@ JL_DLLEXPORT void jl_typeinf_timing_end(uint64_t start, int is_recompile) } } -JL_DLLEXPORT void jl_typeinf_lock_begin(void) -{ - JL_LOCK(&jl_codegen_lock); -} - -JL_DLLEXPORT void jl_typeinf_lock_end(void) -{ - JL_UNLOCK(&jl_codegen_lock); -} - #ifdef __cplusplus } #endif diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 5e70566ab310e..853469bf2086a 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -471,8 +471,6 @@ XX(jl_try_substrtof) \ XX(jl_tty_set_mode) \ XX(jl_typeassert) \ - XX(jl_typeinf_lock_begin) \ - XX(jl_typeinf_lock_end) \ XX(jl_typeinf_timing_begin) \ XX(jl_typeinf_timing_end) \ XX(jl_typename_str) \ diff --git a/src/jltypes.c b/src/jltypes.c index e857c3f6677a5..78ecb9231df7e 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3439,30 +3439,28 @@ void jl_init_types(void) JL_GC_DISABLED jl_method_instance_type = jl_new_datatype(jl_symbol("MethodInstance"), core, jl_any_type, jl_emptysvec, - jl_perm_symsvec(8, + jl_perm_symsvec(7, "def", "specTypes", "sparam_vals", "backedges", "cache", - "inInference", "cache_with_orig", "precompiled"), - jl_svec(8, + jl_svec(7, jl_new_struct(jl_uniontype_type, jl_method_type, jl_module_type), jl_any_type, jl_simplevector_type, jl_array_any_type, jl_any_type/*jl_code_instance_type*/, jl_bool_type, - jl_bool_type, jl_bool_type), jl_emptysvec, 0, 1, 3); // These fields should be constant, but Serialization wants to mutate them in initialization - //const static uint32_t method_instance_constfields[1] = { 0x00000007 }; // (1<<0)|(1<<1)|(1<<2); - const static uint32_t method_instance_atomicfields[1] = { 0x0000090 }; // (1<<4)|(1<<7); - //Fields 4 and 5 must be protected by method->write_lock, and thus all operations on jl_method_instance_t are threadsafe. TODO: except inInference + //const static uint32_t method_instance_constfields[1] = { 0b0000111 }; // fields 1, 2, 3 + const static uint32_t method_instance_atomicfields[1] = { 0b1010000 }; // fields 5, 7 + //Fields 4 and 5 must be protected by method->write_lock, and thus all operations on jl_method_instance_t are threadsafe. 
//jl_method_instance_type->name->constfields = method_instance_constfields; jl_method_instance_type->name->atomicfields = method_instance_atomicfields; diff --git a/src/julia.h b/src/julia.h index 5380128d06202..b05b4ce6f3216 100644 --- a/src/julia.h +++ b/src/julia.h @@ -416,7 +416,6 @@ struct _jl_method_instance_t { jl_svec_t *sparam_vals; // static parameter values, indexed by def.method->sig jl_array_t *backedges; // list of method-instances which call this method-instance; `invoke` records (invokesig, caller) pairs _Atomic(struct _jl_code_instance_t*) cache; - uint8_t inInference; // flags to tell if inference is running on this object uint8_t cache_with_orig; // !cache_with_specTypes _Atomic(uint8_t) precompiled; // true if this instance was generated by an explicit `precompile(...)` call }; diff --git a/src/julia_gcext.h b/src/julia_gcext.h index c65586b85547a..05140e4b09ace 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -39,8 +39,8 @@ typedef void (*jl_gc_cb_notify_gc_pressure_t)(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable); // Types for custom mark and sweep functions. -typedef uintptr_t (*jl_markfunc_t)(jl_ptls_t, jl_value_t *obj); -typedef void (*jl_sweepfunc_t)(jl_value_t *obj); +typedef uintptr_t (*jl_markfunc_t)(jl_ptls_t, jl_value_t *obj) JL_NOTSAFEPOINT; +typedef void (*jl_sweepfunc_t)(jl_value_t *obj) JL_NOTSAFEPOINT; // Function to create a new foreign type with custom // mark and sweep functions. @@ -60,10 +60,10 @@ JL_DLLEXPORT int jl_reinit_foreign_type( jl_markfunc_t markfunc, jl_sweepfunc_t sweepfunc); -JL_DLLEXPORT int jl_is_foreign_type(jl_datatype_t *dt); +JL_DLLEXPORT int jl_is_foreign_type(jl_datatype_t *dt) JL_NOTSAFEPOINT; -JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void); -JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void); +JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) JL_NOTSAFEPOINT; // Field layout descriptor for custom types that do // not fit Julia layout conventions. This is associated with @@ -80,9 +80,9 @@ JL_DLLEXPORT void *jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty); // Queue an object or array of objects for scanning by the garbage collector. // These functions must only be called from within a root scanner callback // or from within a custom mark function. -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj); +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs); + jl_value_t **objs, size_t nobjs) JL_NOTSAFEPOINT; // Sweep functions will not automatically be called for objects of // foreign types, as that may not always be desired. Only calling @@ -133,7 +133,7 @@ JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void); // jl_typeof(obj) is an actual type object. // // NOTE: Only valid to call from within a GC context. -JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p); +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) JL_NOTSAFEPOINT; // Return a non-null pointer to the start of the stack area if the task // has an associated stack buffer. In that case, *size will also contain @@ -150,7 +150,7 @@ JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *tid) // and may not be tight. 
JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, char **active_start, char **active_end, - char **total_start, char **total_end); + char **total_start, char **total_end) JL_NOTSAFEPOINT; #ifdef __cplusplus } diff --git a/src/julia_internal.h b/src/julia_internal.h index ae6c97d9c9a28..35e0ae384d83a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -654,7 +654,12 @@ typedef union { #define SOURCE_MODE_ABI 0x1 #define SOURCE_MODE_FORCE_SOURCE 0x2 -JL_DLLEXPORT jl_code_instance_t *jl_type_infer(jl_method_instance_t *li, size_t world, int force, uint8_t source_mode); +JL_DLLEXPORT jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner); +JL_DLLEXPORT void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src); +void jl_engine_sweep(jl_ptls_t *gc_all_tls_states) JL_NOTSAFEPOINT; +int jl_engine_hasreserved(jl_method_instance_t *m, jl_value_t *owner) JL_NOTSAFEPOINT; + +JL_DLLEXPORT jl_code_instance_t *jl_type_infer(jl_method_instance_t *li, size_t world, uint8_t source_mode); JL_DLLEXPORT jl_code_info_t *jl_gdbcodetyped1(jl_method_instance_t *mi, size_t world); JL_DLLEXPORT jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *meth JL_PROPAGATES_ROOT, size_t world); JL_DLLEXPORT jl_code_instance_t *jl_get_method_inferred( @@ -662,6 +667,7 @@ JL_DLLEXPORT jl_code_instance_t *jl_get_method_inferred( size_t min_world, size_t max_world, jl_debuginfo_t *edges); jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROPAGATES_ROOT); +JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_uninit(jl_method_instance_t *mi, jl_value_t *owner); JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst( jl_method_instance_t *mi, jl_value_t *owner, jl_value_t *rettype, jl_value_t *exctype, diff --git a/src/julia_threads.h b/src/julia_threads.h index beb0ab2eae9fc..404c13c63f3f0 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -268,6 +268,7 @@ typedef struct _jl_tls_states_t { // currently-held locks, to be released when an exception is thrown small_arraylist_t locks; + size_t engine_nqueued; JULIA_DEBUG_SLEEPWAKE( uint64_t uv_run_enter; diff --git a/src/method.c b/src/method.c index cb3d481cfc22c..095f18c692535 100644 --- a/src/method.c +++ b/src/method.c @@ -628,7 +628,6 @@ JL_DLLEXPORT jl_method_instance_t *jl_new_method_instance_uninit(void) mi->sparam_vals = jl_emptysvec; mi->backedges = NULL; jl_atomic_store_relaxed(&mi->cache, NULL); - mi->inInference = 0; mi->cache_with_orig = 0; jl_atomic_store_relaxed(&mi->precompiled, 0); return mi; diff --git a/src/toplevel.c b/src/toplevel.c index 22ef91d55fb01..6db7e1b7b5372 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -955,7 +955,7 @@ JL_DLLEXPORT jl_value_t *jl_toplevel_eval_flex(jl_module_t *JL_NONNULL m, jl_val size_t world = jl_atomic_load_acquire(&jl_world_counter); ct->world_age = world; if (!has_defs && jl_get_module_infer(m) != 0) { - (void)jl_type_infer(mfunc, world, 0, SOURCE_MODE_NOT_REQUIRED); + (void)jl_type_infer(mfunc, world, SOURCE_MODE_NOT_REQUIRED); } result = jl_invoke(/*func*/NULL, /*args*/NULL, /*nargs*/0, mfunc); ct->world_age = last_age; @@ -1038,7 +1038,7 @@ JL_DLLEXPORT jl_value_t *jl_infer_thunk(jl_code_info_t *thk, jl_module_t *m) JL_GC_PUSH1(&li); jl_resolve_globals_in_ir((jl_array_t*)thk->code, m, NULL, 0); jl_task_t *ct = jl_current_task; - jl_code_instance_t *ci = jl_type_infer(li, ct->world_age, 0, SOURCE_MODE_NOT_REQUIRED); + jl_code_instance_t *ci = jl_type_infer(li, ct->world_age, SOURCE_MODE_NOT_REQUIRED); JL_GC_POP(); if (ci) return 
ci->rettype; From 21e5e6a17951259b161d7c0351fd25fe342f46b2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 19 Jun 2024 14:17:26 +0000 Subject: [PATCH 2/3] fix deadlock misconfiguration There is not permitted to be either a safepoint or equivalently a safepoint region transition while a lock is being held that might be needed by GC as that forms a cycle. We must ensure that this thread is permitted to keep running, so that it becomes possible for it to reach the condition wait call and release the lock to keep in synchronized with the notify_all in the sweep. Alternatively we could use a timed_wait and continuous poll the GC (to ensure we actually run GC periodically), but I would rather that be an internal GC policy than an external forcing factor here. This prevents the GC from recruiting this thread (via a signal) to help GC even though it is observably sleeping, but we might consider later integrating a way for the GC to notify a set of condition variables upon starting to wake them and recruit them temporarily via a subsequent safepoint such as (in julia-syntax psuedocode): gc_safe_enter() lock(condvar) do while !condmet wait(condvar) while gc_running unlock(lock) unsafe_enter() gc_safepoint() unsafe_leave() lock(lock) end end end gc_safe_leave() --- src/engine.cpp | 11 +++++------ src/julia_threads.h | 10 ++++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index f88ee5c006798..6db4dce44e48e 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -63,6 +63,7 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph jl_code_instance_t *ci = jl_new_codeinst_uninit(m, owner); // allocate a placeholder JL_GC_PUSH1(&ci); + int8_t gc_state = jl_gc_safe_enter(ct->ptls); InferKey key = {m, owner}; std::unique_lock lock(engine_lock); auto tid = jl_atomic_load_relaxed(&ct->tid); @@ -72,6 +73,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner auto record = Reservations.find(key); if (record == Reservations.end()) { Reservations[key] = ReservationInfo{tid, ci}; + lock.unlock(); + jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint JL_GC_POP(); return ci; } @@ -81,6 +84,8 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner auto wait_tid = record->second.tid; while (1) { if (wait_tid == tid) { + lock.unlock(); + jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint JL_GC_POP(); ct->ptls->engine_nqueued--; return ci; // break the cycle @@ -96,11 +101,9 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner assert(wait_tid != record2->second.tid); wait_tid = record2->second.tid; } - int8_t gc_state = jl_gc_safe_enter(ct->ptls); Awaiting[tid] = key; engine_wait.wait(lock); Awaiting[tid] = InferKey{}; - jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint } } @@ -136,10 +139,6 @@ void jl_engine_sweep(jl_ptls_t *gc_all_tls_states) engine_wait.notify_all(); } -void jl_engine_inhibit_finalizers(void) -{ -} - void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src) { jl_task_t *ct = jl_current_task; diff --git a/src/julia_threads.h b/src/julia_threads.h index 404c13c63f3f0..f844882eb9307 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -352,10 +352,12 @@ STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls, return jl_gc_state_set(ptls, state, 
jl_atomic_load_relaxed(&ptls->gc_state)); } #ifdef __clang_gcanalyzer__ -int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; // this could be a safepoint, but we will assume it is not -void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER; -int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER; -void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE; // this might not be a safepoint, but we have to assume it could be (statically) +// these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically) +// however mark a delineated region in which safepoints would be not permissible +int8_t jl_gc_unsafe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_LEAVE; +void jl_gc_unsafe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_ENTER; +int8_t jl_gc_safe_enter(jl_ptls_t ptls) JL_NOTSAFEPOINT_ENTER; +void jl_gc_safe_leave(jl_ptls_t ptls, int8_t state) JL_NOTSAFEPOINT_LEAVE; #else #define jl_gc_unsafe_enter(ptls) jl_gc_state_save_and_set(ptls, JL_GC_STATE_UNSAFE) #define jl_gc_unsafe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), JL_GC_STATE_UNSAFE)) From e71f6877687efab56aa57cbb005ea828e06d09d2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Sat, 22 Jun 2024 18:52:57 +0000 Subject: [PATCH 3/3] workaround C11 standards bug This bug was fixed in C17, but we are using gcc 9.1, which caused misalignment here on 32-bit platforms. We have this behind a lock necessarily, so it could just be a boolean, but that might need some more code cleanup. --- src/julia_threads.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/julia_threads.h b/src/julia_threads.h index f844882eb9307..9d5160ab2ff64 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -261,7 +261,7 @@ typedef struct _jl_tls_states_t { jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; arraylist_t sweep_objs; - _Atomic(int64_t) gc_sweeps_requested; + _Atomic(size_t) gc_sweeps_requested; // Saved exception for previous *external* API call or NULL if cleared. // Access via jl_exception_occurred(). struct _jl_value_t *previous_exception;
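
Note on PATCH 3/3: the following is a minimal standalone C check, not part of the patch, illustrating the C11 alignment problem it works around. On an affected 32-bit x86 build (for example gcc 9.1 in C11 mode) the first two lines can both print 4, so an _Atomic(int64_t) field may land at a 4-byte-aligned offset even though lock-free 64-bit atomic operations want 8-byte alignment; the commit notes this was fixed in C17 (and newer GCC releases do give the atomic type the stricter alignment), and switching the field to _Atomic(size_t) sidesteps the issue entirely on 32-bit targets.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* compile with -std=c11 -m32 to observe the mismatch on affected compilers */
        printf("alignof(int64_t)         = %zu\n", _Alignof(int64_t));
        printf("alignof(_Atomic int64_t) = %zu\n", _Alignof(_Atomic int64_t));
        printf("alignof(_Atomic size_t)  = %zu\n", _Alignof(_Atomic size_t));
        return 0;
    }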
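
Stepping back to PATCH 1/3, the double-checked reservation protocol its message describes (and which typeinf_ext follows on the Julia side) looks roughly like this when written against the new C entry points. This is a hedged sketch rather than code from the patch: lookup_cache is a hypothetical stand-in for the caller's cache probe (the patch itself uses code_cache(interp) in Julia and jl_rettype_inferred_native in C), and the actual inference work plus error handling are elided.

    #include "julia.h"
    #include "julia_internal.h" // declares jl_engine_reserve / jl_engine_fulfill

    // Hypothetical cache probe: returns the cached CodeInstance, or NULL if absent.
    extern jl_code_instance_t *lookup_cache(jl_method_instance_t *mi, size_t world);

    jl_code_instance_t *infer_once(jl_method_instance_t *mi, size_t world)
    {
        jl_code_instance_t *existing = lookup_cache(mi, world);
        if (existing)
            return existing;             // fast path: already inferred by someone else
        // Reserve the (mi, owner) identity (jl_nothing = the native interpreter's owner).
        // This blocks while another thread holds the reservation, unless waiting would
        // form a cycle, and hands back a placeholder CodeInstance.
        jl_code_instance_t *ci = jl_engine_reserve(mi, jl_nothing);
        // Double-check: the result may have been published while we were waiting.
        existing = lookup_cache(mi, world);
        if (existing) {
            jl_engine_fulfill(ci, NULL); // drop the unused reservation
            return existing;
        }
        // ... run inference, populate ci (jl_fill_codeinst), and cache it globally ...
        jl_engine_fulfill(ci, NULL);     // release the reservation so waiters wake up
        return ci;
    }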