diff --git a/Make.inc b/Make.inc
index 609f698059a13..5bbb52617aa98 100644
--- a/Make.inc
+++ b/Make.inc
@@ -89,6 +89,9 @@ WITH_GC_DEBUG_ENV := 0
 # Enable DTrace support
 WITH_DTRACE := 0
 
+# Enable ITTAPI integration
+WITH_ITTAPI := 0
+
 # Prevent picking up $ARCH from the environment variables
 ARCH:=
 
@@ -303,6 +306,9 @@ private_libdir := $(libdir)/julia
 endif
 build_private_libdir := $(build_libdir)/julia
 
+private_libexecdir := $(libexecdir)/julia
+build_private_libexecdir := $(build_libexecdir)/julia
+
 # A helper functions for dealing with lazily-evaluated, expensive operations..  Spinning
 # up a python process to, for exaxmple, parse a TOML file is expensive, and we must wait
 # until the TOML files are on-disk before we can parse them.  This means that we cannot
@@ -327,7 +333,7 @@ define cache_rel_path
 $(1)_rel_eval = $(call rel_path,$(2),$($(1)))
 $(1)_rel = $$(call hit_cache,$(1)_rel_eval)
 endef
-$(foreach D,libdir private_libdir datarootdir libexecdir docdir sysconfdir includedir,$(eval $(call cache_rel_path,$(D),$(bindir))))
+$(foreach D,libdir private_libdir datarootdir libexecdir private_libexecdir docdir sysconfdir includedir,$(eval $(call cache_rel_path,$(D),$(bindir))))
 $(foreach D,build_libdir build_private_libdir,$(eval $(call cache_rel_path,$(D),$(build_bindir))))
 
 # Save a special one: reverse_private_libdir_rel: usually just `../`, but good to be general:
@@ -733,7 +739,12 @@ ifeq ($(WITH_DTRACE), 1)
 JCXXFLAGS += -DUSE_DTRACE
 JCFLAGS += -DUSE_DTRACE
 DTRACE := dtrace
-else
+endif
+
+ifeq ($(WITH_ITTAPI), 1)
+JCXXFLAGS += -DUSE_ITTAPI
+JCFLAGS += -DUSE_ITTAPI
+LIBITTAPI:=-littnotify
 endif
 
 # ===========================================================================
diff --git a/Makefile b/Makefile
index 9c3dd44b704cd..ef0ade09e20b3 100644
--- a/Makefile
+++ b/Makefile
@@ -238,7 +238,7 @@ endef
 
 install: $(build_depsbindir)/stringreplace $(BUILDROOT)/doc/_build/html/en/index.html
 	@$(MAKE) $(QUIET_MAKE) $(JULIA_BUILD_MODE)
-	@for subdir in $(bindir) $(datarootdir)/julia/stdlib/$(VERSDIR) $(docdir) $(man1dir) $(includedir)/julia $(libdir) $(private_libdir) $(sysconfdir) $(libexecdir); do \
+	@for subdir in $(bindir) $(datarootdir)/julia/stdlib/$(VERSDIR) $(docdir) $(man1dir) $(includedir)/julia $(libdir) $(private_libdir) $(sysconfdir) $(private_libexecdir); do \
 		mkdir -p $(DESTDIR)$$subdir; \
 	done
 
@@ -253,8 +253,8 @@ else ifeq ($(JULIA_BUILD_MODE),debug)
 	-$(INSTALL_M) $(build_libdir)/libjulia-internal-debug.dll.a $(DESTDIR)$(libdir)/
 endif
 
-	# We have a single exception; we want 7z.dll to live in libexec, not bin, so that 7z.exe can find it.
-	-mv $(DESTDIR)$(bindir)/7z.dll $(DESTDIR)$(libexecdir)/
+	# We have a single exception; we want 7z.dll to live in private_libexecdir, not bindir, so that 7z.exe can find it.
+	-mv $(DESTDIR)$(bindir)/7z.dll $(DESTDIR)$(private_libexecdir)/
 	-$(INSTALL_M) $(build_bindir)/libopenlibm.dll.a $(DESTDIR)$(libdir)/
 	-$(INSTALL_M) $(build_libdir)/libssp.dll.a $(DESTDIR)$(libdir)/
 	# The rest are compiler dependencies, as an example memcpy is exported by msvcrt
@@ -311,14 +311,14 @@ endif
 		done \
 	done
 endif
-	# Install `7z` into libexec/
-	$(INSTALL_M) $(build_bindir)/7z$(EXE) $(DESTDIR)$(libexecdir)/
+	# Install `7z` into private_libexecdir
+	$(INSTALL_M) $(build_bindir)/7z$(EXE) $(DESTDIR)$(private_libexecdir)/
 
-	# Install `lld` into libexec/
-	$(INSTALL_M) $(build_depsbindir)/lld$(EXE) $(DESTDIR)$(libexecdir)/
+	# Install `lld` into private_libexecdir
+	$(INSTALL_M) $(build_depsbindir)/lld$(EXE) $(DESTDIR)$(private_libexecdir)/
 
-	# Install `dsymutil` into libexec/
-	$(INSTALL_M) $(build_depsbindir)/dsymutil$(EXE) $(DESTDIR)$(libexecdir)/
+	# Install `dsymutil` into private_libexecdir
+	$(INSTALL_M) $(build_depsbindir)/dsymutil$(EXE) $(DESTDIR)$(private_libexecdir)/
 
 	# Copy public headers
 	cp -R -L $(build_includedir)/julia/* $(DESTDIR)$(includedir)/julia
diff --git a/THIRDPARTY.md b/THIRDPARTY.md
index 4a35bbdb1b7ce..51950d9e2c6a1 100644
--- a/THIRDPARTY.md
+++ b/THIRDPARTY.md
@@ -24,6 +24,10 @@ own licenses:
 - [LLVM](https://releases.llvm.org/12.0.1/LICENSE.TXT) [APACHE 2.0 with LLVM Exception]
 - [UTF8PROC](https://github.com/JuliaStrings/utf8proc) [MIT]
 
+and optionally:
+
+- [ITTAPI](https://github.com/intel/ittapi/blob/master/LICENSES/BSD-3-Clause.txt) [BSD-3]
+
 Julia's `stdlib` uses the following external libraries, which have their own licenses:
 
 - [DSFMT](https://github.com/MersenneTwister-Lab/dSFMT/blob/master/LICENSE.txt) [BSD-3]
diff --git a/base/Makefile b/base/Makefile
index d92302b766988..0ea0359c8cc8e 100644
--- a/base/Makefile
+++ b/base/Makefile
@@ -66,6 +66,7 @@ ifeq ($(OS),WINNT)
 	@printf 'const LIBDIR = "%s"\n' '$(subst /,\\,$(libdir_rel))' >> $@
 	@printf 'const LIBEXECDIR = "%s"\n' '$(subst /,\\,$(libexecdir_rel))' >> $@
 	@printf 'const PRIVATE_LIBDIR = "%s"\n' '$(subst /,\\,$(private_libdir_rel))' >> $@
+	@printf 'const PRIVATE_LIBEXECDIR = "%s"\n' '$(subst /,\\,$(private_libexecdir_rel))' >> $@
 	@printf 'const INCLUDEDIR = "%s"\n' '$(subst /,\\,$(includedir_rel))' >> $@
 else
 	@echo "const SYSCONFDIR = \"$(sysconfdir_rel)\"" >> $@
@@ -74,6 +75,7 @@ else
 	@echo "const LIBDIR = \"$(libdir_rel)\"" >> $@
 	@echo "const LIBEXECDIR = \"$(libexecdir_rel)\"" >> $@
 	@echo "const PRIVATE_LIBDIR = \"$(private_libdir_rel)\"" >> $@
+	@echo "const PRIVATE_LIBEXECDIR = \"$(private_libexecdir_rel)\"" >> $@
 	@echo "const INCLUDEDIR = \"$(includedir_rel)\"" >> $@
 endif
 ifeq ($(DARWIN_FRAMEWORK), 1)
diff --git a/base/abstractarray.jl b/base/abstractarray.jl
index 2e1d885ca5a3f..9fc6b3fa1f457 100644
--- a/base/abstractarray.jl
+++ b/base/abstractarray.jl
@@ -1982,12 +1982,16 @@ julia> cat(1, [2], [3;;]; dims=Val(2))
 
 # The specializations for 1 and 2 inputs are important
 # especially when running with --inline=no, see #11158
+# The specializations for Union{AbstractVecOrMat,Number} are necessary
+# to have more specialized methods here than in LinearAlgebra/uniformscaling.jl
 vcat(A::AbstractArray) = cat(A; dims=Val(1))
 vcat(A::AbstractArray, B::AbstractArray) = cat(A, B; dims=Val(1))
 vcat(A::AbstractArray...) = cat(A...; dims=Val(1))
+vcat(A::Union{AbstractVecOrMat,Number}...) = cat(A...; dims=Val(1))
 hcat(A::AbstractArray) = cat(A; dims=Val(2))
 hcat(A::AbstractArray, B::AbstractArray) = cat(A, B; dims=Val(2))
 hcat(A::AbstractArray...) = cat(A...; dims=Val(2))
+hcat(A::Union{AbstractVecOrMat,Number}...) = cat(A...; dims=Val(2))
 
 typed_vcat(T::Type, A::AbstractArray) = _cat_t(Val(1), T, A)
 typed_vcat(T::Type, A::AbstractArray, B::AbstractArray) = _cat_t(Val(1), T, A, B)
@@ -2137,6 +2141,8 @@ end
 
 hvcat(rows::Tuple{Vararg{Int}}, xs::Number...) = typed_hvcat(promote_typeof(xs...), rows, xs...)
 hvcat(rows::Tuple{Vararg{Int}}, xs...) = typed_hvcat(promote_eltypeof(xs...), rows, xs...)
+# the following method is needed to provide a more specific one compared to LinearAlgebra/uniformscaling.jl
+hvcat(rows::Tuple{Vararg{Int}}, xs::Union{AbstractVecOrMat,Number}...) = typed_hvcat(promote_eltypeof(xs...), rows, xs...)
 
 function typed_hvcat(::Type{T}, rows::Tuple{Vararg{Int}}, xs::Number...) where T
     nr = length(rows)
diff --git a/base/array.jl b/base/array.jl
index 5257caabf2d45..1d0a641bd0040 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -177,11 +177,11 @@ function _unsetindex!(A::Array{T}, i::Int) where {T}
     t = @_gc_preserve_begin A
     p = Ptr{Ptr{Cvoid}}(pointer(A, i))
     if !allocatedinline(T)
-        unsafe_store!(p, C_NULL)
+        Intrinsics.atomic_pointerset(p, C_NULL, :monotonic)
     elseif T isa DataType
         if !datatype_pointerfree(T)
-            for j = 1:(Core.sizeof(T) ÷ Core.sizeof(Ptr{Cvoid}))
-                unsafe_store!(p, C_NULL, j)
+            for j = 1:Core.sizeof(Ptr{Cvoid}):Core.sizeof(T)
+                Intrinsics.atomic_pointerset(p + j - 1, C_NULL, :monotonic)
             end
         end
     end
@@ -1916,7 +1916,7 @@ function reverse!(v::AbstractVector, start::Integer, stop::Integer=lastindex(v))
     return v
 end
 
-# concatenations of homogeneous combinations of vectors, horizontal and vertical
+# concatenations of (in)homogeneous combinations of vectors, horizontal and vertical
 
 vcat() = Vector{Any}()
 hcat() = Vector{Any}()
@@ -1930,6 +1930,7 @@ function hcat(V::Vector{T}...) where T
     end
     return [ V[j][i]::T for i=1:length(V[1]), j=1:length(V) ]
 end
+hcat(A::Vector...) = cat(A...; dims=Val(2)) # more special than SparseArrays's hcat
 
 function vcat(arrays::Vector{T}...) where T
     n = 0
@@ -1946,6 +1947,19 @@ function vcat(arrays::Vector{T}...) where T
     end
     return arr
 end
+vcat(A::Vector...) = cat(A...; dims=Val(1)) # more special than SparseArrays's vcat
+
+# disambiguation with LinearAlgebra/special.jl
+# Union{Number,Vector,Matrix} is for LinearAlgebra._DenseConcatGroup
+# VecOrMat{T} is for LinearAlgebra._TypedDenseConcatGroup
+hcat(A::Union{Number,Vector,Matrix}...) = cat(A...; dims=Val(2))
+hcat(A::VecOrMat{T}...) where {T} = typed_hcat(T, A...)
+vcat(A::Union{Number,Vector,Matrix}...) = cat(A...; dims=Val(1))
+vcat(A::VecOrMat{T}...) where {T} = typed_vcat(T, A...)
+hvcat(rows::Tuple{Vararg{Int}}, xs::Union{Number,Vector,Matrix}...) =
+    typed_hvcat(promote_eltypeof(xs...), rows, xs...)
+hvcat(rows::Tuple{Vararg{Int}}, xs::VecOrMat{T}...) where {T} =
+    typed_hvcat(T, rows, xs...)
 
 _cat(n::Integer, x::Integer...) = reshape([x...], (ntuple(Returns(1), n-1)..., length(x)))
 
diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl
index 1ee84f50edf2f..b17a48f893cd1 100644
--- a/base/compiler/abstractinterpretation.jl
+++ b/base/compiler/abstractinterpretation.jl
@@ -204,7 +204,9 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
     if seen ≠ napplicable
         # there is unanalyzed candidate, widen type and effects to the top
         rettype = Any
-        all_effects = Effects()
+        # there may be unanalyzed effects within unseen dispatch candidate,
+        # but we can still ignore nonoverlayed effect here since we already accounted for it
+        all_effects = merge_effects(all_effects, EFFECTS_UNKNOWN)
     elseif isa(matches, MethodMatches) ? (!matches.fullmatch || any_ambig(matches)) :
             (!all(matches.fullmatches) || any_ambig(matches))
         # Account for the fact that we may encounter a MethodError with a non-covered or ambiguous signature.
@@ -1304,6 +1306,14 @@ function ssa_def_slot(@nospecialize(arg), sv::InferenceState)
     return arg
 end
 
+struct AbstractIterationResult
+    cti::Vector{Any}                  # inferred element types of the iterated container
+    info::MaybeAbstractIterationInfo  # metadata for the abstract `iterate` calls, or `nothing` when none was recorded
+    ai_effects::Effects               # extra effects of the iteration; `abstract_apply` merges these into the call's effects
+end
+AbstractIterationResult(cti::Vector{Any}, info::MaybeAbstractIterationInfo) =
+    AbstractIterationResult(cti, info, EFFECTS_TOTAL) # two-argument form defaults `ai_effects` to `EFFECTS_TOTAL`
+
 # `typ` is the inferred type for expression `arg`.
 # if the expression constructs a container (e.g. `svec(x,y,z)`),
 # refine its type to an array of element types.
@@ -1311,14 +1321,17 @@ end
 # returns an array of types
 function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft), @nospecialize(typ),
                                 sv::Union{InferenceState, IRCode})
-    if isa(typ, PartialStruct) && typ.typ.name === Tuple.name
-        return typ.fields, nothing
+    if isa(typ, PartialStruct)
+        widet = typ.typ
+        if isa(widet, DataType) && widet.name === Tuple.name
+            return AbstractIterationResult(typ.fields, nothing)
+        end
     end
 
     if isa(typ, Const)
         val = typ.val
         if isa(val, SimpleVector) || isa(val, Tuple)
-            return Any[ Const(val[i]) for i in 1:length(val) ], nothing # avoid making a tuple Generator here!
+            return AbstractIterationResult(Any[ Const(val[i]) for i in 1:length(val) ], nothing) # avoid making a tuple Generator here!
         end
     end
 
@@ -1333,12 +1346,12 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft)
     if isa(tti, Union)
         utis = uniontypes(tti)
         if any(@nospecialize(t) -> !isa(t, DataType) || !(t <: Tuple) || !isknownlength(t), utis)
-            return Any[Vararg{Any}], nothing
+            return AbstractIterationResult(Any[Vararg{Any}], nothing, EFFECTS_UNKNOWN′)
         end
         ltp = length((utis[1]::DataType).parameters)
         for t in utis
             if length((t::DataType).parameters) != ltp
-                return Any[Vararg{Any}], nothing
+                return AbstractIterationResult(Any[Vararg{Any}], nothing)
             end
         end
         result = Any[ Union{} for _ in 1:ltp ]
@@ -1349,12 +1362,12 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft)
                 result[j] = tmerge(result[j], rewrap_unionall(tps[j], tti0))
             end
         end
-        return result, nothing
+        return AbstractIterationResult(result, nothing)
     elseif tti0 <: Tuple
         if isa(tti0, DataType)
-            return Any[ p for p in tti0.parameters ], nothing
+            return AbstractIterationResult(Any[ p for p in tti0.parameters ], nothing)
         elseif !isa(tti, DataType)
-            return Any[Vararg{Any}], nothing
+            return AbstractIterationResult(Any[Vararg{Any}], nothing)
         else
             len = length(tti.parameters)
             last = tti.parameters[len]
@@ -1363,12 +1376,14 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft)
             if va
                 elts[len] = Vararg{elts[len]}
             end
-            return elts, nothing
+            return AbstractIterationResult(elts, nothing)
         end
-    elseif tti0 === SimpleVector || tti0 === Any
-        return Any[Vararg{Any}], nothing
+    elseif tti0 === SimpleVector
+        return AbstractIterationResult(Any[Vararg{Any}], nothing)
+    elseif tti0 === Any
+        return AbstractIterationResult(Any[Vararg{Any}], nothing, EFFECTS_UNKNOWN′)
     elseif tti0 <: Array
-        return Any[Vararg{eltype(tti0)}], nothing
+        return AbstractIterationResult(Any[Vararg{eltype(tti0)}], nothing)
     else
         return abstract_iteration(interp, itft, typ, sv)
     end
@@ -1379,7 +1394,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
     if isa(itft, Const)
         iteratef = itft.val
     else
-        return Any[Vararg{Any}], nothing
+        return AbstractIterationResult(Any[Vararg{Any}], nothing, EFFECTS_UNKNOWN′)
     end
     @assert !isvarargtype(itertype)
     call = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[itft, itertype]), StmtInfo(true), sv)
@@ -1389,7 +1404,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
     # WARNING: Changes to the iteration protocol must be reflected here,
     # this is not just an optimization.
     # TODO: this doesn't realize that Array, SimpleVector, Tuple, and NamedTuple do not use the iterate protocol
-    stateordonet === Bottom && return Any[Bottom], AbstractIterationInfo(CallMeta[CallMeta(Bottom, call.effects, info)])
+    stateordonet === Bottom && return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(CallMeta[CallMeta(Bottom, call.effects, info)], true))
     valtype = statetype = Bottom
     ret = Any[]
     calls = CallMeta[call]
@@ -1399,7 +1414,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
     # length iterators, or interesting prefix
     while true
         if stateordonet_widened === Nothing
-            return ret, AbstractIterationInfo(calls)
+            return AbstractIterationResult(ret, AbstractIterationInfo(calls, true))
         end
         if Nothing <: stateordonet_widened || length(ret) >= InferenceParams(interp).MAX_TUPLE_SPLAT
             break
@@ -1411,7 +1426,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
         # If there's no new information in this statetype, don't bother continuing,
         # the iterator won't be finite.
         if ⊑(typeinf_lattice(interp), nstatetype, statetype)
-            return Any[Bottom], nothing
+            return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), EFFECTS_THROWS)
         end
         valtype = getfield_tfunc(typeinf_lattice(interp), stateordonet, Const(1))
         push!(ret, valtype)
@@ -1441,7 +1456,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
                 # ... but cannot terminate
                 if !may_have_terminated
                     #  ... and cannot have terminated prior to this loop
-                    return Any[Bottom], nothing
+                    return AbstractIterationResult(Any[Bottom], AbstractIterationInfo(calls, false), EFFECTS_UNKNOWN′)
                 else
                     # iterator may have terminated prior to this loop, but not during it
                     valtype = Bottom
@@ -1451,13 +1466,15 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
         end
         valtype = tmerge(valtype, nounion.parameters[1])
         statetype = tmerge(statetype, nounion.parameters[2])
-        stateordonet = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv).rt
+        call = abstract_call_known(interp, iteratef, ArgInfo(nothing, Any[Const(iteratef), itertype, statetype]), StmtInfo(true), sv)
+        push!(calls, call)
+        stateordonet = call.rt
         stateordonet_widened = widenconst(stateordonet)
     end
     if valtype !== Union{}
         push!(ret, Vararg{valtype})
     end
-    return ret, nothing
+    return AbstractIterationResult(ret, AbstractIterationInfo(calls, false))
 end
 
 # do apply(af, fargs...), where af is a function value
@@ -1488,13 +1505,9 @@ function abstract_apply(interp::AbstractInterpreter, argtypes::Vector{Any}, si::
         infos′ = Vector{MaybeAbstractIterationInfo}[]
         for ti in (splitunions ? uniontypes(aargtypes[i]) : Any[aargtypes[i]])
             if !isvarargtype(ti)
-                cti_info = precise_container_type(interp, itft, ti, sv)
-                cti = cti_info[1]::Vector{Any}
-                info = cti_info[2]::MaybeAbstractIterationInfo
+                (;cti, info, ai_effects) = precise_container_type(interp, itft, ti, sv)
             else
-                cti_info = precise_container_type(interp, itft, unwrapva(ti), sv)
-                cti = cti_info[1]::Vector{Any}
-                info = cti_info[2]::MaybeAbstractIterationInfo
+                (;cti, info, ai_effects) = precise_container_type(interp, itft, unwrapva(ti), sv)
                 # We can't represent a repeating sequence of the same types,
                 # so tmerge everything together to get one type that represents
                 # everything.
@@ -1507,6 +1520,12 @@ function abstract_apply(interp::AbstractInterpreter, argtypes::Vector{Any}, si::
                 end
                 cti = Any[Vararg{argt}]
             end
+            effects = merge_effects(effects, ai_effects)
+            if info !== nothing
+                for call in info.each
+                    effects = merge_effects(effects, call.effects)
+                end
+            end
             if any(@nospecialize(t) -> t === Bottom, cti)
                 continue
             end
diff --git a/base/compiler/inferencestate.jl b/base/compiler/inferencestate.jl
index ca2e1acc2ee94..efee2357fff15 100644
--- a/base/compiler/inferencestate.jl
+++ b/base/compiler/inferencestate.jl
@@ -459,7 +459,18 @@ function sptypes_from_meth_instance(linfo::MethodInstance)
         v = sp[i]
         if v isa TypeVar
             fromArg = 0
-            maybe_undef = !constrains_param(v, linfo.specTypes, #=covariant=#true)
+            maybe_undef = !(let sig=sig
+                # if the specialized signature `linfo.specTypes` doesn't contain any free
+                # type variables, we can use it for a more accurate analysis of whether `v`
+                # is constrained or not, otherwise we should use `def.sig` which never
+                # contains any free type variables
+                if !has_free_typevars(linfo.specTypes)
+                    sig = linfo.specTypes
+                else
+                    @assert !has_free_typevars(sig)
+                end
+                constrains_param(v, sig, #=covariant=#true)
+            end)
             temp = sig
             for j = 1:i-1
                 temp = temp.body
diff --git a/base/compiler/ssair/inlining.jl b/base/compiler/ssair/inlining.jl
index c9fcb32ac58ab..c1c4ee18e8742 100644
--- a/base/compiler/ssair/inlining.jl
+++ b/base/compiler/ssair/inlining.jl
@@ -729,7 +729,7 @@ function rewrite_apply_exprargs!(todo::Vector{Pair{Int,Any}},
         def = argexprs[i]
         def_type = argtypes[i]
         thisarginfo = arginfos[i-arg_start]
-        if thisarginfo === nothing
+        if thisarginfo === nothing || !thisarginfo.complete
             if def_type isa PartialStruct
                 # def_type.typ <: Tuple is assumed
                 def_argtypes = def_type.fields
@@ -1141,9 +1141,9 @@ function inline_apply!(todo::Vector{Pair{Int,Any}},
         for i = (arg_start + 1):length(argtypes)
             thisarginfo = nothing
             if !is_valid_type_for_apply_rewrite(argtypes[i], state.params)
-                if isa(info, ApplyCallInfo) && info.arginfo[i-arg_start] !== nothing
-                    thisarginfo = info.arginfo[i-arg_start]
-                else
+                isa(info, ApplyCallInfo) || return nothing
+                thisarginfo = info.arginfo[i-arg_start]
+                if thisarginfo === nothing || !thisarginfo.complete
                     return nothing
                 end
             end
diff --git a/base/compiler/ssair/irinterp.jl b/base/compiler/ssair/irinterp.jl
index a6a4e52c38963..3ce968eb1131c 100644
--- a/base/compiler/ssair/irinterp.jl
+++ b/base/compiler/ssair/irinterp.jl
@@ -66,7 +66,17 @@ function kill_def_use!(tpdum::TwoPhaseDefUseMap, def::Int, use::Int)
     if !tpdum.complete
         tpdum.ssa_uses[def] -= 1
     else
-        @assert false && "TODO"
+        range = tpdum.ssa_uses[def]:(def == length(tpdum.ssa_uses) ? length(tpdum.data) : (tpdum.ssa_uses[def + 1] - 1))
+        # TODO: Sorted
+        useidx = findfirst(idx->tpdum.data[idx] == use, range)
+        @assert useidx !== nothing
+        idx = range[useidx]
+        # Shift the remaining uses of `def` left over the killed slot (0 marks an unused slot), then zero the vacated slot
+        while idx < last(range) && tpdum.data[idx+1] != 0
+            tpdum.data[idx] = tpdum.data[idx+1]
+            idx += 1
+        end
+        tpdum.data[idx] = 0
     end
 end
 kill_def_use!(tpdum::TwoPhaseDefUseMap, def::SSAValue, use::Int) =
@@ -261,11 +271,11 @@ function process_terminator!(ir::IRCode, idx::Int, bb::Int,
         end
         return false
     elseif isa(inst, GotoNode)
-        backedge = inst.label < bb
+        backedge = inst.label <= bb
         !backedge && push!(ip, inst.label)
         return backedge
     elseif isa(inst, GotoIfNot)
-        backedge = inst.dest < bb
+        backedge = inst.dest <= bb
         !backedge && push!(ip, inst.dest)
         push!(ip, bb + 1)
         return backedge
diff --git a/base/compiler/stmtinfo.jl b/base/compiler/stmtinfo.jl
index 556c0082e4532..23f8c3aba908e 100644
--- a/base/compiler/stmtinfo.jl
+++ b/base/compiler/stmtinfo.jl
@@ -114,6 +114,7 @@ Each (abstract) call to `iterate`, corresponds to one entry in `ainfo.each::Vect
 """
 struct AbstractIterationInfo
     each::Vector{CallMeta}
+    complete::Bool
 end
 
 const MaybeAbstractIterationInfo = Union{Nothing, AbstractIterationInfo}
diff --git a/base/errorshow.jl b/base/errorshow.jl
index 636357827a32a..be94c0997d102 100644
--- a/base/errorshow.jl
+++ b/base/errorshow.jl
@@ -925,7 +925,7 @@ Experimental.register_error_hint(noncallable_number_hint_handler, MethodError)
 # (probably attempting concatenation)
 function string_concatenation_hint_handler(io, ex, arg_types, kwargs)
     @nospecialize
-    if (ex.f == +) && all(i -> i <: AbstractString, arg_types)
+    if (ex.f === +) && all(i -> i <: AbstractString, arg_types)
         print(io, "\nString concatenation is performed with ")
         printstyled(io, "*", color=:cyan)
         print(io, " (See also: https://docs.julialang.org/en/v1/manual/strings/#man-concatenation).")
diff --git a/base/initdefs.jl b/base/initdefs.jl
index 97a67c88fe713..89ebecaefbdc4 100644
--- a/base/initdefs.jl
+++ b/base/initdefs.jl
@@ -315,6 +315,9 @@ end
     set_active_project(projfile::Union{AbstractString,Nothing})
 
 Set the active `Project.toml` file to `projfile`. See also [`Base.active_project`](@ref).
+
+!!! compat "Julia 1.8"
+    This function requires at least Julia 1.8.
 """
 function set_active_project(projfile::Union{AbstractString,Nothing})
     ACTIVE_PROJECT[] = projfile
diff --git a/base/linking.jl b/base/linking.jl
index 38cbdcd562693..4dd6d9e7a644f 100644
--- a/base/linking.jl
+++ b/base/linking.jl
@@ -49,8 +49,8 @@ end
 
 function __init_lld_path()
     # Prefer our own bundled lld, but if we don't have one, pick it up off of the PATH
-    # If this is an in-tree build, `lld` will live in `tools`.  Otherwise, it'll be in `libexec`
-    for bundled_lld_path in (joinpath(Sys.BINDIR, Base.LIBEXECDIR, lld_exe),
+    # If this is an in-tree build, `lld` will live in `tools`.  Otherwise, it'll be in `private_libexecdir`
+    for bundled_lld_path in (joinpath(Sys.BINDIR, Base.PRIVATE_LIBEXECDIR, lld_exe),
                              joinpath(Sys.BINDIR, "..", "tools", lld_exe),
                              joinpath(Sys.BINDIR, lld_exe))
         if isfile(bundled_lld_path)
@@ -64,7 +64,7 @@ end
 
 function __init_dsymutil_path()
     #Same as with lld but for dsymutil
-    for bundled_dsymutil_path in (joinpath(Sys.BINDIR, Base.LIBEXECDIR, dsymutil_exe),
+    for bundled_dsymutil_path in (joinpath(Sys.BINDIR, Base.PRIVATE_LIBEXECDIR, dsymutil_exe),
                              joinpath(Sys.BINDIR, "..", "tools", dsymutil_exe),
                              joinpath(Sys.BINDIR, dsymutil_exe))
         if isfile(bundled_dsymutil_path)
diff --git a/base/loading.jl b/base/loading.jl
index edbfb96211917..477b8b6bfd8e9 100644
--- a/base/loading.jl
+++ b/base/loading.jl
@@ -1080,7 +1080,6 @@ function register_restored_modules(sv::SimpleVector, pkg::PkgId, path::String)
 end
 
 function run_package_callbacks(modkey::PkgId)
-    run_extension_callbacks(modkey)
     assert_havelock(require_lock)
     unlock(require_lock)
     try
@@ -1197,60 +1196,65 @@ function run_extension_callbacks(extid::ExtensionId)
         true
     catch
         # Try to continue loading if loading an extension errors
+        errs = current_exceptions()
         @error "Error during loading of extension $(extid.id.name) of $(extid.parentid.name), \
-                use `Base.retry_load_extensions()` to retry."
+                use `Base.retry_load_extensions()` to retry." exception=errs
         false
     end
     return succeeded
 end
 
-function run_extension_callbacks(pkgid::PkgId)
+function run_extension_callbacks()
     assert_havelock(require_lock)
-    # take ownership of extids that depend on this pkgid
-    extids = pop!(EXT_DORMITORY, pkgid, nothing)
-    extids === nothing && return
-    for extid in extids
-        if extid.ntriggers > 0
-            # It is possible that pkgid was loaded in an environment
-            # below the one of the parent. This will cause a load failure when the
-            # pkg ext tries to load the triggers. Therefore, check this first
-            # before loading the pkg ext.
-            pkgenv = Base.identify_package_env(extid.id, pkgid.name)
-            ext_not_allowed_load = false
-            if pkgenv === nothing
-                ext_not_allowed_load = true
-            else
-                pkg, env = pkgenv
-                path = Base.locate_package(pkg, env)
-                if path === nothing
+    loaded_triggers = collect(intersect(keys(Base.loaded_modules), keys(Base.EXT_DORMITORY)))
+    sort!(loaded_triggers; by=x->x.uuid)
+    for pkgid in loaded_triggers
+        # take ownership of extids that depend on this pkgid
+        extids = pop!(EXT_DORMITORY, pkgid, nothing)
+        extids === nothing && continue
+        for extid in extids
+            if extid.ntriggers > 0
+                # It is possible that pkgid was loaded in an environment
+                # below the one of the parent. This will cause a load failure when the
+                # pkg ext tries to load the triggers. Therefore, check this first
+                # before loading the pkg ext.
+                pkgenv = Base.identify_package_env(extid.id, pkgid.name)
+                ext_not_allowed_load = false
+                if pkgenv === nothing
                     ext_not_allowed_load = true
+                else
+                    pkg, env = pkgenv
+                    path = Base.locate_package(pkg, env)
+                    if path === nothing
+                        ext_not_allowed_load = true
+                    end
+                end
+                if ext_not_allowed_load
+                    @debug "Extension $(extid.id.name) of $(extid.parentid.name) will not be loaded \
+                            since $(pkgid.name) loaded in environment lower in load path"
+                    # indicate extid is expected to fail
+                    extid.ntriggers *= -1
+                else
+                    # indicate pkgid is loaded
+                    extid.ntriggers -= 1
                 end
             end
-            if ext_not_allowed_load
-                @debug "Extension $(extid.id.name) of $(extid.parentid.name) will not be loaded \
-                        since $(pkgid.name) loaded in environment lower in load path"
-                # indicate extid is expected to fail
-                extid.ntriggers *= -1
-            else
+            if extid.ntriggers < 0
                 # indicate pkgid is loaded
-                extid.ntriggers -= 1
+                extid.ntriggers += 1
+                succeeded = false
+            else
+                succeeded = true
+            end
+            if extid.ntriggers == 0
+                # actually load extid, now that all dependencies are met,
+                # and record the result
+                succeeded = succeeded && run_extension_callbacks(extid)
+                succeeded || push!(EXT_DORMITORY_FAILED, extid)
             end
-        end
-        if extid.ntriggers < 0
-            # indicate pkgid is loaded
-            extid.ntriggers += 1
-            succeeded = false
-        else
-            succeeded = true
-        end
-        if extid.ntriggers == 0
-            # actually load extid, now that all dependencies are met,
-            # and record the result
-            succeeded = succeeded && run_extension_callbacks(extid)
-            succeeded || push!(EXT_DORMITORY_FAILED, extid)
         end
     end
-    nothing
+    return
 end
 
 """
@@ -1627,6 +1631,10 @@ function _require_prelocked(uuidkey::PkgId, env=nothing)
     else
         newm = root_module(uuidkey)
     end
+    # Load extensions when not precompiling and not in a nested package load
+    if JLOptions().incremental == 0 && isempty(package_locks)
+        run_extension_callbacks()
+    end
     return newm
 end
 
diff --git a/base/partr.jl b/base/partr.jl
index c5bb6603d53af..a02272ceab202 100644
--- a/base/partr.jl
+++ b/base/partr.jl
@@ -179,13 +179,12 @@ function multiq_deletemin()
     return task
 end
 
-
 function multiq_check_empty()
-    for j = UInt32(1):length(heaps)
-        for i = UInt32(1):length(heaps[j])
-            if heaps[j][i].ntasks != 0
-                return false
-            end
+    tid = Threads.threadid()
+    tp = ccall(:jl_threadpoolid, Int8, (Int16,), tid-1) + 1
+    for i = UInt32(1):length(heaps[tp])
+        if heaps[tp][i].ntasks != 0
+            return false
         end
     end
     return true
diff --git a/base/process.jl b/base/process.jl
index 55df523c1f7d2..ed51a30ae3ced 100644
--- a/base/process.jl
+++ b/base/process.jl
@@ -413,7 +413,7 @@ process failed, or if the process attempts to print anything to stdout.
 """
 function open(f::Function, cmds::AbstractCmd, args...; kwargs...)
     P = open(cmds, args...; kwargs...)
-    function waitkill(P::Process)
+    function waitkill(P::Union{Process,ProcessChain})
         close(P)
         # 0.1 seconds after we hope it dies (from closing stdio),
         # we kill the process with SIGTERM (15)
diff --git a/base/sort.jl b/base/sort.jl
index 9f37807b1c7cc..5ea8d37e446dc 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -1927,6 +1927,7 @@ julia> map(x->issorted(x[k]), (s1, s2))
 
 julia> s1[k] == s2[k]
 true
+```
 """
 struct PartialQuickSort{T <: Union{Integer,OrdinalRange}} <: Algorithm
     k::T
diff --git a/base/task.jl b/base/task.jl
index ce34d2f179fc5..63d0e9b6bd757 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -767,22 +767,33 @@ end
 
 function enq_work(t::Task)
     (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable")
-    if t.sticky || Threads.threadpoolsize() == 1
+
+    # Sticky tasks go into their thread's work queue.
+    if t.sticky
         tid = Threads.threadid(t)
         if tid == 0
-            # Issue #41324
-            # t.sticky && tid == 0 is a task that needs to be co-scheduled with
-            # the parent task. If the parent (current_task) is not sticky we must
-            # set it to be sticky.
-            # XXX: Ideally we would be able to unset this
-            current_task().sticky = true
+            # The task is not yet stuck to a thread. Stick it to the current
+            # thread and do the same to the parent task (the current task) so
+            # that the tasks are correctly co-scheduled (issue #41324).
+            # XXX: Ideally we would be able to unset this.
             tid = Threads.threadid()
             ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1)
+            current_task().sticky = true
         end
         push!(workqueue_for(tid), t)
     else
-        Partr.multiq_insert(t, t.priority)
-        tid = 0
+        tp = Threads.threadpool(t)
+        if Threads.threadpoolsize(tp) == 1
+            # There's only one thread in the task's assigned thread pool;
+            # use its work queue.
+            tid = (tp === :default) ? 1 : Threads.threadpoolsize(:default)+1
+            ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1)
+            push!(workqueue_for(tid), t)
+        else
+            # Otherwise, put the task in the multiqueue.
+            Partr.multiq_insert(t, t.priority)
+            tid = 0
+        end
     end
     ccall(:jl_wakeup_thread, Cvoid, (Int16,), (tid - 1) % Int16)
     return t
diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl
index 271d6ea9f7664..ff7b0278d8eba 100644
--- a/base/threadingconstructs.jl
+++ b/base/threadingconstructs.jl
@@ -32,16 +32,7 @@ See also `BLAS.get_num_threads` and `BLAS.set_num_threads` in the [`LinearAlgebr
 man-linalg) standard library, and `nprocs()` in the [`Distributed`](@ref man-distributed)
 standard library and [`Threads.maxthreadid()`](@ref).
 """
-function nthreads(pool::Symbol)
-    if pool === :default
-        tpid = Int8(0)
-    elseif pool === :interactive
-        tpid = Int8(1)
-    else
-        error("invalid threadpool specified")
-    end
-    return _nthreads_in_pool(tpid)
-end
+nthreads(pool::Symbol) = threadpoolsize(pool)
 
 function _nthreads_in_pool(tpid::Int8)
     p = unsafe_load(cglobal(:jl_n_threads_per_pool, Ptr{Cint}))
@@ -66,15 +57,25 @@ Returns the number of threadpools currently configured.
 nthreadpools() = Int(unsafe_load(cglobal(:jl_n_threadpools, Cint)))
 
 """
-    Threads.threadpoolsize()
+    Threads.threadpoolsize(pool::Symbol = :default) -> Int
 
-Get the number of threads available to the Julia default worker-thread pool.
+Get the number of threads available to the given thread pool, which must be
+`:default` (used when `pool` is omitted) or `:interactive`.
 
 See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the
 [`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the
 [`Distributed`](@ref man-distributed) standard library.
 """
-threadpoolsize() = Threads._nthreads_in_pool(Int8(0))
+function threadpoolsize(pool::Symbol = :default)
+    if pool === :default
+        tpid = Int8(0)
+    elseif pool === :interactive
+        tpid = Int8(1)
+    else
+        error("invalid threadpool specified")
+    end
+    return _nthreads_in_pool(tpid)
+end
 
 function threading_run(fun, static)
     ccall(:jl_enter_threaded_region, Cvoid, ())
@@ -343,7 +344,11 @@ macro spawn(args...)
         let $(letargs...)
             local task = Task($thunk)
             task.sticky = false
-            ccall(:jl_set_task_threadpoolid, Cint, (Any, Int8), task, $tpid)
+            local tpid_actual = $tpid
+            if _nthreads_in_pool(tpid_actual) == 0
+                tpid_actual = Int8(0)
+            end
+            ccall(:jl_set_task_threadpoolid, Cint, (Any, Int8), task, tpid_actual)
             if $(Expr(:islocal, var))
                 put!($var, task)
             end
diff --git a/base/timing.jl b/base/timing.jl
index e082c09156b84..c994889d8902c 100644
--- a/base/timing.jl
+++ b/base/timing.jl
@@ -18,8 +18,8 @@ struct GC_Num
     full_sweep      ::Cint
     max_pause       ::Int64
     max_memory      ::Int64
-    time_to_safepoint             ::Int64
-    max_time_to_safepointp        ::Int64
+    time_to_safepoint           ::Int64
+    max_time_to_safepoint       ::Int64
     sweep_time      ::Int64
     mark_time       ::Int64
     total_sweep_time  ::Int64
diff --git a/contrib/refresh_checksums.mk b/contrib/refresh_checksums.mk
index fc632728e9a9e..bc32f9dfc2b2e 100644
--- a/contrib/refresh_checksums.mk
+++ b/contrib/refresh_checksums.mk
@@ -28,7 +28,7 @@ BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwi
 BB_GCC_EXPANDED_PROJECTS=openblas csl
 BB_CXX_EXPANDED_PROJECTS=gmp llvm clang llvm-tools lld
 # These are non-BB source-only deps
-NON_BB_PROJECTS=patchelf mozillacert lapack libwhich utf8proc
+NON_BB_PROJECTS=patchelf mozillacert lapack libwhich utf8proc ittapi
 
 ifneq ($(VERBOSE),1)
 QUIET_MAKE := -s
diff --git a/deps/Makefile b/deps/Makefile
index 4f0cc48b01971..e1f2f6cfc2c12 100644
--- a/deps/Makefile
+++ b/deps/Makefile
@@ -151,6 +151,16 @@ ifeq ($(USE_SYSTEM_P7ZIP), 0)
 DEP_LIBS += p7zip
 endif
 
+# ittapi is needed by a source-built LLVM with Intel JIT events and by WITH_ITTAPI builds
+ifeq ($(USE_INTEL_JITEVENTS), 1)
+ifeq ($(USE_BINARYBUILDER_LLVM), 0)
+DEP_LIBS += ittapi
+endif
+endif
+
+ifeq ($(WITH_ITTAPI),1)
+DEP_LIBS += ittapi
+endif
 
 # Only compile standalone LAPACK if we are not using OpenBLAS.
 # OpenBLAS otherwise compiles LAPACK as part of its build.
@@ -172,7 +182,7 @@ endif
 DEP_LIBS_STAGED_ALL := llvm llvm-tools clang llvmunwind unwind libuv pcre \
 	openlibm dsfmt blastrampoline openblas lapack gmp mpfr patchelf utf8proc \
 	objconv mbedtls libssh2 nghttp2 curl libgit2 libwhich zlib p7zip csl \
-	libsuitesparse lld
+	libsuitesparse lld ittapi
 DEP_LIBS_ALL := $(DEP_LIBS_STAGED_ALL)
 
 ifneq ($(USE_BINARYBUILDER_OPENBLAS),0)
@@ -207,6 +217,7 @@ distcleanall: $(addprefix distclean-, $(DEP_LIBS_ALL))
 getall: $(addprefix get-, $(DEP_LIBS_ALL))
 
 include $(SRCDIR)/csl.mk
+include $(SRCDIR)/ittapi.mk
 include $(SRCDIR)/llvm.mk
 include $(SRCDIR)/libuv.mk
 include $(SRCDIR)/pcre.mk
diff --git a/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/md5 b/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/md5
new file mode 100644
index 0000000000000..7f9cf58dbc425
--- /dev/null
+++ b/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/md5
@@ -0,0 +1 @@
+82e01a597e35e2f3f53a04ddc2776a8c
diff --git a/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/sha512 b/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/sha512
new file mode 100644
index 0000000000000..cfcfad9f002dc
--- /dev/null
+++ b/deps/checksums/Pkg-1b73599d2ed8ef26ded339b1a3e80b6f26afd553.tar.gz/sha512
@@ -0,0 +1 @@
+7a1179bed5c0b1fdc2f8e826ca89ae33713b24d5fa065d9ced1bb78386d9bad876ffe486e11400c3d67946ff77d58bdddf1309892c4a4f662c07b9ad243ae76a
diff --git a/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/md5 b/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/md5
deleted file mode 100644
index 7b3a24a6a0c8b..0000000000000
--- a/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/md5
+++ /dev/null
@@ -1 +0,0 @@
-58075bda169e76716f06c2ad9a0885e6
diff --git a/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/sha512 b/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/sha512
deleted file mode 100644
index 0d82e16d4da30..0000000000000
--- a/deps/checksums/Pkg-3ced87de6b48ac8b886f5b26b2a1e8dd764614ae.tar.gz/sha512
+++ /dev/null
@@ -1 +0,0 @@
-071f016efa5bf24599395eee018dc7ebde1f106ac2ef366cd739a5973126f4256a8e0c2b787207ae29467d94284821af0afe48491418c79a0e16f9bd3afe8898
diff --git a/deps/checksums/ittapi b/deps/checksums/ittapi
new file mode 100644
index 0000000000000..896e44d8f2907
--- /dev/null
+++ b/deps/checksums/ittapi
@@ -0,0 +1,2 @@
+ittapi-0014aec56fea2f30c1374f40861e1bccdd53d0cb.tar.gz/md5/932501cdb0e1c7841e23c12da7740419
+ittapi-0014aec56fea2f30c1374f40861e1bccdd53d0cb.tar.gz/sha512/4dd3343837398ada0cdcdaaff630d8d91738d166897d86b77770facde30da99dbb90931b58a4a887399e6bc9a7a1c245057d0a0f63762230d577d71da871701f
diff --git a/deps/ittapi.mk b/deps/ittapi.mk
new file mode 100644
index 0000000000000..1a47c3ae89390
--- /dev/null
+++ b/deps/ittapi.mk
@@ -0,0 +1,43 @@
+## ittapi ##
+include $(SRCDIR)/ittapi.version
+
+ITTAPI_GIT_URL := https://github.com/intel/ittapi.git
+ITTAPI_TAR_URL = https://api.github.com/repos/intel/ittapi/tarball/$1
+$(eval $(call git-external,ittapi,ITTAPI,CMakeLists.txt,,$(SRCCACHE)))
+
+ITTAPI_OPTS := $(CMAKE_COMMON) -DCMAKE_BUILD_TYPE=Release -DITT_API_IPT_SUPPORT= -DITT_API_FORTRAN_SUPPORT=0
+
+$(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-configured: $(SRCCACHE)/$(ITTAPI_SRC_DIR)/source-extracted
+	mkdir -p $(dir $@)
+	cd $(dir $@) && \
+	$(CMAKE) $(dir $<) $(ITTAPI_OPTS)
+	echo 1 > $@
+
+$(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-compiled: $(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-configured
+	$(MAKE) -C $(dir $<)
+	echo 1 > $@
+
+define ITTAPI_INSTALL
+	mkdir -p $2/$$(build_libdir)
+	mkdir -p $2/$$(build_includedir)/ittapi
+	cp -a $1/bin/libittnotify.a $2/$$(build_libdir)
+	cp -a $1/bin/libjitprofiling.a $2/$$(build_libdir)
+	# cp -a $1/bin/libadvisor.a $2/$$(build_libdir)
+	cp -a $(SRCCACHE)/$(ITTAPI_SRC_DIR)/include/ittnotify.h $2/$$(build_includedir)/ittapi/
+	cp -a $(SRCCACHE)/$(ITTAPI_SRC_DIR)/include/ittnotify-zca.h $2/$$(build_includedir)/ittapi/
+	cp -a $(SRCCACHE)/$(ITTAPI_SRC_DIR)/include/jitprofiling.h $2/$$(build_includedir)/ittapi/
+endef
+
+$(eval $(call staged-install, \
+	ittapi,$(ITTAPI_SRC_DIR), \
+	ITTAPI_INSTALL,,,))
+
+get-ittapi: $(ITTAPI_SRC_FILE)
+extract-ittapi: $(SRCCACHE)/$(ITTAPI_SRC_DIR)/source-extracted
+configure-ittapi: $(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-configured
+compile-ittapi: $(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-compiled
+fastcheck-ittapi: #none
+check-ittapi: #none
+
+clean-ittapi: # drop the build stamp and the static libraries copied by ITTAPI_INSTALL
+	-rm -f $(BUILDDIR)/$(ITTAPI_SRC_DIR)/build-compiled $(build_libdir)/libittnotify.a $(build_libdir)/libjitprofiling.a
diff --git a/deps/ittapi.version b/deps/ittapi.version
new file mode 100644
index 0000000000000..81afb6de2add2
--- /dev/null
+++ b/deps/ittapi.version
@@ -0,0 +1,3 @@
+## source build
+ITTAPI_BRANCH=v3.24.0
+ITTAPI_SHA1=0014aec56fea2f30c1374f40861e1bccdd53d0cb
diff --git a/deps/llvm.mk b/deps/llvm.mk
index 78d037ec126d0..e64da6908a276 100644
--- a/deps/llvm.mk
+++ b/deps/llvm.mk
@@ -120,7 +120,7 @@ ifeq ($(USE_LLVM_SHLIB),1)
 LLVM_CMAKE += -DLLVM_BUILD_LLVM_DYLIB:BOOL=ON -DLLVM_LINK_LLVM_DYLIB:BOOL=ON
 endif
 ifeq ($(USE_INTEL_JITEVENTS), 1)
-LLVM_CMAKE += -DLLVM_USE_INTEL_JITEVENTS:BOOL=ON
+LLVM_CMAKE += -DLLVM_USE_INTEL_JITEVENTS:BOOL=ON -DITTAPI_SOURCE_DIR=$(SRCCACHE)/$(ITTAPI_SRC_DIR)
 endif # USE_INTEL_JITEVENTS
 
 ifeq ($(USE_OPROFILE_JITEVENTS), 1)
@@ -286,6 +286,11 @@ configure-llvm: $(LLVM_BUILDDIR_withtype)/build-configured
 compile-llvm: $(LLVM_BUILDDIR_withtype)/build-compiled
 fastcheck-llvm: #none
 check-llvm: $(LLVM_BUILDDIR_withtype)/build-checked
+
+ifeq ($(USE_INTEL_JITEVENTS),1)
+extract-llvm: $(SRCCACHE)/$(ITTAPI_SRC_DIR)/source-extracted
+endif
+
 #todo: LLVM make check target is broken on julia.mit.edu (and really slow elsewhere)
 
 else # USE_BINARYBUILDER_LLVM
diff --git a/doc/src/base/constants.md b/doc/src/base/constants.md
index 4ba0e627b0c54..14ddbc02698d0 100644
--- a/doc/src/base/constants.md
+++ b/doc/src/base/constants.md
@@ -23,6 +23,3 @@ See also:
   * [`stderr`](@ref)
   * [`ENV`](@ref)
   * [`ENDIAN_BOM`](@ref)
-  * `Libc.MS_ASYNC`
-  * `Libc.MS_INVALIDATE`
-  * `Libc.MS_SYNC`
diff --git a/doc/src/devdocs/build/build.md b/doc/src/devdocs/build/build.md
index e812e383c0592..6d5d4a54c8d64 100644
--- a/doc/src/devdocs/build/build.md
+++ b/doc/src/devdocs/build/build.md
@@ -187,6 +187,7 @@ uses are listed in [`deps/$(libname).version`](https://github.com/JuliaLang/juli
 - **[mbedtls]**              — library used for cryptography and transport layer security, used by libssh2
 - **[utf8proc]**             — a library for processing UTF-8 encoded Unicode strings.
 - **[LLVM libunwind]**       — LLVM's fork of [libunwind], a library that determines the call-chain of a program.
+- **[ITTAPI]**               — Intel's Instrumentation and Tracing Technology and Just-In-Time API.
 
 [GNU make]:     https://www.gnu.org/software/make
 [patch]:        https://www.gnu.org/software/patch
@@ -222,6 +223,7 @@ uses are listed in [`deps/$(libname).version`](https://github.com/JuliaLang/juli
 [pkg-config]:   https://www.freedesktop.org/wiki/Software/pkg-config/
 [powershell]:   https://docs.microsoft.com/en-us/powershell/scripting/wmf/overview
 [which]:        https://carlowood.github.io/which/
+[ITTAPI]:       https://github.com/intel/ittapi
 
 ## Build dependencies
 
diff --git a/src/array.c b/src/array.c
index ae89087502627..0b582296774b5 100644
--- a/src/array.c
+++ b/src/array.c
@@ -627,7 +627,7 @@ JL_DLLEXPORT void jl_arrayunset(jl_array_t *a, size_t i)
     if (i >= jl_array_len(a))
         jl_bounds_error_int((jl_value_t*)a, i + 1);
     if (a->flags.ptrarray)
-        jl_atomic_store_release(((_Atomic(jl_value_t*)*)a->data) + i, NULL);
+        jl_atomic_store_relaxed(((_Atomic(jl_value_t*)*)a->data) + i, NULL);
     else if (a->flags.hasptr) {
         size_t elsize = a->elsize;
         jl_assume(elsize >= sizeof(void*) && elsize % sizeof(void*) == 0);
diff --git a/src/codegen.cpp b/src/codegen.cpp
index f7adf9c3f6efd..462e9c0fb93ef 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -8421,11 +8421,11 @@ jl_llvm_functions_t jl_emit_code(
     JL_CATCH {
         // Something failed! This is very, very bad.
         // Try to pretend that it isn't and attempt to recover.
-        const char *mname = m.getModuleUnlocked()->getModuleIdentifier().data();
+        std::string mname = m.getModuleUnlocked()->getModuleIdentifier();
         m = orc::ThreadSafeModule();
         decls.functionObject = "";
         decls.specFunctionObject = "";
-        jl_printf((JL_STREAM*)STDERR_FILENO, "Internal error: encountered unexpected error during compilation of %s:\n", mname);
+        jl_printf((JL_STREAM*)STDERR_FILENO, "Internal error: encountered unexpected error during compilation of %s:\n", mname.c_str());
         jl_static_show((JL_STREAM*)STDERR_FILENO, jl_current_exception());
         jl_printf((JL_STREAM*)STDERR_FILENO, "\n");
         jlbacktrace(); // written to STDERR_FILENO
diff --git a/src/datatype.c b/src/datatype.c
index 17f5d53e59d23..c8805076d4109 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -68,7 +68,7 @@ JL_DLLEXPORT jl_typename_t *jl_new_typename_in(jl_sym_t *name, jl_module_t *modu
     tn->name = name;
     tn->module = module;
     tn->wrapper = NULL;
-    jl_atomic_store_release(&tn->Typeofwrapper, NULL);
+    jl_atomic_store_relaxed(&tn->Typeofwrapper, NULL);
     jl_atomic_store_relaxed(&tn->cache, jl_emptysvec);
     jl_atomic_store_relaxed(&tn->linearcache, jl_emptysvec);
     tn->names = NULL;
diff --git a/src/gc-stacks.c b/src/gc-stacks.c
index 40292cf472037..b35c1722c82ff 100644
--- a/src/gc-stacks.c
+++ b/src/gc-stacks.c
@@ -165,9 +165,11 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO
         ssize = LLT_ALIGN(ssize, jl_page_size);
     }
     if (stk == NULL) {
-        if (jl_atomic_load_relaxed(&num_stack_mappings) >= MAX_STACK_MAPPINGS)
+        if (jl_atomic_load_relaxed(&num_stack_mappings) >= MAX_STACK_MAPPINGS) {
             // we accept that this can go over by as much as nthreads since it's not a CAS
+            errno = ENOMEM;
             return NULL;
+        }
         // TODO: allocate blocks of stacks? but need to mprotect individually anyways
         stk = malloc_stack(ssize);
         if (stk == MAP_FAILED)
diff --git a/src/gc.c b/src/gc.c
index c6f338725dd1b..da79a58065b8c 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -184,6 +184,13 @@ pagetable_t memory_map;
 // List of marked big objects.  Not per-thread.  Accessed only by master thread.
 bigval_t *big_objects_marked = NULL;
 
+// Eytzinger tree of images. Used for very fast jl_object_in_image queries during gc
+// See https://algorithmica.org/en/eytzinger
+static arraylist_t eytzinger_image_tree;
+static arraylist_t eytzinger_idxs;
+static uintptr_t gc_img_min;
+static uintptr_t gc_img_max;
+
 // -- Finalization --
 // `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers.
 // If an object pointer has the lowest bit set, the next pointer is an unboxed c function pointer.
@@ -194,6 +201,118 @@ arraylist_t finalizer_list_marked;
 arraylist_t to_finalize;
 JL_DLLEXPORT _Atomic(int) jl_gc_have_pending_finalizers = 0;
 
+static int ptr_cmp(const void *l, const void *r)
+{
+    uintptr_t left = *(const uintptr_t*)l;
+    uintptr_t right = *(const uintptr_t*)r;
+    // qsort comparator over uintptr_t values: yields -1, 0 or 1 without subtracting them
+    return (left > right) - (left < right);
+}
+
+// Build an eytzinger tree in dest from the sorted array src; returns the next unused index into src
+static size_t eytzinger(uintptr_t *src, uintptr_t *dest, size_t i, size_t k, size_t n)
+{
+    if (k <= n) {
+        i = eytzinger(src, dest, i, 2 * k, n); // left subtree first, so src is consumed in order
+        dest[k-1] = src[i];
+        i++;
+        i = eytzinger(src, dest, i, 2 * k + 1, n);
+    }
+    return i;
+}
+
+static size_t eyt_obj_idx(jl_value_t *obj) JL_NOTSAFEPOINT
+{
+    size_t n = eytzinger_image_tree.len - 1;
+    if (n == 0)
+        return n; // only the trailing "outside image" slot exists
+    assert(n % 2 == 0 && "Eytzinger tree not even length!");
+    uintptr_t cmp = (uintptr_t) obj;
+    if (cmp <= gc_img_min || cmp > gc_img_max)
+        return n; // out of range: slot n is the trailing "outside image" entry
+    uintptr_t *tree = (uintptr_t*)eytzinger_image_tree.items;
+    size_t k = 1;
+    // k records the path taken from the root: each step appends one bit (1 = went right)
+    while (k <= n) {
+        int greater = (cmp > tree[k - 1]);
+        k <<= 1;
+        k |= greater;
+    }
+    // k is nonzero here since it starts at 1, and because cmp > gc_img_min
+    // the search is expected to have gone right at least once.
+    // Dropping the trailing zero bits plus the lowest set bit rewinds the path
+    // to the last node at which the search went right (tree[k - 1] < cmp).
+    k >>= (__builtin_ctzll(k) + 1);
+    assert(k != 0);
+    assert(k <= n && "Eytzinger tree index out of bounds!");
+    assert(tree[k - 1] < cmp && "Failed to find lower bound for object!");
+    return k - 1;
+}
+
+// Used in staticdata.c after we add an image: rebuilds both eytzinger arrays from jl_linkage_blobs
+void rebuild_image_blob_tree(void)
+{
+    size_t inc = 1 + jl_linkage_blobs.len - eytzinger_image_tree.len;
+    assert(eytzinger_idxs.len == eytzinger_image_tree.len);
+    assert(eytzinger_idxs.max == eytzinger_image_tree.max);
+    arraylist_grow(&eytzinger_idxs, inc);
+    arraylist_grow(&eytzinger_image_tree, inc);
+    eytzinger_idxs.items[eytzinger_idxs.len - 1] = (void*)jl_linkage_blobs.len;
+    eytzinger_image_tree.items[eytzinger_image_tree.len - 1] = (void*)1; // outside image
+    for (size_t i = 0; i < jl_linkage_blobs.len; i++) {
+        assert((uintptr_t) jl_linkage_blobs.items[i] % 4 == 0 && "Linkage blob not 4-byte aligned!");
+        // Odd entries (blob ends) get their low bit set so that a couple of properties are true:
+        // 1. a start and an end are never the same value. This simplifies the binary search.
+        // 2. ends are always after starts. This also simplifies the binary search.
+        // We assume that there exist no 0-size blobs, but that's a safe assumption
+        // since it means nothing could be there anyways
+        uintptr_t val = (uintptr_t) jl_linkage_blobs.items[i];
+        eytzinger_idxs.items[i] = (void*)(val + (i & 1));
+    }
+    qsort(eytzinger_idxs.items, eytzinger_idxs.len - 1, sizeof(void*), ptr_cmp);
+    gc_img_min = (uintptr_t) eytzinger_idxs.items[0];
+    gc_img_max = (uintptr_t) eytzinger_idxs.items[eytzinger_idxs.len - 2] + 1;
+    eytzinger((uintptr_t*)eytzinger_idxs.items, (uintptr_t*)eytzinger_image_tree.items, 0, 1, eytzinger_idxs.len - 1);
+    // Reuse eytzinger_idxs (the sorted scratch copy) to hold the value reported for each tree slot
+    // Still O(nlogn) overall: one eyt_obj_idx binary search per blob boundary
+    for (size_t i = 0; i < jl_linkage_blobs.len; i ++) {
+        uintptr_t val = (uintptr_t) jl_linkage_blobs.items[i];
+        // This is the same computation as in the prior for loop
+        uintptr_t eyt_val = val + (i & 1);
+        size_t eyt_idx = eyt_obj_idx((jl_value_t*)(eyt_val + 1)); assert(eyt_idx < eytzinger_idxs.len - 1);
+        assert(eytzinger_image_tree.items[eyt_idx] == (void*)eyt_val && "Eytzinger tree failed to find object!");
+        if (i & 1)
+            eytzinger_idxs.items[eyt_idx] = (void*)n_linkage_blobs();
+        else
+            eytzinger_idxs.items[eyt_idx] = (void*)(i / 2);
+    }
+}
+
+static int eyt_obj_in_img(jl_value_t *obj) JL_NOTSAFEPOINT
+{
+    assert((uintptr_t) obj % 4 == 0 && "Object not 4-byte aligned!");
+    size_t idx = eyt_obj_idx(obj); // keep the full size_t index; it is used as an array subscript
+    // Now we use a tiny trick: tree[idx] & 1 is whether or not tree[idx] is a
+    // start (0) or an end (1) of a blob. If it's a start, then the object is
+    // in the image, otherwise it is not.
+    int in_image = ((uintptr_t)eytzinger_image_tree.items[idx] & 1) == 0;
+    return in_image;
+}
+
+size_t external_blob_index(jl_value_t *v) JL_NOTSAFEPOINT
+{
+    assert((uintptr_t) v % 4 == 0 && "Object not 4-byte aligned!");
+    size_t eyt_idx = eyt_obj_idx(v); // keep the full size_t index; it is used as an array subscript
+    // We fill the invalid slots with the length, so we can just return that
+    size_t idx = (size_t) eytzinger_idxs.items[eyt_idx];
+    return idx;
+}
+
+uint8_t jl_object_in_image(jl_value_t *obj) JL_NOTSAFEPOINT
+{
+    return eyt_obj_in_img(obj);
+}
+
 NOINLINE uintptr_t gc_get_stack_ptr(void)
 {
     return (uintptr_t)jl_get_frame_addr();
@@ -2673,7 +2792,7 @@ mark: {
         jl_datatype_t *vt = (jl_datatype_t*)tag;
         int foreign_alloc = 0;
         int update_meta = __likely(!meta_updated && !gc_verifying);
-        if (update_meta && jl_object_in_image(new_obj)) {
+        if (update_meta && o->bits.in_image) {
             foreign_alloc = 1;
             update_meta = 0;
         }
@@ -3366,12 +3485,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 
     // update heuristics only if this GC was automatically triggered
     if (collection == JL_GC_AUTO) {
-        if (not_freed_enough) {
-            gc_num.interval = gc_num.interval * 2;
-        }
         if (large_frontier) {
             sweep_full = 1;
+            gc_num.interval = last_long_collect_interval;
+        }
+        if (not_freed_enough || large_frontier) {
+            gc_num.interval = gc_num.interval * 2;
         }
+
         size_t maxmem = 0;
 #ifdef _P64
         // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2
@@ -3404,6 +3525,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
         // on the first collection after sweep_full, and the current scan
         perm_scanned_bytes = 0;
         promoted_bytes = 0;
+        last_long_collect_interval = gc_num.interval;
     }
     scanned_bytes = 0;
     // 5. start sweeping
@@ -3478,21 +3600,34 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     live_bytes += -gc_num.freed + gc_num.since_sweep;
 
     if (collection == JL_GC_AUTO) {
-      // If the current interval is larger than half the live data decrease the interval
-      int64_t half = live_bytes/2;
-      if (gc_num.interval > half) gc_num.interval = half;
-      // But never go below default
-      if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval;
+        //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster
+        if(!not_freed_enough || large_frontier) {
+            int64_t tot = 2 * (live_bytes + gc_num.since_sweep) / 3;
+            if (gc_num.interval > tot) {
+                gc_num.interval = tot;
+                last_long_collect_interval = tot;
+            }
+        // If the current interval is larger than half the live data decrease the interval
+        } else {
+            int64_t half = (live_bytes / 2);
+            if (gc_num.interval > half)
+                gc_num.interval = half;
+        }
+
+        // But never go below default
+        if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval;
     }
 
     if (gc_num.interval + live_bytes > max_total_memory) {
         if (live_bytes < max_total_memory) {
             gc_num.interval = max_total_memory - live_bytes;
-        } else {
+            last_long_collect_interval = max_total_memory - live_bytes;
+        }
+        else {
             // We can't stay under our goal so let's go back to
             // the minimum interval and hope things get better
             gc_num.interval = default_collect_interval;
-       }
+        }
     }
 
     gc_time_summary(sweep_full, t_start, gc_end_time, gc_num.freed,
@@ -3668,6 +3803,10 @@ void jl_gc_init(void)
 
     arraylist_new(&finalizer_list_marked, 0);
     arraylist_new(&to_finalize, 0);
+    arraylist_new(&eytzinger_image_tree, 0);
+    arraylist_new(&eytzinger_idxs, 0);
+    arraylist_push(&eytzinger_idxs, (void*)0);
+    arraylist_push(&eytzinger_image_tree, (void*)1); // outside image
 
     gc_num.interval = default_collect_interval;
     last_long_collect_interval = default_collect_interval;
diff --git a/src/gf.c b/src/gf.c
index 443cebcf58e71..cdbf2c724b514 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -207,6 +207,9 @@ JL_DLLEXPORT jl_value_t *jl_specializations_lookup(jl_method_t *m, jl_value_t *t
 
 JL_DLLEXPORT jl_value_t *jl_methtable_lookup(jl_methtable_t *mt, jl_value_t *type, size_t world)
 {
+    // TODO: this is sort of an odd lookup strategy (and the only user of
+    // jl_typemap_assoc_by_type with subtype=0), while normally jl_gf_invoke_lookup would be
+    // expected to be used instead
     struct jl_typemap_assoc search = {type, world, NULL, 0, ~(size_t)0};
     jl_typemap_entry_t *sf = jl_typemap_assoc_by_type(jl_atomic_load_relaxed(&mt->defs), &search, /*offs*/0, /*subtype*/0);
     if (!sf)
@@ -642,8 +645,7 @@ static jl_value_t *inst_varargp_in_env(jl_value_t *decl, jl_svec_t *sparams)
                 int T_has_tv = T && jl_has_typevar(T, v);
                 int N_has_tv = N && jl_has_typevar(N, v); // n.b. JL_VARARG_UNBOUND check means this should be false
                 assert(!N_has_tv || N == (jl_value_t*)v);
-                if (T_has_tv)
-                    vm = jl_type_unionall(v, T);
+                vm = T_has_tv ? jl_type_unionall(v, T) : T;
                 if (N_has_tv)
                     N = NULL;
                 vm = (jl_value_t*)jl_wrap_vararg(vm, N); // this cannot throw for these inputs
@@ -1380,7 +1382,9 @@ struct matches_env {
     struct typemap_intersection_env match;
     jl_typemap_entry_t *newentry;
     jl_value_t *shadowed;
+    jl_typemap_entry_t *replaced;
 };
+
 static int get_intersect_visitor(jl_typemap_entry_t *oldentry, struct typemap_intersection_env *closure0)
 {
     struct matches_env *closure = container_of(closure0, struct matches_env, match);
@@ -1391,13 +1395,17 @@ static int get_intersect_visitor(jl_typemap_entry_t *oldentry, struct typemap_in
         // also be careful not to try to scan something from the current dump-reload though
         return 1;
     jl_method_t *oldmethod = oldentry->func.method;
+    if (closure->match.issubty // e.g. jl_subtype(closure->newentry.sig, oldentry->sig)
+        && jl_subtype(oldmethod->sig, (jl_value_t*)closure->newentry->sig)) { // e.g. jl_type_equal(closure->newentry->sig, oldentry->sig)
+        closure->replaced = oldentry;
+    }
     if (closure->shadowed == NULL)
         closure->shadowed = (jl_value_t*)jl_alloc_vec_any(0);
     jl_array_ptr_1d_push((jl_array_t*)closure->shadowed, (jl_value_t*)oldmethod);
     return 1;
 }
 
-static jl_value_t *get_intersect_matches(jl_typemap_t *defs, jl_typemap_entry_t *newentry)
+static jl_value_t *get_intersect_matches(jl_typemap_t *defs, jl_typemap_entry_t *newentry, jl_typemap_entry_t **replaced)
 {
     jl_tupletype_t *type = newentry->sig;
     jl_tupletype_t *ttypes = (jl_tupletype_t*)jl_unwrap_unionall((jl_value_t*)type);
@@ -1412,9 +1420,12 @@ static jl_value_t *get_intersect_matches(jl_typemap_t *defs, jl_typemap_entry_t
     }
     struct matches_env env = {{get_intersect_visitor, (jl_value_t*)type, va,
             /* .ti = */ NULL, /* .env = */ jl_emptysvec, /* .issubty = */ 0},
-        /* .newentry = */ newentry, /* .shadowed */ NULL};
+        /* .newentry = */ newentry, /* .shadowed */ NULL, /* .replaced */ NULL};
     JL_GC_PUSH3(&env.match.env, &env.match.ti, &env.shadowed);
     jl_typemap_intersection_visitor(defs, 0, &env.match);
+    env.match.env = NULL;
+    env.match.ti = NULL;
+    *replaced = env.replaced;
     JL_GC_POP();
     return env.shadowed;
 }
@@ -1610,31 +1621,32 @@ JL_DLLEXPORT void jl_method_instance_add_backedge(jl_method_instance_t *callee,
     JL_LOCK(&callee->def.method->writelock);
     if (invokesig == jl_nothing)
         invokesig = NULL;      // julia uses `nothing` but C uses NULL (#undef)
+    int found = 0;
+    // TODO: use jl_cache_type_(invokesig) like cache_method does to save memory
     if (!callee->backedges) {
         // lazy-init the backedges array
         callee->backedges = jl_alloc_vec_any(0);
         jl_gc_wb(callee, callee->backedges);
-        push_edge(callee->backedges, invokesig, caller);
     }
     else {
         size_t i = 0, l = jl_array_len(callee->backedges);
-        int found = 0;
-        jl_value_t *invokeTypes;
-        jl_method_instance_t *mi;
-        while (i < l) {
-            i = get_next_edge(callee->backedges, i, &invokeTypes, &mi);
-            // TODO: it would be better to canonicalize (how?) the Tuple-type so
-            // that we don't have to call `jl_egal`
-            if (mi == caller && ((invokesig == NULL && invokeTypes == NULL) ||
-                                 (invokesig && invokeTypes && jl_egal(invokesig, invokeTypes)))) {
+        for (i = 0; i < l; i++) {
+            // optimized version of while (i < l) i = get_next_edge(callee->backedges, i, &invokeTypes, &mi);
+            jl_value_t *mi = jl_array_ptr_ref(callee->backedges, i);
+            if (mi != (jl_value_t*)caller)
+                continue;
+            jl_value_t *invokeTypes = i > 0 ? jl_array_ptr_ref(callee->backedges, i - 1) : NULL;
+            if (invokeTypes && jl_is_method_instance(invokeTypes))
+                invokeTypes = NULL;
+            if ((invokesig == NULL && invokeTypes == NULL) ||
+                (invokesig && invokeTypes && jl_types_equal(invokesig, invokeTypes))) {
                 found = 1;
                 break;
             }
         }
-        if (!found) {
-            push_edge(callee->backedges, invokesig, caller);
-        }
     }
+    if (!found)
+        push_edge(callee->backedges, invokesig, caller);
     JL_UNLOCK(&callee->def.method->writelock);
 }
 
@@ -1650,6 +1662,7 @@ JL_DLLEXPORT void jl_method_table_add_backedge(jl_methtable_t *mt, jl_value_t *t
         jl_array_ptr_set(mt->backedges, 1, caller);
     }
     else {
+        // TODO: use jl_cache_type_(tt) like cache_method does, instead of a linear scan
         size_t i, l = jl_array_len(mt->backedges);
         for (i = 1; i < l; i += 2) {
             if (jl_types_equal(jl_array_ptr_ref(mt->backedges, i - 1), typ)) {
@@ -1733,8 +1746,9 @@ static jl_typemap_entry_t *do_typemap_search(jl_methtable_t *mt JL_PROPAGATES_RO
 }
 #endif
 
-static void jl_method_table_invalidate(jl_methtable_t *mt, jl_typemap_entry_t *methodentry, jl_method_t *method, size_t max_world)
+static void jl_method_table_invalidate(jl_methtable_t *mt, jl_typemap_entry_t *methodentry, size_t max_world)
 {
+    jl_method_t *method = methodentry->func.method;
     assert(!method->is_for_opaque_closure);
     method->deleted_world = methodentry->max_world = max_world;
     // drop this method from mt->cache
@@ -1758,16 +1772,18 @@ static void jl_method_table_invalidate(jl_methtable_t *mt, jl_typemap_entry_t *m
     }
     // Invalidate the backedges
     int invalidated = 0;
-    jl_svec_t *specializations = jl_atomic_load_relaxed(&methodentry->func.method->specializations);
+    jl_svec_t *specializations = jl_atomic_load_relaxed(&method->specializations);
     l = jl_svec_len(specializations);
     for (i = 0; i < l; i++) {
         jl_method_instance_t *mi = (jl_method_instance_t*)jl_svecref(specializations, i);
         if ((jl_value_t*)mi != jl_nothing) {
             invalidated = 1;
-            invalidate_external(mi, methodentry->max_world);
-            invalidate_backedges(&do_nothing_with_codeinst, mi, methodentry->max_world, "jl_method_table_disable");
+            invalidate_external(mi, max_world);
+            invalidate_backedges(&do_nothing_with_codeinst, mi, max_world, "jl_method_table_disable");
         }
     }
+    // XXX: this might have resolved an ambiguity, for which we have not tracked the edge here,
+    // and thus now introduce a mistake into inference
     if (invalidated && _jl_debug_method_invalidation) {
         jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)method);
         jl_value_t *loctag = jl_cstr_to_string("jl_method_table_disable");
@@ -1786,7 +1802,7 @@ JL_DLLEXPORT void jl_method_table_disable(jl_methtable_t *mt, jl_method_t *metho
     JL_LOCK(&mt->writelock);
     // Narrow the world age on the method to make it uncallable
     size_t world = jl_atomic_fetch_add(&jl_world_counter, 1);
-    jl_method_table_invalidate(mt, methodentry, method, world);
+    jl_method_table_invalidate(mt, methodentry, world);
     JL_UNLOCK(&mt->writelock);
 }
 
@@ -1816,6 +1832,36 @@ static int jl_type_intersection2(jl_value_t *t1, jl_value_t *t2, jl_value_t **is
     return 1;
 }
 
+enum morespec_options {
+    morespec_unknown,
+    morespec_isnot,
+    morespec_is
+};
+
+// check if `type` is replacing `m` with an ambiguity here, given other methods in `d` that already match it
+// precondition: type is not more specific than `m`
+static int is_replacing(jl_value_t *type, jl_method_t *m, jl_method_t *const *d, size_t n, jl_value_t *isect, jl_value_t *isect2, char *morespec)
+{
+    size_t k;
+    for (k = 0; k < n; k++) {
+        jl_method_t *m2 = d[k];
+        // see if m2 also fully covered this intersection
+        if (m == m2 || !(jl_subtype(isect, m2->sig) || (isect2 && jl_subtype(isect2, m2->sig))))
+            continue;
+        if (morespec[k] == (char)morespec_unknown)
+            morespec[k] = (char)(jl_type_morespecific(m2->sig, type) ? morespec_is : morespec_isnot);
+        if (morespec[k] == (char)morespec_is)
+            // not actually shadowing this--m2 will still be better
+            return 0;
+        // since m2 was also a previous match over isect,
+        // see if m was also previously dominant over all m2
+        if (!jl_type_morespecific(m->sig, m2->sig))
+            // m and m2 were previously ambiguous over the full intersection of mi with type, and will still be ambiguous with type
+            return 0;
+    }
+    return 1;
+}
+
 JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method, jl_tupletype_t *simpletype)
 {
     JL_TIMING(ADD_METHOD);
@@ -1834,23 +1880,22 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
     jl_typemap_entry_t *newentry = NULL;
     JL_GC_PUSH7(&oldvalue, &oldmi, &newentry, &loctag, &isect, &isect2, &isect3);
     JL_LOCK(&mt->writelock);
-    // first find if we have an existing entry to delete
-    struct jl_typemap_assoc search = {(jl_value_t*)type, method->primary_world, NULL, 0, ~(size_t)0};
-    jl_typemap_entry_t *oldentry = jl_typemap_assoc_by_type(jl_atomic_load_relaxed(&mt->defs), &search, /*offs*/0, /*subtype*/0);
-    // then add our new entry
+    // add our new entry
     newentry = jl_typemap_alloc((jl_tupletype_t*)type, simpletype, jl_emptysvec,
             (jl_value_t*)method, method->primary_world, method->deleted_world);
     jl_typemap_insert(&mt->defs, (jl_value_t*)mt, newentry, 0);
-    if (oldentry) {
-        jl_method_t *m = oldentry->func.method;
-        method_overwrite(newentry, m);
-        jl_method_table_invalidate(mt, oldentry, m, max_world);
+    jl_typemap_entry_t *replaced = NULL;
+    // then check what entries we replaced
+    oldvalue = get_intersect_matches(jl_atomic_load_relaxed(&mt->defs), newentry, &replaced);
+    int invalidated = 0;
+    if (replaced) {
+        oldvalue = (jl_value_t*)replaced;
+        invalidated = 1;
+        method_overwrite(newentry, replaced->func.method);
+        jl_method_table_invalidate(mt, replaced, max_world);
     }
     else {
-        oldvalue = get_intersect_matches(jl_atomic_load_relaxed(&mt->defs), newentry);
-
-        int invalidated = 0;
-        jl_method_t **d;
+        jl_method_t *const *d;
         size_t j, n;
         if (oldvalue == NULL) {
             d = NULL;
@@ -1879,6 +1924,7 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
                     //    -> less specific or ambiguous with any one of them: can ignore the missing edge (not missing)
                     //      -> some may have been ambiguous: still are
                     //      -> some may have been called: they may be partly replaced (will be detected in the loop later)
+                    // c.f. `is_replacing`, which is a similar query, but with an existing method match to compare against
                     missing = 1;
                     size_t j;
                     for (j = 0; j < n; j++) {
@@ -1913,11 +1959,6 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
         }
         if (oldvalue) {
             oldmi = jl_alloc_vec_any(0);
-            enum morespec_options {
-                morespec_unknown,
-                morespec_isnot,
-                morespec_is
-            };
             char *morespec = (char*)alloca(n);
             memset(morespec, morespec_unknown, n);
             for (j = 0; j < n; j++) {
@@ -1934,6 +1975,11 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
                         continue;
                     isect3 = jl_type_intersection(m->sig, (jl_value_t*)mi->specTypes);
                     if (jl_type_intersection2(type, isect3, &isect, &isect2)) {
+                        // TODO: this only checks pair-wise for ambiguities, but the ambiguities could arise from the interaction of multiple methods
+                        // and thus might miss a case where we introduce an ambiguity between two existing methods
+                        // We could instead work to sort this into 3 groups `morespecific .. ambiguous .. lessspecific`, with `type` in ambiguous,
+                        // such that everything in `morespecific` dominates everything in `ambiguous`, and everything in `ambiguous` dominates everything in `lessspecific`
+                        // And then compute where each isect falls, and whether it changed group--necessitating invalidation--or not.
                         if (morespec[j] == (char)morespec_unknown)
                             morespec[j] = (char)(jl_type_morespecific(m->sig, type) ? morespec_is : morespec_isnot);
                         if (morespec[j] == (char)morespec_is)
@@ -1942,61 +1988,42 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
                         if (ambig == morespec_unknown)
                             ambig = jl_type_morespecific(type, m->sig) ? morespec_is : morespec_isnot;
                         // replacing a method--see if this really was the selected method previously
-                        // over the intersection
-                        if (ambig == morespec_isnot)  {
-                            size_t k;
-                            for (k = 0; k < n; k++) {
-                                jl_method_t *m2 = d[k];
-                                if (m == m2 || !(jl_subtype(isect, m2->sig) || (isect && jl_subtype(isect, m2->sig))))
-                                    continue;
-                                if (morespec[k] == (char)morespec_unknown)
-                                    morespec[k] = (char)(jl_type_morespecific(m2->sig, type) ? morespec_is : morespec_isnot);
-                                if (morespec[k] == (char)morespec_is)
-                                    // not actually shadowing this--m2 will still be better
-                                    break;
-                                // since m2 was also a previous match over isect,
-                                // see if m was also previously dominant over all m2
-                                if (!jl_type_morespecific(m->sig, m2->sig))
-                                    break;
-                            }
-                            if (k != n)
-                                continue;
-                        }
-                        // Before deciding whether to invalidate `mi`, check each backedge for `invoke`s
-                        if (mi->backedges) {
-                            jl_array_t *backedges = mi->backedges;
+                        // over the intersection (not ambiguous) and the new method will be selected now (morespec_is)
+                        int replaced_dispatch = ambig == morespec_is || is_replacing(type, m, d, n, isect, isect2, morespec);
+                        // found that this specialization dispatch got replaced by m
+                        // call invalidate_backedges(&do_nothing_with_codeinst, mi, max_world, "jl_method_table_insert");
+                        // but ignore invoke-type edges
+                        jl_array_t *backedges = mi->backedges;
+                        if (backedges) {
                             size_t ib = 0, insb = 0, nb = jl_array_len(backedges);
                             jl_value_t *invokeTypes;
                             jl_method_instance_t *caller;
                             while (ib < nb) {
                                 ib = get_next_edge(backedges, ib, &invokeTypes, &caller);
-                                if (!invokeTypes) {
-                                    // ordinary dispatch, invalidate
+                                int replaced_edge;
+                                if (invokeTypes) {
+                                    // n.b. normally we must have mi.specTypes <: invokeTypes <: m.sig (though it might not strictly hold), so we only need to check the other subtypes
+                                    replaced_edge = jl_subtype(invokeTypes, type) && (ambig == morespec_is || is_replacing(type, m, d, n, invokeTypes, NULL, morespec));
+                                }
+                                else {
+                                    replaced_edge = replaced_dispatch;
+                                }
+                                if (replaced_edge) {
                                     invalidate_method_instance(&do_nothing_with_codeinst, caller, max_world, 1);
                                     invalidated = 1;
-                                } else {
-                                    // invoke-dispatch, check invokeTypes for validity
-                                    struct jl_typemap_assoc search = {invokeTypes, method->primary_world, NULL, 0, ~(size_t)0};
-                                    oldentry = jl_typemap_assoc_by_type(jl_atomic_load_relaxed(&mt->defs), &search, /*offs*/0, /*subtype*/0);
-                                    if (oldentry && oldentry->func.method == mi->def.method) {
-                                        // We can safely keep this method
-                                        jl_array_ptr_set(backedges, insb++, invokeTypes);
-                                        jl_array_ptr_set(backedges, insb++, caller);
-                                    } else {
-                                        invalidate_method_instance(&do_nothing_with_codeinst, caller, max_world, 1);
-                                        invalidated = 1;
-                                    }
+                                }
+                                else {
+                                    insb = set_next_edge(backedges, insb, invokeTypes, caller);
                                 }
                             }
                             jl_array_del_end(backedges, nb - insb);
                         }
-                        if (!mi->backedges || jl_array_len(mi->backedges) == 0) {
-                            jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi);
-                            invalidate_external(mi, max_world);
-                            if (mi->backedges) {
-                                invalidated = 1;
-                                invalidate_backedges(&do_nothing_with_codeinst, mi, max_world, "jl_method_table_insert");
-                            }
+                        jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi);
+                        invalidate_external(mi, max_world);
+                        if (_jl_debug_method_invalidation) {
+                            jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)mi);
+                            loctag = jl_cstr_to_string("jl_method_table_insert");
+                            jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
                         }
                     }
                 }
@@ -2024,13 +2051,13 @@ JL_DLLEXPORT void jl_method_table_insert(jl_methtable_t *mt, jl_method_t *method
                 }
             }
         }
-        if (invalidated && _jl_debug_method_invalidation) {
-            jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)method);
-            loctag = jl_cstr_to_string("jl_method_table_insert");
-            jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
-        }
-        update_max_args(mt, type);
     }
+    if (invalidated && _jl_debug_method_invalidation) {
+        jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)method);
+        loctag = jl_cstr_to_string("jl_method_table_insert");
+        jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
+    }
+    update_max_args(mt, type);
     JL_UNLOCK(&mt->writelock);
     JL_GC_POP();
 }
@@ -3435,22 +3462,9 @@ static jl_value_t *ml_matches(jl_methtable_t *mt,
                     }
                     if (ti != jl_bottom_type) {
                         disjoint = 0;
-                        // m and m2 are ambiguous, but let's see if we can find another method (m3)
-                        // that dominates their intersection, and means we can ignore this
-                        size_t k;
-                        for (k = i; k > 0; k--) {
-                            jl_method_match_t *matc3 = (jl_method_match_t*)jl_array_ptr_ref(env.t, k - 1);
-                            jl_method_t *m3 = matc3->method;
-                            if ((jl_subtype(ti, m3->sig) || (isect2 && jl_subtype(isect2, m3->sig)))
-                                    && jl_type_morespecific((jl_value_t*)m3->sig, (jl_value_t*)m->sig)
-                                    && jl_type_morespecific((jl_value_t*)m3->sig, (jl_value_t*)m2->sig))
-                                break;
-                        }
-                        if (k == 0) {
-                            ambig_groupid[j - 1] = i; // ambiguity covering range [i:j)
-                            isect2 = NULL;
-                            break;
-                        }
+                        ambig_groupid[j - 1] = i; // ambiguity covering range [i:j)
+                        isect2 = NULL;
+                        break;
                     }
                     isect2 = NULL;
                 }
@@ -3529,19 +3543,89 @@ static jl_value_t *ml_matches(jl_methtable_t *mt,
             // Compute whether anything could be ambiguous by seeing if any two
             // remaining methods in the result are in the same ambiguity group.
             assert(len > 0);
-            uint32_t agid = ambig_groupid[0];
-            for (i = 1; i < len; i++) {
-                if (!skip[i]) {
-                    if (agid == ambig_groupid[i]) {
-                        has_ambiguity = 1;
-                        break;
+            if (!has_ambiguity) {
+                // quick test
+                uint32_t agid = ambig_groupid[0];
+                for (i = 1; i < len; i++) {
+                    if (!skip[i]) {
+                        if (agid == ambig_groupid[i]) {
+                            has_ambiguity = 1;
+                            break;
+                        }
+                        agid = ambig_groupid[i];
+                    }
+                }
+                // laborious test, checking for existence and coverage of m3
+                if (has_ambiguity) {
+                    // some method is ambiguous, but let's see if we can find another method (m3)
+                    // outside of the ambiguity group that dominates any ambiguous methods,
+                    // and means we can ignore this for has_ambiguity
+                    has_ambiguity = 0;
+                    for (i = 0; i < len; i++) {
+                        if (skip[i])
+                            continue;
+                        uint32_t agid = ambig_groupid[i];
+                        jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(env.t, i);
+                        jl_method_t *m = matc->method;
+                        int subt = matc->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m->sig)
+                        for (j = agid; j < len && ambig_groupid[j] == agid; j++) {
+                            // n.b. even if we skipped them earlier, they still might
+                            // contribute to the ambiguities (due to lack of transitivity of
+                            // morespecific over subtyping)
+                            if (j == i)
+                                continue;
+                            jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(env.t, j);
+                            jl_method_t *m2 = matc2->method;
+                            int subt2 = matc2->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m2->sig)
+                            // if they aren't themselves simply ordered
+                            if (jl_type_morespecific((jl_value_t*)m->sig, (jl_value_t*)m2->sig) ||
+                                jl_type_morespecific((jl_value_t*)m2->sig, (jl_value_t*)m->sig))
+                                continue;
+                            jl_value_t *ti;
+                            if (subt) {
+                                ti = (jl_value_t*)matc2->spec_types;
+                                isect2 = NULL;
+                            }
+                            else if (subt2) {
+                                ti = (jl_value_t*)matc->spec_types;
+                                isect2 = NULL;
+                            }
+                            else {
+                                jl_type_intersection2((jl_value_t*)matc->spec_types, (jl_value_t*)matc2->spec_types, &env.match.ti, &isect2);
+                                ti = env.match.ti;
+                            }
+                            // and their intersection contributes to the ambiguity cycle
+                            if (ti != jl_bottom_type) {
+                                // now look for a third method m3 outside of this ambiguity group that fully resolves this intersection
+                                size_t k;
+                                for (k = agid; k > 0; k--) {
+                                    jl_method_match_t *matc3 = (jl_method_match_t*)jl_array_ptr_ref(env.t, k);
+                                    jl_method_t *m3 = matc3->method;
+                                    if ((jl_subtype(ti, m3->sig) || (isect2 && jl_subtype(isect2, m3->sig)))
+                                            && jl_type_morespecific((jl_value_t*)m3->sig, (jl_value_t*)m->sig)
+                                            && jl_type_morespecific((jl_value_t*)m3->sig, (jl_value_t*)m2->sig)) {
+                                        //if (jl_subtype(matc->spec_types, ti) || jl_subtype(matc->spec_types, matc3->m3->sig))
+                                        //    // check if it covered not only this intersection, but all intersections with matc
+                                        //    // if so, we do not need to check all of them separately
+                                        //    j = len;
+                                        break;
+                                    }
+                                }
+                                if (k == 0)
+                                    has_ambiguity = 1;
+                                isect2 = NULL;
+                            }
+                            if (has_ambiguity)
+                                break;
+                        }
+                        if (has_ambiguity)
+                            break;
                     }
-                    agid = ambig_groupid[i];
                 }
             }
             // If we're only returning possible matches, now filter out any method
             // whose intersection is fully ambiguous with the group it is in.
-            if (!include_ambiguous) {
+            if (!include_ambiguous && has_ambiguity) {
                 for (i = 0; i < len; i++) {
                     if (skip[i])
                         continue;
@@ -3559,7 +3643,7 @@ static jl_value_t *ml_matches(jl_methtable_t *mt,
                         int subt2 = matc2->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m2->sig)
                         // if their intersection contributes to the ambiguity cycle
                         if (subt || subt2 || !jl_has_empty_intersection((jl_value_t*)ti, m2->sig)) {
-                            // and the contribution of m is ambiguous with the portion of the cycle from m2
+                            // and the contribution of m is fully ambiguous with the portion of the cycle from m2
                             if (subt2 || jl_subtype((jl_value_t*)ti, m2->sig)) {
                                 // but they aren't themselves simply ordered (here
                                 // we don't consider that a third method might be
diff --git a/src/julia.expmap b/src/julia.expmap
index 35cc5eac48b6a..7df813498182b 100644
--- a/src/julia.expmap
+++ b/src/julia.expmap
@@ -35,6 +35,9 @@
     LLVMExtra*;
     llvmGetPassPluginInfo;
 
+    /* Make visible so that linker will merge duplicate definitions across DSO boundaries */
+    _ZN4llvm3Any6TypeId*;
+
     /* freebsd */
     environ;
     __progname;
diff --git a/src/julia.h b/src/julia.h
index 2fe299c6edffb..e39a8d66cacd3 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -91,6 +91,7 @@ typedef struct _jl_value_t jl_value_t;
 
 struct _jl_taggedvalue_bits {
     uintptr_t gc:2;
+    uintptr_t in_image:1;
 };
 
 JL_EXTENSION struct _jl_taggedvalue_t {
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 6cf491df71dde..15e004e0ba0b4 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -301,6 +301,7 @@ static inline void memmove_refs(void **dstp, void *const *srcp, size_t n) JL_NOT
 #define GC_MARKED 1 // reachable and young
 #define GC_OLD    2 // if it is reachable it will be marked as old
 #define GC_OLD_MARKED (GC_OLD | GC_MARKED) // reachable and old
+#define GC_IN_IMAGE 4
 
 // useful constants
 extern jl_methtable_t *jl_type_type_mt JL_GLOBALLY_ROOTED;
@@ -958,28 +959,9 @@ STATIC_INLINE size_t n_linkage_blobs(void) JL_NOTSAFEPOINT
     return jl_image_relocs.len;
 }
 
-// TODO: Makes this a binary search
-STATIC_INLINE size_t external_blob_index(jl_value_t *v) JL_NOTSAFEPOINT {
-    size_t i, nblobs = n_linkage_blobs();
-    assert(jl_linkage_blobs.len == 2*nblobs);
-    for (i = 0; i < nblobs; i++) {
-        uintptr_t left = (uintptr_t)jl_linkage_blobs.items[2*i];
-        uintptr_t right = (uintptr_t)jl_linkage_blobs.items[2*i + 1];
-        if (left < (uintptr_t)v && (uintptr_t)v <= right) {
-            // the last object may be a singleton (v is shifted by a type tag, so we use exclusive bounds here)
-            break;
-        }
-    }
-    return i;
-}
+size_t external_blob_index(jl_value_t *v) JL_NOTSAFEPOINT;
 
-STATIC_INLINE uint8_t jl_object_in_image(jl_value_t* v) JL_NOTSAFEPOINT {
-    size_t blob = external_blob_index(v);
-    if (blob == n_linkage_blobs()) {
-        return 0;
-    }
-    return 1;
-}
+uint8_t jl_object_in_image(jl_value_t* v) JL_NOTSAFEPOINT;
 
 typedef struct {
     LLVMOrcThreadSafeModuleRef TSM;
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 6ed664199fd2b..c1947b4b4dce4 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -654,7 +654,7 @@ static void allocate_segv_handler(void)
 static void *alloc_sigstack(size_t *ssize)
 {
     void *stk = jl_malloc_stack(ssize, NULL);
-    if (stk == MAP_FAILED)
+    if (stk == NULL)
         jl_errorf("fatal error allocating signal stack: mmap: %s", strerror(errno));
     return stk;
 }
diff --git a/src/signals-win.c b/src/signals-win.c
index f20a4d5287669..5dd6b34558ca6 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -483,11 +483,15 @@ void jl_install_default_signal_handlers(void)
 
 void jl_install_thread_signal_handler(jl_ptls_t ptls)
 {
-    size_t ssize = sig_stack_size;
-    void *stk = jl_malloc_stack(&ssize, NULL);
-    collect_backtrace_fiber.uc_stack.ss_sp = (void*)stk;
-    collect_backtrace_fiber.uc_stack.ss_size = ssize;
-    jl_makecontext(&collect_backtrace_fiber, start_backtrace_fiber);
-    uv_mutex_init(&backtrace_lock);
-    have_backtrace_fiber = 1;
+    if (!have_backtrace_fiber) {
+        size_t ssize = sig_stack_size;
+        void *stk = jl_malloc_stack(&ssize, NULL);
+        if (stk == NULL)
+            jl_errorf("fatal error allocating signal stack: mmap: %s", strerror(errno));
+        collect_backtrace_fiber.uc_stack.ss_sp = (void*)stk;
+        collect_backtrace_fiber.uc_stack.ss_size = ssize;
+        jl_makecontext(&collect_backtrace_fiber, start_backtrace_fiber);
+        uv_mutex_init(&backtrace_lock);
+        have_backtrace_fiber = 1;
+    }
 }
diff --git a/src/staticdata.c b/src/staticdata.c
index 926ee4e2b3186..f3725dc1508dc 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -1837,6 +1837,7 @@ void gc_sweep_sysimg(void)
             last_pos = pos;
             jl_taggedvalue_t *o = (jl_taggedvalue_t *)(base + pos);
             o->bits.gc = GC_OLD;
+            assert(o->bits.in_image == 1);
         }
     }
 }
@@ -2284,6 +2285,7 @@ static void jl_prepare_serialization_data(jl_array_t *mod_array, jl_array_t *new
     }
 
     if (edges) {
+        size_t world = jl_atomic_load_acquire(&jl_world_counter);
         jl_collect_missing_backedges(jl_type_type_mt);
         jl_collect_missing_backedges(jl_nonfunction_mt);
         // jl_collect_extext_methods_from_mod and jl_collect_missing_backedges also accumulate data in callers_with_edges.
@@ -2293,7 +2295,7 @@ static void jl_prepare_serialization_data(jl_array_t *mod_array, jl_array_t *new
         *method_roots_list = jl_alloc_vec_any(0);
         // Collect the new method roots
         jl_collect_new_roots(*method_roots_list, *new_specializations, worklist_key);
-        jl_collect_edges(*edges, *ext_targets, *new_specializations);
+        jl_collect_edges(*edges, *ext_targets, *new_specializations, world);
     }
     assert(edges_map == NULL); // jl_collect_edges clears this when done
 
@@ -2451,6 +2453,10 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array,
         jl_write_relocations(&s);
     }
 
+    // This ensures that we can use the low bit of addresses for
+    // identifying end pointers in gc's eytzinger search.
+    write_padding(&sysimg, 4 - (sysimg.size % 4));
+
     if (sysimg.size > ((uintptr_t)1 << RELOC_TAG_OFFSET)) {
         jl_printf(
             JL_STDERR,
@@ -2744,6 +2750,8 @@ JL_DLLEXPORT void jl_set_sysimg_so(void *handle)
 // }
 #endif
 
+extern void rebuild_image_blob_tree(void);
+
 static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl_array_t *depmods, uint64_t checksum,
                                 /* outputs */    jl_array_t **restored,         jl_array_t **init_order,
                                                  jl_array_t **extext_methods,
@@ -2891,7 +2899,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
         *base = image_base;
 
     s.s = &sysimg;
-    jl_read_reloclist(&s, s.link_ids_gctags, GC_OLD); // gctags
+    jl_read_reloclist(&s, s.link_ids_gctags, GC_OLD | GC_IN_IMAGE); // gctags
     size_t sizeof_tags = ios_pos(&relocs);
     (void)sizeof_tags;
     jl_read_reloclist(&s, s.link_ids_relocs, 0); // general relocs
@@ -3002,7 +3010,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
             arraylist_push(&cleanup_list, (void*)obj);
         }
         if (tag)
-            *pfld = (uintptr_t)newobj | GC_OLD;
+            *pfld = (uintptr_t)newobj | GC_OLD | GC_IN_IMAGE;
         else
             *pfld = (uintptr_t)newobj;
         assert(!(image_base < (char*)newobj && (char*)newobj <= image_base + sizeof_sysimg + sizeof(uintptr_t)));
@@ -3045,6 +3053,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
             memset(o, 0xba, sizeof(jl_value_t*) + sizeof(jl_datatype_t));
         else
             memset(o, 0xba, sizeof(jl_value_t*) + 0); // singleton
+        o->bits.in_image = 1;
     }
     arraylist_grow(&cleanup_list, -cleanup_list.len);
     // finally cache all our new types now
@@ -3119,6 +3128,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
         jl_value_t *t = jl_typeof(item);
         if (t == (jl_value_t*)jl_method_instance_type)
             memset(o, 0xba, sizeof(jl_value_t*) * 3); // only specTypes and sparams fields stored
+        o->bits.in_image = 1;
     }
     arraylist_free(&cleanup_list);
     for (size_t i = 0; i < s.fixup_objs.len; i++) {
@@ -3254,6 +3264,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl
     arraylist_push(&jl_linkage_blobs, (void*)image_base);
     arraylist_push(&jl_linkage_blobs, (void*)(image_base + sizeof_sysimg + sizeof(uintptr_t)));
     arraylist_push(&jl_image_relocs, (void*)relocs_base);
+    rebuild_image_blob_tree();
 
     // jl_printf(JL_STDOUT, "%ld blobs to link against\n", jl_linkage_blobs.len >> 1);
     jl_gc_enable(en);
@@ -3328,7 +3339,8 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im
             // Add roots to methods
             jl_copy_roots(method_roots_list, jl_worklist_key((jl_array_t*)restored));
             // Handle edges
-            jl_insert_backedges((jl_array_t*)edges, (jl_array_t*)ext_targets, (jl_array_t*)new_specializations); // restore external backedges (needs to be last)
+            size_t world = jl_atomic_load_acquire(&jl_world_counter);
+            jl_insert_backedges((jl_array_t*)edges, (jl_array_t*)ext_targets, (jl_array_t*)new_specializations, world); // restore external backedges (needs to be last)
             // reinit ccallables
             jl_reinit_ccallable(&ccallable_list, base, NULL);
             arraylist_free(&ccallable_list);
diff --git a/src/staticdata_utils.c b/src/staticdata_utils.c
index 60cea5b99c4b5..9bbcf5ece0caa 100644
--- a/src/staticdata_utils.c
+++ b/src/staticdata_utils.c
@@ -420,9 +420,8 @@ static void jl_record_edges(jl_method_instance_t *caller, arraylist_t *wq, jl_ar
 // Extract `edges` and `ext_targets` from `edges_map`
 // `edges` = [caller1, targets_indexes1, ...], the list of methods and their edges
 // `ext_targets` is [invokesig1, callee1, matches1, ...], the edges for each target
-static void jl_collect_edges(jl_array_t *edges, jl_array_t *ext_targets, jl_array_t *external_cis)
+static void jl_collect_edges(jl_array_t *edges, jl_array_t *ext_targets, jl_array_t *external_cis, size_t world)
 {
-    size_t world = jl_atomic_load_acquire(&jl_world_counter);
     htable_t external_mis;
     htable_new(&external_mis, 0);
     if (external_cis) {
@@ -822,38 +821,40 @@ static void jl_copy_roots(jl_array_t *method_roots_list, uint64_t key)
     }
 }
 
+
 // verify that these edges intersect with the same methods as before
-static jl_array_t *jl_verify_edges(jl_array_t *targets)
+static jl_array_t *jl_verify_edges(jl_array_t *targets, size_t minworld)
 {
-    size_t world = jl_atomic_load_acquire(&jl_world_counter);
     size_t i, l = jl_array_len(targets) / 3;
-    jl_array_t *valids = jl_alloc_array_1d(jl_array_uint8_type, l);
-    memset(jl_array_data(valids), 1, l);
+    static jl_value_t *ulong_array JL_ALWAYS_LEAFTYPE = NULL;
+    if (ulong_array == NULL)
+        ulong_array = jl_apply_array_type((jl_value_t*)jl_ulong_type, 1);
+    jl_array_t *maxvalids = jl_alloc_array_1d(ulong_array, l);
+    memset(jl_array_data(maxvalids), 0, l * sizeof(size_t));
     jl_value_t *loctag = NULL;
     jl_value_t *matches = NULL;
-    JL_GC_PUSH3(&valids, &matches, &loctag);
+    JL_GC_PUSH3(&maxvalids, &matches, &loctag);
     for (i = 0; i < l; i++) {
         jl_value_t *invokesig = jl_array_ptr_ref(targets, i * 3);
         jl_value_t *callee = jl_array_ptr_ref(targets, i * 3 + 1);
         jl_value_t *expected = jl_array_ptr_ref(targets, i * 3 + 2);
-        int valid = 1;
         size_t min_valid = 0;
         size_t max_valid = ~(size_t)0;
         if (invokesig) {
             assert(callee && "unsupported edge");
             jl_methtable_t *mt = jl_method_get_table(((jl_method_instance_t*)callee)->def.method);
             if ((jl_value_t*)mt == jl_nothing) {
-                valid = 0;
+                max_valid = 0;
             }
             else {
-                matches = jl_gf_invoke_lookup_worlds(invokesig, (jl_value_t*)mt, world, &min_valid, &max_valid);
+                matches = jl_gf_invoke_lookup_worlds(invokesig, (jl_value_t*)mt, minworld, &min_valid, &max_valid);
                 if (matches == jl_nothing) {
-                     valid = 0;
+                     max_valid = 0;
                 }
                 else {
                     matches = (jl_value_t*)((jl_method_match_t*)matches)->method;
                     if (matches != expected) {
-                        valid = 0;
+                        max_valid = 0;
                     }
                 }
             }
@@ -868,15 +869,15 @@ static jl_array_t *jl_verify_edges(jl_array_t *targets)
             int ambig = 0;
             // TODO: possibly need to included ambiguities too (for the optimizer correctness)?
             matches = jl_matching_methods((jl_tupletype_t*)sig, jl_nothing,
-                    -1, 0, world, &min_valid, &max_valid, &ambig);
+                    -1, 0, minworld, &min_valid, &max_valid, &ambig);
             if (matches == jl_nothing) {
-                valid = 0;
+                max_valid = 0;
             }
             else {
                 // setdiff!(matches, expected)
                 size_t j, k, ins = 0;
                 if (jl_array_len(matches) != jl_array_len(expected)) {
-                    valid = 0;
+                    max_valid = 0;
                 }
                 for (k = 0; k < jl_array_len(matches); k++) {
                     jl_method_t *match = ((jl_method_match_t*)jl_array_ptr_ref(matches, k))->method;
@@ -888,18 +889,18 @@ static jl_array_t *jl_verify_edges(jl_array_t *targets)
                         // intersection has a new method or a method was
                         // deleted--this is now probably no good, just invalidate
                         // everything about it now
-                        valid = 0;
+                        max_valid = 0;
                         if (!_jl_debug_method_invalidation)
                             break;
                         jl_array_ptr_set(matches, ins++, match);
                     }
                 }
-                if (!valid && _jl_debug_method_invalidation)
+                if (max_valid != ~(size_t)0 && _jl_debug_method_invalidation)
                     jl_array_del_end((jl_array_t*)matches, jl_array_len(matches) - ins);
             }
         }
-        jl_array_uint8_set(valids, i, valid);
-        if (!valid && _jl_debug_method_invalidation) {
+        ((size_t*)(jl_array_data(maxvalids)))[i] = max_valid;
+        if (max_valid != ~(size_t)0 && _jl_debug_method_invalidation) {
             jl_array_ptr_1d_push(_jl_debug_method_invalidation, invokesig ? (jl_value_t*)invokesig : callee);
             loctag = jl_cstr_to_string("insert_backedges_callee");
             jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
@@ -912,161 +913,168 @@ static jl_array_t *jl_verify_edges(jl_array_t *targets)
         //ios_puts(valid ? "valid\n" : "INVALID\n", ios_stderr);
     }
     JL_GC_POP();
-    return valids;
+    return maxvalids;
 }
 
-// Combine all edges relevant to a method into the visited table
-static void jl_verify_methods(jl_array_t *edges, jl_array_t *valids, htable_t *visited)
+// Combine all edges relevant to a method to initialize the maxvalids list
+static jl_array_t *jl_verify_methods(jl_array_t *edges, jl_array_t *maxvalids)
 {
     jl_value_t *loctag = NULL;
-    JL_GC_PUSH1(&loctag);
+    jl_array_t *maxvalids2 = NULL;
+    JL_GC_PUSH2(&loctag, &maxvalids2);
     size_t i, l = jl_array_len(edges) / 2;
-    htable_new(visited, l);
+    maxvalids2 = jl_alloc_array_1d(jl_typeof(maxvalids), l);
+    size_t *maxvalids2_data = (size_t*)jl_array_data(maxvalids2);
+    memset(maxvalids2_data, 0, l * sizeof(size_t));
     for (i = 0; i < l; i++) {
         jl_method_instance_t *caller = (jl_method_instance_t*)jl_array_ptr_ref(edges, 2 * i);
         assert(jl_is_method_instance(caller) && jl_is_method(caller->def.method));
         jl_array_t *callee_ids = (jl_array_t*)jl_array_ptr_ref(edges, 2 * i + 1);
         assert(jl_typeis((jl_value_t*)callee_ids, jl_array_int32_type));
-        int valid = 1;
         if (callee_ids == NULL) {
             // serializing the edges had failed
-            valid = 0;
+            maxvalids2_data[i] = 0;
         }
         else {
             int32_t *idxs = (int32_t*)jl_array_data(callee_ids);
             size_t j;
-            for (j = 0; valid && j < idxs[0]; j++) {
+            maxvalids2_data[i] = ~(size_t)0;
+            for (j = 0; j < idxs[0]; j++) {
                 int32_t idx = idxs[j + 1];
-                valid = jl_array_uint8_ref(valids, idx);
-                if (!valid && _jl_debug_method_invalidation) {
+                size_t max_valid = ((size_t*)(jl_array_data(maxvalids)))[idx];
+                if (max_valid != ~(size_t)0 && _jl_debug_method_invalidation) {
                     jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)caller);
                     loctag = jl_cstr_to_string("verify_methods");
                     jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
                     loctag = jl_box_int32((int32_t)idx);
                     jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
                 }
+                if (max_valid < maxvalids2_data[i])
+                    maxvalids2_data[i] = max_valid;
+                if (max_valid == 0)
+                    break;
             }
         }
-        ptrhash_put(visited, caller, (void*)(((char*)HT_NOTFOUND) + valid + 1));
         //jl_static_show((JL_STREAM*)ios_stderr, (jl_value_t*)caller);
-        //ios_puts(valid ? "valid\n" : "INVALID\n", ios_stderr);
-        // HT_NOTFOUND: valid (no invalid edges)
-        // HT_NOTFOUND + 1: invalid
-        // HT_NOTFOUND + 2: need to scan
-        // HT_NOTFOUND + 3 + depth: in-progress
+        //ios_puts(maxvalids2_data[i] == ~(size_t)0 ? "valid\n" : "INVALID\n", ios_stderr);
     }
     JL_GC_POP();
+    return maxvalids2;
 }
 
 
 // Visit the entire call graph, starting from edges[idx] to determine if that method is valid
 // Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable
-static int jl_verify_graph_edge(jl_array_t *edges, int idx, htable_t *visited, arraylist_t *stack)
+// and slightly modified with an early termination option once the computation reaches its minimum
+static int jl_verify_graph_edge(size_t *maxvalids2_data, jl_array_t *edges, size_t idx, arraylist_t *visited, arraylist_t *stack)
 {
-    jl_method_instance_t *caller = (jl_method_instance_t*)jl_array_ptr_ref(edges, idx * 2);
-    assert(jl_is_method_instance(caller) && jl_is_method(caller->def.method));
-    int found = (char*)ptrhash_get(visited, (void*)caller) - (char*)HT_NOTFOUND;
-    if (found == 0)
-        return 1; // NOTFOUND == valid
-    if (found == 1)
-        return 0; // invalid
-    if (found != 2)
-        return found - 1; // depth
-    found = 0;
+    if (maxvalids2_data[idx] == 0) {
+        visited->items[idx] = (void*)1;
+        return 0;
+    }
+    size_t cycle = (size_t)visited->items[idx];
+    if (cycle != 0)
+        return cycle - 1; // depth remaining
     jl_value_t *cause = NULL;
-    arraylist_push(stack, (void*)caller);
-    int depth = stack->len;
-    ptrhash_put(visited, (void*)caller, (void*)((char*)HT_NOTFOUND + 3 + depth)); // change 2 to in-progress at depth
+    arraylist_push(stack, (void*)idx);
+    size_t depth = stack->len;
+    visited->items[idx] = (void*)(1 + depth);
     jl_array_t *callee_ids = (jl_array_t*)jl_array_ptr_ref(edges, idx * 2 + 1);
     assert(jl_typeis((jl_value_t*)callee_ids, jl_array_int32_type));
     int32_t *idxs = (int32_t*)jl_array_data(callee_ids);
-    int cycle = 0;
     size_t i, n = jl_array_len(callee_ids);
     for (i = idxs[0] + 1; i < n; i++) {
-        int32_t idx = idxs[i];
-        int child_found = jl_verify_graph_edge(edges, idx, visited, stack);
-        if (child_found == 0) {
+        int32_t childidx = idxs[i];
+        int child_cycle = jl_verify_graph_edge(maxvalids2_data, edges, childidx, visited, stack);
+        size_t child_max_valid = maxvalids2_data[childidx];
+        if (child_max_valid < maxvalids2_data[idx]) {
+            maxvalids2_data[idx] = child_max_valid;
+            cause = jl_array_ptr_ref(edges, childidx * 2);
+        }
+        if (child_max_valid == 0) {
             // found what we were looking for, so terminate early
-            found = 1;
-            cause = jl_array_ptr_ref(edges, idx * 2);
             break;
         }
-        else if (child_found >= 2 && child_found - 2 < cycle) {
+        else if (child_cycle && child_cycle < cycle) {
             // record the cycle will resolve at depth "cycle"
-            cycle = child_found - 2;
-            assert(cycle);
+            cycle = child_cycle;
         }
     }
-    if (!found && cycle && cycle != depth)
-        return cycle + 2;
+    size_t max_valid = maxvalids2_data[idx];
+    if (max_valid != 0 && cycle && cycle != depth)
+        return cycle;
     // If we are the top of the current cycle, now mark all other parts of
     // our cycle with what we found.
-    // Or if we found a backedge, also mark all of the other parts of the
-    // cycle as also having an backedge.
+    // Or if we found a failed edge, also mark all of the other parts of the
+    // cycle as also having a failed edge.
     while (stack->len >= depth) {
-        void *mi = arraylist_pop(stack);
-        assert((char*)ptrhash_get(visited, mi) - (char*)HT_NOTFOUND == 4 + stack->len);
-        if (found)
-            ptrhash_put(visited, mi, (void*)((char*)HT_NOTFOUND + 1 + found));
-        else
-            ptrhash_remove(visited, mi); // assign as NOTFOUND in table
-        if (_jl_debug_method_invalidation && found) {
-            jl_value_t *loctag = NULL;
-            JL_GC_PUSH1(&loctag);
-            jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)mi);
-            loctag = jl_cstr_to_string("verify_methods");
-            jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
-            jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)cause);
-            JL_GC_POP();
+        size_t childidx = (size_t)arraylist_pop(stack);
+        assert(visited->items[childidx] == (void*)(2 + stack->len));
+        if (idx != childidx) {
+            if (max_valid < maxvalids2_data[childidx])
+                maxvalids2_data[childidx] = max_valid;
+            if (_jl_debug_method_invalidation && max_valid != ~(size_t)0) {
+                jl_method_instance_t *mi = (jl_method_instance_t*)jl_array_ptr_ref(edges, childidx * 2);
+                jl_value_t *loctag = NULL;
+                JL_GC_PUSH1(&loctag);
+                jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)mi);
+                loctag = jl_cstr_to_string("verify_methods");
+                jl_array_ptr_1d_push(_jl_debug_method_invalidation, loctag);
+                jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)cause);
+                JL_GC_POP();
+            }
         }
+        visited->items[childidx] = (void*)1;
     }
-    return found ? 0 : 1;
+    return 0;
 }
 
 // Visit all entries in edges, verify if they are valid
-static jl_array_t *jl_verify_graph(jl_array_t *edges, htable_t *visited)
+static void jl_verify_graph(jl_array_t *edges, jl_array_t *maxvalids2)
 {
-    arraylist_t stack;
+    arraylist_t stack, visited;
     arraylist_new(&stack, 0);
     size_t i, n = jl_array_len(edges) / 2;
-    jl_array_t *valids = jl_alloc_array_1d(jl_array_uint8_type, n);
-    JL_GC_PUSH1(&valids);
-    int8_t *valids_data = (int8_t*)jl_array_data(valids);
-    for (i = 0; i < n; i++)
-        valids_data[i] = jl_verify_graph_edge(edges, i, visited, &stack);
+    arraylist_new(&visited, n);
+    memset(visited.items, 0, n * sizeof(size_t));
+    size_t *maxvalids2_data = (size_t*)jl_array_data(maxvalids2);
+    for (i = 0; i < n; i++) {
+        assert(visited.items[i] == (void*)0 || visited.items[i] == (void*)1);
+        int child_cycle = jl_verify_graph_edge(maxvalids2_data, edges, i, &visited, &stack);
+        assert(child_cycle == 0); (void)child_cycle;
+        assert(stack.len == 0);
+        assert(visited.items[i] == (void*)1);
+    }
     arraylist_free(&stack);
-    JL_GC_POP();
-    return valids;
+    arraylist_free(&visited);
 }
 
 // Restore backedges to external targets
 // `edges` = [caller1, targets_indexes1, ...], the list of worklist-owned methods calling external methods.
 // `ext_targets` is [invokesig1, callee1, matches1, ...], the global set of non-worklist callees of worklist-owned methods.
-static void jl_insert_backedges(jl_array_t *edges, jl_array_t *ext_targets, jl_array_t *ci_list)
+static void jl_insert_backedges(jl_array_t *edges, jl_array_t *ext_targets, jl_array_t *ci_list, size_t minworld)
 {
     // determine which CodeInstance objects are still valid in our image
-    size_t world = jl_atomic_load_acquire(&jl_world_counter);
-    jl_array_t *valids = jl_verify_edges(ext_targets);
+    jl_array_t *valids = jl_verify_edges(ext_targets, minworld);
     JL_GC_PUSH1(&valids);
-    htable_t visited;
-    htable_new(&visited, 0);
-    jl_verify_methods(edges, valids, &visited); // consumes valids, creates visited
-    valids = jl_verify_graph(edges, &visited); // consumes visited, creates valids
+    valids = jl_verify_methods(edges, valids); // consumes edges valids, initializes methods valids
+    jl_verify_graph(edges, valids); // propagates methods valids for each edge
     size_t i, l;
 
     // next build a map from external MethodInstances to their CodeInstance for insertion
     l = jl_array_len(ci_list);
-    htable_reset(&visited, l);
+    htable_t visited;
+    htable_new(&visited, l);
     for (i = 0; i < l; i++) {
         jl_code_instance_t *ci = (jl_code_instance_t*)jl_array_ptr_ref(ci_list, i);
-        assert(ci->min_world == world);
+        assert(ci->min_world == minworld);
         if (ci->max_world == 1) { // sentinel value: has edges to external callables
             ptrhash_put(&visited, (void*)ci->def, (void*)ci);
         }
         else {
             assert(ci->max_world == ~(size_t)0);
             jl_method_instance_t *caller = ci->def;
-            if (ci->inferred && jl_rettype_inferred(caller, world, ~(size_t)0) == jl_nothing) {
+            if (ci->inferred && jl_rettype_inferred(caller, minworld, ~(size_t)0) == jl_nothing) {
                 jl_mi_cache_insert(caller, ci);
             }
             //jl_static_show((jl_stream*)ios_stderr, (jl_value_t*)caller);
@@ -1078,28 +1086,28 @@ static void jl_insert_backedges(jl_array_t *edges, jl_array_t *ext_targets, jl_a
     l = jl_array_len(edges) / 2;
     for (i = 0; i < l; i++) {
         jl_method_instance_t *caller = (jl_method_instance_t*)jl_array_ptr_ref(edges, 2 * i);
-        int valid = jl_array_uint8_ref(valids, i);
-        if (!valid)
-            continue;
-        // if this callee is still valid, add all the backedges
-        jl_array_t *callee_ids = (jl_array_t*)jl_array_ptr_ref(edges, 2 * i + 1);
-        int32_t *idxs = (int32_t*)jl_array_data(callee_ids);
-        for (size_t j = 0; j < idxs[0]; j++) {
-            int32_t idx = idxs[j + 1];
-            jl_value_t *invokesig = jl_array_ptr_ref(ext_targets, idx * 3);
-            jl_value_t *callee = jl_array_ptr_ref(ext_targets, idx * 3 + 1);
-            if (callee && jl_is_method_instance(callee)) {
-                jl_method_instance_add_backedge((jl_method_instance_t*)callee, invokesig, caller);
-            }
-            else {
-                jl_value_t *sig = callee == NULL ? invokesig : callee;
-                jl_methtable_t *mt = jl_method_table_for(sig);
-                // FIXME: rarely, `callee` has an unexpected `Union` signature,
-                // see https://github.com/JuliaLang/julia/pull/43990#issuecomment-1030329344
-                // Fix the issue and turn this back into an `assert((jl_value_t*)mt != jl_nothing)`
-                // This workaround exposes us to (rare) 265-violations.
-                if ((jl_value_t*)mt != jl_nothing)
-                    jl_method_table_add_backedge(mt, sig, (jl_value_t*)caller);
+        size_t maxvalid = ((size_t*)(jl_array_data(valids)))[i];
+        if (maxvalid == ~(size_t)0) {
+            // if this callee is still valid, add all the backedges
+            jl_array_t *callee_ids = (jl_array_t*)jl_array_ptr_ref(edges, 2 * i + 1);
+            int32_t *idxs = (int32_t*)jl_array_data(callee_ids);
+            for (size_t j = 0; j < idxs[0]; j++) {
+                int32_t idx = idxs[j + 1];
+                jl_value_t *invokesig = jl_array_ptr_ref(ext_targets, idx * 3);
+                jl_value_t *callee = jl_array_ptr_ref(ext_targets, idx * 3 + 1);
+                if (callee && jl_is_method_instance(callee)) {
+                    jl_method_instance_add_backedge((jl_method_instance_t*)callee, invokesig, caller);
+                }
+                else {
+                    jl_value_t *sig = callee == NULL ? invokesig : callee;
+                    jl_methtable_t *mt = jl_method_table_for(sig);
+                    // FIXME: rarely, `callee` has an unexpected `Union` signature,
+                    // see https://github.com/JuliaLang/julia/pull/43990#issuecomment-1030329344
+                    // Fix the issue and turn this back into an `assert((jl_value_t*)mt != jl_nothing)`
+                    // This workaround exposes us to (rare) 265-violations.
+                    if ((jl_value_t*)mt != jl_nothing)
+                        jl_method_table_add_backedge(mt, sig, (jl_value_t*)caller);
+                }
             }
         }
         // then enable any methods associated with it
@@ -1109,9 +1117,9 @@ static void jl_insert_backedges(jl_array_t *edges, jl_array_t *ext_targets, jl_a
             // have some new external code to use
             assert(jl_is_code_instance(ci));
             jl_code_instance_t *codeinst = (jl_code_instance_t*)ci;
-            assert(codeinst->min_world == world && codeinst->inferred);
-            codeinst->max_world = ~(size_t)0;
-            if (jl_rettype_inferred(caller, world, ~(size_t)0) == jl_nothing) {
+            assert(codeinst->min_world == minworld && codeinst->inferred);
+            codeinst->max_world = maxvalid;
+            if (jl_rettype_inferred(caller, minworld, maxvalid) == jl_nothing) {
                 jl_mi_cache_insert(caller, codeinst);
             }
         }
diff --git a/src/subtype.c b/src/subtype.c
index 41dcaff6d475c..fa6e63381ad40 100644
--- a/src/subtype.c
+++ b/src/subtype.c
@@ -368,6 +368,12 @@ static int obviously_disjoint(jl_value_t *a, jl_value_t *b, int specificity)
         return 1;
     if (jl_is_unionall(a)) a = jl_unwrap_unionall(a);
     if (jl_is_unionall(b)) b = jl_unwrap_unionall(b);
+    if (jl_is_uniontype(a))
+        return obviously_disjoint(((jl_uniontype_t *)a)->a, b, specificity) &&
+               obviously_disjoint(((jl_uniontype_t *)a)->b, b, specificity);
+    if (jl_is_uniontype(b))
+        return obviously_disjoint(a, ((jl_uniontype_t *)b)->a, specificity) &&
+               obviously_disjoint(a, ((jl_uniontype_t *)b)->b, specificity);
     if (jl_is_datatype(a) && jl_is_datatype(b)) {
         jl_datatype_t *ad = (jl_datatype_t*)a, *bd = (jl_datatype_t*)b;
         if (ad->name != bd->name) {
@@ -519,28 +525,43 @@ static jl_unionall_t *rename_unionall(jl_unionall_t *u)
 
 static int subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param);
 
-static jl_value_t *pick_union_element(jl_value_t *u JL_PROPAGATES_ROOT, jl_stenv_t *e, int8_t R) JL_NOTSAFEPOINT
+static int next_union_state(jl_stenv_t *e, int8_t R) JL_NOTSAFEPOINT
 {
     jl_unionstate_t *state = R ? &e->Runions : &e->Lunions;
+    if (state->more == 0)
+        return 0;
+    // reset `used` and let `pick_union_decision` clean the stack.
+    state->used = state->more;
+    statestack_set(state, state->used - 1, 1);
+    return 1;
+}
+
+static int pick_union_decision(jl_stenv_t *e, int8_t R) JL_NOTSAFEPOINT
+{
+    jl_unionstate_t *state = R ? &e->Runions : &e->Lunions;
+    if (state->depth >= state->used) {
+        statestack_set(state, state->used, 0);
+        state->used++;
+    }
+    int ui = statestack_get(state, state->depth);
+    state->depth++;
+    if (ui == 0)
+        state->more = state->depth; // memorize that this was the deepest available choice
+    return ui;
+}
+
+static jl_value_t *pick_union_element(jl_value_t *u JL_PROPAGATES_ROOT, jl_stenv_t *e, int8_t R) JL_NOTSAFEPOINT
+{
     do {
-        if (state->depth >= state->used) {
-            statestack_set(state, state->used, 0);
-            state->used++;
-        }
-        int ui = statestack_get(state, state->depth);
-        state->depth++;
-        if (ui == 0) {
-            state->more = state->depth; // memorize that this was the deepest available choice
-            u = ((jl_uniontype_t*)u)->a;
-        }
-        else {
+        if (pick_union_decision(e, R))
             u = ((jl_uniontype_t*)u)->b;
-        }
+        else
+            u = ((jl_uniontype_t*)u)->a;
     } while (jl_is_uniontype(u));
     return u;
 }
 
-static int forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param);
+static int local_forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param, int limit_slow);
 
 // subtype for variable bounds consistency check. needs its own forall/exists environment.
 static int subtype_ccheck(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
@@ -556,17 +577,7 @@ static int subtype_ccheck(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
     if (x == (jl_value_t*)jl_any_type && jl_is_datatype(y))
         return 0;
     jl_saved_unionstate_t oldLunions; push_unionstate(&oldLunions, &e->Lunions);
-    jl_saved_unionstate_t oldRunions; push_unionstate(&oldRunions, &e->Runions);
-    int sub;
-    e->Lunions.used = e->Runions.used = 0;
-    e->Runions.depth = 0;
-    e->Runions.more = 0;
-    e->Lunions.depth = 0;
-    e->Lunions.more = 0;
-
-    sub = forall_exists_subtype(x, y, e, 0);
-
-    pop_unionstate(&e->Runions, &oldRunions);
+    int sub = local_forall_exists_subtype(x, y, e, 0, 1);
     pop_unionstate(&e->Lunions, &oldLunions);
     return sub;
 }
@@ -1174,19 +1185,6 @@ static int subtype_tuple(jl_datatype_t *xd, jl_datatype_t *yd, jl_stenv_t *e, in
     return ans;
 }
 
-static int equal_unions(jl_uniontype_t *x, jl_uniontype_t *y, jl_stenv_t *e)
-{
-    jl_value_t *saved=NULL; jl_savedenv_t se;
-    JL_GC_PUSH1(&saved);
-    save_env(e, &saved, &se);
-    int eq = forall_exists_equal(x->a, y->a, e) && forall_exists_equal(x->b, y->b, e);
-    if (!eq)
-        restore_env(e, saved, &se);
-    free_env(&se);
-    JL_GC_POP();
-    return eq;
-}
-
 // `param` means we are currently looking at a parameter of a type constructor
 // (as opposed to being outside any type constructor, or comparing variable bounds).
 // this is used to record the positions where type variables occur for the
@@ -1208,15 +1206,9 @@ static int subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param)
             // of unions and vars: if matching `typevar <: union`, first try to match the whole
             // union against the variable before trying to take it apart to see if there are any
             // variables lurking inside.
-            jl_unionstate_t *state = &e->Runions;
-            if (state->depth >= state->used) {
-                statestack_set(state, state->used, 0);
-                state->used++;
-            }
-            ui = statestack_get(state, state->depth);
-            state->depth++;
-            if (ui == 0)
-                state->more = state->depth; // memorize that this was the deepest available choice
+            // note: for forall var, there's no need to split y if it has no free typevars.
+            jl_varbinding_t *xx = lookup(e, (jl_tvar_t *)x);
+            ui = ((xx && xx->right) || jl_has_free_typevars(y)) && pick_union_decision(e, 1);
         }
         if (ui == 1)
             y = pick_union_element(y, e, 1);
@@ -1368,75 +1360,102 @@ static int is_definite_length_tuple_type(jl_value_t *x)
     return k == JL_VARARG_NONE || k == JL_VARARG_INT;
 }
 
-static int forall_exists_equal(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
-{
-    if (obviously_egal(x, y)) return 1;
+static int _forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param, int *count, int *noRmore);
 
-    if ((is_indefinite_length_tuple_type(x) && is_definite_length_tuple_type(y)) ||
-        (is_definite_length_tuple_type(x) && is_indefinite_length_tuple_type(y)))
+static int may_contain_union_decision(jl_value_t *x, jl_stenv_t *e, jl_typeenv_t *log) JL_NOTSAFEPOINT
+{
+    if (x == NULL || x == (jl_value_t*)jl_any_type || x == jl_bottom_type)
         return 0;
-
-    if (jl_is_uniontype(x) && jl_is_uniontype(y)) {
-        // For 2 unions, try a more efficient greedy algorithm that compares the unions
-        // componentwise. If it returns `false`, we forget it and proceed with the usual
-        // algorithm. If it returns `true` we try returning `true`, but need to come back
-        // here to try the usual algorithm if subtyping later fails.
-        jl_unionstate_t *state = &e->Runions;
-        jl_saved_unionstate_t oldRunions; push_unionstate(&oldRunions, state);
-        if (state->depth >= state->used) {
-            statestack_set(state, state->used, 0);
-            state->used++;
-        }
-        int ui = statestack_get(state, state->depth);
-        state->depth++;
-        if (ui == 0) {
-            state->more = state->depth; // memorize that this was the deepest available choice
-            if (equal_unions((jl_uniontype_t*)x, (jl_uniontype_t*)y, e))
+    if (jl_is_unionall(x))
+        return may_contain_union_decision(((jl_unionall_t *)x)->body, e, log);
+    if (jl_is_datatype(x)) {
+        jl_datatype_t *xd = (jl_datatype_t *)x;
+        for (int i = 0; i < jl_nparams(xd); i++) {
+            jl_value_t *param = jl_tparam(xd, i);
+            if (jl_is_vararg(param))
+                param = jl_unwrap_vararg(param);
+            if (may_contain_union_decision(param, e, log))
                 return 1;
-            pop_unionstate(state, &oldRunions);
         }
+        return 0;
     }
+    if (!jl_is_typevar(x))
+        return 1;
+    jl_typeenv_t *t = log;
+    while (t != NULL) {
+        if (x == (jl_value_t *)t->var)
+            return 1;
+        t = t->prev;
+    }
+    jl_typeenv_t newlog = { (jl_tvar_t*)x, NULL, log };
+    jl_varbinding_t *xb = lookup(e, (jl_tvar_t *)x);
+    return may_contain_union_decision(xb ? xb->lb : ((jl_tvar_t *)x)->lb, e, &newlog) ||
+           may_contain_union_decision(xb ? xb->ub : ((jl_tvar_t *)x)->ub, e, &newlog);
+}
 
-    jl_saved_unionstate_t oldLunions; push_unionstate(&oldLunions, &e->Lunions);
-    e->Lunions.used = 0;
+static int local_forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param, int limit_slow)
+{
+    int16_t oldRmore = e->Runions.more;
     int sub;
-
-    if (!jl_has_free_typevars(x) || !jl_has_free_typevars(y)) {
+    if (may_contain_union_decision(y, e, NULL) && pick_union_decision(e, 1) == 0) {
         jl_saved_unionstate_t oldRunions; push_unionstate(&oldRunions, &e->Runions);
-        e->Runions.used = 0;
-        e->Runions.depth = 0;
-        e->Runions.more = 0;
-        e->Lunions.depth = 0;
-        e->Lunions.more = 0;
-
-        sub = forall_exists_subtype(x, y, e, 2);
-
+        e->Lunions.used = e->Runions.used = 0;
+        e->Lunions.depth = e->Runions.depth = 0;
+        e->Lunions.more = e->Runions.more = 0;
+        int count = 0, noRmore = 0;
+        sub = _forall_exists_subtype(x, y, e, param, &count, &noRmore);
         pop_unionstate(&e->Runions, &oldRunions);
+        // we should not try the slow path if `forall_exists_subtype` has tested all cases;
+        // When limit_slow == 1, also skip it if
+        // 1) `forall_exists_subtype` returns false
+        // 2) the left `Union` looks big
+        if (noRmore || (limit_slow && (count > 3  || !sub)))
+            e->Runions.more = oldRmore;
     }
     else {
-        int lastset = 0;
+        // slow path
+        e->Lunions.used = 0;
         while (1) {
             e->Lunions.more = 0;
             e->Lunions.depth = 0;
-            sub = subtype(x, y, e, 2);
-            int set = e->Lunions.more;
-            if (!sub || !set)
+            sub = subtype(x, y, e, param);
+            if (!sub || !next_union_state(e, 0))
                 break;
-            for (int i = set; i <= lastset; i++)
-                statestack_set(&e->Lunions, i, 0);
-            lastset = set - 1;
-            statestack_set(&e->Lunions, lastset, 1);
+        }
+    }
+    return sub;
+}
+
+static int forall_exists_equal(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
+{
+    if (obviously_egal(x, y)) return 1;
+
+    if ((is_indefinite_length_tuple_type(x) && is_definite_length_tuple_type(y)) ||
+        (is_definite_length_tuple_type(x) && is_indefinite_length_tuple_type(y)))
+        return 0;
+
+    if ((jl_is_uniontype(x) && jl_is_uniontype(y))) {
+        // For 2 unions, first try a more efficient greedy algorithm that compares the unions
+        // componentwise. If that fails, `exists_subtype` will remember that this branch should be skipped.
+        if (pick_union_decision(e, 1) == 0) {
+            return forall_exists_equal(((jl_uniontype_t *)x)->a, ((jl_uniontype_t *)y)->a, e) &&
+                   forall_exists_equal(((jl_uniontype_t *)x)->b, ((jl_uniontype_t *)y)->b, e);
         }
     }
 
+    jl_saved_unionstate_t oldLunions; push_unionstate(&oldLunions, &e->Lunions);
+
+    int limit_slow = !jl_has_free_typevars(x) || !jl_has_free_typevars(y);
+    int sub = local_forall_exists_subtype(x, y, e, 2, limit_slow) &&
+              local_forall_exists_subtype(y, x, e, 0, 0);
+
     pop_unionstate(&e->Lunions, &oldLunions);
-    return sub && subtype(y, x, e, 0);
+    return sub;
 }
 
 static int exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, jl_value_t *saved, jl_savedenv_t *se, int param)
 {
     e->Runions.used = 0;
-    int lastset = 0;
     while (1) {
         e->Runions.depth = 0;
         e->Runions.more = 0;
@@ -1444,8 +1463,7 @@ static int exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, jl_value_
         e->Lunions.more = 0;
         if (subtype(x, y, e, param))
             return 1;
-        int set = e->Runions.more;
-        if (set) {
+        if (next_union_state(e, 1)) {
             // We preserve `envout` here as `subtype_unionall` needs previous assigned env values.
             int oldidx = e->envidx;
             e->envidx = e->envsz;
@@ -1456,14 +1474,10 @@ static int exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, jl_value_
             restore_env(e, saved, se);
             return 0;
         }
-        for (int i = set; i <= lastset; i++)
-            statestack_set(&e->Runions, i, 0);
-        lastset = set - 1;
-        statestack_set(&e->Runions, lastset, 1);
     }
 }
 
-static int forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param)
+static int _forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param, int *count, int *noRmore)
 {
     // The depth recursion has the following shape, after simplification:
     // ∀₁
@@ -1475,19 +1489,17 @@ static int forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, in
     save_env(e, &saved, &se);
 
     e->Lunions.used = 0;
-    int lastset = 0;
     int sub;
+    if (count) *count = 0;
+    if (noRmore) *noRmore = 1;
     while (1) {
         sub = exists_subtype(x, y, e, saved, &se, param);
-        int set = e->Lunions.more;
-        if (!sub || !set)
+        if (count) *count = (*count < 4) ? *count + 1 : 4;
+        if (noRmore) *noRmore = *noRmore && e->Runions.more == 0;
+        if (!sub || !next_union_state(e, 0))
             break;
         free_env(&se);
         save_env(e, &saved, &se);
-        for (int i = set; i <= lastset; i++)
-            statestack_set(&e->Lunions, i, 0);
-        lastset = set - 1;
-        statestack_set(&e->Lunions, lastset, 1);
     }
 
     free_env(&se);
@@ -1495,6 +1507,11 @@ static int forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, in
     return sub;
 }
 
+static int forall_exists_subtype(jl_value_t *x, jl_value_t *y, jl_stenv_t *e, int param)
+{
+    return _forall_exists_subtype(x, y, e, param, NULL, NULL);
+}
+
 static void init_stenv(jl_stenv_t *e, jl_value_t **env, int envsz)
 {
     e->vars = NULL;
@@ -3326,39 +3343,30 @@ static jl_value_t *intersect_all(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
     jl_value_t **merged = &is[3];
     jl_savedenv_t se, me;
     save_env(e, saved, &se);
-    int lastset = 0, niter = 0, total_iter = 0;
-    jl_value_t *ii = intersect(x, y, e, 0);
-    is[0] = ii;  // root
+    int niter = 0, total_iter = 0;
+    is[0] = intersect(x, y, e, 0); // root
     if (is[0] != jl_bottom_type)
         niter = merge_env(e, merged, &me, niter);
     restore_env(e, *saved, &se);
-    while (e->Runions.more) {
-        if (e->emptiness_only && ii != jl_bottom_type)
+    while (next_union_state(e, 1)) {
+        if (e->emptiness_only && is[0] != jl_bottom_type)
             break;
         e->Runions.depth = 0;
-        int set = e->Runions.more - 1;
         e->Runions.more = 0;
-        statestack_set(&e->Runions, set, 1);
-        for (int i = set + 1; i <= lastset; i++)
-            statestack_set(&e->Runions, i, 0);
-        lastset = set;
 
-        is[0] = ii;
         is[1] = intersect(x, y, e, 0);
         if (is[1] != jl_bottom_type)
             niter = merge_env(e, merged, &me, niter);
         restore_env(e, *saved, &se);
         if (is[0] == jl_bottom_type)
-            ii = is[1];
-        else if (is[1] == jl_bottom_type)
-            ii = is[0];
-        else {
+            is[0] = is[1];
+        else if (is[1] != jl_bottom_type) {
             // TODO: the repeated subtype checks in here can get expensive
-            ii = jl_type_union(is, 2);
+            is[0] = jl_type_union(is, 2);
         }
         total_iter++;
         if (niter > 4 || total_iter > 400000) {
-            ii = y;
+            is[0] = y;
             break;
         }
     }
@@ -3368,7 +3376,7 @@ static jl_value_t *intersect_all(jl_value_t *x, jl_value_t *y, jl_stenv_t *e)
     }
     free_env(&se);
     JL_GC_POP();
-    return ii;
+    return is[0];
 }
 
 // type intersection entry points
diff --git a/src/task.c b/src/task.c
index 1772127391183..4bb5d666a073a 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1555,12 +1555,15 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi)
 #endif
         if (jl_setjmp(ptls->copy_stack_ctx.uc_mcontext, 0))
             start_task(); // sanitizer_finish_switch_fiber is part of start_task
-        return ct;
     }
-    ssize = JL_STACK_SIZE;
-    char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL);
-    ptls->stackbase = stkbuf + ssize;
-    ptls->stacksize = ssize;
+    else {
+        ssize = JL_STACK_SIZE;
+        char *stkbuf = jl_alloc_fiber(&ptls->base_ctx, &ssize, NULL);
+        if (stkbuf != NULL) {
+            ptls->stackbase = stkbuf + ssize;
+            ptls->stacksize = ssize;
+        }
+    }
 #endif
 
     if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON)
diff --git a/stdlib/LLD_jll/src/LLD_jll.jl b/stdlib/LLD_jll/src/LLD_jll.jl
index 80653353a7c17..a59d8deb8c7b5 100644
--- a/stdlib/LLD_jll/src/LLD_jll.jl
+++ b/stdlib/LLD_jll/src/LLD_jll.jl
@@ -70,8 +70,8 @@ end
 
 function init_lld_path()
     # Prefer our own bundled lld, but if we don't have one, pick it up off of the PATH
-    # If this is an in-tree build, `lld` will live in `tools`.  Otherwise, it'll be in `libexec`
-    for bundled_lld_path in (joinpath(Sys.BINDIR, Base.LIBEXECDIR, lld_exe),
+    # If this is an in-tree build, `lld` will live in `tools`.  Otherwise, it'll be in `private_libexecdir`
+    for bundled_lld_path in (joinpath(Sys.BINDIR, Base.PRIVATE_LIBEXECDIR, lld_exe),
                              joinpath(Sys.BINDIR, "..", "tools", lld_exe),
                              joinpath(Sys.BINDIR, lld_exe))
         if isfile(bundled_lld_path)
diff --git a/stdlib/LinearAlgebra/src/diagonal.jl b/stdlib/LinearAlgebra/src/diagonal.jl
index 291233ebe2e6a..6364169b2ba8a 100644
--- a/stdlib/LinearAlgebra/src/diagonal.jl
+++ b/stdlib/LinearAlgebra/src/diagonal.jl
@@ -355,6 +355,12 @@ function (*)(Da::Diagonal, A::AbstractMatrix, Db::Diagonal)
     return broadcast(*, Da.diag, A, permutedims(Db.diag))
 end
 
+function (*)(Da::Diagonal, Db::Diagonal, Dc::Diagonal)
+    _muldiag_size_check(Da, Db)
+    _muldiag_size_check(Db, Dc)
+    return Diagonal(Da.diag .* Db.diag .* Dc.diag)
+end
+
 # Get ambiguous method if try to unify AbstractVector/AbstractMatrix here using AbstractVecOrMat
 @inline mul!(out::AbstractVector, D::Diagonal, V::AbstractVector, alpha::Number, beta::Number) =
     _muldiag!(out, D, V, alpha, beta)
diff --git a/stdlib/LinearAlgebra/src/special.jl b/stdlib/LinearAlgebra/src/special.jl
index 8af8625a0e817..3974243d74a1f 100644
--- a/stdlib/LinearAlgebra/src/special.jl
+++ b/stdlib/LinearAlgebra/src/special.jl
@@ -408,9 +408,7 @@ const _TypedDenseConcatGroup{T} = Union{Vector{T}, Adjoint{T,Vector{T}}, Transpo
 promote_to_array_type(::Tuple{Vararg{Union{_DenseConcatGroup,UniformScaling}}}) = Matrix
 
 Base._cat(dims, xs::_DenseConcatGroup...) = Base._cat_t(dims, promote_eltype(xs...), xs...)
-vcat(A::Vector...) = Base.typed_vcat(promote_eltype(A...), A...)
 vcat(A::_DenseConcatGroup...) = Base.typed_vcat(promote_eltype(A...), A...)
-hcat(A::Vector...) = Base.typed_hcat(promote_eltype(A...), A...)
 hcat(A::_DenseConcatGroup...) = Base.typed_hcat(promote_eltype(A...), A...)
 hvcat(rows::Tuple{Vararg{Int}}, xs::_DenseConcatGroup...) = Base.typed_hvcat(promote_eltype(xs...), rows, xs...)
 # For performance, specially handle the case where the matrices/vectors have homogeneous eltype
diff --git a/stdlib/LinearAlgebra/src/uniformscaling.jl b/stdlib/LinearAlgebra/src/uniformscaling.jl
index 428acf469c9b2..8a776d6f3b2ce 100644
--- a/stdlib/LinearAlgebra/src/uniformscaling.jl
+++ b/stdlib/LinearAlgebra/src/uniformscaling.jl
@@ -403,10 +403,14 @@ promote_to_arrays(n,k, ::Type{T}, A, B, Cs...) where {T} =
     (promote_to_arrays_(n[k], T, A), promote_to_arrays_(n[k+1], T, B), promote_to_arrays(n,k+2, T, Cs...)...)
 promote_to_array_type(A::Tuple{Vararg{Union{AbstractVecOrMat,UniformScaling,Number}}}) = Matrix
 
+_us2number(A) = A
+_us2number(J::UniformScaling) = J.λ
+
 for (f, _f, dim, name) in ((:hcat, :_hcat, 1, "rows"), (:vcat, :_vcat, 2, "cols"))
     @eval begin
         @inline $f(A::Union{AbstractVecOrMat,UniformScaling}...) = $_f(A...)
-        @inline $f(A::Union{AbstractVecOrMat,UniformScaling,Number}...) = $_f(A...)
+        # if there's a Number present, J::UniformScaling must be 1x1-dimensional
+        @inline $f(A::Union{AbstractVecOrMat,UniformScaling,Number}...) = $f(map(_us2number, A)...)
         function $_f(A::Union{AbstractVecOrMat,UniformScaling,Number}...; array_type = promote_to_array_type(A))
             n = -1
             for a in A
diff --git a/stdlib/LinearAlgebra/test/diagonal.jl b/stdlib/LinearAlgebra/test/diagonal.jl
index 83a2a896e736c..130a66ea0a1d5 100644
--- a/stdlib/LinearAlgebra/test/diagonal.jl
+++ b/stdlib/LinearAlgebra/test/diagonal.jl
@@ -1133,4 +1133,15 @@ Base.size(::SMatrix1) = (1, 1)
     @test C isa Matrix{SMatrix1{String}}
 end
 
+@testset "diagonal triple multiplication (#49005)" begin
+    n = 10
+    @test *(Diagonal(ones(n)), Diagonal(1:n), Diagonal(ones(n))) isa Diagonal
+    @test_throws DimensionMismatch (*(Diagonal(ones(n)), Diagonal(1:n), Diagonal(ones(n+1))))
+    @test_throws DimensionMismatch (*(Diagonal(ones(n)), Diagonal(1:n+1), Diagonal(ones(n+1))))
+    @test_throws DimensionMismatch (*(Diagonal(ones(n+1)), Diagonal(1:n), Diagonal(ones(n))))
+
+    # currently falls back to two-term *
+    @test *(Diagonal(ones(n)), Diagonal(1:n), Diagonal(ones(n)), Diagonal(1:n)) isa Diagonal
+end
+
 end # module TestDiagonal
diff --git a/stdlib/Pkg.version b/stdlib/Pkg.version
index 05d8a6f9260d3..eec85ee1bae84 100644
--- a/stdlib/Pkg.version
+++ b/stdlib/Pkg.version
@@ -1,4 +1,4 @@
 PKG_BRANCH = release-1.9
-PKG_SHA1 = 3ced87de6b48ac8b886f5b26b2a1e8dd764614ae
+PKG_SHA1 = 1b73599d2ed8ef26ded339b1a3e80b6f26afd553
 PKG_GIT_URL := https://github.com/JuliaLang/Pkg.jl.git
 PKG_TAR_URL = https://api.github.com/repos/JuliaLang/Pkg.jl/tarball/$1
diff --git a/stdlib/REPL/src/REPL.jl b/stdlib/REPL/src/REPL.jl
index b2eb8cf63c8da..ef0ad50e74c52 100644
--- a/stdlib/REPL/src/REPL.jl
+++ b/stdlib/REPL/src/REPL.jl
@@ -1410,12 +1410,24 @@ function repl_eval_counter(hp)
 end
 
 function out_transform(@nospecialize(x), n::Ref{Int})
-    return quote
+    return Expr(:toplevel, get_usings!([], x)..., quote
         let __temp_val_a72df459 = $x
             $capture_result($n, __temp_val_a72df459)
             __temp_val_a72df459
         end
+    end)
+end
+
+function get_usings!(usings, ex)
+    # get all `using` and `import` statements which are at the top level
+    for (i, arg) in enumerate(ex.args)
+        if Base.isexpr(arg, :toplevel)
+            get_usings!(usings, arg)
+        elseif Base.isexpr(arg, [:using, :import])
+            push!(usings, popat!(ex.args, i))
+        end
     end
+    return usings
 end
 
 function capture_result(n::Ref{Int}, @nospecialize(x))
diff --git a/stdlib/REPL/test/repl.jl b/stdlib/REPL/test/repl.jl
index edcb91defc9ab..2cac474326679 100644
--- a/stdlib/REPL/test/repl.jl
+++ b/stdlib/REPL/test/repl.jl
@@ -1645,6 +1645,11 @@ fake_repl() do stdin_write, stdout_read, repl
     s = sendrepl2("x_47878 = range(-1; stop = 1)\n", "-1:1")
     @test contains(s, "Out[11]: -1:1")
 
+    # Test for https://github.com/JuliaLang/julia/issues/49041
+    s = sendrepl2("using Test; @test true", "In [14]")
+    @test !contains(s, "ERROR")
+    @test contains(s, "Test Passed")
+
     write(stdin_write, '\x04')
     Base.wait(repltask)
 end
diff --git a/stdlib/TOML/src/print.jl b/stdlib/TOML/src/print.jl
index 61d13a8f4853e..316a7a7259678 100644
--- a/stdlib/TOML/src/print.jl
+++ b/stdlib/TOML/src/print.jl
@@ -163,7 +163,8 @@ function print_table(f::MbyFunc, io::IO, a::AbstractDict,
         end
         if is_table(value)
             push!(ks, String(key))
-            header = isempty(value) || !all(is_tabular(v) for v in values(value))::Bool
+            _values = @invokelatest values(value)
+            header = isempty(value) || !all(is_tabular(v) for v in _values)::Bool
             if header
                 # print table
                 first_block || println(io)
diff --git a/stdlib/p7zip_jll/src/p7zip_jll.jl b/stdlib/p7zip_jll/src/p7zip_jll.jl
index 4320003b282f7..eaa709735c383 100644
--- a/stdlib/p7zip_jll/src/p7zip_jll.jl
+++ b/stdlib/p7zip_jll/src/p7zip_jll.jl
@@ -69,8 +69,8 @@ end
 
 function init_p7zip_path()
     # Prefer our own bundled p7zip, but if we don't have one, pick it up off of the PATH
-    # If this is an in-tree build, `7z` will live in `bin`.  Otherwise, it'll be in `libexec`
-    for bundled_p7zip_path in (joinpath(Sys.BINDIR, Base.LIBEXECDIR, p7zip_exe),
+    # If this is an in-tree build, `7z` will live in `bindir`.  Otherwise, it'll be in `private_libexecdir`
+    for bundled_p7zip_path in (joinpath(Sys.BINDIR, Base.PRIVATE_LIBEXECDIR, p7zip_exe),
                                joinpath(Sys.BINDIR, p7zip_exe))
         if isfile(bundled_p7zip_path)
             global p7zip_path = abspath(bundled_p7zip_path)
diff --git a/test/compiler/AbstractInterpreter.jl b/test/compiler/AbstractInterpreter.jl
index dfdf84d9c2fbe..f6d92ea24e8c8 100644
--- a/test/compiler/AbstractInterpreter.jl
+++ b/test/compiler/AbstractInterpreter.jl
@@ -47,14 +47,6 @@ import Base.Experimental: @MethodTable, @overlay
 @MethodTable(OverlayedMT)
 CC.method_table(interp::MTOverlayInterp) = CC.OverlayMethodTable(CC.get_world_counter(interp), OverlayedMT)
 
-function CC.add_remark!(interp::MTOverlayInterp, ::CC.InferenceState, remark)
-    if interp.meta !== nothing
-        # Core.println(remark)
-        push!(interp.meta, remark)
-    end
-    return nothing
-end
-
 strangesin(x) = sin(x)
 @overlay OverlayedMT strangesin(x::Float64) = iszero(x) ? nothing : cos(x)
 
@@ -74,21 +66,6 @@ end |> !Core.Compiler.is_nonoverlayed
     @invoke strangesin(x::Float64)
 end |> !Core.Compiler.is_nonoverlayed
 
-# account for overlay possibility in unanalyzed matching method
-callstrange(::Nothing) = Core.compilerbarrier(:type, nothing) # trigger inference bail out
-callstrange(::Float64) = strangesin(x)
-callstrange_entry(x) = callstrange(x) # needs to be defined here because of world age
-let interp = MTOverlayInterp(; meta=Set{Any}())
-    matches = Core.Compiler.findall(Tuple{typeof(callstrange),Any}, Core.Compiler.method_table(interp)).matches
-    @test Core.Compiler.length(matches) == 2
-    if Core.Compiler.getindex(matches, 1).method == which(callstrange, (Nothing,))
-        @test Base.infer_effects(callstrange_entry, (Any,); interp) |> !Core.Compiler.is_nonoverlayed
-        @test "Call inference reached maximally imprecise information. Bailing on." in interp.meta
-    else
-        @warn "`nonoverlayed` test for inference bailing out is skipped since the method match sort order is changed."
-    end
-end
-
 # but it should never apply for the native compilation
 @test Base.infer_effects((Float64,)) do x
     strangesin(x)
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
index a1ca7505fe80a..7366e56f34e7a 100644
--- a/test/compiler/inference.jl
+++ b/test/compiler/inference.jl
@@ -4293,3 +4293,39 @@ unknown_sparam_nothrow2(x::Ref{Ref{T}}) where T = @isdefined(T) ? T::Type : noth
 @test only(Base.return_types(unknown_sparam_throw, (Any,))) === Union{Nothing,Type}
 @test only(Base.return_types(unknown_sparam_nothrow1, (Ref,))) === Type
 @test only(Base.return_types(unknown_sparam_nothrow2, (Ref{Ref{T}} where T,))) === Type
+
+struct Issue49027{Ty<:Number}
+    x::Ty
+end
+function issue49027(::Type{<:Issue49027{Ty}}) where Ty
+    if @isdefined Ty # should be false when `Ty` is given as a free type var.
+        return Ty::DataType
+    end
+    return nothing
+end
+@test only(Base.return_types(issue49027, (Type{Issue49027{TypeVar(:Ty)}},))) >: Nothing
+@test isnothing(issue49027(Issue49027{TypeVar(:Ty)}))
+function issue49027_integer(::Type{<:Issue49027{Ty}}) where Ty<:Integer
+    if @isdefined Ty # should be false when `Ty` is given as a free type var.
+        return Ty::DataType
+    end
+    nothing
+end
+@test only(Base.return_types(issue49027_integer, (Type{Issue49027{TypeVar(:Ty,Int)}},))) >: Nothing
+@test isnothing(issue49027_integer(Issue49027{TypeVar(:Ty,Int)}))
+
+# Issue #47688: Abstract iteration should take into account `iterate` effects
+global it_count47688 = 0
+struct CountsIterate47688{N}; end
+function Base.iterate(::CountsIterate47688{N}, n=0) where N
+	global it_count47688 += 1
+	n <= N ? (n, n+1) : nothing
+end
+foo47688() = tuple(CountsIterate47688{5}()...)
+bar47688() = foo47688()
+@test only(Base.return_types(bar47688)) == NTuple{6, Int}
+@test it_count47688 == 0
+@test isa(bar47688(), NTuple{6, Int})
+@test it_count47688 == 7
+@test isa(foo47688(), NTuple{6, Int})
+@test it_count47688 == 14
diff --git a/test/core.jl b/test/core.jl
index f4e463cd61326..c42ac60248d3b 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -7898,3 +7898,6 @@ let spec = only(methods(g47476)).specializations
     @test any(mi -> mi !== nothing && Base.isvatuple(mi.specTypes), spec)
     @test all(mi -> mi === nothing || !Base.has_free_typevars(mi.specTypes), spec)
 end
+
+f48950(::Union{Int,d}, ::Union{c,Nothing}...) where {c,d} = 1
+@test f48950(1, 1, 1) == 1
diff --git a/test/osutils.jl b/test/osutils.jl
index 36f2878017129..5e72675279cbc 100644
--- a/test/osutils.jl
+++ b/test/osutils.jl
@@ -51,7 +51,7 @@ end
 if Sys.iswindows()
     @testset "path variables use correct path delimiters on windows" begin
         for path in (Base.SYSCONFDIR, Base.DATAROOTDIR, Base.DOCDIR,
-                     Base.LIBDIR, Base.PRIVATE_LIBDIR, Base.INCLUDEDIR, Base.LIBEXECDIR)
+                     Base.LIBDIR, Base.PRIVATE_LIBDIR, Base.INCLUDEDIR, Base.LIBEXECDIR, Base.PRIVATE_LIBEXECDIR)
             @test !occursin("/", path)
             @test !occursin("\\\\", path)
         end
diff --git a/test/subtype.jl b/test/subtype.jl
index d9bafa8138da2..3a46abb3348da 100644
--- a/test/subtype.jl
+++ b/test/subtype.jl
@@ -1406,6 +1406,8 @@ f24521(::Type{T}, ::Type{T}) where {T} = T
 @test !(Ref{Union{Int64, Val{Number}}} <: Ref{Union{Val{T}, T}} where T)
 @test !(Ref{Union{Ref{Number}, Int64}} <: Ref{Union{Ref{T}, T}} where T)
 @test !(Ref{Union{Val{Number}, Int64}} <: Ref{Union{Val{T}, T}} where T)
+@test !(Val{Ref{Union{Int64, Ref{Number}}}} <: Val{S} where {S<:Ref{Union{Ref{T}, T}} where T})
+@test !(Tuple{Ref{Union{Int64, Ref{Number}}}} <: Tuple{S} where {S<:Ref{Union{Ref{T}, T}} where T})
 
 # issue #26180
 @test !(Ref{Union{Ref{Int64}, Ref{Number}}} <: Ref{Ref{T}} where T)
@@ -2270,8 +2272,8 @@ abstract type P47654{A} end
     @test_broken typeintersect(Tuple{Vector{VT}, Vector{VT}} where {N1, VT<:AbstractVector{N1}},
                 Tuple{Vector{VN} where {N, VN<:AbstractVector{N}}, Vector{Vector{Float64}}}) !== Union{}
     #issue 40865
-    @test_broken Tuple{Set{Ref{Int}}, Set{Ref{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Ref{K}}}
-    @test_broken Tuple{Set{Val{Int}}, Set{Val{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Val{K}}}
+    @test Tuple{Set{Ref{Int}}, Set{Ref{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Ref{K}}}
+    @test Tuple{Set{Val{Int}}, Set{Val{Int}}} <: Tuple{Set{KV}, Set{K}} where {K,KV<:Union{K,Val{K}}}
 
     #issue 39099
     A = Tuple{Tuple{Int, Int, Vararg{Int, N}}, Tuple{Int, Vararg{Int, N}}, Tuple{Vararg{Int, N}}} where N
@@ -2308,7 +2310,18 @@ end
 
 # try to fool a greedy algorithm that picks X=Int, Y=String here
 @test Tuple{Ref{Union{Int,String}}, Ref{Union{Int,String}}} <: Tuple{Ref{Union{X,Y}}, Ref{X}} where {X,Y}
-# this slightly more complex case has been broken since 1.0 (worked in 0.6)
-@test_broken Tuple{Ref{Union{Int,String,Missing}}, Ref{Union{Int,String}}} <: Tuple{Ref{Union{X,Y}}, Ref{X}} where {X,Y}
+@test Tuple{Ref{Union{Int,String,Missing}}, Ref{Union{Int,String}}} <: Tuple{Ref{Union{X,Y}}, Ref{X}} where {X,Y}
 
 @test !(Tuple{Any, Any, Any} <: Tuple{Any, Vararg{T}} where T)
+
+let a = (isodd(i) ? Pair{Char, String} : Pair{String, String} for i in 1:2000)
+    @test Tuple{Type{Pair{Union{Char, String}, String}}, a...} <: Tuple{Type{Pair{K, V}}, Vararg{Pair{A, B} where B where A}} where V where K
+end
+
+#issue 48582
+@test !<:(Tuple{Pair{<:T,<:T}, Val{S} where {S}} where {T<:Base.BitInteger},
+          Tuple{Pair{<:T,<:T}, Val{Int}} where {T<:Base.BitInteger})
+
+# requires assertions enabled (to test union-split in `obviously_disjoint`)
+@test !<:(Tuple{Type{Int}, Int}, Tuple{Type{Union{Int, T}}, T} where T<:Union{Int8,Int16})
+@test <:(Tuple{Type{Int}, Int}, Tuple{Type{Union{Int, T}}, T} where T<:Union{Int8,Int})
diff --git a/test/threadpool_use.jl b/test/threadpool_use.jl
index 47c45bdd71eb8..64227c8a8110b 100644
--- a/test/threadpool_use.jl
+++ b/test/threadpool_use.jl
@@ -6,11 +6,6 @@ using Base.Threads
 @test nthreadpools() == 2
 @test threadpool() === :default
 @test threadpool(2) === :interactive
-dtask() = @test threadpool(current_task()) === :default
-itask() = @test threadpool(current_task()) === :interactive
-dt1 = @spawn dtask()
-dt2 = @spawn :default dtask()
-it = @spawn :interactive itask()
-wait(dt1)
-wait(dt2)
-wait(it)
+@test fetch(Threads.@spawn Threads.threadpool()) === :default
+@test fetch(Threads.@spawn :default Threads.threadpool()) === :default
+@test fetch(Threads.@spawn :interactive Threads.threadpool()) === :interactive
diff --git a/test/worlds.jl b/test/worlds.jl
index 39a9dc4d9a788..a2baa741b592a 100644
--- a/test/worlds.jl
+++ b/test/worlds.jl
@@ -408,3 +408,27 @@ wc_aiw2 = get_world_counter()
 @test Base.invoke_in_world(wc_aiw2, f_inworld, 2) == "world two; x=2"
 @test Base.invoke_in_world(wc_aiw1, g_inworld, 2, y=3) == "world one; x=2, y=3"
 @test Base.invoke_in_world(wc_aiw2, g_inworld, 2, y=3) == "world two; x=2, y=3"
+
+# logging
+mc48954(x, y) = false
+mc48954(x::Int, y::Int) = x == y
+mc48954(x::Symbol, y::Symbol) = x == y
+function mcc48954(container, y)
+    x = container[1]
+    return mc48954(x, y)
+end
+
+mcc48954(Any[1], 1)
+mc48954i = method_instance(mc48954, (Any, Int))
+mcc48954i = method_instance(mcc48954, (Vector{Any}, Int))
+list48954 = ccall(:jl_debug_method_invalidation, Any, (Cint,), 1)
+mc48954(x::AbstractFloat, y::Int) = x == y
+ccall(:jl_debug_method_invalidation, Any, (Cint,), 0)
+@test list48954 == [
+    mcc48954i,
+    1,
+    mc48954i,
+    "jl_method_table_insert",
+    which(mc48954, (AbstractFloat, Int)),
+    "jl_method_table_insert"
+]