From 435bf88a3386500d1c8bac051e0c8b93d45745e2 Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Sat, 18 Jul 2020 19:35:56 -0400
Subject: [PATCH] Propagate iteration info to optimizer (#36684)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This supersedes #36169. Rather than re-implementing the iteration
analysis as done there, this uses the new stmtinfo infrastructure to
propagate all the analysis done during inference all the way to
inlining. As a result, it applies not only to splats of singletons,
but also to splats of any other short iterable that inference can
analyze. E.g.:

```
f(x) = (x...,)
@code_typed f(1=>2)
@benchmark f(1=>2)
```

Before:
```
julia> @code_typed f(1=>2)
CodeInfo(
1 ─ %1 = Core._apply_iterate(Base.iterate, Core.tuple, x)::Tuple{Int64,Int64}
└──      return %1
) => Tuple{Int64,Int64}

julia> @benchmark f(1=>2)
BenchmarkTools.Trial:
  memory estimate:  96 bytes
  allocs estimate:  3
  --------------
  minimum time:     242.659 ns (0.00% GC)
  median time:      246.904 ns (0.00% GC)
  mean time:        255.390 ns (1.08% GC)
  maximum time:     4.415 μs (93.94% GC)
  --------------
  samples:          10000
  evals/sample:     405
```

After:
```
julia> @code_typed f(1=>2)
CodeInfo(
1 ─ %1 = Base.getfield(x, 1)::Int64
│   %2 = Base.getfield(x, 2)::Int64
│   %3 = Core.tuple(%1, %2)::Tuple{Int64,Int64}
└──      return %3
) => Tuple{Int64,Int64}

julia> @benchmark f(1=>2)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.701 ns (0.00% GC)
  median time:      1.925 ns (0.00% GC)
  mean time:        1.904 ns (0.00% GC)
  maximum time:     6.941 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000
```

I also implemented the TODO I had left in #36169 to inline the
`iterate` calls themselves, which gives another 3x improvement over
the solution in that PR:

```
julia> @code_typed f(1)
CodeInfo(
1 ─ %1 = Core.tuple(x)::Tuple{Int64}
└──      return %1
) => Tuple{Int64}

julia> @benchmark f(1)
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.696 ns (0.00% GC)
  median time:      1.699 ns (0.00% GC)
  mean time:        1.702 ns (0.00% GC)
  maximum time:     5.389 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000
```

Fixes #36087
Fixes #29114
---
 base/compiler/abstractinterpretation.jl |  78 +++---
 base/compiler/ssair/inlining.jl         | 319 ++++++++++++++----------
 base/compiler/ssair/ir.jl               |  18 +-
 base/compiler/ssair/passes.jl           |   4 +-
 base/compiler/stmtinfo.jl               |  68 ++++-
 test/compiler/inline.jl                 |  13 +
 6 files changed, 321 insertions(+), 179 deletions(-)

diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl
index 298e3ccd65728..03a8530c7aead 100644
--- a/base/compiler/abstractinterpretation.jl
+++ b/base/compiler/abstractinterpretation.jl
@@ -78,7 +78,7 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
                 push!(fullmatch, thisfullmatch)
             end
         end
-        info = UnionSplitInfo(splitsigs, infos)
+        info = UnionSplitInfo(infos)
     else
         mt = ccall(:jl_method_table_for, Any, (Any,), atype)
         if mt === nothing
@@ -505,13 +505,13 @@ end
 # returns an array of types
 function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft), @nospecialize(typ), vtypes::VarTable, sv::InferenceState)
     if isa(typ, PartialStruct) && typ.typ.name === Tuple.name
-        return typ.fields
+        return typ.fields, nothing
     end

     if isa(typ, Const)
         val = typ.val
         if isa(val, SimpleVector) || isa(val, Tuple)
-            return Any[ Const(val[i]) for i in 1:length(val) ] # avoid making a tuple Generator here!
+            return Any[ Const(val[i]) for i in 1:length(val) ], nothing # avoid making a tuple Generator here!
         end
     end
@@ -529,27 +529,27 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft)
     if isa(tti, Union)
         utis = uniontypes(tti)
         if _any(t -> !isa(t, DataType) || !(t <: Tuple) || !isknownlength(t), utis)
-            return Any[Vararg{Any}]
+            return Any[Vararg{Any}], nothing
         end
         result = Any[rewrap_unionall(p, tti0) for p in utis[1].parameters]
         for t in utis[2:end]
             if length(t.parameters) != length(result)
-                return Any[Vararg{Any}]
+                return Any[Vararg{Any}], nothing
             end
             for j in 1:length(t.parameters)
                 result[j] = tmerge(result[j], rewrap_unionall(t.parameters[j], tti0))
             end
         end
-        return result
+        return result, nothing
     elseif tti0 <: Tuple
         if isa(tti0, DataType)
             if isvatuple(tti0) && length(tti0.parameters) == 1
-                return Any[Vararg{unwrapva(tti0.parameters[1])}]
+                return Any[Vararg{unwrapva(tti0.parameters[1])}], nothing
             else
-                return Any[ p for p in tti0.parameters ]
+                return Any[ p for p in tti0.parameters ], nothing
             end
         elseif !isa(tti, DataType)
-            return Any[Vararg{Any}]
+            return Any[Vararg{Any}], nothing
         else
             len = length(tti.parameters)
             last = tti.parameters[len]
@@ -558,12 +558,12 @@ function precise_container_type(interp::AbstractInterpreter, @nospecialize(itft)
             if va
                 elts[len] = Vararg{elts[len]}
             end
-            return elts
+            return elts, nothing
         end
     elseif tti0 === SimpleVector || tti0 === Any
-        return Any[Vararg{Any}]
+        return Any[Vararg{Any}], nothing
     elseif tti0 <: Array
-        return Any[Vararg{eltype(tti0)}]
+        return Any[Vararg{eltype(tti0)}], nothing
     else
         return abstract_iteration(interp, itft, typ, vtypes, sv)
     end
@@ -572,7 +572,7 @@ end
 # simulate iteration protocol on container type up to fixpoint
 function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @nospecialize(itertype), vtypes::VarTable, sv::InferenceState)
     if !isdefined(Main, :Base) || !isdefined(Main.Base, :iterate) || !isconst(Main.Base, :iterate)
-        return Any[Vararg{Any}]
+        return Any[Vararg{Any}], nothing
     end
     if itft === nothing
         iteratef = getfield(Main.Base, :iterate)
@@ -580,22 +580,26 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
     elseif isa(itft, Const)
         iteratef = itft.val
     else
-        return Any[Vararg{Any}]
+        return Any[Vararg{Any}], nothing
     end
     @assert !isvarargtype(itertype)
-    stateordonet = abstract_call_known(interp, iteratef, nothing, Any[itft, itertype], vtypes, sv).rt
+    call = abstract_call_known(interp, iteratef, nothing, Any[itft, itertype], vtypes, sv)
+    stateordonet = call.rt
+    info = call.info
     # Return Bottom if this is not an iterator.
     # WARNING: Changes to the iteration protocol must be reflected here,
     # this is not just an optimization.
-    stateordonet === Bottom && return Any[Bottom]
+    stateordonet === Bottom && return Any[Bottom], AbstractIterationInfo(CallMeta[CallMeta(Bottom, info)])
     valtype = statetype = Bottom
     ret = Any[]
+    calls = CallMeta[call]
+
     # Try to unroll the iteration up to MAX_TUPLE_SPLAT, which covers any finite
     # length iterators, or interesting prefix
     while true
         stateordonet_widened = widenconst(stateordonet)
         if stateordonet_widened === Nothing
-            return ret
+            return ret, AbstractIterationInfo(calls)
         end
         if Nothing <: stateordonet_widened || length(ret) >= InferenceParams(interp).MAX_TUPLE_SPLAT
             break
@@ -607,12 +611,14 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
         # If there's no new information in this statetype, don't bother continuing,
         # the iterator won't be finite.
         if nstatetype ⊑ statetype
-            return Any[Bottom]
+            return Any[Bottom], nothing
         end
         valtype = getfield_tfunc(stateordonet, Const(1))
         push!(ret, valtype)
         statetype = nstatetype
-        stateordonet = abstract_call_known(interp, iteratef, nothing, Any[Const(iteratef), itertype, statetype], vtypes, sv).rt
+        call = abstract_call_known(interp, iteratef, nothing, Any[Const(iteratef), itertype, statetype], vtypes, sv)
+        stateordonet = call.rt
+        push!(calls, call)
     end
     # From here on, we start asking for results on the widened types, rather than
     # the precise (potentially const) state type
@@ -629,7 +635,7 @@ function abstract_iteration(interp::AbstractInterpreter, @nospecialize(itft), @n
         if nounion.parameters[1] <: valtype && nounion.parameters[2] <: statetype
             if typeintersect(stateordonet, Nothing) === Union{}
                 # Reached a fixpoint, but Nothing is not possible => iterator is infinite or failing
-                return Any[Bottom]
+                return Any[Bottom], nothing
             end
             break
         end
@@ -637,7 +643,7 @@
         statetype = tmerge(statetype, nounion.parameters[2])
     end
     push!(ret, Vararg{valtype})
-    return ret
+    return ret, nothing
 end

 # do apply(af, fargs...), where af is a function value
@@ -656,13 +662,15 @@
     nargs = length(aargtypes)
     splitunions = 1 < countunionsplit(aargtypes) <= InferenceParams(interp).MAX_APPLY_UNION_ENUM
     ctypes = Any[Any[aft]]
+    infos = [Union{Nothing, AbstractIterationInfo}[]]
     for i = 1:nargs
         ctypes´ = []
+        infos′ = []
         for ti in (splitunions ? uniontypes(aargtypes[i]) : Any[aargtypes[i]])
             if !isvarargtype(ti)
-                cti = precise_container_type(interp, itft, ti, vtypes, sv)
+                cti, info = precise_container_type(interp, itft, ti, vtypes, sv)
             else
-                cti = precise_container_type(interp, itft, unwrapva(ti), vtypes, sv)
+                cti, info = precise_container_type(interp, itft, unwrapva(ti), vtypes, sv)
                 # We can't represent a repeating sequence of the same types,
                 # so tmerge everything together to get one type that represents
                 # everything.
@@ -678,19 +686,29 @@ function abstract_apply(interp::AbstractInterpreter, @nospecialize(itft), @nospe
             if _any(t -> t === Bottom, cti)
                 continue
             end
-            for ct in ctypes
+            for j = 1:length(ctypes)
+                ct = ctypes[j]
                 if isvarargtype(ct[end])
+                    # This is vararg, we're not gonna be able to do any inlining,
+                    # drop the info
+                    info = nothing
+
                     tail = tuple_tail_elem(unwrapva(ct[end]), cti)
                     push!(ctypes´, push!(ct[1:(end - 1)], tail))
                 else
                     push!(ctypes´, append!(ct[:], cti))
                 end
+                push!(infos′, push!(copy(infos[j]), info))
             end
         end
         ctypes = ctypes´
+        infos = infos′
     end
-    local info = nothing
-    for ct in ctypes
+    retinfos = ApplyCallInfo[]
+    retinfo = UnionSplitApplyCallInfo(retinfos)
+    for i = 1:length(ctypes)
+        ct = ctypes[i]
+        arginfo = infos[i]
         lct = length(ct)
         # truncate argument list at the first Vararg
         for i = 1:lct-1
@@ -701,15 +719,17 @@
            end
        end
        call = abstract_call(interp, nothing, ct, vtypes, sv, max_methods)
-        info = call.info
+        push!(retinfos, ApplyCallInfo(call.info, arginfo))
        res = tmerge(res, call.rt)
        if res === Any
+            # No point carrying forward the info, we're not gonna inline it anyway
+            retinfo = nothing
            break
        end
    end
    # TODO: Add a special info type to capture all the iteration info.
    # For now, only propagate info if we don't also union-split the iteration
-    return CallMeta(res, length(ctypes) == 1 ? info : false)
+    return CallMeta(res, retinfo)
 end

 function is_method_pure(method::Method, @nospecialize(sig), sparams::SimpleVector)
@@ -779,7 +799,7 @@ function abstract_call_builtin(interp::AbstractInterpreter, f::Builtin, fargs::U
     end
     rt = builtin_tfunction(interp, f, argtypes[2:end], sv)
     if f === getfield && isa(fargs, Vector{Any}) && la == 3 && isa(argtypes[3], Const) && isa(argtypes[3].val, Int) && argtypes[2] ⊑ Tuple
-        cti = precise_container_type(interp, nothing, argtypes[2], vtypes, sv)
+        cti, _ = precise_container_type(interp, nothing, argtypes[2], vtypes, sv)
         idx = argtypes[3].val
         if 1 <= idx <= length(cti)
             rt = unwrapva(cti[idx])
diff --git a/base/compiler/ssair/inlining.jl b/base/compiler/ssair/inlining.jl
index 91458186a1c6b..6a05c17d12998 100644
--- a/base/compiler/ssair/inlining.jl
+++ b/base/compiler/ssair/inlining.jl
@@ -330,7 +330,7 @@ function ir_inline_item!(compact::IncrementalCompact, idx::Int, argexprs::Vector
     terminator = item.ir[SSAValue(last(inline_cfg.blocks[1].stmts))]
     #compact[idx] = nothing
     inline_compact = IncrementalCompact(compact, item.ir, compact.result_idx)
-    for (idx′, stmt′) in inline_compact
+    for ((_, idx′), stmt′) in inline_compact
         # This dance is done to maintain accurate usage counts in the
         # face of rename_arguments! mutating in place - should figure out
         # something better eventually.
@@ -360,7 +360,7 @@
         pn = PhiNode()
         #compact[idx] = nothing
         inline_compact = IncrementalCompact(compact, item.ir, compact.result_idx)
-        for (idx′, stmt′) in inline_compact
+        for ((_, idx′), stmt′) in inline_compact
             inline_compact[idx′] = nothing
             stmt′ = ssa_substitute!(idx′, stmt′, argexprs, item.method.sig, item.sparams, linetable_offset, boundscheck_idx, compact)
             if isa(stmt′, ReturnNode)
@@ -529,8 +529,8 @@ function batch_inline!(todo::Vector{Any}, ir::IRCode, linetable::Vector{LineInfo
             resize!(compact, nnewnodes)
             item = popfirst!(todo)
             inline_idx = item.idx
-            for (idx, stmt) in compact
-                if compact.idx - 1 == inline_idx
+            for ((old_idx, idx), stmt) in compact
+                if old_idx == inline_idx
                     argexprs = copy(stmt.args)
                     refinish = false
                     if compact.result_idx == first(compact.result_bbs[compact.active_result_bb].stmts)
@@ -550,7 +550,7 @@
                         end
                     end
                     if isa(item, InliningTodo)
-                        compact.ssa_rename[compact.idx-1] = ir_inline_item!(compact, idx, argexprs, linetable, item, boundscheck, state.todo_bbs)
+                        compact.ssa_rename[old_idx] = ir_inline_item!(compact, idx, argexprs, linetable, item, boundscheck, state.todo_bbs)
                     elseif isa(item, UnionSplit)
                         ir_inline_unionsplit!(compact, idx, argexprs, linetable, item, boundscheck, state.todo_bbs)
                     end
@@ -596,49 +596,77 @@ function spec_lambda(@nospecialize(atype), sv::OptimizationState, @nospecialize(
 end

 # This assumes the caller has verified that all arguments to the _apply call are Tuples.
-function rewrite_apply_exprargs!(ir::IRCode, idx::Int, argexprs::Vector{Any}, atypes::Vector{Any}, arg_start::Int)
+function rewrite_apply_exprargs!(ir::IRCode, todo::Vector{Any}, idx::Int, argexprs::Vector{Any}, atypes::Vector{Any}, arginfos::Vector{Any}, arg_start::Int, sv::OptimizationState)
     new_argexprs = Any[argexprs[arg_start]]
     new_atypes = Any[atypes[arg_start]]
     # loop over original arguments and flatten any known iterators
     for i in (arg_start+1):length(argexprs)
         def = argexprs[i]
         def_type = atypes[i]
-        if def_type isa PartialStruct
-            # def_type.typ <: Tuple is assumed
-            def_atypes = def_type.fields
-        else
-            def_atypes = Any[]
-            if isa(def_type, Const) # && isa(def_type.val, Union{Tuple, SimpleVector}) is implied
-                for p in def_type.val
-                    push!(def_atypes, Const(p))
-                end
+        thisarginfo = arginfos[i-arg_start]
+        if thisarginfo === nothing
+            if def_type isa PartialStruct
+                # def_type.typ <: Tuple is assumed
+                def_atypes = def_type.fields
             else
-                ti = widenconst(def_type)
-                if ti.name === NamedTuple_typename
-                    ti = ti.parameters[2]
-                end
-                for p in ti.parameters
-                    if isa(p, DataType) && isdefined(p, :instance)
-                        # replace singleton types with their equivalent Const object
-                        p = Const(p.instance)
-                    elseif isconstType(p)
-                        p = Const(p.parameters[1])
+                def_atypes = Any[]
+                if isa(def_type, Const) # && isa(def_type.val, Union{Tuple, SimpleVector}) is implied
+                    for p in def_type.val
+                        push!(def_atypes, Const(p))
+                    end
+                else
+                    ti = widenconst(def_type)
+                    if ti.name === NamedTuple_typename
+                        ti = ti.parameters[2]
+                    end
+                    for p in ti.parameters
+                        if isa(p, DataType) && isdefined(p, :instance)
+                            # replace singleton types with their equivalent Const object
+                            p = Const(p.instance)
+                        elseif isconstType(p)
+                            p = Const(p.parameters[1])
+                        end
+                        push!(def_atypes, p)
                     end
-                    push!(def_atypes, p)
                 end
             end
-        end
-        # now push flattened types into new_atypes and getfield exprs into new_argexprs
-        for j in 1:length(def_atypes)
-            def_atype = def_atypes[j]
-            if isa(def_atype, Const) && is_inlineable_constant(def_atype.val)
-                new_argexpr = quoted(def_atype.val)
-            else
-                new_call = Expr(:call, Core.getfield, def, j)
-                new_argexpr = insert_node!(ir, idx, def_atype, new_call)
+            # now push flattened types into new_atypes and getfield exprs into new_argexprs
+            for j in 1:length(def_atypes)
+                def_atype = def_atypes[j]
+                if isa(def_atype, Const) && is_inlineable_constant(def_atype.val)
+                    new_argexpr = quoted(def_atype.val)
+                else
+                    new_call = Expr(:call, GlobalRef(Core, :getfield), def, j)
+                    new_argexpr = insert_node!(ir, idx, def_atype, new_call)
+                end
+                push!(new_argexprs, new_argexpr)
+                push!(new_atypes, def_atype)
+            end
+        else
+            state = Core.svec()
+            for i = 1:length(thisarginfo.each)
+                call = thisarginfo.each[i]
+                new_stmt = Expr(:call, argexprs[2], def, state...)
+                state1 = insert_node!(ir, idx, call.rt, new_stmt)
+                new_sig = with_atype(call_sig(ir, new_stmt))
+                if isa(call.info, MethodMatchInfo) || isa(call.info, UnionSplitInfo)
+                    info = isa(call.info, MethodMatchInfo) ?
+                        MethodMatchInfo[call.info] : call.info.matches
+                    # See if we can inline this call to `iterate`
+                    analyze_single_call!(ir, todo, state1.id, new_stmt,
+                        new_sig, call.rt, info, sv)
+                end
+                if i != length(thisarginfo.each)
+                    valT = getfield_tfunc(call.rt, Const(1))
+                    val_extracted = insert_node!(ir, idx, valT,
+                        Expr(:call, GlobalRef(Core, :getfield), state1, 1))
+                    push!(new_argexprs, val_extracted)
+                    push!(new_atypes, valT)
+                    state_extracted = insert_node!(ir, idx, getfield_tfunc(call.rt, Const(2)),
+                        Expr(:call, GlobalRef(Core, :getfield), state1, 2))
+                    state = Core.svec(state_extracted)
+                end
             end
-            push!(new_argexprs, new_argexpr)
-            push!(new_atypes, def_atype)
         end
     end
     return new_argexprs, new_atypes
@@ -876,9 +904,23 @@ function call_sig(ir::IRCode, stmt::Expr)
     Signature(f, ft, atypes)
 end

-function inline_apply!(ir::IRCode, idx::Int, sig::Signature, params::OptimizationParams)
+function inline_apply!(ir::IRCode, todo::Vector{Any}, idx::Int, sig::Signature,
+                       params::OptimizationParams, sv::OptimizationState)
     stmt = ir.stmts[idx][:inst]
     while sig.f === Core._apply || sig.f === Core._apply_iterate
+        info = ir.stmts[idx][:info]
+        if isa(info, UnionSplitApplyCallInfo)
+            if length(info.infos) != 1
+                # TODO: Handle union split applies?
+                new_info = info = nothing
+            else
+                info = info.infos[1]
+                new_info = info.call
+            end
+        else
+            @assert info === nothing || info === false
+            new_info = info = nothing
+        end
         arg_start = sig.f === Core._apply ? 2 : 3
         atypes = sig.atypes
         if arg_start > length(atypes)
@@ -906,15 +948,22 @@
         end
         # Try to figure out the signature of the function being called
         # and if rewrite_apply_exprargs can deal with this form
+        infos = Any[]
         for i = (arg_start + 1):length(atypes)
-            # TODO: We could basically run the iteration protocol here
+            thisarginfo = nothing
             if !is_valid_type_for_apply_rewrite(atypes[i], params)
-                return nothing
+                if isa(info, ApplyCallInfo) && info.arginfo[i-arg_start] !== nothing
+                    thisarginfo = info.arginfo[i-arg_start]
+                else
+                    return nothing
+                end
             end
+            push!(infos, thisarginfo)
         end
         # Independent of whether we can inline, the above analysis allows us to rewrite
         # this apply call to a regular call
-        stmt.args, atypes = rewrite_apply_exprargs!(ir, idx, stmt.args, atypes, arg_start)
+        stmt.args, atypes = rewrite_apply_exprargs!(ir, todo, idx, stmt.args, atypes, infos, arg_start, sv)
+        ir.stmts[idx][:info] = new_info
         has_free_typevars(ft) && return nothing
         f = singleton_type(ft)
         sig = Signature(f, ft, atypes)
@@ -945,7 +994,7 @@ end
 # Handles all analysis and inlining of intrinsics and builtins. In particular,
 # this method does not access the method table or otherwise process generic
 # functions.
-function process_simple!(ir::IRCode, idx::Int, params::OptimizationParams, world::UInt)
+function process_simple!(ir::IRCode, todo, idx::Int, params::OptimizationParams, world::UInt, sv)
     stmt = ir.stmts[idx][:inst]
     stmt isa Expr || return nothing
     if stmt.head === :splatnew
@@ -959,7 +1008,7 @@
     sig === nothing && return nothing

     # Handle _apply
-    sig = inline_apply!(ir, idx, sig, params)
+    sig = inline_apply!(ir, todo, idx, sig, params, sv)
     sig === nothing && return nothing

     # Check if we match any of the early inliners
@@ -997,7 +1046,7 @@ end
 # This is not currently called in the regular course, but may be needed
 # if we ever want to re-run inlining again later in the pass pipeline after
 # additional type information was discovered.
-function recompute_method_matches(atype, sv)
+function recompute_method_matches(@nospecialize(atype), sv::OptimizationState)
     # Regular case: Retrieve matching methods from cache (or compute them)
     # World age does not need to be taken into account in the cache
     # because it is forwarded from type inference through `sv.params`
@@ -1010,13 +1059,97 @@
     MethodMatchInfo(meth, ambig)
 end

+function analyze_single_call!(ir::IRCode, todo::Vector{Any}, idx::Int, @nospecialize(stmt),
+        sig::Signature, @nospecialize(calltype), infos::Vector{MethodMatchInfo}, sv::OptimizationState)
+    cases = Pair{Any, Any}[]
+    signature_union = Union{}
+    only_method = nothing # keep track of whether there is one matching method
+    too_many = false
+    local meth
+    local fully_covered = true
+    for i in 1:length(infos)
+        info = infos[i]
+        meth = info.applicable
+        if meth === false || info.ambig
+            # Too many applicable methods
+            # Or there is a (partial?) ambiguity
+            too_many = true
+            break
+        elseif length(meth) == 0
+            # No applicable methods; try next union split
+            continue
+        elseif length(meth) == 1 && only_method !== false
+            if only_method === nothing
+                only_method = meth[1][3]
+            elseif only_method !== meth[1][3]
+                only_method = false
+            end
+        else
+            only_method = false
+        end
+        for match in meth::Vector{Any}
+            (metharg, methsp, method) = (match[1]::Type, match[2]::SimpleVector, match[3]::Method)
+            signature_union = Union{signature_union, metharg}
+            if !isdispatchtuple(metharg)
+                fully_covered = false
+                continue
+            end
+            case_sig = Signature(sig.f, sig.ft, sig.atypes, metharg)
+            case = analyze_method!(idx, case_sig, metharg, methsp, method,
+                stmt, sv, false, nothing, calltype)
+            if case === nothing
+                fully_covered = false
+                continue
+            elseif _any(p->p[1] === metharg, cases)
+                continue
+            end
+            push!(cases, Pair{Any,Any}(metharg, case))
+        end
+    end
+
+    too_many && return
+
+    signature_fully_covered = sig.atype <: signature_union
+    # If we're fully covered and there's only one applicable method,
+    # we inline, even if the signature is not a dispatch tuple
+    if signature_fully_covered && length(cases) == 0 && only_method isa Method
+        if length(infos) > 1
+            method = only_method
+            (metharg, methsp) = ccall(:jl_type_intersection_with_env, Any, (Any, Any),
+                sig.atype, method.sig)::SimpleVector
+        else
+            @assert length(meth) == 1
+            (metharg, methsp, method) = (meth[1][1]::Type, meth[1][2]::SimpleVector, meth[1][3]::Method)
+        end
+        fully_covered = true
+        case = analyze_method!(idx, sig, metharg, methsp, method,
+            stmt, sv, false, nothing, calltype)
+        case === nothing && return
+        push!(cases, Pair{Any,Any}(metharg, case))
+    end
+    if !signature_fully_covered
+        fully_covered = false
+    end
+
+    # If we only have one case and that case is fully covered, we may either
+    # be able to do the inlining now (for constant cases), or push it directly
+    # onto the todo list
+    if fully_covered && length(cases) == 1
+        handle_single_case!(ir, stmt, idx, cases[1][2], false, todo)
+        return
+    end
+    length(cases) == 0 && return
+    push!(todo, UnionSplit(idx, fully_covered, sig.atype, cases))
+    return nothing
+end
+
 function assemble_inline_todo!(ir::IRCode, sv::OptimizationState)
     # todo = (inline_idx, (isva, isinvoke, na), method, spvals, inline_linetable, inline_ir, lie)
     todo = Any[]
     skip = find_throw_blocks(ir.stmts.inst, RefValue(ir))
     for idx in 1:length(ir.stmts)
         idx in skip && continue
-        r = process_simple!(ir, idx, sv.params, sv.world)
+        r = process_simple!(ir, todo, idx, sv.params, sv.world, sv)
         r === nothing && continue

         stmt = ir.stmts[idx][:inst]
@@ -1039,107 +1172,21 @@
         nu = countunionsplit(sig.atypes)
         if nu == 1 || nu > sv.params.MAX_UNION_SPLITTING
             if !isa(info, MethodMatchInfo)
-                info = nothing
+                info = recompute_method_matches(sig.atype, sv)
             end
-            infos = Any[info]
-            splits = Any[sig.atype]
+            infos = MethodMatchInfo[info]
         else
             if !isa(info, UnionSplitInfo)
-                splits = Any[]
+                infos = MethodMatchInfo[]
                 for union_sig in UnionSplitSignature(sig.atypes)
-                    push!(splits, argtypes_to_type(union_sig))
+                    push!(infos, recompute_method_matches(union_sig, sv))
                 end
-                infos = Any[nothing for i = 1:length(splits)]
             else
-                splits = info.sigs
                 infos = info.matches
             end
         end

-        cases = Pair{Any, Any}[]
-        signature_union = Union{}
-        only_method = nothing # keep track of whether there is one matching method
-        too_many = false
-        local meth
-        local fully_covered = true
-        for i in 1:length(splits)
-            atype = splits[i]
-            info = infos[i]
-            if info === nothing
-                info = recompute_method_matches(atype, sv)
-            end
-            meth = info.applicable
-            if meth === false || info.ambig
-                # Too many applicable methods
-                # Or there is a (partial?) ambiguity
-                too_many = true
-                break
-            elseif length(meth) == 0
-                # No applicable methods; try next union split
-                continue
-            elseif length(meth) == 1 && only_method !== false
-                if only_method === nothing
-                    only_method = meth[1][3]
-                elseif only_method !== meth[1][3]
-                    only_method = false
-                end
-            else
-                only_method = false
-            end
-            for match in meth::Vector{Any}
-                (metharg, methsp, method) = (match[1]::Type, match[2]::SimpleVector, match[3]::Method)
-                # TODO: This could be better
-                signature_union = Union{signature_union, metharg}
-                if !isdispatchtuple(metharg)
-                    fully_covered = false
-                    continue
-                end
-                case_sig = Signature(sig.f, sig.ft, sig.atypes, metharg)
-                case = analyze_method!(idx, case_sig, metharg, methsp, method,
-                    stmt, sv, false, nothing, calltype)
-                if case === nothing
-                    fully_covered = false
-                    continue
-                elseif _any(p->p[1] === metharg, cases)
-                    continue
-                end
-                push!(cases, Pair{Any,Any}(metharg, case))
-            end
-        end
-
-        too_many && continue
-
-        signature_fully_covered = sig.atype <: signature_union
-        # If we're fully covered and there's only one applicable method,
-        # we inline, even if the signature is not a dispatch tuple
-        if signature_fully_covered && length(cases) == 0 && only_method isa Method
-            if length(splits) > 1
-                method = only_method
-                (metharg, methsp) = ccall(:jl_type_intersection_with_env, Any, (Any, Any),
-                    sig.atype, method.sig)::SimpleVector
-            else
-                @assert length(meth) == 1
-                (metharg, methsp, method) = (meth[1][1]::Type, meth[1][2]::SimpleVector, meth[1][3]::Method)
-            end
-            fully_covered = true
-            case = analyze_method!(idx, sig, metharg, methsp, method,
-                stmt, sv, false, nothing, calltype)
-            case === nothing && continue
-            push!(cases, Pair{Any,Any}(metharg, case))
-        end
-        if !signature_fully_covered
-            fully_covered = false
-        end
-
-        # If we only have one case and that case is fully covered, we may either
-        # be able to do the inlining now (for constant cases), or push it directly
-        # onto the todo list
-        if fully_covered && length(cases) == 1
-            handle_single_case!(ir, stmt, idx, cases[1][2], false, todo)
-            continue
-        end
-        length(cases) == 0 && continue
-        push!(todo, UnionSplit(idx, fully_covered, sig.atype, cases))
+        analyze_single_call!(ir, todo, idx, stmt, sig, calltype, infos, sv)
     end
     todo
 end
diff --git a/base/compiler/ssair/ir.jl b/base/compiler/ssair/ir.jl
index 5e34e20831c82..621072527a334 100644
--- a/base/compiler/ssair/ir.jl
+++ b/base/compiler/ssair/ir.jl
@@ -275,8 +275,11 @@ function getindex(x::IRCode, s::SSAValue)
 end

 function setindex!(x::IRCode, @nospecialize(repl), s::SSAValue)
-    @assert s.id <= length(x.stmts)
-    x.stmts[s.id][:inst] = repl
+    if s.id <= length(x.stmts)
+        x.stmts[s.id][:inst] = repl
+    else
+        x.new_nodes.stmts[s.id - length(x.stmts)][:inst] = repl
+    end
     return x
 end

@@ -1074,7 +1077,9 @@ function process_newnode!(compact::IncrementalCompact, new_idx::Int, new_node_en
         finish_current_bb!(compact, active_bb, old_result_idx)
     end
     (old_result_idx == result_idx) && return iterate(compact, (idx, active_bb))
-    return Pair{Int, Any}(old_result_idx, compact.result[old_result_idx][:inst]), (idx, active_bb)
+    return Pair{Pair{Int, Int}, Any}(
+        Pair{Int,Int}(new_idx,old_result_idx),
+        compact.result[old_result_idx][:inst]), (idx, active_bb)
 end

 struct CompactPeekIterator
@@ -1141,9 +1146,9 @@ function iterate(compact::IncrementalCompact, (idx, active_bb)::Tuple{Int, Int}=
         # Move to next block
         compact.idx += 1
         if finish_current_bb!(compact, active_bb, old_result_idx, true)
-            return iterate(compact, (compact.idx, active_bb + 1))
+            return iterate(compact, (compact.idx-1, active_bb + 1))
         else
-            return Pair{Int, Any}(old_result_idx, compact.result[old_result_idx][:inst]), (compact.idx, active_bb + 1)
+            return Pair{Pair{Int, Int}, Any}(Pair{Int,Int}(compact.idx-1, old_result_idx), compact.result[old_result_idx][:inst]), (compact.idx, active_bb + 1)
         end
     end
     if compact.new_nodes_idx <= length(compact.perm) &&
@@ -1180,7 +1185,8 @@ function iterate(compact::IncrementalCompact, (idx, active_bb)::Tuple{Int, Int}=
         @goto restart
     end
     @assert isassigned(compact.result.inst, old_result_idx)
-    return Pair{Int, Any}(old_result_idx, compact.result[old_result_idx][:inst]), (compact.idx, active_bb)
+    return Pair{Pair{Int,Int}, Any}(Pair{Int,Int}(compact.idx-1, old_result_idx),
+        compact.result[old_result_idx][:inst]), (compact.idx, active_bb)
 end

 function maybe_erase_unused!(extra_worklist, compact, idx, callback = x->nothing)
diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 4b444aa504715..a2dc6d6e75f60 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -531,7 +531,7 @@ function getfield_elim_pass!(ir::IRCode, domtree::DomTree)
     lifting_cache = IdDict{Pair{AnySSAValue, Any}, AnySSAValue}()
     revisit_worklist = Int[]
     #ndone, nmax = 0, 200
-    for (idx, stmt) in compact
+    for ((_, idx), stmt) in compact
         isa(stmt, Expr) || continue
         #ndone >= nmax && continue
         #ndone += 1
@@ -872,7 +872,7 @@ function adce_pass!(ir::IRCode)
     phi_uses = fill(0, length(ir.stmts) + length(ir.new_nodes))
     all_phis = Int[]
     compact = IncrementalCompact(ir)
-    for (idx, stmt) in compact
+    for ((_, idx), stmt) in compact
         if isa(stmt, PhiNode)
             push!(all_phis, idx)
         end
diff --git a/base/compiler/stmtinfo.jl b/base/compiler/stmtinfo.jl
index dda51817f76be..39952dff75c1b 100644
--- a/base/compiler/stmtinfo.jl
+++ b/base/compiler/stmtinfo.jl
@@ -1,19 +1,75 @@
+"""
+    struct MethodMatchInfo
+
+Captures the result of a `method_matches` lookup for the given call. This
+info may then be used by the optimizer to inline the matches, without having
+to re-consult the method table. This info is illegal on any statement that is
+not a call to a generic function.
+"""
 struct MethodMatchInfo
     applicable::Any
     ambig::Bool
 end

+"""
+    struct UnionSplitInfo
+
+If inference decides to partition the method search space by splitting unions,
+it will issue a method lookup query for each such partition. This info indicates
+that such partitioning happened and wraps the corresponding MethodMatchInfo for
+each partition. This info is illegal on any statement that is not a call to a
+generic function.
+"""
 struct UnionSplitInfo
-    # TODO: In principle we shouldn't have to store this, but could just
-    # recompute it using `switchtupleunion`. However, it is not the case
-    # that if T == S, then switchtupleunion(T) == switchtupleunion(S), e.g. for
-    # T = Tuple{Tuple{Union{Float64, Int64},String}}
-    # S = Tuple{Union{Tuple{Float64, String}, Tuple{Int64, String}}}
-    sigs::Vector{Any}
     matches::Vector{MethodMatchInfo}
 end

+"""
+    struct CallMeta
+
+A simple struct that captures both the return type and any additional `info`
+for a given generic call.
+"""
 struct CallMeta
     rt::Any
     info::Any
 end
+
+"""
+    struct AbstractIterationInfo
+
+Captures all the information for abstract iteration analysis of a single value.
+Each (abstract) call to `iterate` corresponds to one entry in `each`.
+"""
+struct AbstractIterationInfo
+    each::Vector{CallMeta}
+end
+
+"""
+    struct ApplyCallInfo
+
+This info applies to any call of _apply_iterate(...) and captures both the
+info of the actual call being applied and the info for any implicit call
+to the `iterate` function. Note that it is possible for the call itself
+to be yet another `_apply_iterate`, in which case the `.call` field will
+be another ApplyCallInfo. This info is illegal on any statement that is
+not an _apply_iterate call.
+"""
+struct ApplyCallInfo
+    # The info for the call itself
+    call::Any
+    # AbstractIterationInfo for each argument, if applicable
+    arginfo::Vector{Union{Nothing, AbstractIterationInfo}}
+end
+
+"""
+    struct UnionSplitApplyCallInfo
+
+Like `UnionSplitInfo`, but for `ApplyCallInfo` rather than MethodMatchInfo.
+This info is illegal on any statement that is not an _apply_iterate call.
+"""
+struct UnionSplitApplyCallInfo
+    infos::Vector{ApplyCallInfo}
+end
+
+
diff --git a/test/compiler/inline.jl b/test/compiler/inline.jl
index 4c3812ddc04e5..85ac8fd6a0f4c 100644
--- a/test/compiler/inline.jl
+++ b/test/compiler/inline.jl
@@ -294,3 +294,16 @@ f_inline_global_getindex() = _a_global_array[1]
 let ci = code_typed(f_inline_global_getindex, Tuple{})[1].first
     @test any(x->(isexpr(x, :call) && x.args[1] === GlobalRef(Base, :arrayref)), ci.code)
 end
+
+# Issue #29114 & #36087 - Inlining of non-tuple splats
+f_29115(x) = (x...,)
+@test @allocated(f_29115(1)) == 0
+@test @allocated(f_29115(1=>2)) == 0
+let ci = code_typed(f_29115, Tuple{Int64})[1].first
+    @test length(ci.code) == 2 && isexpr(ci.code[1], :call) &&
+        ci.code[1].args[1] === GlobalRef(Core, :tuple)
+end
+let ci = code_typed(f_29115, Tuple{Pair{Int64, Int64}})[1].first
+    @test length(ci.code) == 4 && isexpr(ci.code[1], :call) &&
+        ci.code[end-1].args[1] === GlobalRef(Core, :tuple)
+end
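
For readers following the inlining changes: when iteration info is available,
`rewrite_apply_exprargs!` materializes each `iterate` call that inference
recorded in `AbstractIterationInfo.each` and unpacks the value/state pairs
with `getfield`. At source level the transformation corresponds roughly to
the hand-written sketch below (illustrative only; `splat_two_by_hand` is a
hypothetical name, not part of this patch, and it assumes an iterable that
yields exactly two elements):

```
# Illustrative sketch: what `f(x) = (x...,)` effectively becomes for a
# two-element iterable once the recorded `iterate` calls are materialized.
# Each `Base.iterate` call below corresponds to one CallMeta entry in
# AbstractIterationInfo.each.
function splat_two_by_hand(x)
    y1 = Base.iterate(x)        # first recorded iterate call
    v1 = getfield(y1, 1)        # element value
    s1 = getfield(y1, 2)        # iterator state
    y2 = Base.iterate(x, s1)    # second recorded iterate call
    v2 = getfield(y2, 1)
    # the third iterate call is inferred to return `nothing`, ending the unroll
    return Core.tuple(v1, v2)
end
```

Once the emitted `iterate` calls are themselves inlined (the TODO mentioned
above), `splat_two_by_hand(1=>2)` reduces to the two `getfield`s plus `tuple`
shown in the `@code_typed` output at the top, which is what the new tests at
the end of this patch assert.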