From 95cfd62d0953395b9b9f37399a9e761cb44cee6e Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki <40514306+aviatesk@users.noreply.github.com>
Date: Sat, 8 Oct 2022 15:39:39 +0900
Subject: [PATCH] improve performance issue of `@nospecialize`-d keyword func
 call (#47059)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit tries to fix and improve performance for calling keyword
funcs whose argument types are not fully known but are `@nospecialize`-d.

The final result would look like (this particular example is taken from
our Julia-level compiler implementation):
```julia
abstract type CallInfo end
struct NoCallInfo <: CallInfo end
struct NewInstruction
    stmt::Any
    type::Any
    info::CallInfo
    line::Int32
    flag::UInt8
    function NewInstruction(@nospecialize(stmt), @nospecialize(type), @nospecialize(info::CallInfo),
                            line::Int32, flag::UInt8)
        return new(stmt, type, info, line, flag)
    end
end
@nospecialize
function NewInstruction(newinst::NewInstruction;
    stmt=newinst.stmt,
    type=newinst.type,
    info::CallInfo=newinst.info,
    line::Int32=newinst.line,
    flag::UInt8=newinst.flag)
    return NewInstruction(stmt, type, info, line, flag)
end
@specialize

using BenchmarkTools
struct VirtualKwargs
    stmt::Any
    type::Any
    info::CallInfo
end
vkws = VirtualKwargs(nothing, Any, NoCallInfo())
newinst = NewInstruction(nothing, Any, NoCallInfo(), zero(Int32), zero(UInt8))
runner(newinst, vkws) = NewInstruction(newinst; vkws.stmt, vkws.type, vkws.info)
@benchmark runner($newinst, $vkws)
```

> on master
```
BenchmarkTools.Trial: 10000 samples with 186 evaluations.
 Range (min … max):  559.898 ns …   4.173 μs  ┊ GC (min … max): 0.00% … 85.29%
 Time  (median):     605.608 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   638.170 ns ± 125.080 ns  ┊ GC (mean ± σ):  0.06% ±  0.85%

  █▇▂▆▄  ▁█▇▄▂                                                  ▂
  ██████▅██████▇▇▇██████▇▇▇▆▆▅▄▅▄▂▄▄▅▇▆▆▆▆▆▅▆▆▄▄▅▅▄▃▄▄▄▅▃▅▅▆▅▆▆ █
  560 ns        Histogram: log(frequency) by time       1.23 μs <

 Memory estimate: 32 bytes, allocs estimate: 2.
```

> on this commit
```julia
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.080 ns … 83.177 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.098 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.118 ns ±  0.885 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

       ▂▅▇█▆▅▄▂
  ▂▄▆▆▇████████▆▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▂▂▂ ▃
  3.08 ns        Histogram: frequency by time        3.19 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
```

So for this particular case it achieves a roughly 200x speedup.
This is because this commit allows inlining of the call to the keyword
sorter as well as removal of the `NamedTuple` call.

Specifically, this commit is composed of the following improvements:
- Add an early return case for `structdiff`:
  This change improves the return type inference for the case where the
  compared `NamedTuple`s are type unstable but there is no difference
  in their names, e.g. given two `NamedTuple{(:a,:b),T} where T<:Tuple{Any,Any}`s.
  In such a case the optimizer will remove the `structdiff` and succeeding
  `pairs` calls, letting the keyword sorter be inlined.
- Tweak the core `NamedTuple{names}(args::Tuple)` constructor so that it
  directly forms a `:splatnew` allocation rather than redirecting to the
  general `NamedTuple` constructor, which could get confused when given an
  abstract input tuple type.
- Improve `nfields_tfunc` accuracy for abstract `NamedTuple` types.
  This improvement lets `inline_splatnew` handle more abstract
  `NamedTuple`s, especially those whose names are fully known but whose
  field tuple type is abstract.

Those improvements are combined to allow our SROA pass to optimize away
`NamedTuple` and `tuple` calls generated for keyword argument handling.
E.g. the IR for the example `NewInstruction` constructor is now fairly
optimized, like:
```julia
julia> Base.code_ircode((NewInstruction,Any,Any,CallInfo)) do newinst, stmt, type, info
           NewInstruction(newinst; stmt, type, info)
       end |> only
2 1 ─ %1 = Base.getfield(_2, :line)::Int32                  │╻╷  Type##kw
  │   %2 = Base.getfield(_2, :flag)::UInt8                  ││┃   getproperty
  │   %3 = %new(Main.NewInstruction, _3, _4, _5, %1, %2)::NewInstruction
  └──      return %3                                        │
 => NewInstruction
```
---
 base/boot.jl                            |  3 +-
 base/compiler/abstractinterpretation.jl |  6 ++--
 base/compiler/ssair/passes.jl           | 12 ++++++-
 base/compiler/tfuncs.jl                 | 20 ++++++++++--
 base/namedtuple.jl                      | 11 +++++--
 test/compiler/inference.jl              | 12 +++++++
 test/compiler/inline.jl                 | 43 +++++++++++++++++++++++++
 test/compiler/irutils.jl                |  1 +
 8 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/base/boot.jl b/base/boot.jl
index 5f3b99df1c716..38d011a0a5d05 100644
--- a/base/boot.jl
+++ b/base/boot.jl
@@ -615,7 +615,8 @@ end
 
 NamedTuple() = NamedTuple{(),Tuple{}}(())
 
-NamedTuple{names}(args::Tuple) where {names} = NamedTuple{names,typeof(args)}(args)
+eval(Core, :(NamedTuple{names}(args::Tuple) where {names} =
+             $(Expr(:splatnew, :(NamedTuple{names,typeof(args)}), :args))))
 
 using .Intrinsics: sle_int, add_int
 
diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl
index a5b7deacf121c..d2298eeada0db 100644
--- a/base/compiler/abstractinterpretation.jl
+++ b/base/compiler/abstractinterpretation.jl
@@ -2109,16 +2109,16 @@ function abstract_eval_statement_expr(interp::AbstractInterpreter, e::Expr, vtyp
     elseif ehead === :splatnew
         t, isexact = instanceof_tfunc(abstract_eval_value(interp, e.args[1], vtypes, sv))
         nothrow = false # TODO: More precision
-        if length(e.args) == 2 && isconcretetype(t) && !ismutabletype(t)
+        if length(e.args) == 2 && isconcretedispatch(t) && !ismutabletype(t)
             at = abstract_eval_value(interp, e.args[2], vtypes, sv)
             n = fieldcount(t)
             if isa(at, Const) && isa(at.val, Tuple) && n == length(at.val::Tuple) &&
                 let t = t, at = at; _all(i->getfield(at.val::Tuple, i) isa fieldtype(t, i), 1:n); end
-                nothrow = isexact && isconcretedispatch(t)
+                nothrow = isexact
                 t = Const(ccall(:jl_new_structt, Any, (Any, Any), t, at.val))
             elseif isa(at, PartialStruct) && at ⊑ᵢ Tuple && n == length(at.fields::Vector{Any}) &&
                 let t = t, at = at; _all(i->(at.fields::Vector{Any})[i] ⊑ᵢ fieldtype(t, i), 1:n); end
-                nothrow = isexact && isconcretedispatch(t)
+                nothrow = isexact
                 t = PartialStruct(t, at.fields::Vector{Any})
             end
         end
diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 8ad72285fcaeb..3d7b54eb36149 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -401,6 +401,16 @@ function lift_leaves(compact::IncrementalCompact,
                 end
                 lift_arg!(compact, leaf, cache_key, def, 1+field, lifted_leaves)
                 continue
+            # NOTE we can enable this, but most `:splatnew` expressions are transformed into
+            #      `:new` expressions by the inliner
+            # elseif isexpr(def, :splatnew) && length(def.args) == 2 && isa(def.args[2], AnySSAValue)
+            #     tplssa = def.args[2]::AnySSAValue
+            #     tplexpr = compact[tplssa][:inst]
+            #     if is_known_call(tplexpr, tuple, compact) && 1 ≤ field < length(tplexpr.args)
+            #         lift_arg!(compact, tplssa, cache_key, tplexpr, 1+field, lifted_leaves)
+            #         continue
+            #     end
+            #     return nothing
             elseif is_getfield_captures(def, compact)
                 # Walk to new_opaque_closure
                 ocleaf = def.args[2]
@@ -469,7 +479,7 @@ function lift_arg!(
         end
     end
     lifted_leaves[cache_key] = LiftedValue(lifted)
-    nothing
+    return nothing
 end
 
 function walk_to_def(compact::IncrementalCompact, @nospecialize(leaf))
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
index 9c206270976b8..8044f9c4d784e 100644
--- a/base/compiler/tfuncs.jl
+++ b/base/compiler/tfuncs.jl
@@ -403,11 +403,19 @@ add_tfunc(Core.sizeof, 1, 1, sizeof_tfunc, 1)
 function nfields_tfunc(@nospecialize(x))
     isa(x, Const) && return Const(nfields(x.val))
     isa(x, Conditional) && return Const(0)
-    x = unwrap_unionall(widenconst(x))
+    xt = widenconst(x)
+    x = unwrap_unionall(xt)
     isconstType(x) && return Const(nfields(x.parameters[1]))
     if isa(x, DataType) && !isabstracttype(x)
-        if !(x.name === Tuple.name && isvatuple(x)) &&
-           !(x.name === _NAMEDTUPLE_NAME && !isconcretetype(x))
+        if x.name === Tuple.name
+            isvatuple(x) && return Int
+            return Const(length(x.types))
+        elseif x.name === _NAMEDTUPLE_NAME
+            length(x.parameters) == 2 || return Int
+            names = x.parameters[1]
+            isa(names, Tuple{Vararg{Symbol}}) || return nfields_tfunc(rewrap_unionall(x.parameters[2], xt))
+            return Const(length(names))
+        else
             return Const(isdefined(x, :types) ? length(x.types) : length(x.name.names))
         end
     end
@@ -1594,6 +1602,12 @@ function apply_type_tfunc(@nospecialize(headtypetype), @nospecialize args...)
     end
     if istuple
         return Type{<:appl}
+    elseif isa(appl, DataType) && appl.name === _NAMEDTUPLE_NAME && length(appl.parameters) == 2 &&
+           (appl.parameters[1] === () || appl.parameters[2] === Tuple{})
+        # if the first/second parameter of `NamedTuple` is known to be empty,
+        # the second/first parameter must also be empty (`Tuple{}` / `()` respectively),
+        # so refine the result here
+        return Const(NamedTuple{(),Tuple{}})
     end
     ans = Type{appl}
     for i = length(outervars):-1:1
diff --git a/base/namedtuple.jl b/base/namedtuple.jl
index 3e9f1272d588e..b53304de7d8cc 100644
--- a/base/namedtuple.jl
+++ b/base/namedtuple.jl
@@ -335,7 +335,7 @@ reverse(nt::NamedTuple) = NamedTuple{reverse(keys(nt))}(reverse(values(nt)))
 end
 
 """
-    structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn},Type{NamedTuple{bn}}}) where {an,bn}
+    structdiff(a::NamedTuple, b::Union{NamedTuple,Type{NamedTuple}})
 
 Construct a copy of named tuple `a`, except with fields that exist in `b` removed.
 `b` can be a named tuple, or a type of the form `NamedTuple{field_names}`.
@@ -343,14 +343,19 @@ Construct a copy of named tuple `a`, except with fields that exist in `b` remove
 function structdiff(a::NamedTuple{an}, b::Union{NamedTuple{bn}, Type{NamedTuple{bn}}}) where {an, bn}
     if @generated
         names = diff_names(an, bn)
+        isempty(names) && return (;) # just a fast pass
         idx = Int[ fieldindex(a, names[n]) for n in 1:length(names) ]
         types = Tuple{Any[ fieldtype(a, idx[n]) for n in 1:length(idx) ]...}
         vals = Any[ :(getfield(a, $(idx[n]))) for n in 1:length(idx) ]
-        :( NamedTuple{$names,$types}(($(vals...),)) )
+        return :( NamedTuple{$names,$types}(($(vals...),)) )
     else
         names = diff_names(an, bn)
+        # N.B. this early return is necessary to get better type stability,
+        # and it also lets us avoid the cost of constructing a
+        # potentially type-unstable closure passed to the `map` below
+        isempty(names) && return (;)
         types = Tuple{Any[ fieldtype(typeof(a), names[n]) for n in 1:length(names) ]...}
-        NamedTuple{names,types}(map(Fix1(getfield, a), names))
+        return NamedTuple{names,types}(map(n::Symbol->getfield(a, n), names))
     end
 end
 
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
index dcd57916589cf..e174fc1470eb8 100644
--- a/test/compiler/inference.jl
+++ b/test/compiler/inference.jl
@@ -1526,6 +1526,11 @@ end
 @test nfields_tfunc(Tuple{Int, Vararg{Int}}) === Int
 @test nfields_tfunc(Tuple{Int, Integer}) === Const(2)
 @test nfields_tfunc(Union{Tuple{Int, Float64}, Tuple{Int, Int}}) === Const(2)
+@test nfields_tfunc(@NamedTuple{a::Int,b::Integer}) === Const(2)
+@test nfields_tfunc(NamedTuple{(:a,:b),T} where T<:Tuple{Int,Integer}) === Const(2)
+@test nfields_tfunc(NamedTuple{(:a,:b)}) === Const(2)
+@test nfields_tfunc(NamedTuple{names,Tuple{Any,Any}} where names) === Const(2)
+@test nfields_tfunc(Union{NamedTuple{(:a,:b)},NamedTuple{(:c,:d)}}) === Const(2)
 
 using Core.Compiler: typeof_tfunc
 @test typeof_tfunc(Tuple{Vararg{Int}}) == Type{Tuple{Vararg{Int,N}}} where N
@@ -2336,6 +2341,13 @@ end
 # Equivalence of Const(T.instance) and T for singleton types
 @test Const(nothing) ⊑ Nothing && Nothing ⊑ Const(nothing)
 
+# `apply_type_tfunc` should always return an accurate result for the empty `NamedTuple` case
+import Core: Const
+import Core.Compiler: apply_type_tfunc
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple{}) === Const(typeof((;)))
+@test apply_type_tfunc(Const(NamedTuple), Const(()), Type{T} where T<:Tuple) === Const(typeof((;)))
+@test apply_type_tfunc(Const(NamedTuple), Tuple{Vararg{Symbol}}, Type{Tuple{}}) === Const(typeof((;)))
+
 # Don't pessimize apply_type to anything worse than Type and yield Bottom for invalid Unions
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union}}) == Type{Union{}}
 @test Core.Compiler.return_type(Core.apply_type, Tuple{Type{Union},Any}) == Type
diff --git a/test/compiler/inline.jl b/test/compiler/inline.jl
index 1f594f74a24f4..f582277ec06f7 100644
--- a/test/compiler/inline.jl
+++ b/test/compiler/inline.jl
@@ -1770,3 +1770,46 @@ f_ifelse_3(a, b) = Core.ifelse(a, true, b)
 @test fully_eliminated(f_ifelse_1, Tuple{Any, Any}; retval=Core.Argument(2))
 @test fully_eliminated(f_ifelse_2, Tuple{Any, Any}; retval=Core.Argument(3))
 @test !fully_eliminated(f_ifelse_3, Tuple{Any, Any})
+
+# inline_splatnew for abstract `NamedTuple`
+@eval construct_splatnew(T, fields) = $(Expr(:splatnew, :T, :fields))
+for tt = Any[(Int,Int), (Integer,Integer), (Any,Any)]
+    let src = code_typed1(tt) do a, b
+            construct_splatnew(NamedTuple{(:a,:b),typeof((a,b))}, (a,b))
+        end
+        @test count(issplatnew, src.code) == 0
+        @test count(isnew, src.code) == 1
+    end
+end
+
+# optimize away `NamedTuple`s used for handling `@nospecialize`d keyword arguments
+# https://github.com/JuliaLang/julia/pull/47059
+abstract type CallInfo end
+struct NewInstruction
+    stmt::Any
+    type::Any
+    info::CallInfo
+    line::Int32
+    flag::UInt8
+    function NewInstruction(@nospecialize(stmt), @nospecialize(type), @nospecialize(info::CallInfo),
+                            line::Int32, flag::UInt8)
+        return new(stmt, type, info, line, flag)
+    end
+end
+@nospecialize
+function NewInstruction(newinst::NewInstruction;
+    stmt=newinst.stmt,
+    type=newinst.type,
+    info::CallInfo=newinst.info,
+    line::Int32=newinst.line,
+    flag::UInt8=newinst.flag)
+    return NewInstruction(stmt, type, info, line, flag)
+end
+@specialize
+let src = code_typed1((NewInstruction,Any,Any,CallInfo)) do newinst, stmt, type, info
+        NewInstruction(newinst; stmt, type, info)
+    end
+    @test count(issplatnew, src.code) == 0
+    @test count(iscall((src,NamedTuple)), src.code) == 0
+    @test count(isnew, src.code) == 1
+end
diff --git a/test/compiler/irutils.jl b/test/compiler/irutils.jl
index 76f883d6cea2c..ef8fe3efbb315 100644
--- a/test/compiler/irutils.jl
+++ b/test/compiler/irutils.jl
@@ -8,6 +8,7 @@ get_code(args...; kwargs...) = code_typed1(args...; kwargs...).code
 
 # check if `x` is a statement with a given `head`
 isnew(@nospecialize x) = isexpr(x, :new)
+issplatnew(@nospecialize x) = isexpr(x, :splatnew)
 isreturn(@nospecialize x) = isa(x, ReturnNode)
 
 # check if `x` is a dynamic call of a given function