unoptimized code generation with tuple arguments #6670

Closed
mlubin opened this issue Apr 27, 2014 · 5 comments
mlubin commented Apr 27, 2014

Consider:

function f1(values)
    s1 = values[1]
    s2 = values[2]
    s3 = 2*s2
    s4 = s1 + s3
    return s4
end


function f2(input)
    values = input[1]
    s1 = values[1]
    s2 = values[2]
    s3 = 2*s2
    s4 = s1 + s3
    return s4
end

Type inference is essentially the same for both:

julia> code_typed(f1, (Vector{Float64},))
1-element Array{Any,1}:
 :($(Expr(:lambda, {:values}, {{:s1,:s2,:s3,:s4},{{:values,Array{Float64,1},0},{:s1,Float64,18},{:s2,Float64,18},{:s3,Float64,18},{:s4,Float64,18}},{}}, :(begin  # none, line 2:
        s1 = top(arrayref)(values::Array{Float64,1},1)::Float64 # line 3:
        s2 = top(arrayref)(values::Array{Float64,1},2)::Float64 # line 4:
        s3 = top(box)(Float64,top(mul_float)(top(box)(Float64,top(sitofp)(Float64,2))::Float64,s2::Float64))::Float64 # line 5:
        s4 = top(box)(Float64,top(add_float)(s1::Float64,s3::Float64))::Float64 # line 6:
        return s4::Float64
    end::Float64))))

julia> code_typed(f2, ((Vector{Float64},),))
1-element Array{Any,1}:
 :($(Expr(:lambda, {:input}, {{:values,:s1,:s2,:s3,:s4},{{:input,(Array{Float64,1},),0},{:values,Array{Float64,1},18},{:s1,Float64,18},{:s2,Float64,18},{:s3,Float64,18},{:s4,Float64,18}},{}}, :(begin  # none, line 2:
        values = top(tupleref)(input::(Array{Float64,1},),1)::Array{Float64,1} # line 3:
        s1 = top(arrayref)(values::Array{Float64,1},1)::Float64 # line 4:
        s2 = top(arrayref)(values::Array{Float64,1},2)::Float64 # line 5:
        s3 = top(box)(Float64,top(mul_float)(top(box)(Float64,top(sitofp)(Float64,2))::Float64,s2::Float64))::Float64 # line 6:
        s4 = top(box)(Float64,top(add_float)(s1::Float64,s3::Float64))::Float64 # line 7:
        return s4::Float64
    end::Float64))))

but very different LLVM code is generated:

julia> code_llvm(f1, (Vector{Float64},))

define double @julia_f116865(%jl_value_t*) {
...
idxend2:                                          ; preds = %idxend
  %7 = getelementptr inbounds %jl_value_t* %0, i64 1, i32 0, !dbg !1047
  %8 = load %jl_value_t** %7, align 8, !dbg !1047, !tbaa %jtbaa_arrayptr
  %9 = bitcast %jl_value_t* %8 to double*, !dbg !1047
  %10 = load double* %9, align 8, !dbg !1047, !tbaa %jtbaa_user
  %11 = getelementptr %jl_value_t* %8, i64 1, !dbg !1053
  %12 = bitcast %jl_value_t* %11 to double*, !dbg !1053
  %13 = load double* %12, align 8, !dbg !1053, !tbaa %jtbaa_user
  %14 = fmul double %13, 2.000000e+00, !dbg !1056
  %15 = fadd double %10, %14, !dbg !1057
  ret double %15, !dbg !1058
}

julia> code_llvm(f2, ((Vector{Float64},),))

define %jl_value_t* @julia_f216867(%jl_value_t*, %jl_value_t**, i32) {
...
pass:                                             ; preds = %top
  %17 = getelementptr inbounds %jl_value_t* %12, i64 2, i32 0, !dbg !1054
  %18 = load %jl_value_t** %17, align 8, !dbg !1054
  store %jl_value_t* %18, %jl_value_t** %10, align 8, !dbg !1059
  store %jl_value_t* inttoptr (i64 23413456 to %jl_value_t*), %jl_value_t** %11, align 8, !dbg !1059
  %19 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 39256272 to %jl_value_t*), %jl_value_t** %10, i32 2), !dbg !1059
  store %jl_value_t* %19, %jl_value_t** %4, align 8, !dbg !1059
  store %jl_value_t* %18, %jl_value_t** %10, align 8, !dbg !1060
  store %jl_value_t* inttoptr (i64 23413488 to %jl_value_t*), %jl_value_t** %11, align 8, !dbg !1060
  %20 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 39256272 to %jl_value_t*), %jl_value_t** %10, i32 2), !dbg !1060
  store %jl_value_t* %20, %jl_value_t** %7, align 8, !dbg !1060
  store %jl_value_t* inttoptr (i64 23413488 to %jl_value_t*), %jl_value_t** %10, align 8, !dbg !1061
  store %jl_value_t* %20, %jl_value_t** %11, align 8, !dbg !1061
  %21 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 35192416 to %jl_value_t*), %jl_value_t** %10, i32 2), !dbg !1061
  store %jl_value_t* %21, %jl_value_t** %8, align 8, !dbg !1061
  store %jl_value_t* %19, %jl_value_t** %10, align 8, !dbg !1062
  store %jl_value_t* %21, %jl_value_t** %11, align 8, !dbg !1062
  %22 = call %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 40835744 to %jl_value_t*), %jl_value_t** %10, i32 2), !dbg !1062
  store %jl_value_t* %22, %jl_value_t** %9, align 8, !dbg !1062
  %23 = load %jl_value_t** %5, align 8, !dbg !1063
  %24 = getelementptr inbounds %jl_value_t* %23, i64 0, i32 0, !dbg !1063
  store %jl_value_t** %24, %jl_value_t*** @jl_pgcstack, align 8, !dbg !1063
  ret %jl_value_t* %22, !dbg !1063
}

Why does the second version generate unoptimized code?

@JeffBezanson
Member

Planning to address this. Generally, we don't specialize on all tuple types because there are just too many of them. Due to the code in inference.jl, the number is actually unbounded if we aren't careful.

@mlubin
Member Author

mlubin commented Apr 27, 2014

Ok, thanks; I got a nice speedup from restructuring the code to avoid this.
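
The issue doesn't show the actual restructuring, but one way to avoid passing a tuple into the hot function (a hypothetical sketch, not mlubin's actual change) is to unpack the tuple at the call boundary so the inner function, like f1 above, receives the array directly:

```julia
# Hypothetical wrapper: unpack the tuple once at the boundary, then
# call the array-taking f1 from the issue, which compiles to tight code.
f2_unpacked(input) = f1(input[1])
```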

@timholy
Member

timholy commented Apr 27, 2014

@mlubin, in case you can't wait, Jeff once mentioned a nice trick to me: write the signature as

function f2{T}(input::T)

That will force specialization. Whether this helps is specific to tuple inputs; in general, it's still true that in the vast majority of cases there's no performance advantage to declaring input types.
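
Applied to the f2 from the original report, the trick would read as follows (a sketch in the 0.3-era method type parameter syntax; untested here):

```julia
# The method type parameter T matches the concrete tuple type of `input`,
# which forces a specialized method instantiation for each tuple type
# instead of falling back to generic jl_apply_generic dispatch.
function f2{T}(input::T)
    values = input[1]
    s1 = values[1]
    s2 = values[2]
    s3 = 2*s2
    s4 = s1 + s3
    return s4
end
```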

@mlubin
Member Author

mlubin commented Apr 28, 2014

Feel free to close if this is subsumed by another issue.

@simonster
Member

I think the closest issue is #4090, but it's closed.
