Code generation and cartesian iteration #9080

Closed
timholy opened this issue Nov 20, 2014 · 29 comments · Fixed by #37829
Labels: performance Must go faster

Comments

timholy commented Nov 20, 2014

Compared to manual iteration, the new cartesian iterator has a small performance penalty:

function sumcart_manual(A::AbstractMatrix)
    s = 0.0
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        s += A[i,j]
    end
    s
end

function sumcart_iter(A)
    s = 0.0
    @inbounds for I in CartesianIndices(size(A))
        s += A[I]
    end
    s
end

A = rand(10^4,10^4);
sumcart_manual(A);
sumcart_iter(A);

julia> @time sumcart_manual(A)
elapsed time: 0.176363505 seconds (96 bytes allocated)
4.999591778337127e7

julia> @time sumcart_iter(A)
elapsed time: 0.250890195 seconds (96 bytes allocated)
4.999591778337127e7

In this gist, I compare the code that gets generated. The comparison reveals that the CartesianIndex is not fully elided by the compiler.

Any ideas what's happening?

EDIT: the core problem is that next contains a branch, and testing done is a second branch, so there are two branches per iteration. See #16035 (comment)
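
For concreteness, here is roughly what the loop in sumcart_iter desugars to (a hand-written sketch, not the actual lowered code; under the old protocol the two branches live in next and done, under the modern protocol shown here they are the carry test inside iterate and the nothing check):

function sumcart_lowered(A::AbstractMatrix)
    s = 0.0
    iter = CartesianIndices(size(A))
    y = iterate(iter)
    while y !== nothing            # branch 1: did iterate signal termination?
        I, state = y
        @inbounds s += A[I]
        y = iterate(iter, state)   # branch 2 lives inside iterate: carry i1 -> i2?
    end
    s
end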

timholy commented Dec 10, 2014

Here's an example that doesn't rely on all the code-generation facilities in the iterators in Base:

module Iter

import Base: start, next, done, getindex

immutable Raw2
    i1::Int
    i2::Int
end
Raw2(dims::(Int,Int)) = Raw2(dims[1], dims[2])

start(sz::Raw2) = Raw2(1,1)
@inline next(sz::Raw2, state::Raw2) = state.i1 < sz.i1 ?
                                     (state, Raw2(state.i1+1,state.i2)) :
                                     (state, Raw2(1, state.i2+1))
done(sz::Raw2, state::Raw2) = state.i2 > sz.i2

getindex(A::AbstractArray, I::Raw2) = A[I.i1,I.i2]

end

function mysum(A)
    s = 0.0
    @inbounds for I in Iter.Raw2(size(A))
        s += A[I]
    end
    s
end

function mysum2(A)
    s = 0.0
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        s += A[i,j]
    end
    s
end

Then do

A = reshape(1:12, 3, 4)
@code_llvm mysum(A)
@code_llvm mysum2(A)

You should see that the code for mysum is not nearly as well-optimized as mysum2.

timholy commented Dec 12, 2014

@simonster, while we're on the topic of SubArrays and indexing code generation, if you have time I'd be grateful for any insights you have here.

Comparing mysum and mysum2 for A = rand(10^4, 10^4) reveals that there's a 40% speed gap, too. The critical section of @code_llvm for mysum is

L3:                                               ; preds = %L2, %if1
  %_var0.0 = phi [2 x %Raw2] [ %41, %L2 ], [ %37, %if1 ]
  %42 = extractvalue [2 x %Raw2] %_var0.0, 0, !dbg !796, !julia_type !803
  %43 = extractvalue [2 x %Raw2] %_var0.0, 1, !dbg !796, !julia_type !803
  %44 = extractvalue %Raw2 %42, 0, !dbg !804
  %45 = add i64 %44, -1, !dbg !804
  %46 = extractvalue %Raw2 %42, 1, !dbg !804
  %47 = add i64 %46, -1, !dbg !804
  %48 = mul i64 %47, %13, !dbg !804
  %49 = add i64 %45, %48, !dbg !804
  %50 = getelementptr double* %28, i64 %49, !dbg !804
  %51 = load double* %50, align 8, !dbg !804, !tbaa %jtbaa_user
  %52 = fadd double %s.0, %51, !dbg !804
  %53 = extractvalue %Raw2 %43, 1, !dbg !804
  %54 = icmp slt i64 %26, %53, !dbg !804
  br i1 %54, label %L6, label %L, !dbg !804

compared to

L2:                                               ; preds = %L2.preheader, %L2
  %s.1 = phi double [ %24, %L2 ], [ %s.0, %L2.preheader ]
  %"#s10.0" = phi i64 [ %20, %L2 ], [ 1, %L2.preheader ]
  %20 = add i64 %"#s10.0", 1, !dbg !798
  %21 = add i64 %19, %"#s10.0", !dbg !805
  %22 = getelementptr double* %10, i64 %21, !dbg !805
  %23 = load double* %22, align 8, !dbg !805, !tbaa %jtbaa_user
  %24 = fadd double %s.1, %23, !dbg !805
  %25 = icmp eq i64 %"#s10.0", %15, !dbg !805
  br i1 %25, label %L5, label %L2, !dbg !805

for mysum2.

Anything obvious to change to narrow the gap between these?

simonster commented:

This generates better code for me:

start(sz::Raw2) = Raw2(0,0)
@inline function next(sz::Raw2, state::Raw2)
    i1 = state.i1+1
    i2 = state.i2
    if i1 == sz.i1
        i1 = 0
        i2 = state.i2+1
    end
    return (state, Raw2(i1, i2))
end
done(sz::Raw2, state::Raw2) = state.i2 == sz.i2

getindex(A::AbstractArray, I::Raw2) = A[I.i1+1,I.i2+1]

This is ~5% slower on my system instead of ~20%. But both the original code and this change are still suboptimal because the inner loop has a branch:

L:                                                ; preds = %L.preheader, %L2
  %s.0 = phi double [ %24, %L2 ], [ 0.000000e+00, %L.preheader ]
  %"#s2307.0" = phi %Raw2 [ %19, %L2 ], [ zeroinitializer, %L.preheader ]
  %9 = ptrtoint %jl_value_t* %8 to i64
  %10 = extractvalue %Raw2 %"#s2307.0", 0, !dbg !1614
  %11 = add i64 %10, 1, !dbg !1614
  %12 = extractvalue %Raw2 %"#s2307.0", 1, !dbg !1614
  %13 = icmp eq i64 %11, %9, !dbg !1614
  br i1 %13, label %if1, label %L2, !dbg !1614

if1:                                              ; preds = %L
  %14 = add i64 %12, 1, !dbg !1614
  br label %L2, !dbg !1614

L2:                                               ; preds = %L, %if1
  %_var1.0 = phi i64 [ %11, %L ], [ 0, %if1 ]
  %_var2.0 = phi i64 [ %12, %L ], [ %14, %if1 ]
  %15 = ptrtoint %jl_value_t* %2 to i64
  %16 = ptrtoint %jl_value_t* %8 to i64
  %17 = bitcast i8* %6 to double*
  %18 = insertvalue %Raw2 undef, i64 %_var1.0, 0, !dbg !1614
  %19 = insertvalue %Raw2 %18, i64 %_var2.0, 1, !dbg !1614, !julia_type !1621
  %20 = mul i64 %12, %16, !dbg !1622
  %21 = add i64 %20, %10, !dbg !1622
  %22 = getelementptr double* %17, i64 %21, !dbg !1622
  %23 = load double* %22, align 8, !dbg !1622, !tbaa %jtbaa_user
  %24 = fadd double %s.0, %23, !dbg !1622
  %25 = icmp eq i64 %_var2.0, %15, !dbg !1622
  br i1 %25, label %L5, label %L, !dbg !1622

This is pretty much what you'd expect given the iteration protocol. In Julia, it would look something like:

@label start
newi1 = i1 + 1
if newi1 == sz1
    newi1 = 0
    newi2 += 1
else
    newi2 = i2
end
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi2 == sz2 && @goto done
i1 = newi1
i2 = newi2
@goto start
@label done

(This generates exactly the same assembly for the inner loop). Rearranging this a little allows LLVM to optimize it to match the performance of mysum2:

@label start
ind = i2*sz1+i1
@inbounds s += A[ind+1]
newi1 = i1 + 1
if newi1 == sz1
    newi1 = 0
    newi2 += 1
    newi2 == sz2 && @goto done
else
    newi2 = i2
end
i1 = newi1
i2 = newi2
@goto start
@label done

But I can't see how to express this in the current iteration framework.

edit: Ref iteration protocol issues #6125, #8149, #9182

StefanKarpinski commented:

Worth keeping in mind wrt #8149 – if we could make it easier to express internal iteration and get a performance bump at the same time, that would be the best.

timholy commented Dec 13, 2014

Thanks a million, @simonster. The impact of these "stylistic" differences is quite surprising to me---I've learned to lean on LLVM to do the right thing, but this is a good counterexample.

I'll look into adopting your version for our code generation for next.

timholy commented Dec 13, 2014

Hmm, on my system, my original version is faster than your modification. This is weird.

simonster commented:

I tried on another system, which I probably should have done earlier. On my Sandy Bridge system on Linux, mysum is 37% slower than mysum2 with your original version and 35% slower than mysum2 with my modifications. On my Haswell system on OS X, mysum is 22% slower than mysum2 with your original version and 7% slower than mysum2 with my modifications. Both systems are running LLVM 3.3 and the generated assembly is identical, so the performance difference seems to be in the architecture. I'm just benchmarking with:

function bench(A)
    t1 = 0.0
    t2 = 0.0
    for i = 1:50
        t1 += @elapsed mysum(A)
        t2 += @elapsed mysum2(A)
    end
    t1/t2
end

timholy commented Dec 14, 2014

From reading a bit more, it seems that the cost of branches is a big difference among architectures (mine's a Sandy Bridge).

Just for fun I tried #9182, but by itself it didn't change the performance. I wondered whether it might fare better if the nextstate were computed at the end of the loop, rather than right after nextval---then you wouldn't essentially need two state variables (old and new).

To test this, I followed your lead and wrote out some explicit versions using while, and got pretty interesting results:

function mysum_while1(A)
    s = 0.0
    i = j = 1
    inew = jnew = 1
    while j <= size(A,2)
        if inew < size(A,1)
            inew += 1
        else
            inew = 1
            jnew += 1
        end
        @inbounds s += A[i,j]
        i = inew
        j = jnew
    end
    s
end

function mysum_while2(A)
    s = 0.0
    i = j = 1
    while j <= size(A,2)
        @inbounds s += A[i,j]
        if i < size(A,1)
            i += 1
        else
            i = 1
            j += 1
        end
    end
    s
end

function mysum_while3(A)
    s = 0.0
    j = 1
    while j <= size(A,2)
        i = 1
        while i <= size(A,1)
            @inbounds s += A[i,j]
            i += 1
        end
        j += 1
    end
    s
end

function mysum_while4(A)
    s = 0.0
    i = j = 1
    inew = jnew = 1
    notdone = true
    while notdone
        inew += 1
        if inew > size(A, 1)
            inew = 1
            jnew += 1
            notdone = jnew <= size(A,2)
        end
        @inbounds s += A[i,j]
        i = inew
        j = jnew
    end
    s
end


function mysum_while5(A)
    s = 0.0
    i = j = 1
    notdone = true
    while notdone
        @inbounds s += A[i,j]
        i += 1
        if i > size(A, 1)
            i = 1
            j += 1
            notdone = j <= size(A,2)
        end
    end
    s
end

function mysum_while6(A)
    s = 0.0
    i = j = 1
    while j <= size(A,2)
        @inbounds s += A[i,j]
        i += 1
        if i > size(A, 1)
            i = 1
            j += 1
        end
    end
    s
end

Timing results after warmup:

julia> @time mysum_while1(A)
elapsed time: 0.258024361 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while2(A)
elapsed time: 0.216438899 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while3(A)
elapsed time: 0.157289985 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while4(A)
elapsed time: 0.227041972 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while5(A)
elapsed time: 0.140768946 seconds (96 bytes allocated)
4.9997446791483015e7

julia> @time mysum_while6(A)
elapsed time: 0.190532756 seconds (96 bytes allocated)
4.9997446791483015e7

So, at least on my machine

  • the placement of the increment matters (end of the loop is best)
  • using a boolean variable for while testing seems to help

Interestingly, the iterators in multidimensional already use the boolean version, so it might just be a case of rewriting the way next works. But first, can you check on your system that mysum_while5 is among the best performers?

timholy commented Dec 14, 2014

One more:

function mysum_while7(A)
    s = 0.0
    i = j = 1
    notdone = true
    while notdone
        @inbounds s += A[i,j]
        if i < size(A,1)
            i += 1
        else
            i = 1
            j += 1
            notdone = j <= size(A,2)
        end
    end
    s
end

is just as fast as the fastest ones above. So it really seems to come down to (1) putting the increment at the end, and (2) using a boolean for the while testing (and updating that boolean at the deepest level of the increment testing).
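
To sketch what that might look like for the Raw2 iterator above (hypothetical; Raw2State is a name introduced here, and this sticks with the old start/next/done protocol):

immutable Raw2State
    i1::Int
    i2::Int
    isdone::Bool
end

start(sz::Raw2) = Raw2State(1, 1, sz.i1 < 1 || sz.i2 < 1)
done(sz::Raw2, state::Raw2State) = state.isdone
@inline function next(sz::Raw2, state::Raw2State)
    # return the current position as the value, then increment, updating the
    # done flag only at the deepest level of the carry (as in mysum_while7)
    if state.i1 < sz.i1
        newstate = Raw2State(state.i1+1, state.i2, false)
    else
        newstate = Raw2State(1, state.i2+1, state.i2+1 > sz.i2)
    end
    return (Raw2(state.i1, state.i2), newstate)
end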

mbauman commented Apr 18, 2015

The test case in Tim's StackOverflow answer is much improved with the Tuple overhaul. I believe enumerate and zip were slow previously due to nested tuples; that no longer seems to be the case after the tupocolypse! 🎆

Unfortunately, the main test case in this issue (for Cartesian indexing) isn't affected.

mbauman commented Jul 28, 2015

Looks like #12090 also solved this guy!

julia> @time sumcart_manual(A); @time sumcart_iter(A); # Edit: cheating with eachindex
 142.564 milliseconds (5 allocations: 176 bytes)
 142.117 milliseconds (5 allocations: 176 bytes)

And the @code_llvm is actually simpler for sumcart_iter since it's only one loop: gist with llvm output.

mbauman closed this as completed Jul 28, 2015
mbauman commented Jul 28, 2015

The Cartesian objects aren't elided for LinearSlow arrays, but I can't measure a performance difference with sparse arrays:

julia> A = sprand(10^3,10^3,.1);

julia> @time sumcart_manual(A); @time sumcart_iter(A)
  35.287 milliseconds (5 allocations: 176 bytes)
  35.934 milliseconds (5 allocations: 176 bytes)
50221.35105393453

timholy commented Jul 28, 2015

Not true for me. I updated the code sample at the top; the change in eachindex to use linear indexing, when applicable, might be confusing?

timholy reopened this Jul 28, 2015
mbauman commented Jul 28, 2015

Duh. That's the second time I've done that, isn't it?

timholy commented Jul 28, 2015

I'm not smart enough to keep track. And really, given how much you've improved Array-type operations, why would I want to??

mbauman added a commit to mbauman/julia that referenced this issue Apr 5, 2016
Or, alternatively: "Look ma! No CartesianRanges!"

This dramatically simplifies the generated code for iteration over CartesianRanges -- in fact, no references to CartesianRange appear in the LLVM IR with this commit. While it does simplify the code in JuliaLang#9080, it does not solve the performance problem there (I see no difference).  It does, however, speed up `copy(::SubArray)` by 1.3 - 1.6x:

```jl
julia> A = sub(reshape(1:5^3,5,5,5), 1:2:5, :, 1:2:5);

julia> @benchmark copy!(similar(A), A) # current master
================ Benchmark Results ========================
     Time per evaluation: 232.69 ns [227.97 ns, 237.42 ns]
Proportion of time in GC: 0.00% [0.00%, 0.00%]
        Memory allocated: 0.00 bytes
   Number of allocations: 0 allocations
       Number of samples: 4301
   Number of evaluations: 120601
         R² of OLS model: 0.953
 Time spent benchmarking: 5.53 s

julia> @benchmark copy!(similar(A), A) # this PR
================ Benchmark Results ========================
     Time per evaluation: 168.91 ns [165.67 ns, 172.14 ns]
Proportion of time in GC: 0.00% [0.00%, 0.00%]
        Memory allocated: 0.00 bytes
   Number of allocations: 0 allocations
       Number of samples: 4601
   Number of evaluations: 160601
         R² of OLS model: 0.955
 Time spent benchmarking: 5.33 s
```
KristofferC commented:

Seems a bit worse than #9080 (comment) now.

julia> @btime sumcart_manual(A)
  100.859 ms (1 allocation: 16 bytes)
5.000302140597446e7

julia> @btime sumcart_iter(A)
  204.743 ms (1 allocation: 16 bytes)
5.000302140597446e7

tkf commented Mar 6, 2020

I've already commented on this in #15648 (comment), but I don't think an iterator is a good abstraction to handle this. I think an approach based on foreach/foldl/reduce would be much better.

Here is a minimal implementation of foreach to handle CartesianIndices-like iteration:

@inline foreach′(f, I::CartesianIndices) =
    foreach_product((args...) -> f(CartesianIndex(reverse(args)...)), reverse(I.indices)...)

@inline function foreach_product(f::F, itr, itrs...) where F
    @inbounds for x in itr
        @inline g(args...) = f(x, args...)
        foreach_product(g, itrs...)
    end
end

@inline function foreach_product(f::F, itr) where F
    @inbounds for x in itr
        f(x)
    end
end

(Note: the order of axes in foreach_product is reversed compared to Iterators.product.)
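
As a quick sanity check (output assuming only the definitions above), the indices still come out in the usual column-major order despite the reversal:

julia> foreach′(println, CartesianIndices((2, 2)))
CartesianIndex(1, 1)
CartesianIndex(2, 1)
CartesianIndex(1, 2)
CartesianIndex(2, 2)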

Benchmark code:

using BenchmarkTools

function copyto_manual!(A, B)
    @assert axes(A) == axes(B)
    @inbounds for j = 1:size(A,2), i = 1:size(A,1)
        A[i, j] = B[i, j]
    end
end

function copyto_foreach!(A, B)
    foreach′(eachindex(A, B)) do I
        @inbounds A[I] = B[I]
    end
end

function copyto_iterate!(A, B)
    @inbounds for I in eachindex(A, B)
        A[I] = B[I]
    end
end

B = rand(10^2, 10^2);
A = similar(B');

Results:

julia> @btime copyto_manual!($A, $B');
  6.401 μs (1 allocation: 16 bytes)

julia> @btime copyto_foreach!($A, $B');
  6.243 μs (1 allocation: 16 bytes)

julia> @btime copyto_iterate!($A, $B');
  16.035 μs (1 allocation: 16 bytes)

As you can see, the performance of foreach-based code is equivalent to the manual loop and much faster than the iterate-based code.

For implementing summation as in the OP, we need foldl and/or reduce. It is actually a better strategy to implement foreach in terms of foldl to keep things DRY, but I chose foreach as the MWE because I thought it might be simpler to understand.
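
For reference, a minimal foldl counterpart of foreach′ (a sketch following the same structure; foldl′ and foldl_product are hypothetical names), which is enough to express the OP's sum:

@inline foldl′(op, acc, I::CartesianIndices) =
    foldl_product((acc, args...) -> op(acc, CartesianIndex(reverse(args)...)),
                  acc, reverse(I.indices)...)

@inline function foldl_product(op::F, acc, itr, itrs...) where F
    @inbounds for x in itr
        @inline g(acc, args...) = op(acc, x, args...)
        acc = foldl_product(g, acc, itrs...)
    end
    return acc
end

@inline function foldl_product(op::F, acc, itr) where F
    @inbounds for x in itr
        acc = op(acc, x)
    end
    return acc
end

# the OP's sum, expressed against this hypothetical API:
sumcart_foldl(A) = foldl′((s, I) -> (@inbounds s + A[I]), 0.0, CartesianIndices(A))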

timholy commented Mar 6, 2020

Love the implementation!

> But I don't think an iterator is a good abstraction to handle this. I think an approach based on foreach/foldl/reduce would be much better.

I am excited about the transducer approach and need to learn & use it more, but I don't think iterators are "wrong." A lot of the virtue of CartesianIndex iteration is being able to perform manipulations on the indexes for stencil operations. Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?

tkf commented Mar 7, 2020

I was trying to bring up the difference between foldl and the "iterator", or more specifically the iterate function, as the implementation of, well, iteration (maybe that's confusing). In particular, I have no criticism of the concrete interfaces like eachindex, CartesianIndices, etc. They are all fantastic! What I wanted to point out was that (1) implementing a specific foldl for them (i.e., bypassing iterate) can sometimes yield a huge speedup, and (2) "functional" entry points like foreach become more favorable once you have that.

Iterators have been the way to provide customizable iteration in many languages, so I can imagine my point may seem very fuzzy. But iterate and foldl are actually equally powerful formulations of customizable iteration (or rather, foldl can be extended to be so). It is conceivable to have a programming language where the for loop syntax is lowered to foldl rather than iterate. Of course, I'm not suggesting we do this, as it would be far too disruptive. Furthermore, foldl and iterate are good at different things: foldl is good at consuming collections and iterate is good at constructing them. So having both of them may ultimately be a good idea. I think it's like any other Julia function; you add a performance specialization on top of a generic fallback when it matters.

> Is it easy to write all the operations in https://julialang.org/blog/2016/02/iteration/ in terms of transducers?

(BTW, just a bit of a terminology nitpick (sorry!). You may be using "transducers" as a broad term for the functional stuff I do, but what I'm presenting here has nothing to do with transducers per se; it's about foldl. Transducers are tools for using foldl, just as Base.Iterators is a toolkit for using for loops.)

Transforming a for loop into foldl or foreach is straightforward. If there is no state (as in copyto!), you can just replace for x in xs with foreach(xs) do x. When you do have state, you can put it in the accumulator of foldl:

function extrema_loop(xs)
    small = typemax(eltype(xs))
    large = typemin(eltype(xs))
    for x in xs
        small = min(small, x)
        large = max(large, x)
    end
    return (small, large)
end

becomes

function extrema_foldl(xs)
    small = typemax(eltype(xs))
    large = typemin(eltype(xs))
    foldl(xs, init=(small, large)) do (small, large), x
        small = min(small, x)
        large = max(large, x)
        (small, large)
    end
end
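
A quick equivalence check (assuming the two definitions above):

julia> extrema_foldl(1:10) == extrema_loop(1:10) == (1, 10)
true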

But I think the more important question is how to expose the algorithms in the blog post as reusable APIs and how to optimize them. Maybe a generalized version of boxcar3 is a good example. You can define foreach_boxcar(f, A::AbstractArray, ::Val{width}) such that boxcar3 can be written as

using Statistics: mean

function boxcar3(A::AbstractArray)
    out = similar(A)
    foreach_boxcar(A, Val(3)) do I, x
        out[I] = mean(x)
    end
    return out
end

This API makes writing other variants (say, using median instead of mean) very easy.
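
For instance (a hypothetical variant along the same lines):

using Statistics: median

function boxcar3_median(A::AbstractArray)
    out = similar(A)
    foreach_boxcar(A, Val(3)) do I, x
        out[I] = median(x)
    end
    return out
end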

An implementation of foreach_boxcar looks something like this:

using StaticArrays

_valof(::Val{x}) where {x} = x

function foreach_boxcar(f, A::AbstractArray{<:Any, N}, width::Val = Val(3)) where {N}
    # For simplicity:
    @assert isodd(_valof(width))
    @assert all(s -> s >= _valof(width), size(A))

    R = CartesianIndices(A)
    I1 = oneunit(first(R))

    # Phase 1: Loop over the "main" part (w/o edges).
    main = ntuple(N) do d
        k = _valof(width) ÷ 2
        k+1:size(A, d)-k
    end
    # Pass a static array to `f`, assuming that `width` is small:
    satype = SArray{Tuple{ntuple(_ -> _valof(width), N)...}, eltype(A)}
    for I in R[main...]
        f(I, satype(view(A, I-I1:I+I1)))
    end
    # Note: We can use `foreach(R[main...]) do I` for better
    # performance.

    # Phase 2: Loop over the edges.
    Ifirst, Ilast = first(R), last(R)
    for kinds in Iterators.product(ntuple(_ -> (-1, 0, 1), N)...)
        all(iszero, kinds) && continue
        ranges = ntuple(N) do d
            k = _valof(width) ÷ 2
            if kinds[d] == -1  # left edge
                1:k
            elseif kinds[d] == 1  # right edge
                size(A, d)-k+1:size(A, d)
            else  # main
                k+1:size(A, d)-k
            end
        end
        for I in R[ranges...]
            J = max(Ifirst, I-I1):min(Ilast, I+I1)
            # Pass a view, as the shapes are heterogeneous:
            f(I, view(A, J))
        end
    end
end

I did a bit of optimization that uses SArray while looping over the "main" part. This is to demonstrate that, unlike with iterate, the functional approach lets you have heterogeneous "element types" (where an element is a pair (I, block)) without introducing type instability. This is because you can call f in multiple code locations.

The important point is that it's the implementer of foreach who controls the loop. When you expose this with iterate it's the other way around; i.e., the caller controls the loop. You can do a lot of optimization if you control the loop. In foreach_boxcar, I can pass an SArray without type instability because I can split the loop into phases. In foreach_product, I can "lower" Cartesian iteration to nested for loops because I control the loop. Furthermore, this pattern is composable; I could use foreach_product from foreach_boxcar.

timholy commented Mar 7, 2020

No, I understand that inversion of control can be very useful. But notice that in the key portion of your code here you're using iteration, not foldl. That was my only point. Some things are easier to write using iteration, so it's not a "bad abstraction" (you're using it yourself).

tkf commented Mar 7, 2020

for loop syntax is just sugar. I "ended up" using iterate because Julia's for loop happens to be lowered to it. It has nothing to do with iterate being "good" or "bad." As I said, it is possible to have a language where the for loop is lowered to foldl. In such a language, you'd write foldl for basic concepts like UnitRange using, e.g., a while loop. You don't need iterate at all in such a language. It's a trivial statement, but if for loops were lowered to foldl in Julia, my code wouldn't use iterate (although then there would be no need to use foreach or foldl directly like this). Another way to put it: I could, in principle, write foreach_boxcar using a while loop plus some abstractions built on top of it.

chriselrod commented:

Interestingly, the performance regression seems to go away when adding @simd to the Cartesian examples, even when it should be irrelevant, as in the copyto! case:

julia> @btime copyto_iterate!($A, $B');
  13.590 μs (1 allocation: 16 bytes)

julia> @btime copyto_manual!($A, $B');
  3.086 μs (1 allocation: 16 bytes)

julia> @btime copyto_foreach!($A, $B');
  3.348 μs (3 allocations: 80 bytes)

julia> function copyto_iterate!(A, B)
       @inbounds @simd for I in eachindex(A, B)
       A[I] = B[I]
       end
       end
copyto_iterate! (generic function with 1 method)

julia> @btime copyto_iterate!($A, $B');
  3.012 μs (1 allocation: 16 bytes)

Or, the example from the opening post:

julia> A = rand(256,20);

julia> function sumcart_manual(A::AbstractMatrix)
           s = 0.0
           @inbounds for j = 1:size(A,2), i = 1:size(A,1)
               s += A[i,j]
           end
           s
       end
sumcart_manual (generic function with 1 method)

julia> function sumcart_iter(A)
           s = 0.0
           @inbounds for I in CartesianIndices(size(A))
               s += A[I]
           end
           s
       end
sumcart_iter (generic function with 1 method)

julia> @btime sumcart_manual($A)
  4.432 μs (0 allocations: 0 bytes)
2556.470985732226

julia> @btime sumcart_iter($A)
  4.640 μs (0 allocations: 0 bytes)
2556.470985732226

julia> function sumcart_manual(A::AbstractMatrix)
           s = 0.0
           @inbounds for j = 1:size(A,2); @simd for i = 1:size(A,1)
               s += A[i,j]
           end; end
           s
       end
sumcart_manual (generic function with 1 method)

julia> function sumcart_iter(A)
           s = 0.0
           @inbounds @simd for I in CartesianIndices(size(A))
               s += A[I]
           end
           s
       end
sumcart_iter (generic function with 1 method)

julia> @btime sumcart_manual($A)
  333.601 ns (0 allocations: 0 bytes)
2556.470985732225

julia> @btime sumcart_iter($A)
  332.655 ns (0 allocations: 0 bytes)
2556.470985732225

timholy commented Mar 9, 2020

Because of these methods, @simd converts the loop into the equivalent of

R = CartesianIndices(A)
for Itail in CartesianIndices(Base.tail(R.indices)), i in R.indices[1]
    s += A[i, Itail]
end

By splitting out the first loop you mitigate but don't actually fix the problem. You should still be able to detect it if, for example, A is 3-dimensional and the first dimension is very short (e.g., size 1 or 2). Note that the magnitude of the effect is very processor-dependent.
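
One way to set up that check (a sketch; the sizes are just an illustration and I haven't benchmarked these exact numbers): with a first dimension of length 2, the @simd rewrite only peels a 2-element inner loop, so any remaining carry cost in the tail iteration should show up against a manual triple loop:

A3 = rand(2, 10^3, 10^3);

function sumcart_iter3(A)
    s = 0.0
    @inbounds @simd for I in CartesianIndices(A)
        s += A[I]
    end
    s
end

function sumcart_manual3(A)
    s = 0.0
    @inbounds for k = 1:size(A,3), j = 1:size(A,2)
        @simd for i = 1:size(A,1)
            s += A[i,j,k]
        end
    end
    s
end

# compare @btime sumcart_iter3($A3) against @btime sumcart_manual3($A3)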

Last time I investigated, the real problem with performance in the non-@simd version is that there are a minimum of two branches per iteration: one to test whether we need to perform a "carry" on the iterator, and another to decide whether iterate returned nothing and the loop should terminate (if you have to perform multiple carries it's even worse). Theoretically the optimizer should be able to realize that iterate won't return nothing if it doesn't carry, and then reorder the operations, but that doesn't seem to happen. As @tkf says, that optimization happens almost automatically for foldl because by calling front you can do that peeling one iterator at a time (#35036), so you never even need to implement a carry operation.

But we have to fix this anyway, and I don't think it will be hard. I was hoping it would arise from a more general optimization, but perhaps it's time to add a compiler pass that detects calls to iterate(::CartesianIndices{N}) and rewrites the following code block as a set of N nested loops. I should just sit down and do it; it might be an interesting experiment in the context of evaluating deeper integration with LoopVectorization.
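
For the 2-d sum in the OP, the loop structure such a pass would aim for is essentially (a hand-written sketch, not actual pass output):

function sumcart_rewritten(A::AbstractMatrix)
    s = 0.0
    R = CartesianIndices(A)
    @inbounds for j in R.indices[2], i in R.indices[1]
        s += A[CartesianIndex(i, j)]
    end
    s
end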

tkf commented Mar 10, 2020

If it is easy to fix, I agree it's a good idea to do it, since it would make existing code faster. I had assumed it was a hard problem, since this issue has been open for so long.

But I think it's still fair to say it becomes almost impossible to define efficient iterate when the collections are "complex enough" (e.g., sparse arrays, B-trees). From this point of view, it's a good idea to stop recommending iterate when writing generic functions. Though I suppose this discussion has to happen elsewhere to keep the scope of this issue simple.
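
To make the sparse-array point concrete, here is a sketch of a specialized foreach over the stored entries of a SparseMatrixCSC (foreach_stored is a hypothetical name); an equivalent iterate would have to thread the (column, entry) position through an explicit state, with the same carry branch discussed above:

using SparseArrays

function foreach_stored(f, A::SparseMatrixCSC)
    rows = rowvals(A)
    vals = nonzeros(A)
    for j in 1:size(A, 2)          # walk columns of the CSC structure
        for k in nzrange(A, j)     # stored entries within column j
            f(rows[k], j, vals[k])
        end
    end
end

# e.g., summing the stored values:
function sum_stored(A::SparseMatrixCSC)
    s = Ref(0.0)
    foreach_stored((i, j, v) -> (s[] += v), A)
    return s[]
end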

tkf commented Mar 10, 2020

I was curious how easy it would be, so I had a quick go at it. This improves performance, although it doesn't reach the performance of the nested loops:

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..e3306c1033 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -345,7 +345,7 @@ module IteratorsMD
     end
     @inline function iterate(iter::CartesianIndices, state)
         valid, I = __inc(state.I, first(iter).I, last(iter).I)
-        valid || return nothing
+        valid === Val(true) || return nothing
         return CartesianIndex(I...), CartesianIndex(I...)
     end

@@ -356,15 +356,18 @@ module IteratorsMD
     end

     # increment post check to avoid integer overflow
-    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = Val(false), ()
     @inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
-        valid = state[1] < stop[1]
-        return valid, (state[1]+1,)
+        if state[1] < stop[1]
+            return Val(true), (state[1]+1,)
+        else
+            return Val(false), (state[1]+1,)
+        end
     end

     @inline function __inc(state, start, stop)
         if state[1] < stop[1]
-            return true, (state[1]+1, tail(state)...)
+            return Val(true), (state[1]+1, tail(state)...)
         end
         valid, I = __inc(tail(state), tail(start), tail(stop))
         return valid, (start[1], I...)

Before:

julia> @btime sumcart_manual($A)
  129.077 ms (0 allocations: 0 bytes)
5.000437863394746e7

julia> @btime sumcart_iter($A)
  145.541 ms (0 allocations: 0 bytes)
5.000437863394746e7

After:

julia> @btime sumcart_iter($A)
  135.652 ms (29490 allocations: 617.03 KiB)
5.000437863394746e7

I'm not sure why it allocates a lot, though. The output of @code_typed optimize=true sumcart_iter(A) looks clean to me.

tkf commented Mar 10, 2020

This gets us very close to the performance of the manual nested loop:

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 66a22b7005..209c3707b8 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -344,30 +344,34 @@ module IteratorsMD
         iterfirst, iterfirst
     end
     @inline function iterate(iter::CartesianIndices, state)
-        valid, I = __inc(state.I, first(iter).I, last(iter).I)
-        valid || return nothing
+        I = __inc(state.I, first(iter).I, last(iter).I)
+        I === nothing && return nothing
         return CartesianIndex(I...), CartesianIndex(I...)
     end

     # increment & carry
     @inline function inc(state, start, stop)
-        _, I = __inc(state, start, stop)
+        I = __inc(state, start, stop)
         return CartesianIndex(I...)
     end

     # increment post check to avoid integer overflow
-    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = false, ()
+    @inline __inc(::Tuple{}, ::Tuple{}, ::Tuple{}) = nothing
     @inline function __inc(state::Tuple{Int}, start::Tuple{Int}, stop::Tuple{Int})
-        valid = state[1] < stop[1]
-        return valid, (state[1]+1,)
+        if state[1] < stop[1]
+            return (state[1]+1,)
+        else
+            return nothing
+        end
     end

     @inline function __inc(state, start, stop)
         if state[1] < stop[1]
-            return true, (state[1]+1, tail(state)...)
+            return (state[1]+1, tail(state)...)
         end
-        valid, I = __inc(tail(state), tail(start), tail(stop))
-        return valid, (start[1], I...)
+        I = __inc(tail(state), tail(start), tail(stop))
+        I === nothing && return nothing
+        return (start[1], I...)
     end

     # 0-d cartesian ranges are special-cased to iterate once and only once

After:

julia> @btime sumcart_iter($A)
  133.548 ms (0 allocations: 0 bytes)
5.000437863394746e7

Not sure where the extra ~7 ms is coming from, though (the manual case is 126--129 ms on my laptop).

timholy added a commit that referenced this issue Mar 11, 2020
This replaces iteration over CartesianIndices{N} with a set of N nested
one-dimensional loops. Fixes #9080. It also paves the way for
loop-reordering and other transformations.
timholy added a commit that referenced this issue Apr 10, 2020
This replaces iteration over CartesianIndices{N} with a set of N nested
one-dimensional loops. Fixes #9080. It also paves the way for
loop-reordering and other transformations.
timholy added a commit that referenced this issue Apr 10, 2020
This replaces iteration over CartesianIndices{N} with a set of N nested
one-dimensional loops. Fixes #9080. It also paves the way for
loop-reordering and other transformations.
timholy commented Oct 9, 2020

So incredible to watch this close before my eyes! I have been waiting almost 6 years for this. You are a hero, @johnnychen94!
