
WIP: Add Cartesian product iteration. Fixes #1917 #6437

Closed
wants to merge 1 commit into from
Conversation

timholy (Sponsor Member) commented Apr 6, 2014

This is a WIP because next is not currently being inlined, and consequently this has a massive performance hit.

function sumloop{T}(A::AbstractArray{T,2}, iter)
    s = zero(typeof(one(T)+one(T)))
    for k = 1:iter
        for i2 = 1:size(A,2)
            for i1 = 1:size(A,1)
                s += A[i1,i2]
            end
        end
    end
    s
end

function sumiter{T}(A::AbstractArray{T}, iter)
    s = zero(typeof(one(T)+one(T)))
    for k = 1:iter
        for I = eachindex(A)
            s += A[I...]
        end
    end
    s
end

A = rand(500,500);

julia> @time sumloop(A, 100)
elapsed time: 0.048933864 seconds (64 bytes allocated)
1.2497917273335826e7

julia> @time sumiter(A, 100)
elapsed time: 4.077510313 seconds (2400008064 bytes allocated)
1.2497917273335826e7

julia> code_typed(sumiter, (typeof(A), Int))
1-element Array{Any,1}:
 :($(Expr(:lambda, {:A,:iter}, {{:s,:#s39,:#s38,:#s37,:k,:#s36,:#s35,:#s34,:I,:_var0,:_var1,:_var2,:_var3,:_var4,:_var5,:_var6},{{:A,Array{Float64,2},0},{:iter,Int64,0},{:s,Float64,2},{:#s39,UnitRange{Int64},18},{:#s38,Int64,2},{:#s37,(Int64,Int64),18},{:k,Int64,18},{:#s36,SizeIterator{2},18},{:#s35,(Int64,Int64),2},{:#s34,((Int64,Int64),(Int64,Int64)),18},{:I,(Int64,Int64),18},{:_var0,Int64,18},{:_var1,Bool,18},{:_var2,(Int64,Int64),18},{:_var3,Float64,18},{:_var4,Bool,18},{:_var5,Int64,18},{:_var6,Int64,18}},{}}, :(begin  # /tmp/iteration.jl, line 14:
        s = 0.0 # line 15:
        _var0 = top(getfield)(Intrinsics,:select_value)(top(sle_int)(1,iter::Int64)::Bool,iter::Int64,top(box)(Int64,top(sub_int)(1,1))::Int64)::Int64
        #s39 = $(Expr(:new, UnitRange{Int64}, 1, :(_var0::Int64)))::UnitRange{Int64}
        #s38 = top(getfield)(#s39::UnitRange{Int64},:start)::Int64
        _var1 = #s38::Int64 === top(box)(Int64,top(add_int)(top(getfield)(#s39::UnitRange{Int64},:stop)::Int64,1))::Int64::Bool
        unless top(box)(Bool,top(not_int)(_var1::Bool))::Bool goto 1
        2: 
        _var5 = #s38::Int64
        _var6 = top(box)(Int64,top(add_int)(#s38::Int64,1))::Int64
        k = _var5::Int64
        #s38 = _var6::Int64 # line 16:
        _var2 = top(tuple)(arraysize(A::Array{Float64,2},1)::Int64,arraysize(A::Array{Float64,2},2)::Int64)::(Int64,Int64)
        #s36 = $(Expr(:new, SizeIterator{2}, :(convert_tuple((Int64,Int64),_var2::(Int64,Int64),convert)::(Int64,Int64))))::SizeIterator{2}
        #s35 = top(tuple)(1,1)::(Int64,Int64)
        unless top(box)(Bool,top(not_int)(top(slt_int)(tupleref(top(getfield)(#s36::SizeIterator{2},:I)::(Int64,Int64),2)::Int64,tupleref(#s35::(Int64,Int64),2)::Int64)::Bool))::Bool goto 5
        6: 
        #s34 = top(next)(#s36::SizeIterator{2},#s35::(Int64,Int64))::((Int64,Int64),(Int64,Int64))
        I = top(tupleref)(#s34::((Int64,Int64),(Int64,Int64)),1)::(Int64,Int64)
        #s35 = top(tupleref)(#s34::((Int64,Int64),(Int64,Int64)),2)::(Int64,Int64) # line 17:
        _var3 = arrayref(A::Array{Float64,2},top(tupleref)(I::(Int64,Int64),1)::Int64,top(tupleref)(I::(Int64,Int64),2)::Int64)::Float64
        s = top(box)(Float64,top(add_float)(s::Float64,_var3::Float64))::Float64
        7: 
        unless top(box)(Bool,top(not_int)(top(box)(Bool,top(not_int)(top(slt_int)(tupleref(top(getfield)(#s36::SizeIterator{2},:I)::(Int64,Int64),2)::Int64,tupleref(#s35::(Int64,Int64),2)::Int64)::Bool))::Bool))::Bool goto 6
        5: 
        4: 
        3: 
        _var4 = #s38::Int64 === top(box)(Int64,top(add_int)(top(getfield)(#s39::UnitRange{Int64},:stop)::Int64,1))::Int64::Bool
        unless top(box)(Bool,top(not_int)(top(box)(Bool,top(not_int)(_var4::Bool))::Bool))::Bool goto 2
        1: 
        0:  # line 20:
        return s::Float64
    end::Float64))))

I'm aware that most of the performance hit comes from not eliding tuples in places where we ideally could. But once that problem gets solved, we'll have to face the next performance problem that stems more specifically from the lack of inlining. (Heck, we were worried about #5469, and compared to inlining the increment operation, moving the placement of state += 1 is peanuts.) So we'll need to fix the inlining anyway, and it neatly sidesteps the (harder?) problem of tuple elision in this particular case.

Once the problems are fixed we can expect to be within a small factor of manually-coded (or Cartesian-coded) loops. See discussion in #3796.
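To make the comparison concrete, here is a hypothetical, self-contained sketch of tuple-based Cartesian iteration written out as the kind of explicit loop that a `for` statement lowered to at the time; the `cart_*` names are illustrative stand-ins for the start/done/next protocol, not Base functions:

```julia
# Start at (1, 1, ..., 1); finished once the last index overflows its limit.
cart_start(dims) = ntuple(d -> 1, length(dims))
cart_done(dims, I) = I[end] > dims[end]

function cart_next(dims, I)
    # find the first dimension that has not yet hit its limit
    d = 1
    while d < length(dims) && I[d] == dims[d]
        d += 1
    end
    # reset every earlier dimension to 1 and increment dimension d
    return ntuple(i -> i < d ? 1 : (i == d ? I[d] + 1 : I[d]), length(I))
end

function sumcart(A)
    dims = size(A)
    s = zero(eltype(A))
    I = cart_start(dims)
    while !cart_done(dims, I)
        s += A[I...]           # each index is a tuple that must be elided
        I = cart_next(dims, I)
    end
    return s
end
```

Every call in this inner loop has to be inlined and the tuples elided for it to match the hand-written nested loops, which is exactly the performance problem described above.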

This was referenced Apr 16, 2014
Jutho (Contributor) commented May 6, 2014

Any progress on this and the inlining issues? I would also very much like to see this functionality, because I have also been thinking about iterators for running over higher-dimensional arrays. For example, you could create iterators where you specify a permutation of the array dimensions.

You could then write permutedims! as

for (I,J) = zip(iterate(A,1:N), iterate(B,permutation))
    B[J] = A[I]
end

If you combine this with implementations of different iterators that access the data according to different patterns, e.g. some cache-friendly blocking order or some divide-and-conquer order, you could write very simple expressions for cache-friendly methods on higher-dimensional arrays. I could easily imagine writing complicated tensor contractions with three simple for loops, just like matrix multiplication, where the iterators take care of matching all the right elements while ensuring cache-friendliness.
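A rough sketch of how such a permutation-aware iterator could reduce permutedims! to a single loop (this uses `Iterators.product` from later Julia versions; `myperm!` and the index bookkeeping are illustrative, not an actual implementation):

```julia
# Illustrative sketch only: iterate B in its natural column-major order and
# compute the matching index into A from the permutation. `myperm!` is a
# hypothetical name, not a Base function.
function myperm!(B, A, perm)
    iperm = invperm(collect(perm))          # where each dimension of A sits in perm
    for J in Iterators.product(axes(B)...)  # natural order over B
        I = ntuple(d -> J[iperm[d]], ndims(A))
        B[J...] = A[I...]
    end
    return B
end
```

With a genuinely permutation-aware iterator on the A side, this index arithmetic would live inside the iterator rather than in the loop body.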

But of course, none of this is possible as long as the inline keyword is not in place.

timholy (Sponsor Member, Author) commented Jun 28, 2014

@JeffBezanson, ifelse doesn't seem to help here:

function sumiter1{T}(A::AbstractArray{T,2}, iter)
    s = zero(typeof(one(T)+one(T)))
    for k = 1:iter
        I = (1, 1)
        lim = size(A)
        while I[2] <= lim[2]
            s += A[I[1], I[2]]
            I = ifelse(I[1] < lim[1], (I[1]+1,I[2]), (1, I[2]+1))
        end
    end
    s
end

function sumiter2{T}(A::AbstractArray{T,2}, iter)
    s = zero(typeof(one(T)+one(T)))
    for k = 1:iter
        I = (1, 1)
        lim = size(A)
        while I[2] <= lim[2]
            s += A[I[1], I[2]]
            I = I[1] < lim[1] ? (I[1]+1,I[2]) : (1, I[2]+1)
        end
    end
    s
end

After warmup:

julia> @time sumiter1(A, 100)
elapsed time: 2.756650596 seconds (1600000064 bytes allocated)
1.2504108036569213e7

julia> @time sumiter2(A, 100)
elapsed time: 0.118034392 seconds (64 bytes allocated)
1.2504108036569213e7

The ifelse version is allocating something on every iteration, presumably because you're passing tuples between functions?
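One evaluation-order difference is easy to demonstrate: `ifelse` is an ordinary function, so both candidate tuples are constructed (and cross a call boundary) on every iteration, whereas `?:` only evaluates the branch that is taken. A minimal sketch, with a counter purely for demonstration:

```julia
# Count how many times the tuple-building helper runs under each form.
calls = Ref(0)
count_tuple(t) = (calls[] += 1; t)

c = true
ifelse(c, count_tuple((1, 2)), count_tuple((3, 4)))
@assert calls[] == 2    # both tuples were constructed

calls[] = 0
c ? count_tuple((1, 2)) : count_tuple((3, 4))
@assert calls[] == 1    # only the taken branch ran
```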

timholy (Sponsor Member, Author) commented Jun 28, 2014

I also put a little time into trying to figure out why sumiter2 is slower than sumloop. After adding @inbounds to simplify the output of code_llvm, with A a Matrix{Float64} and iter an Int64, the inner loop of sumloop is this:

L4:                                               ; preds = %L4.preheader, %L4
  %s.2 = phi double [ %25, %L4 ], [ %s.1, %L4.preheader ]
  %"#s218.0" = phi i64 [ %20, %L4 ], [ 1, %L4.preheader ]
  %20 = add i64 %"#s218.0", 1, !dbg !1803
  %21 = add i64 %19, %"#s218.0", !dbg !1804
  %22 = getelementptr %jl_value_t* %4, i64 %21, !dbg !1804
  %23 = bitcast %jl_value_t* %22 to double*, !dbg !1804
  %24 = load double* %23, align 8, !dbg !1804, !tbaa %jtbaa_user
  %25 = fadd double %s.2, %24, !dbg !1804
  %26 = icmp eq i64 %"#s218.0", %15, !dbg !1804
  br i1 %26, label %L7, label %L4, !dbg !1804

which, as far as I can tell, is a fairly straightforward load-a-value-from-pointer-indexed-memory and add it to %s.2. In contrast, for sumiter2 the core load double and fadd operations are split across blocks:

L2:                                               ; preds = %L, %L5
  %s.1 = phi double [ %27, %L5 ], [ %s.0, %L ]
  %I.0 = phi <2 x i64> [ %I.1, %L5 ], [ <i64 1, i64 1>, %L ]
  %12 = extractelement <2 x i64> %I.0, i32 0, !dbg !1807
  %13 = add i64 %12, -1, !dbg !1807
  %14 = extractelement <2 x i64> %I.0, i32 1, !dbg !1807
  %15 = add i64 %14, -1, !dbg !1807
  %16 = mul i64 %15, %7, !dbg !1807
  %17 = add i64 %13, %16, !dbg !1807
  %18 = getelementptr %jl_value_t* %4, i64 %17, !dbg !1807
  %19 = bitcast %jl_value_t* %18 to double*, !dbg !1807
  %20 = load double* %19, align 8, !dbg !1807, !tbaa %jtbaa_user
  %21 = icmp slt i64 %12, %7, !dbg !1809
  br i1 %21, label %if3, label %L4, !dbg !1809

if3:                                              ; preds = %L2
  %22 = add i64 %12, 1, !dbg !1809
  %23 = insertelement <2 x i64> undef, i64 %22, i32 0, !dbg !1809, !julia_type !1810
  %24 = shufflevector <2 x i64> %23, <2 x i64> %I.0, <2 x i32> <i32 0, i32 3>, !dbg !1809
  br label %L5, !dbg !1809

L4:                                               ; preds = %L2
  %25 = add i64 %14, 1, !dbg !1809
  %26 = insertelement <2 x i64> <i64 1, i64 undef>, i64 %25, i32 1, !dbg !1809, !julia_type !1810
  br label %L5, !dbg !1809

L5:                                               ; preds = %L4, %if3
  %I.1 = phi <2 x i64> [ %26, %L4 ], [ %24, %if3 ]
  %27 = fadd double %s.1, %20, !dbg !1807
  %28 = extractelement <2 x i64> %I.1, i32 1, !dbg !1809
  %29 = icmp sgt i64 %28, %10, !dbg !1809
  br i1 %29, label %L7, label %L2, !dbg !1809

To me it looks like the tuple is not being elided as a pair of integer loop variables.

timholy (Sponsor Member, Author) commented Jun 28, 2014

Finally, I seem to remember that there's an issue (couldn't find it despite searching, unfortunately) where @carlobaldassi showed that if you replace the tuple with an immutable, you get rid of the performance hit. I seem to remember that Carlo himself thought that approach was a little too ugly, because you need an Iterator3 immutable for 3 dimensions, an Iterator4 for 4 dimensions, etc. But it does show that in principle this can be done.
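A hypothetical sketch of that immutable-per-dimension idea for 2 dimensions (written with the modern `struct` keyword, spelled `immutable` at the time; `Iter2` and the helper names are illustrative):

```julia
# A dedicated 2-D immutable holding the index, so no tuple needs to be
# heap-allocated or passed between non-inlined functions.
struct Iter2
    i1::Int
    i2::Int
end

done2(I::Iter2, dims) = I.i2 > dims[2]
next2(I::Iter2, dims) =
    I.i1 < dims[1] ? Iter2(I.i1 + 1, I.i2) : Iter2(1, I.i2 + 1)

function sumimm(A::AbstractMatrix)
    dims = size(A)
    s = zero(eltype(A))
    I = Iter2(1, 1)
    while !done2(I, dims)
        s += A[I.i1, I.i2]
        I = next2(I, dims)
    end
    return s
end
```

The downside noted above is that this needs a separate IterN for every dimensionality; a parametric immutable over an index tuple is essentially the direction the later efficient-iteration work took.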

timholy mentioned this pull request Aug 10, 2014
timholy added a commit that referenced this pull request Sep 20, 2014
@jiahao force-pushed the master branch 3 times, most recently from 6c7c7e3 to 1a4c02f on October 11, 2014
@timholy closed this in 0314913 on Nov 15, 2014
timholy added a commit that referenced this pull request Nov 15, 2014
Efficient cartesian iteration (new version of #6437)
waTeim pushed a commit to waTeim/julia that referenced this pull request Nov 23, 2014