From 2e2c16a446838a3789c459eba79438f8aad0c244 Mon Sep 17 00:00:00 2001 From: N5N3 <2642243996@qq.com> Date: Sat, 26 Feb 2022 01:18:10 +0800 Subject: [PATCH] Fix `@simd` for non 1 step CartesianPartition (#42736) --- base/broadcast.jl | 6 ++-- base/multidimensional.jl | 64 +++++++++++++++++++++++----------------- test/iterators.jl | 13 ++++---- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/base/broadcast.jl b/base/broadcast.jl index fb9ba9555cfd9..c9694d6645099 100644 --- a/base/broadcast.jl +++ b/base/broadcast.jl @@ -973,14 +973,14 @@ end destc = dest.chunks cind = 1 bc′ = preprocess(dest, bc) - for P in Iterators.partition(eachindex(bc′), bitcache_size) + @inbounds for P in Iterators.partition(eachindex(bc′), bitcache_size) ind = 1 @simd for I in P - @inbounds tmp[ind] = bc′[I] + tmp[ind] = bc′[I] ind += 1 end @simd for i in ind:bitcache_size - @inbounds tmp[i] = false + tmp[i] = false end dumpbitcache(destc, cind, tmp) cind += bitcache_chunks diff --git a/base/multidimensional.jl b/base/multidimensional.jl index 7eb3cd915c3eb..b5e401a7834e7 100644 --- a/base/multidimensional.jl +++ b/base/multidimensional.jl @@ -477,9 +477,8 @@ module IteratorsMD simd_inner_length(iter::CartesianIndices, I::CartesianIndex) = Base.length(iter.indices[1]) simd_index(iter::CartesianIndices{0}, ::CartesianIndex, I1::Int) = first(iter) - @propagate_inbounds function simd_index(iter::CartesianIndices, Ilast::CartesianIndex, I1::Int) - CartesianIndex(getindex(iter.indices[1], I1+first(Base.axes1(iter.indices[1]))), Ilast.I...) - end + @propagate_inbounds simd_index(iter::CartesianIndices, Ilast::CartesianIndex, I1::Int) = + CartesianIndex(iter.indices[1][I1+firstindex(iter.indices[1])], Ilast) # Split out the first N elements of a tuple @inline function split(t, V::Val) @@ -585,7 +584,7 @@ module IteratorsMD CartesianIndices(intersect.(a.indices, b.indices)) # Views of reshaped CartesianIndices are used for partitions — ensure these are fast - const CartesianPartition{T<:CartesianIndex, P<:CartesianIndices, R<:ReshapedArray{T,1,P}} = SubArray{T,1,R,Tuple{UnitRange{Int}},false} + const CartesianPartition{T<:CartesianIndex, P<:CartesianIndices, R<:ReshapedArray{T,1,P}} = SubArray{T,1,R,<:Tuple{AbstractUnitRange{Int}},false} eltype(::Type{PartitionIterator{T}}) where {T<:ReshapedArrayLF} = SubArray{eltype(T), 1, T, Tuple{UnitRange{Int}}, true} eltype(::Type{PartitionIterator{T}}) where {T<:ReshapedArray} = SubArray{eltype(T), 1, T, Tuple{UnitRange{Int}}, false} Iterators.IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:ReshapedArray} = Iterators.IteratorEltype(T) @@ -594,7 +593,6 @@ module IteratorsMD eltype(::Type{PartitionIterator{T}}) where {T<:Union{UnitRange, StepRange, StepRangeLen, LinRange}} = T Iterators.IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:Union{OneTo, UnitRange, StepRange, StepRangeLen, LinRange}} = Iterators.IteratorEltype(T) - @inline function iterate(iter::CartesianPartition) isempty(iter) && return nothing f = first(iter) @@ -610,33 +608,45 @@ module IteratorsMD # In general, the Cartesian Partition might start and stop in the middle of the outer # dimensions — thus the outer range of a CartesianPartition is itself a # CartesianPartition. - t = tail(iter.parent.parent.indices) - ci = CartesianIndices(t) - li = LinearIndices(t) - return @inbounds view(ci, li[tail(iter[1].I)...]:li[tail(iter[end].I)...]) + mi = iter.parent.mi + ci = iter.parent.parent + ax, ax1 = axes(ci), Base.axes1(ci) + subs = Base.ind2sub_rs(ax, mi, first(iter.indices[1])) + vl, fl = Base._sub2ind(tail(ax), tail(subs)...), subs[1] + vr, fr = divrem(last(iter.indices[1]) - 1, mi[end]) .+ (1, first(ax1)) + oci = CartesianIndices(tail(ci.indices)) + # A fake CartesianPartition to reuse the outer iterate fallback + outer = @inbounds view(ReshapedArray(oci, (length(oci),), mi), vl:vr) + init = @inbounds dec(oci[tail(subs)...].I, oci.indices) # real init state + # Use Generator to make inner loop branchless + @inline function skip_len_I(i::Int, I::CartesianIndex) + l = i == 1 ? fl : first(ax1) + r = i == length(outer) ? fr : last(ax1) + l - first(ax1), r - l + 1, I + end + (skip_len_I(i, I) for (i, I) in Iterators.enumerate(Iterators.rest(outer, (init, 0)))) end - function simd_outer_range(iter::CartesianPartition{CartesianIndex{2}}) + @inline function simd_outer_range(iter::CartesianPartition{CartesianIndex{2}}) # But for two-dimensional Partitions the above is just a simple one-dimensional range # over the second dimension; we don't need to worry about non-rectangular staggers in # higher dimensions. - return @inbounds CartesianIndices((iter[1][2]:iter[end][2],)) - end - @inline function simd_inner_length(iter::CartesianPartition, I::CartesianIndex) - inner = iter.parent.parent.indices[1] - @inbounds fi = iter[1].I - @inbounds li = iter[end].I - inner_start = I.I == tail(fi) ? fi[1] : first(inner) - inner_end = I.I == tail(li) ? li[1] : last(inner) - return inner_end - inner_start + 1 - end - @inline function simd_index(iter::CartesianPartition, Ilast::CartesianIndex, I1::Int) - # I1 is the 0-based distance from the first dimension's offest - offset = first(iter.parent.parent.indices[1]) # (this is 1 for 1-based arrays) - # In the first column we need to also add in the iter's starting point (branchlessly) - f = @inbounds iter[1] - startoffset = (Ilast.I == tail(f.I))*(f[1] - 1) - CartesianIndex((I1 + offset + startoffset, Ilast.I...)) + mi = iter.parent.mi + ci = iter.parent.parent + ax, ax1 = axes(ci), Base.axes1(ci) + fl, vl = Base.ind2sub_rs(ax, mi, first(iter.indices[1])) + fr, vr = Base.ind2sub_rs(ax, mi, last(iter.indices[1])) + outer = @inbounds CartesianIndices((ci.indices[2][vl:vr],)) + # Use Generator to make inner loop branchless + @inline function skip_len_I(I::CartesianIndex{1}) + l = I == first(outer) ? fl : first(ax1) + r = I == last(outer) ? fr : last(ax1) + l - first(ax1), r - l + 1, I + end + (skip_len_I(I) for I in outer) end + @inline simd_inner_length(iter::CartesianPartition, (_, len, _)::Tuple{Int,Int,CartesianIndex}) = len + @propagate_inbounds simd_index(iter::CartesianPartition, (skip, _, I)::Tuple{Int,Int,CartesianIndex}, n::Int) = + simd_index(iter.parent.parent, I, n + skip) end # IteratorsMD diff --git a/test/iterators.jl b/test/iterators.jl index 5b68a8e84a712..8a068f161f146 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -550,12 +550,15 @@ end (1,1), (8,8), (11, 13), (1,1,1), (8, 4, 2), (11, 13, 17)), part in (1, 7, 8, 11, 63, 64, 65, 142, 143, 144) - P = partition(CartesianIndices(dims), part) - for I in P - @test length(I) == iterate_length(I) == simd_iterate_length(I) == simd_trip_count(I) - @test collect(I) == iterate_elements(I) == simd_iterate_elements(I) == index_elements(I) + for fun in (i -> 1:i, i -> 1:2:2i, i -> Base.IdentityUnitRange(-i:i)) + iter = CartesianIndices(map(fun, dims)) + P = partition(iter, part) + for I in P + @test length(I) == iterate_length(I) == simd_iterate_length(I) == simd_trip_count(I) + @test collect(I) == iterate_elements(I) == simd_iterate_elements(I) == index_elements(I) + end + @test all(Base.splat(==), zip(Iterators.flatten(map(collect, P)), iter)) end - @test all(Base.splat(==), zip(Iterators.flatten(map(collect, P)), CartesianIndices(dims))) end @testset "empty/invalid partitions" begin @test_throws ArgumentError partition(1:10, 0)