diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 335d2cd9..8071493c 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -537,7 +537,7 @@ end
 
 include("nditeration.jl")
 using .NDIteration
-import .NDIteration: get
+import .NDIteration: get, getrange
 
 ###
 # Kernel closure struct
@@ -590,11 +590,13 @@ backend(kernel::Kernel) = kernel.backend
         error(errmsg)
     end
 
+    offsets = nothing
+
     if static_ndrange <: StaticSize
         if ndrange !== nothing && ndrange != get(static_ndrange)
             error("Static NDRange ($static_ndrange) and launch NDRange ($ndrange) differ")
         end
-        ndrange = get(static_ndrange)
+        ndrange, offsets = getrange(static_ndrange)
     end
 
     if static_workgroupsize <: StaticSize
@@ -623,7 +625,7 @@ backend(kernel::Kernel) = kernel.backend
         workgroupsize = CartesianIndices(workgroupsize)
     end
 
-    iterspace = NDRange{length(ndrange), static_blocks, static_workgroupsize}(blocks, workgroupsize)
+    iterspace = NDRange{length(ndrange), offsets, static_blocks, static_workgroupsize}(blocks, workgroupsize)
 
     return iterspace, dynamic
 end
diff --git a/src/macros.jl b/src/macros.jl
index e93bc386..5bde61a3 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -1,5 +1,8 @@
 import MacroTools: splitdef, combinedef, isexpr, postwalk
 
+@inline contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N =
+    Tuple(1+o:r+o for (r, o) in zip(range, offset))
+
 function find_return(stmt)
     result = false
     postwalk(stmt) do expr
@@ -54,6 +57,8 @@ function __kernel(expr, generate_cpu=true)
         Core.@__doc__ $name(dev) = $name(dev, $DynamicSize(), $DynamicSize())
         $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize())
         $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range))
+        $name(dev, size, range, ::Nothing) = $name(dev, size, range)
+        $name(dev, size, range, offset) = $name(dev, $StaticSize(size), $StaticSize($contiguousrange(range, offset)))
         function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S<:$_Size, NDRange<:$_Size}
             if $isgpu(dev)
                 return $construct(dev, sz, range, $gpu_name)
diff --git a/src/nditeration.jl b/src/nditeration.jl
index d7598ae2..d6d4730f 100644
--- a/src/nditeration.jl
+++ b/src/nditeration.jl
@@ -13,21 +13,33 @@ abstract type _Size end
 struct DynamicSize <: _Size end
 struct StaticSize{S} <: _Size
     function StaticSize{S}() where S
-        new{S::Tuple{Vararg{Int}}}()
+        new{S::Tuple{Vararg}}()
     end
 end
 
 @pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}()
 @pure StaticSize(s::Int...) = StaticSize{s}()
 @pure StaticSize(s::Type{<:Tuple}) = StaticSize{tuple(s.parameters...)}()
+@pure StaticSize(s::Tuple{Vararg{UnitRange{Int}}}) = StaticSize{s}()
 
 # Some @pure convenience functions for `StaticSize`
 @pure get(::Type{StaticSize{S}}) where {S} = S
 @pure get(::StaticSize{S}) where {S} = S
 @pure Base.getindex(::StaticSize{S}, i::Int) where {S} = i <= length(S) ? S[i] : 1
-@pure Base.ndims(::StaticSize{S}) where {S} = length(S)
-@pure Base.length(::StaticSize{S}) where {S} = prod(S)
+@pure Base.ndims(::StaticSize{S}) where {S} = length(S)
+@pure Base.length(::StaticSize{S}) where {S} = prod(worksize.(S))
+@inline getrange(::StaticSize{S}) where {S} = worksize(S), offsets(S)
+@inline getrange(::Type{StaticSize{S}}) where {S} = worksize(S), offsets(S)
+
+@inline worksize(i::Tuple) = worksize.(i)
+@inline worksize(i::Int) = i
+@inline worksize(i::UnitRange) = length(i)
+
+@inline offsets(i) = offsets.(i)
+@inline offsets(::NTuple{N, Int}) where N = nothing
+@inline offsets(::Int) = nothing
+@inline offsets(i::UnitRange) = i.start - 1
 
 """
     NDRange
 
@@ -36,7 +48,7 @@ Encodes a blocked iteration space.
 # Example
 
 ```
-ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32)))
+ndrange = NDRange{2, nothing, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32)))
 for block in ndrange
     for items in workitems(ndrange)
         I = expand(ndrange, block, items)
@@ -46,23 +58,31 @@ for block in ndrange
 end
 ```
 """
-struct NDRange{N, StaticBlocks, StaticWorkitems, DynamicBlock, DynamicWorkitems}
+struct NDRange{N, Offsets, StaticBlocks, StaticWorkitems, DynamicBlock, DynamicWorkitems}
     blocks::DynamicBlock
     workitems::DynamicWorkitems
 
     function NDRange{N, B, W}() where {N, B, W}
-        new{N, B, W, Nothing, Nothing}(nothing, nothing)
+        new{N, nothing, B, W, Nothing, Nothing}(nothing, nothing)
     end
 
     function NDRange{N, B, W}(blocks, workitems) where {N, B, W}
-        new{N, B, W, typeof(blocks), typeof(workitems)}(blocks, workitems)
+        new{N, nothing, B, W, typeof(blocks), typeof(workitems)}(blocks, workitems)
+    end
+
+    function NDRange{N, O, B, W}() where {N, O, B, W}
+        new{N, O, B, W, Nothing, Nothing}(nothing, nothing)
+    end
+
+    function NDRange{N, O, B, W}(blocks, workitems) where {N, O, B, W}
+        new{N, O, B, W, typeof(blocks), typeof(workitems)}(blocks, workitems)
     end
 end
 
-@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:DynamicSize} = range.workitems::CartesianIndices{N}
-@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
-@inline blocks(range::NDRange{N, B}) where {N,B<:DynamicSize} = range.blocks::CartesianIndices{N}
-@inline blocks(range::NDRange{N, B}) where {N,B<:StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}
+@inline workitems(range::NDRange{N, O, B, W}) where {N,O,B,W<:DynamicSize} = range.workitems::CartesianIndices{N}
+@inline workitems(range::NDRange{N, O, B, W}) where {N,O,B,W<:StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
+@inline blocks(range::NDRange{N, O, B}) where {N,O,B<:DynamicSize} = range.blocks::CartesianIndices{N}
+@inline blocks(range::NDRange{N, O, B}) where {N,O,B<:StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}
 
 import Base.iterate
 @inline iterate(range::NDRange) = iterate(blocks(range))
@@ -70,12 +90,22 @@
 
 Base.length(range::NDRange) = length(blocks(range))
 
-@inline function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N
+@inline function expand(ndrange::NDRange{N, nothing}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N
     nI = ntuple(Val(N)) do I
         Base.@_inline_meta
         stride = size(workitems(ndrange), I)
         gidx = groupidx.I[I]
-        (gidx-1)*stride + idx.I[I]
+        (gidx-1)*stride + idx.I[I]
+    end
+    CartesianIndex(nI)
+end
+
+@inline function expand(ndrange::NDRange{N, Offsets}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N, Offsets}
+    nI = ntuple(Val(N)) do I
+        Base.@_inline_meta
+        stride = size(workitems(ndrange), I)
+        gidx = groupidx.I[I]
+        (gidx-1)*stride + idx.I[I] + Offsets[I]
     end
     CartesianIndex(nI)
 end
@@ -121,7 +151,6 @@ needs to perform dynamic bounds-checking.
         dynamic[] |= mod(ndrange[I], workgroupsize[I]) != 0
         return fld1(ndrange[I], workgroupsize[I])
     end
-
     return blocks, workgroupsize, dynamic[] ? DynamicCheck() : NoDynamicCheck()
 end
 end
diff --git a/test/test.jl b/test/test.jl
index 88086342..aaeb6710 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -215,6 +215,28 @@ end
     synchronize(Backend())
 end
 
+@kernel function index_global_offset!(a)
+    i, j = @index(Global, NTuple)
+    n, m = size(a)
+    @inbounds a[i, j] = i + n * j
+end
+
+@conditional_testset "Offset iteration space $Backend" skip_tests begin
+    a = KernelAbstractions.zeros(Backend(), 7, 9)
+    index_global_offset!(Backend(), (2, 2), size(a) .- 4, (2, 2))(a)
+    synchronize(Backend())
+
+    b = KernelAbstractions.zeros(CPU(), 7, 9)
+    b .= a
+
+    c = [i + 7 * j for i in 1:7, j in 1:9]
+
+    @test b[3:5, 3:7] == c[3:5, 3:7]
+    @test b[1:2, :] == zeros(2, 9)
+    @test b[6:7, :] == zeros(2, 9)
+    @test b[:, 1:2] == zeros(7, 2)
+    @test b[:, 8:9] == zeros(7, 2)
+end
+
 @conditional_testset "return statement" skip_tests begin
     try
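
For reference, a minimal usage sketch of the offset launch path this patch adds, assuming the patch is applied. The kernel name `fill_interior!` and the array sizes are illustrative (not from the patch); the four-argument instantiation `kernel_fn(backend, workgroupsize, ndrange, offset)` and the call style mirror the `index_global_offset!` test above.

using KernelAbstractions

# Illustrative kernel: with an offset iteration space, @index(Global, NTuple)
# yields global indices shifted by the static offset.
@kernel function fill_interior!(a)
    i, j = @index(Global, NTuple)
    @inbounds a[i, j] = 1.0
end

a = KernelAbstractions.zeros(CPU(), 8, 8)
# Workgroup size (2, 2), iteration extent (4, 4), offset (2, 2):
# contiguousrange((4, 4), (2, 2)) == (3:6, 3:6), so exactly a[3:6, 3:6] is written.
fill_interior!(CPU(), (2, 2), (4, 4), (2, 2))(a)
synchronize(CPU())
@assert sum(a) == 16 && all(a[3:6, 3:6] .== 1)

Passing `nothing` as the offset falls back to the plain static-range constructor, so callers can thread an optional offset through without branching.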