# Work-inefficient inclusive scan
# - uses shared memory to reduce global memory accesses
#
# Based on http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
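#
# An inclusive scan replaces each element by the running total of everything
# up to and including it: scanning [1, 2, 3, 4] with `+` yields [1, 3, 6, 10].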
using CUDAdrv, CUDAnative
using Base.Test

# column-wise inclusive scan on the CPU, used as a reference
function cpu_accumulate!{F<:Function,T}(op::F, data::Matrix{T})
    rows = size(data,1)
    cols = size(data,2)
    for col in 1:cols
        accum = zero(T)
        for row in 1:rows
            accum = op(accum, data[row,col])
            data[row,col] = accum
        end
    end

    return
end
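
# For example, `cpu_accumulate!(+, [1 4; 2 5; 3 6])` turns the matrix into
# [1 4; 3 9; 6 15], accumulating each column from top to bottom.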

# GPU kernel: one block per column, one thread per row.
# Implements the double-buffered Hillis-Steele scan from GPU Gems 3, ch. 39.
function gpu_accumulate!{F<:Function,T}(op::F, data::CuDeviceArray{T,2})
    col = blockIdx().x
    cols = gridDim().x

    row = threadIdx().x
    rows = blockDim().x

    # guard against stray indices (always true for the launch below)
    if col <= cols && row <= rows
        # two shared-memory buffers of `rows` elements each, for ping-ponging
        shmem = @cuDynamicSharedMem(T, 2*rows)
        shmem[row] = data[row,col]
        sync_threads()

        # parallel scan: each pass combines elements `offset` apart,
        # alternating between the input (pin) and output (pout) buffer
        pin, pout = 1, 0
        offset = 1
        while offset < rows
            pout = 1 - pout
            pin = 1 - pin
            if row > offset
                shmem[pout * rows + row] =
                    op(shmem[pin * rows + row],
                       shmem[pin * rows + row - offset])
            else
                shmem[pout * rows + row] =
                    shmem[pin * rows + row]
            end
            sync_threads()
            offset *= 2
        end
        shmem[pin * rows + row] = shmem[pout * rows + row]
        sync_threads()

        # write back results
        data[row,col] = shmem[row]
    end

    return
end
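
# The kernel assumes one block per column, one thread per row, and enough
# dynamic shared memory for the two `rows`-element buffers; the `@cuda`
# launch below passes these as (blocks, threads, shmem bytes).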

# set up
dev = CuDevice(0)
ctx = CuContext(dev)

rows = 5
cols = 4
a = collect(reshape(1:rows*cols, (rows,cols)))

# scan on the CPU for reference
cpu_a = copy(a)
cpu_accumulate!(+, cpu_a)

# scan on the GPU, allocating shared memory for the two per-block buffers
gpu_a = CuArray(a)
@cuda (cols,rows, 2*rows*sizeof(eltype(a))) gpu_accumulate!(+, gpu_a)
@test cpu_a ≈ Array(gpu_a)
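
# The kernel is generic over the operator; as a sketch, a per-column running
# maximum (comparable against cpu_accumulate! here because all inputs are
# positive, so its zero(T) seed is a no-op for max):
cpu_b = copy(a)
cpu_accumulate!(max, cpu_b)
gpu_b = CuArray(a)
@cuda (cols,rows, 2*rows*sizeof(eltype(a))) gpu_accumulate!(max, gpu_b)
@test cpu_b ≈ Array(gpu_b)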
destroy(ctx)

# FURTHER IMPROVEMENTS:
# - work efficiency
# - avoid memory bank conflicts
# - large array support
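#
# The bank-conflict item could follow the padding scheme from the same
# GPU Gems chapter; a sketch, assuming 32 shared-memory banks and a
# hypothetical helper not used above:
#
#   padded(i) = i + ((i - 1) >> 5)
#
# indexing shmem through padded(i) after enlarging the allocation accordingly.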