-
Notifications
You must be signed in to change notification settings - Fork 0
/
kernel.cu
207 lines (180 loc) · 6.47 KB
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#include <cuda.h>
#include <stdio.h>
#include <cublas.h>
#include <iostream>
#include <algorithm>
#include <time.h>
//__device__ void swap(int i, int j) {
// float t;
// float *a=new float[];
// t = a[i];
// a[i] = a[j];
// a[j] = t;
//}
__device__ void insertion_sort(float* knnqueue, int* knnqueueind, int qpitch, int k, int threadId){
int n, o;
float var;
for (o = threadId; o < k*qpitch; o += qpitch){
n = o;
while (n > threadId){
if (knnqueue[n] >knnqueue[n - qpitch]){
var = knnqueue[n];
knnqueue[n] = knnqueue[n - qpitch];
knnqueue[n - qpitch] = var;
}
n -= qpitch;
}
}
}
__global__ void selection_of_knn(float *distlist, int *indlist, int listpitch, int k, int numofpoint, int refdimofpoint,
float *knnqueue, int *knnqueueind, int qpitch, int m, volatile int result)
//the visibility of memory operations on the result variable is ensured by declaring it as volatile
//From Nvidia documentation.
{
int thx;
int queueinit = 0;
int mval = m;
float locmax;
int j = 0;
int varpitch = qpitch;
__shared__ int flags[16];
int *flag = &flags[threadIdx.x / 32];
int b, c;//divfact for the dividing the sequence in the second bitonic sort step,
//initialization to 2 for the second stage of sorting as we devide the sequence into two lists and so on
float var, var1;
int var2 = 0;
//for bitonic diffrent stage sorting
int move = mval / 2;
thx = threadIdx.x + blockIdx.x*blockDim.x;
if (thx < numofpoint){
//queue initialization
if (queueinit == 0){
while (j < k){
knnqueue[j*qpitch + thx] = distlist[j*listpitch + thx];
//printf("knnqueueu: %d %f\n", thx, knnqueue[j*qpitch + thx]);
j++;
}
insertion_sort(knnqueue, 0, qpitch, k, thx);
queueinit = 1;
}
//insertion in the first level m
//locmax is the first element in the queue
locmax = knnqueue[thx];
for (int i = k*listpitch + thx; i < refdimofpoint*listpitch + thx; i += listpitch){
//locmax = knnqueue[thx];//re_assigning the locmax to the head of the first level in the queue
mval = m;
move = mval / 2;//reinitializing mval ,move to deal with the remaining elemnets in the list
if (distlist[i] < locmax){
knnqueue[thx] = distlist[i];
//insert to the first level m
insertion_sort(knnqueue, 0, qpitch, m, thx);
locmax = knnqueue[thx];
while ((locmax < knnqueue[mval*qpitch + thx]) && (mval <= k)){
//first bitonic sort step(two sorted list in decreasing order)
for (int a = mval*qpitch + thx; (a<k*qpitch + thx) && (a < (2 * mval*qpitch) + thx); a += qpitch){
if (knnqueue[a] > knnqueue[a - varpitch]){
var = knnqueue[a];
knnqueue[a] = knnqueue[a - varpitch];
knnqueue[a - varpitch] = var;
}
//else{
// break;//to finish the first bitonic step at the size of the previous level in the queue
//}
varpitch += 2 * qpitch;
}//end of for
//next bitonic sort steps
while (move >0){
for (c = 0; (c<(k*qpitch) + thx) && (c < (mval * 2 * qpitch) + thx); c += move * 2 * qpitch){
/*if (thx == 0)
printf("move,localmax,mval,move*2,b: %d %f %d %d \n", move, locmax, mval, move * 2);*/
for (b = thx; (b<k*qpitch + thx) && (b < move*qpitch + thx); b += qpitch){
if ((knnqueue[b + c] < knnqueue[b + (move*qpitch) + c]) && (b + (move*qpitch) + c<k*qpitch + thx)){
var1 = knnqueue[b + c];
knnqueue[b + c] = knnqueue[b + (move*qpitch) + c];
knnqueue[b + (move*qpitch) + c] = var1;
/*if (thx == 0){
for (int u = thx; u < k*qpitch; u += qpitch)
printf("next bitonic: %d %f\n", thx, knnqueue[u]);
printf("\n");
}*/
}
}
}
move /= 2;
}//end of while
//to compare with the next level in the queue
locmax = knnqueue[mval*qpitch + thx];//to ensure that the level heads are in decreasing order.
/* if (thx==0)
printf("locmax: %f\n", locmax);*/
mval = mval * 2;
move = mval / 2;
varpitch = qpitch;
}//the end of bitonic sort process for merging levels
}
locmax = knnqueue[thx];//re_assigning the locmax to the head of the first level in the queue
}
//bitonic merge
//if the head of the second level is less than that of the first one of size m
}//end of thx<numofpoint
if (thx == 0){
for (int y = thx; y < (k*qpitch) + thx; y += qpitch){
printf("last result: %d %f\n", var2, knnqueue[y]);
var2 += 1;
}
printf("\n");
}
}
int main(){
//testing width=8000; h=32000 k=64
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int width = 8192;//query points
int height = 32768;//ref points
int k = 1024;
size_t lpitch;
size_t qpitch;
cudaError_t val;
float *d_indistqueue;
float *d_list;
float *l_in = new float[height*width];
float *qh_in = new float[width*k];
//allocate cpu memor
float *h_out = (float *)malloc(width*k*sizeof(float));
// generate the input array on the host/
for (int i = 0; i < width*height; i++)
l_in[i] = (float)rand() / (float)RAND_MAX;
//l_in[i] = width*height - i;//should be used for testing bitonic sort
//float l_in[20] = {20,21,16,14,22,23,24,8,30,32,0,5,4,3,2,1,0,6,2,7};
val = cudaMallocPitch((void **)& d_indistqueue, &qpitch, width*sizeof(float), k);
val = cudaMallocPitch((void **)&d_list, &lpitch, width*sizeof(float), height);
if (val)
printf("Memorypitch Error: %s\n", cudaGetErrorString(val));
// transfer the array to the GPU
cudaMemcpy2D(d_indistqueue, qpitch, qh_in, width*sizeof(float), width*sizeof(float), k, cudaMemcpyHostToDevice);
cudaMemcpy2D(d_list, lpitch, l_in, width*sizeof(float), width*sizeof(float), height, cudaMemcpyHostToDevice);
// launch the kernel
dim3 Grid(width / 128 + 1, 1, 1);
dim3 threads(128, 1);
cudaEventRecord(start, 0);
selection_of_knn << <Grid, threads >> >(d_list, 0, lpitch / sizeof(float), k, width, height, d_indistqueue, 0, qpitch / sizeof(float), 8, 0);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
//bitonic_finalize_col<<<1,1>>>(d_indistqueue,1,1,8);
//
// copy back the result array to the CPU
cudaMemcpy2D(h_out, width*sizeof(float), d_indistqueue, qpitch, width*sizeof(float), k, cudaMemcpyDeviceToHost);
//cudaMemcpy(l_out, d_list, list_size*sizeof(int), cudaMemcpyDeviceToHost);
float milliseconds;
cudaEventElapsedTime(&milliseconds, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//
/*for (int j = 0; j < k*width; j++)
printf("dequeue: %d %f\n", j, h_out[j]);
*/
printf("The required time: %f\n", milliseconds / 1000);
cudaFree(d_indistqueue);
cudaFree(d_list);
return 0;
}