-
Notifications
You must be signed in to change notification settings - Fork 0
/
mmpy.cu
303 lines (251 loc) · 8.95 KB
/
mmpy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/*
* Simplest matrix multiplication in CUDA
*
* Scott B. Baden, University of California, San Diego
* April 2010
*
* We compute C = A * B
*
* This code assumes that the matrices are square though there
* are hooks to facilitate extending the code to non-square matrices
*
*/
// system includes
#include <stdio.h>
#include <assert.h>
#include <iostream>
// include the kernel
#include "mmpy_kernel.cu"
#include "types.h"
#include "utils.h"
// External function definitions
void genMatrix( _DOUBLE_ *a, unsigned int m, unsigned int n);
void genMatrix_bt( _DOUBLE_ *a, _DOUBLE_ *b, unsigned int n);
void genMatrix_rand( _DOUBLE_ *a, _DOUBLE_ *b, unsigned int n);
void verify( _DOUBLE_ *c, unsigned int m, unsigned int n, _DOUBLE_ eps, const char *mesg);
void verify_bt( _DOUBLE_ *c, unsigned int n, const char *mesg);
void verify( _DOUBLE_ *c_d, _DOUBLE_ *c_h, unsigned int m, unsigned int n, _DOUBLE_ eps, const char *mesg);
void verify_bt( _DOUBLE_ *c_d, _DOUBLE_ *c_h, unsigned int n, const char *mesg);
void verify_bt( _DOUBLE_ *c_d, _DOUBLE_ *c_h, unsigned int m, unsigned int n, const char *mesg);
void verify_rand( _DOUBLE_ *a, _DOUBLE_ *b, _DOUBLE_ *c, unsigned int n);
void printMatrix( _DOUBLE_ *a, unsigned int m, unsigned int n);
void cmdLine(int argc, char *argv[], int& n, int& reps, int& ntx, int& nty, _DOUBLE_ & eps, int& do_host, int& prefer_l1, int& use_rand, int& use_bt, int& use_shm_double);
void perfString(int n, int ntx, int nty, int reps, double t_h, double gflops_h, double t_d, double gflops_d, int do_host, int prefer_l1, int use_rand, int use_bt, int use_shm_double);
// extern "C"{
double getTime();
double gflops(int n, int niter, double time);
//}
void matMulHost(_DOUBLE_ *, const _DOUBLE_ *, const _DOUBLE_ *, unsigned int, unsigned int);
void setGrid(int n, dim3 &blockDim, dim3 &gridDim);
int
main(int argc, char** argv) {
// To improve repeatabilty of measurements taken on the device,
// we multiply the number of reps by this scale factor
// Adjust as needed
const int SCALE = 10;
// Read in the command line elements
int n, reps, ntx, nty, do_host, prefer_l1, use_rand, use_bt, use_shm_double;
_DOUBLE_ eps;
cmdLine(argc, argv, n, reps, ntx, nty, eps, do_host, prefer_l1, use_rand, use_bt, use_shm_double);
// The thread geometry must evenly divide N
/*if ((n % ntx != 0) || (n % nty != 0) )
{
printf("Thread geometry: %d x %d\n",ntx, nty);
printf("The length of the thread geometry axis ");
printf("[ %d x %d]\n",ntx, nty);
printf(" nust divide N [%d] evenly\n",n);
exit(-1);
}
*/
// Total amount of storage for entries
unsigned int n2 = n*n*sizeof(_DOUBLE_);
// Report on Device Characteristics
int capability = ReportDevice();
#ifdef _DOUBLE
int major = capability/100;
int minor = capability%100;
if ((major == 1) && (minor < 3)){
printf(" *** You are running on a capability %d.%d device\n",major, minor);
printf(" which does not support double precision arithmetic.\n");
printf(" Recompile with single precision.\n\n");
exit(-1);
}
#endif
// setup execution configurations
int _ntx, _nty;
#if (!defined(BLOCKDIM_X) && !defined(BLOCKDIM_Y))
_ntx = ntx;
_nty = nty;
#else
_ntx = BLOCKDIM_X;
_nty = BLOCKDIM_Y;
#endif
dim3 threads(_ntx, _nty,1);
int numblocksX = n/_ntx;
int numblocksY = n/_nty;
if( n % _ntx != 0 )
numblocksX++;
if( n % _nty != 0 )
numblocksY++;
dim3 grid(numblocksX, numblocksY, 1);
setGrid(n, threads, grid);
// print configurations
printf("n: %d, tx: %d, ty: %d, gridX: %d, gridY: %d, reps: %d, epsilon: %g\n\n", n, threads.x, threads.y, grid.x, grid.y, reps, eps);
#ifndef _DOUBLE
printf("Using Single precision arithmetic\n\n");
#else
printf("Using Double precision arithmetic\n\n");
#endif
if (use_bt)
printf("Using bidiagonal inputs\n");
if (use_rand)
printf("Using random inputs\n");
if (do_host)
printf("Doing host computation for comparison\n\n");
printf("\n");
// allocate an initialize host memory for A and B matrices
_DOUBLE_ *h_A = (_DOUBLE_ *) malloc(n2);
assert(h_A);
_DOUBLE_ *h_B = (_DOUBLE_ *) malloc(n2);
assert(h_B);
if (use_bt){
genMatrix_bt(h_A, h_B, n);
}
else if (use_rand){
genMatrix_rand(h_A, h_B, n);
}
else{
genMatrix(h_A, n, n);
genMatrix(h_B, n, n);
}
if (n <= 8){
cout << "\nA:\n";
printMatrix( h_A, n,n);
cout << "\nB:\n";
printMatrix( h_B, n,n);
}
_DOUBLE_ *hostC;
double t_host=0.0, gflops_h=0.0;
if (do_host){
// compute matrix product on the host
hostC = (_DOUBLE_ *) malloc(n2);
t_host = -getTime();
for (int r=0; r< reps; r++)
matMulHost(hostC, h_A, h_B, n, n);
t_host += getTime();
gflops_h = gflops(n, reps, t_host );
printf("Host computation time: %f sec. [%f gflops]\n",t_host,gflops_h);
// Verify host result
if (use_bt)
verify_bt( hostC,n, "Host result");
else if (use_rand)
cout << "Verfication of host result not supported for random matrices\n";
else
verify( hostC,n,n, eps, "Host result");
if (n <= 8){
printf("\nC:\n");
printMatrix( hostC, n,n);
}
}
// allocate device memory
_DOUBLE_ *d_A, *d_B, *d_C;
cudaMalloc((void**) &d_A, n2);
checkCUDAError("Error allocating device memory for matrix A");
cudaMalloc((void**) &d_B, n2);
checkCUDAError("Error allocating device memory for matrix B");
cudaMalloc((void**) &d_C, n2);
checkCUDAError("Error allocating device memory for matrix C");
cudaMemset((void **) d_A,-99,n2);
checkCUDAError("Error initializing device memory matrix A");
cudaMemset((void **) d_B,-99,n2);
checkCUDAError("Error initializing device memory matrix B");
cudaMemset((void **) d_C,0,n2);
checkCUDAError("Error clearing device memory matrix C");
// copy host memory to device
cudaMemcpy(d_A, h_A, n2, cudaMemcpyHostToDevice);
checkCUDAError("Error copying matrix A to device");
cudaMemcpy(d_B, h_B, n2, cudaMemcpyHostToDevice);
checkCUDAError("Error copying matrix B to device");
// allocate host memory for the result
_DOUBLE_ *h_C = (_DOUBLE_ *) malloc(n2);
assert(h_C);
// If we set the preference for L1 cache, rather than
// shared memory, we may run slightly faster on devices that have the capability
cudaFuncCache Preference;
if (prefer_l1){
Preference = cudaFuncCachePreferL1;
}
else{
Preference = cudaFuncCachePreferShared;
}
cudaFuncSetCacheConfig(matMul,Preference);
cudaSharedMemConfig shmPreference;
if (use_shm_double){
shmPreference = cudaSharedMemBankSizeEightByte;
}else{
shmPreference = cudaSharedMemBankSizeFourByte;
}
cudaFuncSetSharedMemConfig( matMul, shmPreference);
// Start the timer
#ifdef CUDA_TIMER
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event) ;
cudaEventCreate(&stop_event);
#endif
#ifdef CUDA_TIMER
cudaEventRecord(start_event, 0);
float t_device;
#else
cudaThreadSynchronize();
double t_device = -getTime();
#endif
// execute the kernel
for (int r=0; r< SCALE*reps; r++)
matMul<<< grid, threads >>>(n, d_C, d_A, d_B);
#ifdef CUDA_TIMER
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&t_device, start_event, stop_event);
t_device /= 1000.0;
#else
// block until the device has finished
cudaThreadSynchronize();
// Stop the timer
t_device +=getTime();
#endif
checkCUDAError("Error in matrixMul kernel");
// copy result from device to host
cudaMemcpy(h_C, d_C, n2, cudaMemcpyDeviceToHost);
checkCUDAError("Unable to retrieve result from device");
double gflops_d = gflops(n, SCALE*reps, t_device );
printf("Device computation time: %f sec. [%f gflops]\n",t_device,gflops_d);
perfString(n, ntx, nty, reps, t_host, gflops_h, t_device, gflops_d, do_host, prefer_l1, use_rand, use_bt, use_shm_double);
if (n <= 8){
printf("\nC (device):\n");
printMatrix( h_C, n,n);
}
// Verify the device result
if (use_bt)
verify_bt( h_C,n,"Device result");
else if (use_rand)
verify_rand( h_A, h_B, h_C, n);
else
verify( h_C,n,n, eps,"Device result");
// But not for random matrices
if (do_host)
// Compare host and device results
if (use_bt)
verify_bt( h_C, hostC, n,"Device vs. host");
else if (!use_rand)
verify( h_C, hostC, n, n, eps,"Device vs. host");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
if (do_host)
free(hostC);
assert(cudaSuccess ==cudaFree(d_A));
assert(cudaSuccess ==cudaFree(d_B));
assert(cudaSuccess ==cudaFree(d_C));
cudaThreadExit();
}