// opinion_01.cu — CUDA demo: GPU matrix multiply (validated against a CPU
// reference) plus an Euler-method kernel for a 2nd-order ODE.
// (Original listing: 124 lines / 3.6 KB; web-scrape chrome removed.)
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
// Naive dense matrix multiply: C = A * B, row-major.
// A is m x n, B is n x k, C is m x k; one thread computes one C element.
// Expects a 2D launch; threads beyond the m x k output exit immediately.
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= m || col >= k)
        return;  // guard the grid tail when dimensions don't divide evenly

    int acc = 0;
    for (int p = 0; p < n; ++p)
        acc += a[row * n + p] * b[p * k + col];
    c[row * k + col] = acc;
}
// Right-hand side of the 2nd-order ODE, rewritten as a first-order system:
//   dy/dt = z
//   dz/dt = f(t, y, z) = -A*sin(2y) + B*sin(w*t)*sin(y) - C*z + D
// Edit the constants (or the expression) to model a different equation.
__device__ double f(double t, double y, double z) {
    const double A = 1.0, B = 2.0, C = 3.0, D = 4.0, w = 5.0;
    // Evaluation order kept left-to-right to preserve bit-exact results.
    return -A * sin(2.0 * y) + B * sin(w * t) * sin(y) - C * z + D;
}
// One explicit-Euler step for the system y' = z, z' = f(t, y, z), applied
// element-wise over a BLOCK_SIZE x BLOCK_SIZE tile of independent states.
//   a[i]: y values on entry, advanced in place
//   b[i]: z values on entry, advanced in place
//   c[i]: receives the updated y for host-side inspection
//   t0:   current time, dt: step size
// Fixes vs. original: the parameter list declared `double t0` twice (a
// compile error), the unused `y0` is dropped, the computed y_next/z_next
// were never stored (the kernel had no observable effect), and a stray
// closing brace ended the function early.
__global__ void eulerMethod(double* a, double* b, double* c, double t0, double dt) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard: only a BLOCK_SIZE x BLOCK_SIZE tile of states is valid.
    if (row >= BLOCK_SIZE || col >= BLOCK_SIZE)
        return;

    int idx = row + BLOCK_SIZE * col;   // same column-major layout as original
    double y = a[idx];
    double z = b[idx];
    double y_next = y + dt * z;               // y' = z
    double z_next = z + dt * f(t0, y, z);     // z' = f(t, y, z)
    a[idx] = y_next;                          // advance state in place
    b[idx] = z_next;
    c[idx] = y_next;                          // mirror y for the host
}
// Host driver: multiply two random 16x16 int matrices on the GPU and
// validate against a CPU reference computed into h_cc.
// Returns 0 when GPU and CPU results agree, 1 on any mismatch or launch error.
int main(int argc, char const *argv[])
{
    /* Fixed seed so runs are reproducible */
    srand(3333);
    int m = 16, n = 16, k = 16;

    // Pinned host memory (faster, async-capable transfers);
    // h_cc holds the CPU reference result.
    int *h_a, *h_b, *h_c, *h_cc;
    cudaMallocHost((void **) &h_a, sizeof(int) * m * n);
    cudaMallocHost((void **) &h_b, sizeof(int) * n * k);
    cudaMallocHost((void **) &h_c, sizeof(int) * m * k);
    cudaMallocHost((void **) &h_cc, sizeof(int) * m * k);

    // Randomly initialize A (m x n) and B (n x k).
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            h_a[i * n + j] = rand() % 1024;
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < k; ++j)
            h_b[i * k + j] = rand() % 1024;

    // Device buffers.
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **) &d_a, sizeof(int) * m * n);
    cudaMalloc((void **) &d_b, sizeof(int) * n * k);
    cudaMalloc((void **) &d_c, sizeof(int) * m * k);

    cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice);

    // Ceil-divide so any m/k are covered; the kernel's bounds guard
    // handles the grid tail.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    // Launch-configuration errors only surface via cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // NOTE(review): the original also launched
    //   eulerMethod<<<...>>>(d_a, d_b, d_c, dt, t_end)
    // here, but `dt`/`t_end` were never declared and d_a/d_b/d_c are int*
    // while the kernel takes double*. That launch is removed until the
    // Euler demo gets its own double-precision buffers.

    // Synchronize (cudaThreadSynchronize is deprecated) BEFORE copying so
    // any asynchronous kernel error is surfaced here, not later.
    cudaDeviceSynchronize();
    cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);

    // CPU reference. The original never filled h_cc, so the validation
    // below compared the GPU result against uninitialized memory.
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            int sum = 0;
            for (int p = 0; p < n; ++p)
                sum += h_a[i * n + p] * h_b[p * k + j];
            h_cc[i * k + j] = sum;
        }
    }

    // Validate GPU result against the CPU reference.
    int all_ok = 1;
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < k; ++j) {
            if (h_cc[i * k + j] != h_c[i * k + j]) {
                printf("mismatch at [%d][%d]: cpu=%d gpu=%d\n",
                       i, j, h_cc[i * k + j], h_c[i * k + j]);
                all_ok = 0;
            }
        }
    }
    printf(all_ok ? "all results match\n" : "MISMATCH detected\n");

    // Release device and pinned host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cc);
    return all_ok ? 0 : 1;
}