-
Notifications
You must be signed in to change notification settings - Fork 3
/
01-nbody.cu
158 lines (123 loc) · 4.32 KB
/
01-nbody.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#define BLOCK_SIZE 256
#define SOFTENING 1e-9f
/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/
typedef struct { float x, y, z, vx, vy, vz; } Body;
/*
* Do not modify this function. A constraint of this exercise is
* that it remain a host function.
*/
void randomizeBodies(float *data, int n) {
for (int i = 0; i < n; i++) {
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}
/*
* This function calculates the gravitational impact of all bodies in the system
* on all others, but does not update their positions.
*/
__global__ void bodyForce(Body *p, float dt, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int tile = 0; tile < gridDim.x; tile++) {
__shared__ float3 spos[BLOCK_SIZE];
Body tpos = p[tile * blockDim.x + threadIdx.x];
spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; j++) {
float dx = spos[j].x - p[i].x;
float dy = spos[j].y - p[i].y;
float dz = spos[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
__syncthreads();
}
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}
}
int main(const int argc, const char** argv) {
/*
* Do not change the value for `nBodies` here. If you would like to modify it,
* pass values into the command line.
*/
int nBodies = 2<<11;
int salt = 0;
if (argc > 1) nBodies = 2<<atoi(argv[1]);
/*
* This salt is for assessment reasons. Tampering with it will result in automatic failure.
*/
if (argc > 2) salt = atoi(argv[2]);
const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations
int bytes = nBodies * sizeof(Body);
float *buf;
buf = (float *)malloc(bytes);
Body *p = (Body*)buf;
/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
*/
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
float *d_buf;
cudaMalloc(&d_buf, bytes);
Body *d_p = (Body*)d_buf;
int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
double totalTime = 0.0;
/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/
/*******************************************************************/
// Do not modify these 2 lines of code.
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
/*******************************************************************/
/*
* You will likely wish to refactor the work being done in `bodyForce`,
* as well as the work to integrate the positions.
*/
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
// bodyForce(p, dt, nBodies); // compute interbody forces
/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
for (int i = 0 ; i < nBodies; i++) { // integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
/*******************************************************************/
// Do not modify the code in this section.
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}
double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
checkAccuracy(buf, nBodies);
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
salt += 1;
#endif
/*******************************************************************/
/*
* Feel free to modify code below.
*/
free(buf);
cudaFree(d_buf);
}