Written by
David Moore
on
on
Phys 141 - O(N^2) Gravity in CUDA
Here’s some sample minimal CUDA code optimized for the lab computers. Simply extract to a directory, type “make run”, and watch the CUDA program create a 30 frame animated gif (actually 30*15=450 total timesteps) with 10,000 particles.
If things are intimidating, note that “kernel.cu” is mostly c/c++ code, with only about 50 lines of CUDA code. These are the 50 lines that actually run on the GPU, and like I say in the code comments: It is primarily a gutted and rearranged version of the GPU gems nbody code, and the CUDA nbody sample.
//Hybrid of GPU Gems 3 ch. 31 and CUDA nbody example.
//20 FLOPs
__device__ float3 bodyBodyInteraction(float4 bi, float4 bj, float3 ai) {
float3 r;
// r_ij [3 FLOPs]
r.x = bj.x - bi.x;
r.y = bj.y - bi.y;
r.z = bj.z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPs]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS2;
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float distSixth = distSqr * distSqr * distSqr;
float invDistCube = 1.0f / sqrtf(distSixth);
// s = m_j * invDistCube [1 FLOP]
float s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPs]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
/* Hybrid of GPU Gems 3 ch. 31 and samples/5_Simulations/nbody/bodyststemcuda.cu */
__device__ float4 calculate_acceleration(float4 *devX, int numParticles, int deviceOffset) {
__shared__ float4 shPosition[nThreads];
float4 myPosition;
int tile;
float3 acc = { 0.0f, 0.0f, 0.0f };
int gtid = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;
myPosition = devX[gtid];
for (tile = 0; tile<numParticles / blockDim.x; tile++) {
shPosition[threadIdx.x] = devX[tile * blockDim.x + threadIdx.x];
__syncthreads();
#pragma unroll 64
for (unsigned int j = 0; j < blockDim.x; j++) {
acc = bodyBodyInteraction(myPosition, shPosition[j], acc);
}
__syncthreads();
}
float4 acc4 = { acc.x, acc.y, acc.z, 0.0f };
return acc4;
}
__global__ void kernel_step(float4 *devX, float4 *devV, float dt, int numParticles, int deviceOffset) {
int index = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;
//6 FLOPs
devX[index].x += devV[index].x*dt;
devX[index].y += devV[index].y*dt;
devX[index].z += devV[index].z*dt;
float4 acc = calculate_acceleration(devX, numParticles, deviceOffset);
//6 FLOPs.
devV[index].x += acc.x*dt;
devV[index].y += acc.y*dt;
devV[index].z += acc.z*dt;
}