Phys 141 - O(N^2) Gravity in CUDA

Here’s some sample minimal CUDA code optimized for the lab computers. Simply extract to a directory, type “make run”, and watch the CUDA program create a 30 frame animated gif (actually 30*15=450 total timesteps) with 10,000 particles.

Download Here.

If things are intimidating, note that “kernel.cu” is mostly c/c++ code, with only about 50 lines of CUDA code. These are the 50 lines that actually run on the GPU, and like I say in the code comments: It is primarily a gutted and rearranged version of the GPU gems nbody code, and the CUDA nbody sample.

//Hybrid of GPU Gems 3 ch. 31 and CUDA nbody example.
//20 FLOPs
__device__ float3 bodyBodyInteraction(float4 bi, float4 bj, float3 ai) {
	float3 r;
	// r_ij  [3 FLOPs]
	r.x = bj.x - bi.x;
	r.y = bj.y - bi.y;
	r.z = bj.z - bi.z;
	// distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPs]
	float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS2;
	// invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
	float distSixth = distSqr * distSqr * distSqr;
	float invDistCube = 1.0f / sqrtf(distSixth);
	// s = m_j * invDistCube [1 FLOP]
	float s = bj.w * invDistCube;
	// a_i =  a_i + s * r_ij [6 FLOPs]
	ai.x += r.x * s;
	ai.y += r.y * s;
	ai.z += r.z * s;
	return ai;
}

/* Hybrid of GPU Gems 3 ch. 31 and samples/5_Simulations/nbody/bodyststemcuda.cu */
__device__ float4 calculate_acceleration(float4 *devX, int numParticles, int deviceOffset) {
	__shared__ float4 shPosition[nThreads];
	float4 myPosition;
	int tile;
	float3 acc = { 0.0f, 0.0f, 0.0f };
	int gtid = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;
	myPosition = devX[gtid];
	for (tile = 0; tile<numParticles / blockDim.x; tile++) {
		shPosition[threadIdx.x] = devX[tile * blockDim.x + threadIdx.x];
		__syncthreads();
#pragma unroll 64
		for (unsigned int j = 0; j < blockDim.x; j++) {
			acc = bodyBodyInteraction(myPosition, shPosition[j], acc);
		}
		__syncthreads();
	}
	float4 acc4 = { acc.x, acc.y, acc.z, 0.0f };
	return acc4;
}

__global__ void kernel_step(float4 *devX, float4 *devV, float dt, int numParticles, int deviceOffset) {
	int index = deviceOffset + blockIdx.x * blockDim.x + threadIdx.x;

	//6 FLOPs
	devX[index].x += devV[index].x*dt;
	devX[index].y += devV[index].y*dt;
	devX[index].z += devV[index].z*dt;

	float4 acc = calculate_acceleration(devX, numParticles, deviceOffset);

	//6 FLOPs.
	devV[index].x += acc.x*dt;
	devV[index].y += acc.y*dt;
	devV[index].z += acc.z*dt;
}