Commit

Initial Commit
harrism committed Feb 20, 2014
1 parent 7228039 commit d96b97e
Showing 20 changed files with 974 additions and 2 deletions.
38 changes: 36 additions & 2 deletions README.md
@@ -1,4 +1,38 @@
mini-nbody: A simple N-body Code
================================

A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations.
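
Every version computes the same softened all-pairs gravitational interaction (unit masses, with the gravitational constant absorbed into the time step). In LaTeX form, the acceleration accumulated for body $i$ by the `bodyForce` kernels below is

$$ \mathbf{F}_i = \sum_{j=1}^{n} \frac{\mathbf{r}_j - \mathbf{r}_i}{\left(\lVert \mathbf{r}_j - \mathbf{r}_i \rVert^2 + \varepsilon^2\right)^{3/2}} $$

where the `SOFTENING` constant plays the role of $\varepsilon^2$, keeping the denominator finite when two bodies nearly coincide.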

Benchmarks
----------

There are 5 different benchmarks provided for CUDA and MIC platforms.

1. nbody-orig : the original, unoptimized simulation (also runs on the CPU)
2. nbody-soa : conversion from array-of-structures (AoS) to structure-of-arrays (SoA) data layout; see the layout sketch after this list
3. nbody-flush : flush denormals to zero (no code changes, just a compiler command-line option)
4. nbody-block : cache blocking
5. nbody-unroll / nbody-align : platform-specific final optimizations (loop unrolling in CUDA, data alignment on MIC)
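
The layout change in step 2 is purely about how the body data sits in memory. Below is a minimal sketch (not code from this repository) of the two layouts, mirroring the `Body` and `BodySystem` types used in the .cu files later in this commit; it compiles as a .cu file with nvcc, or as C++ with the CUDA headers on the include path:

```c
#include <stdlib.h>
#include <cuda_runtime.h>  // float4

// Array of structures (AoS): one struct per body, fields interleaved in memory.
typedef struct { float x, y, z, vx, vy, vz; } Body;

// Structure of arrays (SoA): all positions contiguous, then all velocities,
// so consecutive threads (or vector lanes) touch consecutive float4 values.
typedef struct { float4 *pos, *vel; } BodySystem;

int main(void) {
  int n = 1024;

  // AoS: body i lives at aos[i], position and velocity side by side.
  Body *aos = (Body*)malloc(n * sizeof(Body));

  // SoA: one buffer split into a position half and a velocity half,
  // exactly as nbody-soa.cu sets up its host and device buffers.
  float4 *buf = (float4*)malloc(2 * n * sizeof(float4));
  BodySystem soa = { buf, buf + n };
  (void)soa;

  free(aos);
  free(buf);
  return 0;
}
```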

Files
-----

nbody.c : simple, unoptimized OpenMP C code
timer.h : simple cross-OS timing code

Each directory below includes scripts for building and running a "shmoo" (a sweep over problem sizes) of the five successive optimizations of the code, for data sizes ranging from 1,024 to 524,288 bodies.

cuda/ : CUDA-optimized versions of the original C code (in order of increasing performance on a Tesla K20c GPU)
1. nbody-orig.cu : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh)
2. nbody-soa.cu : conversion to structure-of-arrays (SoA) data layout (shmoo-cuda-nbody-soa.sh)
3. nbody-soa.cu + ftz : flush denormals to zero, enabled at compile time (shmoo-cuda-nbody-ftz.sh)
4. nbody-block.cu : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh)
5. nbody-unroll.cu : addition of "#pragma unroll" to the inner loop (shmoo-cuda-nbody-unroll.sh); a sketch of this change follows the list
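
nbody-unroll.cu itself is not reproduced in this commit view. The step it represents is a single hint on the innermost loop of the bodyForce kernel from cuda/nbody-block.cu (shown later in this commit); roughly, as a sketch rather than the verbatim file, and reusing that kernel's spos, p, i, and Fx/Fy/Fz variables:

```c
      // Inner tile loop of bodyForce from nbody-block.cu, with an unroll hint;
      // BLOCK_SIZE is a compile-time constant, so the compiler can unroll it.
      #pragma unroll
      for (int j = 0; j < BLOCK_SIZE; j++) {
        float dx = spos[j].x - p[i].x;
        float dy = spos[j].y - p[i].y;
        float dz = spos[j].z - p[i].z;
        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
        float invDist = rsqrtf(distSqr);
        float invDist3 = invDist * invDist * invDist;
        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
      }
```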

mic/ : Intel Xeon Phi (MIC) optimized versions of the original C code (in order of increasing performance on a Xeon Phi 7110P)
1. ../nbody.c : the original code (shmoo-mic-nbody-orig.sh)
2. nbody-soa.c : conversion to structure-of-arrays (SoA) data layout (shmoo-mic-nbody-soa.sh)
3. nbody-soa.c + ftz : flush denormals to zero, enabled at compile time (shmoo-mic-nbody-ftz.sh)
4. nbody-block.c : cache blocking via loop splitting (shmoo-mic-nbody-block.sh)
5. nbody-align.c : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh); a sketch of this change follows the list
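
nbody-align.c is likewise not included in this view. Below is a minimal sketch of the idea behind it, assuming POSIX aligned allocation and the Intel compiler's `#pragma vector aligned` hint; the actual file may use different mechanisms:

```c
#define _POSIX_C_SOURCE 200112L  // for posix_memalign
#include <stdlib.h>

#define ALIGNMENT 64  // 64-byte alignment matches the 512-bit MIC vector width

int main(void) {
  int n = 1024;
  float *x, *y;

  // Allocate each SoA array on a 64-byte boundary.
  posix_memalign((void**)&x, ALIGNMENT, n * sizeof(float));
  posix_memalign((void**)&y, ALIGNMENT, n * sizeof(float));

  for (int i = 0; i < n; i++) { x[i] = 1.0f; y[i] = 0.0f; }

  // With the Intel compiler, this pragma promises that arrays accessed in the
  // following loop are aligned, allowing aligned vector loads and stores.
#ifdef __INTEL_COMPILER
#pragma vector aligned
#endif
  for (int i = 0; i < n; i++)
    y[i] += 0.01f * x[i];

  free(x);
  free(y);
  return 0;
}
```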

99 changes: 99 additions & 0 deletions cuda/nbody-block.cu
@@ -0,0 +1,99 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"

#define BLOCK_SIZE 256
#define SOFTENING 1e-9f

typedef struct { float4 *pos, *vel; } BodySystem;

void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

__global__
void bodyForce(float4 *p, float4 *v, float dt, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;

    // Note: this tiling assumes n is a multiple of BLOCK_SIZE; otherwise the
    // final tile reads beyond the position array and inactive threads skip the
    // __syncthreads() barriers.
    for (int tile = 0; tile < gridDim.x; tile++) {
      // Each thread stages one body position of the current tile in shared memory.
      __shared__ float3 spos[BLOCK_SIZE];
      float4 tpos = p[tile * blockDim.x + threadIdx.x];
      spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
      __syncthreads();

      // Accumulate forces from all bodies in the tile, read from shared memory.
      for (int j = 0; j < BLOCK_SIZE; j++) {
        float dx = spos[j].x - p[i].x;
        float dy = spos[j].y - p[i].y;
        float dz = spos[j].z - p[i].z;
        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
        float invDist = rsqrtf(distSqr);
        float invDist3 = invDist * invDist * invDist;

        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
      }
      __syncthreads();
    }

    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
  }
}

int main(const int argc, const char** argv) {

  int nBodies = 30000;
  if (argc > 1) nBodies = atoi(argv[1]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  int bytes = 2*nBodies*sizeof(float4);
  float *buf = (float*)malloc(bytes);
  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };

  randomizeBodies(buf, 8*nBodies); // Init pos / vel data

  float *d_buf;
  cudaMalloc(&d_buf, bytes);
  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };

  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
  double totalTime = 0.0;

  for (int iter = 1; iter <= nIters; iter++) {
    StartTimer();

    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);

    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p.pos[i].x += p.vel[i].x*dt;
      p.pos[i].y += p.vel[i].y*dt;
      p.pos[i].z += p.vel[i].z*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    if (iter > 1) { // First iter is warm up
      totalTime += tElapsed;
    }
#ifndef SHMOO
    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
#endif
  }
  double avgTime = totalTime / (double)(nIters-1);

#ifdef SHMOO
  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#else
  printf("Average rate for iterations 2 through %d: %.3f steps per second.\n",
         nIters, 1.0 / avgTime);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#endif
  free(buf);
  cudaFree(d_buf);
}
91 changes: 91 additions & 0 deletions cuda/nbody-orig.cu
@@ -0,0 +1,91 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"

#define BLOCK_SIZE 256
#define SOFTENING 1e-9f

typedef struct { float x, y, z, vx, vy, vz; } Body;

void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

__global__
void bodyForce(Body *p, float dt, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;

    for (int j = 0; j < n; j++) {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;

      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
    }

    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
  }
}

int main(const int argc, const char** argv) {

  int nBodies = 30000;
  if (argc > 1) nBodies = atoi(argv[1]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  int bytes = nBodies*sizeof(Body);
  float *buf = (float*)malloc(bytes);
  Body *p = (Body*)buf;

  randomizeBodies(buf, 6*nBodies); // Init pos / vel data

  float *d_buf;
  cudaMalloc(&d_buf, bytes);
  Body *d_p = (Body*)d_buf;

  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
  double totalTime = 0.0;

  for (int iter = 1; iter <= nIters; iter++) {
    StartTimer();

    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);

    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p[i].x += p[i].vx*dt;
      p[i].y += p[i].vy*dt;
      p[i].z += p[i].vz*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    if (iter > 1) { // First iter is warm up
      totalTime += tElapsed;
    }
#ifndef SHMOO
    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
#endif
  }
  double avgTime = totalTime / (double)(nIters-1);

#ifdef SHMOO
  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#else
  printf("Average rate for iterations 2 through %d: %.3f steps per second.\n",
         nIters, 1.0 / avgTime);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#endif
  free(buf);
  cudaFree(d_buf);
}
91 changes: 91 additions & 0 deletions cuda/nbody-soa.cu
@@ -0,0 +1,91 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"

#define BLOCK_SIZE 256
#define SOFTENING 1e-9f

typedef struct { float4 *pos, *vel; } BodySystem;

void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

__global__
void bodyForce(float4 *p, float4 *v, float dt, int n) {
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < n) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;

    for (int j = 0; j < n; j++) {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;

      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
    }

    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
  }
}

int main(const int argc, const char** argv) {

  int nBodies = 30000;
  if (argc > 1) nBodies = atoi(argv[1]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  int bytes = 2*nBodies*sizeof(float4);
  float *buf = (float*)malloc(bytes);
  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };

  randomizeBodies(buf, 8*nBodies); // Init pos / vel data

  float *d_buf;
  cudaMalloc(&d_buf, bytes);
  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };

  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
  double totalTime = 0.0;

  for (int iter = 1; iter <= nIters; iter++) {
    StartTimer();

    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);

    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p.pos[i].x += p.vel[i].x*dt;
      p.pos[i].y += p.vel[i].y*dt;
      p.pos[i].z += p.vel[i].z*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    if (iter > 1) { // First iter is warm up
      totalTime += tElapsed;
    }
#ifndef SHMOO
    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
#endif
  }
  double avgTime = totalTime / (double)(nIters-1);

#ifdef SHMOO
  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#else
  printf("Average rate for iterations 2 through %d: %.3f steps per second.\n",
         nIters, 1.0 / avgTime);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
#endif
  free(buf);
  cudaFree(d_buf);
}