Initial Commit

harrism · Feb 20, 2014 · d96b97e · d96b97e
1 parent 7228039
commit d96b97e
Show file tree

Hide file tree

Showing 20 changed files with 974 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,38 @@
-mini-nbody
-==========
+mini-nbody: A simple N-body Code
+================================
 
 A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations.
+
+Benchmarks
+----------
+
+There are 5 different benchmarks provided for CUDA and MIC platforms.
+
+1. nbody-orig : the original, unoptimized simulation (also for CPU)
+2. nbody-soa  : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout
+3. nbody-flush : Flush denormals to zero (no code changes, just a command line option)
+4. nbody-block : Cache blocking
+5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC)
+
+Files
+-----
+
+nbody.c : simple, unoptimized OpenMP C code
+timer.h : simple cross-OS timing code
+
+Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies.
+
+cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU)
+  1. nbody-orig.cu   : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh)
+  2. nbody-soa.cu    : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh)
+  3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh)
+  4. nbody-block.cu  : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh)
+  5. nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh)
+
+mic/  : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P)
+  1. ../nbody-orig.cu : original code (shmoo-mic-nbody-orig.sh)
+  2. nbody-soa.c     : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh)
+  3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh)
+  4. nbody-block.c   : cache blocking via loop splitting (shmoo-mic-nbody-block.sh)
+  5. nbody-align.c   : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh)
+
diff --git a/cuda/nbody-block.cu b/cuda/nbody-block.cu
@@ -0,0 +1,99 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float4 *pos, *vel; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(float4 *p, float4 *v, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int tile = 0; tile < gridDim.x; tile++) {
+      __shared__ float3 spos[BLOCK_SIZE];
+      float4 tpos = p[tile * blockDim.x + threadIdx.x];
+      spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
+      __syncthreads();
+
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        float dx = spos[j].x - p[i].x;
+        float dy = spos[j].y - p[i].y;
+        float dz = spos[j].z - p[i].z;
+        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+        float invDist = rsqrtf(distSqr);
+        float invDist3 = invDist * invDist * invDist;
+
+        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+      }
+      __syncthreads();
+    }
+
+    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = 2*nBodies*sizeof(float4);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
+
+  randomizeBodies(buf, 8*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.pos[i].x += p.vel[i].x*dt;
+      p.pos[i].y += p.vel[i].y*dt;
+      p.pos[i].z += p.vel[i].z*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/nbody-orig.cu b/cuda/nbody-orig.cu
@@ -0,0 +1,91 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float x, y, z, vx, vy, vz; } Body;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(Body *p, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dx = p[j].x - p[i].x;
+      float dy = p[j].y - p[i].y;
+      float dz = p[j].z - p[i].z;
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = rsqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = nBodies*sizeof(Body);
+  float *buf = (float*)malloc(bytes);
+  Body *p = (Body*)buf;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  Body *d_p = (Body*)d_buf;
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p[i].x += p[i].vx*dt;
+      p[i].y += p[i].vy*dt;
+      p[i].z += p[i].vz*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/nbody-soa.cu b/cuda/nbody-soa.cu
@@ -0,0 +1,91 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float4 *pos, *vel; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(float4 *p, float4 *v, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dx = p[j].x - p[i].x;
+      float dy = p[j].y - p[i].y;
+      float dz = p[j].z - p[i].z;
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = rsqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = 2*nBodies*sizeof(float4);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
+
+  randomizeBodies(buf, 8*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.pos[i].x += p.vel[i].x*dt;
+      p.pos[i].y += p.vel[i].y*dt;
+      p.pos[i].z += p.vel[i].z*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}