From d96b97ea64b3fcd17a7844e2839bb1845610791e Mon Sep 17 00:00:00 2001
From: Mark Harris <harrism@gmail.com>
Date: Thu, 20 Feb 2014 13:53:35 +1000
Subject: [PATCH] Initial Commit

---
 README.md                       |  38 +++++++++++-
 cuda/nbody-block.cu             |  99 +++++++++++++++++++++++++++++++
 cuda/nbody-orig.cu              |  91 +++++++++++++++++++++++++++++
 cuda/nbody-soa.cu               |  91 +++++++++++++++++++++++++++++
 cuda/nbody-unroll.cu            | 100 ++++++++++++++++++++++++++++++++
 cuda/shmoo-cuda-nbody-block.sh  |  14 +++++
 cuda/shmoo-cuda-nbody-ftz.sh    |  14 +++++
 cuda/shmoo-cuda-nbody-orig.sh   |  14 +++++
 cuda/shmoo-cuda-nbody-soa.sh    |  14 +++++
 cuda/shmoo-cuda-nbody-unroll.sh |  14 +++++
 mic/nbody-align.c               | 100 ++++++++++++++++++++++++++++++++
 mic/nbody-block.c               |  93 +++++++++++++++++++++++++++++
 mic/nbody-soa.c                 |  85 +++++++++++++++++++++++++++
 mic/shmoo-mic-nbody-align.sh    |  23 ++++++++
 mic/shmoo-mic-nbody-block.sh    |  23 ++++++++
 mic/shmoo-mic-nbody-ftz.sh      |  23 ++++++++
 mic/shmoo-mic-nbody-orig.sh     |  23 ++++++++
 mic/shmoo-mic-nbody-soa.sh      |  23 ++++++++
 nbody.c                         |  81 ++++++++++++++++++++++++++
 shmoo-cpu-nbody.sh              |  13 +++++
 20 files changed, 974 insertions(+), 2 deletions(-)
 create mode 100644 cuda/nbody-block.cu
 create mode 100644 cuda/nbody-orig.cu
 create mode 100644 cuda/nbody-soa.cu
 create mode 100644 cuda/nbody-unroll.cu
 create mode 100755 cuda/shmoo-cuda-nbody-block.sh
 create mode 100755 cuda/shmoo-cuda-nbody-ftz.sh
 create mode 100755 cuda/shmoo-cuda-nbody-orig.sh
 create mode 100755 cuda/shmoo-cuda-nbody-soa.sh
 create mode 100755 cuda/shmoo-cuda-nbody-unroll.sh
 create mode 100644 mic/nbody-align.c
 create mode 100644 mic/nbody-block.c
 create mode 100644 mic/nbody-soa.c
 create mode 100755 mic/shmoo-mic-nbody-align.sh
 create mode 100755 mic/shmoo-mic-nbody-block.sh
 create mode 100755 mic/shmoo-mic-nbody-ftz.sh
 create mode 100755 mic/shmoo-mic-nbody-orig.sh
 create mode 100755 mic/shmoo-mic-nbody-soa.sh
 create mode 100644 nbody.c
 create mode 100755 shmoo-cpu-nbody.sh

diff --git a/README.md b/README.md
index 29f0932..c33fa8a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,38 @@
-mini-nbody
-==========
+mini-nbody: A simple N-body Code
+================================
 
 A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations.
+
+Benchmarks
+----------
+
+There are 5 different benchmarks provided for CUDA and MIC platforms.
+
+1. nbody-orig : the original, unoptimized simulation (also for CPU)
+2. nbody-soa  : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout
+3. nbody-flush : Flush denormals to zero (no code changes, just a command line option)
+4. nbody-block : Cache blocking
+5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC)
+
+Files
+-----
+
+nbody.c : simple, unoptimized OpenMP C code
+timer.h : simple cross-OS timing code
+
+Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies.
+
+cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU)
+  1. nbody-orig.cu   : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh)
+  2. nbody-soa.cu    : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh)
+  3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh)
+  4. nbody-block.cu  : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh)
+  5. nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh)
+
+mic/  : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P)
+  1. ../nbody-orig.cu : original code (shmoo-mic-nbody-orig.sh)
+  2. nbody-soa.c     : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh)
+  3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh)
+  4. nbody-block.c   : cache blocking via loop splitting (shmoo-mic-nbody-block.sh)
+  5. nbody-align.c   : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh)
+
diff --git a/cuda/nbody-block.cu b/cuda/nbody-block.cu
new file mode 100644
index 0000000..1466bbd
--- /dev/null
+++ b/cuda/nbody-block.cu
@@ -0,0 +1,99 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float4 *pos, *vel; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(float4 *p, float4 *v, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int tile = 0; tile < gridDim.x; tile++) {
+      __shared__ float3 spos[BLOCK_SIZE];
+      float4 tpos = p[tile * blockDim.x + threadIdx.x];
+      spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
+      __syncthreads();
+
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        float dx = spos[j].x - p[i].x;
+        float dy = spos[j].y - p[i].y;
+        float dz = spos[j].z - p[i].z;
+        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+        float invDist = rsqrtf(distSqr);
+        float invDist3 = invDist * invDist * invDist;
+
+        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+      }
+      __syncthreads();
+    }
+
+    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+  
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+  
+  int bytes = 2*nBodies*sizeof(float4);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
+
+  randomizeBodies(buf, 8*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.pos[i].x += p.vel[i].x*dt;
+      p.pos[i].y += p.vel[i].y*dt;
+      p.pos[i].z += p.vel[i].z*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/nbody-orig.cu b/cuda/nbody-orig.cu
new file mode 100644
index 0000000..3645686
--- /dev/null
+++ b/cuda/nbody-orig.cu
@@ -0,0 +1,91 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float x, y, z, vx, vy, vz; } Body;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(Body *p, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dx = p[j].x - p[i].x;
+      float dy = p[j].y - p[i].y;
+      float dz = p[j].z - p[i].z;
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = rsqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+  
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = nBodies*sizeof(Body);
+  float *buf = (float*)malloc(bytes);
+  Body *p = (Body*)buf;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  Body *d_p = (Body*)d_buf;
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p[i].x += p[i].vx*dt;
+      p[i].y += p[i].vy*dt;
+      p[i].z += p[i].vz*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/nbody-soa.cu b/cuda/nbody-soa.cu
new file mode 100644
index 0000000..0da50e6
--- /dev/null
+++ b/cuda/nbody-soa.cu
@@ -0,0 +1,91 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float4 *pos, *vel; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(float4 *p, float4 *v, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dx = p[j].x - p[i].x;
+      float dy = p[j].y - p[i].y;
+      float dz = p[j].z - p[i].z;
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = rsqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+  
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+  
+  int bytes = 2*nBodies*sizeof(float4);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
+
+  randomizeBodies(buf, 8*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.pos[i].x += p.vel[i].x*dt;
+      p.pos[i].y += p.vel[i].y*dt;
+      p.pos[i].z += p.vel[i].z*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/nbody-unroll.cu b/cuda/nbody-unroll.cu
new file mode 100644
index 0000000..f99bf07
--- /dev/null
+++ b/cuda/nbody-unroll.cu
@@ -0,0 +1,100 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define BLOCK_SIZE 256
+#define SOFTENING 1e-9f
+
+typedef struct { float4 *pos, *vel; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+__global__
+void bodyForce(float4 *p, float4 *v, float dt, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int tile = 0; tile < gridDim.x; tile++) {
+      __shared__ float3 spos[BLOCK_SIZE];
+      float4 tpos = p[tile * blockDim.x + threadIdx.x];
+      spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z);
+      __syncthreads();
+
+      #pragma unroll
+      for (int j = 0; j < BLOCK_SIZE; j++) {
+        float dx = spos[j].x - p[i].x;
+        float dy = spos[j].y - p[i].y;
+        float dz = spos[j].z - p[i].z;
+        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+        float invDist = rsqrtf(distSqr);
+        float invDist3 = invDist * invDist * invDist;
+
+        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+      }
+      __syncthreads();
+    }
+
+    v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+  
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+  
+  int bytes = 2*nBodies*sizeof(float4);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies };
+
+  randomizeBodies(buf, 8*nBodies); // Init pos / vel data
+
+  float *d_buf;
+  cudaMalloc(&d_buf, bytes);
+  BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies };
+
+  int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  double totalTime = 0.0; 
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
+    bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies);
+    cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.pos[i].x += p.vel[i].x*dt;
+      p.pos[i].y += p.vel[i].y*dt;
+      p.pos[i].z += p.vel[i].z*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+  cudaFree(d_buf);
+}
diff --git a/cuda/shmoo-cuda-nbody-block.sh b/cuda/shmoo-cuda-nbody-block.sh
new file mode 100755
index 0000000..0eb4825
--- /dev/null
+++ b/cuda/shmoo-cuda-nbody-block.sh
@@ -0,0 +1,14 @@
+SRC=nbody-block.cu
+EXE=nbody-block
+
+nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+
diff --git a/cuda/shmoo-cuda-nbody-ftz.sh b/cuda/shmoo-cuda-nbody-ftz.sh
new file mode 100755
index 0000000..8263feb
--- /dev/null
+++ b/cuda/shmoo-cuda-nbody-ftz.sh
@@ -0,0 +1,14 @@
+SRC=nbody-soa.cu
+EXE=nbody-ftz
+
+nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+
diff --git a/cuda/shmoo-cuda-nbody-orig.sh b/cuda/shmoo-cuda-nbody-orig.sh
new file mode 100755
index 0000000..a0d5b4c
--- /dev/null
+++ b/cuda/shmoo-cuda-nbody-orig.sh
@@ -0,0 +1,14 @@
+SRC=nbody-orig.cu
+EXE=nbody-orig
+
+nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+
diff --git a/cuda/shmoo-cuda-nbody-soa.sh b/cuda/shmoo-cuda-nbody-soa.sh
new file mode 100755
index 0000000..5a6e5d7
--- /dev/null
+++ b/cuda/shmoo-cuda-nbody-soa.sh
@@ -0,0 +1,14 @@
+SRC=nbody-soa.cu
+EXE=nbody-soa
+
+nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+
diff --git a/cuda/shmoo-cuda-nbody-unroll.sh b/cuda/shmoo-cuda-nbody-unroll.sh
new file mode 100755
index 0000000..fb12c54
--- /dev/null
+++ b/cuda/shmoo-cuda-nbody-unroll.sh
@@ -0,0 +1,14 @@
+SRC=nbody-unroll.cu
+EXE=nbody-unroll
+
+nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+
diff --git a/mic/nbody-align.c b/mic/nbody-align.c
new file mode 100644
index 0000000..7fa9303
--- /dev/null
+++ b/mic/nbody-align.c
@@ -0,0 +1,100 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define CACHELINE 64 // size of cache line [bytes]
+#define SOFTENING 1e-9f
+
+typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+
+void bodyForce(BodySystem p, float dt, int n, int tileSize) {
+  for (int tile = 0; tile < n; tile += tileSize) {
+    int to = tile + tileSize; 
+    if (to > n) to = n;
+
+    #pragma omp parallel for schedule(dynamic)
+    for (int i = 0; i < n; i++) {
+      float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+      #pragma vector aligned
+      #pragma simd
+      for (int j = tile; j < to; j++) {
+        float dy = p.y[j] - p.y[i];
+        float dz = p.z[j] - p.z[i];
+        float dx = p.x[j] - p.x[i];
+        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+        float invDist = 1.0f / sqrtf(distSqr);
+        float invDist3 = invDist * invDist * invDist;
+
+        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;      
+      }
+    
+      p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
+    }
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  int tileSize = 24400;
+  if (tileSize > nBodies) tileSize = nBodies;
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  if ( tileSize % (CACHELINE/sizeof(float)) ) {
+    printf("ERROR: blockSize not multiple of %d vector elements\n", CACHELINE/(int)sizeof(float));
+    exit(1);
+  }
+
+  int bytes = 6*nBodies*sizeof(float);
+  float *buf = (float*)_mm_malloc(bytes, CACHELINE);
+  BodySystem p;
+  p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
+  p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  double totalTime = 0.0;
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    bodyForce(p, dt, nBodies, tileSize); // compute interbody forces
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.x[i] += p.vx[i]*dt;
+      p.y[i] += p.vy[i]*dt;
+      p.z[i] += p.vz[i]*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  _mm_free(buf);
+}
diff --git a/mic/nbody-block.c b/mic/nbody-block.c
new file mode 100644
index 0000000..2477a9a
--- /dev/null
+++ b/mic/nbody-block.c
@@ -0,0 +1,93 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define CACHELINE 64 // size of cache line [bytes]
+#define SOFTENING 1e-9f
+
+typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+
+void bodyForce(BodySystem p, float dt, int n, int tileSize) {
+  for (int tile = 0; tile < n; tile += tileSize) {
+    int to = tile + tileSize; 
+    if (to > n) to = n;
+
+    #pragma omp parallel for schedule(dynamic)
+    for (int i = 0; i < n; i++) {
+      float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+      for (int j = tile; j < to; j++) {
+        float dy = p.y[j] - p.y[i];
+        float dz = p.z[j] - p.z[i];
+        float dx = p.x[j] - p.x[i];
+        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+        float invDist = 1.0f / sqrtf(distSqr);
+        float invDist3 = invDist * invDist * invDist;
+
+        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;      
+      }
+    
+      p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
+    }
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  int tileSize = 24400;
+  if (tileSize > nBodies) tileSize = nBodies;
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = 6*nBodies*sizeof(float);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p;
+  p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
+  p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  double totalTime = 0.0;
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    bodyForce(p, dt, nBodies, tileSize); // compute interbody forces
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.x[i] += p.vx[i]*dt;
+      p.y[i] += p.vy[i]*dt;
+      p.z[i] += p.vz[i]*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+}
diff --git a/mic/nbody-soa.c b/mic/nbody-soa.c
new file mode 100644
index 0000000..33c1615
--- /dev/null
+++ b/mic/nbody-soa.c
@@ -0,0 +1,85 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+
+#define SOFTENING 1e-9f
+
+typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+
+void bodyForce(BodySystem p, float dt, int n) {
+  #pragma omp parallel for schedule(dynamic)
+  for (int i = 0; i < n; i++) { 
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dy = p.y[j] - p.y[i];
+      float dz = p.z[j] - p.z[i];
+      float dx = p.x[j] - p.x[i];
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = 1.0f / sqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = 6*nBodies*sizeof(float);
+  float *buf = (float*)malloc(bytes);
+  BodySystem p;
+  p.x  = buf+0*nBodies; p.y  = buf+1*nBodies; p.z  = buf+2*nBodies;
+  p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  double totalTime = 0.0;
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    bodyForce(p, dt, nBodies); // compute interbody forces
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p.x[i] += p.vx[i]*dt;
+      p.y[i] += p.vy[i]*dt;
+      p.z[i] += p.vz[i]*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+}
diff --git a/mic/shmoo-mic-nbody-align.sh b/mic/shmoo-mic-nbody-align.sh
new file mode 100755
index 0000000..83f40ca
--- /dev/null
+++ b/mic/shmoo-mic-nbody-align.sh
@@ -0,0 +1,23 @@
+SRC=nbody-align.c
+EXE=nbody-align-mic
+MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
+MIC=mic0
+if [ $# -gt 0 ]
+  then
+    MIC=$1
+fi
+
+icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
+
+scp $EXE $MIC:~/
+scp $MICROOT/libiomp5.so $MIC:~/
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
+    K=$(($K*2))
+done
+
diff --git a/mic/shmoo-mic-nbody-block.sh b/mic/shmoo-mic-nbody-block.sh
new file mode 100755
index 0000000..28d2869
--- /dev/null
+++ b/mic/shmoo-mic-nbody-block.sh
@@ -0,0 +1,23 @@
+SRC=nbody-block.c
+EXE=nbody-block-mic
+MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
+MIC=mic0
+if [ $# -gt 0 ]
+  then
+    MIC=$1
+fi
+
+icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
+
+scp $EXE $MIC:~/
+scp $MICROOT/libiomp5.so $MIC:~/
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
+    K=$(($K*2))
+done
+
diff --git a/mic/shmoo-mic-nbody-ftz.sh b/mic/shmoo-mic-nbody-ftz.sh
new file mode 100755
index 0000000..256d00a
--- /dev/null
+++ b/mic/shmoo-mic-nbody-ftz.sh
@@ -0,0 +1,23 @@
+SRC=nbody-soa.c
+EXE=nbody-ftz-mic
+MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
+MIC=mic0
+if [ $# -gt 0 ]
+  then
+    MIC=$1
+fi
+
+icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC
+
+scp $EXE $MIC:~/
+scp $MICROOT/libiomp5.so $MIC:~/
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
+    K=$(($K*2))
+done
+
diff --git a/mic/shmoo-mic-nbody-orig.sh b/mic/shmoo-mic-nbody-orig.sh
new file mode 100755
index 0000000..4f010d0
--- /dev/null
+++ b/mic/shmoo-mic-nbody-orig.sh
@@ -0,0 +1,23 @@
+SRC=../nbody-orig.c
+EXE=nbody-orig-mic
+MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
+MIC=mic0
+if [ $# -gt 0 ]
+  then
+    MIC=$1
+fi
+
+icc -std=c99 -openmp -mmic -DSHMOO -o $EXE $SRC
+
+scp $EXE $MIC:~/
+scp $MICROOT/libiomp5.so $MIC:~/
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
+    K=$(($K*2))
+done
+
diff --git a/mic/shmoo-mic-nbody-soa.sh b/mic/shmoo-mic-nbody-soa.sh
new file mode 100755
index 0000000..f50674c
--- /dev/null
+++ b/mic/shmoo-mic-nbody-soa.sh
@@ -0,0 +1,23 @@
+SRC=nbody-soa.c
+EXE=nbody-soa-mic
+MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic
+MIC=mic0
+if [ $# -gt 0 ]
+  then
+    MIC=$1
+fi
+
+icc -std=c99 -openmp -mmic -DSHMOO -I../ -o $EXE $SRC
+
+scp $EXE $MIC:~/
+scp $MICROOT/libiomp5.so $MIC:~/
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K"
+    K=$(($K*2))
+done
+
diff --git a/nbody.c b/nbody.c
new file mode 100644
index 0000000..ab8eb63
--- /dev/null
+++ b/nbody.c
@@ -0,0 +1,81 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "timer.h"
+
+#define SOFTENING 1e-9f
+
+typedef struct { float x, y, z, vx, vy, vz; } Body;
+
+void randomizeBodies(float *data, int n) {
+  for (int i = 0; i < n; i++) {
+    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
+  }
+}
+
+void bodyForce(Body *p, float dt, int n) {
+  #pragma omp parallel for schedule(dynamic)
+  for (int i = 0; i < n; i++) { 
+    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
+
+    for (int j = 0; j < n; j++) {
+      float dx = p[j].x - p[i].x;
+      float dy = p[j].y - p[i].y;
+      float dz = p[j].z - p[i].z;
+      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
+      float invDist = 1.0f / sqrtf(distSqr);
+      float invDist3 = invDist * invDist * invDist;
+
+      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
+    }
+
+    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
+  }
+}
+
+int main(const int argc, const char** argv) {
+  
+  int nBodies = 30000;
+  if (argc > 1) nBodies = atoi(argv[1]);
+
+  const float dt = 0.01f; // time step
+  const int nIters = 10;  // simulation iterations
+
+  int bytes = nBodies*sizeof(Body);
+  float *buf = (float*)malloc(bytes);
+  Body *p = (Body*)buf;
+
+  randomizeBodies(buf, 6*nBodies); // Init pos / vel data
+
+  double totalTime = 0.0;
+
+  for (int iter = 1; iter <= nIters; iter++) {
+    StartTimer();
+
+    bodyForce(p, dt, nBodies); // compute interbody forces
+
+    for (int i = 0 ; i < nBodies; i++) { // integrate position
+      p[i].x += p[i].vx*dt;
+      p[i].y += p[i].vy*dt;
+      p[i].z += p[i].vz*dt;
+    }
+
+    const double tElapsed = GetTimer() / 1000.0;
+    if (iter > 1) { // First iter is warm up
+      totalTime += tElapsed; 
+    }
+#ifndef SHMOO
+    printf("Iteration %d: %.3f seconds\n", iter, tElapsed);
+#endif
+  }
+  double avgTime = totalTime / (double)(nIters-1); 
+
+#ifdef SHMOO
+  printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#else
+  printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n",
+         nIters, rate);
+  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime);
+#endif
+  free(buf);
+}
diff --git a/shmoo-cpu-nbody.sh b/shmoo-cpu-nbody.sh
new file mode 100755
index 0000000..a06358f
--- /dev/null
+++ b/shmoo-cpu-nbody.sh
@@ -0,0 +1,13 @@
+SRC=nbody.c
+EXE=nbody
+gcc -std=c99 -O3 -fopenmp -DSHMOO -o $EXE $SRC -lm
+
+echo $EXE
+
+K=1024
+for i in {1..10}
+do
+    ./$EXE $K
+    K=$(($K*2))
+done
+