From d96b97ea64b3fcd17a7844e2839bb1845610791e Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 20 Feb 2014 13:53:35 +1000 Subject: [PATCH] Initial Commit --- README.md | 38 +++++++++++- cuda/nbody-block.cu | 99 +++++++++++++++++++++++++++++++ cuda/nbody-orig.cu | 91 +++++++++++++++++++++++++++++ cuda/nbody-soa.cu | 91 +++++++++++++++++++++++++++++ cuda/nbody-unroll.cu | 100 ++++++++++++++++++++++++++++++++ cuda/shmoo-cuda-nbody-block.sh | 14 +++++ cuda/shmoo-cuda-nbody-ftz.sh | 14 +++++ cuda/shmoo-cuda-nbody-orig.sh | 14 +++++ cuda/shmoo-cuda-nbody-soa.sh | 14 +++++ cuda/shmoo-cuda-nbody-unroll.sh | 14 +++++ mic/nbody-align.c | 100 ++++++++++++++++++++++++++++++++ mic/nbody-block.c | 93 +++++++++++++++++++++++++++++ mic/nbody-soa.c | 85 +++++++++++++++++++++++++++ mic/shmoo-mic-nbody-align.sh | 23 ++++++++ mic/shmoo-mic-nbody-block.sh | 23 ++++++++ mic/shmoo-mic-nbody-ftz.sh | 23 ++++++++ mic/shmoo-mic-nbody-orig.sh | 23 ++++++++ mic/shmoo-mic-nbody-soa.sh | 23 ++++++++ nbody.c | 81 ++++++++++++++++++++++++++ shmoo-cpu-nbody.sh | 13 +++++ 20 files changed, 974 insertions(+), 2 deletions(-) create mode 100644 cuda/nbody-block.cu create mode 100644 cuda/nbody-orig.cu create mode 100644 cuda/nbody-soa.cu create mode 100644 cuda/nbody-unroll.cu create mode 100755 cuda/shmoo-cuda-nbody-block.sh create mode 100755 cuda/shmoo-cuda-nbody-ftz.sh create mode 100755 cuda/shmoo-cuda-nbody-orig.sh create mode 100755 cuda/shmoo-cuda-nbody-soa.sh create mode 100755 cuda/shmoo-cuda-nbody-unroll.sh create mode 100644 mic/nbody-align.c create mode 100644 mic/nbody-block.c create mode 100644 mic/nbody-soa.c create mode 100755 mic/shmoo-mic-nbody-align.sh create mode 100755 mic/shmoo-mic-nbody-block.sh create mode 100755 mic/shmoo-mic-nbody-ftz.sh create mode 100755 mic/shmoo-mic-nbody-orig.sh create mode 100755 mic/shmoo-mic-nbody-soa.sh create mode 100644 nbody.c create mode 100755 shmoo-cpu-nbody.sh diff --git a/README.md b/README.md index 29f0932..c33fa8a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,38 @@ -mini-nbody -========== +mini-nbody: A simple N-body Code +================================ A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations. + +Benchmarks +---------- + +There are 5 different benchmarks provided for CUDA and MIC platforms. + +1. nbody-orig : the original, unoptimized simulation (also for CPU) +2. nbody-soa : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout +3. nbody-flush : Flush denormals to zero (no code changes, just a command line option) +4. nbody-block : Cache blocking +5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC) + +Files +----- + +nbody.c : simple, unoptimized OpenMP C code +timer.h : simple cross-OS timing code + +Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies. + +cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU) + 1. nbody-orig.cu : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh) + 2. nbody-soa.cu : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh) + 3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh) + 4. nbody-block.cu : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh) + 5. nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh) + +mic/ : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P) + 1. ../nbody-orig.cu : original code (shmoo-mic-nbody-orig.sh) + 2. nbody-soa.c : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh) + 3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh) + 4. nbody-block.c : cache blocking via loop splitting (shmoo-mic-nbody-block.sh) + 5. nbody-align.c : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh) + diff --git a/cuda/nbody-block.cu b/cuda/nbody-block.cu new file mode 100644 index 0000000..1466bbd --- /dev/null +++ b/cuda/nbody-block.cu @@ -0,0 +1,99 @@ +#include +#include +#include +#include "timer.h" + +#define BLOCK_SIZE 256 +#define SOFTENING 1e-9f + +typedef struct { float4 *pos, *vel; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + +__global__ +void bodyForce(float4 *p, float4 *v, float dt, int n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < n) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int tile = 0; tile < gridDim.x; tile++) { + __shared__ float3 spos[BLOCK_SIZE]; + float4 tpos = p[tile * blockDim.x + threadIdx.x]; + spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z); + __syncthreads(); + + for (int j = 0; j < BLOCK_SIZE; j++) { + float dx = spos[j].x - p[i].x; + float dy = spos[j].y - p[i].y; + float dz = spos[j].z - p[i].z; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + __syncthreads(); + } + + v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = 2*nBodies*sizeof(float4); + float *buf = (float*)malloc(bytes); + BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; + + randomizeBodies(buf, 8*nBodies); // Init pos / vel data + + float *d_buf; + cudaMalloc(&d_buf, bytes); + BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; + + int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); + bodyForce<<>>(d_p.pos, d_p.vel, dt, nBodies); + cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.pos[i].x += p.vel[i].x*dt; + p.pos[i].y += p.vel[i].y*dt; + p.pos[i].z += p.vel[i].z*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); + cudaFree(d_buf); +} diff --git a/cuda/nbody-orig.cu b/cuda/nbody-orig.cu new file mode 100644 index 0000000..3645686 --- /dev/null +++ b/cuda/nbody-orig.cu @@ -0,0 +1,91 @@ +#include +#include +#include +#include "timer.h" + +#define BLOCK_SIZE 256 +#define SOFTENING 1e-9f + +typedef struct { float x, y, z, vx, vy, vz; } Body; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + +__global__ +void bodyForce(Body *p, float dt, int n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < n) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int j = 0; j < n; j++) { + float dx = p[j].x - p[i].x; + float dy = p[j].y - p[i].y; + float dz = p[j].z - p[i].z; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = nBodies*sizeof(Body); + float *buf = (float*)malloc(bytes); + Body *p = (Body*)buf; + + randomizeBodies(buf, 6*nBodies); // Init pos / vel data + + float *d_buf; + cudaMalloc(&d_buf, bytes); + Body *d_p = (Body*)d_buf; + + int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); + bodyForce<<>>(d_p, dt, nBodies); // compute interbody forces + cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p[i].x += p[i].vx*dt; + p[i].y += p[i].vy*dt; + p[i].z += p[i].vz*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); + cudaFree(d_buf); +} diff --git a/cuda/nbody-soa.cu b/cuda/nbody-soa.cu new file mode 100644 index 0000000..0da50e6 --- /dev/null +++ b/cuda/nbody-soa.cu @@ -0,0 +1,91 @@ +#include +#include +#include +#include "timer.h" + +#define BLOCK_SIZE 256 +#define SOFTENING 1e-9f + +typedef struct { float4 *pos, *vel; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + +__global__ +void bodyForce(float4 *p, float4 *v, float dt, int n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < n) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int j = 0; j < n; j++) { + float dx = p[j].x - p[i].x; + float dy = p[j].y - p[i].y; + float dz = p[j].z - p[i].z; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = 2*nBodies*sizeof(float4); + float *buf = (float*)malloc(bytes); + BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; + + randomizeBodies(buf, 8*nBodies); // Init pos / vel data + + float *d_buf; + cudaMalloc(&d_buf, bytes); + BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; + + int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); + bodyForce<<>>(d_p.pos, d_p.vel, dt, nBodies); + cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.pos[i].x += p.vel[i].x*dt; + p.pos[i].y += p.vel[i].y*dt; + p.pos[i].z += p.vel[i].z*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); + cudaFree(d_buf); +} diff --git a/cuda/nbody-unroll.cu b/cuda/nbody-unroll.cu new file mode 100644 index 0000000..f99bf07 --- /dev/null +++ b/cuda/nbody-unroll.cu @@ -0,0 +1,100 @@ +#include +#include +#include +#include "timer.h" + +#define BLOCK_SIZE 256 +#define SOFTENING 1e-9f + +typedef struct { float4 *pos, *vel; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + +__global__ +void bodyForce(float4 *p, float4 *v, float dt, int n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < n) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int tile = 0; tile < gridDim.x; tile++) { + __shared__ float3 spos[BLOCK_SIZE]; + float4 tpos = p[tile * blockDim.x + threadIdx.x]; + spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z); + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; j++) { + float dx = spos[j].x - p[i].x; + float dy = spos[j].y - p[i].y; + float dz = spos[j].z - p[i].z; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = rsqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + __syncthreads(); + } + + v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = 2*nBodies*sizeof(float4); + float *buf = (float*)malloc(bytes); + BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; + + randomizeBodies(buf, 8*nBodies); // Init pos / vel data + + float *d_buf; + cudaMalloc(&d_buf, bytes); + BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; + + int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); + bodyForce<<>>(d_p.pos, d_p.vel, dt, nBodies); + cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.pos[i].x += p.vel[i].x*dt; + p.pos[i].y += p.vel[i].y*dt; + p.pos[i].z += p.vel[i].z*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); + cudaFree(d_buf); +} diff --git a/cuda/shmoo-cuda-nbody-block.sh b/cuda/shmoo-cuda-nbody-block.sh new file mode 100755 index 0000000..0eb4825 --- /dev/null +++ b/cuda/shmoo-cuda-nbody-block.sh @@ -0,0 +1,14 @@ +SRC=nbody-block.cu +EXE=nbody-block + +nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done + diff --git a/cuda/shmoo-cuda-nbody-ftz.sh b/cuda/shmoo-cuda-nbody-ftz.sh new file mode 100755 index 0000000..8263feb --- /dev/null +++ b/cuda/shmoo-cuda-nbody-ftz.sh @@ -0,0 +1,14 @@ +SRC=nbody-soa.cu +EXE=nbody-ftz + +nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done + diff --git a/cuda/shmoo-cuda-nbody-orig.sh b/cuda/shmoo-cuda-nbody-orig.sh new file mode 100755 index 0000000..a0d5b4c --- /dev/null +++ b/cuda/shmoo-cuda-nbody-orig.sh @@ -0,0 +1,14 @@ +SRC=nbody-orig.cu +EXE=nbody-orig + +nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done + diff --git a/cuda/shmoo-cuda-nbody-soa.sh b/cuda/shmoo-cuda-nbody-soa.sh new file mode 100755 index 0000000..5a6e5d7 --- /dev/null +++ b/cuda/shmoo-cuda-nbody-soa.sh @@ -0,0 +1,14 @@ +SRC=nbody-soa.cu +EXE=nbody-soa + +nvcc -arch=sm_35 -I../ -DSHMOO -o $EXE $SRC + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done + diff --git a/cuda/shmoo-cuda-nbody-unroll.sh b/cuda/shmoo-cuda-nbody-unroll.sh new file mode 100755 index 0000000..fb12c54 --- /dev/null +++ b/cuda/shmoo-cuda-nbody-unroll.sh @@ -0,0 +1,14 @@ +SRC=nbody-unroll.cu +EXE=nbody-unroll + +nvcc -arch=sm_35 -ftz=true -I../ -o $EXE $SRC -DSHMOO + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done + diff --git a/mic/nbody-align.c b/mic/nbody-align.c new file mode 100644 index 0000000..7fa9303 --- /dev/null +++ b/mic/nbody-align.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include "timer.h" + +#define CACHELINE 64 // size of cache line [bytes] +#define SOFTENING 1e-9f + +typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + + +void bodyForce(BodySystem p, float dt, int n, int tileSize) { + for (int tile = 0; tile < n; tile += tileSize) { + int to = tile + tileSize; + if (to > n) to = n; + + #pragma omp parallel for schedule(dynamic) + for (int i = 0; i < n; i++) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + #pragma vector aligned + #pragma simd + for (int j = tile; j < to; j++) { + float dy = p.y[j] - p.y[i]; + float dz = p.z[j] - p.z[i]; + float dx = p.x[j] - p.x[i]; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = 1.0f / sqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; + } + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + int tileSize = 24400; + if (tileSize > nBodies) tileSize = nBodies; + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + if ( tileSize % (CACHELINE/sizeof(float)) ) { + printf("ERROR: blockSize not multiple of %d vector elements\n", CACHELINE/(int)sizeof(float)); + exit(1); + } + + int bytes = 6*nBodies*sizeof(float); + float *buf = (float*)_mm_malloc(bytes, CACHELINE); + BodySystem p; + p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; + p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; + + randomizeBodies(buf, 6*nBodies); // Init pos / vel data + + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + bodyForce(p, dt, nBodies, tileSize); // compute interbody forces + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.x[i] += p.vx[i]*dt; + p.y[i] += p.vy[i]*dt; + p.z[i] += p.vz[i]*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + _mm_free(buf); +} diff --git a/mic/nbody-block.c b/mic/nbody-block.c new file mode 100644 index 0000000..2477a9a --- /dev/null +++ b/mic/nbody-block.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include "timer.h" + +#define CACHELINE 64 // size of cache line [bytes] +#define SOFTENING 1e-9f + +typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + + +void bodyForce(BodySystem p, float dt, int n, int tileSize) { + for (int tile = 0; tile < n; tile += tileSize) { + int to = tile + tileSize; + if (to > n) to = n; + + #pragma omp parallel for schedule(dynamic) + for (int i = 0; i < n; i++) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int j = tile; j < to; j++) { + float dy = p.y[j] - p.y[i]; + float dz = p.z[j] - p.z[i]; + float dx = p.x[j] - p.x[i]; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = 1.0f / sqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; + } + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + int tileSize = 24400; + if (tileSize > nBodies) tileSize = nBodies; + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = 6*nBodies*sizeof(float); + float *buf = (float*)malloc(bytes); + BodySystem p; + p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; + p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; + + randomizeBodies(buf, 6*nBodies); // Init pos / vel data + + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + bodyForce(p, dt, nBodies, tileSize); // compute interbody forces + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.x[i] += p.vx[i]*dt; + p.y[i] += p.vy[i]*dt; + p.z[i] += p.vz[i]*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); +} diff --git a/mic/nbody-soa.c b/mic/nbody-soa.c new file mode 100644 index 0000000..33c1615 --- /dev/null +++ b/mic/nbody-soa.c @@ -0,0 +1,85 @@ +#include +#include +#include +#include "timer.h" + + +#define SOFTENING 1e-9f + +typedef struct { float *x, *y, *z, *vx, *vy, *vz; } BodySystem; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + + +void bodyForce(BodySystem p, float dt, int n) { + #pragma omp parallel for schedule(dynamic) + for (int i = 0; i < n; i++) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int j = 0; j < n; j++) { + float dy = p.y[j] - p.y[i]; + float dz = p.z[j] - p.z[i]; + float dx = p.x[j] - p.x[i]; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = 1.0f / sqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + p.vx[i] += dt*Fx; p.vy[i] += dt*Fy; p.vz[i] += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = 6*nBodies*sizeof(float); + float *buf = (float*)malloc(bytes); + BodySystem p; + p.x = buf+0*nBodies; p.y = buf+1*nBodies; p.z = buf+2*nBodies; + p.vx = buf+3*nBodies; p.vy = buf+4*nBodies; p.vz = buf+5*nBodies; + + randomizeBodies(buf, 6*nBodies); // Init pos / vel data + + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + bodyForce(p, dt, nBodies); // compute interbody forces + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p.x[i] += p.vx[i]*dt; + p.y[i] += p.vy[i]*dt; + p.z[i] += p.vz[i]*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); +} diff --git a/mic/shmoo-mic-nbody-align.sh b/mic/shmoo-mic-nbody-align.sh new file mode 100755 index 0000000..83f40ca --- /dev/null +++ b/mic/shmoo-mic-nbody-align.sh @@ -0,0 +1,23 @@ +SRC=nbody-align.c +EXE=nbody-align-mic +MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic +MIC=mic0 +if [ $# -gt 0 ] + then + MIC=$1 +fi + +icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC + +scp $EXE $MIC:~/ +scp $MICROOT/libiomp5.so $MIC:~/ + +echo $EXE + +K=1024 +for i in {1..10} +do + ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" + K=$(($K*2)) +done + diff --git a/mic/shmoo-mic-nbody-block.sh b/mic/shmoo-mic-nbody-block.sh new file mode 100755 index 0000000..28d2869 --- /dev/null +++ b/mic/shmoo-mic-nbody-block.sh @@ -0,0 +1,23 @@ +SRC=nbody-block.c +EXE=nbody-block-mic +MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic +MIC=mic0 +if [ $# -gt 0 ] + then + MIC=$1 +fi + +icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC + +scp $EXE $MIC:~/ +scp $MICROOT/libiomp5.so $MIC:~/ + +echo $EXE + +K=1024 +for i in {1..10} +do + ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" + K=$(($K*2)) +done + diff --git a/mic/shmoo-mic-nbody-ftz.sh b/mic/shmoo-mic-nbody-ftz.sh new file mode 100755 index 0000000..256d00a --- /dev/null +++ b/mic/shmoo-mic-nbody-ftz.sh @@ -0,0 +1,23 @@ +SRC=nbody-soa.c +EXE=nbody-ftz-mic +MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic +MIC=mic0 +if [ $# -gt 0 ] + then + MIC=$1 +fi + +icc -std=c99 -openmp -mmic -fimf-domain-exclusion=8 -DSHMOO -I../ -o $EXE $SRC + +scp $EXE $MIC:~/ +scp $MICROOT/libiomp5.so $MIC:~/ + +echo $EXE + +K=1024 +for i in {1..10} +do + ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" + K=$(($K*2)) +done + diff --git a/mic/shmoo-mic-nbody-orig.sh b/mic/shmoo-mic-nbody-orig.sh new file mode 100755 index 0000000..4f010d0 --- /dev/null +++ b/mic/shmoo-mic-nbody-orig.sh @@ -0,0 +1,23 @@ +SRC=../nbody-orig.c +EXE=nbody-orig-mic +MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic +MIC=mic0 +if [ $# -gt 0 ] + then + MIC=$1 +fi + +icc -std=c99 -openmp -mmic -DSHMOO -o $EXE $SRC + +scp $EXE $MIC:~/ +scp $MICROOT/libiomp5.so $MIC:~/ + +echo $EXE + +K=1024 +for i in {1..10} +do + ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" + K=$(($K*2)) +done + diff --git a/mic/shmoo-mic-nbody-soa.sh b/mic/shmoo-mic-nbody-soa.sh new file mode 100755 index 0000000..f50674c --- /dev/null +++ b/mic/shmoo-mic-nbody-soa.sh @@ -0,0 +1,23 @@ +SRC=nbody-soa.c +EXE=nbody-soa-mic +MICROOT=/shared/apps/rhel-6.2/intel/ics-2013/composerxe/lib/mic +MIC=mic0 +if [ $# -gt 0 ] + then + MIC=$1 +fi + +icc -std=c99 -openmp -mmic -DSHMOO -I../ -o $EXE $SRC + +scp $EXE $MIC:~/ +scp $MICROOT/libiomp5.so $MIC:~/ + +echo $EXE + +K=1024 +for i in {1..10} +do + ssh $MIC "export LD_LIBRARY_PATH=~/:$LD_LIBRARY_PATH; ./$EXE $K" + K=$(($K*2)) +done + diff --git a/nbody.c b/nbody.c new file mode 100644 index 0000000..ab8eb63 --- /dev/null +++ b/nbody.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include "timer.h" + +#define SOFTENING 1e-9f + +typedef struct { float x, y, z, vx, vy, vz; } Body; + +void randomizeBodies(float *data, int n) { + for (int i = 0; i < n; i++) { + data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; + } +} + +void bodyForce(Body *p, float dt, int n) { + #pragma omp parallel for schedule(dynamic) + for (int i = 0; i < n; i++) { + float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; + + for (int j = 0; j < n; j++) { + float dx = p[j].x - p[i].x; + float dy = p[j].y - p[i].y; + float dz = p[j].z - p[i].z; + float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; + float invDist = 1.0f / sqrtf(distSqr); + float invDist3 = invDist * invDist * invDist; + + Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; + } + + p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; + } +} + +int main(const int argc, const char** argv) { + + int nBodies = 30000; + if (argc > 1) nBodies = atoi(argv[1]); + + const float dt = 0.01f; // time step + const int nIters = 10; // simulation iterations + + int bytes = nBodies*sizeof(Body); + float *buf = (float*)malloc(bytes); + Body *p = (Body*)buf; + + randomizeBodies(buf, 6*nBodies); // Init pos / vel data + + double totalTime = 0.0; + + for (int iter = 1; iter <= nIters; iter++) { + StartTimer(); + + bodyForce(p, dt, nBodies); // compute interbody forces + + for (int i = 0 ; i < nBodies; i++) { // integrate position + p[i].x += p[i].vx*dt; + p[i].y += p[i].vy*dt; + p[i].z += p[i].vz*dt; + } + + const double tElapsed = GetTimer() / 1000.0; + if (iter > 1) { // First iter is warm up + totalTime += tElapsed; + } +#ifndef SHMOO + printf("Iteration %d: %.3f seconds\n", iter, tElapsed); +#endif + } + double avgTime = totalTime / (double)(nIters-1); + +#ifdef SHMOO + printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#else + printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", + nIters, rate); + printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); +#endif + free(buf); +} diff --git a/shmoo-cpu-nbody.sh b/shmoo-cpu-nbody.sh new file mode 100755 index 0000000..a06358f --- /dev/null +++ b/shmoo-cpu-nbody.sh @@ -0,0 +1,13 @@ +SRC=nbody.c +EXE=nbody +gcc -std=c99 -O3 -fopenmp -DSHMOO -o $EXE $SRC -lm + +echo $EXE + +K=1024 +for i in {1..10} +do + ./$EXE $K + K=$(($K*2)) +done +