-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
20 changed files
with
974 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,38 @@ | ||
mini-nbody | ||
========== | ||
mini-nbody: A simple N-body Code | ||
================================ | ||
|
||
A simple gravitational N-body simulation in less than 100 lines of C code, with CUDA optimizations. | ||
|
||
Benchmarks | ||
---------- | ||
|
||
There are 5 different benchmarks provided for CUDA and MIC platforms. | ||
|
||
1. nbody-orig : the original, unoptimized simulation (also for CPU) | ||
2. nbody-soa : Conversion from array of structures (AOS) data layout to structure of arrays (SOA) data layout | ||
3. nbody-flush : Flush denormals to zero (no code changes, just a command line option) | ||
4. nbody-block : Cache blocking | ||
5. nbody-unroll / nbody-align : platform specific final optimizations (loop unrolling in CUDA, and data alignment on MIC) | ||
|
||
Files | ||
----- | ||
|
||
nbody.c : simple, unoptimized OpenMP C code | ||
timer.h : simple cross-OS timing code | ||
|
||
Each directory below includes scripts for building and running a "shmoo" of five successive optimizations of the code over a range of data sizes from 1024 to 524,288 bodies. | ||
|
||
cuda/ : folder containing CUDA optimized versions of the original C code (in order of performance on Tesla K20c GPU) | ||
1. nbody-orig.cu : a straight port of the code to CUDA (shmoo-cuda-nbody-orig.sh) | ||
2. nbody-soa.cu : conversion to structure of arrays (SOA) data layout (shmoo-cuda-nbody-soa.sh) | ||
3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-cuda-nbody-ftz.sh) | ||
4. nbody-block.cu : cache blocking in CUDA shared memory (shmoo-cuda-nbody-block.sh) | ||
5. nbody-unroll.cu : addition of "#pragma unroll" to inner loop (shmoo-cuda-nbody-unroll.sh) | ||
|
||
mic/ : folder containing Intel Xeon Phi (MIC) optimized versions of the original C code (in order of performance on Xeon Phi 7110P) | ||
1. ../nbody-orig.cu : original code (shmoo-mic-nbody-orig.sh) | ||
2. nbody-soa.c : conversion to structure of arrays (SOA) data layout (shmoo-mic-nbody-soa.sh) | ||
3. nbody-soa.cu + ftz : Enable flush denorms to zero (shmoo-mic-nbody-ftz.sh) | ||
4. nbody-block.c : cache blocking via loop splitting (shmoo-mic-nbody-block.sh) | ||
5. nbody-align.c : aligned memory allocation and vector access (shmoo-mic-nbody-align.sh) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include "timer.h" | ||
|
||
#define BLOCK_SIZE 256 | ||
#define SOFTENING 1e-9f | ||
|
||
typedef struct { float4 *pos, *vel; } BodySystem; | ||
|
||
void randomizeBodies(float *data, int n) { | ||
for (int i = 0; i < n; i++) { | ||
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; | ||
} | ||
} | ||
|
||
__global__ | ||
void bodyForce(float4 *p, float4 *v, float dt, int n) { | ||
int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (i < n) { | ||
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; | ||
|
||
for (int tile = 0; tile < gridDim.x; tile++) { | ||
__shared__ float3 spos[BLOCK_SIZE]; | ||
float4 tpos = p[tile * blockDim.x + threadIdx.x]; | ||
spos[threadIdx.x] = make_float3(tpos.x, tpos.y, tpos.z); | ||
__syncthreads(); | ||
|
||
for (int j = 0; j < BLOCK_SIZE; j++) { | ||
float dx = spos[j].x - p[i].x; | ||
float dy = spos[j].y - p[i].y; | ||
float dz = spos[j].z - p[i].z; | ||
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; | ||
float invDist = rsqrtf(distSqr); | ||
float invDist3 = invDist * invDist * invDist; | ||
|
||
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; | ||
} | ||
__syncthreads(); | ||
} | ||
|
||
v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; | ||
} | ||
} | ||
|
||
int main(const int argc, const char** argv) { | ||
|
||
int nBodies = 30000; | ||
if (argc > 1) nBodies = atoi(argv[1]); | ||
|
||
const float dt = 0.01f; // time step | ||
const int nIters = 10; // simulation iterations | ||
|
||
int bytes = 2*nBodies*sizeof(float4); | ||
float *buf = (float*)malloc(bytes); | ||
BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; | ||
|
||
randomizeBodies(buf, 8*nBodies); // Init pos / vel data | ||
|
||
float *d_buf; | ||
cudaMalloc(&d_buf, bytes); | ||
BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; | ||
|
||
int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; | ||
double totalTime = 0.0; | ||
|
||
for (int iter = 1; iter <= nIters; iter++) { | ||
StartTimer(); | ||
|
||
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); | ||
bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies); | ||
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); | ||
|
||
for (int i = 0 ; i < nBodies; i++) { // integrate position | ||
p.pos[i].x += p.vel[i].x*dt; | ||
p.pos[i].y += p.vel[i].y*dt; | ||
p.pos[i].z += p.vel[i].z*dt; | ||
} | ||
|
||
const double tElapsed = GetTimer() / 1000.0; | ||
if (iter > 1) { // First iter is warm up | ||
totalTime += tElapsed; | ||
} | ||
#ifndef SHMOO | ||
printf("Iteration %d: %.3f seconds\n", iter, tElapsed); | ||
#endif | ||
} | ||
double avgTime = totalTime / (double)(nIters-1); | ||
|
||
#ifdef SHMOO | ||
printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#else | ||
printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", | ||
nIters, rate); | ||
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#endif | ||
free(buf); | ||
cudaFree(d_buf); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include "timer.h" | ||
|
||
#define BLOCK_SIZE 256 | ||
#define SOFTENING 1e-9f | ||
|
||
typedef struct { float x, y, z, vx, vy, vz; } Body; | ||
|
||
void randomizeBodies(float *data, int n) { | ||
for (int i = 0; i < n; i++) { | ||
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; | ||
} | ||
} | ||
|
||
__global__ | ||
void bodyForce(Body *p, float dt, int n) { | ||
int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (i < n) { | ||
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; | ||
|
||
for (int j = 0; j < n; j++) { | ||
float dx = p[j].x - p[i].x; | ||
float dy = p[j].y - p[i].y; | ||
float dz = p[j].z - p[i].z; | ||
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; | ||
float invDist = rsqrtf(distSqr); | ||
float invDist3 = invDist * invDist * invDist; | ||
|
||
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; | ||
} | ||
|
||
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz; | ||
} | ||
} | ||
|
||
int main(const int argc, const char** argv) { | ||
|
||
int nBodies = 30000; | ||
if (argc > 1) nBodies = atoi(argv[1]); | ||
|
||
const float dt = 0.01f; // time step | ||
const int nIters = 10; // simulation iterations | ||
|
||
int bytes = nBodies*sizeof(Body); | ||
float *buf = (float*)malloc(bytes); | ||
Body *p = (Body*)buf; | ||
|
||
randomizeBodies(buf, 6*nBodies); // Init pos / vel data | ||
|
||
float *d_buf; | ||
cudaMalloc(&d_buf, bytes); | ||
Body *d_p = (Body*)d_buf; | ||
|
||
int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; | ||
double totalTime = 0.0; | ||
|
||
for (int iter = 1; iter <= nIters; iter++) { | ||
StartTimer(); | ||
|
||
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); | ||
bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces | ||
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); | ||
|
||
for (int i = 0 ; i < nBodies; i++) { // integrate position | ||
p[i].x += p[i].vx*dt; | ||
p[i].y += p[i].vy*dt; | ||
p[i].z += p[i].vz*dt; | ||
} | ||
|
||
const double tElapsed = GetTimer() / 1000.0; | ||
if (iter > 1) { // First iter is warm up | ||
totalTime += tElapsed; | ||
} | ||
#ifndef SHMOO | ||
printf("Iteration %d: %.3f seconds\n", iter, tElapsed); | ||
#endif | ||
} | ||
double avgTime = totalTime / (double)(nIters-1); | ||
|
||
#ifdef SHMOO | ||
printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#else | ||
printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", | ||
nIters, rate); | ||
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#endif | ||
free(buf); | ||
cudaFree(d_buf); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include "timer.h" | ||
|
||
#define BLOCK_SIZE 256 | ||
#define SOFTENING 1e-9f | ||
|
||
typedef struct { float4 *pos, *vel; } BodySystem; | ||
|
||
void randomizeBodies(float *data, int n) { | ||
for (int i = 0; i < n; i++) { | ||
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; | ||
} | ||
} | ||
|
||
__global__ | ||
void bodyForce(float4 *p, float4 *v, float dt, int n) { | ||
int i = blockDim.x * blockIdx.x + threadIdx.x; | ||
if (i < n) { | ||
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f; | ||
|
||
for (int j = 0; j < n; j++) { | ||
float dx = p[j].x - p[i].x; | ||
float dy = p[j].y - p[i].y; | ||
float dz = p[j].z - p[i].z; | ||
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING; | ||
float invDist = rsqrtf(distSqr); | ||
float invDist3 = invDist * invDist * invDist; | ||
|
||
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; | ||
} | ||
|
||
v[i].x += dt*Fx; v[i].y += dt*Fy; v[i].z += dt*Fz; | ||
} | ||
} | ||
|
||
int main(const int argc, const char** argv) { | ||
|
||
int nBodies = 30000; | ||
if (argc > 1) nBodies = atoi(argv[1]); | ||
|
||
const float dt = 0.01f; // time step | ||
const int nIters = 10; // simulation iterations | ||
|
||
int bytes = 2*nBodies*sizeof(float4); | ||
float *buf = (float*)malloc(bytes); | ||
BodySystem p = { (float4*)buf, ((float4*)buf) + nBodies }; | ||
|
||
randomizeBodies(buf, 8*nBodies); // Init pos / vel data | ||
|
||
float *d_buf; | ||
cudaMalloc(&d_buf, bytes); | ||
BodySystem d_p = { (float4*)d_buf, ((float4*)d_buf) + nBodies }; | ||
|
||
int nBlocks = (nBodies + BLOCK_SIZE - 1) / BLOCK_SIZE; | ||
double totalTime = 0.0; | ||
|
||
for (int iter = 1; iter <= nIters; iter++) { | ||
StartTimer(); | ||
|
||
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice); | ||
bodyForce<<<nBlocks, BLOCK_SIZE>>>(d_p.pos, d_p.vel, dt, nBodies); | ||
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); | ||
|
||
for (int i = 0 ; i < nBodies; i++) { // integrate position | ||
p.pos[i].x += p.vel[i].x*dt; | ||
p.pos[i].y += p.vel[i].y*dt; | ||
p.pos[i].z += p.vel[i].z*dt; | ||
} | ||
|
||
const double tElapsed = GetTimer() / 1000.0; | ||
if (iter > 1) { // First iter is warm up | ||
totalTime += tElapsed; | ||
} | ||
#ifndef SHMOO | ||
printf("Iteration %d: %.3f seconds\n", iter, tElapsed); | ||
#endif | ||
} | ||
double avgTime = totalTime / (double)(nIters-1); | ||
|
||
#ifdef SHMOO | ||
printf("%d, %0.3f\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#else | ||
printf("Average rate for iterations 2 through %d: %.3f +- %.3f steps per second.\n", | ||
nIters, rate); | ||
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, 1e-9 * nBodies * nBodies / avgTime); | ||
#endif | ||
free(buf); | ||
cudaFree(d_buf); | ||
} |
Oops, something went wrong.