diff --git a/README.md b/README.md index 98dd9a8..bba3282 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,38 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Name: Zhan Xiong Chin +* Tested on: Windows 7 Professional, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70 GHz, GTX 1070 8192MB (SIG Lab) -### (TODO: Your README) +![](images/boids2.gif) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +10000 particle simulation + +![](images/boids.gif) + +500000 particle simulation + + +Build Instructions +================== +[See here](https://github.com/CIS565-Fall-2016/Project0-CUDA-Getting-Started/blob/master/INSTRUCTION.md) + +Performance analysis +==================== + +| | 5000 particles | 10000 particles | 50000 particles | 500000 particles | +|------------------------|----------------|-----------------|-----------------|------------------| +| Naive search | 700 fps | 350 fps | 20 fps | (crashes) | +| Scattered uniform grid | 770 fps | 1100 fps | 490 fps | 6 fps | +| Coherent uniform grid | 770 fps | 1100 fps | 1100 fps | 72 fps | + +* For the naive search, increasing the number of boids decreases performance, which is expected: each boid's velocity + update must check every other boid, so the work grows quadratically with the number of boids. +* However, for the uniform grids, performance increases when going from 5000 to 10000 particles and only decreases +after that. This may be related to branching effects: with only 5000 boids, many grid cells are empty or hold very few boids, so much of the per-cell work in a warp diverges instead of contributing useful neighbor checks. + +* Increasing the block size does not change the performance of the uniform grid implementation significantly. This is +because the overall utilization of the device remains similar. + +* The coherent uniform grid showed a significant performance improvement over the scattered uniform grid. This was expected, +since reordering the boid data so that each cell's boids are contiguous in memory takes better advantage of caching and removes the extra indirection through the particle index array. \ No newline at end of file diff --git a/images/boids.gif b/images/boids.gif new file mode 100644 index 0000000..5c5d6b8 Binary files /dev/null and b/images/boids.gif differ diff --git a/images/boids2.gif b/images/boids2.gif new file mode 100644 index 0000000..f6bc0dc Binary files /dev/null and b/images/boids2.gif differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..dff0113 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/src/kernel.cu b/src/kernel.cu index aaf0fbf..200b97c 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -83,8 +83,9 @@ thrust::device_ptr<int> dev_thrust_particleGridIndices; int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs int *dev_gridCellEndIndices; // to this cell? -// TODO-2.3 - consider what additional buffers you might need to reshuffle +// DONE-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_pos2; // LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation @@ -168,7 +169,21 @@ void Boids::initSimulation(int N) { gridMinimum.y -= halfGridWidth; gridMinimum.z -= halfGridWidth; - // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + // DONE-2.1 DONE-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices); + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!"); + cudaThreadSynchronize(); } @@ -230,21 +245,59 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 velocityChange(0.0f, 0.0f, 0.0f); + glm::vec3 ownPosition = pos[iSelf]; + glm::vec3 sumPosition(0.0f, 0.0f, 0.0f); + glm::vec3 sumStayAway(0.0f, 0.0f, 0.0f); + glm::vec3 sumVelocity(0.0f, 0.0f, 0.0f); + float nearbyCount = 0.0; + for (int i = 0; i < N; i++) { + if (i == iSelf) { + continue; + } + float distance = glm::length(ownPosition - pos[i]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + nearbyCount += 1.0; + sumPosition += pos[i]; + } + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + sumStayAway += ownPosition - pos[i]; + } + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + sumVelocity += vel[i]; + } + } + if (nearbyCount > 0) { + sumPosition = sumPosition / nearbyCount; + velocityChange += (sumPosition - ownPosition) * rule1Scale; + } + velocityChange += sumStayAway * rule2Scale; + velocityChange += sumVelocity * rule3Scale; + return velocityChange; } /** -* TODO-1.2 implement basic flocking +* DONE-1.2 implement basic flocking * For each of the `N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { // Compute a new velocity based on pos and vel1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 newVelocity = vel1[index] + computeVelocityChange(N, index, pos, vel1); // Clamp the speed + float speed = glm::length(newVelocity); + if (speed > maxSpeed) { + newVelocity = (newVelocity / speed) * maxSpeed; + } // Record the new velocity into vel2. Question: why NOT vel1? 
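+ // Writing to vel2 instead of vel1 double-buffers the velocities: every thread in this launch
+ // still reads its neighbors' old values from vel1, so no boid sees a half-updated velocity
+ // from another thread. The host swaps the vel1/vel2 pointers after the kernel finishes.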
+ vel2[index] = newVelocity; } /** @@ -285,10 +338,17 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 - // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + // DONE-2.1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // - Label each boid with the index of its grid cell. + glm::ivec3 gridPos = (pos[index] - gridMin) * inverseCellWidth; + gridIndices[index] = gridIndex3Dto1D(gridPos.x, gridPos.y, gridPos.z, gridResolution); + // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -302,10 +362,29 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 + // DONE-2.1 // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int gridIndex = particleGridIndices[index]; + if (index != N - 1) { + if (gridIndex != particleGridIndices[index + 1]) { + gridCellEndIndices[gridIndex] = index; + } + } else { + gridCellEndIndices[gridIndex] = index; + } + if (index != 0) { + if (gridIndex != particleGridIndices[index - 1]) { + gridCellStartIndices[gridIndex] = index; + } + } else { + gridCellStartIndices[gridIndex] = index; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -314,14 +393,76 @@ __global__ void kernUpdateVelNeighborSearchScattered( int *gridCellStartIndices, int *gridCellEndIndices, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // DONE-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } // - Identify the grid cell that this particle is in + glm::ivec3 gridPos = (pos[index] - gridMin) * inverseCellWidth; + int gridIndex = gridIndex3Dto1D(gridPos.x, gridPos.y, gridPos.z, gridResolution); // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. + glm::vec3 velocityChange(0.0f, 0.0f, 0.0f); + glm::vec3 ownPosition = pos[index]; + glm::vec3 sumPosition(0.0f, 0.0f, 0.0f); + glm::vec3 sumStayAway(0.0f, 0.0f, 0.0f); + glm::vec3 sumVelocity(0.0f, 0.0f, 0.0f); + float nearbyCount = 0.0; + glm::vec3 midpoints = (((glm::vec3)gridPos + glm::vec3(0.5f, 0.5f, 0.5f)) * cellWidth) + gridMin; + int xBegin = (pos[index].x < midpoints.x) ? -1 : 0; + int yBegin = (pos[index].y < midpoints.y) ? -1 : 0; + int zBegin = (pos[index].z < midpoints.z) ? 
-1 : 0; + int gridCellCount = gridResolution * gridResolution * gridResolution; + for (int dz = 0; dz <= 1; dz++) { + for (int dy = 0; dy <= 1; dy++) { + for (int dx = 0; dx <= 1; dx++) { + int otherGridIndex = gridIndex + (dx + xBegin) + gridResolution * (dy + yBegin) + gridResolution * gridResolution * (dz + zBegin); + if (otherGridIndex < 0 || otherGridIndex >= gridCellCount) continue; + // - For each cell, read the start/end indices in the boid pointer array. + int startIndex = gridCellStartIndices[otherGridIndex]; + int endIndex = gridCellEndIndices[otherGridIndex]; + if (startIndex == -1) { + continue; + } + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + for (int j = startIndex; j <= endIndex; j++) { + int otherBoidIndex = particleArrayIndices[j]; + if (otherBoidIndex == index) { + continue; + } + float distance = glm::length(ownPosition - pos[otherBoidIndex]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + nearbyCount += 1.0; + sumPosition += pos[otherBoidIndex]; + } + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + sumStayAway += ownPosition - pos[otherBoidIndex]; + } + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + sumVelocity += vel1[otherBoidIndex]; + } + } + } + } + } + if (nearbyCount > 0) { + sumPosition = sumPosition / nearbyCount; + velocityChange += (sumPosition - ownPosition) * rule1Scale; + } + velocityChange += sumStayAway * rule2Scale; + velocityChange += sumVelocity * rule3Scale; // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 newVelocity = vel1[index] + velocityChange; + float speed = glm::length(newVelocity); + if (speed > maxSpeed) { + newVelocity = (newVelocity / speed) * maxSpeed; + } + vel2[index] = newVelocity; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -329,59 +470,168 @@ __global__ void kernUpdateVelNeighborSearchCoherent( float inverseCellWidth, float cellWidth, int *gridCellStartIndices, int *gridCellEndIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // DONE-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, // except with one less level of indirection. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } // This should expect gridCellStartIndices and gridCellEndIndices to refer // directly to pos and vel1. // - Identify the grid cell that this particle is in + glm::ivec3 gridPos = (pos[index] - gridMin) * inverseCellWidth; + int gridIndex = gridIndex3Dto1D(gridPos.x, gridPos.y, gridPos.z, gridResolution); // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. + glm::vec3 velocityChange(0.0f, 0.0f, 0.0f); + glm::vec3 sumPosition(0.0f, 0.0f, 0.0f); + glm::vec3 sumStayAway(0.0f, 0.0f, 0.0f); + glm::vec3 sumVelocity(0.0f, 0.0f, 0.0f); + glm::vec3 ownPos = pos[index]; + float nearbyCount = 0.0; + // DIFFERENCE: For best results, consider what order the cells should be // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. 
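+ // The grid cells are twice the neighborhood distance wide (the "2x width grids" used for
+ // this search), so a boid's neighborhood can only overlap the 2x2x2 block of cells on the
+ // side of its own cell that the boid sits in. Comparing the position against the cell's
+ // midpoint picks that half: an offset of -1 starts the 2-cell window at the previous cell
+ // along that axis, 0 starts it at the current cell.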
+ glm::vec3 midpoints = (((glm::vec3)gridPos + glm::vec3(0.5f, 0.5f, 0.5f)) * cellWidth) + gridMin; + int xOffset = (ownPos.x < midpoints.x) ? -1 : 0; + int yOffset = (ownPos.y < midpoints.y) ? -1 : 0; + int zOffset = (ownPos.z < midpoints.z) ? -1 : 0; + for (int dz = 0; dz <= 1; dz++) { + for (int dy = 0; dy <= 1; dy++) { + for (int dx = 0; dx <= 1; dx++) { + int otherGridIndex = gridIndex + + (dx + xOffset) + + gridResolution * (dy + yOffset) + + gridResolution * gridResolution * (dz + zOffset); + if (otherGridIndex < 0 || otherGridIndex >= gridResolution * gridResolution * gridResolution) continue; + // - For each cell, read the start/end indices in the boid pointer array. + int startIndex = gridCellStartIndices[otherGridIndex]; + int endIndex = gridCellEndIndices[otherGridIndex]; + if (startIndex == -1) { + continue; + } + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + for (int j = startIndex; j <= endIndex; j++) { + if (j == index) { + continue; + } + float distance = glm::length(ownPos - pos[j]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + nearbyCount += 1.0; + sumPosition += pos[j]; + } + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + sumStayAway += ownPos - pos[j]; + } + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + sumVelocity += vel1[j]; + } + } + } + } + } + if (nearbyCount > 0) { + sumPosition = sumPosition / nearbyCount; + velocityChange += (sumPosition - ownPos) * rule1Scale; + } + velocityChange += sumStayAway * rule2Scale; + velocityChange += sumVelocity * rule3Scale; // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 newVelocity = vel1[index] + velocityChange; + float speed = glm::length(newVelocity); + if (speed > maxSpeed) { + newVelocity = glm::normalize(newVelocity) * maxSpeed; + } + vel2[index] = newVelocity; +} + +__global__ void kernCopyPosVelToCoherentArray(int N, int * particleArrayIndices, + glm::vec3 * pos, glm::vec3 * coherent_pos, glm::vec3 * vel, glm::vec3 * coherent_vel) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // Copy pos and vel to new locations + int oldIndex = particleArrayIndices[index]; + coherent_pos[index] = pos[oldIndex]; + coherent_vel[index] = vel[oldIndex]; } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + // DONE-1.2 - use the kernels you wrote to step the simulation forward in time. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1); + kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2); + // DONE-1.2 ping-pong the velocity buffers + glm::vec3 *tempVelocityPointer = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tempVelocityPointer; } void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 + // DONE-2.1 + dim3 fullBlocksPerGridObjects((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); // Uniform Grid Neighbor search using Thrust sort. // In Parallel: // - label each particle with its array index as well as its grid index.
// Use 2x width grids. + kernComputeIndices <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer <<<fullBlocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<fullBlocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); // - Update positions + kernUpdatePos <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, dt, dev_pos, dev_vel1); // - Ping-pong buffers as needed + glm::vec3 *tempVelocityPointer = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tempVelocityPointer; } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // DONE-2.3 - start by copying Boids::stepSimulationNaiveGrid + dim3 fullBlocksPerGridObjects((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. // In Parallel: // - Label each particle with its array index as well as its grid index. // Use 2x width grids + kernComputeIndices <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer <<<fullBlocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<fullBlocksPerGridCells, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array. // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernCopyPosVelToCoherentArray <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_pos2, dev_vel1, dev_vel2); // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos2, dev_vel2, dev_vel1); // - Update positions + kernUpdatePos <<<fullBlocksPerGridObjects, blockSize>>>(numObjects, dt, dev_pos2, dev_vel1); // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
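+ // Only the position buffers need ping-ponging here: kernCopyPosVelToCoherentArray re-gathers
+ // the velocities into dev_vel2 every frame, and the coherent neighbor search writes the new
+ // velocities back into dev_vel1, so after the swap dev_pos and dev_vel1 stay paired in the
+ // same (cell-sorted) order for the next step.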
+ glm::vec3 *tempPointer = dev_pos; + dev_pos = dev_pos2; + dev_pos2 = tempPointer; } void Boids::endSimulation() { @@ -389,7 +639,13 @@ void Boids::endSimulation() { cudaFree(dev_vel2); cudaFree(dev_pos); - // TODO-2.1 TODO-2.3 - Free any additional buffers here. + // DONE-2.1 DONE-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_pos2); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index e416836..1389026 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,12 +13,12 @@ // ================ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID -#define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define VISUALIZE 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 10000; const float DT = 0.2f; /**
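With both toggles set to 1 as above, the coherent-grid path is the one being benchmarked. As a minimal sketch (assuming a dispatch of this shape in main.cpp's update loop; the exact preprocessor logic there may differ), the defines select the stepping function like so:

#if UNIFORM_GRID && COHERENT_GRID
  Boids::stepSimulationCoherentGrid(DT);
#elif UNIFORM_GRID
  Boids::stepSimulationScatteredGrid(DT);
#else
  Boids::stepSimulationNaive(DT);
#endif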