diff --git a/README.md b/README.md
index d63a6a1..070a84c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,42 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ruijun(Daniel) Zhong
+    * [LinkedIn](https://www.linkedin.com/in/daniel-z-73158b152/)    
+    * [Personal Website](https://www.danielzhongportfolio.com/)
+* Tested on: Windows 11 pro, 12th Gen Intel(R) Core(TM) i7-12700K 3.61 GHz 32.0 GB, NVIDIA GeForce RTX 3070 Ti (personal computer)
 
-### (TODO: Your README)
+### ScreenShots
+(Please allow a moment for the gif to load.)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+#Boids = 10000
+![](images/Screenshots/Boids.gif)
+
+#Boids = 500000, Naive Search
+![](images/Screenshots/Boids_500000_Naive.gif)
+
+#Boids = 500000, Scatter Uniform Grid
+![](images/Screenshots/Boids_500000_Uniform.gif)
+
+#Boids = 500000, Coherent Uniform Grid
+![](images/Screenshots/Boids_500000_Coherent.gif)
+
+### Analysis
+
+* For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+    As the number of boids increases, the frames per second (fps) for all implementations decrease, which is expected because more boids mean more calculations per frame. Both the Coherent and Uniform algorithms yield higher FPS, as they are optimized for efficiency and involve fewer boid calculations.
+![](images/Screenshots/Boids_FPS.png)
+
+* For each implementation, how does changing the block count and block size affect * performance? Why do you think this is?
+
+    As we can se in the below graph, block size rises from 16 to 128, FPS improves, likely due to GPU underutilization at smaller sizes; however, beyond 128, FPS declines, possibly due to inefficiencies from limited parallelism or increased overhead.
+![](images/Screenshots/BlockSize_FPS.png)
+
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+    The Coherent implementation's performance improvement was expected becuase it reduces latency and increases throughput as it avoid repeat calculation of boid's neighbour.
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+    Changing the cell width and opting between 27 versus 8 neighboring cells does impact performance, but its effect is tied to boid density. For densely packed boids, checking 27 cells is more efficient as it captures more relevant neighbors. However, with fewer or sparsely distributed boids, the 8-cell approach is optimal, avoiding unnecessary computations. The best method hinges on the balance between boid density and computational efficiency.
\ No newline at end of file
diff --git a/images/Screenshots/BlockSize_FPS.png b/images/Screenshots/BlockSize_FPS.png
new file mode 100644
index 0000000..ab42f0d
Binary files /dev/null and b/images/Screenshots/BlockSize_FPS.png differ
diff --git a/images/Screenshots/Boids.gif b/images/Screenshots/Boids.gif
new file mode 100644
index 0000000..642c394
Binary files /dev/null and b/images/Screenshots/Boids.gif differ
diff --git a/images/Screenshots/Boids_500000_Coherent.gif b/images/Screenshots/Boids_500000_Coherent.gif
new file mode 100644
index 0000000..4939121
Binary files /dev/null and b/images/Screenshots/Boids_500000_Coherent.gif differ
diff --git a/images/Screenshots/Boids_500000_Naive.gif b/images/Screenshots/Boids_500000_Naive.gif
new file mode 100644
index 0000000..e514c94
Binary files /dev/null and b/images/Screenshots/Boids_500000_Naive.gif differ
diff --git a/images/Screenshots/Boids_500000_Uniform.gif b/images/Screenshots/Boids_500000_Uniform.gif
new file mode 100644
index 0000000..e140e20
Binary files /dev/null and b/images/Screenshots/Boids_500000_Uniform.gif differ
diff --git a/images/Screenshots/Boids_FPS.png b/images/Screenshots/Boids_FPS.png
new file mode 100644
index 0000000..88f132c
Binary files /dev/null and b/images/Screenshots/Boids_FPS.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..573d191 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -86,6 +86,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
 
+glm::vec3 *dev_coherentPos;
+glm::vec3* dev_coherentVel;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +172,24 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_coherentPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!");
+
+  cudaMalloc((void**)&dev_coherentVel, numObjects * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentVel failed!");
+
   cudaDeviceSynchronize();
 }
 
@@ -233,9 +254,51 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 perceived_center(0.0f), c(0.0f), perceived_velocity(0.0f);
+    glm::vec3 result(0.0f);
+
+    int number_of_neighbors_rule1 = 0;
+    int number_of_neighbors_rule3 = 0;
+
+    for (int i = 0; i < N; ++i) {
+        if (i != iSelf) {
+            float dist = glm::length(pos[i] - pos[iSelf]);
+
+            //First Rule
+            if (dist < rule1Distance) {
+                perceived_center += pos[i];
+                number_of_neighbors_rule1++;
+            }
+
+            //Second Rule
+            if (dist < rule2Distance) {
+                c -= (pos[i] - pos[iSelf]);
+            }
+
+            //Third Rule
+            if (dist < rule3Distance) {
+                perceived_velocity += vel[i];
+                number_of_neighbors_rule3++;
+            }
+        }
+    }
+
+    if (number_of_neighbors_rule1 > 0) {
+        perceived_center /= float(number_of_neighbors_rule1);
+        perceived_center = (perceived_center - pos[iSelf]) * rule1Scale;
+    }
+
+    c *= rule2Scale;
+
+    if (number_of_neighbors_rule3 > 0) {
+        perceived_velocity /= float(number_of_neighbors_rule3);
+        perceived_velocity *= rule3Scale;
+    }
+    result = perceived_center + c + perceived_velocity;
+    return result;
 }
 
+
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
@@ -243,8 +306,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+  
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 new_velocity = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+    // Clamp the speed
+    float speed = glm::length(new_velocity);
+    if (speed > maxSpeed) {
+        new_velocity = glm::normalize(new_velocity) * maxSpeed;
+    }
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    vel2[index] = new_velocity;
 }
 
 /**
@@ -289,6 +363,15 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    indices[index] = index;
+    glm::vec3 position = pos[index];
+    glm::vec3 gridPos = glm::floor((position - gridMin) * inverseCellWidth); //*inverseCellWidth is same as divide positive cellWidth
+    int newIndex = gridIndex3Dto1D(gridPos.x, gridPos.y, gridPos.z, gridResolution);
+    gridIndices[index] = newIndex;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -302,28 +385,124 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   int *gridCellStartIndices, int *gridCellEndIndices) {
+
   // TODO-2.1
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+    
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    if (index == 0 || particleGridIndices[index] != particleGridIndices[index - 1]) {
+        gridCellStartIndices[particleGridIndices[index]] = index;
+        if (index > 0) {
+            gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+        }
+    }
+
+    if (index == N - 1) {
+        gridCellEndIndices[particleGridIndices[index]] = index;
+    }
+
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int* gridCellStartIndices, int* gridCellEndIndices,
+    int* particleArrayIndices,
+    glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 perceived_center(0.0f), c(0.0f), perceived_velocity(0.0f);
+    int num_neighbors_rule1 = 0;
+    int num_neighbors_rule3 = 0;
+
+    glm::vec3 position = pos[index];
+    glm::vec3 gridPos = glm::floor((position - gridMin) * inverseCellWidth);
+    glm::vec3 gridBegin = gridPos - glm::vec3(0.5, 0.5, 0.5);
+    
+
+    glm::ivec3 minBound = glm::ivec3(imax(gridPos.x - 1, 0),
+        imax(gridPos.y - 1, 0),
+        imax(gridPos.z - 1, 0));
+
+    glm::ivec3 maxBound = glm::ivec3(imin(gridPos.x + 1, gridResolution - 1),
+        imin(gridPos.y + 1, gridResolution - 1),
+        imin(gridPos.z + 1, gridResolution - 1));
+
+
+    for (int x = minBound.x; x <= maxBound.x; x++) {
+        for (int y = minBound.y; y <= maxBound.y; y++) {
+            for (int z = minBound.z; z <= maxBound.z; z++) {
+                int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution);
+                int startIdx = gridCellStartIndices[neighborGridIndex];
+                int endIdx = gridCellEndIndices[neighborGridIndex];
+
+                for (int i = startIdx; i <= endIdx; i++) {
+                    int neighborIdx = particleArrayIndices[i];
+                    if (neighborIdx != index) {
+                        glm::vec3 neighborPos = pos[neighborIdx];
+                        float dist = glm::length(neighborPos - pos[index]);
+
+                        //First Rule
+                        if (dist < rule1Distance) {
+                            perceived_center += neighborPos;
+                            num_neighbors_rule1++;
+                        }
+
+                        //Second Rule
+                        if (dist < rule2Distance) {
+                            c -= (neighborPos - pos[index]);
+                        }
+
+                        //Third Rule
+                        if (dist < rule3Distance) {
+                            perceived_velocity += vel1[neighborIdx];
+                            num_neighbors_rule3++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    glm::vec3 vel_del;
+    if (num_neighbors_rule1 > 0) {
+        perceived_center /= num_neighbors_rule1;
+        vel_del += (perceived_center - pos[index]) * rule1Scale;
+    }
+    vel_del += c * rule2Scale;
+    if (num_neighbors_rule3 > 0) {
+        perceived_velocity /= num_neighbors_rule3;
+        vel_del += perceived_velocity * rule3Scale;
+    }
+
+    glm::vec3 new_velocity = vel_del + vel1[index];
+    if (glm::dot(new_velocity, new_velocity) > 0) {
+        new_velocity = glm::normalize(new_velocity) * imin(glm::length(new_velocity), maxSpeed);
+    }
+
+    vel2[index] = new_velocity;
 }
 
+
+
 __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
@@ -341,16 +520,120 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 perceived_center(0.0f), c(0.0f), perceived_velocity(0.0f);
+    int num_neighbors_rule1 = 0;
+    int num_neighbors_rule3 = 0;
+
+    glm::vec3 position = pos[index];
+    glm::vec3 gridPos = glm::floor((position - gridMin) * inverseCellWidth);
+    glm::vec3 gridBegin = gridPos - glm::vec3(0.5, 0.5, 0.5);
+
+
+    glm::ivec3 minBound = glm::ivec3(imax(gridPos.x - 1, 0),
+        imax(gridPos.y - 1, 0),
+        imax(gridPos.z - 1, 0));
+
+    glm::ivec3 maxBound = glm::ivec3(imin(gridPos.x + 1, gridResolution - 1),
+        imin(gridPos.y + 1, gridResolution - 1),
+        imin(gridPos.z + 1, gridResolution - 1));
+
+
+    for (int x = minBound.x; x <= maxBound.x; x++) {
+        for (int y = minBound.y; y <= maxBound.y; y++) {
+            for (int z = minBound.z; z <= maxBound.z; z++) {
+                int neighborGridIndex = gridIndex3Dto1D(x, y, z, gridResolution);
+                int startIdx = gridCellStartIndices[neighborGridIndex];
+                int endIdx = gridCellEndIndices[neighborGridIndex];
+
+                for (int i = startIdx; i <= endIdx; i++) {
+                    if (i != index) {
+                        glm::vec3 neighborPos = pos[i];
+                        float dist = glm::length(neighborPos - pos[index]);
+
+                        //First Rule
+                        if (dist < rule1Distance) {
+                            perceived_center += neighborPos;
+                            num_neighbors_rule1++;
+                        }
+
+                        //Second Rule
+                        if (dist < rule2Distance) {
+                            c -= (neighborPos - pos[index]);
+                        }
+
+                        //Third Rule
+                        if (dist < rule3Distance) {
+                            perceived_velocity += vel1[i];
+                            num_neighbors_rule3++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    glm::vec3 vel_del;
+    if (num_neighbors_rule1 > 0) {
+        perceived_center /= num_neighbors_rule1;
+        vel_del += (perceived_center - pos[index]) * rule1Scale;
+    }
+    vel_del += c * rule2Scale;
+    if (num_neighbors_rule3 > 0) {
+        perceived_velocity /= num_neighbors_rule3;
+        vel_del += perceived_velocity * rule3Scale;
+    }
+
+    glm::vec3 new_velocity = vel_del + vel1[index];
+    if (glm::dot(new_velocity, new_velocity) > 0) {
+        new_velocity = glm::normalize(new_velocity) * imin(glm::length(new_velocity), maxSpeed);
+    }
+
+    vel2[index] = new_velocity;
+}
+
+__global__ void kernRearrangeParticleData(int N, int* particleArrayIndices, glm::vec3* pos, glm::vec3* vel, glm::vec3* coherentPos, glm::vec3* coherentVel) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    int sortedIndex = particleArrayIndices[index];
+
+    coherentPos[index] = pos[sortedIndex];
+    coherentVel[index] = vel[sortedIndex];
 }
 
+
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernUpdateVelocityBruteForce << <fullBlocksPerGrid, threadsPerBlock >> > (numObjects, dev_pos, dev_vel1, dev_vel2);
+
+    cudaDeviceSynchronize();
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+
+    cudaDeviceSynchronize();
+    checkCUDAErrorWithLine("kernUpdatePositions failed!");
+
+    // TODO-1.2 ping-pong the velocity buffers
+    glm::vec3* temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
 }
 
+
+
 void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
   // Uniform Grid Neighbor search using Thrust sort.
@@ -364,6 +647,32 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGridForCell((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+    thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+    thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+    kernResetIntBuffer << <fullBlocksPerGridForCell, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer << <fullBlocksPerGridForCell, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+    kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    glm::vec3* temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
+    
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +691,37 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGridForCell((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+    thrust::device_ptr<int> dev_thrust_keys(dev_particleGridIndices);
+    thrust::device_ptr<int> dev_thrust_values(dev_particleArrayIndices);
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values);
+
+    kernRearrangeParticleData << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_coherentPos, dev_coherentVel);
+    checkCUDAErrorWithLine("kernRearrangeParticleData failed!");
+
+    kernResetIntBuffer << <fullBlocksPerGridForCell, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+    kernResetIntBuffer << <fullBlocksPerGridForCell, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+    kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_coherentPos, dev_coherentVel, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_coherentPos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    glm::vec3* temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
+    
+    glm::vec3* temp2 = dev_pos;
+    dev_pos = dev_coherentPos;
+    dev_coherentPos = temp2;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +730,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_coherentPos);
+  cudaFree(dev_coherentVel);
+
+  checkCUDAErrorWithLine("cudaFree failed!");
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;