diff --git a/README.md b/README.md
index d63a6a1..5cf49f2 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,57 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jiyu Huang
+  * [LinkedIn](https://www.linkedin.com/in/jiyu-huang-0123)
+* Tested on: Windows 10, i7-6700 @ 3.41GHz 16GB, Quadro P1000 4096MB (Town 062 Lab)
 
-### (TODO: Your README)
+# Overview
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![GIF](/images/ScreenCapture.gif)
+
+This project simulates boid flocking on GPU using Cuda, where the boid particles follow the following rules:
+
+1. cohesion - boids move towards the perceived center of mass of their neighbors
+2. separation - boids avoid getting to close to their neighbors
+3. alignment - boids generally try to move with the same direction and speed as their neighbors
+
+The simulation is implemented using three methods:
+
+1. **Naive Neighbor Search**: at every timestep, a boid looks at every other boid in the simulation to determine whether count as a neighbor and affects its velocity.
+2. **Uniform Grid**: by creating a uniform spatial grid, we only need to search for neighbor boids among relevant grid cells (8 by default implementation), instead of the entire boid population.
+3. **Coherent Grid**: like uniform grid, except we reduce random accessing by reordering the arrays for boid position and velocity.
+
+## Performance Analysis
+
+For performance analysis, framerates are observed over 20 seconds of execution.
+
+### Number of Boids and FPS
+
+![graph1](/images/graph1.png)
+![graph2](/images/graph2.png)
+
+As seen from the graphs, framerate declines as the number of boids increases. Intuitively this makes sense since as the number for boids increases, the number of neighbors each boid has increases.
+
+### Block Size (Number of Blocks) and FPS
+
+![graph3](/images/graph3.png)
+
+When the block size is really small like when it's 32 in the graph, there's a slight performance drop likely due to the fact that the number of threads in a block is too small to hide stalling caused by memory accesses.
+
+### Performance Gain from Coherent Uniform Grid
+
+From the graphs we can see that there's an improvement in performance with the more coherent uniform grid. As previously touched upon, this is because the reordering of the arrays make for conntiguous access to the data, reducing random accesses and improving performance.
+
+### Cell Width and FPS
+
+Using the default configuration, changing the cell width from twice the maximum neighborhood distance (checking 8 relevant grid cells) to maximum neighborhood distance (checking 27 relevant grid cells) does not change framerate too much. This is likely because maximum distance is rather small (5 when scene_scale = 100). When we increase the neighborhood distance, smaller cell width starts to improve performance. As an extreme example, when the distance is 50, the smaller cell width performances at 700fps while the larger default cell width performs at 360fps.
+
+This is due to the fact that when we decrease cell width, we are checking smaller volumes for each cell, which means that the checking volume gets more granular, and the total volume that needs to be checked decreases ($27\times1^2<8\times2^2$)
+
+## Extra Credit
+
+### Grid Looping Optimization
+
+instead of hard-coding a designated search area, the search area is based on grid cells that have any aspect of them within maximum neighborhood distance. This avoids the need for hard-coding specific number of cells to check when changing cell widths.
+
+To toggle on or off the feature, simply change the value of GRID_LOOPING_OPTIMIZATION to 1 or 0 in line 9 of src/kernel.cu
diff --git a/images/ScreenCapture.gif b/images/ScreenCapture.gif
new file mode 100644
index 0000000..ca45617
Binary files /dev/null and b/images/ScreenCapture.gif differ
diff --git a/images/graph1.png b/images/graph1.png
new file mode 100644
index 0000000..7d15f68
Binary files /dev/null and b/images/graph1.png differ
diff --git a/images/graph2.png b/images/graph2.png
new file mode 100644
index 0000000..4a66d32
Binary files /dev/null and b/images/graph2.png differ
diff --git a/images/graph3.png b/images/graph3.png
new file mode 100644
index 0000000..8ed53df
Binary files /dev/null and b/images/graph3.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..4298b0e 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -6,6 +6,8 @@
 #include "utilityCore.hpp"
 #include "kernel.h"
 
+#define GRID_LOOPING_OPTIMIZATION 0
+
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
 #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
@@ -85,6 +87,7 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_pos2;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +172,19 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+    cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
 
@@ -233,7 +249,40 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 selfPos = pos[iSelf];
+    glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+    int numNeighborsRule1 = 0, numNeighborsRule3 = 0;
+    for (int i = 0; i < N; ++i) {
+        if (i != iSelf) {
+            glm::vec3 iPos = pos[i];
+            float dist = glm::distance(iPos, selfPos);
+            if (dist < rule1Distance) {
+                v1 += iPos;
+                ++numNeighborsRule1;
+            }
+            if (dist < rule2Distance) {
+                v2 -= iPos - selfPos;
+            }
+            if (dist < rule3Distance) {
+                v3 += vel[i];
+                ++numNeighborsRule3;
+            }
+        }
+    }
+    glm::vec3 v(0.f);
+    if (numNeighborsRule1 > 0) {
+        v1 /= numNeighborsRule1;
+        v1 = (v1 - selfPos) * rule1Scale;
+        v += v1;
+    }
+    v2 *= rule2Scale;
+    v += v2;
+    if (numNeighborsRule3 > 0) {
+        v3 /= numNeighborsRule3;
+        v3 *= rule3Scale;
+        v += v3;
+    }
+    return v;
 }
 
 /**
@@ -245,6 +294,16 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 vel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+    float l = glm::length(vel);
+    if (l > maxSpeed) {
+        vel *= maxSpeed / l;
+    }
+    vel2[index] = vel;
 }
 
 /**
@@ -289,6 +348,13 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    indices[index] = index;
+    glm::ivec3 gridIndex = (glm::ivec3) ((pos[index] - gridMin) * inverseCellWidth);
+    gridIndices[index] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +372,25 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < N) {
+        int curGrid = particleGridIndices[index];
+        if (index == 0 || curGrid != particleGridIndices[index - 1]) {
+            gridCellStartIndices[curGrid] = index;
+        }
+        if (index == N - 1 || curGrid != particleGridIndices[index + 1]) {
+            gridCellEndIndices[curGrid] = index;
+        }
+    }
+}
+
+__global__ void kernReshufflePosVel(int N, int* indices, glm::vec3* pos1, glm::vec3* pos2, glm::vec3* vel1, glm::vec3* vel2) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < N) {
+        int arrayIndex = indices[index];
+        pos2[index] = pos1[arrayIndex];
+        vel2[index] = vel1[arrayIndex];
+    }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +407,73 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+    int numNeighborsRule1 = 0, numNeighborsRule3 = 0;
+
+    glm::vec3 selfPos = pos[index];
+    glm::vec3 gridPos = (selfPos - gridMin) * inverseCellWidth;
+    int xLower = imax(gridPos.x - 0.5f, 0.f);
+    int xUpper = imin(gridPos.x + 0.5f, gridResolution - 1);
+    int yLower = imax(gridPos.y - 0.5f, 0.f);
+    int yUpper = imin(gridPos.y + 0.5f, gridResolution - 1);
+    int zLower = imax(gridPos.z - 0.5f, 0.f);
+    int zUpper = imin(gridPos.z + 0.5f, gridResolution - 1);
+    for (int z = zLower; z <= zUpper; ++z) {
+        for (int y = yLower; y <= yUpper; ++y) {
+            for (int x = xLower; x <= xUpper; ++x) {
+                int cell = gridIndex3Dto1D(x, y, z, gridResolution);
+                int startIndex = gridCellStartIndices[cell];
+                if (startIndex == -1) {
+                    continue;
+                }
+                int endIndex = gridCellEndIndices[cell];
+                for (int i = startIndex; i <= endIndex; ++i) {
+                    int otherIndex = particleArrayIndices[i];
+                    if (index != otherIndex) {
+
+                        glm::vec3 otherPos = pos[otherIndex];
+                        float dist = glm::distance(selfPos, otherPos);
+                        if (dist < rule1Distance) {
+                            v1 += otherPos;
+                            ++numNeighborsRule1;
+                        }
+                        if (dist < rule2Distance) {
+                            v2 -= otherPos - selfPos;
+                        }
+                        if (dist < rule3Distance) {
+                            v3 += vel1[otherIndex];
+                            ++numNeighborsRule3;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    glm::vec3 vel = vel1[index];
+    if (numNeighborsRule1 > 0) {
+        v1 /= numNeighborsRule1;
+        v1 = (v1 - selfPos) * rule1Scale;
+        vel += v1;
+    }
+    v2 *= rule2Scale;
+    vel += v2;
+    if (numNeighborsRule3 > 0) {
+        v3 /= numNeighborsRule3;
+        v3 *= rule3Scale;
+        vel += v3;
+    }
+    
+    float l = glm::length(vel);
+    if (l > maxSpeed) {
+        vel *= maxSpeed / l;
+    }
+    vel2[index] = vel;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +493,82 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+    int numNeighborsRule1 = 0, numNeighborsRule3 = 0;
+
+    glm::vec3 selfPos = pos[index];
+    glm::vec3 gridPos = (selfPos - gridMin) * inverseCellWidth;
+#if GRID_LOOPING_OPTIMIZATION
+    float maxDist = imax(imax(rule1Distance, rule2Distance), rule3Distance);
+    int xLower = imax(gridPos.x - maxDist * inverseCellWidth, 0.f);
+    int xUpper = imin(gridPos.x + maxDist * inverseCellWidth, gridResolution - 1);
+    int yLower = imax(gridPos.y - maxDist * inverseCellWidth, 0.f);
+    int yUpper = imin(gridPos.y + maxDist * inverseCellWidth, gridResolution - 1);
+    int zLower = imax(gridPos.z - maxDist * inverseCellWidth, 0.f);
+    int zUpper = imin(gridPos.z + maxDist * inverseCellWidth, gridResolution - 1);
+#else
+    int xLower = imax(gridPos.x - 0.5f, 0.f);
+    int xUpper = imin(gridPos.x + 0.5f, gridResolution - 1);
+    int yLower = imax(gridPos.y - 0.5f, 0.f);
+    int yUpper = imin(gridPos.y + 0.5f, gridResolution - 1);
+    int zLower = imax(gridPos.z - 0.5f, 0.f);
+    int zUpper = imin(gridPos.z + 0.5f, gridResolution - 1);
+#endif
+    for (int z = zLower; z <= zUpper; ++z) {
+        for (int y = yLower; y <= yUpper; ++y) {
+            for (int x = xLower; x <= xUpper; ++x) {
+                int cell = gridIndex3Dto1D(x, y, z, gridResolution);
+                int startIndex = gridCellStartIndices[cell];
+                if (startIndex == -1) {
+                    continue;
+                }
+                int endIndex = gridCellEndIndices[cell];
+                for (int otherIndex = startIndex; otherIndex <= endIndex; ++otherIndex) {
+                    if (index != otherIndex) {
+
+                        glm::vec3 otherPos = pos[otherIndex];
+                        float dist = glm::distance(selfPos, otherPos);
+                        if (dist < rule1Distance) {
+                            v1 += otherPos;
+                            ++numNeighborsRule1;
+                        }
+                        if (dist < rule2Distance) {
+                            v2 -= otherPos - selfPos;
+                        }
+                        if (dist < rule3Distance) {
+                            v3 += vel1[otherIndex];
+                            ++numNeighborsRule3;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    glm::vec3 vel = vel1[index];
+    if (numNeighborsRule1 > 0) {
+        v1 /= numNeighborsRule1;
+        v1 = (v1 - selfPos) * rule1Scale;
+        vel += v1;
+    }
+    v2 *= rule2Scale;
+    vel += v2;
+    if (numNeighborsRule3 > 0) {
+        v3 /= numNeighborsRule3;
+        v3 *= rule3Scale;
+        vel += v3;
+    }
+
+    float l = glm::length(vel);
+    if (l > maxSpeed) {
+        vel *= maxSpeed / l;
+    }
+    vel2[index] = vel;
 }
 
 /**
@@ -349,6 +577,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+    kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+    glm::vec3* temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +600,24 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+    dim3 fullBlocksPerGrid_Boids((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid_Cells((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+    kernResetIntBuffer<<<fullBlocksPerGrid_Cells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+    kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+    kernUpdatePos<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    glm::vec3* temp = dev_vel1;
+    dev_vel1 = dev_vel2;
+    dev_vel2 = temp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +636,26 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    dim3 fullBlocksPerGrid_Boids((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGrid_Cells((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+    kernReshufflePosVel<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_pos2, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernReshufflePosVel failed!");
+    kernResetIntBuffer<<<fullBlocksPerGrid_Cells, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+    checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+    kernIdentifyCellStartEnd<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+    kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos2, dev_vel2, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+    kernUpdatePos<<<fullBlocksPerGrid_Boids, blockSize>>>(numObjects, dt, dev_pos2, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    glm::vec3* temp = dev_pos;
+    dev_pos = dev_pos2;
+    dev_pos2 = temp;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +664,11 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    cudaFree(dev_pos2);
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..ddd0e3b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;