diff --git a/README.md b/README.md
index d63a6a1..dfcd4fa 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,32 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yichao Wang
+  * [LinkedIn](https://www.linkedin.com/in/wangyic/) 
+* Tested on: Windows 10 Home 64-bit (10.0, Build 18363)
+  * Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz (8 CPUs)
+  * GeForce GTX 1060	6.1
 
-### (TODO: Your README)
+**Boids Animation**
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](images/screenshot.PNG)
+
+![](images/demo.gif)
+
+**Performance Analysis**
+
+![Figure1](images/withoutV.png)
+
+![](images/withV.png)
+
+![](images/blockSize.png)
+
+From above plots, we can see:
+
+1. For each implementation, as the # of boids increases, the performance descreases. This is because we need to process more data as the # of boids increases.
+
+2. For each implementation, changing of block size does not affect performance. This is because changing of block size does not change the amount of threads we used to process data.
+
+3. The performance improves with the more coherent uniform grid. Because we reduce the random access by eliminating ```dev_particleArrayIndices``` and use contiguous memory by ordering boid's position and velocity in the coherent kernel.
+
+4. Surprisingly, checking 27 neighboring has better performance than checking 8 neighboring. This is because we have many if statements when checking 8 neighboring. The if statements make the branch in warp diverge which results in more run time.
diff --git a/images/blockSize.png b/images/blockSize.png
new file mode 100644
index 0000000..454bcc9
Binary files /dev/null and b/images/blockSize.png differ
diff --git a/images/demo.gif b/images/demo.gif
new file mode 100644
index 0000000..c6d69b6
Binary files /dev/null and b/images/demo.gif differ
diff --git a/images/screenshot.PNG b/images/screenshot.PNG
new file mode 100644
index 0000000..88a9876
Binary files /dev/null and b/images/screenshot.PNG differ
diff --git a/images/withV.png b/images/withV.png
new file mode 100644
index 0000000..b802fa7
Binary files /dev/null and b/images/withV.png differ
diff --git a/images/withoutV.png b/images/withoutV.png
new file mode 100644
index 0000000..55e11fc
Binary files /dev/null and b/images/withoutV.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..2c247da 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
+#define blockSize 1024
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -85,6 +85,7 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_sorted_pos;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -157,7 +158,7 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  gridCellWidth = 1.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
   gridSideCount = 2 * halfSideCount;
 
@@ -169,6 +170,24 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc sorted_dev_pos failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
 
@@ -233,7 +252,49 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 finalVel = glm::vec3(0.f), perceivedCenter = finalVel, c = finalVel, perceivedVel = finalVel;
+    int rule1Neighbors = 0, rule3Neighbors = 0;
+    glm::vec3 boidPos = pos[iSelf];
+
+    for (int i = 0; i < N; i++) {
+        if (i != iSelf) {
+            glm::vec3 bPos = pos[i];
+            float dist = glm::distance(bPos, boidPos);
+
+            // rule1: cohesion
+            if (dist < rule1Distance) {
+                rule1Neighbors++;
+                perceivedCenter += bPos;
+            }
+
+            // separation
+            if (dist < rule2Distance) {
+                c -= bPos - boidPos;
+            }
+
+            // alignment
+            if (dist < rule3Distance) {
+                rule3Neighbors++;
+                perceivedVel += vel[i];
+            }
+        }
+    }
+    
+    if (rule1Neighbors != 0) {
+        perceivedCenter /= rule1Neighbors;
+        finalVel += (perceivedCenter - boidPos) * rule1Scale;
+    }
+
+
+    finalVel += c * rule2Scale;
+    
+    // alignment
+    if (rule3Neighbors != 0) {
+        perceivedVel /= rule3Neighbors;
+        finalVel += perceivedVel * rule3Scale;
+    }
+
+  return vel[iSelf] + finalVel;
 }
 
 /**
@@ -245,6 +306,18 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+    int idx = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (idx >= N) {
+        return;
+    }
+
+    glm::vec3 vel = computeVelocityChange(N, idx, pos, vel1);
+    if (glm::length(vel) > maxSpeed) {
+        vel = glm::normalize(vel) * maxSpeed;
+    }
+
+    vel2[idx] = vel;
+
 }
 
 /**
@@ -282,6 +355,10 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
   return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
+__device__ int gridIndex3Dto1D(glm::vec3 gridIndex3D, int gridResolution) {
+    return gridIndex3D.x + gridIndex3D.y * gridResolution + gridIndex3D.z * gridResolution * gridResolution;
+}
+
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
@@ -289,6 +366,20 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+       return;
+    }
+
+
+
+
+   glm::vec3 cellIdx3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+    
+    gridIndices[index] = gridIndex3Dto1D(cellIdx3D, gridResolution);
+    indices[index] = index;
+
+
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +397,138 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+       return;
+    }
+
+    int cellIndex = particleGridIndices[index];
+    if (index == 0) {
+        gridCellStartIndices[cellIndex] = index;
+        return;
+    }
+    if (index == N - 1) {
+        gridCellEndIndices[cellIndex] = index;
+        return;
+    }
+    int cellIndexBefore = particleGridIndices[index - 1];
+    if (cellIndex != cellIndexBefore) {
+        gridCellEndIndices[cellIndexBefore] = index - 1;
+        gridCellStartIndices[cellIndex] = index;
+    }
+
+
+}
+
+
+__device__ int* getNeighborCells8(int gridResolution, glm::vec3 gridMin, float cellWidth, 
+    glm::vec3 cellIdx3D, glm::vec3 boidPos, int* gridCellStartIndices, int* gridCellEndIndices) {
+    const int size = 8;
+    int neighborCellIdxs[size];
+    for (int i = 0; i < size; i++) {
+        neighborCellIdxs[i] = -1;
+    }
+    int tracker = -1;
+
+    int xDir, yDir, zDir = 0;
+    float searchDist = imax(imax(rule1Distance, rule2Distance), rule3Distance);
+    glm::vec3 cornerPos = gridMin + cellIdx3D * cellWidth;
+    
+    if (boidPos.x - searchDist < cornerPos.x) {
+        if (cellIdx3D.x != 0) {
+            xDir = -1;
+        }
+    } else {
+        if (cellIdx3D.x != gridResolution - 1) {
+            xDir = 1;
+        }
+    }
+
+    if (boidPos.y - searchDist < cornerPos.y) {
+        if (cellIdx3D.y != 0) {
+            yDir = -1;
+        }
+    }
+    else {
+        if (cellIdx3D.y != gridResolution - 1) {
+            yDir = 1;
+        }
+    }
+
+    if (boidPos.z - searchDist < cornerPos.z) {
+        if (cellIdx3D.z != 0) {
+            zDir = -1;
+        }
+    }
+    else {
+        if (cellIdx3D.z != gridResolution - 1) {
+            zDir = 1;
+        }
+    }
+
+    glm::vec3 neighborCellIdx3D = cellIdx3D;
+    neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+    if (zDir != 0) {
+        neighborCellIdx3D.z += zDir;
+        neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+    }
+
+    if (xDir != 0) {
+        glm::vec3 neighborCellIdx3D = cellIdx3D;
+        neighborCellIdx3D.x += xDir;
+        neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        if (zDir != 0) {
+            neighborCellIdx3D.z += zDir;
+            neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        }
+    }
+
+    if (yDir != 0) {
+        glm::vec3 neighborCellIdx3D = cellIdx3D;
+        neighborCellIdx3D.y += yDir;
+        neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        if (zDir != 0) {
+            neighborCellIdx3D.z += zDir;
+            neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        }
+    }
+
+    if (xDir != 0 && yDir != 0) {
+        glm::vec3 neighborCellIdx3D = cellIdx3D;
+        neighborCellIdx3D.x += xDir;
+        neighborCellIdx3D.y += yDir;
+        neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        if (zDir != 0) {
+            neighborCellIdx3D.z += zDir;
+            neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+        }
+    }
+
+    return neighborCellIdxs;
+}
+
+__device__ int* getNeighborCells27(int gridResolution, glm::vec3 gridMin, float cellWidth,
+    glm::vec3 cellIdx3D, glm::vec3 boidPos, int* gridCellStartIndices, int* gridCellEndIndices) {
+    const int size = 27;
+    int neighborCellIdxs[size];
+    for (int i = 0; i < size; i++) {
+        neighborCellIdxs[i] = -1;
+    }
+    int tracker = -1;
+
+    for (int k = cellIdx3D.z - 1; k <= cellIdx3D.z + 1; k++) {
+        for (int j = cellIdx3D.y - 1; j <= cellIdx3D.y + 1; j++) {
+            for (int i = cellIdx3D.x - 1; i <= cellIdx3D.x + 1; i++) {
+                if (i < 0 || j < 0 || k < 0 || i >= gridResolution || j >= gridResolution || k >= gridResolution) {
+                    continue;
+                }
+                glm::vec3 neighborCellIdx3D(i, j, k);
+                neighborCellIdxs[++tracker] = gridIndex3Dto1D(neighborCellIdx3D, gridResolution);
+            }
+        }
+    }
+    return neighborCellIdxs;
+
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +545,86 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 finalVel = glm::vec3(0.f), perceivedCenter = finalVel, c = finalVel, perceivedVel = finalVel;
+    int rule1Neighbors = 0, rule3Neighbors = 0;
+
+    glm::vec3 cellIdx3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+    int cellIndex = gridIndex3Dto1D(cellIdx3D, gridResolution);
+
+    glm::vec3 boidPos = pos[index];
+    int* neighborCellIdxs = getNeighborCells27(gridResolution, gridMin, cellWidth, cellIdx3D, boidPos, 
+        gridCellStartIndices, gridCellEndIndices);
+
+    for (int i = 0; i < 27; i++) {
+        int cellIdx = neighborCellIdxs[i];
+        if (cellIdx == -1) {
+            continue;
+        }
+        if (gridCellStartIndices[cellIdx] == -1) {
+            continue;
+        }
+        for (int j = gridCellStartIndices[cellIdx]; j <= gridCellEndIndices[cellIdx]; j++) {
+            int boidIndex = particleArrayIndices[j];
+            if (boidIndex != index) {
+                glm::vec3 bPos = pos[boidIndex];
+                float dist = glm::distance(bPos, boidPos);
+
+                // rule1: cohesion
+                if (dist < rule1Distance) {
+                    rule1Neighbors++;
+                    perceivedCenter += bPos;
+                }
+
+                // separation
+                if (dist < rule2Distance) {
+                    c -= bPos - boidPos;
+                }
+
+                // alignment
+                if (dist < rule3Distance) {
+                    rule3Neighbors++;
+                    perceivedVel += vel1[boidIndex];
+                }
+            }
+        }
+    }
+
+    if (rule1Neighbors != 0) {
+        perceivedCenter /= rule1Neighbors;
+        finalVel += (perceivedCenter - boidPos) * rule1Scale;
+    }
+
+    finalVel += c * rule2Scale;
+
+    // alignment
+    if (rule3Neighbors != 0) {
+        perceivedVel /= rule3Neighbors;
+        finalVel += perceivedVel * rule3Scale;
+    }
+
+    finalVel += vel1[index];
+
+    if (glm::length(finalVel) > maxSpeed) {
+        finalVel = glm::normalize(finalVel) * maxSpeed;
+    }
+
+    vel2[index] = finalVel;
+}
+
+__global__ void kernSortPosVel(int N, int* particleArrayIndices, glm::vec3* pos,
+    glm::vec3 *sortPos, glm::vec3 *vel1, glm::vec3 *vel2) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+    sortPos[index] = pos[particleArrayIndices[index]];
+    vel2[index] = vel1[particleArrayIndices[index]];
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +644,75 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    glm::vec3 finalVel = glm::vec3(0.f), perceivedCenter = finalVel, c = finalVel, perceivedVel = finalVel;
+    int rule1Neighbors = 0, rule3Neighbors = 0;
+
+    glm::vec3 cellIdx3D = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+    int cellIndex = gridIndex3Dto1D(cellIdx3D, gridResolution);
+
+    glm::vec3 boidPos = pos[index];
+    int* neighborCellIdxs = getNeighborCells27(gridResolution, gridMin, cellWidth, cellIdx3D, boidPos,
+        gridCellStartIndices, gridCellEndIndices);
+
+    for (int i = 0; i < 27; i++) {
+        int cellIdx = neighborCellIdxs[i];
+        if (cellIdx == -1) {
+            continue;
+        }
+        if (gridCellStartIndices[cellIdx] == -1) {
+            continue;
+        }
+        for (int j = gridCellStartIndices[cellIdx]; j <= gridCellEndIndices[cellIdx]; j++) {
+            //int boidIndex = particleArrayIndices[j];
+            if (j != index) {
+                glm::vec3 bPos = pos[j];
+                float dist = glm::distance(bPos, boidPos);
+
+                // rule1: cohesion
+                if (dist < rule1Distance) {
+                    rule1Neighbors++;
+                    perceivedCenter += bPos;
+                }
+
+                // separation
+                if (dist < rule2Distance) {
+                    c -= bPos - boidPos;
+                }
+
+                // alignment
+                if (dist < rule3Distance) {
+                    rule3Neighbors++;
+                    perceivedVel += vel1[j];
+                }
+            }
+        }
+    }
+
+    if (rule1Neighbors != 0) {
+        perceivedCenter /= rule1Neighbors;
+        finalVel += (perceivedCenter - boidPos) * rule1Scale;
+    }
+
+    finalVel += c * rule2Scale;
+
+    // alignment
+    if (rule3Neighbors != 0) {
+        perceivedVel /= rule3Neighbors;
+        finalVel += perceivedVel * rule3Scale;
+    }
+
+    finalVel += vel1[index];
+
+    if (glm::length(finalVel) > maxSpeed) {
+        finalVel = glm::normalize(finalVel) * maxSpeed;
+    }
+
+    vel2[index] = finalVel;
 }
 
 /**
@@ -349,6 +721,15 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernUpdateVelocityBruteForce <<< fullBlocksPerGrid, blockSize >>> (numObjects,
+        dev_pos, dev_vel1, dev_vel2);
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+
+    // ping-pong
+    cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +745,38 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum,
+        gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, 
+        dev_thrust_particleArrayIndices);
+    checkCUDAErrorWithLine("thrust sort failed!");
+
+    kernResetIntBuffer << <fullBlocksPerGridCells, blockSize >> > (gridCellCount,
+        dev_gridCellStartIndices, -1);
+    checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices,
+        dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+    kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount,
+        gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    // ping-pong
+   cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
+
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +795,40 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize);
+
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum,
+        gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects,
+        dev_thrust_particleArrayIndices);
+    checkCUDAErrorWithLine("thrust sort failed!");
+
+    kernResetIntBuffer << <fullBlocksPerGridCells, blockSize >> > (gridCellCount,
+        dev_gridCellStartIndices, -1);
+    checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices,
+        dev_gridCellStartIndices, dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+    kernSortPosVel << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleArrayIndices, dev_pos, 
+        dev_sorted_pos, dev_vel1, dev_vel2);
+
+    kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount,
+        gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_sorted_pos, dev_vel2, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_sorted_pos, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    // ping-pong
+    cudaMemcpy(dev_vel2, dev_vel1, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+    cudaMemcpy(dev_pos, dev_sorted_pos, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +837,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_sorted_pos);
+
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..abd4a60 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 50000;
 const float DT = 0.2f;
 
 /**