diff --git a/.gitignore b/.gitignore
index 2e7a3c7..70a35c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -562,3 +562,5 @@ xcuserdata
 *.xccheckout
 *.moved-aside
 *.xcuserstate
+
+.vscode
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 150664e..11aac25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,12 +29,15 @@ list(APPEND CUDA_NVCC_FLAGS ${CUDA_GENERATE_CODE})
 list(APPEND CUDA_NVCC_FLAGS_DEBUG "-g -G")
 set(CUDA_VERBOSE_BUILD ON)
 
+message(STATUS "CUDA compilation flags: ${CUDA_NVCC_FLAGS}")
+
 if(WIN32)
     # Set up include and lib paths
     set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE)
 endif(WIN32)
 ########################################
 
+set(OpenGL_GL_PREFERENCE GLVND)
 find_package(OpenGL REQUIRED)
 
 if(UNIX)
@@ -67,6 +70,7 @@ set(headers
     src/kernel.h
     src/main.hpp
     src/utilityCore.hpp
+    src/print_glm.hpp
     )
 
 set(sources
@@ -74,6 +78,7 @@ set(sources
     src/kernel.cu
     src/main.cpp
     src/utilityCore.cpp
+    src/print_glm.cpp
     )
 
 list(SORT headers)
diff --git a/README.md b/README.md
index d63a6a1..772992a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,39 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zhihao Ruan (ruanzh@seas.upenn.edu)
+  * [LinkedIn](https://www.linkedin.com/in/zhihao-ruan-29b29a13a/), [personal website](https://zhihaoruan.xyz/)
+* Tested on: Ubuntu 20.04 LTS, Ryzen 3700X @ 2.22GHz 48GB, RTX 2060 Super @ 7976MB
 
-### (TODO: Your README)
+![](images/uniform-grid-flocking-coherent-highdpi.gif)
+
+## Introduction: Flocking Simulation
+
+Flocking is defined as the action of a crowd. In nature, flocking often happens on a crowd of birds or a school of fish. Birds, for example, often fly together as a whole in the sky, moving from one position to another. Although the shape of the crowd might change a lot, it is very amazing that each bird flies as if they knew the next steps of all other birds, so that it would never diverge from the crowd and they always stay together.
+
+Biologists have been studying the behavior of flocking for a long time. In such context, we would also call each individual a **boid**. One might very easily start to wonder whether there is any type of communications taking place within the crowd so that they could maintain as a whole. Unfortunately, however, there is no such communication mechanism between each two individuals. In fact, according to the [notes from Conrad Parker](http://www.vergenet.net/~conrad/boids/), each individual would be able to stay close to other boids as long as they follow 3 simple rules:
+1. Boids try to fly towards the centre of mass of neighboring boids.
+2. Boids try to keep a small distance away from other objects (including other boids).
+3. Boids try to match velocity with near boids.
+
+
+The objective of this project would be to build a flocking simulation using CUDA with these 3 simple rules. A demo of the final result has been showed right above this section.
+
+## Performance Analysis
+**For each implementation, how does changing the number of boids affect performance? Why do you think this is?**
+- The FPS would decrease as the number of boids increases. This is because GPU needs to compute more boid states and thus needs more threads to finish simulation per time step.
+
+**For each implementation, how does changing the block count and block size affect performance? Why do you think this is?**
+- The FPS would increase as the block size increases. This is because more boids could be computed in parallel in one block as the block size increases, thus boosting the performance. 
+
+**For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?**
+- My performance did improve with coherent grid compared to scattered grid. This outcome was expected as GPU would pull out one chunk of memory at a time for a warp to access, and thus with coherent grid all threads in one warp would read the same chunk of data pulled out from GPU memory, thus reducing the memory I/O.
+
+### FPS Graph Plots
+
+**FPS change with increasing number of boids in different modes:**
+![](images/FPS_num_boids.png)
+
+**FPS change with different block size (uniform coherent grid, number of boids set to 50k):**
+![](images/FPS_block_size.png)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/cmake/.clang-format b/cmake/.clang-format
new file mode 100644
index 0000000..2271a1a
--- /dev/null
+++ b/cmake/.clang-format
@@ -0,0 +1,10 @@
+---
+BasedOnStyle: Google
+---
+Language: Cpp
+AccessModifierOffset: -2
+AlignConsecutiveAssignments: true
+AlignConsecutiveMacros: true
+SortIncludes: false
+---
+
diff --git a/cmake/cuda_compute_capability.cpp b/cmake/cuda_compute_capability.cpp
index ef589a9..afa8481 100644
--- a/cmake/cuda_compute_capability.cpp
+++ b/cmake/cuda_compute_capability.cpp
@@ -1,12 +1,12 @@
 /*
-* Copyright (C) 2011 Florian Rathgeber, florian.rathgeber@gmail.com
-*
-* This code is licensed under the MIT License.  See the FindCUDA.cmake script
-* for the text of the license.
-*
-* Based on code by Christopher Bruns published on Stack Overflow (CC-BY):
-* http://stackoverflow.com/questions/2285185
-*/
+ * Copyright (C) 2011 Florian Rathgeber, florian.rathgeber@gmail.com
+ *
+ * This code is licensed under the MIT License.  See the FindCUDA.cmake script
+ * for the text of the license.
+ *
+ * Based on code by Christopher Bruns published on Stack Overflow (CC-BY):
+ * http://stackoverflow.com/questions/2285185
+ */
 
 #include <stdio.h>
 #include <cuda_runtime.h>
@@ -14,45 +14,45 @@
 #include <set>
 
 int main() {
-    int deviceCount;
-    int gpuDeviceCount = 0;
-    struct cudaDeviceProp properties;
+  int deviceCount;
+  int gpuDeviceCount = 0;
+  struct cudaDeviceProp properties;
 
-    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
-    {
-        printf("Couldn't get device count: %s\n", cudaGetErrorString(cudaGetLastError()));
-        return 1;
-    }
+  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
+    printf("Couldn't get device count: %s\n",
+           cudaGetErrorString(cudaGetLastError()));
+    return 1;
+  }
 
-    std::set<int> computes;
-    typedef std::set<int>::iterator iter;
+  std::set<int> computes;
+  typedef std::set<int>::iterator iter;
 
-    // machines with no GPUs can still report one emulation device
-    for (int device = 0; device < deviceCount; ++device) {
-        int major = 9999, minor = 9999;
-        cudaGetDeviceProperties(&properties, device);
-        if (properties.major != 9999) { // 9999 means emulation only
-            ++gpuDeviceCount;
-            major = properties.major;
-            minor = properties.minor;
-            if ((major == 2 && minor == 1)) {
-                // There is no --arch compute_21 flag for nvcc, so force minor to 0
-                minor = 0;
-            }
-            computes.insert(10 * major + minor);
-        }
-    }
-    int i = 0;
-    for(iter it = computes.begin(); it != computes.end(); it++, i++) {
-        if(i > 0) {
-            printf(" ");
-        }
-        printf("%d", *it);
+  // machines with no GPUs can still report one emulation device
+  for (int device = 0; device < deviceCount; ++device) {
+    int major = 9999, minor = 9999;
+    cudaGetDeviceProperties(&properties, device);
+    if (properties.major != 9999) {  // 9999 means emulation only
+      ++gpuDeviceCount;
+      major = properties.major;
+      minor = properties.minor;
+      if ((major == 2 && minor == 1)) {
+        // There is no --arch compute_21 flag for nvcc, so force minor to 0
+        minor = 0;
+      }
+      computes.insert(10 * major + minor);
     }
-    /* don't just return the number of gpus, because other runtime cuda
-    errors can also yield non-zero return values */
-    if (gpuDeviceCount <= 0 || computes.size() <= 0) {
-        return 1; // failure
+  }
+  int i = 0;
+  for (iter it = computes.begin(); it != computes.end(); it++, i++) {
+    if (i > 0) {
+      printf(" ");
     }
-    return 0; // success
+    printf("%d", *it);
+  }
+  /* don't just return the number of gpus, because other runtime cuda
+  errors can also yield non-zero return values */
+  if (gpuDeviceCount <= 0 || computes.size() <= 0) {
+    return 1;  // failure
+  }
+  return 0;  // success
 }
diff --git a/images/FPS_block_size.png b/images/FPS_block_size.png
new file mode 100644
index 0000000..be84bd9
Binary files /dev/null and b/images/FPS_block_size.png differ
diff --git a/images/FPS_num_boids.png b/images/FPS_num_boids.png
new file mode 100644
index 0000000..18f4785
Binary files /dev/null and b/images/FPS_num_boids.png differ
diff --git a/images/result-naive-flocking.gif b/images/result-naive-flocking.gif
new file mode 100644
index 0000000..b4928ad
Binary files /dev/null and b/images/result-naive-flocking.gif differ
diff --git a/images/uniform-grid-flocking-coherent-highdpi.gif b/images/uniform-grid-flocking-coherent-highdpi.gif
new file mode 100644
index 0000000..f63c336
Binary files /dev/null and b/images/uniform-grid-flocking-coherent-highdpi.gif differ
diff --git a/images/uniform-grid-flocking-coherent.gif b/images/uniform-grid-flocking-coherent.gif
new file mode 100644
index 0000000..a1b3882
Binary files /dev/null and b/images/uniform-grid-flocking-coherent.gif differ
diff --git a/images/uniform-grid-flocking-scattered.gif b/images/uniform-grid-flocking-scattered.gif
new file mode 100644
index 0000000..d6fdee2
Binary files /dev/null and b/images/uniform-grid-flocking-scattered.gif differ
diff --git a/src/.clang-format b/src/.clang-format
new file mode 100644
index 0000000..618314c
--- /dev/null
+++ b/src/.clang-format
@@ -0,0 +1,9 @@
+---
+BasedOnStyle: Google
+---
+Language: Cpp
+AccessModifierOffset: -2
+AlignConsecutiveAssignments: true
+AlignConsecutiveMacros: true
+---
+
diff --git a/src/cudaMat4.hpp b/src/cudaMat4.hpp
index 41aa606..4fe7a53 100644
--- a/src/cudaMat4.hpp
+++ b/src/cudaMat4.hpp
@@ -1,8 +1,8 @@
 /**
  * @file      cudaMat4.h
  * @brief     This file includes code from:
- *            Yining Karl Li's TAKUA Render, a massively parallel pathtracing renderer:
- *            http://www.yiningkarlli.com
+ *            Yining Karl Li's TAKUA Render, a massively parallel pathtracing
+ * renderer: http://www.yiningkarlli.com
  * @authors   Yining Karl Li
  * @date      2012
  * @copyright Yining Karl Li
@@ -10,18 +10,19 @@
 
 #pragma once
 
-#include <glm/glm.hpp>
 #include <cuda_runtime.h>
 
+#include <glm/glm.hpp>
+
 struct cudaMat3 {
-    glm::vec3 x;
-    glm::vec3 y;
-    glm::vec3 z;
+  glm::vec3 x;
+  glm::vec3 y;
+  glm::vec3 z;
 };
 
 struct cudaMat4 {
-    glm::vec4 x;
-    glm::vec4 y;
-    glm::vec4 z;
-    glm::vec4 w;
+  glm::vec4 x;
+  glm::vec4 y;
+  glm::vec4 z;
+  glm::vec4 w;
 };
diff --git a/src/glslUtility.cpp b/src/glslUtility.cpp
index d4876dd..b9d6084 100644
--- a/src/glslUtility.cpp
+++ b/src/glslUtility.cpp
@@ -7,190 +7,195 @@
  */
 
 #define _CRT_SECURE_NO_WARNINGS
-#include <iostream>
-#include <fstream>
-#include <string>
+#include "glslUtility.hpp"
+
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
+#include <iostream>
 #include <memory>
-#include "glslUtility.hpp"
+#include <string>
 
 using std::ios;
 
 namespace glslUtility {
 typedef struct {
-    GLint vertex;
-    GLint fragment;
-    GLint geometry;
+  GLint vertex;
+  GLint fragment;
+  GLint geometry;
 } shaders_t;
 
-char* loadFile(const char *fname, GLint &fSize) {
-    // file read based on example in cplusplus.com tutorial
-    std::ifstream file(fname, ios::in | ios::binary | ios::ate);
-    if (file.is_open()) {
-        unsigned int size = (unsigned int)file.tellg();
-        fSize = size;
-        char *memblock = new char [size];
-        file.seekg(0, ios::beg);
-        file.read(memblock, size);
-        file.close();
-        std::cout << "file " << fname << " loaded" << std::endl;
-        return memblock;
-    }
-
-    std::cout << "Unable to open file " << fname << std::endl;
-    exit(1);
+char *loadFile(const char *fname, GLint &fSize) {
+  // file read based on example in cplusplus.com tutorial
+  std::ifstream file(fname, ios::in | ios::binary | ios::ate);
+  if (file.is_open()) {
+    unsigned int size = (unsigned int)file.tellg();
+    fSize             = size;
+    char *memblock    = new char[size];
+    file.seekg(0, ios::beg);
+    file.read(memblock, size);
+    file.close();
+    std::cout << "file " << fname << " loaded" << std::endl;
+    return memblock;
+  }
+
+  std::cout << "Unable to open file " << fname << std::endl;
+  exit(1);
 }
 
 // printShaderInfoLog
 // From OpenGL Shading Language 3rd Edition, p215-216
 // Display (hopefully) useful error messages if shader fails to compile
 void printShaderInfoLog(GLint shader) {
-    int infoLogLen = 0;
-    int charsWritten = 0;
+  int infoLogLen   = 0;
+  int charsWritten = 0;
 
-    glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLogLen);
+  glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLogLen);
 
-    if (infoLogLen > 1) {
-        std::unique_ptr<GLchar[]> infoLog{ new GLchar[infoLogLen] };
-        // error check for fail to allocate memory omitted
-        glGetShaderInfoLog(shader, infoLogLen, &charsWritten, infoLog.get());
-        std::cout << "InfoLog:" << std::endl << infoLog.get() << std::endl;
-    }
+  if (infoLogLen > 1) {
+    std::unique_ptr<GLchar[]> infoLog{new GLchar[infoLogLen]};
+    // error check for fail to allocate memory omitted
+    glGetShaderInfoLog(shader, infoLogLen, &charsWritten, infoLog.get());
+    std::cout << "InfoLog:" << std::endl << infoLog.get() << std::endl;
+  }
 }
 
 void printLinkInfoLog(GLint prog) {
-    int infoLogLen = 0;
-    int charsWritten = 0;
+  int infoLogLen   = 0;
+  int charsWritten = 0;
 
-    glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &infoLogLen);
+  glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &infoLogLen);
 
-    if (infoLogLen > 1) {
-        std::unique_ptr<GLchar[]> infoLog{ new GLchar[infoLogLen] };
-        // error check for fail to allocate memory omitted
-        glGetProgramInfoLog(prog, infoLogLen, &charsWritten, infoLog.get());
-        std::cout << "InfoLog:" << std::endl << infoLog.get() << std::endl;
-    }
+  if (infoLogLen > 1) {
+    std::unique_ptr<GLchar[]> infoLog{new GLchar[infoLogLen]};
+    // error check for fail to allocate memory omitted
+    glGetProgramInfoLog(prog, infoLogLen, &charsWritten, infoLog.get());
+    std::cout << "InfoLog:" << std::endl << infoLog.get() << std::endl;
+  }
 }
 
-shaders_t loadShaders(const char * vert_path, const char * geom_path, const char * frag_path) {
-    GLint f, g = -1, v;
-
-    char *vs, *gs, *fs;
-
-    v = glCreateShader(GL_VERTEX_SHADER);
-    if (geom_path) {
-        g = glCreateShader(GL_GEOMETRY_SHADER);
-    }
-    f = glCreateShader(GL_FRAGMENT_SHADER);
-
-    // load shaders & get length of each
-    GLint vlen;
-    GLint glen;
-    GLint flen;
-    vs = loadFile(vert_path, vlen);
-    if (geom_path) {
-        gs = loadFile(geom_path, glen);
-    }
-    fs = loadFile(frag_path, flen);
-
-    const char * vv = vs;
-    const char * gg = geom_path ? gs : NULL;
-    const char * ff = fs;
-
-    glShaderSource(v, 1, &vv, &vlen);
-    if (geom_path) {
-        glShaderSource(g, 1, &gg, &glen);
-    }
-    glShaderSource(f, 1, &ff, &flen);
-
-    GLint compiled;
-
-    glCompileShader(v);
-    glGetShaderiv(v, GL_COMPILE_STATUS, &compiled);
+shaders_t loadShaders(const char *vert_path, const char *geom_path,
+                      const char *frag_path) {
+  GLint f, g = -1, v;
+
+  char *vs, *gs, *fs;
+
+  v = glCreateShader(GL_VERTEX_SHADER);
+  if (geom_path) {
+    g = glCreateShader(GL_GEOMETRY_SHADER);
+  }
+  f = glCreateShader(GL_FRAGMENT_SHADER);
+
+  // load shaders & get length of each
+  GLint vlen;
+  GLint glen;
+  GLint flen;
+  vs = loadFile(vert_path, vlen);
+  if (geom_path) {
+    gs = loadFile(geom_path, glen);
+  }
+  fs = loadFile(frag_path, flen);
+
+  const char *vv = vs;
+  const char *gg = geom_path ? gs : NULL;
+  const char *ff = fs;
+
+  glShaderSource(v, 1, &vv, &vlen);
+  if (geom_path) {
+    glShaderSource(g, 1, &gg, &glen);
+  }
+  glShaderSource(f, 1, &ff, &flen);
+
+  GLint compiled;
+
+  glCompileShader(v);
+  glGetShaderiv(v, GL_COMPILE_STATUS, &compiled);
+  if (!compiled) {
+    std::cout << "Vertex shader not compiled." << std::endl;
+  }
+  printShaderInfoLog(v);
+
+  if (geom_path) {
+    glCompileShader(g);
+    glGetShaderiv(g, GL_COMPILE_STATUS, &compiled);
     if (!compiled) {
-        std::cout << "Vertex shader not compiled." << std::endl;
-    }
-    printShaderInfoLog(v);
-
-    if (geom_path) {
-        glCompileShader(g);
-        glGetShaderiv(g, GL_COMPILE_STATUS, &compiled);
-        if (!compiled) {
-            std::cout << "Geometry shader not compiled." << std::endl;
-        }
-        printShaderInfoLog(g);
-    }
-
-    glCompileShader(f);
-    glGetShaderiv(f, GL_COMPILE_STATUS, &compiled);
-    if (!compiled) {
-        std::cout << "Fragment shader not compiled." << std::endl;
-    }
-    printShaderInfoLog(f);
-
-    shaders_t out;
-    out.vertex = v;
-    out.geometry = g;
-    out.fragment = f;
-
-    delete [] vs; // dont forget to free allocated memory, or else really bad things start happening
-    if (geom_path) {
-        delete[] gs;
+      std::cout << "Geometry shader not compiled." << std::endl;
     }
-    delete [] fs; // we allocated this in the loadFile function...
-
-    return out;
+    printShaderInfoLog(g);
+  }
+
+  glCompileShader(f);
+  glGetShaderiv(f, GL_COMPILE_STATUS, &compiled);
+  if (!compiled) {
+    std::cout << "Fragment shader not compiled." << std::endl;
+  }
+  printShaderInfoLog(f);
+
+  shaders_t out;
+  out.vertex   = v;
+  out.geometry = g;
+  out.fragment = f;
+
+  delete[] vs;  // dont forget to free allocated memory, or else really bad
+                // things start happening
+  if (geom_path) {
+    delete[] gs;
+  }
+  delete[] fs;  // we allocated this in the loadFile function...
+
+  return out;
 }
 
 void attachAndLinkProgram(GLuint program, shaders_t shaders) {
-    glAttachShader(program, shaders.vertex);
-    if (shaders.geometry >= 0) {
-        glAttachShader(program, shaders.geometry);
-    }
-    glAttachShader(program, shaders.fragment);
-
-    glLinkProgram(program);
-    GLint linked;
-    glGetProgramiv(program, GL_LINK_STATUS, &linked);
-    if (!linked) {
-        std::cout << "Program did not link." << std::endl;
-    }
-    printLinkInfoLog(program);
+  glAttachShader(program, shaders.vertex);
+  if (shaders.geometry >= 0) {
+    glAttachShader(program, shaders.geometry);
+  }
+  glAttachShader(program, shaders.fragment);
+
+  glLinkProgram(program);
+  GLint linked;
+  glGetProgramiv(program, GL_LINK_STATUS, &linked);
+  if (!linked) {
+    std::cout << "Program did not link." << std::endl;
+  }
+  printLinkInfoLog(program);
 }
 
-GLuint createProgram(
-    const char *vertexShaderPath,
-    const char *fragmentShaderPath,
-    const char *attributeLocations[], GLuint numberOfLocations) {
-    glslUtility::shaders_t shaders = glslUtility::loadShaders(vertexShaderPath, NULL, fragmentShaderPath);
+GLuint createProgram(const char *vertexShaderPath,
+                     const char *fragmentShaderPath,
+                     const char *attributeLocations[],
+                     GLuint numberOfLocations) {
+  glslUtility::shaders_t shaders =
+      glslUtility::loadShaders(vertexShaderPath, NULL, fragmentShaderPath);
 
-    GLuint program = glCreateProgram();
+  GLuint program = glCreateProgram();
 
-    for (GLuint i = 0; i < numberOfLocations; ++i) {
-        glBindAttribLocation(program, i, attributeLocations[i]);
-    }
+  for (GLuint i = 0; i < numberOfLocations; ++i) {
+    glBindAttribLocation(program, i, attributeLocations[i]);
+  }
 
-    glslUtility::attachAndLinkProgram(program, shaders);
+  glslUtility::attachAndLinkProgram(program, shaders);
 
-    return program;
+  return program;
 }
 
-GLuint createProgram(
-    const char *vertexShaderPath,
-    const char *geometryShaderPath,
-    const char *fragmentShaderPath,
-    const char *attributeLocations[], GLuint numberOfLocations) {
-    glslUtility::shaders_t shaders = glslUtility::loadShaders(vertexShaderPath, geometryShaderPath, fragmentShaderPath);
+GLuint createProgram(const char *vertexShaderPath,
+                     const char *geometryShaderPath,
+                     const char *fragmentShaderPath,
+                     const char *attributeLocations[],
+                     GLuint numberOfLocations) {
+  glslUtility::shaders_t shaders = glslUtility::loadShaders(
+      vertexShaderPath, geometryShaderPath, fragmentShaderPath);
 
-    GLuint program = glCreateProgram();
+  GLuint program = glCreateProgram();
 
-    for (GLuint i = 0; i < numberOfLocations; ++i) {
-        glBindAttribLocation(program, i, attributeLocations[i]);
-    }
+  for (GLuint i = 0; i < numberOfLocations; ++i) {
+    glBindAttribLocation(program, i, attributeLocations[i]);
+  }
 
-    glslUtility::attachAndLinkProgram(program, shaders);
+  glslUtility::attachAndLinkProgram(program, shaders);
 
-    return program;
-}
+  return program;
 }
+}  // namespace glslUtility
diff --git a/src/glslUtility.hpp b/src/glslUtility.hpp
index 8629153..2d92aa7 100644
--- a/src/glslUtility.hpp
+++ b/src/glslUtility.hpp
@@ -22,4 +22,4 @@ GLuint createProgram(const char *vertexShaderPath,
                      const char *fragmentShaderPath,
                      const char *attributeLocations[],
                      GLuint numberOfLocations);
-}
+}  // namespace glslUtility
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..281dd08 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -1,25 +1,27 @@
 #define GLM_FORCE_CUDA
-#include <stdio.h>
 #include <cuda.h>
+#include <stdio.h>
+
 #include <cmath>
 #include <glm/glm.hpp>
-#include "utilityCore.hpp"
+
 #include "kernel.h"
+#include "utilityCore.hpp"
 
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
-#define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
+#define imax(a, b) (((a) > (b)) ? (a) : (b))
 #endif
 
 #ifndef imin
-#define imin( a, b ) ( ((a) < (b)) ? (a) : (b) )
+#define imin(a, b) (((a) < (b)) ? (a) : (b))
 #endif
 
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
 /**
-* Check for CUDA errors; print and exit if there was a problem.
-*/
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
 void checkCUDAError(const char *msg, int line = -1) {
   cudaError_t err = cudaGetLastError();
   if (cudaSuccess != err) {
@@ -31,13 +33,12 @@ void checkCUDAError(const char *msg, int line = -1) {
   }
 }
 
-
 /*****************
-* Configuration *
-*****************/
+ * Configuration *
+ *****************/
 
 /*! Block size used for CUDA kernel launch. */
-#define blockSize 128
+#define blockSize 1024
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -55,8 +56,8 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define scene_scale 100.0f
 
 /***********************************************
-* Kernel state (pointers are device pointers) *
-***********************************************/
+ * Kernel state (pointers are device pointers) *
+ ***********************************************/
 
 int numObjects;
 dim3 threadsPerBlock(blockSize);
@@ -74,17 +75,24 @@ glm::vec3 *dev_vel2;
 // pointers on your own too.
 
 // For efficient sorting and the uniform grid. These should always be parallel.
-int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
-int *dev_particleGridIndices; // What grid cell is this particle in?
+int *dev_particleArrayIndices;  // What index in dev_pos and dev_velX represents
+                                // this particle?
+int *dev_particleGridIndices;   // What grid cell is this particle in?
 // needed for use with thrust
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
-int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
-int *dev_gridCellEndIndices;   // to this cell?
+int *dev_gridCellStartIndices;  // What part of dev_particleArrayIndices belongs
+int *dev_gridCellEndIndices;    // to this cell?
 
-// TODO-2.3 - consider what additional buffers you might need to reshuffle
+// Part-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+int *dev_posGridIndices;
+int *dev_velGridIndices;
+thrust::device_ptr<glm::vec3> dev_thrust_pos;
+thrust::device_ptr<glm::vec3> dev_thrust_vel;
+thrust::device_ptr<int> dev_thrust_posGridIndices;
+thrust::device_ptr<int> dev_thrust_velGridIndices;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -95,8 +103,8 @@ float gridInverseCellWidth;
 glm::vec3 gridMinimum;
 
 /******************
-* initSimulation *
-******************/
+ * initSimulation *
+ ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
   a = (a + 0x7ed55d16) + (a << 12);
@@ -109,78 +117,106 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 }
 
 /**
-* LOOK-1.2 - this is a typical helper function for a CUDA kernel.
-* Function for generating a random vec3.
-*/
+ * LOOK-1.2 - this is a typical helper function for a CUDA kernel.
+ * Function for generating a random vec3.
+ */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
   thrust::default_random_engine rng(hash((int)(index * time)));
   thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng),
+                   (float)unitDistrib(rng));
 }
 
 /**
-* LOOK-1.2 - This is a basic CUDA kernel.
-* CUDA kernel for generating boids with a specified mass randomly around the star.
-*/
-__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
+ * LOOK-1.2 - This is a basic CUDA kernel.
+ * CUDA kernel for generating boids with a specified mass randomly around the
+ * star.
+ */
+__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 *arr,
+                                           float scale) {
   int index = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (index < N) {
     glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
+    arr[index].x   = scale * rand.x;
+    arr[index].y   = scale * rand.y;
+    arr[index].z   = scale * rand.z;
   }
 }
 
 /**
-* Initialize memory, update some globals
-*/
+ * Initialize memory, update some globals
+ */
 void Boids::initSimulation(int N) {
   numObjects = N;
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
 
   // LOOK-1.2 - This is basic CUDA memory management and error checking.
   // Don't forget to cudaFree in  Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+  cudaMalloc((void **)&dev_pos, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
 
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+  cudaMalloc((void **)&dev_vel1, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
 
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+  cudaMalloc((void **)&dev_vel2, N * sizeof(glm::vec3));
   checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
 
   // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
+  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(
+      1, numObjects, dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+  gridCellWidth =
+      2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
+  gridSideCount     = 2 * halfSideCount;
 
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+  gridCellCount        = gridSideCount * gridSideCount * gridSideCount;
   gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
+  float halfGridWidth  = gridCellWidth * halfSideCount;
   gridMinimum.x -= halfGridWidth;
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  std::cout << "gridCellWidth: " << gridCellWidth << std::endl;
+  std::cout << "gridInverseCellWidth: " << gridInverseCellWidth << "\n";
+  std::cout << "gridSideCount: " << gridSideCount << std::endl;
+  std::cout << "gridCellCount: " << gridCellCount << "\n";
+  std::cout << "gridMinimum: " << gridMinimum << "\n";
+
+  // Part-2.1 Part-2.3 - Allocate additional buffers here.
+  cudaMalloc((void **)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void **)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void **)&dev_posGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_posGridIndices failed!");
+
+  cudaMalloc((void **)&dev_velGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_velGridIndices failed!");
+
   cudaDeviceSynchronize();
 }
 
-
 /******************
-* copyBoidsToVBO *
-******************/
+ * copyBoidsToVBO *
+ ******************/
 
 /**
-* Copy the boid positions into the VBO so that they can be drawn by OpenGL.
-*/
-__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
+ * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
+ */
+__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo,
+                                       float s_scale) {
   int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
   float c_scale = -1.0f / s_scale;
@@ -193,7 +229,8 @@ __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float
   }
 }
 
-__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
+__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo,
+                                        float s_scale) {
   int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
   if (index < N) {
@@ -205,53 +242,112 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float
 }
 
 /**
-* Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
-*/
-void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
+ * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
+ */
+void Boids::copyBoidsToVBO(float *vbodptr_positions,
+                           float *vbodptr_velocities) {
   dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+  kernCopyPositionsToVBO<<<fullBlocksPerGrid, blockSize>>>(
+      numObjects, dev_pos, vbodptr_positions, scene_scale);
+  kernCopyVelocitiesToVBO<<<fullBlocksPerGrid, blockSize>>>(
+      numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
   checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
   cudaDeviceSynchronize();
 }
 
-
 /******************
-* stepSimulation *
-******************/
+ * stepSimulation *
+ ******************/
 
 /**
-* LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce.
-* __device__ code can be called from a __global__ context
-* Compute the new velocity on the body with index `iSelf` due to the `N` boids
-* in the `pos` and `vel` arrays.
-*/
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
+ * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce.
+ * __device__ code can be called from a __global__ context
+ * Compute the new velocity on the body with index `iSelf` due to the `N` boids
+ * in the `pos` and `vel` arrays.
+ */
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf,
+                                           const glm::vec3 *pos,
+                                           const glm::vec3 *vel) {
+  const glm::vec3 self_pos = pos[iSelf];
+  const glm::vec3 self_vel = vel[iSelf];
+
+  glm::vec3 new_vel = self_vel;
+
+  // Rule 1: boids fly towards their local perceived center of mass, which
+  // excludes themselves
+  glm::vec3 perceived_center{0.0f, 0.0f, 0.0f};
+  int numInfluencingBoids_rule1 = 0;
+  for (int i = 0; i < N; ++i) {
+    if (i != iSelf && glm::distance(pos[i], self_pos) < rule1Distance) {
+      perceived_center += pos[i];
+      ++numInfluencingBoids_rule1;
+    }
+  }
+  if (numInfluencingBoids_rule1 > 0) {
+    perceived_center /= numInfluencingBoids_rule1;
+  }
+  new_vel += (perceived_center - self_pos) * rule1Scale;
+
+  // Rule 2: boids try to stay a distance d away from each
+  // other
+  glm::vec3 center{0.0f, 0.0f, 0.0f};
+  for (int i = 0; i < N; ++i) {
+    if (i != iSelf && glm::distance(pos[i], self_pos) < rule2Distance) {
+      center -= (pos[i] - self_pos);
+    }
+  }
+  new_vel += center * rule2Scale;
+
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_vel{0.0f, 0.0f, 0.0f};
+  int numInfluencingBoids_rule3 = 0;
+  for (int i = 0; i < N; ++i) {
+    if (i != iSelf && glm::distance(pos[i], self_pos) < rule3Distance) {
+      perceived_vel += vel[i];
+      ++numInfluencingBoids_rule3;
+    }
+  }
+  if (numInfluencingBoids_rule3 > 0) {
+    perceived_vel /= numInfluencingBoids_rule3;
+  }
+  new_vel += perceived_vel * rule3Scale;
+
+  return new_vel;
 }
 
 /**
-* TODO-1.2 implement basic flocking
-* For each of the `N` bodies, update its position based on its current velocity.
-*/
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+ * Implement basic flocking
+ * For each of the `N` bodies, update its position based on its current
+ * velocity.
+ */
+__global__ void kernUpdateVelocityBruteForce(int N, const glm::vec3 *pos,
+                                             const glm::vec3 *vel1,
+                                             glm::vec3 *vel2) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (idx < N) {
+    glm::vec3 new_vel{0.0f, 0.0f, 0.0f};
+    // Compute a new velocity based on pos and vel1
+    glm::vec3 new_vel_raw = computeVelocityChange(N, idx, pos, vel1);
+    // Clamp the speed
+    new_vel = (glm::length(new_vel_raw) <= maxSpeed)
+                  ? new_vel_raw
+                  : glm::normalize(new_vel_raw) * maxSpeed;
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    vel2[idx] = new_vel;
+  }
 }
 
 /**
-* LOOK-1.2 Since this is pretty trivial, we implemented it for you.
-* For each of the `N` bodies, update its position based on its current velocity.
-*/
-__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
+ * LOOK-1.2 Since this is pretty trivial, we implemented it for you.
+ * For each of the `N` bodies, update its position based on its current
+ * velocity.
+ */
+__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos,
+                              const glm::vec3 *vel) {
   // Update position by velocity
   int index = threadIdx.x + (blockIdx.x * blockDim.x);
   if (index >= N) {
@@ -282,13 +378,36 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
   return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
-__global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
+__device__ glm::ivec3 gridIndex3D(const glm::vec3 pos, const glm::vec3 gridMin,
+                                  const float inverseCellWidth) {
+  return static_cast<glm::ivec3>(
+      glm::floor((pos - gridMin) * inverseCellWidth));
+}
+
+__device__ int gridIndex1D(const glm::vec3 pos, const glm::vec3 gridMin,
+                           const float inverseCellWidth,
+                           const int gridResolution) {
+  glm::ivec3 pos_in_grid = gridIndex3D(pos, gridMin, inverseCellWidth);
+  return gridIndex3Dto1D(pos_in_grid.x, pos_in_grid.y, pos_in_grid.z,
+                         gridResolution);
+}
+
+__global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin,
+                                   float inverseCellWidth, const glm::vec3 *pos,
+                                   int *indices, int *gridIndices) {
+  // Part 2.1
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < N) {
     // - Label each boid with the index of its grid cell.
+    const glm::vec3 boid_pos = pos[idx];
+    int boid_gridIdx =
+        gridIndex1D(boid_pos, gridMin, inverseCellWidth, gridResolution);
+    gridIndices[idx] = boid_gridIdx;
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    indices[idx] = idx;
+  }
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -300,88 +419,413 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
   }
 }
 
-__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
+// Assumes particleGridIndices has been sorted
+__global__ void kernIdentifyCellStartEnd(int N, const int *particleGridIndices,
+                                         int *gridCellStartIndices,
+                                         int *gridCellEndIndices) {
+  // Part 2.1
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < N) {
+    int particleGridIdx      = particleGridIndices[idx];
+    int prev_particleGridIdx = -1, next_particleGridIdx = -1;
+
+    if (idx > 0) {
+      prev_particleGridIdx = particleGridIndices[idx - 1];
+    }
+    if (idx < N - 1) {
+      next_particleGridIdx = particleGridIndices[idx + 1];
+    }
+
+    if (idx == 0 || particleGridIdx != prev_particleGridIdx) {
+      gridCellStartIndices[particleGridIdx] = idx;
+    }
+
+    if (idx == N - 1 || particleGridIdx != next_particleGridIdx) {
+      gridCellEndIndices[particleGridIdx] = idx;
+    }
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth,
+    float cellWidth, int *gridCellStartIndices, int *gridCellEndIndices,
+    int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1,
+    glm::vec3 *vel2) {
+  // Part-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < N) {
+    // - Identify the grid cell that this particle is in
+    glm::vec3 boid_pos      = pos[idx];
+    glm::ivec3 boid_gridPos = gridIndex3D(boid_pos, gridMin, inverseCellWidth);
+    int boid_gridIdx        = gridIndex3Dto1D(boid_gridPos.x, boid_gridPos.y,
+                                       boid_gridPos.z, gridResolution);
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // use grid pos rounding to determine search directions!
+    glm::ivec3 boid_gridPosRound = static_cast<glm::ivec3>(
+        glm::round((boid_pos - gridMin) * inverseCellWidth));
+    glm::ivec3 biasAlongAxes{-1, -1, -1};
+    for (int i = 0; i < 3; ++i) {
+      if (boid_gridPosRound[i] != boid_gridPos[i]) biasAlongAxes[i] = 1;
+    }
+
+    int valid_neighbor_idx_list[8];
+    int num_valid_neighbors = 0;
+    for (int zStep = 0; zStep <= 1; ++zStep) {
+      for (int yStep = 0; yStep <= 1; ++yStep) {
+        for (int xStep = 0; xStep <= 1; ++xStep) {
+          // z-y-x loop corresponds to LOOK-2.3 question at gridIndex3Dto1D()
+          glm::ivec3 grid_pos =
+              boid_gridPos + glm::ivec3(xStep * biasAlongAxes[0],
+                                        yStep * biasAlongAxes[1],
+                                        zStep * biasAlongAxes[2]);
+          int grid_idx = gridIndex3Dto1D(grid_pos.x, grid_pos.y, grid_pos.z,
+                                         gridResolution);
+          if (gridCellStartIndices[grid_idx] >= 0) {
+            valid_neighbor_idx_list[num_valid_neighbors++] = grid_idx;
+          }
+        }
+      }
+    }
+
+    // - Neighbor Search Velocity Update Main Loop
+    glm::vec3 boid_vel = vel1[idx];
+    glm::vec3 new_vel  = boid_vel;
+    // Rule 1: boids fly towards their local perceived center of mass, which
+    // excludes themselves
+    glm::vec3 perceived_center{0.0f, 0.0f, 0.0f};
+    int numInfluencingNeighbors = 0;
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      // - For each cell, read the start/end indices in the boid pointer array.
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      // - Access each boid in the cell and compute velocity change from
+      //   the boids rules, if this boid is within the neighborhood distance.
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        int neighbor_boid_idx = particleArrayIndices[j];
+        if (neighbor_boid_idx != idx &&
+            glm::distance(pos[neighbor_boid_idx], boid_pos) < rule1Distance) {
+          perceived_center += pos[neighbor_boid_idx];
+          ++numInfluencingNeighbors;
+        }
+      }
+    }
+    if (numInfluencingNeighbors > 0) {
+      perceived_center /= numInfluencingNeighbors;
+    }
+    new_vel += (perceived_center - boid_pos) * rule1Scale;
+
+    // Rule 2: boids try to stay a distance d away from each other
+    glm::vec3 center{0.f, 0.f, 0.f};
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        int neighbor_boid_idx = particleArrayIndices[j];
+        if (neighbor_boid_idx != idx &&
+            glm::distance(pos[neighbor_boid_idx], boid_pos) < rule2Distance) {
+          center -= (pos[neighbor_boid_idx] - boid_pos);
+        }
+      }
+    }
+    new_vel += center * rule2Scale;
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    numInfluencingNeighbors = 0;
+    glm::vec3 perceived_vel{0.f, 0.f, 0.f};
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        int neighbor_boid_idx = particleArrayIndices[j];
+        if (neighbor_boid_idx != idx &&
+            glm::distance(pos[neighbor_boid_idx], boid_pos) < rule3Distance) {
+          perceived_vel += vel1[neighbor_boid_idx];
+          ++numInfluencingNeighbors;
+        }
+      }
+    }
+    if (numInfluencingNeighbors > 0) {
+      perceived_vel /= numInfluencingNeighbors;
+    }
+    new_vel += perceived_vel * rule3Scale;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    if (glm::length(new_vel) > maxSpeed) {
+      new_vel = glm::normalize(new_vel) * maxSpeed;
+    }
+
+    // - Put the speed change in vel2
+    vel2[idx] = new_vel;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth,
+    float cellWidth, int *gridCellStartIndices, int *gridCellEndIndices,
+    glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  // Part-2.3 - This should be very similar to
+  // kernUpdateVelNeighborSearchScattered, except with one less level of
+  // indirection. This should expect gridCellStartIndices and gridCellEndIndices
+  // to refer directly to pos and vel1.
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < N) {
+    // - Identify the grid cell that this particle is in
+    glm::vec3 boid_pos      = pos[idx];
+    glm::ivec3 boid_gridPos = gridIndex3D(boid_pos, gridMin, inverseCellWidth);
+    int boid_gridIdx        = gridIndex3Dto1D(boid_gridPos.x, boid_gridPos.y,
+                                       boid_gridPos.z, gridResolution);
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    glm::ivec3 boid_gridPosRound = static_cast<glm::ivec3>(
+        glm::round((boid_pos - gridMin) * inverseCellWidth));
+    glm::ivec3 biasAlongAxes{-1, -1, -1};
+    for (int i = 0; i < 3; ++i) {
+      if (boid_gridPosRound[i] != boid_gridPos[i]) biasAlongAxes[i] = 1;
+    }
+
+    int valid_neighbor_idx_list[8];
+    int num_valid_neighbors = 0;
+    for (int zStep = 0; zStep <= 1; ++zStep) {
+      for (int yStep = 0; yStep <= 1; ++yStep) {
+        for (int xStep = 0; xStep <= 1; ++xStep) {
+          // z-y-x loop corresponds to LOOK-2.3 question at gridIndex3Dto1D()
+          glm::ivec3 grid_pos =
+              boid_gridPos + glm::ivec3(xStep * biasAlongAxes[0],
+                                        yStep * biasAlongAxes[1],
+                                        zStep * biasAlongAxes[2]);
+          int grid_idx = gridIndex3Dto1D(grid_pos.x, grid_pos.y, grid_pos.z,
+                                         gridResolution);
+          if (gridCellStartIndices[grid_idx] >= 0) {
+            valid_neighbor_idx_list[num_valid_neighbors++] = grid_idx;
+          }
+        }
+      }
+    }
+
+    // - Neighbor Search Velocity Update Main Loop
+    glm::vec3 boid_vel = vel1[idx];
+    glm::vec3 new_vel  = boid_vel;
+    // Rule 1: boids fly towards their local perceived center of mass, which
+    // excludes themselves
+    glm::vec3 perceived_center{0.0f, 0.0f, 0.0f};
+    int numInfluencingNeighbors = 0;
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      // - For each cell, read the start/end indices in the boid pointer array.
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      // - Access each boid in the cell and compute velocity change from
+      //   the boids rules, if this boid is within the neighborhood distance.
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        if (j != idx && glm::distance(pos[j], boid_pos) < rule1Distance) {
+          perceived_center += pos[j];
+          ++numInfluencingNeighbors;
+        }
+      }
+    }
+    if (numInfluencingNeighbors > 0) {
+      perceived_center /= numInfluencingNeighbors;
+    }
+    new_vel += (perceived_center - boid_pos) * rule1Scale;
+
+    // Rule 2: boids try to stay a distance d away from each other
+    glm::vec3 center{0.f, 0.f, 0.f};
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        if (j != idx && glm::distance(pos[j], boid_pos) < rule2Distance) {
+          center -= (pos[j] - boid_pos);
+        }
+      }
+    }
+    new_vel += center * rule2Scale;
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    numInfluencingNeighbors = 0;
+    glm::vec3 perceived_vel{0.f, 0.f, 0.f};
+    for (int i = 0; i < num_valid_neighbors; ++i) {
+      int cell_idx            = valid_neighbor_idx_list[i];
+      int cell_boid_start_idx = gridCellStartIndices[cell_idx];
+      int cell_boid_end_idx   = gridCellEndIndices[cell_idx];
+      for (int j = cell_boid_start_idx; j <= cell_boid_end_idx; ++j) {
+        if (j != idx && glm::distance(pos[j], boid_pos) < rule3Distance) {
+          perceived_vel += vel1[j];
+          ++numInfluencingNeighbors;
+        }
+      }
+    }
+    if (numInfluencingNeighbors > 0) {
+      perceived_vel /= numInfluencingNeighbors;
+    }
+    new_vel += perceived_vel * rule3Scale;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    if (glm::length(new_vel) > maxSpeed) {
+      new_vel = glm::normalize(new_vel) * maxSpeed;
+    }
+
+    // - Put the speed change in vel2
+    vel2[idx] = new_vel;
+  }
 }
 
 /**
-* Step the entire N-body simulation by `dt` seconds.
-*/
+ * Step the entire N-body simulation by `dt` seconds.
+ */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+  // use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(
+      numObjects, dev_pos, dev_vel1, dev_vel2);
+  cudaDeviceSynchronize();
+
+  // ping-pong the velocity buffers
+  std::swap(dev_vel1, dev_vel2);
+
+  // update position
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos,
+                                                  dev_vel1);
+  cudaDeviceSynchronize();
+  checkCUDAErrorWithLine("Naive simulation step failed!");
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
+  // Part-2.1
   // Uniform Grid Neighbor search using Thrust sort.
+  dim3 fullBlocksPerGrid_particles((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerGrid_cells((gridCellCount + blockSize - 1) / blockSize);
+
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
   //   Use 2x width grids.
+  kernResetIntBuffer<<<fullBlocksPerGrid_cells, blockSize>>>(
+      gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid_cells, blockSize>>>(
+      gridCellCount, dev_gridCellEndIndices, -1);
+  cudaDeviceSynchronize();
+
+  kernComputeIndices<<<fullBlocksPerGrid_particles, blockSize>>>(
+      numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos,
+      dev_particleArrayIndices, dev_particleGridIndices);
+  cudaDeviceSynchronize();
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  dev_thrust_particleArrayIndices =
+      thrust::device_pointer_cast<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices =
+      thrust::device_pointer_cast<int>(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices,
+                      dev_thrust_particleGridIndices + numObjects,
+                      dev_thrust_particleArrayIndices);
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid_particles, blockSize>>>(
+      numObjects, dev_particleGridIndices, dev_gridCellStartIndices,
+      dev_gridCellEndIndices);
+  cudaDeviceSynchronize();
+
   // - Perform velocity updates using neighbor search
-  // - Update positions
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid_particles,
+                                         blockSize>>>(
+      numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+      gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+      dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  cudaDeviceSynchronize();
+
   // - Ping-pong buffers as needed
+  std::swap(dev_vel1, dev_vel2);
+
+  // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid_particles, blockSize>>>(numObjects, dt,
+                                                            dev_pos, dev_vel1);
+  cudaDeviceSynchronize();
+  checkCUDAErrorWithLine("Naive simulation step failed!");
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+  // Part-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+  dim3 fullBlocksPerGrid_particles((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerGrid_cells((gridCellCount + blockSize - 1) / blockSize);
+
   // In Parallel:
   // - Label each particle with its array index as well as its grid index.
   //   Use 2x width grids
+  kernResetIntBuffer<<<fullBlocksPerGrid_cells, blockSize>>>(
+      gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGrid_cells, blockSize>>>(
+      gridCellCount, dev_gridCellEndIndices, -1);
+  cudaDeviceSynchronize();
+
+  kernComputeIndices<<<fullBlocksPerGrid_particles, blockSize>>>(
+      numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos,
+      dev_particleArrayIndices, dev_particleGridIndices);
+  cudaDeviceSynchronize();
+
+  cudaMemcpy(dev_posGridIndices, dev_particleGridIndices,
+             numObjects * sizeof(int), cudaMemcpyDeviceToDevice);
+  cudaMemcpy(dev_velGridIndices, dev_particleGridIndices,
+             numObjects * sizeof(int), cudaMemcpyDeviceToDevice);
+  checkCUDAErrorWithLine(
+      "cudaMemcpy dev_posGridIndices/dev_velGridIndices failed!");
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  dev_thrust_particleArrayIndices =
+      thrust::device_pointer_cast<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices =
+      thrust::device_pointer_cast<int>(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices,
+                      dev_thrust_particleGridIndices + numObjects,
+                      dev_thrust_particleArrayIndices);
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid_particles, blockSize>>>(
+      numObjects, dev_particleGridIndices, dev_gridCellStartIndices,
+      dev_gridCellEndIndices);
+  cudaDeviceSynchronize();
+
   // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
   //   the particle data in the simulation array.
   //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+  dev_thrust_posGridIndices =
+      thrust::device_pointer_cast<int>(dev_posGridIndices);
+  dev_thrust_pos = thrust::device_pointer_cast<glm::vec3>(dev_pos);
+  thrust::sort_by_key(dev_thrust_posGridIndices,
+                      dev_thrust_posGridIndices + numObjects, dev_thrust_pos);
+  dev_thrust_velGridIndices =
+      thrust::device_pointer_cast<int>(dev_velGridIndices);
+  dev_thrust_vel = thrust::device_pointer_cast<glm::vec3>(dev_vel1);
+  thrust::sort_by_key(dev_thrust_velGridIndices,
+                      dev_thrust_velGridIndices + numObjects, dev_thrust_vel);
+
   // - Perform velocity updates using neighbor search
-  // - Update positions
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid_particles,
+                                        blockSize>>>(
+      numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+      gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos,
+      dev_vel1, dev_vel2);
+
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  std::swap(dev_vel1, dev_vel2);
+
+  // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid_particles, blockSize>>>(numObjects, dt,
+                                                            dev_pos, dev_vel1);
 }
 
 void Boids::endSimulation() {
@@ -389,35 +833,52 @@ void Boids::endSimulation() {
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
 
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  // Part-2.1 Part-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_posGridIndices);
+  cudaFree(dev_velGridIndices);
 }
 
 void Boids::unitTest() {
   // LOOK-1.2 Feel free to write additional tests here.
+  std::cout << "---- Begins Thrust Unit Test -----\n";
 
   // test unstable sort
   int *dev_intKeys;
   int *dev_intValues;
   int N = 10;
 
-  std::unique_ptr<int[]>intKeys{ new int[N] };
-  std::unique_ptr<int[]>intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+  std::unique_ptr<int[]> intKeys{new int[N]};
+  std::unique_ptr<int[]> intValues{new int[N]};
+
+  intKeys[0]   = 0;
+  intValues[0] = 0;
+  intKeys[1]   = 1;
+  intValues[1] = 1;
+  intKeys[2]   = 0;
+  intValues[2] = 2;
+  intKeys[3]   = 3;
+  intValues[3] = 3;
+  intKeys[4]   = 0;
+  intValues[4] = 4;
+  intKeys[5]   = 2;
+  intValues[5] = 5;
+  intKeys[6]   = 2;
+  intValues[6] = 6;
+  intKeys[7]   = 0;
+  intValues[7] = 7;
+  intKeys[8]   = 5;
+  intValues[8] = 8;
+  intKeys[9]   = 6;
+  intValues[9] = 9;
+
+  cudaMalloc((void **)&dev_intKeys, N * sizeof(int));
   checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
 
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+  cudaMalloc((void **)&dev_intValues, N * sizeof(int));
   checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
 
   dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
@@ -429,8 +890,10 @@ void Boids::unitTest() {
   }
 
   // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N,
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N,
+             cudaMemcpyHostToDevice);
 
   // Wrap device vectors in thrust iterators for use with thrust.
   thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
@@ -439,8 +902,10 @@ void Boids::unitTest() {
   thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
 
   // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N,
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N,
+             cudaMemcpyDeviceToHost);
   checkCUDAErrorWithLine("memcpy back failed!");
 
   std::cout << "after unstable sort: " << std::endl;
diff --git a/src/kernel.h b/src/kernel.h
index 3d3da72..7ec6873 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -1,21 +1,23 @@
 #pragma once
 
+#include <cuda.h>
+#include <device_launch_parameters.h>
 #include <stdio.h>
-#include <thrust/sort.h>
+#include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/random.h>
-#include <thrust/device_vector.h>
-#include <cuda.h>
+#include <thrust/sort.h>
+
 #include <cmath>
 #include <vector>
 
 namespace Boids {
-    void initSimulation(int N);
-    void stepSimulationNaive(float dt);
-    void stepSimulationScatteredGrid(float dt);
-    void stepSimulationCoherentGrid(float dt);
-    void copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities);
+void initSimulation(int N);
+void stepSimulationNaive(float dt);
+void stepSimulationScatteredGrid(float dt);
+void stepSimulationCoherentGrid(float dt);
+void copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities);
 
-    void endSimulation();
-    void unitTest();
-}
+void endSimulation();
+void unitTest();
+}  // namespace Boids
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..c59a3ec 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,10 +1,10 @@
 /**
-* @file      main.cpp
-* @brief     Example Boids flocking simulation for CIS 565
-* @authors   Liam Boone, Kai Ninomiya, Kangning (Gary) Li
-* @date      2013-2017
-* @copyright University of Pennsylvania
-*/
+ * @file      main.cpp
+ * @brief     Example Boids flocking simulation for CIS 565
+ * @authors   Liam Boone, Kai Ninomiya, Kangning (Gary) Li
+ * @date      2013-2017
+ * @copyright University of Pennsylvania
+ */
 
 #include "main.hpp"
 
@@ -13,18 +13,18 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE     1
+#define UNIFORM_GRID  1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
-const float DT = 0.2f;
+const int N_FOR_VIS = 100000;
+const float DT      = 0.2f;
 
 /**
-* C main function.
-*/
-int main(int argc, char* argv[]) {
+ * C main function.
+ */
+int main(int argc, char *argv[]) {
   projectName = "565 CUDA Intro: Boids";
 
   if (init(argc, argv)) {
@@ -44,19 +44,18 @@ std::string deviceName;
 GLFWwindow *window;
 
 /**
-* Initialization of CUDA and GLFW.
-*/
+ * Initialization of CUDA and GLFW.
+ */
 bool init(int argc, char **argv) {
   // Set window title to "Student Name: [SM 2.0] GPU Name"
   cudaDeviceProp deviceProp;
-  int gpuDevice = 0;
+  int gpuDevice    = 0;
   int device_count = 0;
   cudaGetDeviceCount(&device_count);
   if (gpuDevice > device_count) {
     std::cout
-    << "Error: GPU device number is greater than the number of devices!"
-    << " Perhaps a CUDA-capable GPU is not installed?"
-    << std::endl;
+        << "Error: GPU device number is greater than the number of devices!"
+        << " Perhaps a CUDA-capable GPU is not installed?" << std::endl;
     return false;
   }
   cudaGetDeviceProperties(&deviceProp, gpuDevice);
@@ -64,17 +63,16 @@ bool init(int argc, char **argv) {
   int minor = deviceProp.minor;
 
   std::ostringstream ss;
-  ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]";
+  ss << projectName << " [SM " << major << "." << minor << " "
+     << deviceProp.name << "]";
   deviceName = ss.str();
 
   // Window setup stuff
   glfwSetErrorCallback(errorCallback);
 
   if (!glfwInit()) {
-    std::cout
-    << "Error: Could not initialize GLFW!"
-    << " Perhaps OpenGL 3.3 isn't available?"
-    << std::endl;
+    std::cout << "Error: Could not initialize GLFW!"
+              << " Perhaps OpenGL 3.3 isn't available?" << std::endl;
     return false;
   }
 
@@ -101,8 +99,8 @@ bool init(int argc, char **argv) {
   // Initialize drawing state
   initVAO();
 
-  // Default to device ID 0. If you have more than one GPU and want to test a non-default one,
-  // change the device ID.
+  // Default to device ID 0. If you have more than one GPU and want to test a
+  // non-default one, change the device ID.
   cudaGLSetGLDevice(0);
 
   cudaGLRegisterBufferObject(boidVBO_positions);
@@ -121,9 +119,8 @@ bool init(int argc, char **argv) {
 }
 
 void initVAO() {
-
-  std::unique_ptr<GLfloat[]> bodies{ new GLfloat[4 * (N_FOR_VIS)] };
-  std::unique_ptr<GLuint[]> bindices{ new GLuint[N_FOR_VIS] };
+  std::unique_ptr<GLfloat[]> bodies{new GLfloat[4 * (N_FOR_VIS)]};
+  std::unique_ptr<GLuint[]> bindices{new GLuint[N_FOR_VIS]};
 
   glm::vec4 ul(-1.0, -1.0, 1.0, 1.0);
   glm::vec4 lr(1.0, 1.0, 0.0, 0.0);
@@ -133,11 +130,11 @@ void initVAO() {
     bodies[4 * i + 1] = 0.0f;
     bodies[4 * i + 2] = 0.0f;
     bodies[4 * i + 3] = 1.0f;
-    bindices[i] = i;
+    bindices[i]       = i;
   }
 
-
-  glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this
+  glGenVertexArrays(
+      1, &boidVAO);  // Attach everything needed to draw a particle to this
   glGenBuffers(1, &boidVBO_positions);
   glGenBuffers(1, &boidVBO_velocities);
   glGenBuffers(1, &boidIBO);
@@ -145,169 +142,176 @@ void initVAO() {
   glBindVertexArray(boidVAO);
 
   // Bind the positions array to the boidVAO by way of the boidVBO_positions
-  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer
-  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data
+  glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions);  // bind the buffer
+  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(),
+               GL_DYNAMIC_DRAW);  // transfer data
 
   glEnableVertexAttribArray(positionLocation);
   glVertexAttribPointer((GLuint)positionLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
 
   // Bind the velocities array to the boidVAO by way of the boidVBO_velocities
   glBindBuffer(GL_ARRAY_BUFFER, boidVBO_velocities);
-  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW);
+  glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(),
+               GL_DYNAMIC_DRAW);
   glEnableVertexAttribArray(velocitiesLocation);
-  glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0, 0);
+  glVertexAttribPointer((GLuint)velocitiesLocation, 4, GL_FLOAT, GL_FALSE, 0,
+                        0);
 
   glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, boidIBO);
-  glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint), bindices.get(), GL_STATIC_DRAW);
+  glBufferData(GL_ELEMENT_ARRAY_BUFFER, (N_FOR_VIS) * sizeof(GLuint),
+               bindices.get(), GL_STATIC_DRAW);
 
   glBindVertexArray(0);
 }
 
-void initShaders(GLuint * program) {
+void initShaders(GLuint *program) {
   GLint location;
 
   program[PROG_BOID] = glslUtility::createProgram(
-    "shaders/boid.vert.glsl",
-    "shaders/boid.geom.glsl",
-    "shaders/boid.frag.glsl", attributeLocations, 2);
-    glUseProgram(program[PROG_BOID]);
+      "shaders/boid.vert.glsl", "shaders/boid.geom.glsl",
+      "shaders/boid.frag.glsl", attributeLocations, 2);
+  glUseProgram(program[PROG_BOID]);
 
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
-    }
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) {
-      glUniform3fv(location, 1, &cameraPosition[0]);
-    }
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) !=
+      -1) {
+    glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
   }
-
-  //====================================
-  // Main loop
-  //====================================
-  void runCUDA() {
-    // Map OpenGL buffer object for writing from CUDA on a single GPU
-    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
-    // use this buffer
-
-    float4 *dptr = NULL;
-    float *dptrVertPositions = NULL;
-    float *dptrVertVelocities = NULL;
-
-    cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions);
-    cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities);
-
-    // execute the kernel
-    #if UNIFORM_GRID && COHERENT_GRID
-    Boids::stepSimulationCoherentGrid(DT);
-    #elif UNIFORM_GRID
-    Boids::stepSimulationScatteredGrid(DT);
-    #else
-    Boids::stepSimulationNaive(DT);
-    #endif
-
-    #if VISUALIZE
-    Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
-    #endif
-    // unmap buffer object
-    cudaGLUnmapBufferObject(boidVBO_positions);
-    cudaGLUnmapBufferObject(boidVBO_velocities);
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) !=
+      -1) {
+    glUniform3fv(location, 1, &cameraPosition[0]);
   }
+}
 
-  void mainLoop() {
-    double fps = 0;
-    double timebase = 0;
-    int frame = 0;
-
-    Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
-                       // your CUDA development setup is ready to go.
+//====================================
+// Main loop
+//====================================
+void runCUDA() {
+  // Map OpenGL buffer object for writing from CUDA on a single GPU
+  // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not
+  // use this buffer
+
+  float4 *dptr              = NULL;
+  float *dptrVertPositions  = NULL;
+  float *dptrVertVelocities = NULL;
+
+  cudaGLMapBufferObject((void **)&dptrVertPositions, boidVBO_positions);
+  cudaGLMapBufferObject((void **)&dptrVertVelocities, boidVBO_velocities);
+
+// execute the kernel
+#if UNIFORM_GRID && COHERENT_GRID
+  Boids::stepSimulationCoherentGrid(DT);
+#elif UNIFORM_GRID
+  Boids::stepSimulationScatteredGrid(DT);
+#else
+  Boids::stepSimulationNaive(DT);
+#endif
+
+#if VISUALIZE
+  Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities);
+#endif
+  // unmap buffer object
+  cudaGLUnmapBufferObject(boidVBO_positions);
+  cudaGLUnmapBufferObject(boidVBO_velocities);
+}
 
-    while (!glfwWindowShouldClose(window)) {
-      glfwPollEvents();
+void mainLoop() {
+  double fps      = 0;
+  double timebase = 0;
+  int frame       = 0;
 
-      frame++;
-      double time = glfwGetTime();
+  Boids::unitTest();  // LOOK-1.2 We run some basic example code to make sure
+                      // your CUDA development setup is ready to go.
 
-      if (time - timebase > 1.0) {
-        fps = frame / (time - timebase);
-        timebase = time;
-        frame = 0;
-      }
+  while (!glfwWindowShouldClose(window)) {
+    glfwPollEvents();
 
-      runCUDA();
+    frame++;
+    double time = glfwGetTime();
 
-      std::ostringstream ss;
-      ss << "[";
-      ss.precision(1);
-      ss << std::fixed << fps;
-      ss << " fps] " << deviceName;
-      glfwSetWindowTitle(window, ss.str().c_str());
+    if (time - timebase > 1.0) {
+      fps      = frame / (time - timebase);
+      timebase = time;
+      frame    = 0;
+    }
 
-      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    runCUDA();
 
-      #if VISUALIZE
-      glUseProgram(program[PROG_BOID]);
-      glBindVertexArray(boidVAO);
-      glPointSize((GLfloat)pointSize);
-      glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
-      glPointSize(1.0f);
+    std::ostringstream ss;
+    ss << "[";
+    ss.precision(1);
+    ss << std::fixed << fps;
+    ss << " fps] " << deviceName;
+    glfwSetWindowTitle(window, ss.str().c_str());
 
-      glUseProgram(0);
-      glBindVertexArray(0);
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
-      glfwSwapBuffers(window);
-      #endif
-    }
-    glfwDestroyWindow(window);
-    glfwTerminate();
-  }
+#if VISUALIZE
+    glUseProgram(program[PROG_BOID]);
+    glBindVertexArray(boidVAO);
+    glPointSize((GLfloat)pointSize);
+    glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0);
+    glPointSize(1.0f);
 
+    glUseProgram(0);
+    glBindVertexArray(0);
 
-  void errorCallback(int error, const char *description) {
-    fprintf(stderr, "error %d: %s\n", error, description);
+    glfwSwapBuffers(window);
+#endif
   }
+  glfwDestroyWindow(window);
+  glfwTerminate();
+}
 
-  void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) {
-    if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
-      glfwSetWindowShouldClose(window, GL_TRUE);
-    }
-  }
+void errorCallback(int error, const char *description) {
+  fprintf(stderr, "error %d: %s\n", error, description);
+}
 
-  void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) {
-    leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS);
-    rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS);
+void keyCallback(GLFWwindow *window, int key, int scancode, int action,
+                 int mods) {
+  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
+    glfwSetWindowShouldClose(window, GL_TRUE);
   }
+}
 
-  void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) {
-    if (leftMousePressed) {
-      // compute new camera parameters
-      phi += (xpos - lastX) / width;
-      theta -= (ypos - lastY) / height;
-      theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
-      updateCamera();
-    }
-    else if (rightMousePressed) {
-      zoom += (ypos - lastY) / height;
-      zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
-      updateCamera();
-    }
+void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods) {
+  leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS);
+  rightMousePressed =
+      (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS);
+}
 
-	lastX = xpos;
-	lastY = ypos;
+void mousePositionCallback(GLFWwindow *window, double xpos, double ypos) {
+  if (leftMousePressed) {
+    // compute new camera parameters
+    phi += (xpos - lastX) / width;
+    theta -= (ypos - lastY) / height;
+    theta = std::fmax(0.01f, std::fmin(theta, 3.14f));
+    updateCamera();
+  } else if (rightMousePressed) {
+    zoom += (ypos - lastY) / height;
+    zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f));
+    updateCamera();
   }
 
-  void updateCamera() {
-    cameraPosition.x = zoom * sin(phi) * sin(theta);
-    cameraPosition.z = zoom * cos(theta);
-    cameraPosition.y = zoom * cos(phi) * sin(theta);
-    cameraPosition += lookAt;
+  lastX = xpos;
+  lastY = ypos;
+}
 
-    projection = glm::perspective(fovy, float(width) / float(height), zNear, zFar);
-    glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1));
-    projection = projection * view;
+void updateCamera() {
+  cameraPosition.x = zoom * sin(phi) * sin(theta);
+  cameraPosition.z = zoom * cos(theta);
+  cameraPosition.y = zoom * cos(phi) * sin(theta);
+  cameraPosition += lookAt;
 
-    GLint location;
+  projection =
+      glm::perspective(fovy, float(width) / float(height), zNear, zFar);
+  glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1));
+  projection     = projection * view;
 
-    glUseProgram(program[PROG_BOID]);
-    if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) {
-      glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
-    }
+  GLint location;
+
+  glUseProgram(program[PROG_BOID]);
+  if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) !=
+      -1) {
+    glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]);
   }
+}
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..201de19 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -1,53 +1,56 @@
 #pragma once
 
-#include <iostream>
-#include <cstdlib>
-#include <string>
-#include <sstream>
-#include <fstream>
 #include <GL/glew.h>
 #include <GLFW/glfw3.h>
-#include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
+#include <cuda_runtime.h>
+
+#include <cstdlib>
+#include <fstream>
 #include <glm/glm.hpp>
 #include <glm/gtc/matrix_transform.hpp>
-#include "utilityCore.hpp"
+#include <iostream>
+#include <sstream>
+#include <string>
+
 #include "glslUtility.hpp"
 #include "kernel.h"
+#include "utilityCore.hpp"
 
 //====================================
 // GL Stuff
 //====================================
 
-GLuint positionLocation = 0;   // Match results from glslUtility::createProgram.
-GLuint velocitiesLocation = 1; // Also see attribtueLocations below.
-const char *attributeLocations[] = { "Position", "Velocity" };
+GLuint positionLocation = 0;  // Match results from glslUtility::createProgram.
+GLuint velocitiesLocation        = 1;  // Also see attribtueLocations below.
+const char *attributeLocations[] = {"Position", "Velocity"};
 
-GLuint boidVAO = 0;
-GLuint boidVBO_positions = 0;
+GLuint boidVAO            = 0;
+GLuint boidVBO_positions  = 0;
 GLuint boidVBO_velocities = 0;
-GLuint boidIBO = 0;
+GLuint boidIBO            = 0;
 GLuint displayImage;
 GLuint program[2];
 
 const unsigned int PROG_BOID = 0;
 
-const float fovy = (float) (PI / 4);
+const float fovy  = (float)(PI / 4);
 const float zNear = 0.10f;
-const float zFar = 10.0f;
+const float zFar  = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
-int pointSize = 2;
+// Doubled for high DPI display.
+int width     = 1280 * 2;
+int height    = 720 * 2;
+int pointSize = 2 * 2;
 
 // For camera controls
-bool leftMousePressed = false;
+bool leftMousePressed  = false;
 bool rightMousePressed = false;
 double lastX;
 double lastY;
-float theta = 1.22f;
-float phi = -0.70f;
-float zoom = 4.0f;
+float theta      = 1.22f;
+float phi        = -0.70f;
+float zoom       = 4.0f;
 glm::vec3 lookAt = glm::vec3(0.0f, 0.0f, 0.0f);
 glm::vec3 cameraPosition;
 
@@ -59,16 +62,17 @@ glm::mat4 projection;
 
 const char *projectName;
 
-int main(int argc, char* argv[]);
+int main(int argc, char *argv[]);
 
 //====================================
 // Main loop
 //====================================
 void mainLoop();
 void errorCallback(int error, const char *description);
-void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods);
-void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods);
-void mousePositionCallback(GLFWwindow* window, double xpos, double ypos);
+void keyCallback(GLFWwindow *window, int key, int scancode, int action,
+                 int mods);
+void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods);
+void mousePositionCallback(GLFWwindow *window, double xpos, double ypos);
 void updateCamera();
 void runCUDA();
 
diff --git a/src/print_glm.cpp b/src/print_glm.cpp
new file mode 100644
index 0000000..d309234
--- /dev/null
+++ b/src/print_glm.cpp
@@ -0,0 +1,28 @@
+#include "print_glm.hpp"
+
+std::ostream &operator<<(std::ostream &o, const glm::vec4 &v) {
+  o << std::setprecision(4) << "[" << std::setw(7) << v[0] << ", "
+    << std::setw(7) << v[1] << ", " << std::setw(7) << v[2] << ", "
+    << std::setw(7) << v[3] << "]";
+  return o;
+}
+
+std::ostream &operator<<(std::ostream &o, const glm::vec3 &v) {
+  o << std::setprecision(4) << "[" << std::setw(7) << v[0] << ", "
+    << std::setw(7) << v[1] << ", " << std::setw(7) << v[2] << "]";
+  return o;
+}
+
+std::ostream &operator<<(std::ostream &o, const glm::mat4 &m) {
+  o << "\n[" << glm::row(m, 0) << "\n " << row(m, 1) << "\n " << row(m, 2)
+    << "\n " << row(m, 3) << "]";
+
+  return o;
+}
+
+std::ostream &operator<<(std::ostream &o, const glm::quat &q) {
+  o << std::setprecision(4) << "[" << std::setw(7) << q[0] << ", "
+    << std::setw(7) << q[1] << ", " << std::setw(7) << q[2] << ", "
+    << std::setw(7) << q[3] << "]";
+  return o;
+}
diff --git a/src/print_glm.hpp b/src/print_glm.hpp
new file mode 100644
index 0000000..6afaa06
--- /dev/null
+++ b/src/print_glm.hpp
@@ -0,0 +1,17 @@
+#ifndef __PRINT_GLM_HPP__
+#define __PRINT_GLM_HPP__
+
+#define GLM_FORCE_RADIANS
+
+#include <glm/glm.hpp>
+#include <glm/gtc/matrix_access.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <iomanip>
+#include <iostream>
+
+std::ostream &operator<<(std::ostream &o, const glm::vec4 &v);
+std::ostream &operator<<(std::ostream &o, const glm::vec3 &v);
+std::ostream &operator<<(std::ostream &o, const glm::mat4 &m);
+std::ostream &operator<<(std::ostream &o, const glm::quat &q);
+
+#endif  // __PRINT_GLM_HPP__
diff --git a/src/utilityCore.cpp b/src/utilityCore.cpp
index 70f4eea..9e2ad8c 100644
--- a/src/utilityCore.cpp
+++ b/src/utilityCore.cpp
@@ -1,162 +1,174 @@
 /**
  * @file      utilityCore.cpp
- * @brief     UTILITYCORE: A collection/kitchen sink of generally useful functions
+ * @brief     UTILITYCORE: A collection/kitchen sink of generally useful
+ * functions
  * @authors   Yining Karl Li
  * @date      2012
  * @copyright Yining Karl Li
  */
 
-#include <iostream>
+#include "utilityCore.hpp"
+
 #include <cstdio>
-#include <glm/gtc/matrix_transform.hpp>
 #include <glm/gtc/matrix_inverse.hpp>
-#include "utilityCore.hpp"
+#include <glm/gtc/matrix_transform.hpp>
+#include <iostream>
 
 float utilityCore::clamp(float f, float min, float max) {
-    if (f < min) {
-        return min;
-    } else if (f > max) {
-        return max;
-    } else {
-        return f;
-    }
+  if (f < min) {
+    return min;
+  } else if (f > max) {
+    return max;
+  } else {
+    return f;
+  }
 }
 
-bool utilityCore::replaceString(std::string& str, const std::string& from, const std::string& to) {
-    size_t start_pos = str.find(from);
-    if (start_pos == std::string::npos) {
-        return false;
-    }
-    str.replace(start_pos, from.length(), to);
-    return true;
+bool utilityCore::replaceString(std::string& str, const std::string& from,
+                                const std::string& to) {
+  size_t start_pos = str.find(from);
+  if (start_pos == std::string::npos) {
+    return false;
+  }
+  str.replace(start_pos, from.length(), to);
+  return true;
 }
 
 std::string utilityCore::convertIntToString(int number) {
-    std::stringstream ss;
-    ss << number;
-    return ss.str();
+  std::stringstream ss;
+  ss << number;
+  return ss.str();
 }
 
 glm::vec3 utilityCore::clampRGB(glm::vec3 color) {
-    if (color[0] < 0) {
-        color[0] = 0;
-    } else if (color[0] > 255) {
-        color[0] = 255;
-    }
-    if (color[1] < 0) {
-        color[1] = 0;
-    } else if (color[1] > 255) {
-        color[1] = 255;
-    }
-    if (color[2] < 0) {
-        color[2] = 0;
-    } else if (color[2] > 255) {
-        color[2] = 255;
-    }
-    return color;
+  if (color[0] < 0) {
+    color[0] = 0;
+  } else if (color[0] > 255) {
+    color[0] = 255;
+  }
+  if (color[1] < 0) {
+    color[1] = 0;
+  } else if (color[1] > 255) {
+    color[1] = 255;
+  }
+  if (color[2] < 0) {
+    color[2] = 0;
+  } else if (color[2] > 255) {
+    color[2] = 255;
+  }
+  return color;
 }
 
 bool utilityCore::epsilonCheck(float a, float b) {
-    if (fabs(fabs(a) - fabs(b)) < EPSILON) {
-        return true;
-    } else {
-        return false;
-    }
+  if (fabs(fabs(a) - fabs(b)) < EPSILON) {
+    return true;
+  } else {
+    return false;
+  }
 }
 
-void utilityCore::printCudaMat4(const cudaMat4 &m) {
-    utilityCore::printVec4(m.x);
-    utilityCore::printVec4(m.y);
-    utilityCore::printVec4(m.z);
-    utilityCore::printVec4(m.w);
+void utilityCore::printCudaMat4(const cudaMat4& m) {
+  utilityCore::printVec4(m.x);
+  utilityCore::printVec4(m.y);
+  utilityCore::printVec4(m.z);
+  utilityCore::printVec4(m.w);
 }
 
-glm::mat4 utilityCore::buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation, glm::vec3 scale) {
-    glm::mat4 translationMat = glm::translate(glm::mat4(), translation);
-    glm::mat4 rotationMat = glm::rotate(glm::mat4(), rotation.x, glm::vec3(1, 0, 0));
-    rotationMat = rotationMat * glm::rotate(glm::mat4(), rotation.y, glm::vec3(0, 1, 0));
-    rotationMat = rotationMat * glm::rotate(glm::mat4(), rotation.z, glm::vec3(0, 0, 1));
-    glm::mat4 scaleMat = glm::scale(glm::mat4(), scale);
-    return translationMat * rotationMat * scaleMat;
+glm::mat4 utilityCore::buildTransformationMatrix(glm::vec3 translation,
+                                                 glm::vec3 rotation,
+                                                 glm::vec3 scale) {
+  glm::mat4 translationMat = glm::translate(glm::mat4(), translation);
+  glm::mat4 rotationMat =
+      glm::rotate(glm::mat4(), rotation.x, glm::vec3(1, 0, 0));
+  rotationMat =
+      rotationMat * glm::rotate(glm::mat4(), rotation.y, glm::vec3(0, 1, 0));
+  rotationMat =
+      rotationMat * glm::rotate(glm::mat4(), rotation.z, glm::vec3(0, 0, 1));
+  glm::mat4 scaleMat = glm::scale(glm::mat4(), scale);
+  return translationMat * rotationMat * scaleMat;
 }
 
-cudaMat4 utilityCore::glmMat4ToCudaMat4(const glm::mat4 &a) {
-    cudaMat4 m;
-    glm::mat4 aTr = glm::transpose(a);
-    m.x = aTr[0];
-    m.y = aTr[1];
-    m.z = aTr[2];
-    m.w = aTr[3];
-    return m;
+cudaMat4 utilityCore::glmMat4ToCudaMat4(const glm::mat4& a) {
+  cudaMat4 m;
+  glm::mat4 aTr = glm::transpose(a);
+  m.x           = aTr[0];
+  m.y           = aTr[1];
+  m.z           = aTr[2];
+  m.w           = aTr[3];
+  return m;
 }
 
-glm::mat4 utilityCore::cudaMat4ToGlmMat4(const cudaMat4 &a) {
-    glm::mat4 m;
-    m[0] = a.x;
-    m[1] = a.y;
-    m[2] = a.z;
-    m[3] = a.w;
-    return glm::transpose(m);
+glm::mat4 utilityCore::cudaMat4ToGlmMat4(const cudaMat4& a) {
+  glm::mat4 m;
+  m[0] = a.x;
+  m[1] = a.y;
+  m[2] = a.z;
+  m[3] = a.w;
+  return glm::transpose(m);
 }
 
 std::vector<std::string> utilityCore::tokenizeString(std::string str) {
-    std::stringstream strstr(str);
-    std::istream_iterator<std::string> it(strstr);
-    std::istream_iterator<std::string> end;
-    std::vector<std::string> results(it, end);
-    return results;
+  std::stringstream strstr(str);
+  std::istream_iterator<std::string> it(strstr);
+  std::istream_iterator<std::string> end;
+  std::vector<std::string> results(it, end);
+  return results;
 }
 
 std::istream& utilityCore::safeGetline(std::istream& is, std::string& t) {
-    t.clear();
-
-    // The characters in the stream are read one-by-one using a std::streambuf.
-    // That is faster than reading them one-by-one using the std::istream.
-    // Code that uses streambuf this way must be guarded by a sentry object.
-    // The sentry object performs various tasks,
-    // such as thread synchronization and updating the stream state.
-
-    std::istream::sentry se(is, true);
-    std::streambuf* sb = is.rdbuf();
-
-    for (;;) {
-        int c = sb->sbumpc();
-        switch (c) {
-        case '\n':
-            return is;
-        case '\r':
-            if (sb->sgetc() == '\n') {
-                sb->sbumpc();
-            }
-            return is;
-        case EOF:
-            // Also handle the case when the last line has no line ending
-            if (t.empty()) {
-                is.setstate(std::ios::eofbit);
-            }
-            return is;
-        default:
-            t += (char)c;
+  t.clear();
+
+  // The characters in the stream are read one-by-one using a std::streambuf.
+  // That is faster than reading them one-by-one using the std::istream.
+  // Code that uses streambuf this way must be guarded by a sentry object.
+  // The sentry object performs various tasks,
+  // such as thread synchronization and updating the stream state.
+
+  std::istream::sentry se(is, true);
+  std::streambuf* sb = is.rdbuf();
+
+  for (;;) {
+    int c = sb->sbumpc();
+    switch (c) {
+      case '\n':
+        return is;
+      case '\r':
+        if (sb->sgetc() == '\n') {
+          sb->sbumpc();
+        }
+        return is;
+      case EOF:
+        // Also handle the case when the last line has no line ending
+        if (t.empty()) {
+          is.setstate(std::ios::eofbit);
         }
+        return is;
+      default:
+        t += (char)c;
     }
+  }
 
-    return is;
+  return is;
 }
 //-----------------------------
 //-------GLM Printers----------
 //-----------------------------
 
-void utilityCore::printMat4(const glm::mat4 &m) {
-    std::cout << m[0][0] << " " << m[1][0] << " " << m[2][0] << " " << m[3][0] << std::endl;
-    std::cout << m[0][1] << " " << m[1][1] << " " << m[2][1] << " " << m[3][1] << std::endl;
-    std::cout << m[0][2] << " " << m[1][2] << " " << m[2][2] << " " << m[3][2] << std::endl;
-    std::cout << m[0][3] << " " << m[1][3] << " " << m[2][3] << " " << m[3][3] << std::endl;
+void utilityCore::printMat4(const glm::mat4& m) {
+  std::cout << m[0][0] << " " << m[1][0] << " " << m[2][0] << " " << m[3][0]
+            << std::endl;
+  std::cout << m[0][1] << " " << m[1][1] << " " << m[2][1] << " " << m[3][1]
+            << std::endl;
+  std::cout << m[0][2] << " " << m[1][2] << " " << m[2][2] << " " << m[3][2]
+            << std::endl;
+  std::cout << m[0][3] << " " << m[1][3] << " " << m[2][3] << " " << m[3][3]
+            << std::endl;
 }
 
-void utilityCore::printVec4(const glm::vec4 &m) {
-    std::cout << m[0] << " " << m[1] << " " << m[2] << " " << m[3] << std::endl;
+void utilityCore::printVec4(const glm::vec4& m) {
+  std::cout << m[0] << " " << m[1] << " " << m[2] << " " << m[3] << std::endl;
 }
 
-void utilityCore::printVec3(const glm::vec3 &m) {
-    std::cout << m[0] << " " << m[1] << " " << m[2] << std::endl;
+void utilityCore::printVec3(const glm::vec3& m) {
+  std::cout << m[0] << " " << m[1] << " " << m[2] << std::endl;
 }
diff --git a/src/utilityCore.hpp b/src/utilityCore.hpp
index bf7c05f..1a186a0 100644
--- a/src/utilityCore.hpp
+++ b/src/utilityCore.hpp
@@ -1,6 +1,7 @@
 /**
  * @file      utilityCore.hpp
- * @brief     UTILITYCORE: A collection/kitchen sink of generally useful functions
+ * @brief     UTILITYCORE: A collection/kitchen sink of generally useful
+ * functions
  * @authors   Yining Karl Li
  * @date      2012
  * @copyright Yining Karl Li
@@ -9,40 +10,46 @@
 #pragma once
 
 #include <algorithm>
+#include <glm/glm.hpp>
 #include <istream>
-#include <ostream>
 #include <iterator>
+#include <ostream>
 #include <sstream>
 #include <string>
 #include <vector>
-#include <glm/glm.hpp>
+
 #include "cudaMat4.hpp"
+#include "print_glm.hpp"
 
-#define PI                          3.1415926535897932384626422832795028841971
-#define TWO_PI                      6.2831853071795864769252867665590057683943
-#define SQRT_OF_ONE_THIRD           0.5773502691896257645091487805019574556476
-#define E                           2.7182818284590452353602874713526624977572
-#define G                           6.67384e-11
-#define EPSILON                     .000000001
-#define ZERO_ABSORPTION_EPSILON     0.00001
+#define PI                      3.1415926535897932384626422832795028841971
+#define TWO_PI                  6.2831853071795864769252867665590057683943
+#define SQRT_OF_ONE_THIRD       0.5773502691896257645091487805019574556476
+#define E                       2.7182818284590452353602874713526624977572
+#define G                       6.67384e-11
+#define EPSILON                 .000000001
+#define ZERO_ABSORPTION_EPSILON 0.00001
 
 namespace utilityCore {
-extern float clamp(float f, float min, float max);
-extern bool replaceString(std::string& str, const std::string& from, const std::string& to);
-extern glm::vec3 clampRGB(glm::vec3 color);
-extern bool epsilonCheck(float a, float b);
-extern std::vector<std::string> tokenizeString(std::string str);
-extern cudaMat4 glmMat4ToCudaMat4(const glm::mat4 &a);
-extern glm::mat4 cudaMat4ToGlmMat4(const cudaMat4 &a);
-extern glm::mat4 buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation, glm::vec3 scale);
-extern void printCudaMat4(const cudaMat4 &m);
-extern std::string convertIntToString(int number);
-extern std::istream& safeGetline(std::istream& is, std::string& t); //Thanks to http://stackoverflow.com/a/6089413
+float clamp(float f, float min, float max);
+bool replaceString(std::string& str, const std::string& from,
+                   const std::string& to);
+glm::vec3 clampRGB(glm::vec3 color);
+bool epsilonCheck(float a, float b);
+std::vector<std::string> tokenizeString(std::string str);
+cudaMat4 glmMat4ToCudaMat4(const glm::mat4& a);
+glm::mat4 cudaMat4ToGlmMat4(const cudaMat4& a);
+glm::mat4 buildTransformationMatrix(glm::vec3 translation, glm::vec3 rotation,
+                                    glm::vec3 scale);
+void printCudaMat4(const cudaMat4& m);
+std::string convertIntToString(int number);
+std::istream& safeGetline(
+    std::istream& is,
+    std::string& t);  // Thanks to http://stackoverflow.com/a/6089413
 
 //-----------------------------
 //-------GLM Printers----------
 //-----------------------------
-extern void printMat4(const glm::mat4 &);
-extern void printVec4(const glm::vec4 &);
-extern void printVec3(const glm::vec3 &);
-}
+void printMat4(const glm::mat4&);
+void printVec4(const glm::vec4&);
+void printVec3(const glm::vec3&);
+}  // namespace utilityCore
diff --git a/tool/visualize.py b/tool/visualize.py
new file mode 100644
index 0000000..9bd0e34
--- /dev/null
+++ b/tool/visualize.py
@@ -0,0 +1,38 @@
+from matplotlib import pyplot as plt
+
+# number of boids
+num_boids = [5000, 10000, 15000, 20000, 30000, 50000, 100000, 200000, 500000]
+
+# simulation w/o visualization
+fps_naive_boids = [430, 212, 138, 100, 51, 20, 5.5, 1.5, 0.2]
+fps_scattered_boids = [1300, 1180, 920, 840, 600, 350, 167, 70, 5]
+fps_coherent_boids = [1880, 1540, 1460, 1260, 750, 540, 230, 105, 25]
+
+# simulation w/ visualization
+fps_naive_boids_viz = [411, 206, 138, 100.2, 50.8, 20, 5.6, 1.5, 0.2]
+fps_scattered_boids_viz = [1108, 1000, 830, 756, 565, 310, 155, 65.5, 5.2]
+fps_coherent_boids_viz = [1218, 1065, 1000, 905, 630, 480, 220, 118, 27]
+
+plt.figure()
+plt.plot(num_boids, fps_naive_boids, '-b',
+         num_boids, fps_scattered_boids, '.-b',
+         num_boids, fps_coherent_boids, '*-b',
+         num_boids, fps_naive_boids_viz, '--r',
+         num_boids, fps_scattered_boids_viz, '.--r',
+         num_boids, fps_coherent_boids_viz, '*--r')
+plt.legend(['Naive w/o sim', 'Scattered w/o sim', 'Coherent w/o sim',
+            'naive w/ sim', 'Scattered w/ sim', 'Coherent w/ sim'])
+plt.xlabel('Num of Boids')
+plt.ylabel('FPS')
+plt.title('FPS vs. Num of Boids Graph')
+plt.show()
+
+# Block size (num_boids = 500000)
+block_size = [128, 256, 512, 1024]
+fps_coherent_block_size_viz = [24, 26, 30, 40]
+plt.figure()
+plt.plot(block_size, fps_coherent_block_size_viz, '.-')
+plt.xlabel('Block size')
+plt.ylabel('FPS')
+plt.title('FPS vs. Block size Graph')
+plt.show()