diff --git a/README.md b/README.md
index 110697c..120b56a 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,44 @@ CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+*  Fengkai Wu
+*  Tested on: Windows 7, i7-6700 @ 3.40GHz 16GB, Quadro K620 4095MB (Moore 100C Lab)
 
-### (TODO: Your README)
+## Results
+![result_img](https://github.com/wufk/Project3-CUDA-Path-Tracer/blob/master/img/final.PNG)
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+Features:
+
+1. Shading with BSDF evaluation
+
+2. Path termination using stream compaction
+
+3. Toggleable options to cache the first bounce and to sort path segments by material
+
+4. Refraction with Fresnel effects using Schlick's approximation
+
+5. Stochastic antialiasing
+
+## Analysis
+
+### Antialiasing
+
+Before:
+![nojitter_img](https://github.com/wufk/Project3-CUDA-Path-Tracer/blob/master/img/nonjitter.PNG)
+
+After jittering:
+![jitter_img](https://github.com/wufk/Project3-CUDA-Path-Tracer/blob/master/img/jitter.PNG)
+
+By adding a uniform random offset to each camera ray, the aliasing effect is removed. As you can see from the pictures, the edges of the cube and the wall are smoother.
+
+### Sorting materials
+![sort_img](https://github.com/wufk/Project3-CUDA-Path-Tracer/blob/master/img/sort.PNG)
+
+The ray/path arrays are sorted with respect to their materials. The sort is performed right after computing intersections. However, it increases the running time, primarily because of this additional operation. Making rays/paths contiguous in memory by sorting them by material does not seem to be a good choice here. The reason might be that each path is independent and the kernel does not access each pixel by material type.
+
+### Caching first bounce
+
+![cache_img](https://github.com/wufk/Project3-CUDA-Path-Tracer/blob/master/img/cache.PNG)
+
+The outcome of the first iteration of the path tracing is cached in device memory and reused for the subsequent bounces. The graph above shows that it does indeed increase performance, but at a constant rate. Reloading the cache for reuse also has a high cost.
 
diff --git a/img/cache.PNG b/img/cache.PNG
new file mode 100644
index 0000000..983f3aa
Binary files /dev/null and b/img/cache.PNG differ
diff --git a/img/final.PNG b/img/final.PNG
new file mode 100644
index 0000000..302828d
Binary files /dev/null and b/img/final.PNG differ
diff --git a/img/jitter.PNG b/img/jitter.PNG
new file mode 100644
index 0000000..b23e410
Binary files /dev/null and b/img/jitter.PNG differ
diff --git a/img/nonjitter.PNG b/img/nonjitter.PNG
new file mode 100644
index 0000000..1b03065
Binary files /dev/null and b/img/nonjitter.PNG differ
diff --git a/img/sort.PNG b/img/sort.PNG
new file mode 100644
index 0000000..4237b2d
Binary files /dev/null and b/img/sort.PNG differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff820..4c1aab2 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -48,6 +48,26 @@ REFR        0
 REFRIOR     0
 EMITTANCE   0
 
+// Transparent white
+MATERIAL 5
+RGB         .7 .6 .6
+SPECEX      0
+SPECRGB     .7 .6 .6
+REFL        0
+REFR        1.5
+REFRIOR     1.4
+EMITTANCE   0
+
+// Diffuse blue
+MATERIAL 6
+RGB         .1 .5 .9
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
 // Camera
 CAMERA
 RES         800 800
@@ -112,6 +132,38 @@ SCALE       .01 10 10
 OBJECT 6
 sphere
 material 4
-TRANS       -1 4 -1
+TRANS       -1 1.5 -1
 ROTAT       0 0 0
 SCALE       3 3 3
+
+// Sphere
+OBJECT 7
+sphere
+material 5
+TRANS       3 1.5 -1
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// right wall light
+OBJECT 8
+cube
+material 0
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .3 3 3
+
+// cube
+//OBJECT 9
+//cube
+//material 4
+//TRANS       0 1 -2
+//ROTAT       0 30 30
+//SCALE       2 2 2
+
+// cube
+OBJECT 9
+cube
+material 6
+TRANS       -3 0 1
+ROTAT       30 0 0
+SCALE       2 2 2
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..9e87850 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -41,6 +41,69 @@ glm::vec3 calculateRandomDirectionInHemisphere(
         + sin(around) * over * perpendicularDirection2;
 }
 
+// Schlick's approximation of the Fresnel reflectance for incidence cosine `costheta` at an n1/n2 interface.
+__host__ __device__ float schlick(float costheta, float n1, float n2) {
+	const float ratio = (n1 - n2) / (n1 + n2);
+	const float baseReflectance = ratio * ratio; // reflectance at normal incidence (costheta == 1)
+	return baseReflectance + (1 - baseReflectance) * pow((1 - costheta), 5);
+}
+
+// Mirror bounce: reflects the segment's ray about `normal`, restarts it just
+// off the surface at `intersect`, multiplies the path color by the material
+// color and consumes one bounce.
+__host__ __device__ void reflect(
+	PathSegment & pathSegment, glm::vec3 intersect, glm::vec3 &normal, const Material &m)
+{
+	const glm::vec3 mirrored = glm::normalize(glm::reflect(pathSegment.ray.direction, normal));
+	pathSegment.ray.direction = mirrored;
+	// Nudge the new origin along the outgoing direction, away from the hit point.
+	pathSegment.ray.origin = intersect + mirrored * 0.001f;
+	pathSegment.color *= m.color;
+	--pathSegment.remainingBounces;
+}
+
+// Dielectric surface interaction: picks reflection or refraction at random, weighted
+// by Schlick's Fresnel estimate, and restarts the ray just off `intersect`. The
+// outside medium is given index 1.0. `normal` is flipped in place when the ray
+// travels along it (cosTheta > 0). Consumes one bounce.
+__host__ __device__ void refract(
+	PathSegment & pathSegment,
+	glm::vec3 intersect,
+	glm::vec3 &normal,
+	const Material &m,
+	thrust::default_random_engine &rng)
+{
+	const float n1 = 1.0f;
+	const float n2 = m.indexOfRefraction;
+	const glm::vec3 incident = pathSegment.ray.direction;
+	const float cosTheta = glm::dot(incident, normal);
+
+	// eta = index on the incident side / index on the transmitted side.
+	float eta = n1 / n2;
+	if (cosTheta > .0f)
+	{
+		normal = -normal;
+		eta = n2 / n1;
+	}
+
+	thrust::uniform_real_distribution<float> u01(0, 1);
+	const bool fresnelReflect = u01(rng) < schlick(fabs(cosTheta), n1, n2);
+	glm::vec3 outgoing = fresnelReflect ? glm::vec3(0.0f) : glm::refract(incident, normal, eta);
+	if (fresnelReflect)
+	{
+		pathSegment.color *= m.color;
+	}
+	// glm::refract returns the zero vector on total internal reflection; normalizing
+	// that yields NaNs, so both that case and a Fresnel reflection take the mirror bounce.
+	if (fresnelReflect || glm::dot(outgoing, outgoing) <= 0.0f)
+	{
+		outgoing = glm::reflect(incident, normal);
+	}
+	pathSegment.ray.direction = glm::normalize(outgoing);
+	pathSegment.ray.origin = intersect + pathSegment.ray.direction * 0.001f;
+	pathSegment.remainingBounces--;
+}
+
 /**
  * Scatter a ray with some probabilities according to the material properties.
  * For example, a diffuse surface scatters in a cosine-weighted hemisphere.
@@ -70,10 +133,42 @@ __host__ __device__
 void scatterRay(
 		PathSegment & pathSegment,
         glm::vec3 intersect,
-        glm::vec3 normal,
+        glm::vec3 &normal,
         const Material &m,
         thrust::default_random_engine &rng) {
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
+
+	if (glm::dot(pathSegment.ray.direction, normal) > 0.0f && m.hasRefractive <= 0.001f)
+	{
+		pathSegment.color = glm::vec3(0.0f);
+		pathSegment.remainingBounces = 0;
+		return;
+	}
+	if (m.hasReflective > 0.0f)
+	{
+		reflect(pathSegment, intersect, normal, m);
+	}
+	else if (m.hasRefractive > 0.0f)
+	{
+		refract(pathSegment, intersect, normal, m, rng);
+	}
+	else if (m.emittance > 0.0f)
+	{
+		pathSegment.color *= m.color * m.emittance;
+		pathSegment.remainingBounces = 0;
+	}
+	else
+	{
+
+		//PathSegment temp = pathSegment;
+
+		pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
+		pathSegment.ray.direction = glm::normalize(pathSegment.ray.direction);
+		pathSegment.ray.origin = intersect + pathSegment.ray.direction * 0.001f;
+		pathSegment.color *= m.color;
+		pathSegment.remainingBounces--;
+	}
+
 }
diff --git a/src/intersections.h b/src/intersections.h
index 6f23872..58fef7c 100644
--- a/src/intersections.h
+++ b/src/intersections.h
@@ -136,9 +136,9 @@ __host__ __device__ float sphereIntersectionTest(Geom sphere, Ray r,
 
     intersectionPoint = multiplyMV(sphere.transform, glm::vec4(objspaceIntersection, 1.f));
     normal = glm::normalize(multiplyMV(sphere.invTranspose, glm::vec4(objspaceIntersection, 0.f)));
-    if (!outside) {
-        normal = -normal;
-    }
+    //if (!outside) {
+    //    normal = -normal;
+    //}
 
     return glm::length(r.origin - intersectionPoint);
 }
diff --git a/src/main.cpp b/src/main.cpp
index fe8e85e..169a49f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -23,6 +23,10 @@ Scene *scene;
 RenderState *renderState;
 int iteration;
 
+double totalTimeMs = 0.0f;
+float iterationTimeMs;
+double totalElapsedTimeMs = 0.0;
+
 int width;
 int height;
 
@@ -134,8 +138,21 @@ void runCuda() {
 
         // execute the kernel
         int frame = 0;
+		cudaEvent_t start, stop;
+		cudaEventCreate(&start);
+		cudaEventCreate(&stop);
+		cudaEventRecord(start);
+
         pathtrace(pbo_dptr, frame, iteration);
 
+		cudaEventRecord(stop);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&iterationTimeMs, start, stop);
+		totalElapsedTimeMs += iterationTimeMs;
+
+		if (iteration % 50 == 0) {
+			totalTimeMs = totalElapsedTimeMs;
+		}
         // unmap buffer object
         cudaGLUnmapBufferObject(pbo);
     } else {
diff --git a/src/main.h b/src/main.h
index fdb7d5d..cd1bb38 100644
--- a/src/main.h
+++ b/src/main.h
@@ -28,6 +28,8 @@ using namespace std;
 
 extern Scene* scene;
 extern int iteration;
+extern float iterationTimeMs;
+extern double totalTimeMs;
 
 extern int width;
 extern int height;
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..47d31a1 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -13,6 +13,11 @@
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
+#include "device_launch_parameters.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/scan.h>
 
 #define ERRORCHECK 1
 
@@ -73,6 +78,10 @@ static Geom * dev_geoms = NULL;
 static Material * dev_materials = NULL;
 static PathSegment * dev_paths = NULL;
 static ShadeableIntersection * dev_intersections = NULL;
+static bool * dev_flag = NULL;
+static int *dev_pathMaterials = nullptr;
+static PathSegment * dev_cachePaths = NULL;
+static ShadeableIntersection * dev_cacheIntersections = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
 
@@ -96,6 +105,13 @@ void pathtraceInit(Scene *scene) {
   	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
     // TODO: initialize any extra device memeory you need
+	cudaMalloc(&dev_flag, pixelcount * sizeof(bool));
+
+	cudaMalloc(&dev_cachePaths, pixelcount * sizeof(PathSegment));
+	cudaMalloc(&dev_cacheIntersections, pixelcount * sizeof(ShadeableIntersection));
+
+	cudaMalloc(&dev_pathMaterials, pixelcount * sizeof(unsigned));
+	cudaMemset(dev_pathMaterials, 0, pixelcount * sizeof(unsigned));
 
     checkCUDAError("pathtraceInit");
 }
@@ -107,6 +123,10 @@ void pathtraceFree() {
   	cudaFree(dev_materials);
   	cudaFree(dev_intersections);
     // TODO: clean up any extra device memory you created
+	cudaFree(dev_flag);
+	cudaFree(dev_pathMaterials);
+	cudaFree(dev_cachePaths);
+	cudaFree(dev_cacheIntersections);
 
     checkCUDAError("pathtraceFree");
 }
@@ -128,13 +148,19 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		int index = x + (y * cam.resolution.x);
 		PathSegment & segment = pathSegments[index];
 
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+		thrust::uniform_real_distribution<float> u01(0, 1);
+
 		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
 		// TODO: implement antialiasing by jittering the ray
+		float x_jitter = x + u01(rng);
+		float y_jitter = y + u01(rng);
+
 		segment.ray.direction = glm::normalize(cam.view
-			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
+			- cam.right * cam.pixelLength.x * ((float)x_jitter - (float)cam.resolution.x * 0.5f)
+			- cam.up * cam.pixelLength.y * ((float)y_jitter - (float)cam.resolution.y * 0.5f)
 			);
 
 		segment.pixelIndex = index;
@@ -153,6 +179,7 @@ __global__ void computeIntersections(
 	, Geom * geoms
 	, int geoms_size
 	, ShadeableIntersection * intersections
+	, int * pathMaterials
 	)
 {
 	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -201,6 +228,7 @@ __global__ void computeIntersections(
 		if (hit_geom_index == -1)
 		{
 			intersections[path_index].t = -1.0f;
+			pathMaterials[path_index] = -1;
 		}
 		else
 		{
@@ -208,6 +236,8 @@ __global__ void computeIntersections(
 			intersections[path_index].t = t_min;
 			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
 			intersections[path_index].surfaceNormal = normal;
+
+			pathMaterials[path_index] = geoms[hit_geom_index].materialid;
 		}
 	}
 }
@@ -227,11 +257,14 @@ __global__ void shadeFakeMaterial (
 	, ShadeableIntersection * shadeableIntersections
 	, PathSegment * pathSegments
 	, Material * materials
+	, bool *flag
+	, glm::vec3 *image
 	)
 {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < num_paths)
   {
+	  PathSegment &refpath = pathSegments[idx];
     ShadeableIntersection intersection = shadeableIntersections[idx];
     if (intersection.t > 0.0f) { // if the intersection exists...
       // Set up the RNG
@@ -243,24 +276,36 @@ __global__ void shadeFakeMaterial (
       Material material = materials[intersection.materialId];
       glm::vec3 materialColor = material.color;
 
+	  if (refpath.remainingBounces)
+	  {
+		  scatterRay(refpath, intersection.t * refpath.ray.direction + refpath.ray.origin, intersection.surfaceNormal, material, rng);
+		  flag[idx] = true;
+	  }
+	  else
+	  {
+		  flag[idx] = false;
+		  image[refpath.pixelIndex] += refpath.color;
+	  }
+
       // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        pathSegments[idx].color *= (materialColor * material.emittance);
-      }
+      //if (material.emittance > 0.0f) {
+      //  pathSegments[idx].color *= (materialColor * material.emittance);
+      //}
       // Otherwise, do some pseudo-lighting computation. This is actually more
       // like what you would expect from shading in a rasterizer like OpenGL.
       // TODO: replace this! you should be able to start with basically a one-liner
-      else {
-        float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
-        pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
-        pathSegments[idx].color *= u01(rng); // apply some noise because why not
-      }
+      //else {
+      //  float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
+      //  pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
+      //  pathSegments[idx].color *= u01(rng); // apply some noise because why not
+      //}
     // If there was no intersection, color the ray black.
     // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
     // used for opacity, in which case they can indicate "no opacity".
     // This can be useful for post-processing and image compositing.
     } else {
-      pathSegments[idx].color = glm::vec3(0.0f);
+      //pathSegments[idx].color = glm::vec3(0.0f);
+	  flag[idx] = false;
     }
   }
 }
@@ -277,6 +322,28 @@ __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterati
 	}
 }
 
+// Stream compaction: drops, in place, every segment of `paths` whose `flag` entry is false and
+// stores the survivor count in `num_paths` (taken from remove_if's returned end; no second pass).
+void compactPath(int& num_paths, PathSegment *paths, bool *flag) {
+	thrust::device_ptr<bool> thrust_flags(flag);
+	thrust::device_ptr<PathSegment> thrust_paths(paths);
+	num_paths = static_cast<int>(thrust::remove_if(thrust_paths, thrust_paths + num_paths, thrust_flags, thrust::logical_not<bool>()) - thrust_paths);
+}
+
+// Reorders the first num_path entries of dev_paths and dev_intersections by the ids in dev_pathMaterials (which is sorted too).
+void sortByMaterials(int num_path) {
+	thrust::device_ptr<int> materialKeys(dev_pathMaterials);
+	thrust::device_ptr<PathSegment> pathsBegin(dev_paths);
+	thrust::device_ptr<ShadeableIntersection> isectBegin(dev_intersections);
+	thrust::device_vector<int> order(num_path);
+	thrust::sequence(order.begin(), order.end());
+	thrust::stable_sort_by_key(materialKeys, materialKeys + num_path, order.begin());
+	thrust::device_vector<PathSegment> pathSnapshot(pathsBegin, pathsBegin + num_path);
+	thrust::device_vector<ShadeableIntersection> isectSnapshot(isectBegin, isectBegin + num_path);
+	thrust::gather(order.begin(), order.end(), pathSnapshot.begin(), pathsBegin);
+	thrust::gather(order.begin(), order.end(), isectSnapshot.begin(), isectBegin);
+}
+
 /**
  * Wrapper for the __global__ call that sets up the kernel calls and does a ton
  * of memory management
@@ -293,7 +360,7 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
             (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
 	// 1D block for path tracing
-	const int blockSize1d = 128;
+	const int blockSize1d = 512;
 
     ///////////////////////////////////////////////////////////////////////////
 
@@ -325,69 +392,101 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
     //   for you.
 
     // TODO: perform one iteration of path tracing
-
-	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
-	checkCUDAError("generate camera ray");
-
+	cudaMemset(dev_pathMaterials, 0, pixelcount * sizeof(unsigned));
 	int depth = 0;
 	PathSegment* dev_path_end = dev_paths + pixelcount;
 	int num_paths = dev_path_end - dev_paths;
+	//if (iter == 1) {
+		generateRayFromCamera << <blocksPerGrid2d, blockSize2d >> > (cam, iter, traceDepth, dev_paths);
+		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+		//dim3 numblocksPathSegmentTracing = (pixelcount + blockSize1d - 1) / blockSize1d;
+		//computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+		//	depth
+		//	, num_paths
+		//	, dev_paths
+		//	, dev_geoms
+		//	, hst_scene->geoms.size()
+		//	, dev_intersections
+		//	, dev_pathMaterials
+		//	);
+
+		//cudaMemcpy(dev_cachePaths, dev_paths, pixelcount * sizeof(PathSegment), cudaMemcpyDeviceToDevice);
+		//cudaMemcpy(dev_cacheIntersections, dev_intersections, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+
+		checkCUDAError("generate camera ray");
+	//}
+	//else
+	//{
+	//	cudaMemcpy(dev_paths, dev_cachePaths, pixelcount * sizeof(PathSegment), cudaMemcpyDeviceToDevice);
+	//	cudaMemcpy(dev_intersections, dev_cacheIntersections, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+	//}
+
+
 
 	// --- PathSegment Tracing Stage ---
 	// Shoot ray into scene, bounce between objects, push shading chunks
 
-  bool iterationComplete = false;
+	bool iterationComplete = false;
 	while (!iterationComplete) {
-
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
+		// clean shading chunks
+		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+		// tracing
+		dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+		computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+			depth
+			, num_paths
+			, dev_paths
+			, dev_geoms
+			, hst_scene->geoms.size()
+			, dev_intersections
+			, dev_pathMaterials
+			);
+		checkCUDAError("trace one bounce");
+		cudaDeviceSynchronize();
+		depth++;
+
+
+		// TODO:
+		// --- Shading Stage ---
+		// Shade path segments based on intersections and generate new rays by
+		// evaluating the BSDF.
+		// Start off with just a big kernel that handles all the different
+		// materials you have in the scenefile.
+		// TODO: compare between directly shading the path segments and shading
+		// path segments that have been reshuffled to be contiguous in memory.
+
+		//sortByMaterials(num_paths);
+
+		shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
+			iter,
+			num_paths,
+			dev_intersections,
+			dev_paths,
+			dev_materials,
+			dev_flag,
+			dev_image
 		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
-	depth++;
-
-
-	// TODO:
-	// --- Shading Stage ---
-	// Shade path segments based on intersections and generate new rays by
-  // evaluating the BSDF.
-  // Start off with just a big kernel that handles all the different
-  // materials you have in the scenefile.
-  // TODO: compare between directly shading the path segments and shading
-  // path segments that have been reshuffled to be contiguous in memory.
-
-  shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = true; // TODO: should be based off stream compaction results.
+
+		compactPath(num_paths, dev_paths, dev_flag);
+
+		if (!num_paths) iterationComplete = true;
+		//iterationComplete = true; // TODO: should be based off stream compaction results.
 	}
 
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+	// Assemble this iteration and apply it to the image
+	dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
 
-    ///////////////////////////////////////////////////////////////////////////
+	///////////////////////////////////////////////////////////////////////////
 
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
+	// Send results to OpenGL buffer for rendering
+	sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
 
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+	// Retrieve image from GPU
+	cudaMemcpy(hst_scene->state.image.data(), dev_image,
+			pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-    checkCUDAError("pathtrace");
+	checkCUDAError("pathtrace");
 }
diff --git a/src/preview.cpp b/src/preview.cpp
index 4eb0bc1..0ae7aae 100644
--- a/src/preview.cpp
+++ b/src/preview.cpp
@@ -173,7 +173,8 @@ void mainLoop() {
         glfwPollEvents();
         runCuda();
 
-        string title = "CIS565 Path Tracer | " + utilityCore::convertIntToString(iteration) + " Iterations";
+        string title = "CIS565 Path Tracer | " + utilityCore::convertIntToString(iteration) + " Iterations "
+			+ utilityCore::convertIntToString(iterationTimeMs) + " ms " + utilityCore::convertIntToString(totalTimeMs) + " total ms";
         glfwSetWindowTitle(window, title.c_str());
 
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);