diff --git a/README.md b/README.md index 0e38ddb..a2f3df1 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,86 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* MANVI AGARWAL + * [LinkedIn](https://www.linkedin.com/in/manviagarwal27/) +* Tested on: Windows 11, AMD Ryzen 5 7640HS @ 4.30GHz 16GB, GeForce RTX 4060 8GB(personal) -### (TODO: Your README) +### Performance Analysis + +This repository compares the implementation of scan or prefix scan algorithm on CPU and GPU. The code in `main.cpp` does correctness check as well as profiles individual implementations to compute the time. The output for the test code is: + +``` +**************** +** SCAN TESTS ** +**************** + [ 31 35 17 38 8 45 8 15 34 30 36 13 29 ... 7 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0012ms (std::chrono Measured) + [ 0 31 66 83 121 129 174 182 197 231 261 297 310 ... 6271 6278 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0013ms (std::chrono Measured) + [ 0 31 66 83 121 129 174 182 197 231 261 297 310 ... 6206 6242 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.640896ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.16288ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.245184ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.115168ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 16.073ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.03667ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 1 1 2 0 3 2 1 0 0 0 1 3 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0012ms (std::chrono Measured) + [ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0011ms (std::chrono Measured) + [ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0055ms (std::chrono Measured) + [ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.077728ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.083264ms (CUDA Measured) + passed +``` + +## Computation comparison + +![](img/ComparisonChart.png) + +## Insights + +The results favor CPU implementation heavily in terms of timing. There are a couple of reasons for GPU implementation to be much slower than CPU one. All these limitations can be explored to look for optimization for GPU implementation. + +Following are some of the reasons for GPU implementation to be much slower: + +**1. Expensive memory operations:** + For the implementation, I've used global memory and before calling GPU, the data is transferred from CPU memory to global memory which adds to the latency of GPU implementation. + + **2. Small computations and small data set:** + Since each thread doesn't have a lot of computation, the time it takes to transfer data from host to device and back ends up surpassing the reduction in time that parallelization of computation attempts to achieve. + +**3. Warp Partitioning:** +With each iteration, number of threads working reduce but the corresponding warps remain active. This causes divergent wraps and hence the GPU is under-utilized. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/ComparisonChart.png b/img/ComparisonChart.png new file mode 100644 index 0000000..07676cd Binary files /dev/null and b/img/ComparisonChart.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..2fce82c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 16; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..f314c9f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int thid = threadIdx.x + (blockIdx.x*blockDim.x); + bools[thid] = (idata[thid] != 0); + } /** @@ -33,6 +36,11 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int thid = threadIdx.x + blockIdx.x*blockDim.x; + if(bools[thid] == 1) + { + odata[indices[thid]] = idata[thid]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..ccca286 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + for(int i = 0; i> > (zeropadded_n, g_idata,offset); + offset *= 2; + } + cudaDeviceSynchronize(); + offset = zeropadded_n / 2; + for (int i = 0; i < ilog2ceil(n); i++) + { + scan_downstream << > > (zeropadded_n, g_idata,offset); + offset /= 2; + + } + cudaMemcpy(odata,g_idata,sizeof(int)*n,cudaMemcpyDeviceToHost); + cudaFree(g_idata); timer().endGpuTimer(); } @@ -33,8 +95,56 @@ namespace StreamCompaction { int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + int *g_bools = 0; + int *bools = 0; + int* indices; + int* g_idata; + int *g_odata; + int zeropadded_n = pow(2,ilog2ceil(n)); + printf("zeropadded = %d\n", zeropadded_n); + int threadsPerBlock = 256; + int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock; + int *temp_array = (int*)malloc(sizeof(int)*zeropadded_n); + cudaError_t result = cudaMalloc((void**)(&g_bools), zeropadded_n * sizeof(int)); + if (result != cudaSuccess) { + fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result)); + cudaFree(g_bools); + timer().endGpuTimer(); + return -1; + } + result = cudaMalloc((void**)(&g_idata), zeropadded_n * sizeof(int)); + cudaMemset(g_idata, 0, zeropadded_n*sizeof(int)); + if (result != cudaSuccess) { + fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result)); + cudaFree(g_bools); + cudaFree(g_idata); + timer().endGpuTimer(); + return -1; + } + cudaMemcpy(g_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMalloc((void**)&g_odata,sizeof(int)* zeropadded_n); + StreamCompaction::Common::kernMapToBoolean<<>>(zeropadded_n, g_bools, g_idata); + bools = (int*)malloc(zeropadded_n*sizeof(int)); + cudaDeviceSynchronize(); + cudaMemcpy(bools,g_bools,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost); + + result = cudaMalloc(&indices, zeropadded_n * sizeof(int)); timer().endGpuTimer(); - return -1; + scan(zeropadded_n, temp_array, bools); + + timer().startGpuTimer(); + cudaMemcpy(indices, temp_array, zeropadded_n * sizeof(int), cudaMemcpyHostToDevice); + + StreamCompaction::Common::kernScatter<<>>(zeropadded_n, g_odata,g_idata, g_bools, indices); + cudaMemcpy(odata,g_odata,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost); + cudaFree(g_bools); + cudaFree(indices); + cudaFree(g_odata); + cudaFree(g_idata); + free(bools); + timer().endGpuTimer(); + + return temp_array[zeropadded_n-1]; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..bac7481 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define block_size 256 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,56 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void scan_global(int n, int *odata, int *idata, int *temp,int offset,int pout) + { + int thid = threadIdx.x + (blockIdx.x * blockDim.x); + // Load input into global memory. + // This is exclusive scan, so shift right by one + // and set first element to 0 + int pin = 1 - pout; + if (thid >= offset) + temp[pout * n + thid] = temp[pin * n + thid - offset] + temp[pin* n + thid]; + else + temp[pout * n + thid] = temp[pin * n + thid]; + __syncthreads(); + odata[thid] = temp[pout * n + thid]; // write output + } + __global__ void shiftInput(int* idata, int* shifted_input) + { + int thid = threadIdx.x + (blockIdx.x * blockDim.x); + shifted_input[thid] = (thid > 0) ? idata[thid - 1] : 0; + __syncthreads(); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + int *g_odata,*g_idata,*temp; + int zeropadded_n = pow(2, ilog2ceil(n)); + cudaError_t result = cudaMalloc((void**)&g_idata, zeropadded_n * sizeof(int)); + result = cudaMalloc((void**)&g_odata,zeropadded_n*sizeof(int)); + result = cudaMalloc((void**)&temp,2 * zeropadded_n * sizeof(int)); + cudaMemcpy(g_idata,idata,sizeof(int)*n,cudaMemcpyHostToDevice); + int threadsPerBlock = 1024; + int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock; + + int offset = 1; + int pout = 0; + shiftInput<<>>(g_idata, temp); + for (int i = 0; i < ilog2ceil(n); i++) { + pout = 1 - pout; + scan_global<<>>(zeropadded_n,g_odata,g_idata,temp,offset,pout); + offset *= 2; + } + + cudaMemcpy(odata,g_odata,sizeof(int)*n,cudaMemcpyDeviceToHost); + for (int i = 0; i < 257; i++) + { + //printf("%d %d\n", idata[i], odata[i]); + } timer().endGpuTimer(); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..1b4db06 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -21,7 +21,14 @@ namespace StreamCompaction { timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::host_vector thrust_idata(idata, idata + n); + thrust::host_vector thrust_odata(odata, odata + n); + + thrust::device_vector thrust_dev_idata(idata,idata+n); + thrust::device_vector thrust_dev_odata(odata,odata+n); + + thrust::exclusive_scan(thrust_dev_idata.begin(), thrust_dev_idata.end(), thrust_dev_odata.begin()); + thrust::copy(thrust_dev_odata.begin(), thrust_dev_odata.end(), odata); timer().endGpuTimer(); } }