diff --git a/Performance Data Collection.xlsx b/Performance Data Collection.xlsx new file mode 100644 index 00000000..a2edaaeb Binary files /dev/null and b/Performance Data Collection.xlsx differ diff --git a/README.md b/README.md index 0e38ddb1..79e090eb 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,96 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE +* Yuning Wen * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Tested on: Windows 11, i9-12900H @ 2.50GHz 16GB, NVIDIA GeForce RTX 3060 Laptop GPU (Personal Laptop) -### (TODO: Your README) +### README -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +#### Description +* CPU Scan, Stream Compaction + +* Naive GPU Scan Algorithm + +* Work-Efficient GPU Scan & Stream Compaction + +* Thrust test implemented + +#### Performance Analysis (Answering Questions) + +* Here are the two graphs for the performances of different GPU scan algorithms (Naive, Work-Efficient, and Thrust) and the serial CPU version of Scan. + +![power of 2](./img/po2.png) +![non power of 2](./img/non%20po2.png) + +* I have tried to use Nsight Systems only to check if I may find something. But it seems like cuda is mostly use by my own function, which means thrust may just used little stuff in cuda + +![general](./img/nsys%20general.png) + + But there is some tiny things inside and that might be used by thrust + +![focus](./img/nsys%20focus.png) + + As a result, I guess thurst functions actually make a great use of parallel calculation in CPU, or have excellent algorithm that runs in just hundred microseconds and finishes the use of CUDA. + +#### Bottlenecks + +* From the nsight charts above, I think the current bottleneck is that the efficiency of memory usage is too low, so if I may apply shared memory to my algorithm, the efficiency may then increase. + +#### Result of running + +``` +**************** +** SCAN TESTS ** +**************** + [ 32 7 4 15 27 47 43 11 5 18 44 41 47 ... 30 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 20.1042ms (std::chrono Measured) + [ 0 32 39 43 58 85 132 175 186 191 209 253 294 ... 205520488 205520518 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 19.9824ms (std::chrono Measured) + [ 0 32 39 43 58 85 132 175 186 191 209 253 294 ... 205520437 205520458 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 7.2087ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 6.83536ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 2.68765ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 2.97347ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.885536ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.02086ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 1 2 1 1 1 3 3 3 2 0 3 1 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 29.0733ms (std::chrono Measured) + [ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 45.5551ms (std::chrono Measured) + [ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 83.1376ms (std::chrono Measured) + [ 2 1 2 1 1 1 3 3 3 2 3 1 2 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 2.79856ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 2.45539ms (CUDA Measured) + passed +``` \ No newline at end of file diff --git a/img/non po2.png b/img/non po2.png new file mode 100644 index 00000000..dd3fa95e Binary files /dev/null and b/img/non po2.png differ diff --git a/img/nsys focus.png b/img/nsys focus.png new file mode 100644 index 00000000..1f466fe3 Binary files /dev/null and b/img/nsys focus.png differ diff --git a/img/nsys general.png b/img/nsys general.png new file mode 100644 index 00000000..f5fbe3e6 Binary files /dev/null and b/img/nsys general.png differ diff --git a/img/po2.png b/img/po2.png new file mode 100644 index 00000000..4adcd365 Binary files /dev/null and b/img/po2.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..5d515b63 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 23; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..506269e2 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,17 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (idata[index] == 0) { + bools[index] = 0; + } + else { + bools[index] = 1; + } } /** @@ -33,6 +44,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..d093abbc 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +35,15 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +54,34 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // map + int* map = new int[n]; + for (int i = 0; i < n; i++) { + if (idata[i] == 0) { + map[i] = 0; + } + else { + map[i] = 1; + } + } + + // scan + int* scan = new int[n]; + scan[0] = 0; + for (int i = 1; i < n; i++) { + scan[i] = scan[i - 1] + map[i - 1]; + } + + // scatter + int count = 0; + for (int i = 0; i < n; i++) { + if (map[i] != 0) { + odata[scan[i]] = idata[i]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..734fe611 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,64 @@ namespace StreamCompaction { return timer; } + // up sweep function + __global__ void upSweep(int N, int* data, int pow) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int end = (index + 1) * (1 << (pow + 1)) - 1; + if (end >= N) { + return; + } + int start = end - (1 << pow); + data[end] += data[start]; + } + + // down sweep function + __global__ void downSweep(int N, int* data, int pow) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int end = (index + 1) * (1 << (pow + 1)) - 1; + if (end >= N) { + return; + } + int start = end - (1 << pow); + + int temp = data[end]; + data[end] += data[start]; + data[start] = temp; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO - timer().endGpuTimer(); + int* dev_tree; + + int depth = ilog2ceil(n); + int N = 1 << depth; + + cudaMalloc((void**)&dev_tree, N * sizeof(int)); + cudaMemcpy(dev_tree, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); // not to include any initial/final memory operations + + // up sweep + for (int d = 0; d < depth; d++) { + dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize); + upSweep << > > (N, dev_tree, d); + } + + // down sweep + //dev_tree[n - 1] = 0; + cudaMemset(dev_tree + N - 1, 0, sizeof(int)); + for (int d = depth - 1; d >= 0; d--) { + dim3 curGrid((N / (1 << (d + 1)) + blockSize - 1) / blockSize); + downSweep << > > (N, dev_tree, d); + } + + timer().endGpuTimer(); // not to include any initial/final memory operations + + cudaMemcpy(odata, dev_tree, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_tree); } /** @@ -31,10 +84,39 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + + //timer().startGpuTimer();£¿ // TODO - timer().endGpuTimer(); - return -1; + int* dev_idata; + int* dev_odata; + int* dev_indices; + int* dev_bool; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaMalloc((void**)&dev_bool, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + StreamCompaction::Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + StreamCompaction::Efficient::scan(n, dev_indices, dev_bool); + StreamCompaction::Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_indices); + + int count; + int check; + cudaMemcpy(&count, dev_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&check, dev_bool + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_indices); + cudaFree(dev_bool); + //timer().endGpuTimer();£¿ + return count + check; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..687d3a94 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 128 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -12,14 +14,65 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void naiveScan(int N, int* odata, const int* idata, int pow) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int diff = 1 << pow; + if (index >= diff) { + odata[index] = idata[index - diff] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO - timer().endGpuTimer(); + int* dev_arrA; + int* dev_arrB; + + cudaMalloc((void**)&dev_arrA, n * sizeof(int)); + cudaMalloc((void**)&dev_arrB, n * sizeof(int)); + cudaMemcpy(dev_arrA, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); // not to include any initial/final memory operations + + int depth = ilog2ceil(n); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int* in; + int* out; + + for (int i = 0; i < depth; i++) { + if (i % 2 == 0) { + in = dev_arrA; + out = dev_arrB; + } + else { + in = dev_arrB; + out = dev_arrA; + } + naiveScan << > > (n, out, in, i); + } + + timer().endGpuTimer(); // not to include any initial/final memory operations + + cudaMemcpy(odata, out, n * sizeof(int), cudaMemcpyDeviceToHost); + + for (int i = n - 1; i > 0; i--) { + odata[i] = odata[i - 1]; + } + odata[0] = 0; + + cudaFree(dev_arrA); + cudaFree(dev_arrB); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..f174c2c0 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,24 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); + + // Copy idata to device + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + + timer().startGpuTimer(); // not to include any initial/final memory operations + + // Perform exclusive scan + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + timer().endGpuTimer(); // not to include any initial/final memory operations + + // Copy the result from device to odata + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }