diff --git a/.gitignore b/.gitignore index a59ec56..ff5107f 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,8 @@ build .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* @@ -253,6 +254,7 @@ bld/ # Visual Studio 2015 cache/options directory .vs/ +.vscode/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ diff --git a/README.md b/README.md index 0e38ddb..1910057 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,459 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2 - Stream Compaction** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Jordan Hochman + - [LinkedIn](https://www.linkedin.com/in/jhochman24), [Personal Website](https://jordanh.xyz), [GitHub](https://github.com/JHawk0224) +- Tested on: Windows 11, Ryzen 7 5800 @ 3.4GHz 32GB, GeForce RTX 3060 Ti 8GB (Compute Capability: 8.6) -### (TODO: Your README) +## Welcome to my Stream Compaction Project! -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![nsight systems report timeline](img/entire-report.jpg) +This project investigates many different implementations of the scan and compact algorithms, along with optimizations. Additionally, it also implements [Radix sort](https://en.wikipedia.org/wiki/Radix_sort). + +First I will explain what exactly scan and compact do, and then I'll walk through the different implementations of them. They both act on an array. + +Scan computes the partial sums up to each element, and stores these partial sums either in another array or modifies the original in place. Additionally, there are two types of scans, inclusive and exclusive. Exclusive always starts with a 0, and the partial sums are only of all the elements strictly before the current index, while inclusive scan includes the current index. For example, for the list [4, 7, 12], the inclusive scan would be [4, 11, 23] while the exclusive scan would be [0, 4, 11]. + +Compact takes in an array and returns an array containing only the elements which meet a certain criteria, in the same order. For this project, that criteria will always be that it's non-zero. For example, Compacting [5, 2, 8, 3, 11, 9] where we only want the even elements would give the array [2, 8]. + +Now why do we care about these? Well at first, you might be tempted to implement each one as a simple list that iterates through the given array. While this works, and is serially the best we can do, we don't need to run them serially! By parallelizing these algorithms, we can make them run much faster. This is something that we want to do because these two algorithms occur very frequently in other areas. + +This project implements scan and compact using many different methods, some on the CPU and some on the GPU. I will break down how each implementation works below, but feel free to check out `INSTRUCTION.md` for more details about scan and compact. + +## Running the Implementations + +If you are interested in running these implementations and the test cases for them, you may first want to ensure your computer is set up to do so. To do this, you can follow the instructions in [Project 0](https://github.com/JHawk0224/CIS5650-Project0-Getting-Started/blob/main/INSTRUCTION.md). After that, you may want to read through the [instructions](INSTRUCTION.md) for this project. + +Additionally, you can tweak parameters found in `main.cpp` to change the program settings: + +- `SIZE` - a power-of-2 size of array to test on +- `NPOT` - a non-power-of-2 size of array to test on +- `SORTMAXVAL` - the maximum possible value generated in the list for the sort test cases (only affects sorting!) + +You may also be interested in changing `blockSize` in each of the following files: `naive.cu`, `efficient.cu`, `efficient-thread-optimized.cu`, and `radix-sort.cu`. This value affects the number of threads per block for each of these versions which are implemented on the GPU. + +Note that before running the performance analysis, the block sizes were optimized for each implementation to give the best results, but feel free to change them. I explain more in depth how this was done in the corresponding section below. I will now walk through the different implementations of scan. + +## Scan Implementations + +### CPU Scan + +In this approach, the most simple solution one might think of is implemented. On the CPU, the we iterate through the array serially, adding each element to a running total and writing that to the output. As one might guess, this has O(n) time as it runs linearly in the length of the array. + +This can be very inefficient especially for long arrays, and it makes much more sense to somehow parallelize this. In the performance analysis below, we use this method as the baseline to compare everything to. + +### Naive Scan + +This approach involves the first version of scan one might think of when trying to parallelize it. It works by running passes over the array. + +In the first pass, elements 0 and 1 are added and stored in index 1, elements 1 and 2 are added and stored in index 2, and so on until every neighboring pair is added. Note that this is all done in parallel with one thread for every pair, so it should run quickly. In the second pass, elements 0 and 2 are added and stored in 2, 1 and 3 are added and stored in 3, and so on for every gap of 2 (again with one thread per pair). In each subsequent pass, this same thing happens but for gaps that are powers of 2 (so the third pass would be for elements with a gap of 4). + +At the end of this process, we end up with an inclusive scan of the original list (which we then can convert to an exclusive one). This works because for any given element of the array, every preceding element was added to it at some point in one of the prior passes. This is exactly what we want in a scan. + +Since passes each run in parallel, we can consider them to run in constant time. Then the passes must be run serially after each other, and there are log(n) passes. This gives a runtime of O(log(n)), ignoring things like memory accesses and the like on the GPU. + +For more details about this implementation, see `INSTRUCTION.md`. + +### Work-Efficient Scan + +In this approach, a slightly more convoluted algorithm is performed, but we see it actually runs faster than the naive implementation. This Work-Efficient Scan is composed of two stages, the "upsweep" (parallel reduction) and the "downsweep." + +In the upsweep phase, it runs log(n) passes. The first pass adds elements 0 and 1 and stores in index 1, elements 2 and 3 to store in index 3, elements 4 and 5 to store in index 5, and so on (with one thread for each pair again). The second pass now does the same exact thing but this time only looking at the cells which were set on the previous iteration. For example, it now takes cells 1 and 3 and adds them and stores in 3, elements 5 and 7 and stores them in 7, and so on. + +Each subsequent pass does the same thing but with gaps that become larger and larger, so eventually the last pass stores the sum of the middle element and the last element into the last index. At the end, we see that this last index will contain the sum of the list. Remember, each pass runs in parallel with one thread per pair. + +Before the second phase is run, the last element is set to 0 (explained later). + +In the second, downsweep phase, the elements are combined backwards. The first pass only acts on the middle and last element. The two are added and stored in the last index, and the middle index is set to the original final value (which was 0). In the second pass, this same operation is done but this time on two different pairs, the first pair being the first-quarter element and the middle element, and the second being the third-quarter element and the last one. + +Each subsequent pass does the same thing but working on twice the number of elements of the previous pass, and at the end of this process, the array will be left as an exclusive scan of the original version. + +This works because first of all, the 0 that is always first in an exclusive scan comes from setting the last element to 0 as mentioned above, which was then swapped all the way to the left of the array. Then, for every other element of the array, each preceding block of elements was added to it in some pass of the downsweep, where each block's sum was calculated in the upsweep. + +For more details about this implementation, see `INSTRUCTION.md`. + +Now even though this method has two separate phases, note the amount of work that's done in each phase. In this version, the upsweep stage 1 has n/2 operations, the next pase n/4, and so on down to 1. Adding these up gives O(n) operations _total_ for the entire upsweep. The story is the same for the downsweep. Thus we would expect this algorithm to actually run pretty quickly. + +### Thread Optimized Work-Efficient Scan (Part 5 EC) + +Now after implementing the Work-Efficient Scan, one might run into an issue. That issue is that the runtime didn't actually improve at all. You might get an "efficient" gpu scan that is actually not that efficient (being slower than the cpu approach). + +The reason for this is that if you implement this method without thought, you might end up with something like this for the upsweep phase (and very similar for the downsweep) + +```cpp +// on the host (CPU) side +for (int d = 0; d < numIters; d++) { + upSweep<<>>(n, d, dev_scanResult); +} + +// on the kernel (GPU) side +__global__ void upSweep(int n, int d, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + // ... + if (index % (1 << (d + 1)) == 0) { + data[index + (1 << (d + 1)) - 1] += data[index + (1 << d) - 1]; + } +} +``` + +The important thing to note here is the `blockSize` and `numBlocks`. In the actual algorithm, each pass requires less and less work, ranging from only needing 1 thread to up to the array size over 2. Thus it doesn't make sense to allocate the same number of threads and blocks for every single call of upsweep and downsweep. + +The issue with this original implementation is that we are creating TONS of threads, most of which are doing nothing. Note the `if (index % pow2d1 == 0)` statement in the kernel. Most of the threads aren't actually doing anything, because VERY few of them pass this if check. Even if they do nothing, it is still a very big waste of GPU resources. + +Therefore, a truly optimized implementation of the work-efficient scan would only create the necessary amount of threads for each invocation of upsweep and downsweep. Then the indexing on the kernel side might need to be different (e.g. multiply them by the correct multiples so they line up), but we can spawn many fewer threads. + +Therefore, this implementation fixes this issue. The Thread Optimized Work-Efficient Scan uses only the required number of threads at each stage, so we don't starve the GPU of resources. This is implemented in the `efficient-thread-optimized.cu` file (and the corresponding header file). Note that I did add some test cases to compare the performance, and this will be analyzed later. + +To give a comparison, the upsweep stage now looks something like + +```cpp +int threadsNeeded = 1 << numIters; +for (int d = 0; d < numIters; d++) { + threadsNeeded >>= 1; + int numBlocks = (threadsNeeded + blockSize - 1) / blockSize; + upSweep<<>>(nPowOf2, d, dev_data); +} +``` + +It outputs the identical thing that the normal Work-Efficient Scan does, it's just implemented slightly differently. You can call it extremely similarly to how Work-Efficient Scan is called, like so: + +```cpp +StreamCompaction::EfficientThreadOptimized::scan(SIZE, out, in); +``` + +### Thrust Scan + +This implementation is just a wrapper around the `thrust` library's implementation of scan. We will use this as a benchmark for comparison later in the performance analysis. + +How does this scan implementation work though? Well we can look at the output from [Nsight Systems](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) generated from running it: + +![nsight systems thrust report timeline](img/thrust-report.jpg) + +If you would like to see the full generated report, it is included [here](thrust-report.nsys-rep) as `thrust-report.nsys-rep`. + +Looking at this report, we can see that most of the kernel time is spent in `DeviceScanKernel`. The approximately equal amounts of host to device and device to host memory transfers are from the setup to run the `thrust` scan. We can thus infer that the `thrust` scan implementation uses the GPU to run it efficiently. Further analysis of this is possible by looking at the [actual implementation](https://github.com/NVIDIA/thrust) of the function. + +### Memory Optimized Naive Scan (Part 6 EC 2) + +Now all of the methods we've previously discussed on the GPU have a decent flaw. That is that they rely on lots global memory reads and writes. Just like in any GPU program, it is extremely important to minimize global memory access as this is extremely slow. The usual way to do this is to use shared memory instead of global memory. + +The way this works is that we first split up the target array into chunks (such that each chunk will fit into shared memory). Then we can have many threads populate the shared memory from the input in parallel, and then we can run our algorithm on shared memory. Running it on shared memory will be much quicker since it doesn't need to access global memory. Finally, at the end, the threads can take care of writing back the shared memory to global memory. + +The difficulty with this is actually splitting up the array into chunks. After we split up the array into chunks, that's all well and good. However, when we run our scan algorithm on the chunk, it will only scan that chunk of course! Thus if we run this in parallel, we will be left with an array where each individual chunk had scan run on it, but not the entire array as a whole. + +Luckily, this problem isn't too difficult to remedy. If we store the total sums for each chunk of the original array, we can then run an exclusive scan on these total sums. Then we can take the result of this scan, and add each value in it to the entire chunk it corresponds to in the chunk-scanned array. This works because in the chunk-scanned array, each chunk was missing the total values from everything before it, and this value is exactly what's provided by the new scan. + +Thus what's needed to implement this algorithm is to smartly choose the block size, and then partition the array over each block and run the scan in each block. Then get the sums of each block, run scan on them, and re-add these sums back in. + +This is exactly what the Memory Optimized Naive Scan does. It takes the original Naive Scan, but it replaces the global memory accesses with this new version using shared memory. + +This function is implemented in the `naive.cu` file. It outputs the identical thing that the normal Naive Scan method does, it's just implemented slightly differently. You can call it in the same way you would with the normal scan but with + +```cpp +StreamCompaction::Naive::scanShared(SIZE, out, in); +``` + +Note that I did add some test cases to compare the performance, and this will be analyzed later. + +### Memory Optimized Work-Efficient Scan (Part 6 EC 2) + +Now just like how we could implement a memory-optimized version of the Naive Scan, we can also do the same thing with the Work-Efficient Scan! That is exactly what this implementation does, it takes the Work-Efficient Scan and replaces the global memory accesses with shared memory accesses, in the same way discussed previously. + +This function is implemented in the `efficient.cu` file. It outputs the identical thing that the normal Work-Efficient Scan method does, it's just implemented slightly differently. You can call it in the same way you would with the normal scan but with + +```cpp +StreamCompaction::Efficient::scanShared(SIZE, out, in); +``` + +Note that I did add some test cases to compare the performance, and this will be analyzed later. + +Now another thing that's interesting to discuss is bank conflicts. While I did not analyze them for this project, it is something worth noting. We can further optimize the memory access on the GPU by changing the access pattern to avoid bank conflicts. You can find more details about bank conflicts and avoiding them [here](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda), and this is something I would like to look into in the future. + +## Compact Implementations + +### CPU Compact without Scan + +In this implementation of compact, it runs on the CPU. This method does not use scan, and all it does is iterate through the original array checking the condition, if it's met, then the value is added to the output array at the end. This just repeats on loop until the entire array is calculated (the end of the output is just a pointer that's increased each time). + +As you can imagine, this can be pretty inefficient as it runs in linear time with the length of the array. It has to iterate through the entire original array, but we can parallelize this. + +### CPU Compact with Scan + +This implementation of compact also runs on the CPU. However, it uses scan and then a function called scatter to compute the output. Here, scan is the normal CPU Scan discussed above. + +It works by first taking the input and calculating an array of booleans that are 1 if the element should be in the output and otherwise 0. Then, it runs scan on this array. This output gives us an array of which index the element should go in for the output array, if it actually belongs there. Then the scatter function runs which does just this. It iterates through the original array, and if the boolean flag is true, it places it in the output array at the location specified by the scan. + +For more details about this algorithm, see `INSTRUCTION.md` + +### Compact with Work-Efficient Scan + +This implementation does the same thing as CPU Compact with Scan, except that it uses the GPU and the Work-Efficient Scan method previously discussed. Here, the Work-Efficient Scan runs on the GPU, and scatter was also implemented on the GPU for this function. + +### Compact with Thread Optimized Work-Efficient Scan + +This implementation does the same thing as the previous two compact implementations, except that it uses the GPU and the Thread Optimized Work-Efficient Scan method previously discussed. Here, scatter was also implemented on the GPU for this function. + +Note that I did add some test cases to compare the performance of this method with the other ones. This is implemented in the `efficient-thread-optimized.cu` file (and the corresponding header file). + +It outputs the identical thing that the normal Compact with Work-Efficient Scan does, it's just implemented slightly differently. You can call it extremely similarly the other one is called, like so: + +```cpp +int count = StreamCompaction::EfficientThreadOptimized::compact(SIZE, out, in); +``` + +## Radix Sort (Part 6 EC 1) + +I also implemented the [Radix Sort](https://en.wikipedia.org/wiki/Radix_sort) function. This function sorts an array by going through it bit by bit (pun intended). + +It takes the input array where each element is represented in bits, and first looks at the least-significant (right-most) bit. It partitions the input into two halves, where the first half is all the elements where that bit was a 0 and the second half is the elements where that bit was a 1 (note that it doesn't necessarily have to be exactly half and half if the balance of elements is uneven). Within these two halves, the order is preserved. + +Then, it runs a second iteration but this time looking at the second bit, and moving elements in the same exact way. It then continues to do this same operation but for each bit, and at the end, we are left with a sorted array. To see more details on why this algorithm works, check out the [link above](https://en.wikipedia.org/wiki/Radix_sort) and the `INSTRUCTION.md` file. + +Now note that in each iteration, we can run the operation in parallel to speed it up on the GPU. However, exactly like what we had to do with the shared-memory optimizations earlier, we need to split up the input array into chunks and run Radix sort on each chunk. Finally, after each chunk is sorted, we can merge them back together. + +This merging was done with a parallel bitonic merge, and we are then left with the output. The actual operation in each iteration can be implemented with a combination of a scan and a specialized scatter, and for this implementation we used the Memory Optimized Work-Efficient Scan for Radix Sort. + +Now note that I did add some test cases check that my Radix Sort was functioning correctly, and to check it's performance. To do this, I also implemented a quick CPU Sort which is just a wrapper around `std::sort`. The test case compares the Radix Sort with the output of the CPU Sort. + +The Radix Sort function, along with `bitonicMerge`, is implemented in the `radix-sort.cu` file (and the corresponding header file). It outputs the sorted version of the input array, and it does not work in place. + +There are two methods of calling it. This is because Radix Sort depends on the number of bits of the data it stores. Thus for default integers in C++ (which are 32 bits), it will run 32 iterations. However, most of the time we know our data is capped at some value. Thus if we know this maximum cap, we can limit the number of iterations to the ceiling of the log of this maximum value. Thus we have two ways to call Radix Sort, one without knowing the limit, and one with. Here are the two methods: + +```cpp +StreamCompaction::RadixSort::sort(SIZE, out, in); +StreamCompaction::RadixSort::sort(SIZE, SORTMAXVAL, out, in); +``` + +The CPU Sort was implemented in the `cpu.cu` file and can be called with: + +```cpp +StreamCompaction::CPU::sort(SIZE, out, in); +``` + +Here is an example of how to run the Radix Sort: + +```cpp +int in[7] = {3, 12, 7, 5, 10, 12, 8}; +int out[7]; + +// without knowing maximum, will perform 32 iterations +StreamCompaction::RadixSort::sort(7, out, in); +printArray(7, out, true); +// [3, 5, 7, 8, 10, 12, 12] + +// knowing maximum, will perform ceil(log_2(12)) iterations +StreamCompaction::RadixSort::sort(7, 12, out, in); +printArray(7, out, true); +// [3, 5, 7, 8, 10, 12, 12] +``` + +## Scan Performance Analysis + +First I will discuss the method and measurements of performance used for the analysis below. All of the test cases were from running one of the above Scan algorithms on an array of a given length. The time it took for each one to run was printed in milliseconds (ms). Therefore, for the rest of the data below, lower means better since that means the algorithm ran quicker. + +The time each algorithm took to run was measured with the following + +- `std::chrono`: to provide CPU high-precision timing for the CPU functions +- CUDA events: to measure the CUDA performance for the GPU functions + +Each test below was run 3 times and then averaged together to try to combat the natural variance in CPU/GPU timer measurements. Furthermore, none of the memory operations/allocations (such as `cudaMalloc`, `cudaMemcpy`, and `cudaFree`) were included in the performance measurements. The timers were run strictly on the portion of the code which was executing the algorithm, with no pollutants. + +Additionally, all of these were run in Release mode. The specs of the computer they were run on is mentioned at the very top, and all of the raw data for these runs can be found in the `execution-data.xlsx` file [here](execution-data.xlsx), although screenshots are provided throughout for convenience. + +I ran each scan algorithm on various array sizes and measured their performances. I ran tests for array sizes of 2^6=64, 2^8, 2^10, ..., 2^20, and 2^22=16777216. I chose powers of two because they provide an easy way to scale the array size up, but they shouldn't cause any performance differences compared to non-powers-of-two. I chose this minimum size because it seems like a small enough list that one might practically scan, and 2^22 because it was a very large one that might need to be scanned. The raw data for these tests can be found at the bottom in the appendix. + +Now here is a overall graph of the performance with time in milliseconds, in log scale (remember, lower is better!): + +![execution graph](img/graph.jpg) + +Now there is a lot to unpack in this graph, so I will break it down for you. Let's instead first focus on the lower section of this graph where the number of elements is less than 2^18=262144: + +![lower execution graph](img/lower-graph.jpg) + +It is much easier to tell what is going on in this graph. First of all, we see that for small array sizes (up to ~100000 or so), the execution time for every algorithm is relatively constant. This makes sense because for many of these algorithms, much of the overhead and compute time comes from setting up the algorithm itself, and not from actually scanning the elements. Thus we would expect that varying the number of elements when there are few elements won't make much of a difference. + +Now we will mention some other interesting points. First is that the CPU Scan is outperforming every other method. Even though we implemented the parallelism with the other GPU methods, it doesn't actually help at these lower scales. This makes sense because the overhead of using the GPU isn't worth it when we don't have that many elements to scan in the first place. With such few elements, the CPU can handle running scan on them serially just fine and much quicker. Thus this result isn't too surprising. + +Second, note that the Thrust Scan is very slow compared to everything else. While the specific reason for this would depend on the actual implementation of `thrust`, we can make inferences. Since it's constantly above everything else, we can guess that whatever it's doing to scan the array must have a lot of overhead. This would cause it to run slower even if we had a lower number of elements. + +We can also see that the Memory Optimized Naive Scan is performing better than the Naive Scan as we would expect. This is because it isn't accessing global memory as much. Now even though this difference should be less pronounced at small array sizes (due to fewer global memory accesses needed in the first place), it is still a notable difference. + +Likewise, the Memory Optimized Work-Efficient Scan is performing better than the Work-Efficient Scan for the same reason. Something slightly surprising however is that the Work-Efficient Scan is actually performing slightly worse than the Naive Scan for small array sizes. One might be tempted to say this is because this version of the Work-Efficient Scan is still running way too many threads, so it isn't as optimal as we wish, but this is probably not the reason. This is because at such small array sizes, we aren't even running that many threads anyway. Additionally, we can compare the Thread Optimized Work-Efficient Scan to the Naive Scan, and see that it is still losing out most of the time. Thus the issue isn't the threads. One explanation could be just that for small array sizes, the overhead of having the two phase system of Work-Efficient Scan just isn't worth it. It likely doesn't make sense to have this complicated system when we only need to sort a small number of elements, so the Naive Scan is the way to go here. It could also be the global memory accesses, which I will explain briefly. + +However we do see that the Memory Optimized Work-Efficient Scan is outperforming the Memory Optimized Naive Scan (which contrasts the opposite relation between Work-Efficient Scan and Naive Scan). This might be because if you factor in global memory accesses, Work-Efficient Scan does more of them. Thus without removing them, it makes sense that the Work-Efficient Scan does poorer than Naive, it has to access global memory more. But when we optimized this to use shared memory, now the Memory Optimized Work-Efficient Scan can shine as intended. + +Now let's take a look at the upper portion of the graph (in log scale), where the number of elements is more than 2^18=262144 (remember, lower is better!). + +![upper execution graph](img/upper-graph.jpg) + +Now we see that for large array sizes (past ~100000 or so), the execution time for every algorithm starts increasing somewhat exponentially (something that appears exponential on the log-scale graph is linear, as one would find from the runtime calculations). This makes sense because for many of these algorithms, at some point once there are enough elements, whatever compute that has to be done to scan them starts taking over. Thus we would expect the asymptotic runtime here to become apparent for each algorithm. + +Now we will mention some other interesting points. First is that the CPU Scan is no longer outperforming the other methods. It has now dropped behind and is becoming much slower. This makes sense because for larger and larger array sizes, the serial computation makes less and less sense to do. Thus we would expect its runtime to grow very quickly. + +Second, now the Thrust Scan is actually beating out everything else by the time we get to the biggest array. Its growth is _much_ slower than all of the others. The specific reason for this depends on the implementation of `thrust`, but we can guess a little. It's likely that `thrust`'s implementation was made to be asymptotically very efficient so that it works quickly on large arrays. This would match our data because it is scanning large arrays in almost the same time as small arrays, but the tradeoff is that it takes a little longer to scan the smaller arrays. + +We can also see that the Memory Optimized Naive Scan is performing better than the Naive Scan still. Additionally, this difference is becoming more and more pronounced as the array size gets larger because there are mor and more global memory accesses needed. Thus the Memory Optimized version can improve more and more. + +Likewise, the Memory Optimized Work-Efficient Scan is performing better than the Work-Efficient Scan for the same reason, and we see the same improvement trend. Additionally, we also see that finally the Work-Efficient Scan is performing better than the Naive Scan, which follows what we discussed previously. + +Something a little surprising is that the Memory Optimized Work-Efficient Scan and the Memory Optimized Naive Scan are performing almost identically. This might be because at larger array sizes, the actual algorithm isn't what's making the difference, but rather the splitting into different chunks. Since they both rely on this same system, the optimization for scanning can't actually go that far (only up to the chunk size). Rather the time it takes to compute the block sums with scan is the same for both, so both start performing about the same. + +Additionally, we do see that the Thread Optimized Work-Efficient Scan is performing much better than the Work-Efficient Scan, and even more so at larger array sizes. This matches what we predicted before because there we are saving on computation by spawning less unnecessary threads. + +### Performance Bottlenecks + +The performance bottlenecks for each algorithm differ. For the Work-Efficient Scan, it is likely the fact that so many threads are being spawned and not doing anything. For both the Naive Scan and Work-Efficient Scan, the global memory reads are also big bottlenecks. For Thrust Scan, the bottleneck is probably computation-based. I'm guessing this because I assume `thrust` would be implemented as optimized as possible since it's a public library, so any bottlenecks are likely due to limitations in compute. The bottleneck for CPU Scan is also compute, and is just limited to the intrinsic speed that serial instructions can be executed. For the Memory Optimized Scans, their bottleneck is in both compute as the number of array elements is large, and also limitations of GPU resources. They both need to be split up into chunks, so they're limited by the block size and can only get so fast. + +### Optimizing Block Sizes + +To optimize the block sizes chosen for each of the algorithms discussed previously, I ran the algorithms 3 times with each block size from 32, 64, 128, 256, 512, and 1024 on an array size of 2^20. I chose 2^20 because it offered a good spot where each algorithm starts to show its advantages/disadvantages over each other. I then averaged the 3 results for each trial and chose the block sizes that gave the best results. Note that for multiple of the algorithms, a block size of 128 and 256 performed very similarly, while all of the others were noticeably worse. + +### Appendix + +Here is the raw data from running the tests. They can also be found in the `execution-data.xlsx` file [here](execution-data.xlsx). + +![graph data](img/data.png) + +Additionally, just like how I ran Nsight Systems on the `thrust` implementation of scan, I also generated a report for the entire test suite. That report can be found in the `entire-report.nsys-rep` file [here](entire-report.nsys-rep). However, here is a screenshot summary of it: + +![nsight systems report timeline appendix](img/entire-report.jpg) + +Finally, here is the output from running the tests found in `main.cpp`. Note that I did add my own test cases for many of the extra methods I implemented, and they are mentioned above for each function. However, to give a brief list of extra tests I added, they are: + +Scan Tests: +- work-efficient scan thread optimized, power-of-two +- work-efficient scan thread optimized, non-power-of-two +- naive scan memory optimized, power-of-two +- naive scan memory optimized, non-power-of-two +- work-efficient scan memory optimized, power-of-two +- work-efficient scan memory optimized, non-power-of-two + +Compact Tests: +- work-efficient compact thread optimized, power-of-two +- work-efficient compact thread optimized, non-power-of-two + +Sort Tests: +- std sort, power-of-two +- radix sort memory optimized, power-of-two +- std sort, non-power-of-two +- radix sort memory optimized, non-power-of-two + +Test output: + +``` +**************** +** SCAN TESTS ** +**************** + [ 11 42 20 44 5 2 11 36 9 49 16 42 41 ... 28 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 4.3458ms (std::chrono Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 4.7294ms (std::chrono Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487016 205487035 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 4.95194ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 4.66166ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 4.14038ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 3.90957ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487016 205487035 ] + passed +==== work-efficient scan thread optimized, power-of-two ==== + elapsed time: 2.50742ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== work-efficient scan thread optimized, non-power-of-two ==== + elapsed time: 2.02218ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487016 205487035 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.89178ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.858336ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487016 205487035 ] + passed +==== naive scan memory optimized, power-of-two ==== + elapsed time: 1.71162ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== naive scan memory optimized, non-power-of-two ==== + elapsed time: 1.56301ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 0 0 ] + passed +==== work-efficient scan memory optimized, power-of-two ==== + elapsed time: 1.9087ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487046 205487074 ] + passed +==== work-efficient scan memory optimized, non-power-of-two ==== + elapsed time: 1.83875ms (CUDA Measured) + [ 0 11 53 73 117 122 124 135 171 180 229 245 287 ... 205487016 205487035 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 0 2 1 1 3 2 1 0 1 0 2 0 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 12.7152ms (std::chrono Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 12.6938ms (std::chrono Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 25.8371ms (std::chrono Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 4.53075ms (CUDA Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 4.42582ms (CUDA Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 1 1 ] + passed +==== work-efficient compact thread optimized, power-of-two ==== + elapsed time: 2.78566ms (CUDA Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 2 3 ] + passed +==== work-efficient compact thread optimized, non-power-of-two ==== + elapsed time: 2.79344ms (CUDA Measured) + [ 2 2 1 1 3 2 1 1 2 2 2 3 3 ... 1 1 ] + passed + +********************** +** RADIX SORT TESTS ** +********************** + [ 27514 16740 10566 14189 29569 9179 23674 12109 29620 5901 9668 28686 19412 ... 28411 0 ] +==== std sort, power-of-two ==== + elapsed time: 357.567ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== radix sort memory optimized, power-of-two ==== + elapsed time: 49.1781ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed +==== std sort, non-power-of-two ==== + elapsed time: 341.76ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== radix sort memory optimized, non-power-of-two ==== + elapsed time: 52.4414ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] + passed +Press any key to continue . . . +``` diff --git a/entire-report.nsys-rep b/entire-report.nsys-rep new file mode 100644 index 0000000..05c4976 Binary files /dev/null and b/entire-report.nsys-rep differ diff --git a/execution-data.xlsx b/execution-data.xlsx new file mode 100644 index 0000000..1806a88 Binary files /dev/null and b/execution-data.xlsx differ diff --git a/img/data.png b/img/data.png new file mode 100644 index 0000000..b5a08a1 Binary files /dev/null and b/img/data.png differ diff --git a/img/entire-report.jpg b/img/entire-report.jpg new file mode 100644 index 0000000..a999916 Binary files /dev/null and b/img/entire-report.jpg differ diff --git a/img/graph.jpg b/img/graph.jpg new file mode 100644 index 0000000..3e1fe87 Binary files /dev/null and b/img/graph.jpg differ diff --git a/img/lower-graph.jpg b/img/lower-graph.jpg new file mode 100644 index 0000000..d1e8483 Binary files /dev/null and b/img/lower-graph.jpg differ diff --git a/img/thrust-report.jpg b/img/thrust-report.jpg new file mode 100644 index 0000000..1508961 Binary files /dev/null and b/img/thrust-report.jpg differ diff --git a/img/upper-graph.jpg b/img/upper-graph.jpg new file mode 100644 index 0000000..af1cd02 Binary files /dev/null and b/img/upper-graph.jpg differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..daaa19f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,11 +10,14 @@ #include #include #include +#include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 8 * 1024 * 1024; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two +const int SORTMAXVAL = SIZE / 2; int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; @@ -51,7 +54,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +67,77 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan thread optimized, power-of-two"); + StreamCompaction::EfficientThreadOptimized::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::EfficientThreadOptimized::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan thread optimized, non-power-of-two"); + StreamCompaction::EfficientThreadOptimized::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::EfficientThreadOptimized::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan memory optimized, power-of-two"); + StreamCompaction::Naive::scanShared(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan memory optimized, non-power-of-two"); + StreamCompaction::Naive::scanShared(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan memory optimized, power-of-two"); + StreamCompaction::Efficient::scanShared(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient scan memory optimized, non-power-of-two"); + StreamCompaction::Efficient::scanShared(NPOT, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,16 +182,65 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("work-efficient compact thread optimized, power-of-two"); + count = StreamCompaction::EfficientThreadOptimized::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::EfficientThreadOptimized::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("work-efficient compact thread optimized, non-power-of-two"); + count = StreamCompaction::EfficientThreadOptimized::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::EfficientThreadOptimized::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("**********************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("**********************\n"); + + // Radix sort tests + + genArray(SIZE - 1, a, SORTMAXVAL); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + // std::sort as baseline + printDesc("std sort, power-of-two"); + StreamCompaction::CPU::sort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + printDesc("radix sort memory optimized, power-of-two"); + StreamCompaction::RadixSort::sort(SIZE, SORTMAXVAL, c, a); + printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + // std::sort as baseline + printDesc("std sort, non-power-of-two"); + StreamCompaction::CPU::sort(NPOT, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(NPOT, b, true); + + printDesc("radix sort memory optimized, non-power-of-two"); + StreamCompaction::RadixSort::sort(NPOT, SORTMAXVAL, c, a); + printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511ca..4eba193 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -3,7 +3,9 @@ set(headers "cpu.h" "naive.h" "efficient.h" + "efficient-thread-optimized.h" "thrust.h" + "radix-sort.h" ) set(sources @@ -11,7 +13,9 @@ set(sources "cpu.cu" "naive.cu" "efficient.cu" + "efficient-thread-optimized.cu" "thrust.cu" + "radix-sort.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..936377d 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + bools[index] = idata[index] != 0 ? 1 : 0; } /** @@ -32,7 +36,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + if (bools[index] == 1) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..a615101 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,11 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int total = 0; + for (int i = 0; i < n; i++) { + odata[i] = total; + total += idata[i]; + } timer().endCpuTimer(); } @@ -30,9 +34,26 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int numElements = 0; + for (int i = 0; i < n; i++) { + int value = idata[i]; + if (value != 0) { + odata[numElements++] = value; + } + } timer().endCpuTimer(); - return -1; + return numElements; + } + + int scatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { + int numElements = 0; + for (int i = 0; i < n; i++) { + if (bools[i] == 1) { + odata[indices[i]] = idata[i]; + numElements++; + } + } + return numElements; } /** @@ -41,10 +62,32 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* bools = new int[n]; + int* scanResult = new int[n]; + timer().startCpuTimer(); + for (int i = 0; i < n; i++) { + bools[i] = idata[i] != 0 ? 1 : 0; + } + + // NOTE: not calling scan() because don't want to double call timer().startCpuTimer() + int total = 0; + for (int i = 0; i < n; i++) { + scanResult[i] = total; + total += bools[i]; + } + + int numElements = scatter(n, odata, idata, bools, scanResult); + timer().endCpuTimer(); + delete[] bools; + delete[] scanResult; + return numElements; + } + + void sort(int n, int *odata, const int *idata) { + std::copy(idata, idata + n, odata); timer().startCpuTimer(); - // TODO + std::sort(odata, odata + n); timer().endCpuTimer(); - return -1; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..ffabe81 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void sort(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/efficient-thread-optimized.cu b/stream_compaction/efficient-thread-optimized.cu new file mode 100644 index 0000000..9042bca --- /dev/null +++ b/stream_compaction/efficient-thread-optimized.cu @@ -0,0 +1,136 @@ +#include +#include +#include "common.h" +#include "efficient-thread-optimized.h" + +#define blockSize 64 + +namespace StreamCompaction { + namespace EfficientThreadOptimized { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void upSweep(int n, int d, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int elemIndex = ((index + 1) << (d + 1)) - 1; + if (elemIndex >= n) { + return; + } + + int pow2d = 1 << d; + data[elemIndex] += data[elemIndex - pow2d]; + } + + __global__ void downSweep(int n, int d, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int elemIndex = ((index + 1) << (d + 1)) - 1; + if (elemIndex >= n) { + return; + } + + int pow2d = 1 << d; + int t = data[elemIndex - pow2d]; + data[elemIndex - pow2d] = data[elemIndex]; + data[elemIndex] += t; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ + void scan(int n, int *odata, const int *idata) { + int numIters = ilog2ceil(n); + int nPowOf2 = 1 << numIters; + + int *dev_data; + cudaMalloc((void**)&dev_data, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + + timer().startGpuTimer(); + int threadsNeeded = nPowOf2; + for (int d = 0; d < numIters; d++) { + threadsNeeded >>= 1; + int numBlocks = (threadsNeeded + blockSize - 1) / blockSize; + upSweep<<>>(nPowOf2, d, dev_data); + } + cudaMemset(dev_data + nPowOf2 - 1, 0, sizeof(int)); + for (int d = numIters - 1; d >= 0; d--) { + int numBlocks = (threadsNeeded + blockSize - 1) / blockSize; + downSweep<<>>(nPowOf2, d, dev_data); + threadsNeeded <<= 1; + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + checkCUDAError("cudaFree failed"); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ + int compact(int n, int *odata, const int *idata) { + int numIters = ilog2ceil(n); + int nPowOf2 = 1 << numIters; + + int *dev_idata; + int *dev_odata; + int *dev_bools; + int *dev_scanResult; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_scanResult, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + int numBlocks = (nPowOf2 + blockSize - 1) / blockSize; + + timer().startGpuTimer(); + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + Common::kernMapToBoolean<<>>(n, dev_scanResult, dev_idata); + + // NOTE: not calling scan() because don't want to double call timer().startCpuTimer() + int threadsNeeded = nPowOf2; + for (int d = 0; d < numIters; d++) { + threadsNeeded >>= 1; + int numBlocks = (threadsNeeded + blockSize - 1) / blockSize; + upSweep<<>>(nPowOf2, d, dev_scanResult); + } + cudaMemset(dev_scanResult + nPowOf2 - 1, 0, sizeof(int)); + for (int d = numIters - 1; d >= 0; d--) { + int numBlocks = (threadsNeeded + blockSize - 1) / blockSize; + downSweep<<>>(nPowOf2, d, dev_scanResult); + threadsNeeded <<= 1; + } + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_scanResult); + + int numElements = 0; + cudaMemcpy(&numElements, dev_scanResult + nPowOf2 - 1, sizeof(int), cudaMemcpyDeviceToHost); + if (n == nPowOf2 && idata[n - 1] != 0) { + numElements++; + } + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, numElements * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_scanResult); + checkCUDAError("cudaFree failed"); + return numElements; + } + } +} diff --git a/stream_compaction/efficient-thread-optimized.h b/stream_compaction/efficient-thread-optimized.h new file mode 100644 index 0000000..755696c --- /dev/null +++ b/stream_compaction/efficient-thread-optimized.h @@ -0,0 +1,13 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace EfficientThreadOptimized { + StreamCompaction::Common::PerformanceTimer& timer(); + + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); + } +} diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..e683110 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 256 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,59 @@ namespace StreamCompaction { return timer; } + __global__ void upSweep(int n, int d, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + int pow2d = 1 << d; + int pow2d1 = 1 << (d + 1); + if (index % pow2d1 == 0) { + data[index + pow2d1 - 1] += data[index + pow2d - 1]; + } + } + + __global__ void downSweep(int n, int d, int *data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + int pow2d = 1 << d; + int pow2d1 = 1 << (d + 1); + if (index % pow2d1 == 0) { + int t = data[index + pow2d - 1]; + data[index + pow2d - 1] = data[index + pow2d1 - 1]; + data[index + pow2d1 - 1] += t; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int numIters = ilog2ceil(n); + int nPowOf2 = 1 << numIters; + + int *dev_data; + cudaMalloc((void**)&dev_data, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + int numBlocks = (nPowOf2 + blockSize - 1) / blockSize; + timer().startGpuTimer(); - // TODO + for (int d = 0; d < numIters; d++) { + upSweep<<>>(nPowOf2, d, dev_data); + } + cudaMemset(dev_data + nPowOf2 - 1, 0, sizeof(int)); + for (int d = numIters - 1; d >= 0; d--) { + downSweep<<>>(nPowOf2, d, dev_data); + } timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_data); + checkCUDAError("cudaFree failed"); } /** @@ -31,10 +79,147 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int numIters = ilog2ceil(n); + int nPowOf2 = 1 << numIters; + + int *dev_idata; + int *dev_odata; + int *dev_bools; + int *dev_scanResult; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + cudaMalloc((void**)&dev_scanResult, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + int numBlocks = (nPowOf2 + blockSize - 1) / blockSize; + timer().startGpuTimer(); - // TODO + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + Common::kernMapToBoolean<<>>(n, dev_scanResult, dev_idata); + + // NOTE: not calling scan() because don't want to double call timer().startCpuTimer() + for (int d = 0; d < numIters; d++) { + upSweep<<>>(nPowOf2, d, dev_scanResult); + } + cudaMemset(dev_scanResult + nPowOf2 - 1, 0, sizeof(int)); + for (int d = numIters - 1; d >= 0; d--) { + downSweep<<>>(nPowOf2, d, dev_scanResult); + } + + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, dev_scanResult); + + int numElements = 0; + cudaMemcpy(&numElements, dev_scanResult + nPowOf2 - 1, sizeof(int), cudaMemcpyDeviceToHost); + if (n == nPowOf2 && idata[n - 1] != 0) { + numElements++; + } timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, numElements * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_scanResult); + checkCUDAError("cudaFree failed"); + return numElements; + } + + __global__ void runScanSharedChunk(int n, int *odata, int *blockSums, const int *idata) { + extern __shared__ int shared_data[]; + + int index = threadIdx.x; + int chunkSize = blockDim.x; + int globalIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (globalIndex >= n) { + return; + } + + shared_data[index] = idata[globalIndex]; + __syncthreads(); + + // up sweep + int val = (index + 1) << 1; + for (int d = 1; d < chunkSize; d <<= 1) { + int elemIndex = val * d - 1; + if (elemIndex < chunkSize) { + shared_data[elemIndex] += shared_data[elemIndex - d]; + } + __syncthreads(); + } + + if (index == chunkSize - 1) { + if (blockSums != nullptr) { + blockSums[blockIdx.x] = shared_data[chunkSize - 1]; + } + shared_data[chunkSize - 1] = 0; + } + __syncthreads(); + + // down sweep + for (int d = chunkSize >> 1; d > 0; d >>= 1) { + int elemIndex = val * d - 1; + if (elemIndex < chunkSize) { + int t = shared_data[elemIndex - d]; + shared_data[elemIndex - d] = shared_data[elemIndex]; + shared_data[elemIndex] += t; + } + __syncthreads(); + } + + odata[globalIndex] = shared_data[index]; + } + + __global__ void scanSharedAddBlockSums(int n, int *data, const int *blockSums) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + data[index] += blockSums[blockIdx.x]; + } + + void scanSharedHelper(int n, int *dev_odata, int *dev_idata) { + int numBlocks = (n + blockSize - 1) / blockSize; + + if (numBlocks > 1) { + int numBlocksPowOf2 = 1 << ilog2ceil(numBlocks); + int *dev_blockSums; + int *dev_blockSumsScan; + cudaMalloc((void**)&dev_blockSums, numBlocksPowOf2 * sizeof(int)); + cudaMalloc((void**)&dev_blockSumsScan, numBlocksPowOf2 * sizeof(int)); + + runScanSharedChunk<<>>(n, dev_odata, dev_blockSums, dev_idata); + scanSharedHelper(numBlocksPowOf2, dev_blockSumsScan, dev_blockSums); + scanSharedAddBlockSums<<>>(n, dev_odata, dev_blockSumsScan); + + cudaFree(dev_blockSums); + cudaFree(dev_blockSumsScan); + } else { + int numThreads = std::min(n, blockSize); + runScanSharedChunk<<<1, numThreads, numThreads * sizeof(int)>>>(n, dev_odata, nullptr, dev_idata); + } + } + + void scanShared(int n, int *odata, const int *idata) { + int nPowOf2 = 1 << ilog2ceil(n); + + int *dev_idata; + int *dev_odata; + cudaMalloc((void**)&dev_idata, nPowOf2 * sizeof(int)); + cudaMalloc((void**)&dev_odata, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + + timer().startGpuTimer(); + scanSharedHelper(nPowOf2, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + checkCUDAError("cudaFree failed"); } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..f1982a7 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -9,5 +9,7 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); + + void scanShared(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..63b44dc 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 256 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,143 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void runIter(int n, int d, int *odata, const int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + int pow2d1 = 1 << (d - 1); + if (index >= pow2d1) { + odata[index] = idata[index - pow2d1] + idata[index]; + } else { + odata[index] = idata[index]; + } + } + + __global__ void convertInclusiveToExclusive(int n, int *odata, const int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = index == 0 ? 0 : idata[index - 1]; + } + + __global__ void copyArray(int n, int *odata, const int *idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = idata[index]; + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + int numBlocks = (n + blockSize - 1) / blockSize; + timer().startGpuTimer(); - // TODO + int numIters = ilog2ceil(n); + for (int d = 1; d <= numIters; d++) { + runIter<<>>(n, d, dev_odata, dev_idata); + std::swap(dev_odata, dev_idata); + } + convertInclusiveToExclusive<<>>(n, dev_odata, dev_idata); timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + checkCUDAError("cudaFree failed"); + } + + __global__ void runScanSharedChunk(int n, int *odata, int *blockSums, const int *idata) { + extern __shared__ int shared_data[]; + + int index = threadIdx.x; + int chunkSize = blockDim.x; + int globalIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (globalIndex >= n) { + return; + } + + shared_data[index] = idata[globalIndex]; + __syncthreads(); + + for (int d = 1; d < chunkSize; d <<= 1) { + int temp; + if (index >= d) { + temp = shared_data[index - d] + shared_data[index]; + } else { + temp = shared_data[index]; + } + __syncthreads(); + shared_data[index] = temp; + __syncthreads(); + } + + odata[globalIndex] = shared_data[index]; + if (blockSums != nullptr && index == chunkSize - 1) { + blockSums[blockIdx.x] = shared_data[index]; + } + } + + __global__ void scanSharedAddBlockSums(int n, int *data, const int *blockSums) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + data[index] += blockSums[blockIdx.x]; + } + + void scanSharedHelper(int n, int *dev_odata, int *dev_idata) { + int numBlocks = (n + blockSize - 1) / blockSize; + + if (numBlocks > 1) { + int *dev_blockSums; + int *dev_blockSumsScan; + cudaMalloc((void**)&dev_blockSums, numBlocks * sizeof(int)); + cudaMalloc((void**)&dev_blockSumsScan, numBlocks * sizeof(int)); + + runScanSharedChunk<<>>(n, dev_odata, dev_blockSums, dev_idata); + scanSharedHelper(numBlocks, dev_blockSumsScan, dev_blockSums); + scanSharedAddBlockSums<<>>(n, dev_odata, dev_blockSumsScan); + + cudaFree(dev_blockSums); + cudaFree(dev_blockSumsScan); + } else { + int numThreads = std::min(n, blockSize); + runScanSharedChunk<<<1, numThreads, numThreads * sizeof(int)>>>(n, dev_odata, nullptr, dev_idata); + } + copyArray<<>>(n, dev_idata, dev_odata); + convertInclusiveToExclusive<<>>(n, dev_odata, dev_idata); + } + + void scanShared(int n, int *odata, const int *idata) { + int *dev_idata; + int *dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + + timer().startGpuTimer(); + scanSharedHelper(n, dev_odata, dev_idata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + checkCUDAError("cudaFree failed"); } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..414835b 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + + void scanShared(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/radix-sort.cu b/stream_compaction/radix-sort.cu new file mode 100644 index 0000000..3eeaf15 --- /dev/null +++ b/stream_compaction/radix-sort.cu @@ -0,0 +1,139 @@ +#include +#include +#include "common.h" +#include "radix-sort.h" + +#define blockSize 128 + +namespace StreamCompaction { + namespace RadixSort { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void bitonicMerge(int* data, int n, int j, int k) { + int i = threadIdx.x + (blockIdx.x * blockDim.x); + int ixj = i ^ j; + + if (ixj > i) { + if ((i & k) == 0) { + if (data[i] > data[ixj]) { + int temp = data[i]; + data[i] = data[ixj]; + data[ixj] = temp; + } + } else { + if (data[i] < data[ixj]) { + int temp = data[i]; + data[i] = data[ixj]; + data[ixj] = temp; + } + } + } + } + + __global__ void runSortChunk(int n, int numBits, int *odata, const int *idata) { + extern __shared__ int shared_data[]; + int* shared_bools = (int*)&shared_data[blockDim.x]; + int* shared_scan = (int*)&shared_bools[blockDim.x]; + + int index = threadIdx.x; + int chunkSize = blockDim.x; + int globalIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (globalIndex >= n) { + return; + } + + shared_data[index] = idata[globalIndex]; + __syncthreads(); + + for (int bit = 0; bit < numBits; bit++) { + shared_bools[index] = (shared_data[index] & (1 << bit)) ? 0 : 1; + shared_scan[index] = shared_bools[index]; + __syncthreads(); + + // up sweep + int val = (index + 1) << 1; + for (int d = 1; d < chunkSize; d <<= 1) { + int elemIndex = val * d - 1; + if (elemIndex < chunkSize) { + shared_scan[elemIndex] += shared_scan[elemIndex - d]; + } + __syncthreads(); + } + + if (index == chunkSize - 1) { + shared_scan[chunkSize - 1] = 0; + } + __syncthreads(); + + // down sweep + for (int d = chunkSize >> 1; d > 0; d >>= 1) { + int elemIndex = val * d - 1; + if (elemIndex < chunkSize) { + int t = shared_scan[elemIndex - d]; + shared_scan[elemIndex - d] = shared_scan[elemIndex]; + shared_scan[elemIndex] += t; + } + __syncthreads(); + } + + int totalFalses = shared_scan[chunkSize - 1] + shared_bools[chunkSize - 1]; + + int targetIndex; + if (shared_bools[index] == 1) { + targetIndex = shared_scan[index]; + } else { + targetIndex = index - shared_scan[index] + totalFalses; + } + __syncthreads(); + + shared_bools[targetIndex] = shared_data[index]; + __syncthreads(); + + shared_data[index] = shared_bools[index]; + __syncthreads(); + } + + odata[globalIndex] = shared_data[index]; + } + + void sort(int n, int *odata, const int *idata) { + sort(n, -1, odata, idata); + } + + void sort(int n, int maxVal, int *odata, const int *idata) { + int nPowOf2 = 1 << ilog2ceil(n); + int intBits = sizeof(int) * 8; + int numBits = maxVal < 0 ? intBits : std::min(intBits, ilog2ceil(maxVal)); + + int *dev_idata, *dev_odata; + cudaMalloc((void**)&dev_idata, nPowOf2 * sizeof(int)); + cudaMalloc((void**)&dev_odata, nPowOf2 * sizeof(int)); + checkCUDAError("cudaMalloc failed"); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy failed"); + int numBlocks = (nPowOf2 + blockSize - 1) / blockSize; + + timer().startGpuTimer(); + runSortChunk<<>>(nPowOf2, numBits, dev_odata, dev_idata); + cudaDeviceSynchronize(); + + for (int k = 2; k <= nPowOf2; k <<= 1) { + for (int j = k >> 1; j > 0; j >>= 1) { + bitonicMerge<<>>(dev_odata, nPowOf2, j, k); + cudaDeviceSynchronize(); + } + } + timer().endGpuTimer(); + + cudaMemcpy(odata, (int*)&dev_odata[nPowOf2 - n], n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + cudaFree(dev_odata); + checkCUDAError("cudaFree failed"); + } + } +} diff --git a/stream_compaction/radix-sort.h b/stream_compaction/radix-sort.h new file mode 100644 index 0000000..f181cdf --- /dev/null +++ b/stream_compaction/radix-sort.h @@ -0,0 +1,12 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace RadixSort { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + void sort(int n, int maxVal, int *odata, const int *idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..a23de83 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,13 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dev_data(idata, idata + n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_data.begin(), dev_data.end(), dev_data.begin()); timer().endGpuTimer(); + + thrust::copy(dev_data.begin(), dev_data.end(), odata); } } } diff --git a/thrust-report.nsys-rep b/thrust-report.nsys-rep new file mode 100644 index 0000000..a12eda9 Binary files /dev/null and b/thrust-report.nsys-rep differ