diff --git a/README.md b/README.md index d2fa33d..31ab0be 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,142 @@ -Project 0 Getting Started -==================== +# CUDA Setup and Development Environment Setup **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Amy Liu + * [Personal Website](https://amyliu.dev/), [LinkedIn](https://linkedin.com/in/miyalana), [Github](https://github.com/mialana). +* Tested on: Ubuntu 24.04.3 LTS, Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB -### (TODO: Your README) +## Part 1: Installation and Setup + +NVIDIA Drivers: +- **Version:** 580.76.05 +- **Method:** Added Ubuntu PPA repository to local APT registry in order to install latest 580 beta driver. ([primary followed guide](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-22-04)) + +CUDA: +- **Version:** 13.0 +- **Method:** From NVIDIA `.run` file. Deselected auto-install of drivers in execution menu. ([primary followed guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#runfile-installation)) + +## Part 2.1: Development Environment Testing + +### Part 2.1.1: Project Build & Run + +#### GPU Optimus Mode + +On Linux Wayland, NVIDIA GPU is not the primary GPU. Build & run without modification leads to the following error: + +```bash +Cuda error: Kernel failed!: CUDA-capable device(s) is/are busy or unavailable. +``` + +To solve this, two environment variables must be added wherever I run the executable (e.g. from command line, from all Nsight applications, etc ): + +```bash +__PRIME_RENDER_OFFLOAD=1 +__GLX_VENDOR_LIBRARY_NAME=nvidia +``` + +Alternatively, modify the `/etc/environment` file to contain: + +```bash +QT_QPA_PLATFORMTHEME="wayland;xcb" +GBM_BACKEND=nvidia-drm +__GLX_VENDOR_LIBRARY_NAME=nvidia +ENABLE_VKBASALT=1 +LIBVA_DRIVER_NAME=nvidia +WLR_NO_HARDWARE_CURSORS=1 +``` + +#### GLFW and Wayland incompability + +When using GLFW on Wayland, the `glfwInit()` function does not work (program will exit at `Line 49` of `main.cpp`). To counter this, set two environment variables: + +```bash +WAYLAND_DISPLAY="" +XDG_SESSION_TYPE=x11 +``` + +#### CMakeLists.txt Modification + +Due to having a very new GPU model, I changed the `CUDA_ARCHITECTURES` variable from `native` to my explicit arch, i.e. `120`. + +```bash +if(CMAKE_VERSION VERSION_LESS "3.23.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES OFF) +elseif(CMAKE_VERSION VERSION_LESS "3.24.0") + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major) +else() + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES 120) +endif() +``` + +#### `run.sh` + +Wrote a simple run script to facilitate 1. build + run as well as 2. run without build from command line. Simplifies environment variable requirements mentioned above. + +Located at `cuda-gl-check/run.sh`. + +### Part 2.1.2: CUDA Project UI Modification + +![img](images/modified_project_2.1.2.png) + +### Part 2.1.3: Nsight Debugger + +Final `launch.json` configuration: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${workspaceFolder}/build/bin/cuda-gl-check", + "environment": [ + { "name": "__NV_PRIME_RENDER_OFFLOAD", "value": "1" }, + { "name": "__GLX_VENDOR_LIBRARY_NAME", "value": "nvidia" } + ] + } + ] +} +``` + +### Part 2.1.4: Nsight Systems + +Configuration:\ +![img](images/nsight_systems_config.png) + +Timeline Window: +![img](images/nsight_systems_timeline.png) + +Analysis Summary Window: +![img](images/nsight_systems_analysis_summary.png) + + +### Part 2.1.5: Nsight Compute + +Configuration: +![img](images/nsight_compute_config.png) + +Summary Window: +![img](images/nsight_compute_summary.png) + +Details Window: +![img](images/nsight_compute_details.png) + + +## Part 2.2: WebGL + +Works on both Zen (Firefox-based) and Arc (Chromium) browsers. +![img](images/webgl_report.png) + +## Part 2.3: WebGPU + +Does **not** work on Linux Wayland. + +![img](images/webgpu_report_wayland.png) + +Works on Linux Xorg. This requires logging out and toggling a startup option. + +![img](images/webgpu_report_xorg.png) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/cuda-gl-check/.gitignore b/cuda-gl-check/.gitignore index 92a50c5..524a448 100644 --- a/cuda-gl-check/.gitignore +++ b/cuda-gl-check/.gitignore @@ -4,6 +4,8 @@ *.xcodeproj build +compute + # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode ### Linux ### diff --git a/cuda-gl-check/.vscode/launch.json b/cuda-gl-check/.vscode/launch.json new file mode 100644 index 0000000..32520da --- /dev/null +++ b/cuda-gl-check/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${workspaceFolder}/build/bin/cuda-gl-check", + "environment": [ + { "name": "__NV_PRIME_RENDER_OFFLOAD", "value": "1" }, + { "name": "__GLX_VENDOR_LIBRARY_NAME", "value": "nvidia" } + ] + } + ] +} diff --git a/cuda-gl-check/.vscode/settings.json b/cuda-gl-check/.vscode/settings.json new file mode 100644 index 0000000..854df65 --- /dev/null +++ b/cuda-gl-check/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "files.associations": { + "cpp": "cuda-cpp", + "hpp": "cuda-cpp" + } +} \ No newline at end of file diff --git a/cuda-gl-check/CMakeLists.txt b/cuda-gl-check/CMakeLists.txt index df5826d..d0dbc4f 100644 --- a/cuda-gl-check/CMakeLists.txt +++ b/cuda-gl-check/CMakeLists.txt @@ -71,7 +71,7 @@ if(CMAKE_VERSION VERSION_LESS "3.23.0") elseif(CMAKE_VERSION VERSION_LESS "3.24.0") set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major) else() - set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native) + set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES 120) endif() target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$,$>:-G>") set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) diff --git a/cuda-gl-check/run_linux.sh b/cuda-gl-check/run_linux.sh new file mode 100755 index 0000000..bfd52b9 --- /dev/null +++ b/cuda-gl-check/run_linux.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +CURRENT_DIR=$(pwd) +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +while true; do + read -p "Do you want to re-build? (y/n): " yn + case $yn in + [Yy]* ) + cd $SCRIPT_DIR; # cd to where script is located + + trash build; + mkdir build && cd build; + cmake ..; + make -j$(nproc --all); + + cd $CURRENT_DIR; # cd to stored original directory + break;; + [Nn]* ) + break;; + * ) echo "Invalid input. Please answer 'y' or 'n'.";; + esac +done + +XDG_SESSION_TYPE=x11 $SCRIPT_DIR/build/bin/cuda-gl-check diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp index 886fd4c..df59fa4 100644 --- a/cuda-gl-check/src/main.cpp +++ b/cuda-gl-check/src/main.cpp @@ -11,7 +11,7 @@ */ int main(int argc, char* argv[]) { // TODO: Change this line to use your name! - m_yourName = "TODO: YOUR NAME HERE"; + m_yourName = "Amy Liu"; if (init(argc, argv)) { mainLoop(); diff --git a/cuda-introduction/.gitignore b/cuda-introduction/.gitignore new file mode 100644 index 0000000..ac3cfe6 --- /dev/null +++ b/cuda-introduction/.gitignore @@ -0,0 +1,3 @@ +build +output +log.txt \ No newline at end of file diff --git a/cuda-introduction/.vscode/c_cpp_properties.json b/cuda-introduction/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..690cd28 --- /dev/null +++ b/cuda-introduction/.vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/source/**" + ], + "defines": [], + "compilerPath": "/usr/local/cuda/bin/nvcc", + "cStandard": "gnu17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.makefile-tools" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/cuda-introduction/.vscode/extension.json b/cuda-introduction/.vscode/extension.json new file mode 100644 index 0000000..f176108 --- /dev/null +++ b/cuda-introduction/.vscode/extension.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "nvidia.nsight-vscode-edition", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools" + ] +} \ No newline at end of file diff --git a/cuda-introduction/.vscode/launch.json b/cuda-introduction/.vscode/launch.json new file mode 100644 index 0000000..9f25ee7 --- /dev/null +++ b/cuda-introduction/.vscode/launch.json @@ -0,0 +1,13 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "preLaunchTask": "CMake: build", + "type": "cuda-gdb", + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "logFile": "${workspaceFolder}/log.txt", + } + ] +} diff --git a/cuda-introduction/.vscode/settings.json b/cuda-introduction/.vscode/settings.json new file mode 100644 index 0000000..9ec8d6b --- /dev/null +++ b/cuda-introduction/.vscode/settings.json @@ -0,0 +1,63 @@ +{ + "files.associations": { + "cpp": "cuda-cpp", + "hpp": "cuda-cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "cctype": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "format": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "queue": "cpp", + "ranges": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "text_encoding": "cpp", + "typeinfo": "cpp", + "variant": "cpp" + } +} \ No newline at end of file diff --git a/cuda-introduction/.vscode/tasks.json b/cuda-introduction/.vscode/tasks.json new file mode 100644 index 0000000..feb5a39 --- /dev/null +++ b/cuda-introduction/.vscode/tasks.json @@ -0,0 +1,16 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "CMake: build", + "command": "build", + "targets": [ + "${command:cmake.buildTargetName}" + ], + "group": "build", + "problemMatcher": ["$nvcc"], + "detail": "CMake template build task" + }, + ] +} \ No newline at end of file diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu index dce8793..ba880f0 100644 --- a/cuda-introduction/source/common.cu +++ b/cuda-introduction/source/common.cu @@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div) { // TODO: implement a 1 line function to return the divup operation. // Note: You only need to use addition, subtraction, and division operations. - return 0; + return (size + div - 1) / div; } void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value) @@ -26,6 +26,11 @@ bool compareReferenceAndResult(const float *ref, const float *res, unsigned size bool passed = true; for (unsigned i = 0; i < size; i++) { +#if 0 + // print thread info regardless of fail or not + std::cout << "ID: " << i << " \t Res: " << res[i] << " \t Ref: " << ref[i] << std::endl; +#endif + // LOOK: Check if floating point values are equal within an epsilon as returns can vary slightly between CPU and GPU if (std::fabs(res[i] - ref[i]) > epsilon) { diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu index 826e535..fb0ba63 100644 --- a/cuda-introduction/source/matmul.cu +++ b/cuda-introduction/source/matmul.cu @@ -12,17 +12,26 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con { // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions. // Remember that each thread of the kernel computes the result of 1 unique element of P - unsigned px; - unsigned py; + unsigned px = blockIdx.x * blockDim.x + threadIdx.x; + unsigned py = blockIdx.y * blockDim.y + threadIdx.y; // TODO 10b: Check if px or py are out of bounds. If they are, return. + if (px >= sizeMX || py >= sizeNY) + { + return; + } + // TODO 10c: Compute the dot product for the P element in each thread // This loop will be the same as the host loop float dot = 0.0; + for (unsigned k = 0; k < sizeXY; k++) + { + dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; + } // TODO 10d: Copy dot to P matrix - // matrixP[] = dot; + matrixP[px * sizeNY + py] = dot; } int main(int argc, char *argv[]) @@ -31,19 +40,15 @@ int main(int argc, char *argv[]) // Then try large multiple-block square matrix like 64x64 up to 2048x2048. // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771 // Then try rectangles with powers of two and then non-power-of-two. - const unsigned sizeMX = 0; - const unsigned sizeXY = 0; - const unsigned sizeNY = 0; + const unsigned sizeMX = 32; + const unsigned sizeXY = 64; + const unsigned sizeNY = 128; // TODO 2: Allocate host 1D arrays for: - // matrixM[sizeMX, sizeXY] - // matrixN[sizeXY, sizeNY] - // matrixP[sizeMX, sizeNY] - // matrixPGold[sizeMX, sizeNY] - float* matrixM; - float* matrixN; - float* matrixP; - float* matrixPGold; + float* matrixM = new float[sizeMX * sizeXY]; + float* matrixN = new float[sizeXY * sizeNY]; + float* matrixP = new float[sizeMX * sizeNY]; + float* matrixPGold = new float[sizeMX * sizeNY]; // LOOK: Setup random number generator and fill host arrays and the scalar a. std::random_device rd; @@ -59,19 +64,34 @@ int main(int argc, char *argv[]) matrixN[i] = dist(mt); // TODO 3: Compute "gold" reference standard - // for py -> 0 to sizeNY - // for px -> 0 to sizeMX - // initialize dot product accumulator - // for k -> 0 to sizeXY - // dot = m[k, px] * n[py, k] - // matrixPGold[py, px] = dot + for (unsigned py = 0; py < sizeNY; py++) + { + for (unsigned px = 0; px < sizeMX; px++) + { + float dot = 0.f; + for (unsigned k = 0; k < sizeXY; k++) + { + dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; + } + matrixPGold[px * sizeNY + py] = dot; + } + } // Device arrays float *d_matrixM, *d_matrixN, *d_matrixP; // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP. + const size_t sizeInBytesM = sizeMX * sizeXY * sizeof(float); + const size_t sizeInBytesN = sizeXY * sizeNY * sizeof(float); + const size_t sizeInBytesP = sizeMX * sizeNY * sizeof(float); + + CUDA(cudaMalloc((void **)& d_matrixM, sizeInBytesM)); + CUDA(cudaMalloc((void **)& d_matrixN, sizeInBytesN)); + CUDA(cudaMalloc((void **)& d_matrixP, sizeInBytesP)); // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_matrixM, matrixM, sizeInBytesM, cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_matrixN, matrixN, sizeInBytesN, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -82,17 +102,21 @@ int main(int argc, char *argv[]) // LOOK: Use the clearHostAndDeviceArray function to clear matrixP and d_matrixP clearHostAndDeviceArray(matrixP, d_matrixP, sizeMX * sizeNY); + const int BS_X = 32; + const int BS_Y = 32; + // TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup // HINT: The shape of matrices has no impact on launch configuaration DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1); // TODO 7: Launch the matrix transpose kernel - // matrixMultiplicationNaive<<<>>>(); + matrixMultiplicationNaive<<>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY); // TODO 8: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(matrixP, d_matrixP, sizeInBytesP, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3); @@ -101,6 +125,9 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 9: free device memory using cudaFree + CUDA(cudaFree(d_matrixM)); + CUDA(cudaFree(d_matrixN)); + CUDA(cudaFree(d_matrixP)); // free host memory delete[] matrixM; diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu index 5ed591f..9478232 100644 --- a/cuda-introduction/source/saxpy.cu +++ b/cuda-introduction/source/saxpy.cu @@ -9,20 +9,24 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size) { // TODO 9: Compute the global index for each thread. - unsigned idx = 0; + unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // TODO 10: Check if idx is out of bounds. If yes, return. - if (idx >= 0) + if (idx >= size) + { return; + } // TODO 11: Perform the SAXPY operation: z = a * x + y. + + z[idx] = a * x[idx] + y[idx]; } int main(int argc, char *argv[]) { // TODO 1: Set the size. Start with something simple like 64. // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127 - const unsigned size = 0; + const unsigned size = 256; // Host arrays. float* x = new float[size]; @@ -51,11 +55,16 @@ int main(int argc, char *argv[]) // Device arrays float *d_x, *d_y, *d_z; + int sizeInBytes = size * sizeof(float); + // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z. - // CUDA(cudaMalloc((void **)& pointer, size in bytes))); + CUDA(cudaMalloc((void **)& d_x, sizeInBytes)); + CUDA(cudaMalloc((void **)& d_y, sizeInBytes)); + CUDA(cudaMalloc((void **)& d_z, sizeInBytes)); // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2, - // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum)); + CUDA(cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -69,17 +78,20 @@ int main(int argc, char *argv[]) // TODO 4: Setup threads and blocks. // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024 // Use divup to get the number of blocks to launch. - const unsigned threadsPerBlock = 0; + const unsigned threadsPerBlock = 128; // TODO 5: Implement the divup function in common.cpp const unsigned blocks = divup(size, threadsPerBlock); // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration - // saxpy<<< >>> (....); + saxpy<<>> (d_z, d_x, d_y, a, size); // TODO 7: Copy the answer back to the host (CPU) from the device (GPU). // Copy what you did in 3, except for d_z -> z. + cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost); + + // LOOK: Use postprocess to check the result compareReferenceAndResult(z_gold, z, size, 1e-6); std::cout << "****************************************************" << std::endl << std::endl; @@ -88,6 +100,10 @@ int main(int argc, char *argv[]) // TODO 8: free device memory using cudaFree // CUDA(cudaFree(device pointer)); + CUDA(cudaFree(d_x)); + CUDA(cudaFree(d_y)); + CUDA(cudaFree(d_z)); + // free host memory delete[] x; delete[] y; diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu index 89f6f8f..4debe71 100644 --- a/cuda-introduction/source/transpose.cu +++ b/cuda-introduction/source/transpose.cu @@ -19,16 +19,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 6a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0;; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 6b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + { + return; + } // TODO 6c: Compute global 1D index from i and j - unsigned index = 0; + unsigned index = (j * sizeX) + i; // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same - // b[] = a[]; + b[index] = a[index]; } // TODO 11: Implement the transpose kernel @@ -38,16 +42,22 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 11a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 11b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + { + return; + } // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i) - unsigned index_in = 0; // Compute input index (i,j) from matrix A - unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A) + unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A + unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A) // TODO 11d: Copy data from A to B using transpose indices + + b[index_out] = a[index_in]; } int main(int argc, char *argv[]) @@ -81,12 +91,22 @@ int main(int argc, char *argv[]) // Device arrays float *d_a, *d_b; + float sizeInBytes = sizeX * sizeY * sizeof(float); + // TODO 2: Allocate memory on the device for d_a and d_b. + CUDA(cudaMalloc((void **)& d_a, sizeInBytes)); + CUDA(cudaMalloc((void **)& d_b, sizeInBytes)); + // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); + // declare block size components here + const unsigned BS_X = 32; + const unsigned BS_Y = 32; + //////////////////////////////////////////////////////////// std::cout << "****************************************************" << std::endl; std::cout << "***Device To Device Copy***" << std::endl; @@ -97,13 +117,14 @@ int main(int argc, char *argv[]) // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1); // LOOK: Launch the copy kernel copyKernel<<>>(d_a, d_b, sizeX, sizeY); // TODO 5: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(a_gold, b, sizeX * sizeY); @@ -121,13 +142,14 @@ int main(int argc, char *argv[]) // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(divup(sizeX, BS_Y), divup(sizeY, BS_Y), 1); // TODO 9: Launch the matrix transpose kernel - // matrixTransposeNaive<<<>>>(......); + matrixTransposeNaive<<>>(d_a, d_b, sizeX, sizeY); // TODO 10: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(b_gold, b, sizeX * sizeY); @@ -136,6 +158,8 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 7: free device memory using cudaFree + CUDA(cudaFree(d_a)); + CUDA(cudaFree(d_b)); // free host memory delete[] a; diff --git a/images/modified_project_2.1.2.png b/images/modified_project_2.1.2.png new file mode 100644 index 0000000..d7d498c Binary files /dev/null and b/images/modified_project_2.1.2.png differ diff --git a/images/nsight_compute_config.png b/images/nsight_compute_config.png new file mode 100644 index 0000000..64fee16 Binary files /dev/null and b/images/nsight_compute_config.png differ diff --git a/images/nsight_compute_details.png b/images/nsight_compute_details.png new file mode 100644 index 0000000..e69ea73 Binary files /dev/null and b/images/nsight_compute_details.png differ diff --git a/images/nsight_compute_summary.png b/images/nsight_compute_summary.png new file mode 100644 index 0000000..6bd9618 Binary files /dev/null and b/images/nsight_compute_summary.png differ diff --git a/images/nsight_systems_analysis_summary.png b/images/nsight_systems_analysis_summary.png new file mode 100644 index 0000000..ca8f7df Binary files /dev/null and b/images/nsight_systems_analysis_summary.png differ diff --git a/images/nsight_systems_config.png b/images/nsight_systems_config.png new file mode 100644 index 0000000..4e91016 Binary files /dev/null and b/images/nsight_systems_config.png differ diff --git a/images/nsight_systems_timeline.png b/images/nsight_systems_timeline.png new file mode 100644 index 0000000..dc6a1b2 Binary files /dev/null and b/images/nsight_systems_timeline.png differ diff --git a/images/webgl_report.png b/images/webgl_report.png new file mode 100644 index 0000000..daa293a Binary files /dev/null and b/images/webgl_report.png differ diff --git a/images/webgpu_report_wayland.png b/images/webgpu_report_wayland.png new file mode 100644 index 0000000..c1e89d8 Binary files /dev/null and b/images/webgpu_report_wayland.png differ diff --git a/images/webgpu_report_xorg.png b/images/webgpu_report_xorg.png new file mode 100644 index 0000000..9933275 Binary files /dev/null and b/images/webgpu_report_xorg.png differ