diff --git a/README.md b/README.md index d2fa33d..3fbec0c 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,69 @@ Project 0 Getting Started **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Oliver Hendrych + * [LinkedIn](https://www.linkedin.com/in/oliver-hendrych/) +* Tested on: Windows 11, i7-14700K @ 3.4GHz 32GB, RTX 4070 SUPER 12GB (Personal) -### (TODO: Your README) +## Assignment Include screenshots, analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) + +### 2.1.1.1 + +Compute Capability: 8.9 (GeForce RTX 4070) + +### 2.1 + +#### 2.1.2 + +Modified CUDA Project Window: + +![Screenshot with Oliver Hendrych in title](images/part_2_1_2.png) + +#### 2.1.3 + +I set to the conition to `index == 1621`. With a width of 800, that would mean the pixel is at x=21, y=2. +With block sizes of 16x16, then the block index should be x=1, y=0, and the thread index should be x=5, y=2, which is what we observe. + +![Screenshot with index == 1621](images/part_2_1_3_autos_and_warp_info.png) + +#### 2.1.4 + +In the analysis summary, we can see details about the configuration, and we can see that most of the CPU utilization comes from the `cuda-gl-check.exe`, followed by `Idle` + +![Screenshot showing configuration and cpu utilization](images/part_2_1_4_analysis_summary.png) + +We can see the `cuda-gl-check.exe` process, and if we look at the kernel calls, we can see the createVersionVisualization kernel invokes in the event view. + +![Screenshot with timeline and events view](images/part_2_1_4_timeline.png) + +#### 2.1.5 + +Note: there is a known bug relating to Nsight Compute when trying to use the kernel replay. For that reason, rather than trying to downgrade Nsight Compute (which would also involve downgrading my CUDA Toolkit version), I used the WAR which is to use Application for the replay mode. I'm not sure if it makes a difference. [https://forums.developer.nvidia.com/t/error-failed-to-profile-createversionvisualization-in-process-12840/343411/16](https://forums.developer.nvidia.com/t/error-failed-to-profile-createversionvisualization-in-process-12840/343411/16) + +In the summary view, we can see the createVersionVisualization kernel calls. + +![Screenshot of summary view](images/part_2_1_5_summary.png) + +In the detailed view, we can see details about the throughput and launch statistics. + +![Screenshot of detailed view](images/part_2_1_5_details.png) + +### 2.2 + +In Chrome, we can see that both WebGL versions (1 and 2) are available. + +![Screenshot of enabled WebGL1](images/part_2_2_webgl1.png) + +![Screenshot of enabled WebGL2](images/part_2_2_webgl2.png) + +### 2.3 + +In Chrome, we can see that WebGPU is enabled with an nvidia vendor. + +![Screenshot of enabled WebGPU](images/part_2_3_webgpu.png) + + + diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp index 886fd4c..1a20bce 100644 --- a/cuda-gl-check/src/main.cpp +++ b/cuda-gl-check/src/main.cpp @@ -11,7 +11,7 @@ */ int main(int argc, char* argv[]) { // TODO: Change this line to use your name! - m_yourName = "TODO: YOUR NAME HERE"; + m_yourName = "Oliver Hendrych"; if (init(argc, argv)) { mainLoop(); diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu index dce8793..49257ba 100644 --- a/cuda-introduction/source/common.cu +++ b/cuda-introduction/source/common.cu @@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div) { // TODO: implement a 1 line function to return the divup operation. // Note: You only need to use addition, subtraction, and division operations. - return 0; + return (size / div) + ((size % div == 0) ? 0 : 1); } void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value) diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu index 826e535..2c8df2d 100644 --- a/cuda-introduction/source/matmul.cu +++ b/cuda-introduction/source/matmul.cu @@ -12,17 +12,23 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con { // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions. // Remember that each thread of the kernel computes the result of 1 unique element of P - unsigned px; - unsigned py; + unsigned px = blockIdx.x * blockDim.x + threadIdx.x; + unsigned py = blockIdx.y * blockDim.y + threadIdx.y; // TODO 10b: Check if px or py are out of bounds. If they are, return. + if (px >= sizeMX || py >= sizeNY) { + return; + } // TODO 10c: Compute the dot product for the P element in each thread // This loop will be the same as the host loop float dot = 0.0; + for (unsigned k = 0; k < sizeXY; k++) { + dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; + } // TODO 10d: Copy dot to P matrix - // matrixP[] = dot; + matrixP[px * sizeNY + py] = dot; } int main(int argc, char *argv[]) @@ -31,19 +37,19 @@ int main(int argc, char *argv[]) // Then try large multiple-block square matrix like 64x64 up to 2048x2048. // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771 // Then try rectangles with powers of two and then non-power-of-two. - const unsigned sizeMX = 0; - const unsigned sizeXY = 0; - const unsigned sizeNY = 0; + const unsigned sizeMX = 156; + const unsigned sizeXY = 234; + const unsigned sizeNY = 125; // TODO 2: Allocate host 1D arrays for: // matrixM[sizeMX, sizeXY] // matrixN[sizeXY, sizeNY] // matrixP[sizeMX, sizeNY] // matrixPGold[sizeMX, sizeNY] - float* matrixM; - float* matrixN; - float* matrixP; - float* matrixPGold; + float* matrixM = new float[sizeMX * sizeXY]; + float* matrixN = new float[sizeXY * sizeNY]; + float* matrixP = new float[sizeMX * sizeNY]; + float* matrixPGold = new float[sizeMX * sizeNY]; // LOOK: Setup random number generator and fill host arrays and the scalar a. std::random_device rd; @@ -66,12 +72,27 @@ int main(int argc, char *argv[]) // dot = m[k, px] * n[py, k] // matrixPGold[py, px] = dot + for (unsigned py = 0; py < sizeNY; py++) { + for (unsigned px = 0; px < sizeMX; px++) { + float sum = 0; + for (unsigned k = 0; k < sizeXY; k++) { + sum += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; + } + matrixPGold[px * sizeNY + py] = sum; + } + } + // Device arrays float *d_matrixM, *d_matrixN, *d_matrixP; // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP. + CUDA(cudaMalloc(&d_matrixM, sizeMX * sizeXY * sizeof(float))); + CUDA(cudaMalloc(&d_matrixN, sizeXY * sizeNY * sizeof(float))); + CUDA(cudaMalloc(&d_matrixP, sizeMX * sizeNY * sizeof(float))); // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -86,13 +107,14 @@ int main(int argc, char *argv[]) // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup // HINT: The shape of matrices has no impact on launch configuaration DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(11, 13, 1); + dims.dimGrid = dim3(divup(sizeMX, dims.dimBlock.x), divup(sizeNY, dims.dimBlock.y), 1); // TODO 7: Launch the matrix transpose kernel - // matrixMultiplicationNaive<<<>>>(); + matrixMultiplicationNaive<<>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY); // TODO 8: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3); @@ -101,6 +123,9 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 9: free device memory using cudaFree + CUDA(cudaFree(d_matrixM)); + CUDA(cudaFree(d_matrixN)); + CUDA(cudaFree(d_matrixP)); // free host memory delete[] matrixM; diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu index 5ed591f..02e4654 100644 --- a/cuda-introduction/source/saxpy.cu +++ b/cuda-introduction/source/saxpy.cu @@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size) { // TODO 9: Compute the global index for each thread. - unsigned idx = 0; + unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // TODO 10: Check if idx is out of bounds. If yes, return. - if (idx >= 0) + if (idx >= size) return; // TODO 11: Perform the SAXPY operation: z = a * x + y. + z[idx] = a * x[idx] + y[idx]; } int main(int argc, char *argv[]) { // TODO 1: Set the size. Start with something simple like 64. // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127 - const unsigned size = 0; + const unsigned size = 2049; // Host arrays. float* x = new float[size]; @@ -51,11 +52,18 @@ int main(int argc, char *argv[]) // Device arrays float *d_x, *d_y, *d_z; + size_t byteSize = size * sizeof(float); + // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z. // CUDA(cudaMalloc((void **)& pointer, size in bytes))); + CUDA(cudaMalloc((void**)&d_x, byteSize)); + CUDA(cudaMalloc((void**)&d_y, byteSize)); + CUDA(cudaMalloc((void**)&d_z, byteSize)); // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2, // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum)); + CUDA(cudaMemcpy(d_x, x, byteSize, cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_y, y, byteSize, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -69,16 +77,18 @@ int main(int argc, char *argv[]) // TODO 4: Setup threads and blocks. // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024 // Use divup to get the number of blocks to launch. - const unsigned threadsPerBlock = 0; + const unsigned threadsPerBlock = 128; // TODO 5: Implement the divup function in common.cpp const unsigned blocks = divup(size, threadsPerBlock); // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration // saxpy<<< >>> (....); + saxpy << > > (d_z, d_x, d_y, a, size); // TODO 7: Copy the answer back to the host (CPU) from the device (GPU). // Copy what you did in 3, except for d_z -> z. + CUDA(cudaMemcpy(z, d_z, byteSize, cudaMemcpyDeviceToHost)); // LOOK: Use postprocess to check the result compareReferenceAndResult(z_gold, z, size, 1e-6); @@ -87,6 +97,9 @@ int main(int argc, char *argv[]) // TODO 8: free device memory using cudaFree // CUDA(cudaFree(device pointer)); + CUDA(cudaFree(d_x)); + CUDA(cudaFree(d_y)); + CUDA(cudaFree(d_z)); // free host memory delete[] x; diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu index 89f6f8f..6fb7d02 100644 --- a/cuda-introduction/source/transpose.cu +++ b/cuda-introduction/source/transpose.cu @@ -19,16 +19,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 6a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0;; + unsigned i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned j = blockDim.y * blockIdx.y + threadIdx.y; // TODO 6b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + return; // TODO 6c: Compute global 1D index from i and j - unsigned index = 0; + unsigned index = sizeX * j + i; // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same // b[] = a[]; + b[index] = a[index]; } // TODO 11: Implement the transpose kernel @@ -38,16 +41,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 11a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0; + unsigned i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned j = blockDim.y * blockIdx.y + threadIdx.y; // TODO 11b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + return; // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i) - unsigned index_in = 0; // Compute input index (i,j) from matrix A - unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A) + unsigned index_in = sizeX * j + i; // Compute input index (i,j) from matrix A + unsigned index_out = sizeY * i + j; // Compute output index (j,i) in matrix B = transpose(A) // TODO 11d: Copy data from A to B using transpose indices + b[index_out] = a[index_in]; } int main(int argc, char *argv[]) @@ -82,8 +88,12 @@ int main(int argc, char *argv[]) float *d_a, *d_b; // TODO 2: Allocate memory on the device for d_a and d_b. + size_t sizeBytes = sizeX * sizeY * sizeof(float); + CUDA(cudaMalloc(&d_a, sizeBytes)); + CUDA(cudaMalloc(&d_b, sizeBytes)); // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_a, a, sizeBytes, cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -97,13 +107,14 @@ int main(int argc, char *argv[]) // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(11, 13, 1); + dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x), divup(sizeY, dims.dimGrid.y), 1); // LOOK: Launch the copy kernel copyKernel<<>>(d_a, d_b, sizeX, sizeY); // TODO 5: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(a_gold, b, sizeX * sizeY); @@ -121,13 +132,14 @@ int main(int argc, char *argv[]) // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(11, 13, 1); + dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x), divup(sizeY, dims.dimGrid.y), 1); // TODO 9: Launch the matrix transpose kernel - // matrixTransposeNaive<<<>>>(......); + matrixTransposeNaive<<>>(d_a, d_b, sizeX, sizeY); // TODO 10: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeBytes, cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(b_gold, b, sizeX * sizeY); @@ -136,6 +148,8 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 7: free device memory using cudaFree + CUDA(cudaFree(d_a)); + CUDA(cudaFree(d_b)); // free host memory delete[] a; diff --git a/images/part_2_1_2.png b/images/part_2_1_2.png new file mode 100644 index 0000000..01926f3 Binary files /dev/null and b/images/part_2_1_2.png differ diff --git a/images/part_2_1_3_autos_and_warp_info.png b/images/part_2_1_3_autos_and_warp_info.png new file mode 100644 index 0000000..29dd754 Binary files /dev/null and b/images/part_2_1_3_autos_and_warp_info.png differ diff --git a/images/part_2_1_4_analysis_summary.png b/images/part_2_1_4_analysis_summary.png new file mode 100644 index 0000000..dc87e5a Binary files /dev/null and b/images/part_2_1_4_analysis_summary.png differ diff --git a/images/part_2_1_4_timeline.png b/images/part_2_1_4_timeline.png new file mode 100644 index 0000000..7a9ae24 Binary files /dev/null and b/images/part_2_1_4_timeline.png differ diff --git a/images/part_2_1_5_details.png b/images/part_2_1_5_details.png new file mode 100644 index 0000000..790f20d Binary files /dev/null and b/images/part_2_1_5_details.png differ diff --git a/images/part_2_1_5_summary.png b/images/part_2_1_5_summary.png new file mode 100644 index 0000000..423db2f Binary files /dev/null and b/images/part_2_1_5_summary.png differ diff --git a/images/part_2_2_webgl1.png b/images/part_2_2_webgl1.png new file mode 100644 index 0000000..243c18a Binary files /dev/null and b/images/part_2_2_webgl1.png differ diff --git a/images/part_2_2_webgl2.png b/images/part_2_2_webgl2.png new file mode 100644 index 0000000..03ee4b1 Binary files /dev/null and b/images/part_2_2_webgl2.png differ diff --git a/images/part_2_3_webgpu.png b/images/part_2_3_webgpu.png new file mode 100644 index 0000000..0365ad1 Binary files /dev/null and b/images/part_2_3_webgpu.png differ