diff --git a/README.md b/README.md
index d2fa33d..31ab0be 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,142 @@
-Project 0 Getting Started
-====================
+# CUDA Setup and Development Environment Setup
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Amy Liu
+  * [Personal Website](https://amyliu.dev/), [LinkedIn](https://linkedin.com/in/miyalana), [Github](https://github.com/mialana).
+* Tested on: Ubuntu 24.04.3 LTS, Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB
 
-### (TODO: Your README)
+## Part 1: Installation and Setup
+
+NVIDIA Drivers:
+- **Version:** 580.76.05
+- **Method:** Added Ubuntu PPA repository to local APT registry in order to install latest 580 beta driver. ([primary followed guide](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-22-04))
+
+CUDA:
+- **Version:** 13.0
+- **Method:** From NVIDIA `.run` file. Deselected auto-install of drivers in execution menu. ([primary followed guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#runfile-installation))
+
+## Part 2.1: Development Environment Testing
+
+### Part 2.1.1: Project Build & Run
+
+#### GPU Optimus Mode
+
+On Linux Wayland, NVIDIA GPU is not the primary GPU. Build & run without modification leads to the following error:
+
+```bash
+Cuda error: Kernel failed!: CUDA-capable device(s) is/are busy or unavailable.
+```
+
+To solve this, two environment variables must be added wherever I run the executable (e.g. from command line, from all Nsight applications, etc ):
+
+```bash
+__PRIME_RENDER_OFFLOAD=1
+__GLX_VENDOR_LIBRARY_NAME=nvidia
+```
+
+Alternatively, modify the `/etc/environment` file to contain:
+
+```bash
+QT_QPA_PLATFORMTHEME="wayland;xcb"
+GBM_BACKEND=nvidia-drm
+__GLX_VENDOR_LIBRARY_NAME=nvidia
+ENABLE_VKBASALT=1
+LIBVA_DRIVER_NAME=nvidia
+WLR_NO_HARDWARE_CURSORS=1
+```
+
+#### GLFW and Wayland incompability
+
+When using GLFW on Wayland, the `glfwInit()` function does not work (program will exit at `Line 49` of `main.cpp`). To counter this, set two environment variables:
+
+```bash
+WAYLAND_DISPLAY=""
+XDG_SESSION_TYPE=x11
+```
+
+#### CMakeLists.txt Modification
+
+Due to having a very new GPU model, I changed the `CUDA_ARCHITECTURES` variable from `native` to my explicit arch, i.e. `120`.
+
+```bash
+if(CMAKE_VERSION VERSION_LESS "3.23.0")
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES OFF)
+elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major)
+else()
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES 120)
+endif()
+```
+
+#### `run.sh`
+
+Wrote a simple run script to facilitate 1. build + run as well as 2. run without build from command line. Simplifies environment variable requirements mentioned above.
+
+Located at `cuda-gl-check/run.sh`.
+
+### Part 2.1.2: CUDA Project UI Modification
+
+![img](images/modified_project_2.1.2.png)
+
+### Part 2.1.3: Nsight Debugger
+
+Final `launch.json` configuration:
+
+```json
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/bin/cuda-gl-check",
+            "environment": [
+                { "name": "__NV_PRIME_RENDER_OFFLOAD", "value": "1" },
+                { "name": "__GLX_VENDOR_LIBRARY_NAME", "value": "nvidia" }
+            ]
+        }
+    ]
+}
+```
+
+### Part 2.1.4: Nsight Systems
+
+Configuration:\
+![img](images/nsight_systems_config.png)
+
+Timeline Window:
+![img](images/nsight_systems_timeline.png)
+
+Analysis Summary Window:
+![img](images/nsight_systems_analysis_summary.png)
+
+
+### Part 2.1.5: Nsight Compute
+
+Configuration:
+![img](images/nsight_compute_config.png)
+
+Summary Window:
+![img](images/nsight_compute_summary.png)
+
+Details Window:
+![img](images/nsight_compute_details.png)
+
+
+## Part 2.2: WebGL
+
+Works on both Zen (Firefox-based) and Arc (Chromium) browsers.
+![img](images/webgl_report.png)
+
+## Part 2.3: WebGPU
+
+Does **not** work on Linux Wayland.
+
+![img](images/webgpu_report_wayland.png)
+
+Works on Linux Xorg. This requires logging out and toggling a startup option.
+
+![img](images/webgpu_report_xorg.png)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/cuda-gl-check/.gitignore b/cuda-gl-check/.gitignore
index 92a50c5..524a448 100644
--- a/cuda-gl-check/.gitignore
+++ b/cuda-gl-check/.gitignore
@@ -4,6 +4,8 @@
 *.xcodeproj
 build
 
+compute
+
 # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode
 
 ### Linux ###
diff --git a/cuda-gl-check/.vscode/launch.json b/cuda-gl-check/.vscode/launch.json
new file mode 100644
index 0000000..32520da
--- /dev/null
+++ b/cuda-gl-check/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${workspaceFolder}/build/bin/cuda-gl-check",
+            "environment": [
+                { "name": "__NV_PRIME_RENDER_OFFLOAD", "value": "1" },
+                { "name": "__GLX_VENDOR_LIBRARY_NAME", "value": "nvidia" }
+            ]
+        }
+    ]
+}
diff --git a/cuda-gl-check/.vscode/settings.json b/cuda-gl-check/.vscode/settings.json
new file mode 100644
index 0000000..854df65
--- /dev/null
+++ b/cuda-gl-check/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "cpp": "cuda-cpp",
+        "hpp": "cuda-cpp"
+    }
+}
\ No newline at end of file
diff --git a/cuda-gl-check/CMakeLists.txt b/cuda-gl-check/CMakeLists.txt
index df5826d..d0dbc4f 100644
--- a/cuda-gl-check/CMakeLists.txt
+++ b/cuda-gl-check/CMakeLists.txt
@@ -71,7 +71,7 @@ if(CMAKE_VERSION VERSION_LESS "3.23.0")
 elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
     set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES all-major)
 else()
-    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
+    set_target_properties(${CMAKE_PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES 120)
 endif()
 target_compile_options(${CMAKE_PROJECT_NAME} PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G>")
 set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME})
diff --git a/cuda-gl-check/run_linux.sh b/cuda-gl-check/run_linux.sh
new file mode 100755
index 0000000..bfd52b9
--- /dev/null
+++ b/cuda-gl-check/run_linux.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+CURRENT_DIR=$(pwd)
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+while true; do
+    read -p "Do you want to re-build? (y/n): " yn
+    case $yn in
+        [Yy]* )
+        cd $SCRIPT_DIR; # cd to where script is located
+
+        trash build;
+        mkdir build && cd build;
+        cmake ..;
+        make -j$(nproc --all);
+
+        cd $CURRENT_DIR; # cd to stored original directory 
+        break;;
+        [Nn]* ) 
+        break;;
+        * ) echo "Invalid input. Please answer 'y' or 'n'.";;
+    esac
+done
+
+XDG_SESSION_TYPE=x11 $SCRIPT_DIR/build/bin/cuda-gl-check
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
index 886fd4c..df59fa4 100644
--- a/cuda-gl-check/src/main.cpp
+++ b/cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
  */
 int main(int argc, char* argv[]) {
     // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Amy Liu";
 
     if (init(argc, argv)) {
         mainLoop();
diff --git a/cuda-introduction/.gitignore b/cuda-introduction/.gitignore
new file mode 100644
index 0000000..ac3cfe6
--- /dev/null
+++ b/cuda-introduction/.gitignore
@@ -0,0 +1,3 @@
+build
+output
+log.txt
\ No newline at end of file
diff --git a/cuda-introduction/.vscode/c_cpp_properties.json b/cuda-introduction/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000..690cd28
--- /dev/null
+++ b/cuda-introduction/.vscode/c_cpp_properties.json
@@ -0,0 +1,17 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/source/**"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/local/cuda/bin/nvcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++17",
+            "intelliSenseMode": "linux-gcc-x64",
+            "configurationProvider": "ms-vscode.makefile-tools"
+        }
+    ],
+    "version": 4
+}
\ No newline at end of file
diff --git a/cuda-introduction/.vscode/extension.json b/cuda-introduction/.vscode/extension.json
new file mode 100644
index 0000000..f176108
--- /dev/null
+++ b/cuda-introduction/.vscode/extension.json
@@ -0,0 +1,7 @@
+{
+    "recommendations": [
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cpptools",
+        "ms-vscode.makefile-tools"
+    ]
+}
\ No newline at end of file
diff --git a/cuda-introduction/.vscode/launch.json b/cuda-introduction/.vscode/launch.json
new file mode 100644
index 0000000..9f25ee7
--- /dev/null
+++ b/cuda-introduction/.vscode/launch.json
@@ -0,0 +1,13 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "CUDA C++: Launch",
+            "preLaunchTask": "CMake: build",
+            "type": "cuda-gdb",
+            "request": "launch",
+            "program": "${command:cmake.launchTargetPath}",
+            "logFile": "${workspaceFolder}/log.txt",
+        }
+    ]
+}
diff --git a/cuda-introduction/.vscode/settings.json b/cuda-introduction/.vscode/settings.json
new file mode 100644
index 0000000..9ec8d6b
--- /dev/null
+++ b/cuda-introduction/.vscode/settings.json
@@ -0,0 +1,63 @@
+{
+    "files.associations": {
+        "cpp": "cuda-cpp",
+        "hpp": "cuda-cpp",
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "format": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "queue": "cpp",
+        "ranges": "cpp",
+        "span": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "streambuf": "cpp",
+        "text_encoding": "cpp",
+        "typeinfo": "cpp",
+        "variant": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/cuda-introduction/.vscode/tasks.json b/cuda-introduction/.vscode/tasks.json
new file mode 100644
index 0000000..feb5a39
--- /dev/null
+++ b/cuda-introduction/.vscode/tasks.json
@@ -0,0 +1,16 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "type": "cmake",
+            "label": "CMake: build",
+            "command": "build",
+            "targets": [
+                "${command:cmake.buildTargetName}"
+            ],
+            "group": "build",
+            "problemMatcher": ["$nvcc"],
+            "detail": "CMake template build task"
+        },
+    ]
+}
\ No newline at end of file
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..ba880f0 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size + div - 1) / div;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
@@ -26,6 +26,11 @@ bool compareReferenceAndResult(const float *ref, const float *res, unsigned size
     bool passed = true;
     for (unsigned i = 0; i < size; i++)
     {
+#if 0
+        // print thread info regardless of fail or not
+        std::cout << "ID: " << i << " \t Res: " << res[i] << " \t Ref: " << ref[i] << std::endl;
+#endif
+
         // LOOK: Check if floating point values are equal within an epsilon as returns can vary slightly between CPU and GPU
         if (std::fabs(res[i] - ref[i]) > epsilon)
         {
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..fb0ba63 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,26 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned py = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
+    if (px >= sizeMX || py >= sizeNY)
+    {
+        return;
+    }
+
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
     float dot = 0.0;
+    for (unsigned k = 0; k < sizeXY; k++)
+    {
+        dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+    }
 
     // TODO 10d: Copy dot to P matrix
-    // matrixP[] = dot;
+    matrixP[px * sizeNY + py] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,19 +40,15 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 32;
+    const unsigned sizeXY = 64;
+    const unsigned sizeNY = 128;
 
     // TODO 2: Allocate host 1D arrays for:
-    // matrixM[sizeMX, sizeXY]
-    // matrixN[sizeXY, sizeNY]
-    // matrixP[sizeMX, sizeNY]
-    // matrixPGold[sizeMX, sizeNY]
-    float* matrixM;
-    float* matrixN;
-    float* matrixP;
-    float* matrixPGold;
+    float* matrixM = new float[sizeMX * sizeXY];
+    float* matrixN = new float[sizeXY * sizeNY];
+    float* matrixP = new float[sizeMX * sizeNY];
+    float* matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -59,19 +64,34 @@ int main(int argc, char *argv[])
         matrixN[i] = dist(mt);
 
     // TODO 3: Compute "gold" reference standard
-    // for py -> 0 to sizeNY
-    //   for px -> 0 to sizeMX
-    //     initialize dot product accumulator
-    //     for k -> 0 to sizeXY
-    //       dot = m[k, px] * n[py, k]
-    //  matrixPGold[py, px] = dot
+    for (unsigned py = 0; py < sizeNY; py++)
+    {
+        for (unsigned px = 0; px < sizeMX; px++)
+        {
+            float dot = 0.f;
+            for (unsigned k = 0; k < sizeXY; k++)
+            {
+                dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+            }
+            matrixPGold[px * sizeNY + py] = dot;
+        }
+    }
 
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
+    const size_t sizeInBytesM = sizeMX * sizeXY * sizeof(float);
+    const size_t sizeInBytesN = sizeXY * sizeNY * sizeof(float);
+    const size_t sizeInBytesP = sizeMX * sizeNY * sizeof(float);
+
+    CUDA(cudaMalloc((void **)& d_matrixM, sizeInBytesM));
+    CUDA(cudaMalloc((void **)& d_matrixN, sizeInBytesN));
+    CUDA(cudaMalloc((void **)& d_matrixP, sizeInBytesP));
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_matrixM, matrixM, sizeInBytesM, cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_matrixN, matrixN, sizeInBytesN, cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -82,17 +102,21 @@ int main(int argc, char *argv[])
     // LOOK: Use the clearHostAndDeviceArray function to clear matrixP and d_matrixP
     clearHostAndDeviceArray(matrixP, d_matrixP, sizeMX * sizeNY);
 
+    const int BS_X = 32;
+    const int BS_Y = 32;
+
     // TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     // HINT: The shape of matrices has no impact on launch configuaration
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid  = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid  = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);
 
     // TODO 7: Launch the matrix transpose kernel
-    // matrixMultiplicationNaive<<<>>>();
+    matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(matrixP, d_matrixP, sizeInBytesP, cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +125,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+    CUDA(cudaFree(d_matrixM));
+    CUDA(cudaFree(d_matrixN));
+    CUDA(cudaFree(d_matrixP));
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..9478232 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,24 @@
 __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size) 
+    {
         return;
+    }
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+
+    z[idx] = a * x[idx] + y[idx];
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 256;
 
     // Host arrays.
     float* x = new float[size];
@@ -51,11 +55,16 @@ int main(int argc, char *argv[])
     // Device arrays
     float *d_x, *d_y, *d_z;
 
+    int sizeInBytes = size * sizeof(float);
+
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
-    // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
+    CUDA(cudaMalloc((void **)& d_x, sizeInBytes));
+    CUDA(cudaMalloc((void **)& d_y, sizeInBytes));
+    CUDA(cudaMalloc((void **)& d_z, sizeInBytes));
 
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
-    // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+    CUDA(cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -69,17 +78,20 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     // Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
-    // saxpy<<< >>> (....);
+    saxpy<<<blocks, threadsPerBlock>>> (d_z, d_x, d_y, a, size);
 
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
 
+    cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost);
+
+
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
     std::cout << "****************************************************" << std::endl << std::endl;
@@ -88,6 +100,10 @@ int main(int argc, char *argv[])
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
 
+    CUDA(cudaFree(d_x));
+    CUDA(cudaFree(d_y));
+    CUDA(cudaFree(d_z));
+
     // free host memory
     delete[] x;
     delete[] y;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..4debe71 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,20 @@
 __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) 
+    {
+        return;
+    }
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    unsigned index = (j * sizeX) + i;
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
-    // b[] = a[];
+    b[index] = a[index];
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +42,22 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) 
+    {
+        return;
+    }
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in  = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0;  // Compute output index (j,i) in matrix B = transpose(A)
+    unsigned index_in  = j * sizeX + i;  // Compute input index (i,j) from matrix A
+    unsigned index_out = i * sizeY + j;  // Compute output index (j,i) in matrix B = transpose(A)
 
     // TODO 11d: Copy data from A to B using transpose indices
+
+    b[index_out] = a[index_in];
 }
 
 int main(int argc, char *argv[])
@@ -81,12 +91,22 @@ int main(int argc, char *argv[])
     // Device arrays
     float *d_a, *d_b;
 
+    float sizeInBytes = sizeX * sizeY * sizeof(float);
+
     // TODO 2: Allocate memory on the device for d_a and d_b.
+    CUDA(cudaMalloc((void **)& d_a, sizeInBytes));
+    CUDA(cudaMalloc((void **)& d_b, sizeInBytes));
+
 
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
+    // declare block size components here
+    const unsigned BS_X = 32;
+    const unsigned BS_Y = 32;
+
     ////////////////////////////////////////////////////////////
     std::cout << "****************************************************" << std::endl;
     std::cout << "***Device To Device Copy***" << std::endl;
@@ -97,13 +117,14 @@ int main(int argc, char *argv[])
         // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(BS_X, BS_Y, 1);
+        dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);
 
         // LOOK: Launch the copy kernel
         copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
+        CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -121,13 +142,14 @@ int main(int argc, char *argv[])
         // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(BS_X, BS_Y, 1);
+        dims.dimGrid = dim3(divup(sizeX, BS_Y), divup(sizeY, BS_Y), 1);
 
         // TODO 9: Launch the matrix transpose kernel
-        // matrixTransposeNaive<<<>>>(......);
+        matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
+        CUDA(cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +158,8 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 7: free device memory using cudaFree
+    CUDA(cudaFree(d_a));
+    CUDA(cudaFree(d_b));
 
     // free host memory
     delete[] a;
diff --git a/images/modified_project_2.1.2.png b/images/modified_project_2.1.2.png
new file mode 100644
index 0000000..d7d498c
Binary files /dev/null and b/images/modified_project_2.1.2.png differ
diff --git a/images/nsight_compute_config.png b/images/nsight_compute_config.png
new file mode 100644
index 0000000..64fee16
Binary files /dev/null and b/images/nsight_compute_config.png differ
diff --git a/images/nsight_compute_details.png b/images/nsight_compute_details.png
new file mode 100644
index 0000000..e69ea73
Binary files /dev/null and b/images/nsight_compute_details.png differ
diff --git a/images/nsight_compute_summary.png b/images/nsight_compute_summary.png
new file mode 100644
index 0000000..6bd9618
Binary files /dev/null and b/images/nsight_compute_summary.png differ
diff --git a/images/nsight_systems_analysis_summary.png b/images/nsight_systems_analysis_summary.png
new file mode 100644
index 0000000..ca8f7df
Binary files /dev/null and b/images/nsight_systems_analysis_summary.png differ
diff --git a/images/nsight_systems_config.png b/images/nsight_systems_config.png
new file mode 100644
index 0000000..4e91016
Binary files /dev/null and b/images/nsight_systems_config.png differ
diff --git a/images/nsight_systems_timeline.png b/images/nsight_systems_timeline.png
new file mode 100644
index 0000000..dc6a1b2
Binary files /dev/null and b/images/nsight_systems_timeline.png differ
diff --git a/images/webgl_report.png b/images/webgl_report.png
new file mode 100644
index 0000000..daa293a
Binary files /dev/null and b/images/webgl_report.png differ
diff --git a/images/webgpu_report_wayland.png b/images/webgpu_report_wayland.png
new file mode 100644
index 0000000..c1e89d8
Binary files /dev/null and b/images/webgpu_report_wayland.png differ
diff --git a/images/webgpu_report_xorg.png b/images/webgpu_report_xorg.png
new file mode 100644
index 0000000..9933275
Binary files /dev/null and b/images/webgpu_report_xorg.png differ