diff --git a/README.md b/README.md
index d2fa33d..7de4486 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,15 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Christina Qiu
+  * [LinkedIn](https://www.linkedin.com/in/christina-qiu-6094301b6/), [personal website](https://christinaqiu3.github.io/)
+* Tested on: Windows 11, Intel Core i7-13700H @ 2.40GHz, 16GB RAM, NVIDIA GeForce RTX 4060 Laptop GPU (Personal laptop)
 
-### (TODO: Your README)
+### README
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](images/Screenshot%202025-08-30%20223233.png)
+![](images/Screenshot%202025-08-30%20223315.png)
+![](images/Screenshot%202025-08-30%20231757.png)
+![](images/Screenshot%202025-08-30%20234604.png)
+
+Note: I had an issue running Project 0 with Nsight Compute as recommended in Step 2.1.5.
diff --git a/cuda-introduction/.gitignore b/cuda-introduction/.gitignore
new file mode 100644
index 0000000..92a50c5
--- /dev/null
+++ b/cuda-introduction/.gitignore
@@ -0,0 +1,559 @@
+*.orig
+*.filters
+*.vcxproj
+*.xcodeproj
+build
+
+# Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode
+
+### Linux ###
+*~
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+
+### OSX ###
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### SublimeText ###
+# cache files for sublime text
+*.tmlanguage.cache
+*.tmPreferences.cache
+*.stTheme.cache
+
+# workspace files are user-specific
+*.sublime-workspace
+
+# project files should be checked into the repository, unless a significant
+# proportion of contributors will probably not be using SublimeText
+# *.sublime-project
+
+# sftp configuration file
+sftp-config.json
+
+
+### Windows ###
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+
+### JetBrains ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
+
+*.iml
+
+## Directory-based project format:
+#.idea/
+# if you remove the above rule, at least ignore the following:
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.ipr
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+
+
+### Vim ###
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+*~
+
+
+### Emacs ###
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+# Org-mode
+.org-id-locations
+*_archive
+
+# flymake-mode
+*_flymake.*
+
+# eshell files
+/eshell/history
+/eshell/lastdir
+
+# elpa packages
+/elpa/
+
+# reftex files
+*.rel
+
+# AUCTeX auto folder
+/auto/
+
+# cask packages
+.cask/
+
+
+### CMake ###
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Makefile
+cmake_install.cmake
+install_manifest.txt
+
+
+### C++ ###
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+
+### CUDA ###
+*.i
+*.ii
+*.gpu
+*.ptx
+*.cubin
+*.fatbin
+
+
+### VisualStudio ###
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.userosscache.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+build/
+bld/
+[Bb]in/
+[Oo]bj/
+
+# Visual Studio 2015 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# DNX
+project.lock.json
+artifacts/
+
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# JustCode is a .NET coding add-in
+.JustCode
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# TODO: Comment the next line if you want to checkin your web deploy settings 
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# NuGet Packages
+*.nupkg
+# The packages folder can be ignored because of Package Restore
+**/packages/*
+# except build/, which is used as an MSBuild target.
+!**/packages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/packages/repositories.config
+
+# Windows Azure Build Output
+csx/
+*.build.csdef
+
+# Windows Store app package directory
+AppPackages/
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!*.[Cc]ache/
+
+# Others
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.pfx
+*.publishsettings
+node_modules/
+orleans.codegen.cs
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+*.mdf
+*.ldf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+
+### WebStorm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
+
+*.iml
+
+## Directory-based project format:
+.idea/
+# if you remove the above rule, at least ignore the following:
+
+# User-specific stuff:
+# .idea/workspace.xml
+# .idea/tasks.xml
+# .idea/dictionaries
+
+# Sensitive or high-churn files:
+# .idea/dataSources.ids
+# .idea/dataSources.xml
+# .idea/sqlDataSources.xml
+# .idea/dynamic.xml
+# .idea/uiDesigner.xml
+
+# Gradle:
+# .idea/gradle.xml
+# .idea/libraries
+
+# Mongo Explorer plugin:
+# .idea/mongoSettings.xml
+
+## File-based project format:
+*.ipr
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+
+
+### Eclipse ###
+*.pydevproject
+.metadata
+.gradle
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.settings/
+.loadpath
+
+# Eclipse Core
+.project
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# JDT-specific (Eclipse Java Development Tools)
+.classpath
+
+# Java annotation processor (APT)
+.factorypath
+
+# PDT-specific
+.buildpath
+
+# sbteclipse plugin
+.target
+
+# TeXlipse plugin
+.texlipse
+
+
+### Xcode ###
+# Xcode
+#
+# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
+
+## Build generated
+build/
+DerivedData
+
+## Various settings
+*.pbxuser
+!default.pbxuser
+*.mode1v3
+!default.mode1v3
+*.mode2v3
+!default.mode2v3
+*.perspectivev3
+!default.perspectivev3
+xcuserdata
+
+## Other
+*.xccheckout
+*.moved-aside
+*.xcuserstate
diff --git a/cuda-introduction/CMakeLists.txt b/cuda-introduction/CMakeLists.txt
index 82eab9a..eb1728b 100644
--- a/cuda-introduction/CMakeLists.txt
+++ b/cuda-introduction/CMakeLists.txt
@@ -39,7 +39,7 @@ foreach(exe ${EXECUTABLES})
     elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
         set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES all-major)
     else()
-        set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES native)
+        set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES 89) # CHANGED THIS FROM native TO 89
     endif()
     target_compile_options(${exe} PRIVATE "$<$<AND:$<CONFIG:Debug,RelWithDebInfo>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>")
     target_compile_options(${exe} PRIVATE "$<$<AND:$<CONFIG:Release>,$<COMPILE_LANGUAGE:CUDA>>:-lineinfo;-src-in-ptx>")
diff --git a/cuda-introduction/CUDAIntroduction.sln b/cuda-introduction/CUDAIntroduction.sln
new file mode 100644
index 0000000..0f2b580
--- /dev/null
+++ b/cuda-introduction/CUDAIntroduction.sln
@@ -0,0 +1,87 @@
+﻿
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CMakePredefinedTargets", "CMakePredefinedTargets", "{696D6F54-7811-3467-A4CE-B6CCCC116630}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "saxpy", "saxpy.vcxproj", "{21EDE36A-0E55-3801-A021-BF8D59A5C54B}"
+	ProjectSection(ProjectDependencies) = postProject
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}"
+	ProjectSection(ProjectDependencies) = postProject
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}
+		{C19380DA-7422-329E-AD89-03A81FEB7372} = {C19380DA-7422-329E-AD89-03A81FEB7372}
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B} = {21EDE36A-0E55-3801-A021-BF8D59A5C54B}
+		{290915A9-191B-3931-8E65-1F8C7C3B624A} = {290915A9-191B-3931-8E65-1F8C7C3B624A}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matmul", "matmul.vcxproj", "{C19380DA-7422-329E-AD89-03A81FEB7372}"
+	ProjectSection(ProjectDependencies) = postProject
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "transpose", "transpose.vcxproj", "{290915A9-191B-3931-8E65-1F8C7C3B624A}"
+	ProjectSection(ProjectDependencies) = postProject
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+		MinSizeRel|x64 = MinSizeRel|x64
+		RelWithDebInfo|x64 = RelWithDebInfo|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Debug|x64.ActiveCfg = Debug|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Debug|x64.Build.0 = Debug|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Release|x64.ActiveCfg = Release|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Release|x64.Build.0 = Release|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{21EDE36A-0E55-3801-A021-BF8D59A5C54B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.Debug|x64.ActiveCfg = Debug|x64
+		{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.Release|x64.ActiveCfg = Release|x64
+		{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Debug|x64.ActiveCfg = Debug|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Debug|x64.Build.0 = Debug|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Release|x64.ActiveCfg = Release|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Release|x64.Build.0 = Release|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.Debug|x64.ActiveCfg = Debug|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.Debug|x64.Build.0 = Debug|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.Release|x64.ActiveCfg = Release|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.Release|x64.Build.0 = Release|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{C19380DA-7422-329E-AD89-03A81FEB7372}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.Debug|x64.ActiveCfg = Debug|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.Debug|x64.Build.0 = Debug|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.Release|x64.ActiveCfg = Release|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.Release|x64.Build.0 = Release|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
+		{290915A9-191B-3931-8E65-1F8C7C3B624A}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
+	EndGlobalSection
+	GlobalSection(NestedProjects) = preSolution
+		{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52} = {696D6F54-7811-3467-A4CE-B6CCCC116630}
+		{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {696D6F54-7811-3467-A4CE-B6CCCC116630}
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {639D87F3-11B1-36E2-BD8F-76B22CDC1CDF}
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..f582db2 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size / div) + (size % div == 0? 0 : 1);
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..42c3db1 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,22 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned py = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
+    if (px >= sizeMX || py >= sizeNY)
+        return;
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
-    float dot = 0.0;
+    float dot = 0.0f;
+    for (unsigned k = 0; k < sizeXY; k++)
+        dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
 
     // TODO 10d: Copy dot to P matrix
-    // matrixP[] = dot;
+    
+    matrixP[py * sizeMX + px] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,19 +36,19 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 16;
+    const unsigned sizeXY = 16;
+    const unsigned sizeNY = 16;
 
     // TODO 2: Allocate host 1D arrays for:
     // matrixM[sizeMX, sizeXY]
     // matrixN[sizeXY, sizeNY]
     // matrixP[sizeMX, sizeNY]
     // matrixPGold[sizeMX, sizeNY]
-    float* matrixM;
-    float* matrixN;
-    float* matrixP;
-    float* matrixPGold;
+    float* matrixM = new float[sizeMX * sizeXY];
+    float* matrixN = new float[sizeXY * sizeNY];
+    float* matrixP = new float[sizeMX * sizeNY];
+    float* matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -66,12 +71,31 @@ int main(int argc, char *argv[])
     //       dot = m[k, px] * n[py, k]
     //  matrixPGold[py, px] = dot
 
+    for (int py = 0; py < sizeNY; py++) {
+        for (int px = 0; px < sizeMX; px++) {
+            float dot = 0.0f;
+            for (int k = 0; k < sizeXY; k++) {
+                dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+            }
+            matrixPGold[py * sizeMX + px] = dot;
+        }
+    }
+
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
+    int sizeInBytesM = sizeMX * sizeXY * sizeof(float);
+    int sizeInBytesN = sizeXY * sizeNY * sizeof(float);
+    int sizeInBytesP = sizeMX * sizeNY * sizeof(float);
+
+    cudaMalloc((void**)&d_matrixM, sizeInBytesM);
+    cudaMalloc((void**)&d_matrixN, sizeInBytesN);
+    cudaMalloc((void**)&d_matrixP, sizeInBytesP);
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+    cudaMemcpy(d_matrixM, matrixM, sizeInBytesM, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_matrixN, matrixN, sizeInBytesN, cudaMemcpyHostToDevice);
 
     CUDA(cudaDeviceSynchronize());
 
@@ -86,13 +110,22 @@ int main(int argc, char *argv[])
     // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     // HINT: The shape of matrices has no impact on launch configuaration
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid  = dim3(1, 1, 1);
+
+    const unsigned BS_X = 16;
+    const unsigned BS_Y = 16;
+
+    const unsigned blocks_X = divup(sizeMX, BS_X);
+    const unsigned blocks_Y = divup(sizeNY, BS_Y);
+
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(blocks_X, blocks_Y, 1);
 
     // TODO 7: Launch the matrix transpose kernel
-    // matrixMultiplicationNaive<<<>>>();
+    
+    matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>> (d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+    cudaMemcpy(matrixP, d_matrixP, sizeInBytesP, cudaMemcpyDeviceToHost);
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +134,7 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+    cudaFree(d_matrixM);    cudaFree(d_matrixN);    cudaFree(d_matrixP);
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..9609ddf 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
 __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // offset of this block plus the thread's position inside it
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size) // threads with an index at or beyond `size` return without reading or writing
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx]; // each thread that passes the check writes exactly one element of z
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 64;
 
     // Host arrays.
     float* x = new float[size];
@@ -54,9 +55,19 @@ int main(int argc, char *argv[])
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
     // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
 
+    int sizeInBytes = size * sizeof(float);
+
+    cudaMalloc((void**)&d_x, sizeInBytes);
+    cudaMalloc((void**)&d_y, sizeInBytes);
+    cudaMalloc((void**)&d_z, sizeInBytes);
+
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
     // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
 
+    cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice);
+
+
     CUDA(cudaDeviceSynchronize());
 
     ////////////////////////////////////////////////////////////
@@ -69,7 +80,7 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     // Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
@@ -77,9 +88,14 @@ int main(int argc, char *argv[])
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
     // saxpy<<< >>> (....);
 
+    saxpy <<< blocks, threadsPerBlock >>> (d_z, d_x, d_y, a, size);
+
+
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
 
+    cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost);
+
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
     std::cout << "****************************************************" << std::endl << std::endl;
@@ -88,6 +104,8 @@ int main(int argc, char *argv[])
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
 
+    cudaFree(d_x);    cudaFree(d_y);    cudaFree(d_z);
+
     // free host memory
     delete[] x;
     delete[] y;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..6d2ab75 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,19 @@
 __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x; // global x index: block offset plus thread offset within the block
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // global y index, computed the same way along y
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) // threads outside the sizeX x sizeY extent do no work
+        return;
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    unsigned index = j * sizeX + i; // flatten (i, j): each step of j advances by sizeX elements
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
-    // b[] = a[];
+    
+    b[index] = a[index]; // same index on both sides, so b receives the element of a unchanged
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +41,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x; // global x index: block offset plus thread offset within the block
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // global y index, computed the same way along y
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) // threads outside the sizeX x sizeY extent do no work
+        return;
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in  = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0;  // Compute output index (j,i) in matrix B = transpose(A)
+    unsigned index_in  = j * sizeX + i;  // Input index (i,j) in A: each step of j advances by sizeX elements
+    unsigned index_out = i * sizeY + j;  // Output index (j,i) in B: roles of i and j swapped, each step of i advances by sizeY elements
 
     // TODO 11d: Copy data from A to B using transpose indices
+    b[index_out] = a[index_in]; // one element per thread, read at (i,j) and written at the swapped position
 }
 
 int main(int argc, char *argv[])
@@ -55,8 +61,8 @@ int main(int argc, char *argv[])
     // TODO 1: Initialize sizes. Start with simple like 32 x 32.
     // TODO Optional: Try different sizes - both square and non-square. Use these as examples:
     // 1024 x 1024, 2048 x 2048, 64 x 16, 128 x 768, 63 x 63, 31 x 15, 1025 x 1025, 1234 x 3153
-    const unsigned sizeX = 1234;
-    const unsigned sizeY = 3153;
+    const unsigned sizeX = 32;
+    const unsigned sizeY = 32;
 
     // LOOK: Allocate host arrays. The gold arrays are used to store the results from CPU.
     float* a = new float[sizeX * sizeY];
@@ -83,8 +89,17 @@ int main(int argc, char *argv[])
 
     // TODO 2: Allocate memory on the device for d_a and d_b.
 
+    int sizeInBytes = sizeX * sizeY * sizeof(float);
+
+    cudaMalloc((void**)&d_a, sizeInBytes);
+    cudaMalloc((void**)&d_b, sizeInBytes);
+
+
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
 
+    cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice);
+
+
     CUDA(cudaDeviceSynchronize());
 
     ////////////////////////////////////////////////////////////
@@ -97,14 +112,23 @@ int main(int argc, char *argv[])
         // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+
+        const unsigned BS_X = 16;
+		const unsigned BS_Y = 16;
+
+        const unsigned blocks_X = divup(sizeX, BS_X);
+        const unsigned blocks_Y = divup(sizeY, BS_Y);
+
+        dims.dimBlock = dim3(BS_X, BS_Y, 1);
+        dims.dimGrid = dim3(blocks_X, blocks_Y, 1);
 
         // LOOK: Launch the copy kernel
         copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
 
+		cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost);
+
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(a_gold, b, sizeX * sizeY);
     }
@@ -121,14 +145,23 @@ int main(int argc, char *argv[])
         // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        const unsigned BS_X = 16;
+        const unsigned BS_Y = 16;
+
+        const unsigned blocks_X = divup(sizeX, BS_X);
+        const unsigned blocks_Y = divup(sizeY, BS_Y);
+
+        dims.dimBlock = dim3(BS_X, BS_Y, 1);
+        dims.dimGrid = dim3(blocks_X, blocks_Y, 1);
 
         // TODO 9: Launch the matrix transpose kernel
-        // matrixTransposeNaive<<<>>>(......);
+        
+        matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
 
+		cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost);
+
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(b_gold, b, sizeX * sizeY);
     }
@@ -137,6 +170,8 @@ int main(int argc, char *argv[])
 
     // TODO 7: free device memory using cudaFree
 
+    cudaFree(d_a);    cudaFree(d_b);
+
     // free host memory
     delete[] a;
     delete[] b;
diff --git a/images/Screenshot 2025-08-30 161657.png b/images/Screenshot 2025-08-30 161657.png
new file mode 100644
index 0000000..97f302b
Binary files /dev/null and b/images/Screenshot 2025-08-30 161657.png differ
diff --git a/images/Screenshot 2025-08-30 223233.png b/images/Screenshot 2025-08-30 223233.png
new file mode 100644
index 0000000..37ed784
Binary files /dev/null and b/images/Screenshot 2025-08-30 223233.png differ
diff --git a/images/Screenshot 2025-08-30 223315.png b/images/Screenshot 2025-08-30 223315.png
new file mode 100644
index 0000000..0ac3343
Binary files /dev/null and b/images/Screenshot 2025-08-30 223315.png differ
diff --git a/images/Screenshot 2025-08-30 231757.png b/images/Screenshot 2025-08-30 231757.png
new file mode 100644
index 0000000..8a9f5f4
Binary files /dev/null and b/images/Screenshot 2025-08-30 231757.png differ
diff --git a/images/Screenshot 2025-08-30 234604.png b/images/Screenshot 2025-08-30 234604.png
new file mode 100644
index 0000000..4171530
Binary files /dev/null and b/images/Screenshot 2025-08-30 234604.png differ