diff --git a/README.md b/README.md index d2fa33d..7de4486 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,15 @@ Project 0 Getting Started **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Christina Qiu + * [LinkedIn](https://www.linkedin.com/in/christina-qiu-6094301b6/), [personal website](https://christinaqiu3.github.io/) +* Tested on: Windows 11, Intel Core i7-13700H @ 2.40GHz, 16GB RAM, NVIDIA GeForce RTX 4060 Laptop GPU (Personal laptop) -### (TODO: Your README) +### README -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](images/Screenshot%202025-08-30%20223233.png) +![](images/Screenshot%202025-08-30%20223315.png) +![](images/Screenshot%202025-08-30%20231757.png) +![](images/Screenshot%202025-08-30%20234604.png) + +Note: I had an issue running Project 0 with Nsight Compute as recommended in Step 2.1.5. diff --git a/cuda-introduction/.gitignore b/cuda-introduction/.gitignore new file mode 100644 index 0000000..92a50c5 --- /dev/null +++ b/cuda-introduction/.gitignore @@ -0,0 +1,559 @@ +*.orig +*.filters +*.vcxproj +*.xcodeproj +build + +# Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode + +### Linux ### +*~ + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + + +### OSX ### +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns + +# Directories potentially created on remote AFP share 
+.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### SublimeText ### +# cache files for sublime text +*.tmlanguage.cache +*.tmPreferences.cache +*.stTheme.cache + +# workspace files are user-specific +*.sublime-workspace + +# project files should be checked into the repository, unless a significant +# proportion of contributors will probably not be using SublimeText +# *.sublime-project + +# sftp configuration file +sftp-config.json + + +### Windows ### +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + + +### JetBrains ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio + +*.iml + +## Directory-based project format: +#.idea/ +# if you remove the above rule, at least ignore the following: + +# User-specific stuff: +.idea/workspace.xml +.idea/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/dataSources.ids +.idea/dataSources.xml +.idea/sqlDataSources.xml +.idea/dynamic.xml +.idea/uiDesigner.xml + +# Gradle: +.idea/gradle.xml +.idea/libraries + +# Mongo Explorer plugin: +.idea/mongoSettings.xml + +## File-based project format: +*.ipr +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties + + +### Vim ### +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist +*~ + + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ 
+ +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ + + +### CMake ### +CMakeCache.txt +CMakeFiles +CMakeScripts +Makefile +cmake_install.cmake +install_manifest.txt + + +### C++ ### +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + + +### CUDA ### +*.i +*.ii +*.gpu +*.ptx +*.cubin +*.fatbin + + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.userosscache.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +build/ +bld/ +[Bb]in/ +[Oo]bj/ + +# Visual Studio 2015 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUNIT +*.VisualState.xml +TestResult.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# DNX +project.lock.json +artifacts/ + +*_i.c +*_p.c +*_i.h +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# JustCode is a .NET coding add-in +.JustCode + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch 
+_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# TODO: Comment the next line if you want to checkin your web deploy settings +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# NuGet Packages +*.nupkg +# The packages folder can be ignored because of Package Restore +**/packages/* +# except build/, which is used as an MSBuild target. +!**/packages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/packages/repositories.config + +# Windows Azure Build Output +csx/ +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!*.[Cc]ache/ + +# Others +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.pfx +*.publishsettings +node_modules/ +orleans.codegen.cs + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +*.mdf +*.ldf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings + +# Microsoft Fakes +FakesAssemblies/ + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + + +### WebStorm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio + +*.iml + +## Directory-based project format: +.idea/ +# if you remove the above rule, at least ignore the following: + +# User-specific stuff: +# .idea/workspace.xml +# .idea/tasks.xml +# .idea/dictionaries + +# Sensitive or high-churn files: +# .idea/dataSources.ids +# .idea/dataSources.xml +# .idea/sqlDataSources.xml +# .idea/dynamic.xml +# .idea/uiDesigner.xml + +# Gradle: +# .idea/gradle.xml +# .idea/libraries + +# Mongo Explorer plugin: +# .idea/mongoSettings.xml + +## File-based project format: +*.ipr +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties + + +### Eclipse ### +*.pydevproject +.metadata +.gradle +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath + +# Eclipse Core +.project + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +# Java annotation processor (APT) 
+.factorypath + +# PDT-specific +.buildpath + +# sbteclipse plugin +.target + +# TeXlipse plugin +.texlipse + + +### Xcode ### +# Xcode +# +# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore + +## Build generated +build/ +DerivedData + +## Various settings +*.pbxuser +!default.pbxuser +*.mode1v3 +!default.mode1v3 +*.mode2v3 +!default.mode2v3 +*.perspectivev3 +!default.perspectivev3 +xcuserdata + +## Other +*.xccheckout +*.moved-aside +*.xcuserstate diff --git a/cuda-introduction/CMakeLists.txt b/cuda-introduction/CMakeLists.txt index 82eab9a..eb1728b 100644 --- a/cuda-introduction/CMakeLists.txt +++ b/cuda-introduction/CMakeLists.txt @@ -39,7 +39,7 @@ foreach(exe ${EXECUTABLES}) elseif(CMAKE_VERSION VERSION_LESS "3.24.0") set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES all-major) else() - set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES native) + set_target_properties(${exe} PROPERTIES CUDA_ARCHITECTURES 89) # CHANGED THIS FROM native TO 89 endif() target_compile_options(${exe} PRIVATE "$<$,$>:-G;-src-in-ptx>") target_compile_options(${exe} PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") diff --git a/cuda-introduction/CUDAIntroduction.sln b/cuda-introduction/CUDAIntroduction.sln new file mode 100644 index 0000000..0f2b580 --- /dev/null +++ b/cuda-introduction/CUDAIntroduction.sln @@ -0,0 +1,87 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CMakePredefinedTargets", "CMakePredefinedTargets", "{696D6F54-7811-3467-A4CE-B6CCCC116630}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "saxpy", "saxpy.vcxproj", "{21EDE36A-0E55-3801-A021-BF8D59A5C54B}" + ProjectSection(ProjectDependencies) = postProject + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", 
"ALL_BUILD.vcxproj", "{3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}" + ProjectSection(ProjectDependencies) = postProject + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} + {C19380DA-7422-329E-AD89-03A81FEB7372} = {C19380DA-7422-329E-AD89-03A81FEB7372} + {21EDE36A-0E55-3801-A021-BF8D59A5C54B} = {21EDE36A-0E55-3801-A021-BF8D59A5C54B} + {290915A9-191B-3931-8E65-1F8C7C3B624A} = {290915A9-191B-3931-8E65-1F8C7C3B624A} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}" + ProjectSection(ProjectDependencies) = postProject + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "matmul", "matmul.vcxproj", "{C19380DA-7422-329E-AD89-03A81FEB7372}" + ProjectSection(ProjectDependencies) = postProject + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "transpose", "transpose.vcxproj", "{290915A9-191B-3931-8E65-1F8C7C3B624A}" + ProjectSection(ProjectDependencies) = postProject + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + MinSizeRel|x64 = MinSizeRel|x64 + RelWithDebInfo|x64 = RelWithDebInfo|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Debug|x64.ActiveCfg = Debug|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Debug|x64.Build.0 = Debug|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Release|x64.ActiveCfg = Release|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.Release|x64.Build.0 = Release|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.MinSizeRel|x64.Build.0 = 
MinSizeRel|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {21EDE36A-0E55-3801-A021-BF8D59A5C54B}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.Debug|x64.ActiveCfg = Debug|x64 + {3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.Release|x64.ActiveCfg = Release|x64 + {3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {3C68B1A5-3F23-34E2-A43D-901EEEFD3F52}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Debug|x64.ActiveCfg = Debug|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Debug|x64.Build.0 = Debug|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Release|x64.ActiveCfg = Release|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.Release|x64.Build.0 = Release|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.Debug|x64.ActiveCfg = Debug|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.Debug|x64.Build.0 = Debug|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.Release|x64.ActiveCfg = Release|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.Release|x64.Build.0 = Release|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {C19380DA-7422-329E-AD89-03A81FEB7372}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.Debug|x64.ActiveCfg = Debug|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.Debug|x64.Build.0 = Debug|x64 + 
{290915A9-191B-3931-8E65-1F8C7C3B624A}.Release|x64.ActiveCfg = Release|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.Release|x64.Build.0 = Release|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.MinSizeRel|x64.Build.0 = MinSizeRel|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64 + {290915A9-191B-3931-8E65-1F8C7C3B624A}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64 + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {3C68B1A5-3F23-34E2-A43D-901EEEFD3F52} = {696D6F54-7811-3467-A4CE-B6CCCC116630} + {47A996B1-BBC8-38B5-81F2-4000E0AD9EF1} = {696D6F54-7811-3467-A4CE-B6CCCC116630} + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {639D87F3-11B1-36E2-BD8F-76B22CDC1CDF} + EndGlobalSection + GlobalSection(ExtensibilityAddIns) = postSolution + EndGlobalSection +EndGlobal diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu index dce8793..f582db2 100644 --- a/cuda-introduction/source/common.cu +++ b/cuda-introduction/source/common.cu @@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div) { // TODO: implement a 1 line function to return the divup operation. // Note: You only need to use addition, subtraction, and division operations. - return 0; + return (size / div) + (size % div == 0? 0 : 1); } void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value) diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu index 826e535..42c3db1 100644 --- a/cuda-introduction/source/matmul.cu +++ b/cuda-introduction/source/matmul.cu @@ -12,17 +12,22 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con { // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions. 
// Remember that each thread of the kernel computes the result of 1 unique element of P - unsigned px; - unsigned py; + unsigned px = blockIdx.x * blockDim.x + threadIdx.x; + unsigned py = blockIdx.y * blockDim.y + threadIdx.y; // TODO 10b: Check if px or py are out of bounds. If they are, return. + if (px >= sizeMX || py >= sizeNY) + return; // TODO 10c: Compute the dot product for the P element in each thread // This loop will be the same as the host loop - float dot = 0.0; + float dot = 0.0f; + for (unsigned k = 0; k < sizeXY; k++) + dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; // TODO 10d: Copy dot to P matrix - // matrixP[] = dot; + + matrixP[py * sizeMX + px] = dot; } int main(int argc, char *argv[]) @@ -31,19 +36,19 @@ int main(int argc, char *argv[]) // Then try large multiple-block square matrix like 64x64 up to 2048x2048. // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771 // Then try rectangles with powers of two and then non-power-of-two. - const unsigned sizeMX = 0; - const unsigned sizeXY = 0; - const unsigned sizeNY = 0; + const unsigned sizeMX = 16; + const unsigned sizeXY = 16; + const unsigned sizeNY = 16; // TODO 2: Allocate host 1D arrays for: // matrixM[sizeMX, sizeXY] // matrixN[sizeXY, sizeNY] // matrixP[sizeMX, sizeNY] // matrixPGold[sizeMX, sizeNY] - float* matrixM; - float* matrixN; - float* matrixP; - float* matrixPGold; + float* matrixM = new float[sizeMX * sizeXY]; + float* matrixN = new float[sizeXY * sizeNY]; + float* matrixP = new float[sizeMX * sizeNY]; + float* matrixPGold = new float[sizeMX * sizeNY]; // LOOK: Setup random number generator and fill host arrays and the scalar a. 
std::random_device rd; @@ -66,12 +71,31 @@ int main(int argc, char *argv[]) // dot = m[k, px] * n[py, k] // matrixPGold[py, px] = dot + for (int py = 0; py < sizeNY; py++) { + for (int px = 0; px < sizeMX; px++) { + float dot = 0.0f; + for (int k = 0; k < sizeXY; k++) { + dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py]; + } + matrixPGold[py * sizeMX + px] = dot; + } + } + // Device arrays float *d_matrixM, *d_matrixN, *d_matrixP; // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP. + int sizeInBytesM = sizeMX * sizeXY * sizeof(float); + int sizeInBytesN = sizeXY * sizeNY * sizeof(float); + int sizeInBytesP = sizeMX * sizeNY * sizeof(float); + + cudaMalloc((void**)&d_matrixM, sizeInBytesM); + cudaMalloc((void**)&d_matrixN, sizeInBytesN); + cudaMalloc((void**)&d_matrixP, sizeInBytesP); // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU) + cudaMemcpy(d_matrixM, matrixM, sizeInBytesM, cudaMemcpyHostToDevice); + cudaMemcpy(d_matrixN, matrixN, sizeInBytesN, cudaMemcpyHostToDevice); CUDA(cudaDeviceSynchronize()); @@ -86,13 +110,22 @@ int main(int argc, char *argv[]) // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup // HINT: The shape of matrices has no impact on launch configuaration DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + + const unsigned BS_X = 16; + const unsigned BS_Y = 16; + + const unsigned blocks_X = divup(sizeMX, BS_X); + const unsigned blocks_Y = divup(sizeNY, BS_Y); + + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(blocks_X, blocks_Y, 1); // TODO 7: Launch the matrix transpose kernel - // matrixMultiplicationNaive<<<>>>(); + + matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>> (d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY); // TODO 8: copy the answer back to the host (CPU) from the device (GPU) + cudaMemcpy(matrixP, d_matrixP, sizeInBytesP, cudaMemcpyDeviceToHost); // LOOK: Use compareReferenceAndResult to check the result 
compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3); @@ -101,6 +134,7 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 9: free device memory using cudaFree + cudaFree(d_matrixM); cudaFree(d_matrixN); cudaFree(d_matrixP); // free host memory delete[] matrixM; diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu index 5ed591f..9609ddf 100644 --- a/cuda-introduction/source/saxpy.cu +++ b/cuda-introduction/source/saxpy.cu @@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size) { // TODO 9: Compute the global index for each thread. - unsigned idx = 0; + unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // TODO 10: Check if idx is out of bounds. If yes, return. - if (idx >= 0) + if (idx >= size) return; // TODO 11: Perform the SAXPY operation: z = a * x + y. + z[idx] = a * x[idx] + y[idx]; } int main(int argc, char *argv[]) { // TODO 1: Set the size. Start with something simple like 64. // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127 - const unsigned size = 0; + const unsigned size = 64; // Host arrays. float* x = new float[size]; @@ -54,9 +55,19 @@ int main(int argc, char *argv[]) // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z. // CUDA(cudaMalloc((void **)& pointer, size in bytes))); + int sizeInBytes = size * sizeof(float); + + cudaMalloc((void**)&d_x, sizeInBytes); + cudaMalloc((void**)&d_y, sizeInBytes); + cudaMalloc((void**)&d_z, sizeInBytes); + // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). 
Follow what you did for 2, // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum)); + cudaMemcpy(d_x, x, sizeInBytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, sizeInBytes, cudaMemcpyHostToDevice); + + CUDA(cudaDeviceSynchronize()); //////////////////////////////////////////////////////////// @@ -69,7 +80,7 @@ int main(int argc, char *argv[]) // TODO 4: Setup threads and blocks. // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024 // Use divup to get the number of blocks to launch. - const unsigned threadsPerBlock = 0; + const unsigned threadsPerBlock = 128; // TODO 5: Implement the divup function in common.cpp const unsigned blocks = divup(size, threadsPerBlock); @@ -77,9 +88,14 @@ int main(int argc, char *argv[]) // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration // saxpy<<< >>> (....); + saxpy <<< blocks, threadsPerBlock >>> (d_z, d_x, d_y, a, size); + + // TODO 7: Copy the answer back to the host (CPU) from the device (GPU). // Copy what you did in 3, except for d_z -> z. + cudaMemcpy(z, d_z, sizeInBytes, cudaMemcpyDeviceToHost); + // LOOK: Use postprocess to check the result compareReferenceAndResult(z_gold, z, size, 1e-6); std::cout << "****************************************************" << std::endl << std::endl; @@ -88,6 +104,8 @@ int main(int argc, char *argv[]) // TODO 8: free device memory using cudaFree // CUDA(cudaFree(device pointer)); + cudaFree(d_x); cudaFree(d_y); cudaFree(d_z); + // free host memory delete[] x; delete[] y; diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu index 89f6f8f..6d2ab75 100644 --- a/cuda-introduction/source/transpose.cu +++ b/cuda-introduction/source/transpose.cu @@ -19,16 +19,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 6a: Compute the global index for each thread along x and y dimentions. 
- unsigned i = 0; - unsigned j = 0;; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 6b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + return; // TODO 6c: Compute global 1D index from i and j - unsigned index = 0; + unsigned index = j * sizeX + i; // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same - // b[] = a[]; + + b[index] = a[index]; } // TODO 11: Implement the transpose kernel @@ -38,16 +41,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 11a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 11b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + return; // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i) - unsigned index_in = 0; // Compute input index (i,j) from matrix A - unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A) + unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A + unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A) // TODO 11d: Copy data from A to B using transpose indices + b[index_out] = a[index_in]; } int main(int argc, char *argv[]) @@ -55,8 +61,8 @@ int main(int argc, char *argv[]) // TODO 1: Initialize sizes. Start with simple like 32 x 32. // TODO Optional: Try different sizes - both square and non-square. 
Use these as examples: // 1024 x 1024, 2048 x 2048, 64 x 16, 128 x 768, 63 x 63, 31 x 15, 1025 x 1025, 1234 x 3153 - const unsigned sizeX = 1234; - const unsigned sizeY = 3153; + const unsigned sizeX = 32; + const unsigned sizeY = 32; // LOOK: Allocate host arrays. The gold arrays are used to store the results from CPU. float* a = new float[sizeX * sizeY]; @@ -83,8 +89,17 @@ int main(int argc, char *argv[]) // TODO 2: Allocate memory on the device for d_a and d_b. + int sizeInBytes = sizeX * sizeY * sizeof(float); + + cudaMalloc((void**)&d_a, sizeInBytes); + cudaMalloc((void**)&d_b, sizeInBytes); + + // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU) + cudaMemcpy(d_a, a, sizeInBytes, cudaMemcpyHostToDevice); + + CUDA(cudaDeviceSynchronize()); //////////////////////////////////////////////////////////// @@ -97,14 +112,23 @@ int main(int argc, char *argv[]) // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + + const unsigned BS_X = 16; + const unsigned BS_Y = 16; + + const unsigned blocks_X = divup(sizeX, BS_X); + const unsigned blocks_Y = divup(sizeY, BS_Y); + + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(blocks_X, blocks_Y, 1); // LOOK: Launch the copy kernel copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY); // TODO 5: copy the answer back to the host (CPU) from the device (GPU) + cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost); + // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(a_gold, b, sizeX * sizeY); } @@ -121,14 +145,23 @@ int main(int argc, char *argv[]) // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + const unsigned BS_X = 16; + const 
unsigned BS_Y = 16; + + const unsigned blocks_X = divup(sizeX, BS_X); + const unsigned blocks_Y = divup(sizeY, BS_Y); + + dims.dimBlock = dim3(BS_X, BS_Y, 1); + dims.dimGrid = dim3(blocks_X, blocks_Y, 1); // TODO 9: Launch the matrix transpose kernel - // matrixTransposeNaive<<<>>>(......); + + matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY); // TODO 10: copy the answer back to the host (CPU) from the device (GPU) + cudaMemcpy(b, d_b, sizeInBytes, cudaMemcpyDeviceToHost); + // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(b_gold, b, sizeX * sizeY); } @@ -137,6 +170,8 @@ int main(int argc, char *argv[]) // TODO 7: free device memory using cudaFree + cudaFree(d_a); cudaFree(d_b); + // free host memory delete[] a; delete[] b; diff --git a/images/Screenshot 2025-08-30 161657.png b/images/Screenshot 2025-08-30 161657.png new file mode 100644 index 0000000..97f302b Binary files /dev/null and b/images/Screenshot 2025-08-30 161657.png differ diff --git a/images/Screenshot 2025-08-30 223233.png b/images/Screenshot 2025-08-30 223233.png new file mode 100644 index 0000000..37ed784 Binary files /dev/null and b/images/Screenshot 2025-08-30 223233.png differ diff --git a/images/Screenshot 2025-08-30 223315.png b/images/Screenshot 2025-08-30 223315.png new file mode 100644 index 0000000..0ac3343 Binary files /dev/null and b/images/Screenshot 2025-08-30 223315.png differ diff --git a/images/Screenshot 2025-08-30 231757.png b/images/Screenshot 2025-08-30 231757.png new file mode 100644 index 0000000..8a9f5f4 Binary files /dev/null and b/images/Screenshot 2025-08-30 231757.png differ diff --git a/images/Screenshot 2025-08-30 234604.png b/images/Screenshot 2025-08-30 234604.png new file mode 100644 index 0000000..4171530 Binary files /dev/null and b/images/Screenshot 2025-08-30 234604.png differ