From de4c4e36310955cfc500f0fc549e1f3cca35533a Mon Sep 17 00:00:00 2001 From: Basil Fierz Date: Fri, 13 Oct 2017 19:58:33 +0200 Subject: [PATCH 1/2] Adds concurrent kernel example --- src/examples/CMakeLists.txt | 4 + src/examples/cuda/CMakeLists.txt | 2 + .../cuda/parallelkernels/CMakeLists.txt | 32 +++++ src/examples/cuda/parallelkernels/main.cpp | 109 ++++++++++++++++++ .../cuda/parallelkernels/vectoradd.cu | 40 +++++++ .../vcl/compute/cuda/commandqueue.cpp | 2 +- .../vcl/compute/cuda/device.cpp | 4 + .../vcl/compute/cuda/device.h | 3 + 8 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 src/examples/cuda/CMakeLists.txt create mode 100644 src/examples/cuda/parallelkernels/CMakeLists.txt create mode 100644 src/examples/cuda/parallelkernels/main.cpp create mode 100644 src/examples/cuda/parallelkernels/vectoradd.cu diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt index f314eb75..c8ebb86e 100644 --- a/src/examples/CMakeLists.txt +++ b/src/examples/CMakeLists.txt @@ -11,6 +11,10 @@ IF(VCL_OPENGL_SUPPORT) SUBDIRS(opengl) ENDIF() +IF(CUDA_FOUND AND VCL_CUDA_SUPPORT) + SUBDIRS(cuda) +ENDIF() + IF(OPENCL_FOUND AND VCL_OPENCL_SUPPORT) SUBDIRS(opencl) ENDIF() diff --git a/src/examples/cuda/CMakeLists.txt b/src/examples/cuda/CMakeLists.txt new file mode 100644 index 00000000..c617467c --- /dev/null +++ b/src/examples/cuda/CMakeLists.txt @@ -0,0 +1,2 @@ +# Execute parallel kernels +SUBDIRS(parallelkernels) diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt new file mode 100644 index 00000000..47021442 --- /dev/null +++ b/src/examples/cuda/parallelkernels/CMakeLists.txt @@ -0,0 +1,32 @@ +PROJECT(parallelkernels_cuda) + +INCLUDE(${CMAKE_SOURCE_DIR}/cmake/VCLCompileCUDA.cmake) + +SET(SRC + main.cpp +) + +SET(KERNELS_CU + vectoradd.cu +) +# Access the include directories in order to compile the CUDA code +GET_PROPERTY(CURR_INC_DIRS_0 TARGET vcl_core_cuda PROPERTY INCLUDE_DIRECTORIES) +LIST(APPEND CURR_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CURR_INC_DIRS_0}) + +VCLCOMPILECU( + ${PROJECT_SOURCE_DIR}/vectoradd.cu + "vectoradd" + "${CURR_INC_DIRS}" + COMPILEDKERNELS_0 +) +SET(COMPILEDKERNELS ${COMPILEDKERNELS_0}) + +SOURCE_GROUP("cuc" FILES ${COMPILEDKERNELS}) +SOURCE_GROUP("" FILES ${SRC} ${KERNELS}) + +ADD_EXECUTABLE(parallelkernels_cuda ${SRC} ${KERNELS} ${COMPILEDKERNELS}) +SET_TARGET_PROPERTIES(parallelkernels_cuda PROPERTIES FOLDER examples/cuda) + +TARGET_LINK_LIBRARIES(parallelkernels_cuda + vcl_compute_cuda +) \ No newline at end of file diff --git a/src/examples/cuda/parallelkernels/main.cpp b/src/examples/cuda/parallelkernels/main.cpp new file mode 100644 index 00000000..97518498 --- /dev/null +++ b/src/examples/cuda/parallelkernels/main.cpp @@ -0,0 +1,109 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// VCL configuration +#include + +// C++ standard library +#include + +// VCL +#include +#include +#include +#include +#include +#include +#include + +extern uint32_t vectoradd[]; +extern size_t vectoraddSize; + +int main(int argc, char* argv[]) +{ + using namespace Vcl::Compute::Cuda; + using Vcl::Core::ref_ptr; + using Vcl::Core::dynamic_pointer_cast; + using Vcl::Core::static_pointer_cast; + + const size_t problem_size = 1024*1024; + + Platform::initialise(); + + for (int d = 0; d < Platform::instance()->nrDevices(); d++) + { + auto& dev = Platform::instance()->device(d); + Context ctx{ dev }; + + auto mod = ctx.createModuleFromSource((const int8_t*) vectoradd, vectoraddSize * sizeof(uint32_t)); + auto kernel = dynamic_pointer_cast(mod->kernel("vectoradd")); + + ref_ptr queue[] = { + dynamic_pointer_cast(ctx.defaultQueue()), + dynamic_pointer_cast(ctx.createCommandQueue()) }; + + ref_ptr mem0[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem1[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem2[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + + float one = 1; + float two = 2; + + std::vector result[2]; + result[0] = std::vector(problem_size); + result[1] = std::vector(problem_size); + for (size_t i = 0; i < 2; i++) + { + queue[i]->fill(static_pointer_cast(mem0[i]), &one, sizeof(float)); + queue[i]->fill(static_pointer_cast(mem1[i]), &two, sizeof(float)); + } + + for (size_t i = 0; i < 2; i++) + { + kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]); + kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]); + } + + for (size_t i = 0; i < 2; i++) + { + queue[i]->read(result[i].data(), static_pointer_cast(mem2[i])); + queue[i]->sync(); + } + //for (auto f : result[0]) + //{ + // std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F'); + //} + //std::cout << std::endl; + //for (auto f : result[1]) + //{ + // std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F'); + //} + //std::cout << std::endl; + } + + Platform::dispose(); + + return 0; +} diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu new file mode 100644 index 00000000..5addce64 --- /dev/null +++ b/src/examples/cuda/parallelkernels/vectoradd.cu @@ -0,0 +1,40 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +extern "C" +__global__ void vectoradd +( + int size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) + return; + + vecC[idx] = vecA[idx] + vecB[idx]; +} diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp index 4505914c..eb5dc7ac 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp @@ -34,7 +34,7 @@ namespace Vcl { namespace Compute { namespace Cuda : Compute::CommandQueue() , _ownerCtx(owner) { - VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, 0)); + VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, CU_STREAM_NON_BLOCKING)); } CommandQueue::~CommandQueue() diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp index 3941edf4..f7a5b267 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp @@ -73,6 +73,10 @@ namespace Vcl { namespace Compute { namespace Cuda int nrAsyncEngines = 0; VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&nrAsyncEngines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev)); _nrAsyncEngines = nrAsyncEngines; + + int canRunConcurrentKernels = 0; + VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&canRunConcurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); + _canRunConcurrentKernels = canRunConcurrentKernels != 0; } bool Device::supports(Feature f) const diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h index 1778fc3f..8d0193a4 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h @@ -89,5 +89,8 @@ namespace Vcl { namespace Compute { namespace Cuda //! Number of asynchronous engines uint32_t _nrAsyncEngines{ 0 }; + + //! Indicate if the device can run kernels concurrently + bool _canRunConcurrentKernels{ false }; }; }}} From 7e7592d4845fb9a4d675013832a42680e2f9515d Mon Sep 17 00:00:00 2001 From: Basil Fierz Date: Tue, 17 Oct 2017 20:57:31 +0200 Subject: [PATCH 2/2] Switch to CMake 3.8 cuda support --- .../cuda/parallelkernels/CMakeLists.txt | 46 +++++++++---------- src/examples/cuda/parallelkernels/main.cpp | 41 +++++++++-------- .../cuda/parallelkernels/vectoradd.cu | 19 +++++++- 3 files changed, 63 insertions(+), 43 deletions(-) diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt index 47021442..c1af793a 100644 --- a/src/examples/cuda/parallelkernels/CMakeLists.txt +++ b/src/examples/cuda/parallelkernels/CMakeLists.txt @@ -1,32 +1,32 @@ -PROJECT(parallelkernels_cuda) +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(parallelkernels_cuda LANGUAGES CXX CUDA) -INCLUDE(${CMAKE_SOURCE_DIR}/cmake/VCLCompileCUDA.cmake) +# Define the project's target +add_executable(parallelkernels_cuda "") -SET(SRC - main.cpp -) +# Place the target into a specific solution folder +set_target_properties(parallelkernels_cuda PROPERTIES FOLDER examples/cuda) -SET(KERNELS_CU - vectoradd.cu -) -# Access the include directories in order to compile the CUDA code -GET_PROPERTY(CURR_INC_DIRS_0 TARGET vcl_core_cuda PROPERTY INCLUDE_DIRECTORIES) -LIST(APPEND CURR_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CURR_INC_DIRS_0}) +# Enable C++11 for CUDA +target_compile_features(parallelkernels_cuda PUBLIC cxx_std_11) -VCLCOMPILECU( - ${PROJECT_SOURCE_DIR}/vectoradd.cu - "vectoradd" - "${CURR_INC_DIRS}" - COMPILEDKERNELS_0 -) -SET(COMPILEDKERNELS ${COMPILEDKERNELS_0}) +# Use separate compilation and linking for CUDA +set_target_properties(parallelkernels_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -SOURCE_GROUP("cuc" FILES ${COMPILEDKERNELS}) -SOURCE_GROUP("" FILES ${SRC} ${KERNELS}) +# Define the compilation targets +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=sm_30") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_50,code=compute_50") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_60,code=compute_60") -ADD_EXECUTABLE(parallelkernels_cuda ${SRC} ${KERNELS} ${COMPILEDKERNELS}) -SET_TARGET_PROPERTIES(parallelkernels_cuda PROPERTIES FOLDER examples/cuda) +# Define the source +set(SRC + main.cpp + vectoradd.cu +) +source_group("" FILES ${SRC}) +target_sources(parallelkernels_cuda PRIVATE ${SRC}) -TARGET_LINK_LIBRARIES(parallelkernels_cuda +# Link dependent libraries +target_link_libraries(parallelkernels_cuda vcl_compute_cuda ) \ No newline at end of file diff --git a/src/examples/cuda/parallelkernels/main.cpp b/src/examples/cuda/parallelkernels/main.cpp index 97518498..c0022040 100644 --- a/src/examples/cuda/parallelkernels/main.cpp +++ b/src/examples/cuda/parallelkernels/main.cpp @@ -38,8 +38,16 @@ #include #include -extern uint32_t vectoradd[]; -extern size_t vectoraddSize; +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +); int main(int argc, char* argv[]) { @@ -48,7 +56,7 @@ int main(int argc, char* argv[]) using Vcl::Core::dynamic_pointer_cast; using Vcl::Core::static_pointer_cast; - const size_t problem_size = 1024*1024; + const size_t problem_size = 128; Platform::initialise(); @@ -57,9 +65,6 @@ int main(int argc, char* argv[]) auto& dev = Platform::instance()->device(d); Context ctx{ dev }; - auto mod = ctx.createModuleFromSource((const int8_t*) vectoradd, vectoraddSize * sizeof(uint32_t)); - auto kernel = dynamic_pointer_cast(mod->kernel("vectoradd")); - ref_ptr queue[] = { dynamic_pointer_cast(ctx.defaultQueue()), dynamic_pointer_cast(ctx.createCommandQueue()) }; @@ -82,8 +87,8 @@ int main(int argc, char* argv[]) for (size_t i = 0; i < 2; i++) { - kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]); - kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]); + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); } for (size_t i = 0; i < 2; i++) @@ -91,16 +96,16 @@ int main(int argc, char* argv[]) queue[i]->read(result[i].data(), static_pointer_cast(mem2[i])); queue[i]->sync(); } - //for (auto f : result[0]) - //{ - // std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F'); - //} - //std::cout << std::endl; - //for (auto f : result[1]) - //{ - // std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F'); - //} - //std::cout << std::endl; + for (auto f : result[0]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; + for (auto f : result[1]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; } Platform::dispose(); diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu index 5addce64..28462b83 100644 --- a/src/examples/cuda/parallelkernels/vectoradd.cu +++ b/src/examples/cuda/parallelkernels/vectoradd.cu @@ -23,7 +23,6 @@ * SOFTWARE. */ -extern "C" __global__ void vectoradd ( int size, @@ -36,5 +35,21 @@ __global__ void vectoradd if (idx >= size) return; - vecC[idx] = vecA[idx] + vecB[idx]; + vecC[idx] = 0; + for (int i = 0; i < 100000; i++) + atomicAdd(&vecC[idx], vecA[idx] + vecB[idx]); +} + +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + vectoradd<<>>(problem_size, vecA, vecB, vecC); }