diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt index f314eb75..c8ebb86e 100644 --- a/src/examples/CMakeLists.txt +++ b/src/examples/CMakeLists.txt @@ -11,6 +11,10 @@ IF(VCL_OPENGL_SUPPORT) SUBDIRS(opengl) ENDIF() +IF(CUDA_FOUND AND VCL_CUDA_SUPPORT) + SUBDIRS(cuda) +ENDIF() + IF(OPENCL_FOUND AND VCL_OPENCL_SUPPORT) SUBDIRS(opencl) ENDIF() diff --git a/src/examples/cuda/CMakeLists.txt b/src/examples/cuda/CMakeLists.txt new file mode 100644 index 00000000..c617467c --- /dev/null +++ b/src/examples/cuda/CMakeLists.txt @@ -0,0 +1,2 @@ +# Execute parallel kernels +SUBDIRS(parallelkernels) diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt new file mode 100644 index 00000000..c1af793a --- /dev/null +++ b/src/examples/cuda/parallelkernels/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(parallelkernels_cuda LANGUAGES CXX CUDA) + +# Define the project's target +add_executable(parallelkernels_cuda "") + +# Place the target into a specific solution folder +set_target_properties(parallelkernels_cuda PROPERTIES FOLDER examples/cuda) + +# Enable C++11 for CUDA +target_compile_features(parallelkernels_cuda PUBLIC cxx_std_11) + +# Use separate compilation and linking for CUDA +set_target_properties(parallelkernels_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +# Define the compilation targets +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=sm_30") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_50,code=compute_50") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_60,code=compute_60") + +# Define the source +set(SRC + main.cpp + vectoradd.cu +) +source_group("" FILES ${SRC}) +target_sources(parallelkernels_cuda PRIVATE ${SRC}) + +# Link dependent libraries +target_link_libraries(parallelkernels_cuda + vcl_compute_cuda +) \ No newline at end of file diff --git a/src/examples/cuda/parallelkernels/main.cpp 
b/src/examples/cuda/parallelkernels/main.cpp new file mode 100644 index 00000000..c0022040 --- /dev/null +++ b/src/examples/cuda/parallelkernels/main.cpp @@ -0,0 +1,114 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +// VCL configuration +#include + +// C++ standard library +#include + +// VCL +#include +#include +#include +#include +#include +#include +#include + +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +); + +int main(int argc, char* argv[]) +{ + using namespace Vcl::Compute::Cuda; + using Vcl::Core::ref_ptr; + using Vcl::Core::dynamic_pointer_cast; + using Vcl::Core::static_pointer_cast; + + const size_t problem_size = 128; + + Platform::initialise(); + + for (int d = 0; d < Platform::instance()->nrDevices(); d++) + { + auto& dev = Platform::instance()->device(d); + Context ctx{ dev }; + + ref_ptr queue[] = { + dynamic_pointer_cast(ctx.defaultQueue()), + dynamic_pointer_cast(ctx.createCommandQueue()) }; + + ref_ptr mem0[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem1[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem2[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + + float one = 1; + float two = 2; + + std::vector result[2]; + result[0] = std::vector(problem_size); + result[1] = std::vector(problem_size); + for (size_t i = 0; i < 2; i++) + { + queue[i]->fill(static_pointer_cast(mem0[i]), &one, sizeof(float)); + queue[i]->fill(static_pointer_cast(mem1[i]), &two, sizeof(float)); + } + + for (size_t i = 0; i < 2; i++) + { + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), 
(float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); + } + + for (size_t i = 0; i < 2; i++) + { + queue[i]->read(result[i].data(), static_pointer_cast(mem2[i])); + queue[i]->sync(); + } + for (auto f : result[0]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; + for (auto f : result[1]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; + } + + Platform::dispose(); + + return 0; +} diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu new file mode 100644 index 00000000..28462b83 --- /dev/null +++ b/src/examples/cuda/parallelkernels/vectoradd.cu @@ -0,0 +1,55 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +__global__ void vectoradd +( + int size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) + return; + + vecC[idx] = 0; + for (int i = 0; i < 100000; i++) + atomicAdd(&vecC[idx], vecA[idx] + vecB[idx]); +} + +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + vectoradd<<<grid_size, block_size, 0, stream>>>(problem_size, vecA, vecB, vecC); +} diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp index 4505914c..eb5dc7ac 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp @@ -34,7 +34,7 @@ namespace Vcl { namespace Compute { namespace Cuda : Compute::CommandQueue() , _ownerCtx(owner) { - VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, 0)); + VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, CU_STREAM_NON_BLOCKING)); } CommandQueue::~CommandQueue() diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp index 3941edf4..f7a5b267 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp @@ -73,6 +73,10 @@ namespace Vcl { namespace Compute { namespace Cuda int nrAsyncEngines = 0; VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&nrAsyncEngines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev)); _nrAsyncEngines = nrAsyncEngines; + + int canRunConcurrentKernels = 0; + VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&canRunConcurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, 
dev)); + _canRunConcurrentKernels = canRunConcurrentKernels != 0; } bool Device::supports(Feature f) const diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h index 1778fc3f..8d0193a4 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h @@ -89,5 +89,8 @@ namespace Vcl { namespace Compute { namespace Cuda //! Number of asynchronous engines uint32_t _nrAsyncEngines{ 0 }; + + //! Indicate if the device can run kernels concurrently + bool _canRunConcurrentKernels{ false }; }; }}}