From de4c4e36310955cfc500f0fc549e1f3cca35533a Mon Sep 17 00:00:00 2001
From: Basil Fierz <basil.fierz@hotmail.com>
Date: Fri, 13 Oct 2017 19:58:33 +0200
Subject: [PATCH 1/2] Adds concurrent kernel example

---
 src/examples/CMakeLists.txt                   |   4 +
 src/examples/cuda/CMakeLists.txt              |   2 +
 .../cuda/parallelkernels/CMakeLists.txt       |  32 +++++
 src/examples/cuda/parallelkernels/main.cpp    | 109 ++++++++++++++++++
 .../cuda/parallelkernels/vectoradd.cu         |  40 +++++++
 .../vcl/compute/cuda/commandqueue.cpp         |   2 +-
 .../vcl/compute/cuda/device.cpp               |   4 +
 .../vcl/compute/cuda/device.h                 |   3 +
 8 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 src/examples/cuda/CMakeLists.txt
 create mode 100644 src/examples/cuda/parallelkernels/CMakeLists.txt
 create mode 100644 src/examples/cuda/parallelkernels/main.cpp
 create mode 100644 src/examples/cuda/parallelkernels/vectoradd.cu

diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt
index f314eb75..c8ebb86e 100644
--- a/src/examples/CMakeLists.txt
+++ b/src/examples/CMakeLists.txt
@@ -11,6 +11,10 @@ IF(VCL_OPENGL_SUPPORT)
 	SUBDIRS(opengl)
 ENDIF()
 
+IF(CUDA_FOUND AND VCL_CUDA_SUPPORT)
+	SUBDIRS(cuda)
+ENDIF()
+
 IF(OPENCL_FOUND AND VCL_OPENCL_SUPPORT)
 	SUBDIRS(opencl)
 ENDIF()
diff --git a/src/examples/cuda/CMakeLists.txt b/src/examples/cuda/CMakeLists.txt
new file mode 100644
index 00000000..c617467c
--- /dev/null
+++ b/src/examples/cuda/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Execute parallel kernels
+SUBDIRS(parallelkernels)
diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt
new file mode 100644
index 00000000..47021442
--- /dev/null
+++ b/src/examples/cuda/parallelkernels/CMakeLists.txt
@@ -0,0 +1,32 @@
+PROJECT(parallelkernels_cuda)
+
+INCLUDE(${CMAKE_SOURCE_DIR}/cmake/VCLCompileCUDA.cmake)
+
+SET(SRC
+	main.cpp
+)
+
+SET(KERNELS_CU
+	vectoradd.cu
+)
+# Access the include directories in order to compile the CUDA code
+GET_PROPERTY(CURR_INC_DIRS_0 TARGET vcl_core_cuda PROPERTY INCLUDE_DIRECTORIES)
+LIST(APPEND CURR_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CURR_INC_DIRS_0})
+
+VCLCOMPILECU(
+	${PROJECT_SOURCE_DIR}/vectoradd.cu
+	"vectoradd"
+	"${CURR_INC_DIRS}"
+	COMPILEDKERNELS_0
+)
+SET(COMPILEDKERNELS ${COMPILEDKERNELS_0})
+
+SOURCE_GROUP("cuc" FILES ${COMPILEDKERNELS})
+SOURCE_GROUP("" FILES ${SRC} ${KERNELS})
+
+ADD_EXECUTABLE(parallelkernels_cuda ${SRC} ${KERNELS} ${COMPILEDKERNELS})
+SET_TARGET_PROPERTIES(parallelkernels_cuda PROPERTIES FOLDER examples/cuda)
+
+TARGET_LINK_LIBRARIES(parallelkernels_cuda
+	vcl_compute_cuda
+)
\ No newline at end of file
diff --git a/src/examples/cuda/parallelkernels/main.cpp b/src/examples/cuda/parallelkernels/main.cpp
new file mode 100644
index 00000000..97518498
--- /dev/null
+++ b/src/examples/cuda/parallelkernels/main.cpp
@@ -0,0 +1,109 @@
+/*
+ * This file is part of the Visual Computing Library (VCL) release under the
+ * MIT license.
+ *
+ * Copyright (c) 2017 Basil Fierz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// VCL configuration
+#include <vcl/config/global.h>
+
+// C++ standard library
+#include <iostream>
+
+// VCL
+#include <vcl/compute/cuda/buffer.h>
+#include <vcl/compute/cuda/commandqueue.h>
+#include <vcl/compute/cuda/context.h>
+#include <vcl/compute/cuda/device.h>
+#include <vcl/compute/cuda/kernel.h>
+#include <vcl/compute/cuda/platform.h>
+#include <vcl/math/math.h>
+
+extern uint32_t vectoradd[];
+extern size_t vectoraddSize;
+
+int main(int argc, char* argv[])
+{
+	using namespace Vcl::Compute::Cuda;
+	using Vcl::Core::ref_ptr;
+	using Vcl::Core::dynamic_pointer_cast;
+	using Vcl::Core::static_pointer_cast;
+
+	const size_t problem_size = 1024*1024;
+
+	Platform::initialise();
+
+	for (int d = 0; d < Platform::instance()->nrDevices(); d++)
+	{
+		auto& dev = Platform::instance()->device(d);
+		Context ctx{ dev };
+		
+		auto mod = ctx.createModuleFromSource((const int8_t*) vectoradd, vectoraddSize * sizeof(uint32_t));
+		auto kernel = dynamic_pointer_cast<Kernel>(mod->kernel("vectoradd"));
+
+		ref_ptr<CommandQueue> queue[] = {
+			dynamic_pointer_cast<CommandQueue>(ctx.defaultQueue()),
+			dynamic_pointer_cast<CommandQueue>(ctx.createCommandQueue()) };
+
+		ref_ptr<Buffer> mem0[] = { dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) };
+		ref_ptr<Buffer> mem1[] = { dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) };
+		ref_ptr<Buffer> mem2[] = { dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast<Buffer>(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) };
+
+		float one = 1;
+		float two = 2;
+		
+		std::vector<float> result[2];
+		result[0] = std::vector<float>(problem_size);
+		result[1] = std::vector<float>(problem_size);
+		for (size_t i = 0; i < 2; i++)
+		{
+			queue[i]->fill(static_pointer_cast<Vcl::Compute::Buffer>(mem0[i]), &one, sizeof(float));
+			queue[i]->fill(static_pointer_cast<Vcl::Compute::Buffer>(mem1[i]), &two, sizeof(float));
+		}
+		
+		for (size_t i = 0; i < 2; i++)
+		{
+			kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]);
+			kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]);
+		}
+		
+		for (size_t i = 0; i < 2; i++)
+		{
+			queue[i]->read(result[i].data(), static_pointer_cast<const Vcl::Compute::Buffer>(mem2[i]));
+			queue[i]->sync();
+		}
+		//for (auto f : result[0])
+		//{
+		//	std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F');
+		//}
+		//std::cout << std::endl;
+		//for (auto f : result[1])
+		//{
+		//	std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F');
+		//}
+		//std::cout << std::endl;
+	}
+
+	Platform::dispose();
+
+	return 0;
+}
diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu
new file mode 100644
index 00000000..5addce64
--- /dev/null
+++ b/src/examples/cuda/parallelkernels/vectoradd.cu
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the Visual Computing Library (VCL) release under the
+ * MIT license.
+ *
+ * Copyright (c) 2017 Basil Fierz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+extern "C"
+__global__ void vectoradd
+(
+	int size,
+	const float* vecA,
+	const float* vecB,
+	      float* vecC
+)
+{
+	int idx = threadIdx.x + blockDim.x*blockIdx.x;
+	if (idx >= size)
+		return;
+	
+	vecC[idx] = vecA[idx] + vecB[idx];
+}
diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp
index 4505914c..eb5dc7ac 100644
--- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp
+++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp
@@ -34,7 +34,7 @@ namespace Vcl { namespace Compute { namespace Cuda
 	: Compute::CommandQueue()
 	, _ownerCtx(owner)
 	{
-		VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, 0));
+		VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, CU_STREAM_NON_BLOCKING));
 	}
 
 	CommandQueue::~CommandQueue()
diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp
index 3941edf4..f7a5b267 100644
--- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp
+++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp
@@ -73,6 +73,10 @@ namespace Vcl { namespace Compute { namespace Cuda
 		int nrAsyncEngines = 0;
 		VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&nrAsyncEngines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev));
 		_nrAsyncEngines = nrAsyncEngines;
+		
+		int canRunConcurrentKernels = 0;
+		VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&canRunConcurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev));
+		_canRunConcurrentKernels = canRunConcurrentKernels != 0;
 	}
 
 	bool Device::supports(Feature f) const
diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h
index 1778fc3f..8d0193a4 100644
--- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h
+++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h
@@ -89,5 +89,8 @@ namespace Vcl { namespace Compute { namespace Cuda
 
 		//! Number of asynchronous engines
 		uint32_t _nrAsyncEngines{ 0 };
+		
+		//! Indicate if the device can run kernels concurrently
+		bool _canRunConcurrentKernels{ false };
 	};
 }}}

From 7e7592d4845fb9a4d675013832a42680e2f9515d Mon Sep 17 00:00:00 2001
From: Basil Fierz <basil.fierz@hotmail.com>
Date: Tue, 17 Oct 2017 20:57:31 +0200
Subject: [PATCH 2/2] Switch to CMake 3.8 cuda support

---
 .../cuda/parallelkernels/CMakeLists.txt       | 46 +++++++++----------
 src/examples/cuda/parallelkernels/main.cpp    | 41 +++++++++--------
 .../cuda/parallelkernels/vectoradd.cu         | 19 +++++++-
 3 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt
index 47021442..c1af793a 100644
--- a/src/examples/cuda/parallelkernels/CMakeLists.txt
+++ b/src/examples/cuda/parallelkernels/CMakeLists.txt
@@ -1,32 +1,32 @@
-PROJECT(parallelkernels_cuda)
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+project(parallelkernels_cuda LANGUAGES CXX CUDA)
 
-INCLUDE(${CMAKE_SOURCE_DIR}/cmake/VCLCompileCUDA.cmake)
+# Define the project's target
+add_executable(parallelkernels_cuda "")
 
-SET(SRC
-	main.cpp
-)
+# Place the target into a specific solution folder
+set_target_properties(parallelkernels_cuda PROPERTIES FOLDER examples/cuda)
 
-SET(KERNELS_CU
-	vectoradd.cu
-)
-# Access the include directories in order to compile the CUDA code
-GET_PROPERTY(CURR_INC_DIRS_0 TARGET vcl_core_cuda PROPERTY INCLUDE_DIRECTORIES)
-LIST(APPEND CURR_INC_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${CURR_INC_DIRS_0})
+# Enable C++11 for CUDA
+target_compile_features(parallelkernels_cuda PUBLIC cxx_std_11)
 
-VCLCOMPILECU(
-	${PROJECT_SOURCE_DIR}/vectoradd.cu
-	"vectoradd"
-	"${CURR_INC_DIRS}"
-	COMPILEDKERNELS_0
-)
-SET(COMPILEDKERNELS ${COMPILEDKERNELS_0})
+# Use separate compilation and linking for CUDA
+set_target_properties(parallelkernels_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
-SOURCE_GROUP("cuc" FILES ${COMPILEDKERNELS})
-SOURCE_GROUP("" FILES ${SRC} ${KERNELS})
+# Define the compilation targets
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=sm_30")
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_50,code=compute_50")
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_60,code=compute_60")
 
-ADD_EXECUTABLE(parallelkernels_cuda ${SRC} ${KERNELS} ${COMPILEDKERNELS})
-SET_TARGET_PROPERTIES(parallelkernels_cuda PROPERTIES FOLDER examples/cuda)
+# Define the source
+set(SRC
+	main.cpp
+	vectoradd.cu
+)
+source_group("" FILES ${SRC})
+target_sources(parallelkernels_cuda PRIVATE ${SRC})
 
-TARGET_LINK_LIBRARIES(parallelkernels_cuda
+# Link dependent libraries
+target_link_libraries(parallelkernels_cuda
 	vcl_compute_cuda
 )
\ No newline at end of file
diff --git a/src/examples/cuda/parallelkernels/main.cpp b/src/examples/cuda/parallelkernels/main.cpp
index 97518498..c0022040 100644
--- a/src/examples/cuda/parallelkernels/main.cpp
+++ b/src/examples/cuda/parallelkernels/main.cpp
@@ -38,8 +38,16 @@
 #include <vcl/compute/cuda/platform.h>
 #include <vcl/math/math.h>
 
-extern uint32_t vectoradd[];
-extern size_t vectoraddSize;
+void vectoradd
+(
+	cudaStream_t stream, 
+	int grid_size,
+	int block_size,
+	int problem_size,
+	const float* vecA,
+	const float* vecB,
+	float* vecC
+);
 
 int main(int argc, char* argv[])
 {
@@ -48,7 +56,7 @@ int main(int argc, char* argv[])
 	using Vcl::Core::dynamic_pointer_cast;
 	using Vcl::Core::static_pointer_cast;
 
-	const size_t problem_size = 1024*1024;
+	const size_t problem_size = 128;
 
 	Platform::initialise();
 
@@ -57,9 +65,6 @@ int main(int argc, char* argv[])
 		auto& dev = Platform::instance()->device(d);
 		Context ctx{ dev };
 		
-		auto mod = ctx.createModuleFromSource((const int8_t*) vectoradd, vectoraddSize * sizeof(uint32_t));
-		auto kernel = dynamic_pointer_cast<Kernel>(mod->kernel("vectoradd"));
-
 		ref_ptr<CommandQueue> queue[] = {
 			dynamic_pointer_cast<CommandQueue>(ctx.defaultQueue()),
 			dynamic_pointer_cast<CommandQueue>(ctx.createCommandQueue()) };
@@ -82,8 +87,8 @@ int main(int argc, char* argv[])
 		
 		for (size_t i = 0; i < 2; i++)
 		{
-			kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]);
-			kernel->run(*queue[i], (int)problem_size / 32, 32, 0, (int)problem_size, mem0[i], mem1[i], mem2[i]);
+			vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr());
+			vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr());
 		}
 		
 		for (size_t i = 0; i < 2; i++)
@@ -91,16 +96,16 @@ int main(int argc, char* argv[])
 			queue[i]->read(result[i].data(), static_pointer_cast<const Vcl::Compute::Buffer>(mem2[i]));
 			queue[i]->sync();
 		}
-		//for (auto f : result[0])
-		//{
-		//	std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F');
-		//}
-		//std::cout << std::endl;
-		//for (auto f : result[1])
-		//{
-		//	std::cout << (Vcl::Mathematics::equal(f, 3, 1e-5f) ? '.' : 'F');
-		//}
-		//std::cout << std::endl;
+		for (auto f : result[0])
+		{
+			std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F');
+		}
+		std::cout << std::endl;
+		for (auto f : result[1])
+		{
+			std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F');
+		}
+		std::cout << std::endl;
 	}
 
 	Platform::dispose();
diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu
index 5addce64..28462b83 100644
--- a/src/examples/cuda/parallelkernels/vectoradd.cu
+++ b/src/examples/cuda/parallelkernels/vectoradd.cu
@@ -23,7 +23,6 @@
  * SOFTWARE.
  */
 
-extern "C"
 __global__ void vectoradd
 (
 	int size,
@@ -36,5 +35,21 @@ __global__ void vectoradd
 	if (idx >= size)
 		return;
 	
-	vecC[idx] = vecA[idx] + vecB[idx];
+	vecC[idx] = 0;
+	for (int i = 0; i < 100000; i++)
+		atomicAdd(&vecC[idx], vecA[idx] + vecB[idx]);
+}
+
+void vectoradd
+(
+	cudaStream_t stream, 
+	int grid_size,
+	int block_size,
+	int problem_size,
+	const float* vecA,
+	const float* vecB,
+	float* vecC
+)
+{
+	vectoradd<<<grid_size / block_size, block_size, 0, stream>>>(problem_size, vecA, vecB, vecC);
 }