diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt index f314eb75..c8ebb86e 100644 --- a/src/examples/CMakeLists.txt +++ b/src/examples/CMakeLists.txt @@ -11,6 +11,10 @@ IF(VCL_OPENGL_SUPPORT) SUBDIRS(opengl) ENDIF() +IF(CUDA_FOUND AND VCL_CUDA_SUPPORT) + SUBDIRS(cuda) +ENDIF() + IF(OPENCL_FOUND AND VCL_OPENCL_SUPPORT) SUBDIRS(opencl) ENDIF() diff --git a/src/examples/cuda/CMakeLists.txt b/src/examples/cuda/CMakeLists.txt new file mode 100644 index 00000000..c617467c --- /dev/null +++ b/src/examples/cuda/CMakeLists.txt @@ -0,0 +1,2 @@ +# Execute parallel kernels +SUBDIRS(parallelkernels) diff --git a/src/examples/cuda/parallelkernels/CMakeLists.txt b/src/examples/cuda/parallelkernels/CMakeLists.txt new file mode 100644 index 00000000..c1af793a --- /dev/null +++ b/src/examples/cuda/parallelkernels/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(parallelkernels_cuda LANGUAGES CXX CUDA) + +# Define the project's target +add_executable(parallelkernels_cuda "") + +# Place the target into a specific solution folder +set_target_properties(parallelkernels_cuda PROPERTIES FOLDER examples/cuda) + +# Enable C++11 for CUDA +target_compile_features(parallelkernels_cuda PUBLIC cxx_std_11) + +# Use separate compilation and linking for CUDA +set_target_properties(parallelkernels_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +# Define the compilation targets +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=sm_30") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_50,code=compute_50") +string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_60,code=compute_60") + +# Define the source +set(SRC + main.cpp + vectoradd.cu +) +source_group("" FILES ${SRC}) +target_sources(parallelkernels_cuda PRIVATE ${SRC}) + +# Link dependent libraries +target_link_libraries(parallelkernels_cuda + vcl_compute_cuda +) \ No newline at end of file diff --git a/src/examples/cuda/parallelkernels/main.cpp 
b/src/examples/cuda/parallelkernels/main.cpp new file mode 100644 index 00000000..c0022040 --- /dev/null +++ b/src/examples/cuda/parallelkernels/main.cpp @@ -0,0 +1,114 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +// VCL configuration +#include + +// C++ standard library +#include + +// VCL +#include +#include +#include +#include +#include +#include +#include + +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +); + +int main(int argc, char* argv[]) +{ + using namespace Vcl::Compute::Cuda; + using Vcl::Core::ref_ptr; + using Vcl::Core::dynamic_pointer_cast; + using Vcl::Core::static_pointer_cast; + + const size_t problem_size = 128; + + Platform::initialise(); + + for (int d = 0; d < Platform::instance()->nrDevices(); d++) + { + auto& dev = Platform::instance()->device(d); + Context ctx{ dev }; + + ref_ptr queue[] = { + dynamic_pointer_cast(ctx.defaultQueue()), + dynamic_pointer_cast(ctx.createCommandQueue()) }; + + ref_ptr mem0[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem1[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + ref_ptr mem2[] = { dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))), dynamic_pointer_cast(ctx.createBuffer(Vcl::Compute::BufferAccess::ReadWrite, problem_size*sizeof(float))) }; + + float one = 1; + float two = 2; + + std::vector result[2]; + result[0] = std::vector(problem_size); + result[1] = std::vector(problem_size); + for (size_t i = 0; i < 2; i++) + { + queue[i]->fill(static_pointer_cast(mem0[i]), &one, sizeof(float)); + queue[i]->fill(static_pointer_cast(mem1[i]), &two, sizeof(float)); + } + + for (size_t i = 0; i < 2; i++) + { + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), 
(float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); + vectoradd(*queue[i], (int)problem_size, 32, (int)problem_size, (float*) mem0[i]->devicePtr(), (float*) mem1[i]->devicePtr(), (float*) mem2[i]->devicePtr()); + } + + for (size_t i = 0; i < 2; i++) + { + queue[i]->read(result[i].data(), static_pointer_cast(mem2[i])); + queue[i]->sync(); + } + for (auto f : result[0]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; + for (auto f : result[1]) + { + std::cout << (Vcl::Mathematics::equal(f, 300000, 1e-5f) ? '.' : 'F'); + } + std::cout << std::endl; + } + + Platform::dispose(); + + return 0; +} diff --git a/src/examples/cuda/parallelkernels/vectoradd.cu b/src/examples/cuda/parallelkernels/vectoradd.cu new file mode 100644 index 00000000..28462b83 --- /dev/null +++ b/src/examples/cuda/parallelkernels/vectoradd.cu @@ -0,0 +1,55 @@ +/* + * This file is part of the Visual Computing Library (VCL) release under the + * MIT license. + * + * Copyright (c) 2017 Basil Fierz + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +__global__ void vectoradd +( + int size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + int idx = threadIdx.x + blockDim.x*blockIdx.x; + if (idx >= size) + return; + + vecC[idx] = 0; + for (int i = 0; i < 100000; i++) + atomicAdd(&vecC[idx], vecA[idx] + vecB[idx]); +} + +void vectoradd +( + cudaStream_t stream, + int grid_size, + int block_size, + int problem_size, + const float* vecA, + const float* vecB, + float* vecC +) +{ + vectoradd<<<grid_size, block_size, 0, stream>>>(problem_size, vecA, vecB, vecC); +} diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp index 4505914c..eb5dc7ac 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/commandqueue.cpp @@ -34,7 +34,7 @@ namespace Vcl { namespace Compute { namespace Cuda : Compute::CommandQueue() , _ownerCtx(owner) { - VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, 0)); + VCL_CU_SAFE_CALL(cuStreamCreate(&_queue, CU_STREAM_NON_BLOCKING)); } CommandQueue::~CommandQueue() diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp index 3941edf4..f7a5b267 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.cpp @@ -73,6 +73,10 @@ namespace Vcl { namespace Compute { namespace Cuda int nrAsyncEngines = 0; VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&nrAsyncEngines, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev)); _nrAsyncEngines = nrAsyncEngines; + + int canRunConcurrentKernels = 0; + VCL_CU_SAFE_CALL(cuDeviceGetAttribute(&canRunConcurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, 
dev)); + _canRunConcurrentKernels = canRunConcurrentKernels != 0; } bool Device::supports(Feature f) const diff --git a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h index 1778fc3f..8d0193a4 100644 --- a/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h +++ b/src/libs/vcl.compute.cuda/vcl/compute/cuda/device.h @@ -89,5 +89,8 @@ namespace Vcl { namespace Compute { namespace Cuda //! Number of asynchronous engines uint32_t _nrAsyncEngines{ 0 }; + + //! Indicate if the device can run kernels concurrently + bool _canRunConcurrentKernels{ false }; }; }}}