diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..11a8f13 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,76 @@ +{ + "cmake.configureOnOpen": false, + "files.associations": { + "algorithm": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "cctype": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "exception": "cpp", + "format": "cpp", + "forward_list": "cpp", + "fstream": "cpp", + "functional": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "iterator": "cpp", + "limits": "cpp", + "list": "cpp", + "locale": "cpp", + "map": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + "optional": "cpp", + "ostream": "cpp", + "ratio": "cpp", + "set": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "string": "cpp", + "system_error": "cpp", + "thread": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "utility": "cpp", + "vector": "cpp", + "xfacet": "cpp", + "xhash": "cpp", + "xiosbase": "cpp", + "xlocale": "cpp", + "xlocbuf": "cpp", + "xlocinfo": "cpp", + "xlocmes": "cpp", + "xlocmon": "cpp", + "xlocnum": "cpp", + "xloctime": "cpp", + "xmemory": "cpp", + "xstddef": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xtree": "cpp", + "xutility": "cpp" + } +} \ No newline at end of file diff --git a/README.md b/README.md index d63a6a1..e2d0b3a 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,58 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Richard Chen + * [LinkedIn](https://www.linkedin.com/in/richardrlchen) +* Tested on: Windows 11, i7-10875H @ 2.3GHz 16GB, RTX 2060 MAX-Q 6GB (Personal Computer) -### (TODO: Your README) +## Overview +This project involved computing and rendering flocks of boids all on the GPU. +First was a naive O(n^2) approach that involved pairwise checking all the boids. +Next, a uniform grid data structure was employed so that only close boids would be checked, reducing the amount of math needed. Lastly, the uniform grid was improved by rearranging the buffers on the GPU rather than adding a layer of indirection. This should greatly improve memory access times. + +## Videos and Images +100,000 Boids +
+ + +10,000 Boids +
+ + +### Naive Implementation +Naive barely handles 50k Boids +
+ + +### Uniform Grid +Uniform Grid handles 50k Boids just fine +
+ + +## Performance +Visualize On +
+ + +Visualize Off +
+ + +* As the number of boids increases, the naive approach does not scale +* At lower boid numbers, the coherent approach incurs overhead from reshuffling arrays but with more boids, the memory indirection time saved overcomes this + +Block Size +
+ + +## Questions +* Increasing the boids increases the number of computations needed. With the naive +implmentation acting on every pair, it is O(n^2) while for the spatial grid based +implementations, the repelling behavior at close distances means that we should not +hit the strict n choose 2 case. +* Each Streaming Multiprocessor runs one warp at a time so as long as all the SMs are saturated, +there should not be a significant difference +* The coherent grid trades some extra copies and assigns to avoid reading from slow memory and needing to refresh the cache. With a small number of boids, the overhead outweighs the time saved but large boid numbers is where it shines +* The performance was slightly slower but perhaps it could have benefits as the physical area being checked is smaller, it just has to be done in a way that the extra checks overhead is compensated for by the time saved checking a smaller area, perhaps with high boid density and large grid squares. As it currently was for +50,000 boids on the coherent implmentation, the fps dropped from around 360 to 333. -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/images/Visualize Off.svg b/images/Visualize Off.svg new file mode 100644 index 0000000..1ba4a5b --- /dev/null +++ b/images/Visualize Off.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/Visualize On.svg b/images/Visualize On.svg new file mode 100644 index 0000000..20a22da --- /dev/null +++ b/images/Visualize On.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/blocksize.svg b/images/blocksize.svg new file mode 100644 index 0000000..edd9802 --- /dev/null +++ b/images/blocksize.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/naiveIsSlow.png b/images/naiveIsSlow.png new file mode 100644 index 0000000..b281f43 Binary files /dev/null and b/images/naiveIsSlow.png differ diff --git a/images/recording1.gif b/images/recording1.gif new file mode 100644 index 0000000..daaafdb Binary files /dev/null and b/images/recording1.gif differ diff --git a/images/recording2.gif b/images/recording2.gif new file mode 100644 index 0000000..30771bc Binary files /dev/null and b/images/recording2.gif differ diff --git a/images/uniformIsFine.png b/images/uniformIsFine.png new file mode 100644 index 0000000..5dc3f85 Binary files /dev/null and b/images/uniformIsFine.png differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..6f7e796 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -6,13 +6,15 @@ #include "utilityCore.hpp" #include "kernel.h" +#include "fstream" + // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax -#define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) +#define imax(a, b) (((a) > (b)) ? (a) : (b)) #endif #ifndef imin -#define imin( a, b ) ( ((a) < (b)) ? (a) : (b) ) +#define imin(a, b) (((a) < (b)) ? (a) : (b)) #endif #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) @@ -20,10 +22,13 @@ /** * Check for CUDA errors; print and exit if there was a problem. */ -void checkCUDAError(const char *msg, int line = -1) { +void checkCUDAError(const char *msg, int line = -1) +{ cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - if (line >= 0) { + if (cudaSuccess != err) + { + if (line >= 0) + { fprintf(stderr, "Line %d: ", line); } fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); @@ -31,7 +36,6 @@ void checkCUDAError(const char *msg, int line = -1) { } } - /***************** * Configuration * *****************/ @@ -53,6 +57,7 @@ void checkCUDAError(const char *msg, int line = -1) { /*! Size of the starting area in simulation space. */ #define scene_scale 100.0f +// #define scene_scale 20.f /*********************************************** * Kernel state (pointers are device pointers) * @@ -75,7 +80,7 @@ glm::vec3 *dev_vel2; // For efficient sorting and the uniform grid. These should always be parallel. int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? -int *dev_particleGridIndices; // What grid cell is this particle in? +int *dev_particleGridIndices; // What grid cell is this particle in? // needed for use with thrust thrust::device_ptr dev_thrust_particleArrayIndices; thrust::device_ptr dev_thrust_particleGridIndices; @@ -85,6 +90,7 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_newPos; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -98,7 +104,8 @@ glm::vec3 gridMinimum; * initSimulation * ******************/ -__host__ __device__ unsigned int hash(unsigned int a) { +__host__ __device__ unsigned int hash(unsigned int a) +{ a = (a + 0x7ed55d16) + (a << 12); a = (a ^ 0xc761c23c) ^ (a >> 19); a = (a + 0x165667b1) + (a << 5); @@ -112,7 +119,8 @@ __host__ __device__ unsigned int hash(unsigned int a) { * LOOK-1.2 - this is a typical helper function for a CUDA kernel. * Function for generating a random vec3. */ -__host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { +__host__ __device__ glm::vec3 generateRandomVec3(float time, int index) +{ thrust::default_random_engine rng(hash((int)(index * time))); thrust::uniform_real_distribution unitDistrib(-1, 1); @@ -123,9 +131,11 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { * LOOK-1.2 - This is a basic CUDA kernel. * CUDA kernel for generating boids with a specified mass randomly around the star. */ -__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) { +__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 *arr, float scale) +{ int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { + if (index < N) + { glm::vec3 rand = generateRandomVec3(time, index); arr[index].x = scale * rand.x; arr[index].y = scale * rand.y; @@ -136,28 +146,30 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo /** * Initialize memory, update some globals */ -void Boids::initSimulation(int N) { +void Boids::initSimulation(int N) +{ numObjects = N; dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // LOOK-1.2 - This is basic CUDA memory management and error checking. // Don't forget to cudaFree in Boids::endSimulation. - cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); + cudaMalloc((void **)&dev_pos, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); - cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); + cudaMalloc((void **)&dev_vel1, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); - cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); + cudaMalloc((void **)&dev_vel2, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); // LOOK-1.2 - This is a typical CUDA kernel invocation. kernGenerateRandomPosArray<<>>(1, numObjects, - dev_pos, scene_scale); + dev_pos, scene_scale); checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + gridCellWidth *= 0.5f; int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; @@ -169,10 +181,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void **)&dev_particleArrayIndices, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void **)&dev_particleGridIndices, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void **)&dev_newPos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_newPos failed!"); + cudaDeviceSynchronize(); } - /****************** * copyBoidsToVBO * ******************/ @@ -180,12 +206,14 @@ void Boids::initSimulation(int N) { /** * Copy the boid positions into the VBO so that they can be drawn by OpenGL. */ -__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) { +__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) +{ int index = threadIdx.x + (blockIdx.x * blockDim.x); float c_scale = -1.0f / s_scale; - if (index < N) { + if (index < N) + { vbo[4 * index + 0] = pos[index].x * c_scale; vbo[4 * index + 1] = pos[index].y * c_scale; vbo[4 * index + 2] = pos[index].z * c_scale; @@ -193,10 +221,12 @@ __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float } } -__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) { +__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) +{ int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index < N) { + if (index < N) + { vbo[4 * index + 0] = vel[index].x + 0.3f; vbo[4 * index + 1] = vel[index].y + 0.3f; vbo[4 * index + 2] = vel[index].z + 0.3f; @@ -207,18 +237,18 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float /** * Wrapper for call to the kernCopyboidsToVBO CUDA kernel. */ -void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { +void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) +{ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO<<>>(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO<<>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale); checkCUDAErrorWithLine("copyBoidsToVBO failed!"); cudaDeviceSynchronize(); } - /****************** * stepSimulation * ******************/ @@ -229,32 +259,79 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * Compute the new velocity on the body with index `iSelf` due to the `N` boids * in the `pos` and `vel` arrays. */ -__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { +__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) +{ + glm::vec3 const myPos = pos[iSelf]; // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + glm::vec3 perceivedCenter(0); + int neighbors = 0; + for (int i = 0; i < N; i++) + { + if (i != iSelf && (glm::distance(pos[i], myPos) < rule1Distance)) + { + perceivedCenter += pos[i]; + neighbors++; + } + } + perceivedCenter /= neighbors > 0 ? neighbors : 1; + glm::vec3 dV1 = rule1Scale * (perceivedCenter - myPos); // Rule 2: boids try to stay a distance d away from each other + glm::vec3 c(0); + for (int i = 0; i < N; i++) + { + if (i != iSelf && (glm::distance(pos[i], myPos) < rule2Distance)) + { + c -= pos[i] - myPos; + } + } + glm::vec3 dV2 = rule2Scale * c; // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 perceivedVelocity(0); + int neighbors2 = 0; + for (int i = 0; i < N; i++) + { + if (i != iSelf && (glm::distance(pos[i], myPos) < rule3Distance)) + { + perceivedVelocity += vel[i]; + neighbors2++; + } + } + perceivedVelocity /= neighbors2 > 0 ? neighbors2 : 1; + glm::vec3 dV3 = rule3Scale * perceivedVelocity; + return dV1 + dV2 + dV3; } /** * TODO-1.2 implement basic flocking -* For each of the `N` bodies, update its position based on its current velocity. +* For each of the `N` bodies, update its next velocity based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { + glm::vec3 *vel1, glm::vec3 *vel2) +{ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) + { + return; + } // Compute a new velocity based on pos and vel1 + glm::vec3 dVel(computeVelocityChange(N, index, pos, vel1)); // Clamp the speed + // dVel = glm::length(dVel) > maxSpeed ? glm::normalize(dVel) : dVel; + dVel = maxSpeed * glm::normalize(dVel); // Record the new velocity into vel2. Question: why NOT vel1? + vel2[index] = dVel; // + vel1[index]; } /** * LOOK-1.2 Since this is pretty trivial, we implemented it for you. * For each of the `N` bodies, update its position based on its current velocity. */ -__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { +__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) +{ // Update position by velocity int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index >= N) { + if (index >= N) + { return; } glm::vec3 thisPos = pos[index]; @@ -278,57 +355,172 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { // for(x) // for(y) // for(z)? Or some other order? -__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { +__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) +{ return x + y * gridResolution + z * gridResolution * gridResolution; } __global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 - // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3 *pos, int *indices, int *gridIndices) +{ + // TODO-2.1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) + { + return; + } + // - Label each boid with the index of its grid cell. + indices[index] = index; + // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + glm::ivec3 gridCoord(glm::floor(inverseCellWidth * (pos[index] - gridMin))); + gridIndices[index] = gridIndex3Dto1D(gridCoord.x, + gridCoord.y, + gridCoord.z, + gridResolution); } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids -__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { +__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) +{ + // set null value as -1 int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { + if (index < N) + { intBuffer[index] = value; } } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { + int *gridCellStartIndices, int *gridCellEndIndices) +{ // TODO-2.1 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) + { + return; + } + auto myGridIdx = particleGridIndices[index]; + auto prevGridIdx = index == 0 ? -1 : particleGridIndices[index - 1]; + auto nextGridIdx = index == N - 1 ? -1 : particleGridIndices[index + 1]; + __syncthreads(); + if (prevGridIdx == -1 || myGridIdx != prevGridIdx) + { + gridCellStartIndices[myGridIdx] = index; + } + if (nextGridIdx == -1 || myGridIdx != nextGridIdx) + { + gridCellEndIndices[myGridIdx] = index; + } + __syncthreads(); // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) + { + return; + } // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. + auto selfIdx = particleArrayIndices[index]; + auto myPos = pos[selfIdx]; + auto myGridPos = inverseCellWidth * (myPos - gridMin); // - Identify the grid cell that this particle is in + glm::ivec3 gridCoord(glm::floor(myGridPos)); // - Identify which cells may contain neighbors. This isn't always 8. + int xNew = gridCoord.x + (glm::fract(myGridPos.x) > 0.5 ? 1 : -1); + int yNew = gridCoord.y + (glm::fract(myGridPos.y) > 0.5 ? 1 : -1); + int zNew = gridCoord.z + (glm::fract(myGridPos.z) > 0.5 ? 1 : -1); + xNew = xNew < 0 || xNew > gridResolution - 1 ? -1 : xNew; + yNew = yNew < 0 || yNew > gridResolution - 1 ? -1 : yNew; + zNew = zNew < 0 || zNew > gridResolution - 1 ? -1 : zNew; + int myGridCells[8] = {-1}; + myGridCells[0] = gridIndex3Dto1D(gridCoord.x, gridCoord.y, gridCoord.z, gridResolution); + myGridCells[1] = xNew >= 0 ? gridIndex3Dto1D(xNew, gridCoord.y, gridCoord.z, gridResolution) : -1; + myGridCells[2] = yNew >= 0 ? gridIndex3Dto1D(gridCoord.x, yNew, gridCoord.z, gridResolution) : -1; + myGridCells[3] = yNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, yNew, gridCoord.z, gridResolution) : -1; + myGridCells[4] = zNew >= 0 ? gridIndex3Dto1D(gridCoord.x, gridCoord.y, zNew, gridResolution) : -1; + myGridCells[5] = zNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, gridCoord.y, zNew, gridResolution) : -1; + myGridCells[6] = zNew >= 0 && yNew >= 0 ? gridIndex3Dto1D(gridCoord.x, yNew, zNew, gridResolution) : -1; + myGridCells[7] = zNew >= 0 && yNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, yNew, zNew, gridResolution) : -1; // - For each cell, read the start/end indices in the boid pointer array. // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. + glm::vec3 perceivedCenter(0); + int neighbors = 0; + glm::vec3 c(0); + glm::vec3 perceivedVelocity(0); + int neighbors2 = 0; + for (int i = 0; i < 8; i++) + { + if (myGridCells[i] == -1) + { + continue; + } + int currIdx = myGridCells[i]; + for (int boid = gridCellStartIndices[currIdx]; boid <= gridCellEndIndices[currIdx]; boid++) + { + auto boidIdx = particleArrayIndices[boid]; + auto otherPos = pos[boidIdx]; + if (boidIdx != selfIdx) + { + if (glm::distance(otherPos, myPos) < rule1Distance) + { + perceivedCenter += otherPos; + neighbors++; + } + if (glm::distance(otherPos, myPos) < rule2Distance) + { + c -= otherPos - myPos; + } + if (glm::distance(otherPos, myPos) < rule3Distance) + { + perceivedVelocity += vel1[boidIdx]; + neighbors2++; + } + } + } + } + glm::vec3 dVel(0.f); + if (neighbors > 0) + { + perceivedCenter /= neighbors; + dVel += rule1Scale * (perceivedCenter - myPos); + } + dVel += rule2Scale * c; + if (neighbors2 > 0) + { + perceivedVelocity /= neighbors2; + dVel += rule3Scale * perceivedVelocity; + } + auto newVel = vel1[selfIdx] + dVel; // - Clamp the speed change before putting the new speed in vel2 + vel2[selfIdx] = glm::length(newVel) > maxSpeed ? maxSpeed * glm::normalize(newVel) : newVel; } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) + { + return; + } // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, // except with one less level of indirection. // This should expect gridCellStartIndices and gridCellEndIndices to refer @@ -341,58 +533,288 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + auto myPos = pos[index]; + auto myGridPos = inverseCellWidth * (myPos - gridMin); + // - Identify the grid cell that this particle is in + glm::ivec3 gridCoord(glm::floor(myGridPos)); + // - Identify which cells may contain neighbors. This isn't always 8. + // int xNew = gridCoord.x + (glm::fract(myGridPos.x) > 0.5 ? 1 : -1); + // int yNew = gridCoord.y + (glm::fract(myGridPos.y) > 0.5 ? 1 : -1); + // int zNew = gridCoord.z + (glm::fract(myGridPos.z) > 0.5 ? 1 : -1); + // xNew = xNew < 0 || xNew > gridResolution - 1 ? -1 : xNew; + // yNew = yNew < 0 || yNew > gridResolution - 1 ? -1 : yNew; + // zNew = zNew < 0 || zNew > gridResolution - 1 ? -1 : zNew; + // int myGridCells[8] = {-1}; + // myGridCells[0] = gridIndex3Dto1D(gridCoord.x, gridCoord.y, gridCoord.z, gridResolution); + // myGridCells[1] = xNew >= 0 ? gridIndex3Dto1D(xNew, gridCoord.y, gridCoord.z, gridResolution) : -1; + // myGridCells[2] = yNew >= 0 ? gridIndex3Dto1D(gridCoord.x, yNew, gridCoord.z, gridResolution) : -1; + // myGridCells[3] = yNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, yNew, gridCoord.z, gridResolution) : -1; + // myGridCells[4] = zNew >= 0 ? gridIndex3Dto1D(gridCoord.x, gridCoord.y, zNew, gridResolution) : -1; + // myGridCells[5] = zNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, gridCoord.y, zNew, gridResolution) : -1; + // myGridCells[6] = zNew >= 0 && yNew >= 0 ? gridIndex3Dto1D(gridCoord.x, yNew, zNew, gridResolution) : -1; + // myGridCells[7] = zNew >= 0 && yNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, yNew, zNew, gridResolution) : -1; + int myGridCells[27] = {-1}; + for (int i = -1; i <= 1; i++) + { + for (int j = -1; j <= 1; j++) + { + for (int k = -1; k <= 1; k++) + { + auto xNew = gridCoord.x + k; + xNew = xNew < 0 || xNew > gridResolution - 1 ? -1 : xNew; + auto yNew = gridCoord.y + j; + yNew = yNew < 0 || yNew > gridResolution - 1 ? -1 : yNew; + auto zNew = gridCoord.z + i; + zNew = zNew < 0 || zNew > gridResolution - 1 ? -1 : zNew; + myGridCells[k + 1 + (j + 1) * 3 + (i + 1) * 9] = + zNew >= 0 && yNew >= 0 && xNew >= 0 ? gridIndex3Dto1D(xNew, yNew, zNew, gridResolution) : -1; + } + } + } + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + glm::vec3 perceivedCenter(0); + int neighbors = 0; + glm::vec3 c(0); + glm::vec3 perceivedVelocity(0); + int neighbors2 = 0; + // for (int i = 0; i < 8; i++) + for (int i = 0; i < 27; i++) + { + if (myGridCells[i] == -1) + { + continue; + } + int currIdx = myGridCells[i]; + for (int boid = gridCellStartIndices[currIdx]; boid <= gridCellEndIndices[currIdx]; boid++) + { + // auto boidIdx = particleArrayIndices[boid]; + auto otherPos = pos[boid]; + if (boid != index) + { + if (glm::distance(otherPos, myPos) < rule1Distance) + { + perceivedCenter += otherPos; + neighbors++; + } + if (glm::distance(otherPos, myPos) < rule2Distance) + { + c -= otherPos - myPos; + } + if (glm::distance(otherPos, myPos) < rule3Distance) + { + perceivedVelocity += vel1[boid]; + neighbors2++; + } + } + } + } + glm::vec3 dVel(0.f); + if (neighbors > 0) + { + perceivedCenter /= neighbors; + dVel += rule1Scale * (perceivedCenter - myPos); + } + dVel += rule2Scale * c; + if (neighbors2 > 0) + { + perceivedVelocity /= neighbors2; + dVel += rule3Scale * perceivedVelocity; + } + auto newVel = vel1[index] + dVel; + // - Clamp the speed change before putting the new speed in vel2 + vel2[index] = glm::length(newVel) > maxSpeed ? maxSpeed * glm::normalize(newVel) : newVel; } /** * Step the entire N-body simulation by `dt` seconds. */ -void Boids::stepSimulationNaive(float dt) { +void Boids::stepSimulationNaive(float dt) +{ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + kernUpdateVelocityBruteForce<<>>(numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // TODO-1.2 ping-pong the velocity buffers + glm::vec3 *tmp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tmp; } -void Boids::stepSimulationScatteredGrid(float dt) { +void Boids::stepSimulationScatteredGrid(float dt) +{ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); // TODO-2.1 // Uniform Grid Neighbor search using Thrust sort. // In Parallel: // - label each particle with its array index as well as its grid index. // Use 2x width grids. + kernComputeIndices<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer<<>>( + gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer gridCellStartIndices failed!"); + kernResetIntBuffer<<>>( + gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer gridCellEndIndices failed!"); + kernIdentifyCellStartEnd<<>>(numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos, + dev_vel1, + dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + // kernUpdateVelocityBruteForce<<>>(numObjects, dev_pos, dev_vel1, dev_vel2); // - Update positions + kernUpdatePos<<>>(numObjects, + dt, + dev_pos, + dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // - Ping-pong buffers as needed + glm::vec3 *tmp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tmp; } -void Boids::stepSimulationCoherentGrid(float dt) { +__global__ void kernRearrangeParticleData( + int N, int *particleArrayIndices, + glm::vec3 *oldPos, glm::vec3 *newPos, + glm::vec3 *oldVel, glm::vec3 *newVel) +{ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) + { + return; + } + newPos[index] = oldPos[particleArrayIndices[index]]; + newVel[index] = oldVel[particleArrayIndices[index]]; +} +void Boids::stepSimulationCoherentGrid(float dt) +{ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCells((gridCellCount + blockSize - 1) / blockSize); // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. // In Parallel: // - Label each particle with its array index as well as its grid index. // Use 2x width grids + kernComputeIndices<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer<<>>( + gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer gridCellStartIndices failed!"); + kernResetIntBuffer<<>>( + gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer gridCellEndIndices failed!"); + kernIdentifyCellStartEnd<<>>(numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array. // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + + // v1 holds current velocity data, rearrange into v2. + // in next function, flip so v1 after holds the next timesteps + kernRearrangeParticleData<<>>(numObjects, + dev_particleArrayIndices, + dev_pos, + dev_newPos, + dev_vel1, + dev_vel2); + checkCUDAErrorWithLine("kernRearrangeParticleData failed!"); // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent<<>>(numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_newPos, + dev_vel2, + dev_vel1); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); // - Update positions + kernUpdatePos<<>>(numObjects, + dt, + dev_newPos, + dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + glm::vec3 *tmp = dev_pos; + dev_pos = dev_newPos; + dev_newPos = tmp; } -void Boids::endSimulation() { +void Boids::endSimulation() +{ cudaFree(dev_vel1); cudaFree(dev_vel2); cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_particleArrayIndices); + cudaFree(dev_newPos); } -void Boids::unitTest() { +void Boids::unitTest() +{ // LOOK-1.2 Feel free to write additional tests here. // test unstable sort @@ -400,30 +822,41 @@ void Boids::unitTest() { int *dev_intValues; int N = 10; - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); + std::unique_ptr intKeys{new int[N]}; + std::unique_ptr intValues{new int[N]}; + + intKeys[0] = 0; + intValues[0] = 0; + intKeys[1] = 1; + intValues[1] = 1; + intKeys[2] = 0; + intValues[2] = 2; + intKeys[3] = 3; + intValues[3] = 3; + intKeys[4] = 0; + intValues[4] = 4; + intKeys[5] = 2; + intValues[5] = 5; + intKeys[6] = 2; + intValues[6] = 6; + intKeys[7] = 0; + intValues[7] = 7; + intKeys[8] = 5; + intValues[8] = 8; + intKeys[9] = 6; + intValues[9] = 9; + + cudaMalloc((void **)&dev_intKeys, N * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); + cudaMalloc((void **)&dev_intValues, N * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { std::cout << " key: " << intKeys[i]; std::cout << " value: " << intValues[i] << std::endl; } @@ -444,7 +877,8 @@ void Boids::unitTest() { checkCUDAErrorWithLine("memcpy back failed!"); std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { + for (int i = 0; i < N; i++) + { std::cout << " key: " << intKeys[i]; std::cout << " value: " << intValues[i] << std::endl; } diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..8ae7691 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,24 +14,28 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 50000; const float DT = 0.2f; /** * C main function. */ -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) +{ projectName = "565 CUDA Intro: Boids"; - if (init(argc, argv)) { + if (init(argc, argv)) + { mainLoop(); Boids::endSimulation(); return 0; - } else { + } + else + { return 1; } } @@ -46,17 +50,19 @@ GLFWwindow *window; /** * Initialization of CUDA and GLFW. */ -bool init(int argc, char **argv) { +bool init(int argc, char **argv) +{ // Set window title to "Student Name: [SM 2.0] GPU Name" cudaDeviceProp deviceProp; int gpuDevice = 0; int device_count = 0; cudaGetDeviceCount(&device_count); - if (gpuDevice > device_count) { + if (gpuDevice > device_count) + { std::cout - << "Error: GPU device number is greater than the number of devices!" - << " Perhaps a CUDA-capable GPU is not installed?" - << std::endl; + << "Error: GPU device number is greater than the number of devices!" + << " Perhaps a CUDA-capable GPU is not installed?" + << std::endl; return false; } cudaGetDeviceProperties(&deviceProp, gpuDevice); @@ -70,11 +76,12 @@ bool init(int argc, char **argv) { // Window setup stuff glfwSetErrorCallback(errorCallback); - if (!glfwInit()) { + if (!glfwInit()) + { std::cout - << "Error: Could not initialize GLFW!" - << " Perhaps OpenGL 3.3 isn't available?" - << std::endl; + << "Error: Could not initialize GLFW!" + << " Perhaps OpenGL 3.3 isn't available?" + << std::endl; return false; } @@ -84,7 +91,8 @@ bool init(int argc, char **argv) { glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE); window = glfwCreateWindow(width, height, deviceName.c_str(), NULL, NULL); - if (!window) { + if (!window) + { glfwTerminate(); return false; } @@ -94,7 +102,8 @@ bool init(int argc, char **argv) { glfwSetMouseButtonCallback(window, mouseButtonCallback); glewExperimental = GL_TRUE; - if (glewInit() != GLEW_OK) { + if (glewInit() != GLEW_OK) + { return false; } @@ -120,15 +129,17 @@ bool init(int argc, char **argv) { return true; } -void initVAO() { +void initVAO() +{ - std::unique_ptr bodies{ new GLfloat[4 * (N_FOR_VIS)] }; - std::unique_ptr bindices{ new GLuint[N_FOR_VIS] }; + std::unique_ptr bodies{new GLfloat[4 * (N_FOR_VIS)]}; + std::unique_ptr bindices{new GLuint[N_FOR_VIS]}; glm::vec4 ul(-1.0, -1.0, 1.0, 1.0); glm::vec4 lr(1.0, 1.0, 0.0, 0.0); - for (int i = 0; i < N_FOR_VIS; i++) { + for (int i = 0; i < N_FOR_VIS; i++) + { bodies[4 * i + 0] = 0.0f; bodies[4 * i + 1] = 0.0f; bodies[4 * i + 2] = 0.0f; @@ -136,7 +147,6 @@ void initVAO() { bindices[i] = i; } - glGenVertexArrays(1, &boidVAO); // Attach everything needed to draw a particle to this glGenBuffers(1, &boidVBO_positions); glGenBuffers(1, &boidVBO_velocities); @@ -145,7 +155,7 @@ void initVAO() { glBindVertexArray(boidVAO); // Bind the positions array to the boidVAO by way of the boidVBO_positions - glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer + glBindBuffer(GL_ARRAY_BUFFER, boidVBO_positions); // bind the buffer glBufferData(GL_ARRAY_BUFFER, 4 * (N_FOR_VIS) * sizeof(GLfloat), bodies.get(), GL_DYNAMIC_DRAW); // transfer data glEnableVertexAttribArray(positionLocation); @@ -163,151 +173,166 @@ void initVAO() { glBindVertexArray(0); } -void initShaders(GLuint * program) { +void initShaders(GLuint *program) +{ GLint location; program[PROG_BOID] = glslUtility::createProgram( - "shaders/boid.vert.glsl", - "shaders/boid.geom.glsl", - "shaders/boid.frag.glsl", attributeLocations, 2); - glUseProgram(program[PROG_BOID]); - - if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) { - glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); - } - if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) { - glUniform3fv(location, 1, &cameraPosition[0]); - } + "shaders/boid.vert.glsl", + "shaders/boid.geom.glsl", + "shaders/boid.frag.glsl", attributeLocations, 2); + glUseProgram(program[PROG_BOID]); + + if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) + { + glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); } - - //==================================== - // Main loop - //==================================== - void runCUDA() { - // Map OpenGL buffer object for writing from CUDA on a single GPU - // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not - // use this buffer - - float4 *dptr = NULL; - float *dptrVertPositions = NULL; - float *dptrVertVelocities = NULL; - - cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); - cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); - - // execute the kernel - #if UNIFORM_GRID && COHERENT_GRID - Boids::stepSimulationCoherentGrid(DT); - #elif UNIFORM_GRID - Boids::stepSimulationScatteredGrid(DT); - #else - Boids::stepSimulationNaive(DT); - #endif - - #if VISUALIZE - Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); - #endif - // unmap buffer object - cudaGLUnmapBufferObject(boidVBO_positions); - cudaGLUnmapBufferObject(boidVBO_velocities); + if ((location = glGetUniformLocation(program[PROG_BOID], "u_cameraPos")) != -1) + { + glUniform3fv(location, 1, &cameraPosition[0]); } +} - void mainLoop() { - double fps = 0; - double timebase = 0; - int frame = 0; - - Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure - // your CUDA development setup is ready to go. +//==================================== +// Main loop +//==================================== +void runCUDA() +{ + // Map OpenGL buffer object for writing from CUDA on a single GPU + // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not + // use this buffer + + float4 *dptr = NULL; + float *dptrVertPositions = NULL; + float *dptrVertVelocities = NULL; + + cudaGLMapBufferObject((void **)&dptrVertPositions, boidVBO_positions); + cudaGLMapBufferObject((void **)&dptrVertVelocities, boidVBO_velocities); + +// execute the kernel +#if UNIFORM_GRID && COHERENT_GRID + Boids::stepSimulationCoherentGrid(DT); +#elif UNIFORM_GRID + Boids::stepSimulationScatteredGrid(DT); +#else + Boids::stepSimulationNaive(DT); +#endif + +#if VISUALIZE + Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); +#endif + // unmap buffer object + cudaGLUnmapBufferObject(boidVBO_positions); + cudaGLUnmapBufferObject(boidVBO_velocities); +} - while (!glfwWindowShouldClose(window)) { - glfwPollEvents(); +void mainLoop() +{ + double fps = 0; + double timebase = 0; + int frame = 0; - frame++; - double time = glfwGetTime(); + Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure + // your CUDA development setup is ready to go. - if (time - timebase > 1.0) { - fps = frame / (time - timebase); - timebase = time; - frame = 0; - } + while (!glfwWindowShouldClose(window)) + { + glfwPollEvents(); - runCUDA(); + frame++; + double time = glfwGetTime(); - std::ostringstream ss; - ss << "["; - ss.precision(1); - ss << std::fixed << fps; - ss << " fps] " << deviceName; - glfwSetWindowTitle(window, ss.str().c_str()); + if (time - timebase > 1.0) + { + fps = frame / (time - timebase); + timebase = time; + frame = 0; + } - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + runCUDA(); - #if VISUALIZE - glUseProgram(program[PROG_BOID]); - glBindVertexArray(boidVAO); - glPointSize((GLfloat)pointSize); - glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); - glPointSize(1.0f); + std::ostringstream ss; + ss << "["; + ss.precision(1); + ss << std::fixed << fps; + ss << " fps] " << deviceName; + glfwSetWindowTitle(window, ss.str().c_str()); - glUseProgram(0); - glBindVertexArray(0); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - glfwSwapBuffers(window); - #endif - } - glfwDestroyWindow(window); - glfwTerminate(); - } +#if VISUALIZE + glUseProgram(program[PROG_BOID]); + glBindVertexArray(boidVAO); + glPointSize((GLfloat)pointSize); + glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); + glPointSize(1.0f); + glUseProgram(0); + glBindVertexArray(0); - void errorCallback(int error, const char *description) { - fprintf(stderr, "error %d: %s\n", error, description); + glfwSwapBuffers(window); +#endif } + glfwDestroyWindow(window); + glfwTerminate(); +} - void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods) { - if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) { - glfwSetWindowShouldClose(window, GL_TRUE); - } - } +void errorCallback(int error, const char *description) +{ + fprintf(stderr, "error %d: %s\n", error, description); +} - void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) { - leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS); - rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS); +void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) +{ + if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) + { + glfwSetWindowShouldClose(window, GL_TRUE); } +} - void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) { - if (leftMousePressed) { - // compute new camera parameters - phi += (xpos - lastX) / width; - theta -= (ypos - lastY) / height; - theta = std::fmax(0.01f, std::fmin(theta, 3.14f)); - updateCamera(); - } - else if (rightMousePressed) { - zoom += (ypos - lastY) / height; - zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f)); - updateCamera(); - } +void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods) +{ + leftMousePressed = (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS); + rightMousePressed = (button == GLFW_MOUSE_BUTTON_RIGHT && action == GLFW_PRESS); +} - lastX = xpos; - lastY = ypos; +void mousePositionCallback(GLFWwindow *window, double xpos, double ypos) +{ + if (leftMousePressed) + { + // compute new camera parameters + phi += (xpos - lastX) / width; + theta -= (ypos - lastY) / height; + theta = std::fmax(0.01f, std::fmin(theta, 3.14f)); + updateCamera(); + } + else if (rightMousePressed) + { + zoom += (ypos - lastY) / height; + zoom = std::fmax(0.1f, std::fmin(zoom, 5.0f)); + updateCamera(); } - void updateCamera() { - cameraPosition.x = zoom * sin(phi) * sin(theta); - cameraPosition.z = zoom * cos(theta); - cameraPosition.y = zoom * cos(phi) * sin(theta); - cameraPosition += lookAt; + lastX = xpos; + lastY = ypos; +} - projection = glm::perspective(fovy, float(width) / float(height), zNear, zFar); - glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1)); - projection = projection * view; +void updateCamera() +{ + cameraPosition.x = zoom * sin(phi) * sin(theta); + cameraPosition.z = zoom * cos(theta); + cameraPosition.y = zoom * cos(phi) * sin(theta); + cameraPosition += lookAt; - GLint location; + projection = glm::perspective(fovy, float(width) / float(height), zNear, zFar); + glm::mat4 view = glm::lookAt(cameraPosition, lookAt, glm::vec3(0, 0, 1)); + projection = projection * view; - glUseProgram(program[PROG_BOID]); - if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) { - glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); - } + GLint location; + + glUseProgram(program[PROG_BOID]); + if ((location = glGetUniformLocation(program[PROG_BOID], "u_projMatrix")) != -1) + { + glUniformMatrix4fv(location, 1, GL_FALSE, &projection[0][0]); } +}