-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel.cu
More file actions
133 lines (102 loc) · 3.24 KB
/
kernel.cu
File metadata and controls
133 lines (102 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include <stdlib.h>
#include <math.h>
// #include <stdio.h>
#define MAXTHREADS MAX_THREADS
#define PI 3.1415926536
__device__ float normalDistribution(float* x, float* mu,
float* diagonalCovariance, unsigned int dim){
/*
x: individual point being evaluated, x[dim]
mu: mean of the normal distribution being evaluated mu[dim]
diagonalCovariance: for the norm dist diagonalCovariance[dim]
dim: dimensionality of the distribution, also
equal to length of the previous vectors
Evaluates the normalDistribution on the GPU, for
diagonal Covariance Matrices only.
*/
float total = 0;
float det = 1;
float finval = 0;
float denom = 0;
float temp = 0;
for (int i = 0; i < dim; ++i)
{
temp = (x[i]-mu[i]);
temp *= temp; // Square it
total += temp / diagonalCovariance[i];
//Note this is the stuff that goes inside the normal
det *= diagonalCovariance[i];
//TODO: replace with memory implementation instead?
}
// printf("temp = %f, det = %f, total = %f\n", temp, det, total);
total*=-1/2.0;
finval = expf(total);
denom = powf(2*PI, dim) * det;
return (rsqrtf(denom) * finval);
}
__global__ void likelihoodKernel(float *Xpoints, float *means, float *diagCovs,
float *weights,
unsigned int dim, unsigned int numPoints, unsigned int numMixtures,
float* finalLikelihood)
{
/*
All 2d arrays are passed in as row major
Xpoints - 2d array of points, numPoints rows of vectors of dim length
Xpoints[numPoints][dim]
Means - 2d array of means, numMixtures rows of vectors of dim
Means[numMixtures][dim]
diagCovs - 2d array of cov diagonals, ditto
diagCovs[numMixtures][dim]
weights - 1d array of length numMixtures
weights[numMixtures]
numPoints is the actual number of points being evaluated
This is likely to be a subset of what actually needs to be processe
GridDim*BlockDim > numPoints
finalLikelihood: Likelihood value that we want to return
finalLikelihood[blockIdx.x]
Since threads are usually a power of 2, we have to check if we're out of bounds
with regards to the data.
*/
__shared__ float sarray[MAXTHREADS];
//Should be consistently at the max allowed and easier than dynamic allocation
int index = blockIdx.x * blockDim.x + threadIdx.x;
int threadIndex = threadIdx.x;
sarray[threadIndex] = 0;
__syncthreads();
//Following CUDA guidelines here for quick reduction
//TODO: Speed up computation by having a block per mixture?
// If possible, also allows for marginal updates
if (index<numPoints) //Check that we're in bounds!
{
// Just make sure we have stuff to compute
// Will contain the id of the x value
float value = 0;
for (int i = 0; i < numMixtures; ++i)
{
value += weights[i] * normalDistribution(Xpoints+(index*dim), means+(i*dim),
diagCovs+(i*dim), dim);
}
sarray[threadIndex] = logf(value); //Log Likelihood
}
else
{
sarray[threadIndex] = 0.0f; //I.e it's zero
}
// finalLikelihood[threadIndex] = sarray[threadIndex];
// Reduction
__syncthreads();
for (int s = blockDim.x/2; s > 0; s>>=1)
{
//Only works for powers of 2
if (threadIndex<s)
{
sarray[threadIndex] += sarray[threadIndex+s];
}
__syncthreads();
}
if (threadIndex==0)
//Since everything has been synced, sarray[0] now holds our result
{
finalLikelihood[blockIdx.x] = sarray[0];
}
}