PyCudaIntro/source/likelihood/kernel.cu at master · nayyarv/PyCudaIntro · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include <stdlib.h>
#include <math.h>
// #include <stdio.h>

#define MAXTHREADS MAX_THREADS
#define PI 3.1415926536

__device__ float normalDistribution(float* x, float* mu,
	float* diagonalCovariance, unsigned int dim){
	/*
	x:  individual point being evaluated, x[dim]
	mu: mean of the normal distribution being evaluated  mu[dim]
	diagonalCovariance: for the norm dist  diagonalCovariance[dim]

	dim: dimensionality of the distribution, also
	equal to length of the previous vectors
	Evaluates the normalDistribution on the GPU, for
	diagonal Covariance Matrices only.
	*/
	float total = 0;
	float det = 1;
	float finval = 0;
	float denom = 0;
	float temp = 0;

	for (int i = 0; i < dim; ++i)
	{
		temp = (x[i]-mu[i]);
		temp *= temp; // Square it
		total += temp / diagonalCovariance[i];
		//Note this is the stuff that goes inside the normal
		det *= diagonalCovariance[i];
		//TODO: replace with memory implementation instead?
	}


	// printf("temp = %f, det = %f, total = %f\n", temp, det, total);

	total*=-1/2.0;

	finval = expf(total);

	denom = powf(2*PI, dim) * det;

	return (rsqrtf(denom) * finval);
}


__global__ void likelihoodKernel(float *Xpoints, float *means, float *diagCovs,
	float *weights,
	unsigned int dim, unsigned int numPoints, unsigned int numMixtures,
	float* finalLikelihood)
{
	/*
	All 2d arrays are passed in as row major
	Xpoints - 2d array of points, numPoints rows of vectors of dim length
		Xpoints[numPoints][dim]
	Means - 2d array of means, numMixtures rows of vectors of dim
		Means[numMixtures][dim]
	diagCovs - 2d array of cov diagonals, ditto
		diagCovs[numMixtures][dim]
	weights - 1d array of length numMixtures
		weights[numMixtures]

	numPoints is the actual number of points being evaluated
		This is likely to be a subset of what actually needs to be processe
		GridDim*BlockDim > numPoints
	finalLikelihood: Likelihood value that we want to return
		finalLikelihood[blockIdx.x]
	Since threads are usually a power of 2, we have to check if we're out of bounds
	with regards to the data.
	*/

	__shared__ float sarray[MAXTHREADS];
	//Should be consistently at the max allowed and easier than dynamic allocation

	int index = blockIdx.x * blockDim.x + threadIdx.x;
	int threadIndex = threadIdx.x;

	sarray[threadIndex] = 0;

	__syncthreads();
	//Following CUDA guidelines here for quick reduction
	//TODO: Speed up computation by having a block per mixture?
	// If possible, also allows for marginal updates


	if (index<numPoints) //Check that we're in bounds!
	{
		// Just make sure we have stuff to compute
		// Will contain the id of the x value

		float value = 0;

		for (int i = 0; i < numMixtures; ++i)
		{
			value += weights[i] * normalDistribution(Xpoints+(index*dim), means+(i*dim),
													 diagCovs+(i*dim), dim);
		}

		sarray[threadIndex] = logf(value); //Log Likelihood


	}
	else
	{
		sarray[threadIndex] = 0.0f; //I.e it's zero
	}

	// finalLikelihood[threadIndex] = sarray[threadIndex];

	// Reduction
	__syncthreads();
	for (int s = blockDim.x/2; s > 0; s>>=1)
	{
		//Only works for powers of 2
		if (threadIndex<s)
		{
			sarray[threadIndex] += sarray[threadIndex+s];
		}
		__syncthreads();
	}


	if (threadIndex==0)
	//Since everything has been synced, sarray[0] now holds our result
	{
		finalLikelihood[blockIdx.x] = sarray[0];
	}


}