diff --git a/6_Example_PackingProblem/main.cpp b/6_Example_PackingProblem/main.cpp index 7f5a794..22956bf 100644 --- a/6_Example_PackingProblem/main.cpp +++ b/6_Example_PackingProblem/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -9,99 +10,220 @@ using namespace std; using namespace oneapi; -vector doMAP(int a,int x[], int n) +/** + * Perform a parallel map on the elements of an input array and store + * them in a caller-provided output array. Should work with any other + * "array-like" data type that supports the subscript operator `[]`. + * + * If the template parameters seem confusing, imagine that: + * - \c OutArrT is \c OutT[] + * - \c InArrT is \c InT[] + * - \c FuncT is \c OutT(InT) + * + * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to + * specify any template parameters for this function template. + * + * @param out Output array + * @param in Input array + * @param func Map function + * @param n Length of the arrays + */ +template +void doMap(OutArrT& out, const InArrT& in, FuncT&& func, size_t n) { - vector out(n); - - tbb::parallel_for( - tbb::blocked_range(0, n), - - // lambda function - [&](tbb::blocked_range r) { - for (auto i = r.begin(); i != r.end(); i++) { - out[i] = x[i]>=a; - } - } - - ); - return out; + tbb::parallel_for( + tbb::blocked_range(0, n), + + // lambda function + [&](tbb::blocked_range r) { + for (size_t i = r.begin(); i != r.end(); i++) { + out[i] = invoke(func, in[i]); + } + } + ); } +/** + * Calculate a parallel prefix/scan of an input array and store the + * individual results in an output array. Should work with any other + * "array-like" data type that supports the subscript operator `[]`. + * + * The TBB function for a parallel scan takes two functions: one to + * perform a sequential scan over a range, and another to combine two + * summaries. This abstraction defines the former function from the + * latter, so the caller only needs to provide the combiner function. + * + * If the template parameters seem confusing, imagine that: + * - \c ArrT is \c T[] + * - \c InArrT is \c InT[] + * - \c FuncT is \c T(T, T) + * + * The only reason why \c InArrT and \c ArrT are different types is to + * enable implicit casting of input values to the output type without + * having to do a separate map step. For example, implicitly casting + * booleans in an array to \c size_t when computing the prefix sum. + * + * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to + * specify any template parameters for this function template. + * + * @param out Output array + * @param in Input array + * @param ident The identity element for the function + * @param func Combiner function + * @param n Length of the arrays + * + * @return The summary computed over the whole range + */ +template +T doScan(ArrT &out, const InArrT& in, const T& ident, FuncT&& func, size_t n) +{ + return tbb::parallel_scan( + tbb::blocked_range(0, n), // range + ident, + [&](tbb::blocked_range r, T sum, bool is_final_scan) { + T tmp = sum; + for (size_t i = r.begin(); i != r.end(); i++) { + tmp = invoke(func, tmp, in[i]); + if (is_final_scan) { + out[i] = tmp; + } + } + return tmp; + }, + //[&](T left, T right) { + // return invoke(func, left, right); + //} + func /* No lambda needed, just use the provided function directly */ + ); +} -int doSCAN(int out[], const int in[],int n){ - int total_sum = tbb::parallel_scan( - tbb::blocked_range(0, n), //range - 0, //id - [&](tbb::blocked_range r, int sum, bool is_final_scan){ - int tmp = sum; - for (int i = r.begin(); i < r.end(); ++i) { - tmp = tmp + in[i]; - if (is_final_scan) - out[i] = tmp; - } - return tmp; - }, - [&]( int left, int right ) { - return left + right; - } - ); - return total_sum; +/** + * Filter the elements of an input array according to a boolean match + * array and store them in an output array at the position specified + * by the value in an index match array. Should work with any other + * "array-like" data type that supports the subscript operator `[]`. + * + * Note that a proper filter function would only take the input array + * and a \c bool(T) predicate function. However, this program is meant + * to show how a parallel filter function can be implemented, so the + * signature of this function takes the intermediate results that were + * previously computed. + * + * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to + * specify any template parameters for this function template. + * + * @param out Output array + * @param in Input array + * @param bolMatch Array of bool-like (result of mapping a predicate on \p in array) + * @param ixMatch Array of indices plus 1 (result of prefix sum over \p bolMatch array) + * @param n Length of the input arrays (output array can be shorter) + */ +template +void doFilter(ArrT& out, const ArrT& in, const BoolArrT& bolMatch, const IdxArrT& ixMatch, size_t n) +{ + tbb::parallel_for( + tbb::blocked_range(0, n), + [&](tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); i++) { + if (bolMatch[i]) { + out[ixMatch[i] - 1] = in[i]; + } + } + } + ); } -//doMAPFilter(&out[0],&ix[0],&x[0],&filter_results[0], x.size()) -void doMAPFilter(int bolMatch[], int ixMatch[],int x[], int out[], int n){ - tbb::parallel_for( - tbb::blocked_range(0, n), - // lambda function - [&](tbb::blocked_range r) { - for (auto i = r.begin(); i < r.end(); i++) { - if (bolMatch[i]){ - out[ixMatch[i]-1] = x[i]; - } - } - } - ); +/** + * Print the length and contents of a vector. Assumes elements can be + * printed using the \c << operator of \c cout directly. Attempts to + * pad values using \c setw() so that they remain aligned. + * + * @param vec The vector to print the info of + * @param name The name to show for this vector + */ +template +void printVec(const vector& vec, const string& name) +{ + const string prefix = name + " [" + to_string(vec.size()) + "]:"; + cout << setw(16) << prefix; + for (const T& e : vec) { + cout << setw(4) << e << ","; + } + cout << endl; } -int main(){ +/** + * Perform a parallel filter operation on an input vector. The output + * vector only contains the elements of the input vector for which the + * predicate returned true, preserving the order of the input elements. + * + * We use vectors here for convenience, since we need to allocate both + * intermediate and final results. + * + * If the template parameters seem confusing, imagine that: + * - \c PredT is \c bool(T) + * + * @param inputVec Input vector + * @param predicate Function to test each element with + * + * @return A vector with the filtered elements from the input + */ +template +vector vecFilter(const vector& inputVec, PredT&& predicate) +{ + const size_t n = inputVec.size(); + printVec(inputVec, "inputVec"); + + /** + * MAP: Apply the predicate to each element of the input vector. + * We obtain a \c bolMatch vector where each boolean value is + * the value returned by the predicate for a given input value. + */ + vector bolMatch(n); + doMap(bolMatch, inputVec, predicate, n); + printVec(bolMatch, "bolMatch"); + + /** + * SCAN: Compute the prefix sum of the \c bolMatch vector. The + * resulting \c ixMatch vector tells us the index in the output + * vector where a given input value needs to be stored (plus 1). + */ + const auto scanFunc = [](size_t a, size_t b) { return a + b; }; + const size_t identity = 0; + vector ixMatch(n); + /** + * We can calculate the output length (i.e. the number of values + * that passed the predicate) using a REDUCE operation. However, + * what \c doScan returns (the summary computed over the entire + * range) is exactly the same a REDUCE operation would return. + */ + size_t outSize = doScan(ixMatch, bolMatch, identity, scanFunc, n); + printVec(ixMatch, "ixMatch"); + + /** + * JOIN: Using \c bolMatch and \c ixMatch from previous steps, + * copy the input elements that passed the predicate (i.e. for + * which \c bolMatch is true) to the output vector, making use + * of \c ixMatch to know in which position the elements should + * be inserted. + */ + vector filteredVec(outSize); + doFilter(filteredVec, inputVec, bolMatch, ixMatch, n); + printVec(filteredVec, "filteredVec"); + + return filteredVec; +} - static int a = 10; - static vector x{7,1,0,13,0,15,20,-1}; +int main() +{ + const vector input{7, 1, 0, 13, 0, 15, 20, -1}; + const auto predicate = [](int x) { return x > 10; }; tbb::tick_count t0 = tbb::tick_count::now(); - - //MAP operation - vector bolMatch = doMAP(a,&x[0], x.size()); - - cout << "Map vector: "<< endl; - for (int i: bolMatch){ - cout << i << ','; - } - cout << endl; - - //SCAN - vector ixMatch(x.size()); - int sum = doSCAN(&ixMatch[0], &bolMatch[0], x.size()); //get index order - - cout << "Scan vector: " << sum << endl; - for (int i: ixMatch){ - cout << i << ','; - } - cout << endl; - - //JOIN - vector filtered_results(sum); - doMAPFilter(&bolMatch[0],&ixMatch[0],&x[0],&filtered_results[0], x.size()); - - - cout << "Filtered vector: " << endl; - for (int i: filtered_results){ - cout << i << ','; - } - cout << endl; - - cout << "\nTime: " << (tbb::tick_count::now()-t0).seconds() << "seconds" << endl; - - return 0; - - } + + const vector output = vecFilter(input, predicate); + + /* NOTE: this includes the time spent printing the contents of vectors */ + cout << "\nTime: " << (tbb::tick_count::now() - t0).seconds() << " seconds" << endl; + return 0; +} diff --git a/6_Example_PackingProblem/seqvsparallel.cpp b/6_Example_PackingProblem/seqvsparallel.cpp index c7c9e49..52d0d2e 100644 --- a/6_Example_PackingProblem/seqvsparallel.cpp +++ b/6_Example_PackingProblem/seqvsparallel.cpp @@ -12,94 +12,127 @@ using namespace std; using namespace oneapi; using namespace tbb; -template +/** + * This allows selecting between two different versions of the inner + * loop function in the parallel scan implementation. This optimised + * version is more verbose, but avoids evaluating the same condition + * each and every loop iteration. + * + * When not using compiler optimisations, the unoptimised version is + * much worse (about +33% exec time for ARRAY_SIZE = 100000000). But + * with \c -O3 optimisations both approaches perform about the same, + * i.e. the compiler is smart enough to optimise this if allowed to. + * + * This shows that manual optimisations (which often make the source + * code harder to read and maintain) may be useless when things like + * the compiler can automatically perform the same optimisations. So + * consider profiling / benchmarking before wasting time optimising. + */ +#define USE_OPTIMISED_LOOP 0 + +template class Body { - T reduced_result; - T* const y; - const T* const x; - - public: - - Body( T y_[], const T x_[] ) : reduced_result(0), x(x_), y(y_) {} - - T get_reduced_result() const {return reduced_result;} - - template - void operator()( const blocked_range& r, Tag ) - { - T temp = reduced_result; - - for( int i=r.begin(); i + void operator()(const blocked_range& r, Tag) + { +#if USE_OPTIMISED_LOOP + /** + * Evaluate \c Tag::is_final_scan() once outside the + * loop, but we need to have two separate loops. + */ + if (Tag::is_final_scan()) { + for (size_t i = r.begin(); i < r.end(); ++i) { + sum = sum + x[i]; + y[i] = sum; + } + } else { + for (size_t i = r.begin(); i < r.end(); ++i) { + sum = sum + x[i]; + } + } +#else + /** + * Less verbose, but \c Tag::is_final_scan() is + * evaluated each and every iteration. Or is it? + */ + for (size_t i = r.begin(); i < r.end(); ++i) { + sum = sum + x[i]; + if (Tag::is_final_scan()) { + y[i] = sum; + } + } +#endif + } + + Body(Body& b, split) : x(b.x), y(b.y), sum(0) {} + + void reverse_join(Body& a) + { + sum = a.sum + sum; + } + + void assign(Body& b) + { + sum = b.sum; + } }; - -template -float DoParallelScan( T y[], const T x[], int n) +template +T DoParallelScan(T y[], const T x[], size_t n) { - Body body(y,x); - tick_count t1,t2,t3,t4; - t1=tick_count::now(); - parallel_scan( blocked_range(0,n), body , auto_partitioner() ); - t2=tick_count::now(); - cout<<"Time Taken for parallel scan is \t"<<(t2-t1).seconds()< body(y, x); + const tick_count t0 = tick_count::now(); + parallel_scan(blocked_range(0, n), body); + const tick_count t1 = tick_count::now(); + cout << "Time Taken for parallel scan is: " << (t1 - t0).seconds() << endl; + return body.get_sum(); } - -template -float SerialScan(T1 y[], const T1 x[], int n) +template +T DoSerialScan(T y[], const T x[], size_t n) { - tick_count t3,t4; - - t3=tick_count::now(); - T1 temp = 10; - - for( int i=1; i y1(ARRAY_SIZE), x1(ARRAY_SIZE); - int y1[100000],x1[100000]; - - for(int i=0;i<100000;i++) - x1[i]=i; - - cout<(i & 0x7fffffff); + } - cout<<"\n serial scan output is \t"<