diff --git a/6_Example_PackingProblem/main.cpp b/6_Example_PackingProblem/main.cpp
index 7f5a794..22956bf 100644
--- a/6_Example_PackingProblem/main.cpp
+++ b/6_Example_PackingProblem/main.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <iomanip>
 #include <vector>
 #include <algorithm>
 
@@ -9,99 +10,220 @@
 using namespace std;
 using namespace oneapi;
 
-vector<int> doMAP(int a,int x[], int n)
+/**
+ * Perform a parallel map on the elements of an input array and store
+ * them in a caller-provided output array. Should work with any other
+ * "array-like" data type that supports the subscript operator `[]`.
+ *
+ * If the template parameters seem confusing, imagine that:
+ *    - \c OutArrT is \c OutT[]
+ *    - \c InArrT is \c InT[]
+ *    - \c FuncT is \c OutT(InT)
+ *
+ * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to
+ * specify any template parameters for this function template.
+ *
+ * @param out   Output array
+ * @param in    Input array
+ * @param func  Map function
+ * @param n     Length of the arrays
+ */
+template<typename OutArrT, typename InArrT, typename FuncT>
+void doMap(OutArrT& out, const InArrT& in, FuncT&& func, size_t n)
 {
-    vector<int> out(n);
-
-    tbb::parallel_for(
-        tbb::blocked_range<int>(0, n),
-
-        // lambda function
-        [&](tbb::blocked_range<int> r) {
-            for (auto i = r.begin(); i != r.end(); i++) {
-                out[i] = x[i]>=a;
-            }
-        }
-
-    );
-    return out;
+	tbb::parallel_for(
+		tbb::blocked_range<size_t>(0, n),
+
+		// lambda function
+		[&](tbb::blocked_range<size_t> r) {
+			for (size_t i = r.begin(); i != r.end(); i++) {
+				out[i] = invoke(func, in[i]);
+			}
+		}
+	);
 }
 
+/**
+ * Calculate a parallel prefix/scan of an input array and store the
+ * individual results in an output array. Should work with any other
+ * "array-like" data type that supports the subscript operator `[]`.
+ *
+ * The TBB function for a parallel scan takes two functions: one to
+ * perform a sequential scan over a range, and another to combine two
+ * summaries. This abstraction defines the former function from the
+ * latter, so the caller only needs to provide the combiner function.
+ *
+ * If the template parameters seem confusing, imagine that:
+ *    - \c ArrT is \c T[]
+ *    - \c InArrT is \c InT[]
+ *    - \c FuncT is \c T(T, T)
+ *
+ * The only reason why \c InArrT and \c ArrT are different types is to
+ * enable implicit casting of input values to the output type without
+ * having to do a separate map step. For example, implicitly casting
+ * booleans in an array to \c size_t when computing the prefix sum.
+ *
+ * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to
+ * specify any template parameters for this function template.
+ *
+ * @param out   Output array
+ * @param in    Input array
+ * @param ident The identity element for the function
+ * @param func  Combiner function
+ * @param n     Length of the arrays
+ *
+ * @return The summary computed over the whole range
+ */
+template<typename T, typename ArrT, typename InArrT, typename FuncT>
+T doScan(ArrT &out, const InArrT& in, const T& ident, FuncT&& func, size_t n)
+{
+	return tbb::parallel_scan(
+		tbb::blocked_range<size_t>(0, n), // range
+		ident,
+		[&](tbb::blocked_range<size_t> r, T sum, bool is_final_scan) {
+			T tmp = sum;
+			for (size_t i = r.begin(); i != r.end(); i++) {
+				tmp = invoke(func, tmp, in[i]);
+				if (is_final_scan) {
+					out[i] = tmp;
+				}
+			}
+			return tmp;
+		},
+		//[&](T left, T right) {
+		//	return invoke(func, left, right);
+		//}
+		func /* No lambda needed, just use the provided function directly */
+	);
+}
 
-int doSCAN(int out[], const int in[],int n){
-    int total_sum = tbb::parallel_scan(
-        tbb::blocked_range<int>(0, n), //range
-        0, //id
-        [&](tbb::blocked_range<int> r, int sum, bool is_final_scan){        
-            int tmp = sum;
-            for (int i = r.begin(); i < r.end(); ++i) {
-                tmp = tmp + in[i];
-                if (is_final_scan)
-                    out[i] = tmp;
-            }
-            return tmp;
-        },
-        [&]( int left, int right ) {
-            return left + right;
-        }
-    );
-    return total_sum;
+/**
+ * Filter the elements of an input array according to a boolean match
+ * array and store them in an output array at the position specified
+ * by the value in an index match array. Should work with any other
+ * "array-like" data type that supports the subscript operator `[]`.
+ *
+ * Note that a proper filter function would only take the input array
+ * and a \c bool(T) predicate function. However, this program is meant
+ * to show how a parallel filter function can be implemented, so the
+ * signature of this function takes the intermediate results that were
+ * previously computed.
+ *
+ * NOTE: Thanks to TAD (Template Argument Deduction), we do not need to
+ * specify any template parameters for this function template.
+ *
+ * @param out       Output array
+ * @param in        Input array
+ * @param bolMatch  Array of bool-like (result of mapping a predicate on \p in array)
+ * @param ixMatch   Array of indices plus 1 (result of prefix sum over \p bolMatch array)
+ * @param n         Length of the input arrays (output array can be shorter)
+ */
+template<typename ArrT, typename BoolArrT, typename IdxArrT>
+void doFilter(ArrT& out, const ArrT& in, const BoolArrT& bolMatch, const IdxArrT& ixMatch, size_t n)
+{
+	tbb::parallel_for(
+		tbb::blocked_range<size_t>(0, n),
+		[&](tbb::blocked_range<size_t> r) {
+			for (size_t i = r.begin(); i < r.end(); i++) {
+				if (bolMatch[i]) {
+					out[ixMatch[i] - 1] = in[i];
+				}
+			}
+		}
+	);
 }
 
-//doMAPFilter(&out[0],&ix[0],&x[0],&filter_results[0], x.size())
-void doMAPFilter(int bolMatch[], int ixMatch[],int x[], int out[], int n){
-    tbb::parallel_for(
-        tbb::blocked_range<int>(0, n),
-        // lambda function
-        [&](tbb::blocked_range<int> r) {
-            for (auto i = r.begin(); i < r.end(); i++) {
-                if (bolMatch[i]){
-                    out[ixMatch[i]-1] = x[i];    
-                }
-            }
-        }
-    );
+/**
+ * Print the length and contents of a vector. Assumes elements can be
+ * printed using the \c << operator of \c cout directly. Attempts to
+ * pad values using \c setw() so that they remain aligned.
+ *
+ * @param vec   The vector to print the info of
+ * @param name  The name to show for this vector
+ */
+template<typename T>
+void printVec(const vector<T>& vec, const string& name)
+{
+	const string prefix = name + " [" + to_string(vec.size()) + "]:";
+	cout << setw(16) << prefix;
+	for (const T& e : vec) {
+		cout << setw(4) << e << ",";
+	}
+	cout << endl;
 }
 
-int main(){
+/**
+ * Perform a parallel filter operation on an input vector. The output
+ * vector only contains the elements of the input vector for which the
+ * predicate returned true, preserving the order of the input elements.
+ *
+ * We use vectors here for convenience, since we need to allocate both
+ * intermediate and final results.
+ *
+ * If the template parameters seem confusing, imagine that:
+ *    - \c PredT is \c bool(T)
+ *
+ * @param inputVec  Input vector
+ * @param predicate Function to test each element with
+ *
+ * @return A vector with the filtered elements from the input
+ */
+template<typename T, typename PredT>
+vector<T> vecFilter(const vector<T>& inputVec, PredT&& predicate)
+{
+	const size_t n = inputVec.size();
+	printVec(inputVec, "inputVec");
+
+	/**
+	 * MAP: Store the predicate result (0 or 1) for every input value.
+	 * Not a \c vector<bool>: its elements are packed bits, so parallel
+	 * writes to neighbouring elements from \c doMap would be a data race.
+	 */
+	vector<int> bolMatch(n);
+	doMap(bolMatch, inputVec, predicate, n);
+	printVec(bolMatch, "bolMatch");
+
+	/**
+	 * SCAN: Compute the prefix sum of the \c bolMatch vector. The
+	 * resulting \c ixMatch vector tells us the index in the output
+	 * vector where a given input value needs to be stored (plus 1).
+	 */
+	const auto scanFunc = [](size_t a, size_t b) { return a + b; };
+	const size_t identity = 0;
+	vector<size_t> ixMatch(n);
+	/**
+	 * We can calculate the output length (i.e. the number of values
+	 * that passed the predicate) using a REDUCE operation. However,
+	 * what \c doScan returns (the summary computed over the entire
+	 * range) is exactly what a REDUCE operation would return.
+	 */
+	const size_t outSize = doScan(ixMatch, bolMatch, identity, scanFunc, n);
+	printVec(ixMatch, "ixMatch");
+
+	/**
+	 * JOIN: Using \c bolMatch and \c ixMatch from previous steps,
+	 * copy the input elements that passed the predicate (i.e. for
+	 * which \c bolMatch is non-zero) to the output vector, making
+	 * use of \c ixMatch to know in which position the elements
+	 * should be inserted.
+	 */
+	vector<T> filteredVec(outSize);
+	doFilter(filteredVec, inputVec, bolMatch, ixMatch, n);
+	printVec(filteredVec, "filteredVec");
+
+	return filteredVec;
+}
 
-    static int a = 10;
-    static vector<int> x{7,1,0,13,0,15,20,-1};
+int main()
+{
+	const vector<int> input{7, 1, 0, 13, 0, 15, 20, -1};
+	const auto predicate = [](int x) { return x > 10; };
 
 	tbb::tick_count t0 = tbb::tick_count::now();
- 	
-    //MAP operation
-    vector<int> bolMatch = doMAP(a,&x[0], x.size());
- 	
-    cout << "Map vector: "<< endl;
-    for (int i: bolMatch){
-        cout << i << ',';
-    }
-    cout << endl;
-
-    //SCAN
-    vector<int> ixMatch(x.size());
-    int sum = doSCAN(&ixMatch[0], &bolMatch[0],  x.size()); //get index order
-
-    cout << "Scan vector: " << sum << endl;
-    for (int i: ixMatch){
-        cout << i << ',';
-    }
-    cout << endl;
-
-    //JOIN
-    vector<int> filtered_results(sum);
-    doMAPFilter(&bolMatch[0],&ixMatch[0],&x[0],&filtered_results[0], x.size());
-
-
-    cout << "Filtered vector: " << endl;
-    for (int i: filtered_results){
-        cout << i << ',';
-    }
-    cout << endl;
-
- 	cout << "\nTime: " << (tbb::tick_count::now()-t0).seconds() << "seconds" << endl;
-
- 	return 0;
-
- }
+
+	const vector<int> output = vecFilter(input, predicate);
+
+	/* NOTE: this includes the time spent printing the contents of vectors */
+	cout << "\nTime: " << (tbb::tick_count::now() - t0).seconds() << " seconds" << endl;
+	return 0;
+}
diff --git a/6_Example_PackingProblem/seqvsparallel.cpp b/6_Example_PackingProblem/seqvsparallel.cpp
index c7c9e49..52d0d2e 100644
--- a/6_Example_PackingProblem/seqvsparallel.cpp
+++ b/6_Example_PackingProblem/seqvsparallel.cpp
@@ -12,94 +12,127 @@ using namespace std;
 using namespace oneapi;
 using namespace tbb;
 
-template <class T>
+/**
+ * Selects between two versions of the inner loop of the parallel scan
+ * body: set to 1 for the optimised version, or to 0 for the plain one.
+ * The optimised version is more verbose, but avoids evaluating the
+ * same condition each and every loop iteration.
+ *
+ * When not using compiler optimisations, the unoptimised version is
+ * much worse (about +33% exec time for ARRAY_SIZE = 100000000). But
+ * with \c -O3 optimisations both approaches perform about the same,
+ * i.e. the compiler is smart enough to optimise this if allowed to.
+ *
+ * This shows that manual optimisations (which often make the source
+ * code harder to read and maintain) may be useless when things like
+ * the compiler can automatically perform the same optimisations. So
+ * consider profiling / benchmarking before wasting time optimising.
+ */
+#define USE_OPTIMISED_LOOP 0
+
+template<typename T>
 class Body
 {
-    T reduced_result;
-    T* const y;
-    const T* const x;
-
-    public:
-
-    Body( T y_[], const T x_[] ) : reduced_result(0), x(x_), y(y_) {}
-
-    T get_reduced_result() const {return reduced_result;}
-
-    template<typename Tag>
-    void operator()( const blocked_range<int>& r, Tag )
-    {
-        T temp = reduced_result;
-
-        for( int i=r.begin(); i<r.end(); ++i )
-        {
-            temp = temp+x[i];
-            if( Tag::is_final_scan() )
-            y[i] = temp;
-        }
-
-        reduced_result = temp;
-    }
-
-    Body( Body& b, split ) : x(b.x), y(b.y), reduced_result(10) {}
-
-    void reverse_join( Body& a )
-    {
-        reduced_result = a.reduced_result + reduced_result;
-    }
-
-    void assign( Body& b )
-    {
-        reduced_result = b.reduced_result;
-    }
+	T sum;
+	T* const y;
+	const T* const x;
+
+public:
+	Body(T y_[], const T x_[]) : sum(0), x(x_), y(y_) {}
+
+	T get_sum() const { return sum; }
+
+	template<typename Tag>
+	void operator()(const blocked_range<size_t>& r, Tag)
+	{
+#if USE_OPTIMISED_LOOP
+		/**
+		 * Evaluate \c Tag::is_final_scan() once outside the
+		 * loop, but we need to have two separate loops.
+		 */
+		if (Tag::is_final_scan()) {
+			for (size_t i = r.begin(); i < r.end(); ++i) {
+				sum = sum + x[i];
+				y[i] = sum;
+			}
+		} else {
+			for (size_t i = r.begin(); i < r.end(); ++i) {
+				sum = sum + x[i];
+			}
+		}
+#else
+		/**
+		 * Less verbose, but \c Tag::is_final_scan() is
+		 * evaluated each and every iteration. Or is it?
+		 */
+		for (size_t i = r.begin(); i < r.end(); ++i) {
+			sum = sum + x[i];
+			if (Tag::is_final_scan()) {
+				y[i] = sum;
+			}
+		}
+#endif
+	}
+
+	Body(Body& b, split) : x(b.x), y(b.y), sum(0) {}
+
+	void reverse_join(Body& a)
+	{
+		sum = a.sum + sum;
+	}
+
+	void assign(Body& b)
+	{
+		sum = b.sum;
+	}
 };
 
-
-template<class T>
-float DoParallelScan( T y[], const T x[], int n)
+template<typename T>
+T DoParallelScan(T y[], const T x[], size_t n)
 {
-    Body<int> body(y,x);
-    tick_count t1,t2,t3,t4;
-    t1=tick_count::now();
-    parallel_scan( blocked_range<int>(0,n), body , auto_partitioner() );
-    t2=tick_count::now();
-    cout<<"Time Taken for parallel scan is \t"<<(t2-t1).seconds()<<endl;
-    return body.get_reduced_result();
+	Body<T> body(y, x);
+	const tick_count t0 = tick_count::now();
+	parallel_scan(blocked_range<size_t>(0, n), body);
+	const tick_count t1 = tick_count::now();
+	cout << "Time Taken for parallel scan is: " << (t1 - t0).seconds() << endl;
+	return body.get_sum();
 }
 
-
-template<class T1>
-float SerialScan(T1 y[], const T1 x[], int n)
+template<typename T>
+T DoSerialScan(T y[], const T x[], size_t n)
 {
-    tick_count t3,t4;
-
-    t3=tick_count::now();
-    T1 temp = 10;
-
-    for( int i=1; i<n; ++i )
-    {
-        temp = temp+x[i];
-        y[i] = temp;
-    }
-    t4=tick_count::now();
-    cout<<"Time Taken for serial  scan is \t"<<(t4-t3).seconds()<<endl;
-    return temp;
-
+	const tick_count t0 = tick_count::now();
+	T temp = 0;
+	for (size_t i = 0; i < n; ++i) {
+		temp = temp + x[i];
+		y[i] = temp;
+	}
+	const tick_count t1 = tick_count::now();
+	cout << "Time Taken for   serial scan is: " << (t1 - t0).seconds() << endl;
+	return temp;
 }
 
+/**
+ * The size of the input and output arrays used to perform the serial
+ * and parallel scan operations. If the arrays do not fit in RAM then
+ * performance will be awful and/or the program may get killed.
+ */
+static const size_t ARRAY_SIZE = 1000000;
 
 int main()
 {
+	/* Very large C-style arrays would live on the stack and can overflow it; vectors use the heap */
+	std::vector<long long> y1(ARRAY_SIZE), x1(ARRAY_SIZE);
 
-    int y1[100000],x1[100000];
-
-    for(int i=0;i<100000;i++)
-        x1[i]=i;
-
-    cout<<fixed;
+	for (size_t i = 0; i < ARRAY_SIZE; i++) {
+		x1[i] = static_cast<long long>(i); /* 64-bit: the sum of 0..n-1 exceeds INT_MAX */
+	}
 
-    cout<<"\n serial scan output is \t"<<SerialScan(y1,x1,100000)<<endl;
+	const long long outSerial = DoSerialScan(y1.data(), x1.data(), ARRAY_SIZE);
+	const long long outParallel = DoParallelScan(y1.data(), x1.data(), ARRAY_SIZE);
 
-    cout<<"\n parallel scan output is \t"<<DoParallelScan(y1,x1,100000)<<endl;
+	cout << "  serial scan output is \t" << outSerial << endl;
+	cout << "parallel scan output is \t" << outParallel << endl;
 
-    return 0;
+	return 0;
 }