From a5c007f271e6e905cffdfd8f4041329a24c53d39 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Thu, 19 Oct 2023 11:28:44 -0700
Subject: [PATCH 01/14] Copied over OpenMP offload dir to OpenACC to use as
 starter code

---
 openacc/Makefile     | 100 +++++++++
 openacc/init.c       | 185 ++++++++++++++++
 openacc/io.c         | 331 ++++++++++++++++++++++++++++
 openacc/main.c       |  86 ++++++++
 openacc/material.c   | 123 +++++++++++
 openacc/rsbench.h    | 139 ++++++++++++
 openacc/simulation.c | 504 +++++++++++++++++++++++++++++++++++++++++++
 openacc/utils.c      |  29 +++
 8 files changed, 1497 insertions(+)
 create mode 100644 openacc/Makefile
 create mode 100644 openacc/init.c
 create mode 100644 openacc/io.c
 create mode 100644 openacc/main.c
 create mode 100644 openacc/material.c
 create mode 100644 openacc/rsbench.h
 create mode 100644 openacc/simulation.c
 create mode 100644 openacc/utils.c

diff --git a/openacc/Makefile b/openacc/Makefile
new file mode 100644
index 0000000..e6b7438
--- /dev/null
+++ b/openacc/Makefile
@@ -0,0 +1,100 @@
+#===============================================================================
+# User Options
+#===============================================================================
+
+COMPILER  = llvm
+OPTIMIZE  = yes
+DEBUG     = no
+PROFILE   = no
+
+#===============================================================================
+# Program name & source code list
+#===============================================================================
+
+program = rsbench
+
+source = \
+main.c \
+simulation.c\
+io.c \
+init.c \
+material.c \
+utils.c
+
+obj = $(source:.c=.o)
+
+#===============================================================================
+# Sets Flags
+#===============================================================================
+
+# Standard Flags
+CFLAGS := -std=gnu99 -Wall
+
+# Linker Flags
+LDFLAGS = -lm
+
+# Intel Compiler
+ifeq ($(COMPILER),intel)
+  CC = icx 
+  CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
+endif
+
+# LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),llvm)
+  CC = clang
+  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
+endif
+
+# IBM XL Compiler
+ifeq ($(COMPILER),ibm)
+  CC = xlc_r
+  CFLAGS += -qsmp=omp -qoffload
+endif
+
+# NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),nvidia)
+  CC = nvc
+  CFLAGS += -mp=gpu -Minfo=mp -gpu=cc80
+endif
+
+# AOMP Targeting MI100 -- Change march to Target Other GPUs
+ifeq ($(COMPILER),amd)
+  CC = clang
+  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+endif
+
+# Debug Flags
+ifeq ($(DEBUG),yes)
+  CFLAGS += -g
+  LDFLAGS += -g
+endif
+
+# Profiling Flags
+ifeq ($(PROFILE),yes)
+  CFLAGS += -pg
+  LDFLAGS  += -pg
+endif
+
+# Optimization Flags
+ifeq ($(OPTIMIZE),yes)
+  CFLAGS += -O3
+endif
+
+#===============================================================================
+# Targets to Build
+#===============================================================================
+
+$(program): $(obj) rsbench.h Makefile
+	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
+
+%.o: %.c rsbench.h Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf rsbench $(obj)
+
+edit:
+	vim -p $(source) rsbench.h
+
+run:
+	./rsbench
diff --git a/openacc/init.c b/openacc/init.c
new file mode 100644
index 0000000..2445acf
--- /dev/null
+++ b/openacc/init.c
@@ -0,0 +1,185 @@
+#include "rsbench.h"
+
+SimulationData initialize_simulation( Input input )
+{
+	uint64_t seed = INITIALIZATION_SEED;
+	
+	// Get material data
+	printf("Loading Hoogenboom-Martin material data...\n");
+	SimulationData SD = get_materials( input, &seed ); 
+	
+	// Allocate & fill energy grids
+	printf("Generating resonance distributions...\n");
+	SD.n_poles = generate_n_poles( input, &seed );
+	SD.length_n_poles = input.n_nuclides;
+
+	// Allocate & fill Window grids
+	printf("Generating window distributions...\n");
+	SD.n_windows = generate_n_windows( input, &seed );
+	SD.length_n_windows = input.n_nuclides;
+
+	// Prepare full resonance grid
+	printf("Generating resonance parameter grid...\n");
+	SD.poles = generate_poles( input, SD.n_poles, &seed, &SD.max_num_poles );
+	SD.length_poles = input.n_nuclides * SD.max_num_poles;
+
+	// Prepare full Window grid
+	printf("Generating window parameter grid...\n");
+	SD.windows = generate_window_params( input, SD.n_windows, SD.n_poles, &seed, &SD.max_num_windows);
+	SD.length_windows = input.n_nuclides * SD.max_num_windows;
+
+	// Prepare 0K Resonances
+	printf("Generating 0K l_value data...\n");
+	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
+	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
+
+	return SD;
+}
+
+int * generate_n_poles( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_poles * input.n_nuclides;
+
+	int * R = (int *) malloc( input.n_nuclides * sizeof(int));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+int * generate_n_windows( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_windows * input.n_nuclides;
+
+	int * R = (int *) malloc( input.n_nuclides * sizeof(int));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles )
+{
+	// Pole Scaling Factor -- Used to bias hitting of the fast Faddeeva
+	// region to approximately 99.5% (i.e., only 0.5% of lookups should
+	// require the full eval).
+	double f = 152.5;
+	RSComplex f_c = {f, 0};
+
+	int max_poles = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_poles[i] > max_poles)
+			max_poles = n_poles[i];
+	}
+	*max_num_poles = max_poles;
+
+	// Allocating 2D matrix as a 1D contiguous vector
+	Pole * R = (Pole *) malloc( input.n_nuclides * max_poles * sizeof(Pole));
+	
+	// fill with data
+	for( int i = 0; i < input.n_nuclides; i++ )
+		for( int j = 0; j < n_poles[i]; j++ )
+		{
+			double r = LCG_random_double(seed);
+			double im = LCG_random_double(seed);
+			RSComplex t1 = {r, im};
+			R[i * max_poles + j].MP_EA = c_mul(f_c,t1);
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t2 = {f*r, im};
+			R[i * max_poles + j].MP_RT = t2;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t3 = {f*r, im};
+			R[i * max_poles + j].MP_RA = t3;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t4 = {f*r, im};
+			R[i * max_poles + j].MP_RF = t4;
+			R[i * max_poles + j].l_value = LCG_random_int(seed) % input.numL;
+		}
+	
+	/* Debug
+	for( int i = 0; i < input.n_nuclides; i++ )
+		for( int j = 0; j < n_poles[i]; j++ )
+			printf("R[%d][%d]: Eo = %lf lambda_o = %lf Tn = %lf Tg = %lf Tf = %lf\n", i, j, R[i * max_poles + j].Eo, R[i * max_poles + j].lambda_o, R[i * max_poles + j].Tn, R[i * max_poles + j].Tg, R[i * max_poles + j].Tf);
+	*/
+
+	return R;
+}
+
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows )
+{
+	int max_windows = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_windows[i] > max_windows)
+			max_windows = n_windows[i];
+	}
+	*max_num_windows = max_windows;
+
+	// Allocating 2D contiguous matrix
+	Window * R = (Window *) malloc( input.n_nuclides * max_windows * sizeof(Window));
+	
+	// fill with data
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		int space = n_poles[i] / n_windows[i];
+		int remainder = n_poles[i] - space * n_windows[i];
+		int ctr = 0;
+		for( int j = 0; j < n_windows[i]; j++ )
+		{
+			R[i * max_windows + j].T = LCG_random_double(seed);
+			R[i * max_windows + j].A = LCG_random_double(seed);
+			R[i * max_windows + j].F = LCG_random_double(seed);
+			R[i * max_windows + j].start = ctr; 
+			R[i * max_windows + j].end = ctr + space - 1;
+
+			ctr += space;
+
+			if ( j < remainder )
+			{
+				ctr++;
+				R[i * max_windows + j].end++;
+			}
+		}
+	}
+
+	return R;
+}
+
+double * generate_pseudo_K0RS( Input input, uint64_t * seed )
+{
+	double * R = (double *) malloc( input.n_nuclides * input.numL * sizeof(double));
+
+	for( int i = 0; i < input.n_nuclides; i++)
+		for( int j = 0; j < input.numL; j++ )
+			R[i * input.numL + j] = LCG_random_double(seed);
+
+	return R;
+}
diff --git a/openacc/io.c b/openacc/io.c
new file mode 100644
index 0000000..e15ce16
--- /dev/null
+++ b/openacc/io.c
@@ -0,0 +1,331 @@
+#include "rsbench.h"
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+"                    _____   _____ ____                  _     \n"
+"                   |  __ \\ / ____|  _ \\                | |    \n"
+"                   | |__) | (___ | |_) | ___ _ __   ___| |__  \n"
+"                   |  _  / \\___ \\|  _ < / _ \\ '_ \\ / __| '_ \\ \n"
+"                   | | \\ \\ ____) | |_) |  __/ | | | (__| | | |\n"
+"                   |_|  \\_\\_____/|____/ \\___|_| |_|\\___|_| |_|\n\n"
+	);
+	border_print();
+	center_print("Developed at Argonne National Laboratory", 79);
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( int a )
+{
+    if( a < 1000 )
+        printf("%d\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%d,%03d\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%d,%03d,%03d,%03d\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%d\n",a);
+}
+
+Input read_CLI( int argc, char * argv[] )
+{
+	Input input;
+
+	// defaults to the history based simulation method
+	input.simulation_method = HISTORY_BASED;
+	// defaults to max threads on the system	
+	input.nthreads = 1;
+	// defaults to 355 (corresponding to H-M Large benchmark)
+	input.n_nuclides = 355;
+	// defaults to 300,000
+	input.particles = 300000;
+	// defaults to 34
+	input.lookups = 34;
+	// defaults to H-M Large benchmark
+	input.HM = LARGE;
+	// defaults to 3000 resonancs (avg) per nuclide
+	input.avg_n_poles = 1000;
+	// defaults to 100
+	input.avg_n_windows = 100;
+	// defaults to 4;
+	input.numL = 4;
+	// defaults to no temperature dependence (Doppler broadening)
+	input.doppler = 1;
+	// defaults to baseline simulation kernel
+	input.kernel_id = 0;
+	
+	int default_lookups = 1;
+	int default_particles = 1;
+
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// Simulation Method (-m)
+		if( strcmp(arg, "-m") == 0 )
+		{
+			char * sim_type = NULL;
+			if( ++i < argc )
+				sim_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(sim_type, "history") == 0 )
+				input.simulation_method = HISTORY_BASED;
+			else if( strcmp(sim_type, "event") == 0 )
+			{
+				input.simulation_method = EVENT_BASED;
+				// Also resets default # of lookups
+				if( default_lookups && default_particles )
+				{
+					input.lookups =  input.lookups * input.particles;
+					input.particles = 0;
+				}
+			}
+			else
+				print_CLI_error();
+		}
+		// lookups (-l)
+		else if( strcmp(arg, "-l") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.lookups = atoi(argv[i]);
+				default_lookups = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// particles (-p)
+		else if( strcmp(arg, "-p") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.particles = atoi(argv[i]);
+				default_particles = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// nuclides (-n)
+		else if( strcmp(arg, "-n") == 0 )
+		{
+			if( ++i < argc )
+				input.n_nuclides = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// HM (-s)
+		else if( strcmp(arg, "-s") == 0 )
+		{	
+			if( ++i < argc )
+			{
+				if( strcmp(argv[i], "small") == 0 )
+					input.HM = SMALL;
+				else if ( strcmp(argv[i], "large") == 0 )
+					input.HM = LARGE;
+				else
+					print_CLI_error();
+			}
+			else
+				print_CLI_error();
+		}
+		// Doppler Broadening (Temperature Dependence)
+		else if( strcmp(arg, "-d") == 0 )
+		{	
+			input.doppler = 0;
+		}
+		// Avg number of windows per nuclide (-w)
+		else if( strcmp(arg, "-W") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_windows = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Avg number of poles per nuclide (-p)
+		else if( strcmp(arg, "-P") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_poles = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Kernel ID (-k)
+		else if( strcmp(arg, "-k") == 0 )
+		{
+			if( ++i < argc )
+				input.kernel_id = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+
+	// Validate nthreads
+	if( input.nthreads < 1 )
+		print_CLI_error();
+	
+	// Validate n_isotopes
+	if( input.n_nuclides < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.lookups < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_poles < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_windows < 1 )
+		print_CLI_error();
+	
+	// Set HM size specific parameters
+	// (defaults to large)
+	if( input.HM == SMALL )
+		input.n_nuclides = 68;
+
+	// Return input struct
+	return input;
+}
+
+void print_CLI_error(void)
+{
+	printf("Usage: ./multibench <options>\n");
+	printf("Options include:\n");
+	printf("  -s <size>        Size of H-M Benchmark to run (small, large)\n");
+	printf("  -l <lookups>     Number of Cross-section (XS) lookups per particle history\n");
+	printf("  -p <particles>   Number of particle histories\n");
+	printf("  -P <poles>       Average Number of Poles per Nuclide\n");
+	printf("  -W <poles>       Average Number of Windows per Nuclide\n");
+	printf("  -d               Disables Temperature Dependence (Doppler Broadening)\n");
+	printf("Default is equivalent to: -s large -l 34 -p 300000 -P 1000 -W 100\n");
+	printf("See readme for full description of default run values\n");
+	exit(4);
+}
+
+void print_input_summary(Input input)
+{
+	// Calculate Estimate of Memory Usage
+	size_t mem = get_mem_estimate(input);
+
+	printf("Programming Model:           OpenMP Taget Offloading\n");
+	if( input.simulation_method == EVENT_BASED )
+		printf("Simulation Method:           Event Based\n");
+	else
+		printf("Simulation Method:           History Based\n");
+	printf("Materials:                   12\n");
+	printf("H-M Benchmark Size:          ");
+	if( input.HM == 0 )
+		printf("Small\n");
+	else
+		printf("Large\n");
+	if( input.doppler == 1 )
+		printf("Temperature Dependence:      ON\n");
+	else
+		printf("Temperature Dependence:      OFF\n");
+	printf("Total Nuclides:              %d\n", input.n_nuclides);
+	printf("Avg Poles per Nuclide:       "); fancy_int(input.avg_n_poles);
+	printf("Avg Windows per Nuclide:     "); fancy_int(input.avg_n_windows);
+
+	int lookups = input.lookups;
+	if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("Particles:                   "); fancy_int(input.particles);
+		printf("XS Lookups per Particle:     "); fancy_int(input.lookups);
+		lookups *= input.particles;
+	}
+	printf("Total XS Lookups:            "); fancy_int(lookups);
+	printf("Est. Memory Usage (MB):      %.1lf\n", mem / 1024.0 / 1024.0);
+}
+
+int validate_and_print_results(Input input, double runtime, unsigned long vhash)
+{
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
+	printf("Runtime:               %.3lf seconds\n", runtime);
+	int lookups = 0;
+	if( input.simulation_method == HISTORY_BASED )
+		lookups = input.lookups*input.particles;
+	else
+		lookups = input.lookups;
+	printf("Lookups:               "); fancy_int(lookups);
+	printf("Lookups/s:             "); fancy_int((double) lookups / (runtime));
+
+	int is_invalid = 1;
+
+	unsigned long long large = 0;
+	unsigned long long small = 0;
+	if(input.simulation_method == HISTORY_BASED )
+	{
+		large = 351485;
+		small = 879693;
+	}
+	else if( input.simulation_method == EVENT_BASED )
+	{
+		large = 358389;
+		small = 880018;
+	}
+
+	if( input.HM  == LARGE )
+	{
+		if( vhash == large )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+	else if( input.HM  == SMALL )
+	{
+		if( vhash == small )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+
+	return is_invalid;
+}
diff --git a/openacc/main.c b/openacc/main.c
new file mode 100644
index 0000000..0f88ad8
--- /dev/null
+++ b/openacc/main.c
@@ -0,0 +1,86 @@
+#include "rsbench.h"
+
+int main(int argc, char * argv[])
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+
+	int version = 13;
+	double start, stop;
+	
+	// Process CLI Fields
+	Input input = read_CLI( argc, argv );
+	
+	// =====================================================================
+	// Print-out of Input Summary
+	// =====================================================================
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	print_input_summary(input);
+
+	// =====================================================================
+	// Intialize Simulation Data Structures
+	// =====================================================================
+	border_print();
+	center_print("INITIALIZATION", 79);
+	border_print();
+	
+	start = get_time();
+	
+	SimulationData SD = initialize_simulation( input );
+
+	stop = get_time();
+
+	printf("Initialization Complete. (%.2lf seconds)\n", stop-start);
+	
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation Begins
+	// =====================================================================
+	border_print();
+	center_print("SIMULATION", 79);
+	border_print();
+
+	unsigned long vhash = 0;
+
+	// Run Simulation
+	start = get_time();
+
+	// Run simulation
+	if( input.simulation_method == EVENT_BASED )
+	{
+		if( input.kernel_id == 0 )
+			run_event_based_simulation(input, SD, &vhash );
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", input.kernel_id);
+			exit(1);
+		}
+	}
+	else if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	stop = get_time();
+
+	// Final hash step
+	vhash = vhash % 999983;
+
+	printf("Simulation Complete.\n");
+
+	// =====================================================================
+	// Print / Save Results and Exit
+	// =====================================================================
+	border_print();
+	center_print("RESULTS", 79);
+	border_print();
+
+	int is_invalid = validate_and_print_results(input, stop-start, vhash);
+
+	border_print();
+
+	return is_invalid;
+}
diff --git a/openacc/material.c b/openacc/material.c
new file mode 100644
index 0000000..791565b
--- /dev/null
+++ b/openacc/material.c
@@ -0,0 +1,123 @@
+#include "rsbench.h"
+
+// Handles all material creation tasks - returns Material struct
+SimulationData get_materials(Input input, uint64_t * seed)
+{
+	SimulationData SD;
+
+	SD.num_nucs = load_num_nucs(input);
+	SD.length_num_nucs = 12;
+
+	SD.mats = load_mats(input, SD.num_nucs, &SD.max_num_nucs, &SD.length_mats);
+
+	SD.concs = load_concs(SD.num_nucs, seed, SD.max_num_nucs);
+	SD.length_concs = 12 * SD.max_num_nucs; 
+
+	return SD;
+}
+
+// num_nucs represents the number of nuclides that each material contains
+int * load_num_nucs(Input input)
+{
+	int * num_nucs = (int*)malloc(12*sizeof(int));
+	
+	// Material 0 is a special case (fuel). The H-M small reactor uses
+	// 34 nuclides, while H-M larges uses 300.
+	if( input.n_nuclides == 68 )
+		num_nucs[0]  = 34; // HM Small is 34, H-M Large is 321
+	else
+		num_nucs[0]  = 321; // HM Small is 34, H-M Large is 321
+
+	num_nucs[1]  = 5;
+	num_nucs[2]  = 4;
+	num_nucs[3]  = 4;
+	num_nucs[4]  = 27;
+	num_nucs[5]  = 21;
+	num_nucs[6]  = 21;
+	num_nucs[7]  = 21;
+	num_nucs[8]  = 21;
+	num_nucs[9]  = 21;
+	num_nucs[10] = 9;
+	num_nucs[11] = 9;
+
+	return num_nucs;
+}
+
+// Assigns an array of nuclide ID's to each material
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats )
+{
+	*max_num_nucs = 0;
+	int num_mats = 12;
+	for( int m = 0; m < num_mats; m++ )
+	{
+		if( num_nucs[m] > *max_num_nucs )
+			*max_num_nucs = num_nucs[m];
+	}
+	int * mats = (int *) malloc( num_mats * (*max_num_nucs) * sizeof(int) );
+	*length_mats = num_mats * (*max_num_nucs);
+
+	// Small H-M has 34 fuel nuclides
+	int mats0_Sml[] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53, 
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	// Large H-M has 300 fuel nuclides
+	int mats0_Lrg[321] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	for( int i = 0; i < 321-34; i++ )
+		mats0_Lrg[34+i] = 68 + i; // H-M large adds nuclides to fuel only
+	
+	// These are the non-fuel materials	
+	int mats1[] =  { 63, 64, 65, 66, 67 }; // cladding
+	int mats2[] =  { 24, 41, 4, 5 }; // cold borated water
+	int mats3[] =  { 24, 41, 4, 5 }; // hot borated water
+	int mats4[] =  { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
+	                 30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
+	                 17 }; // RPV
+	int mats5[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // lower radial reflector
+	int mats6[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top reflector / plate
+	int mats7[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom plate
+	int mats8[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom nozzle
+	int mats9[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top nozzle
+	int mats10[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top of FA's
+	int mats11[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // bottom FA's
+	
+	// H-M large v small dependency
+	if( input.n_nuclides == 68 )
+		memcpy( mats,  mats0_Sml,  num_nucs[0]  * sizeof(int) );	
+	else
+		memcpy( mats,  mats0_Lrg,  num_nucs[0]  * sizeof(int) );
+	
+	// Copy other materials
+	memcpy( mats + *max_num_nucs * 1,  mats1,  num_nucs[1]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 2,  mats2,  num_nucs[2]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 3,  mats3,  num_nucs[3]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 4,  mats4,  num_nucs[4]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 5,  mats5,  num_nucs[5]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 6,  mats6,  num_nucs[6]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 7,  mats7,  num_nucs[7]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 8,  mats8,  num_nucs[8]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 9,  mats9,  num_nucs[9]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 10, mats10, num_nucs[10] * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 11, mats11, num_nucs[11] * sizeof(int) );	
+
+	return mats;
+}
+
+// Creates a randomized array of 'concentrations' of nuclides in each mat
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs )
+{
+	double * concs = (double *) malloc( 12 * max_num_nucs * sizeof( double ) );
+
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			concs[i * max_num_nucs + j] = LCG_random_double(seed);
+
+	return concs;
+}
+
diff --git a/openacc/rsbench.h b/openacc/rsbench.h
new file mode 100644
index 0000000..98e6bac
--- /dev/null
+++ b/openacc/rsbench.h
@@ -0,0 +1,139 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include<math.h>
+#include<string.h>
+#include<stdint.h>
+#include<float.h>
+#include<omp.h>
+#include<assert.h>
+
+#define OPENMP
+
+#define PI 3.14159265359
+
+// typedefs
+typedef enum __hm{SMALL, LARGE, XL, XXL} HM_size;
+
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+#define STARTING_SEED 1070
+#define INITIALIZATION_SEED 42
+
+typedef struct{
+	double r;
+	double i;
+} RSComplex;
+
+typedef struct{
+	int nthreads;
+	int n_nuclides;
+	int lookups;
+	HM_size HM;
+	int avg_n_poles;
+	int avg_n_windows;
+	int numL;
+	int doppler;
+	int particles;
+	int simulation_method;
+	int kernel_id;
+} Input;
+
+typedef struct{
+	RSComplex MP_EA;
+	RSComplex MP_RT;
+	RSComplex MP_RA;
+	RSComplex MP_RF;
+	short int l_value;
+} Pole;
+
+typedef struct{
+	double T;
+	double A;
+	double F;
+	int start;
+	int end;
+} Window;
+
+typedef struct{
+	int * n_poles;
+	unsigned long length_n_poles;
+	int * n_windows;
+	unsigned long length_n_windows;
+	Pole * poles;
+	unsigned long length_poles;
+	Window * windows;
+	unsigned long length_windows;
+	double * pseudo_K0RS;
+	unsigned long length_pseudo_K0RS;
+	int * num_nucs;
+	unsigned long length_num_nucs;
+	int * mats;
+	unsigned long length_mats;
+	double * concs;
+	unsigned long length_concs;
+	int max_num_nucs;
+	int max_num_poles;
+	int max_num_windows;
+	double * p_energy_samples;
+	unsigned long length_p_energy_samples;
+	int * mat_samples;
+	unsigned long length_mat_samples;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+Input read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_input_summary(Input input);
+int validate_and_print_results(Input input, double runtime, unsigned long vhash);
+
+// init.c
+SimulationData initialize_simulation( Input input );
+int * generate_n_poles( Input input,  uint64_t * seed );
+int * generate_n_windows( Input input ,  uint64_t * seed);
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
+double * generate_pseudo_K0RS( Input input, uint64_t * seed );
+
+// material.c
+int * load_num_nucs(Input input);
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
+SimulationData get_materials(Input input, uint64_t * seed);
+
+// utils.c
+size_t get_mem_estimate( Input input );
+RSComplex fast_cexp( RSComplex z );
+double get_time(void);
+
+// xs_kernel.c
+RSComplex fast_nuclear_W( RSComplex Z );
+void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles ) ;
+void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
+void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+
+// simulation.c
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+void run_history_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+double LCG_random_double(uint64_t * seed);
+uint64_t LCG_random_int(uint64_t * seed);
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+void run_event_based_simulation_optimization_1(Input in, SimulationData SD, unsigned long * vhash_result );
+int pick_mat( uint64_t * seed );
+void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors );
+
+// rscomplex.c
+RSComplex c_add( RSComplex A, RSComplex B);
+RSComplex c_sub( RSComplex A, RSComplex B);
+RSComplex c_mul( RSComplex A, RSComplex B);
+RSComplex c_div( RSComplex A, RSComplex B);
+double c_abs( RSComplex A);
+
+// papi.c
+void counter_init( int *eventset, int *num_papi_events );
+void counter_stop( int * eventset, int num_papi_events );
diff --git a/openacc/simulation.c b/openacc/simulation.c
new file mode 100644
index 0000000..c4df586
--- /dev/null
+++ b/openacc/simulation.c
@@ -0,0 +1,504 @@
+#include "rsbench.h"
+
+////////////////////////////////////////////////////////////////////////////////////
+// BASELINE FUNCTIONS
+////////////////////////////////////////////////////////////////////////////////////
+// All "baseline" code is at the top of this file. The baseline code is a simple
+// implementation of the algorithm, with only minor CPU optimizations in place.
+// Following these functions are a number of optimized variants,
+// which each deploy a different combination of optimizations strategies. By
+// default, RSBench will only run the baseline implementation. Optimized variants
+// must be specifically selected using the "-k <optimized variant ID>" command
+// line argument.
+////////////////////////////////////////////////////////////////////////////////////
+
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result )
+{
+	printf("Beginning baseline event based simulation on device...\n");
+	unsigned long long * verification = (unsigned long long *) malloc(input.lookups * sizeof(unsigned long long));
+
+	int offloaded_to_device = 0;
+
+	// Main simulation loop over macroscopic cross section lookups
+
+	#pragma omp target teams distribute parallel for\
+	map(to:data.n_poles[:data.length_n_poles])\
+	map(to:data.n_windows[:data.length_n_windows])\
+	map(to:data.poles[:data.length_poles])\
+	map(to:data.windows[:data.length_windows])\
+	map(to:data.pseudo_K0RS[:data.length_pseudo_K0RS])\
+	map(to:data.num_nucs[:data.length_num_nucs])\
+	map(to:data.mats[:data.length_mats])\
+	map(to:data.concs[:data.length_concs])\
+	map(to:data.max_num_nucs)\
+	map(to:data.max_num_poles)\
+	map(to:data.max_num_windows)\
+	map(tofrom:offloaded_to_device)\
+  map(from:verification[:input.lookups])
+	for( int i = 0; i < input.lookups; i++ )
+	{
+		// Set the initial seed value
+		uint64_t seed = STARTING_SEED;	
+
+		// Forward seed to lookup index (we need 2 samples per lookup)
+		seed = fast_forward_LCG(seed, 2*i);
+
+		// Randomly pick an energy and material for the particle
+		double E = LCG_random_double(&seed);
+		int mat  = pick_mat(&seed);
+
+		double macro_xs[4] = {0};
+
+		calculate_macro_xs( macro_xs, mat, E, input, data.num_nucs, data.mats, data.max_num_nucs, data.concs, data.n_windows, data.pseudo_K0RS, data.windows, data.poles, data.max_num_windows, data.max_num_poles );
+
+		// For verification, and to prevent the compiler from optimizing
+		// all work out, we interrogate the returned macro_xs_vector array
+		// to find its maximum value index, then increment the verification
+		// value by that index. In this implementation, we prevent thread
+		// contention by using an OMP reduction on it. For other accelerators,
+		// a different approach might be required (e.g., atomics, reduction
+		// of thread-specific values in large array via CUDA thrust, etc)
+		double max = -DBL_MAX;
+		int max_idx = 0;
+		for(int x = 0; x < 4; x++ )
+		{
+			if( macro_xs[x] > max )
+			{
+				max = macro_xs[x];
+				max_idx = x;
+			}
+		}
+		verification[i] = max_idx+1;
+
+		// Check if we are currently running on the device or not
+		if( i == 0 )
+			offloaded_to_device = !omp_is_initial_device();
+	}
+  
+  // Reduce validation hash on the host
+  unsigned long long validation_hash = 0;
+	for( int i = 0; i < input.lookups; i++ )
+    validation_hash += verification[i];
+
+	// Print if kernel actually ran on the device
+	if( offloaded_to_device )
+		printf( "Kernel ran accelerator device.\n" );
+	else
+		printf( "NOTE - Kernel ran on the host!\n" );
+
+	*vhash_result = validation_hash;
+}
+
+void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles ) 
+{
+	// zero out macro vector
+	for( int i = 0; i < 4; i++ )
+		macro_xs[i] = 0;
+
+	// for nuclide in mat
+	for( int i = 0; i < num_nucs[mat]; i++ )
+	{
+		double micro_xs[4];
+		int nuc = mats[mat * max_num_nucs + i];
+
+		if( input.doppler == 1 )
+			calculate_micro_xs_doppler( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+		else
+			calculate_micro_xs( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+
+		for( int j = 0; j < 4; j++ )
+		{
+			macro_xs[j] += micro_xs[j] * concs[mat * max_num_nucs + i];
+		}
+		// Debug
+		/*
+		printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+		E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+		*/
+	}
+
+	// Debug
+	/*
+	printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+	E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+	*/
+}
+
+// No Temperature dependence (i.e., 0K evaluation)
+void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles)
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		RSComplex PSIIKI;
+		RSComplex CDUM;
+		Pole pole = poles[nuc * max_num_poles + i];
+		RSComplex t1 = {0, 1};
+		RSComplex t2 = {sqrt(E), 0 };
+		PSIIKI = c_div( t1 , c_sub(pole.MP_EA,t2) );
+		RSComplex E_c = {E, 0};
+		CDUM = c_div(PSIIKI, E_c);
+		sigT += (c_mul(pole.MP_RT, c_mul(CDUM, sigTfactors[pole.l_value])) ).r;
+		sigA += (c_mul( pole.MP_RA, CDUM)).r;
+		sigF += (c_mul(pole.MP_RF, CDUM)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// Temperature Dependent Variation of Kernel
+// (This involves using the Complex Faddeeva function to
+// Doppler broaden the poles within the window)
+void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles )
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	double dopp = 0.5;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		Pole pole = poles[nuc * max_num_poles + i];
+
+		// Prep Z
+		RSComplex E_c = {E, 0};
+		RSComplex dopp_c = {dopp, 0};
+		RSComplex Z = c_mul(c_sub(E_c, pole.MP_EA), dopp_c);
+
+		// Evaluate Fadeeva Function
+		RSComplex faddeeva = fast_nuclear_W( Z );
+
+		// Update W
+		sigT += (c_mul( pole.MP_RT, c_mul(faddeeva, sigTfactors[pole.l_value]) )).r;
+		sigA += (c_mul( pole.MP_RA , faddeeva)).r;
+		sigF += (c_mul( pole.MP_RF , faddeeva)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// picks a material based on a probabilistic distribution
+int pick_mat( uint64_t * seed )
+{
+	// I have a nice spreadsheet supporting these numbers. They are
+	// the fractions (by volume) of material in the core. Not a 
+	// *perfect* approximation of where XS lookups are going to occur,
+	// but this will do a good job of biasing the system nonetheless.
+
+	double dist[12];
+	dist[0]  = 0.140;	// fuel
+	dist[1]  = 0.052;	// cladding
+	dist[2]  = 0.275;	// cold, borated water
+	dist[3]  = 0.134;	// hot, borated water
+	dist[4]  = 0.154;	// RPV
+	dist[5]  = 0.064;	// Lower, radial reflector
+	dist[6]  = 0.066;	// Upper reflector / top plate
+	dist[7]  = 0.055;	// bottom plate
+	dist[8]  = 0.008;	// bottom nozzle
+	dist[9]  = 0.015;	// top nozzle
+	dist[10] = 0.025;	// top of fuel assemblies
+	dist[11] = 0.013;	// bottom of fuel assemblies
+
+	double roll = LCG_random_double(seed);
+
+	// makes a pick based on the distro
+	for( int i = 0; i < 12; i++ )
+	{
+		double running = 0;
+		for( int j = i; j > 0; j-- )
+			running += dist[j];
+		if( roll < running )
+			return i;
+	}
+
+	return 0;
+}
+
+void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors )
+{
+	double phi;
+
+	for( int i = 0; i < 4; i++ )
+	{
+		phi = pseudo_K0RS[nuc * input.numL + i] * sqrt(E);
+
+		if( i == 1 )
+			phi -= - atan( phi );
+		else if( i == 2 )
+			phi -= atan( 3.0 * phi / (3.0 - phi*phi));
+		else if( i == 3 )
+			phi -= atan(phi*(15.0-phi*phi)/(15.0-6.0*phi*phi));
+
+		phi *= 2.0;
+
+		sigTfactors[i].r = cos(phi);
+		sigTfactors[i].i = -sin(phi);
+	}
+}
+
+// This function uses a combination of the Abrarov Approximation
+// and the QUICK_W three term asymptotic expansion.
+// Only expected to use Abrarov ~0.5% of the time.
+RSComplex fast_nuclear_W( RSComplex Z )
+{
+	// Abrarov 
+	if( c_abs(Z) < 6.0 )
+	{
+		// Precomputed parts for speeding things up
+		// (N = 10, Tm = 12.0)
+		RSComplex prefactor = {0, 8.124330e+01};
+		double an[10] = {
+			2.758402e-01,
+			2.245740e-01,
+			1.594149e-01,
+			9.866577e-02,
+			5.324414e-02,
+			2.505215e-02,
+			1.027747e-02,
+			3.676164e-03,
+			1.146494e-03,
+			3.117570e-04
+		};
+		double neg_1n[10] = {
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0
+		};
+
+		double denominator_left[10] = {
+			9.869604e+00,
+			3.947842e+01,
+			8.882644e+01,
+			1.579137e+02,
+			2.467401e+02,
+			3.553058e+02,
+			4.836106e+02,
+			6.316547e+02,
+			7.994380e+02,
+			9.869604e+02
+		};
+
+		RSComplex t1 = {0, 12};
+		RSComplex t2 = {12, 0};
+		RSComplex i = {0,1};
+		RSComplex one = {1, 0};
+		RSComplex W = c_div(c_mul(i, ( c_sub(one, fast_cexp(c_mul(t1, Z))) )) , c_mul(t2, Z));
+		RSComplex sum = {0,0};
+		for( int n = 0; n < 10; n++ )
+		{
+			RSComplex t3 = {neg_1n[n], 0};
+			RSComplex top = c_sub(c_mul(t3, fast_cexp(c_mul(t1, Z))), one);
+			RSComplex t4 = {denominator_left[n], 0};
+			RSComplex t5 = {144, 0};
+			RSComplex bot = c_sub(t4, c_mul(t5,c_mul(Z,Z)));
+			RSComplex t6 = {an[n], 0};
+			sum = c_add(sum, c_mul(t6, c_div(top,bot)));
+		}
+		W = c_add(W, c_mul(prefactor, c_mul(Z, sum)));
+		return W;
+	}
+	else
+	{
+		// QUICK_2 3 Term Asymptotic Expansion (Accurate to O(1e-6)).
+		// Pre-computed parameters
+		RSComplex a = {0.512424224754768462984202823134979415014943561548661637413182,0};
+		RSComplex b = {0.275255128608410950901357962647054304017026259671664935783653, 0};
+		RSComplex c = {0.051765358792987823963876628425793170829107067780337219430904, 0};
+		RSComplex d = {2.724744871391589049098642037352945695982973740328335064216346, 0};
+
+		RSComplex i = {0,1};
+		RSComplex Z2 = c_mul(Z, Z);
+		// Three Term Asymptotic Expansion
+		RSComplex W = c_mul(c_mul(Z,i), (c_add(c_div(a,(c_sub(Z2, b))) , c_div(c,(c_sub(Z2, d))))));
+
+		return W;
+	}
+}
+
+double LCG_random_double(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return (double) (*seed) / (double) m;
+}	
+
+uint64_t LCG_random_int(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return *seed;
+}	
+
+uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	uint64_t a = 2806196910506780709ULL;
+	uint64_t c = 1ULL;
+
+	n = n % m;
+
+	uint64_t a_new = 1;
+	uint64_t c_new = 0;
+
+	while(n > 0) 
+	{
+		if(n & 1)
+		{
+			a_new *= a;
+			c_new = c_new * a + c;
+		}
+		c *= (a + 1);
+		a *= a;
+
+		n >>= 1;
+	}
+
+	return (a_new * seed + c_new) % m;
+}
+
+// Complex arithmetic functions
+
+RSComplex c_add( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r + B.r;
+	C.i = A.i + B.i;
+	return C;
+}
+
+RSComplex c_sub( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r - B.r;
+	C.i = A.i - B.i;
+	return C;
+}
+
+RSComplex c_mul( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	C.r = (a*c) - (b*d);
+	C.i = (a*d) + (b*c);
+	return C;
+}
+
+RSComplex c_div( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	double denom = c*c + d*d;
+	C.r = ( (a*c) + (b*d) ) / denom;
+	C.i = ( (b*c) - (a*d) ) / denom;
+	return C;
+}
+
+double c_abs( RSComplex A)
+{
+	return sqrt(A.r*A.r + A.i * A.i);
+}
+
+
+// Fast (but inaccurate) exponential function
+// Written By "ACMer":
+// https://codingforspeed.com/using-faster-exponential-approximation/
+// We use our own to avoid small differences in compiler specific
+// exp() intrinsic implementations that make it difficult to verify
+// if the code is working correctly or not.
+double fast_exp(double x)
+{
+  x = 1.0 + x * 0.000244140625;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  return x;
+}
+
+// Implementation based on:
+// z = x + iy
+// cexp(z) = e^x * (cos(y) + i * sin(y))
+RSComplex fast_cexp( RSComplex z )
+{
+	double x = z.r;
+	double y = z.i;
+
+	// For consistency across architectures, we
+	// will use our own exponetial implementation
+	//double t1 = exp(x);
+	double t1 = fast_exp(x);
+	double t2 = cos(y);
+	double t3 = sin(y);
+	RSComplex t4 = {t2, t3};
+	RSComplex t5 = {t1, 0};
+	RSComplex result = c_mul(t5, (t4));
+	return result;
+}	
diff --git a/openacc/utils.c b/openacc/utils.c
new file mode 100644
index 0000000..e06b352
--- /dev/null
+++ b/openacc/utils.c
@@ -0,0 +1,29 @@
+#include "rsbench.h"
+
+size_t get_mem_estimate( Input input )
+{
+	size_t poles = input.n_nuclides * input.avg_n_poles * sizeof(Pole) + input.n_nuclides * sizeof(Pole *);
+	size_t windows = input.n_nuclides * input.avg_n_windows * sizeof(Window) + input.n_nuclides * sizeof(Window *);
+	size_t pseudo_K0RS = input.n_nuclides * input.numL * sizeof( double ) + input.n_nuclides * sizeof(double);
+	size_t other = input.n_nuclides * 2 * sizeof(int);
+
+	size_t total = poles + windows + pseudo_K0RS + other;
+	
+	return total;
+}
+
+double get_time(void)
+{
+	#ifdef OPENMP
+	return omp_get_wtime();
+	#endif
+
+	struct timeval timecheck;
+
+	gettimeofday(&timecheck, NULL);
+	long ms = (long)timecheck.tv_sec * 1000 + (long)timecheck.tv_usec / 1000;
+
+	double time = (double) ms / 1000.0;
+
+	return time;
+}

From 3dd2cf91514b0ac81d42f2f4d0ea687f6ed1d282 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Wed, 25 Oct 2023 11:45:39 -0700
Subject: [PATCH 02/14] intial effort at converting omp offloading to openacc

---
 openacc/Makefile     | 12 +++++++++---
 openacc/simulation.c | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/openacc/Makefile b/openacc/Makefile
index e6b7438..fa8748d 100644
--- a/openacc/Makefile
+++ b/openacc/Makefile
@@ -2,7 +2,7 @@
 # User Options
 #===============================================================================
 
-COMPILER  = llvm
+COMPILER  = gcc
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
@@ -36,13 +36,19 @@ LDFLAGS = -lm
 # Intel Compiler
 ifeq ($(COMPILER),intel)
   CC = icx 
-  CFLAGS += -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
+  CFLAGS += -fopenacc -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ 
+endif
+
+# GCC Compiler Targeting A100 -- Change SM Level to Target Other GPUs
+ifeq ($(COMPILER),gcc)
+  CC = gcc
+  CFLAGS += -fopenacc
 endif
 
 # LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
 ifeq ($(COMPILER),llvm)
   CC = clang
-  CFLAGS += -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
+  CFLAGS += -fopenacc -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
 endif
 
 # IBM XL Compiler
diff --git a/openacc/simulation.c b/openacc/simulation.c
index c4df586..c084ce4 100644
--- a/openacc/simulation.c
+++ b/openacc/simulation.c
@@ -21,7 +21,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 
 	// Main simulation loop over macroscopic cross section lookups
 
-	#pragma omp target teams distribute parallel for\
+	//#pragma omp target teams distribute parallel for\
 	map(to:data.n_poles[:data.length_n_poles])\
 	map(to:data.n_windows[:data.length_n_windows])\
 	map(to:data.poles[:data.length_poles])\
@@ -35,6 +35,20 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	map(to:data.max_num_windows)\
 	map(tofrom:offloaded_to_device)\
   map(from:verification[:input.lookups])
+  #pragma acc parallel loop \
+	copyin(data.n_poles[:data.length_n_poles])\
+	copyin(data.n_windows[:data.length_n_windows])\
+	copyin(data.poles[:data.length_poles])\
+	copyin(data.windows[:data.length_windows])\
+	copyin(data.pseudo_K0RS[:data.length_pseudo_K0RS])\
+	copyin(data.num_nucs[:data.length_num_nucs])\
+	copyin(data.mats[:data.length_mats])\
+	copyin(data.concs[:data.length_concs])\
+	copyin(data.max_num_nucs)\
+	copyin(data.max_num_poles)\
+	copyin(data.max_num_windows)\
+	copy(offloaded_to_device)\
+  copyout(verification[:input.lookups])
 	for( int i = 0; i < input.lookups; i++ )
 	{
 		// Set the initial seed value

From 2e2f473029c02302ad699073d2783534d452aac3 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Thu, 26 Oct 2023 07:35:03 -0700
Subject: [PATCH 03/14] Added basic gitignore (filters binaries, object files,
 output files, and batch scripts)

---
 .gitignore | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..61069c4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# No compiled code
+*.o
+*/rsbench
+
+# No batch scripts (system dependent)
+*.sh
+
+# No output files
+*.qdstrm
+*.out

From 355660cd74b68b84cab27da7ae6893a119c4285d Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Thu, 26 Oct 2023 07:37:03 -0700
Subject: [PATCH 04/14] Implmented basic OpenACC port (still buggy)

---
 openacc/Makefile     | 5 +++--
 openacc/simulation.c | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/openacc/Makefile b/openacc/Makefile
index fa8748d..49693a4 100644
--- a/openacc/Makefile
+++ b/openacc/Makefile
@@ -2,7 +2,7 @@
 # User Options
 #===============================================================================
 
-COMPILER  = gcc
+COMPILER  = nvidia
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
@@ -43,6 +43,7 @@ endif
 ifeq ($(COMPILER),gcc)
   CC = gcc
   CFLAGS += -fopenacc
+	export ACC_DEVICE_TYPE=nvidia
 endif
 
 # LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
@@ -60,7 +61,7 @@ endif
 # NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
 ifeq ($(COMPILER),nvidia)
   CC = nvc
-  CFLAGS += -mp=gpu -Minfo=mp -gpu=cc80
+	CFLAGS += -acc -Minfo=accel -gpu=cc80
 endif
 
 # AOMP Targeting MI100 -- Change march to Target Other GPUs
diff --git a/openacc/simulation.c b/openacc/simulation.c
index c084ce4..f0d0475 100644
--- a/openacc/simulation.c
+++ b/openacc/simulation.c
@@ -85,8 +85,10 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 		verification[i] = max_idx+1;
 
 		// Check if we are currently running on the device or not
-		if( i == 0 )
-			offloaded_to_device = !omp_is_initial_device();
+		if( i == 0 ) {
+			//offloaded_to_device = !omp_is_initial_device();
+      offloaded_to_device = 1;
+    }
 	}
   
   // Reduce validation hash on the host

From 0c1f9e889249892e43b291fa918dd31e180d6e8d Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Tue, 31 Oct 2023 07:37:46 -0700
Subject: [PATCH 05/14] Found OpenACC equivalent of omp_is_initial_device to
 check wehtehr or not we're running on host or device and fixed compilation
 error

---
 openacc/rsbench.h    | 2 +-
 openacc/simulation.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/openacc/rsbench.h b/openacc/rsbench.h
index 98e6bac..050d8ee 100644
--- a/openacc/rsbench.h
+++ b/openacc/rsbench.h
@@ -5,7 +5,7 @@
 #include<string.h>
 #include<stdint.h>
 #include<float.h>
-#include<omp.h>
+#include<openacc.h>
 #include<assert.h>
 
 #define OPENMP
diff --git a/openacc/simulation.c b/openacc/simulation.c
index f0d0475..7b89e09 100644
--- a/openacc/simulation.c
+++ b/openacc/simulation.c
@@ -36,6 +36,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	map(tofrom:offloaded_to_device)\
   map(from:verification[:input.lookups])
   #pragma acc parallel loop \
+  copyin(data)\
 	copyin(data.n_poles[:data.length_n_poles])\
 	copyin(data.n_windows[:data.length_n_windows])\
 	copyin(data.poles[:data.length_poles])\
@@ -86,8 +87,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 
 		// Check if we are currently running on the device or not
 		if( i == 0 ) {
-			//offloaded_to_device = !omp_is_initial_device();
-      offloaded_to_device = 1;
+      offloaded_to_device = !acc_on_device(acc_device_host);
     }
 	}
   

From 94e273a00fdc511f92ed20de6270d3819ebd3145 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Tue, 31 Oct 2023 08:24:21 -0700
Subject: [PATCH 06/14] Updated programming model statement to read OpenACC

---
 openacc/io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openacc/io.c b/openacc/io.c
index e15ce16..b8bf24a 100644
--- a/openacc/io.c
+++ b/openacc/io.c
@@ -249,7 +249,7 @@ void print_input_summary(Input input)
 	// Calculate Estimate of Memory Usage
 	size_t mem = get_mem_estimate(input);
 
-	printf("Programming Model:           OpenMP Taget Offloading\n");
+	printf("Programming Model:           OpenACC\n");
 	if( input.simulation_method == EVENT_BASED )
 		printf("Simulation Method:           Event Based\n");
 	else

From 50a5dc0a5abab734dcbcae3c344c162611a1b8f8 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Tue, 31 Oct 2023 08:37:57 -0700
Subject: [PATCH 07/14] Fixed correctness errors by adding loop
 gang/worker/vector clauses throughout simulation.c

---
 openacc/simulation.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/openacc/simulation.c b/openacc/simulation.c
index 7b89e09..987d265 100644
--- a/openacc/simulation.c
+++ b/openacc/simulation.c
@@ -35,7 +35,7 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	map(to:data.max_num_windows)\
 	map(tofrom:offloaded_to_device)\
   map(from:verification[:input.lookups])
-  #pragma acc parallel loop \
+  #pragma acc parallel loop gang \
   copyin(data)\
 	copyin(data.n_poles[:data.length_n_poles])\
 	copyin(data.n_windows[:data.length_n_windows])\
@@ -112,6 +112,7 @@ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int
 		macro_xs[i] = 0;
 
 	// for nuclide in mat
+#pragma acc loop worker
 	for( int i = 0; i < num_nucs[mat]; i++ )
 	{
 		double micro_xs[4];
@@ -166,6 +167,7 @@ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int
 	sigF = E * w.F;
 
 	// Loop over Poles within window, add contributions
+#pragma acc loop vector
 	for( int i = w.start; i < w.end; i++ )
 	{
 		RSComplex PSIIKI;
@@ -219,6 +221,7 @@ void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input inp
 	double dopp = 0.5;
 
 	// Loop over Poles within window, add contributions
+#pragma acc loop vector
 	for( int i = w.start; i < w.end; i++ )
 	{
 		Pole pole = poles[nuc * max_num_poles + i];

From 2ffbae426cc8a43a514ddeb228084a3c5ca34316 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Tue, 31 Oct 2023 08:38:35 -0700
Subject: [PATCH 08/14] Removed old OpenMP comments

---
 openacc/simulation.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/openacc/simulation.c b/openacc/simulation.c
index 987d265..4fcafae 100644
--- a/openacc/simulation.c
+++ b/openacc/simulation.c
@@ -20,21 +20,6 @@ void run_event_based_simulation(Input input, SimulationData data, unsigned long
 	int offloaded_to_device = 0;
 
 	// Main simulation loop over macroscopic cross section lookups
-
-	//#pragma omp target teams distribute parallel for\
-	map(to:data.n_poles[:data.length_n_poles])\
-	map(to:data.n_windows[:data.length_n_windows])\
-	map(to:data.poles[:data.length_poles])\
-	map(to:data.windows[:data.length_windows])\
-	map(to:data.pseudo_K0RS[:data.length_pseudo_K0RS])\
-	map(to:data.num_nucs[:data.length_num_nucs])\
-	map(to:data.mats[:data.length_mats])\
-	map(to:data.concs[:data.length_concs])\
-	map(to:data.max_num_nucs)\
-	map(to:data.max_num_poles)\
-	map(to:data.max_num_windows)\
-	map(tofrom:offloaded_to_device)\
-  map(from:verification[:input.lookups])
   #pragma acc parallel loop gang \
   copyin(data)\
 	copyin(data.n_poles[:data.length_n_poles])\

From b6a7af22493f4dc26b364c3849de0adc9de139b9 Mon Sep 17 00:00:00 2001
From: Joy Kitson <jkitson@umd.edu>
Date: Thu, 2 Nov 2023 07:09:38 -0700
Subject: [PATCH 09/14] Switched back to usign OpenMP timers (as the default
 system ones don't seem precise enough)

---
 openacc/Makefile  | 2 +-
 openacc/rsbench.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/openacc/Makefile b/openacc/Makefile
index 49693a4..037046d 100644
--- a/openacc/Makefile
+++ b/openacc/Makefile
@@ -61,7 +61,7 @@ endif
 # NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
 ifeq ($(COMPILER),nvidia)
   CC = nvc
-	CFLAGS += -acc -Minfo=accel -gpu=cc80
+	CFLAGS += -acc -Minfo=accel -gpu=cc80 -openmp
 endif
 
 # AOMP Targeting MI100 -- Change march to Target Other GPUs
diff --git a/openacc/rsbench.h b/openacc/rsbench.h
index 050d8ee..2e3ee8e 100644
--- a/openacc/rsbench.h
+++ b/openacc/rsbench.h
@@ -6,6 +6,7 @@
 #include<stdint.h>
 #include<float.h>
 #include<openacc.h>
+#include<omp.h>
 #include<assert.h>
 
 #define OPENMP

From 57ede1ef719137b815059c4361373e0601e0d64d Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:52:58 -0400
Subject: [PATCH 10/14] HIP Free Device Memory and Move Device Allocations to
 Simulation.cpp

* feat: add offload arch options for HIP

* refactor: rename *.hip to *.cpp

* refactor: free all dynamically allocated memory

* refactor: rename executable to match repo name

* fix: syntax issues

---------

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>
---
 hip/Makefile                           |  23 ++--
 hip/{init.hip => init.cpp}             |  28 ++++-
 hip/{io.hip => io.cpp}                 |   0
 hip/{main.hip => main.cpp}             |  15 +--
 hip/{material.hip => material.cpp}     |   0
 hip/rsbench.cuh                        | 145 -------------------------
 hip/rsbench.h                          |   4 +-
 hip/{simulation.hip => simulation.cpp} |  30 +++--
 hip/{utils.hip => utils.cpp}           |   0
 9 files changed, 68 insertions(+), 177 deletions(-)
 rename hip/{init.hip => init.cpp} (93%)
 rename hip/{io.hip => io.cpp} (100%)
 rename hip/{main.hip => main.cpp} (85%)
 rename hip/{material.hip => material.cpp} (100%)
 delete mode 100644 hip/rsbench.cuh
 rename hip/{simulation.hip => simulation.cpp} (94%)
 rename hip/{utils.hip => utils.cpp} (100%)

diff --git a/hip/Makefile b/hip/Makefile
index 645e3b0..985b1b0 100644
--- a/hip/Makefile
+++ b/hip/Makefile
@@ -6,22 +6,23 @@ COMPILER    = amd
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
+OFFLOAD_ARCH ?= gfx90a
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
-main.hip \
-simulation.hip\
-io.hip \
-init.hip \
-material.hip \
-utils.hip
+main.cpp \
+simulation.cpp\
+io.cpp \
+init.cpp \
+material.cpp \
+utils.cpp
 
-obj = $(source:.hip=.o)
+obj = $(source:.cpp=.o)
 
 #===============================================================================
 # Sets Flags
@@ -33,7 +34,7 @@ CFLAGS :=
 # AMD
 ifeq ($(COMPILER),amd)
   CC = hipcc
-  CFLAGS += -std=c++14
+  CFLAGS += -std=c++14 --offload-arch=${OFFLOAD_ARCH}
 endif
 
 # Linker Flags
@@ -63,11 +64,11 @@ endif
 $(program): $(obj) rsbench.h Makefile
 	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
 
-%.o: %.hip rsbench.h Makefile
+%.o: %.cpp rsbench.h Makefile
 	$(CC) $(CFLAGS) -c $< -o $@
 
 clean:
-	rm -rf rsbench $(obj)
+	rm -rf $(program) $(obj)
 
 edit:
 	vim -p $(source) rsbench.h
diff --git a/hip/init.hip b/hip/init.cpp
similarity index 93%
rename from hip/init.hip
rename to hip/init.cpp
index 0b2cedf..61cf417 100644
--- a/hip/init.hip
+++ b/hip/init.cpp
@@ -1,8 +1,7 @@
 #include "rsbench.h"
 
 // Moves all required data structures to the GPU's memory space
-SimulationData move_simulation_data_to_device( Input in, SimulationData SD )
-{
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD ) {
 	printf("Allocating and moving simulation data to GPU memory space...\n");
 
 	size_t sz;
@@ -102,9 +101,34 @@ SimulationData initialize_simulation( Input input )
 	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
 	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
 
+	SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));
+
 	return SD;
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.n_poles);
+	free(SD.n_windows);
+	free(SD.poles);
+	free(SD.windows);
+	free(SD.pseudo_K0RS);
+}
+
+void release_device_memory(SimulationData GSD) {
+	hipFree(GSD.num_nucs);
+	hipFree(GSD.concs);
+	hipFree(GSD.mats);
+	hipFree(GSD.n_poles);
+	hipFree(GSD.n_windows);
+	hipFree(GSD.poles);
+	hipFree(GSD.windows);
+	hipFree(GSD.pseudo_K0RS);
+	hipFree(GSD.verification);
+}
+
 int * generate_n_poles( Input input, uint64_t * seed )
 {
 	int total_resonances = input.avg_n_poles * input.n_nuclides;
diff --git a/hip/io.hip b/hip/io.cpp
similarity index 100%
rename from hip/io.hip
rename to hip/io.cpp
diff --git a/hip/main.hip b/hip/main.cpp
similarity index 85%
rename from hip/main.hip
rename to hip/main.cpp
index d9d1077..b66dadd 100644
--- a/hip/main.hip
+++ b/hip/main.cpp
@@ -27,14 +27,8 @@ int main(int argc, char * argv[])
 	center_print("INITIALIZATION", 79);
 	border_print();
 	
-	start = get_time();
 	
 	SimulationData SD = initialize_simulation( input );
-	SimulationData GSD = move_simulation_data_to_device( input, SD );
-
-	stop = get_time();
-
-	printf("Initialization Complete. (%.2lf seconds)\n", stop-start);
 	
 	// =====================================================================
 	// Cross Section (XS) Parallel Lookup Simulation Begins
@@ -44,15 +38,15 @@ int main(int argc, char * argv[])
 	border_print();
 
 	unsigned long vhash = 0;
+	double elapsed_time = 0;
 
 	// Run Simulation
-	start = get_time();
 
 	// Run simulation
 	if( input.simulation_method == EVENT_BASED )
 	{
 		if( input.kernel_id == 0 )
-			run_event_based_simulation(input, GSD, &vhash );
+			run_event_based_simulation(input, SD, &vhash, &elapsed_time);
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", input.kernel_id);
@@ -65,13 +59,14 @@ int main(int argc, char * argv[])
 		exit(1);
 	}
 
-	stop = get_time();
 
 	// Final hash step
 	vhash = vhash % 999983;
 
 	printf("Simulation Complete.\n");
 
+	release_memory(SD);
+
 	// =====================================================================
 	// Print / Save Results and Exit
 	// =====================================================================
@@ -79,7 +74,7 @@ int main(int argc, char * argv[])
 	center_print("RESULTS", 79);
 	border_print();
 
-	int is_invalid = validate_and_print_results(input, stop-start, vhash);
+	int is_invalid = validate_and_print_results(input, elapsed_time, vhash);
 
 	border_print();
 
diff --git a/hip/material.hip b/hip/material.cpp
similarity index 100%
rename from hip/material.hip
rename to hip/material.cpp
diff --git a/hip/rsbench.cuh b/hip/rsbench.cuh
deleted file mode 100644
index efc9688..0000000
--- a/hip/rsbench.cuh
+++ /dev/null
@@ -1,145 +0,0 @@
-#include<stdio.h>
-#include<stdlib.h>
-#include<sys/time.h>
-#include<math.h>
-#include<string.h>
-#include<stdint.h>
-#include<float.h>
-#include<assert.h>
-#include<cuda.h>
-#include <thrust/reduce.h>
-#include <chrono> 
-
-#define PI 3.14159265359
-
-// typedefs
-typedef enum __hm{SMALL, LARGE, XL, XXL} HM_size;
-
-#define HISTORY_BASED 1
-#define EVENT_BASED 2
-
-#define STARTING_SEED 1070
-#define INITIALIZATION_SEED 42
-
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
-	if (code != cudaSuccess) 
-	{
-		fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-		if (abort) exit(code);
-	}
-}
-
-typedef struct{
-	double r;
-	double i;
-} RSComplex;
-
-typedef struct{
-	int nthreads;
-	int n_nuclides;
-	int lookups;
-	HM_size HM;
-	int avg_n_poles;
-	int avg_n_windows;
-	int numL;
-	int doppler;
-	int particles;
-	int simulation_method;
-	int kernel_id;
-} Input;
-
-typedef struct{
-	RSComplex MP_EA;
-	RSComplex MP_RT;
-	RSComplex MP_RA;
-	RSComplex MP_RF;
-	short int l_value;
-} Pole;
-
-typedef struct{
-	double T;
-	double A;
-	double F;
-	int start;
-	int end;
-} Window;
-
-typedef struct{
-	int * n_poles;
-	unsigned long length_n_poles;
-	int * n_windows;
-	unsigned long length_n_windows;
-	Pole * poles;
-	unsigned long length_poles;
-	Window * windows;
-	unsigned long length_windows;
-	double * pseudo_K0RS;
-	unsigned long length_pseudo_K0RS;
-	int * num_nucs;
-	unsigned long length_num_nucs;
-	int * mats;
-	unsigned long length_mats;
-	double * concs;
-	unsigned long length_concs;
-	int max_num_nucs;
-	int max_num_poles;
-	int max_num_windows;
-	double * p_energy_samples;
-	unsigned long length_p_energy_samples;
-	int * mat_samples;
-	unsigned long length_mat_samples;
-	unsigned long  * verification;
-	unsigned long length_verification;
-} SimulationData;
-
-// io.c
-void logo(int version);
-void center_print(const char *s, int width);
-void border_print(void);
-void fancy_int( int a );
-Input read_CLI( int argc, char * argv[] );
-void print_CLI_error(void);
-void print_input_summary(Input input);
-int validate_and_print_results(Input input, double runtime, unsigned long vhash);
-
-// init.c
-SimulationData initialize_simulation( Input input );
-int * generate_n_poles( Input input,  uint64_t * seed );
-int * generate_n_windows( Input input ,  uint64_t * seed);
-Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
-Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
-double * generate_pseudo_K0RS( Input input, uint64_t * seed );
-SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
-
-// material.c
-int * load_num_nucs(Input input);
-int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
-double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
-SimulationData get_materials(Input input, uint64_t * seed);
-
-// utils.c
-size_t get_mem_estimate( Input input );
-double get_time(void);
-
-// simulation.c
-void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
-void run_event_based_simulation_optimization_1(Input in, SimulationData GSD, unsigned long * vhash_result);
-__global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
-__device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
-__device__ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
-__device__ void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
-__device__ int pick_mat( uint64_t * seed );
-__device__ void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors );
-__device__ RSComplex fast_nuclear_W( RSComplex Z );
-__host__ __device__ double LCG_random_double(uint64_t * seed);
-__host__ __device__ uint64_t LCG_random_int(uint64_t * seed);
-__device__ uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
-__device__ RSComplex c_add( RSComplex A, RSComplex B);
-__device__ RSComplex c_sub( RSComplex A, RSComplex B);
-__host__ __device__ RSComplex c_mul( RSComplex A, RSComplex B);
-__device__ RSComplex c_div( RSComplex A, RSComplex B);
-__device__ double c_abs( RSComplex A);
-__device__ double fast_exp(double x);
-__device__ RSComplex fast_cexp( RSComplex z );
diff --git a/hip/rsbench.h b/hip/rsbench.h
index 1abca08..9029191 100644
--- a/hip/rsbench.h
+++ b/hip/rsbench.h
@@ -118,13 +118,15 @@ int * load_num_nucs(Input input);
 int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
 double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
 SimulationData get_materials(Input input, uint64_t * seed);
+void release_memory(SimulationData SD);
+void release_device_memory(SimulationData GSD);
 
 // utils.c
 size_t get_mem_estimate( Input input );
 double get_time(void);
 
 // simulation.c
-void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_resultl, double * elapsed_time);
 __global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
 __device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
 __device__ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
diff --git a/hip/simulation.hip b/hip/simulation.cpp
similarity index 94%
rename from hip/simulation.hip
rename to hip/simulation.cpp
index 3592747..41c30bb 100644
--- a/hip/simulation.hip
+++ b/hip/simulation.cpp
@@ -13,8 +13,18 @@
 // line argument.
 ////////////////////////////////////////////////////////////////////////////////////
 
-void run_event_based_simulation(Input input, SimulationData GSD, unsigned long * vhash_result )
+void run_event_based_simulation(Input input, SimulationData SD, unsigned long * vhash_result, double * elapsed_time)
 {
+	double start, stop;
+	start = get_time();
+	////////////////////////////////////////////////////////////////////////////////
+	// Move Data to Device
+	////////////////////////////////////////////////////////////////////////////////
+	SimulationData GSD = move_simulation_data_to_device(input, SD);
+
+ 	stop = get_time();
+	printf("Initialization Complete. (%.2lf seconds)\n", stop - start);
+
 	////////////////////////////////////////////////////////////////////////////////
 	// Configure & Launch Simulation Kernel
 	////////////////////////////////////////////////////////////////////////////////
@@ -25,22 +35,26 @@ void run_event_based_simulation(Input input, SimulationData GSD, unsigned long *
 
 	hipLaunchKernelGGL(xs_lookup_kernel_baseline, dim3(nblocks), dim3(nthreads), 0, 0,  input, GSD );
 	gpuErrchk( hipPeekAtLastError() );
-	gpuErrchk( hipDeviceSynchronize() );
+	size_t sz = input.lookups * sizeof(unsigned long);
+	gpuErrchk( hipMemcpy(SD.verification, GSD.verification, sz, hipMemcpyDeviceToHost) );
 	
 	////////////////////////////////////////////////////////////////////////////////
 	// Reduce Verification Results
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Reducing verification results...\n");
 
-  size_t sz = input.lookups * sizeof(unsigned long);
-  unsigned long * v = (unsigned long *) malloc(sz);
-  gpuErrchk( hipMemcpy(v, GSD.verification, sz, hipMemcpyDeviceToHost) );
 
-  unsigned long verification_scalar = 0;
-  for( int i =0; i < input.lookups; i++ )
-    verification_scalar += v[i];
+	unsigned long verification_scalar = 0;
+	for( int i =0; i < input.lookups; i++ )
+		verification_scalar += SD.verification[i];
 
 	*vhash_result = verification_scalar;
+
+	stop = get_time();
+
+	*elapsed_time = stop - start;
+
+	release_device_memory(GSD);
 }
 
 // In this kernel, we perform a single lookup with each thread. Threads within a warp
diff --git a/hip/utils.hip b/hip/utils.cpp
similarity index 100%
rename from hip/utils.hip
rename to hip/utils.cpp

From 08f4d113d4b4dc5f351cf2d1ead2301f66e71dcd Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:53:18 -0400
Subject: [PATCH 11/14] feat: add copy back of verification vector to host (#4)

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>
---
 cuda/Makefile      |  6 +++---
 cuda/init.cu       | 25 +++++++++++++++++++++++++
 cuda/main.cu       | 14 ++++++--------
 cuda/rsbench.cuh   |  4 +++-
 cuda/simulation.cu | 28 ++++++++++++++++++++++------
 5 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/cuda/Makefile b/cuda/Makefile
index 78dc383..4f8edbf 100644
--- a/cuda/Makefile
+++ b/cuda/Makefile
@@ -6,13 +6,13 @@ COMPILER    = nvidia
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
-SM_VERSION  = 80
+SM_VERSION  ?= 80
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
 main.cu \
@@ -68,7 +68,7 @@ $(program): $(obj) rsbench.cuh Makefile
 	$(CC) $(CFLAGS) -c $< -o $@
 
 clean:
-	rm -rf rsbench $(obj)
+	rm -rf $(program) $(obj)
 
 edit:
 	vim -p $(source) rsbench.cuh
diff --git a/cuda/init.cu b/cuda/init.cu
index e350a1f..e39d1e0 100644
--- a/cuda/init.cu
+++ b/cuda/init.cu
@@ -102,9 +102,34 @@ SimulationData initialize_simulation( Input input )
 	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
 	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
 
+	SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));
+
 	return SD;
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.n_poles);
+	free(SD.n_windows);
+	free(SD.poles);
+	free(SD.windows);
+	free(SD.pseudo_K0RS);
+}
+
+void release_device_memory(SimulationData GSD) {
+	cudaFree(GSD.num_nucs);
+	cudaFree(GSD.concs);
+	cudaFree(GSD.mats);
+	cudaFree(GSD.n_poles);
+	cudaFree(GSD.n_windows);
+	cudaFree(GSD.poles);
+	cudaFree(GSD.windows);
+	cudaFree(GSD.pseudo_K0RS);
+	cudaFree(GSD.verification);
+}
+
 int * generate_n_poles( Input input, uint64_t * seed )
 {
 	int total_resonances = input.avg_n_poles * input.n_nuclides;
diff --git a/cuda/main.cu b/cuda/main.cu
index 24dd288..b9218ea 100644
--- a/cuda/main.cu
+++ b/cuda/main.cu
@@ -30,7 +30,6 @@ int main(int argc, char * argv[])
 	start = get_time();
 	
 	SimulationData SD = initialize_simulation( input );
-	SimulationData GSD = move_simulation_data_to_device( input, SD );
 
 	stop = get_time();
 
@@ -44,17 +43,15 @@ int main(int argc, char * argv[])
 	border_print();
 
 	unsigned long vhash = 0;
-
-	// Run Simulation
-	start = get_time();
+	double elapsed_time = 0;
 
 	// Run simulation
 	if( input.simulation_method == EVENT_BASED )
 	{
 		if( input.kernel_id == 0 )
-			run_event_based_simulation(input, GSD, &vhash );
+			run_event_based_simulation(input, SD, &vhash, &elapsed_time);
 		else if( input.kernel_id == 1 )
-			run_event_based_simulation_optimization_1(input, GSD, &vhash );
+			run_event_based_simulation_optimization_1(input, SD, &vhash );
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", input.kernel_id);
@@ -67,13 +64,14 @@ int main(int argc, char * argv[])
 		exit(1);
 	}
 
-	stop = get_time();
 
 	// Final hash step
 	vhash = vhash % 999983;
 
 	printf("Simulation Complete.\n");
 
+	release_memory(SD);
+
 	// =====================================================================
 	// Print / Save Results and Exit
 	// =====================================================================
@@ -81,7 +79,7 @@ int main(int argc, char * argv[])
 	center_print("RESULTS", 79);
 	border_print();
 
-	int is_invalid = validate_and_print_results(input, stop-start, vhash);
+	int is_invalid = validate_and_print_results(input, elapsed_time, vhash);
 
 	border_print();
 
diff --git a/cuda/rsbench.cuh b/cuda/rsbench.cuh
index d2c044d..19ac00e 100644
--- a/cuda/rsbench.cuh
+++ b/cuda/rsbench.cuh
@@ -115,6 +115,8 @@ Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_nu
 Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
 double * generate_pseudo_K0RS( Input input, uint64_t * seed );
 SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
+void release_memory(SimulationData SD);
+void release_device_memory(SimulationData GSD);
 
 // material.c
 int * load_num_nucs(Input input);
@@ -127,7 +129,7 @@ size_t get_mem_estimate( Input input );
 double get_time(void);
 
 // simulation.c
-void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result, double * elapsed_time);
 void run_event_based_simulation_optimization_1(Input in, SimulationData GSD, unsigned long * vhash_result);
 __global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
 __device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
diff --git a/cuda/simulation.cu b/cuda/simulation.cu
index 504729a..7ebeb45 100644
--- a/cuda/simulation.cu
+++ b/cuda/simulation.cu
@@ -12,30 +12,46 @@
 // line argument.
 ////////////////////////////////////////////////////////////////////////////////////
 
-void run_event_based_simulation(Input input, SimulationData GSD, unsigned long * vhash_result )
-{
+void run_event_based_simulation(Input input, SimulationData SD, unsigned long * vhash_result, double * elapsed_time) {
+	double start, stop;
+	start = get_time();
+	////////////////////////////////////////////////////////////////////////////////
+	// Move Data to Device
+	////////////////////////////////////////////////////////////////////////////////
+	SimulationData GSD = move_simulation_data_to_device(input, SD);
+ 
+ 	stop = get_time();
+	printf("Initialization Complete. (%.2lf seconds)\n", stop - start);
 	////////////////////////////////////////////////////////////////////////////////
 	// Configure & Launch Simulation Kernel
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Running baseline event-based simulation on device...\n");
 
+	start = get_time();
+
 	int nthreads = 256;
 	int nblocks = ceil( (double) input.lookups / (double) nthreads);
 
 	xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( input, GSD );
 	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
+	gpuErrchk(cudaMemcpy(SD.verification, GSD.verification, input.lookups * sizeof(unsigned long), cudaMemcpyDeviceToHost));
 	
 	////////////////////////////////////////////////////////////////////////////////
 	// Reduce Verification Results
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Reducing verification results...\n");
 
-	unsigned long verification_scalar = thrust::reduce(thrust::device, GSD.verification, GSD.verification + input.lookups, 0);
-	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
+	unsigned long long verification_scalar = 0;
+	for(int i = 0; i < input.lookups; i++ )
+		verification_scalar += SD.verification[i];
 
 	*vhash_result = verification_scalar;
+
+    stop = get_time();
+
+	*elapsed_time = stop - start;
+
+	release_device_memory(GSD);
 }
 
 // In this kernel, we perform a single lookup with each thread. Threads within a warp

From 6a40a07503b4c790e9f58330aa3c513f990fddd3 Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Fri, 3 Nov 2023 17:53:40 -0400
Subject: [PATCH 12/14] SYCL2020 Updates

* refactor: updates for SYCL2020

* refactor: namespace cl::sycl to sycl

* feat: update Makefile to target common backends

* fix: namespace and initializations

* refactor: rename rsbench to RSBench to match repo name

* feat: synchronize with host using get_host_access

---------

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>

Free Host Memory for SYCL2020 (#6)

* feat: synchronize with host using get_host_access

* feat: release dynamic memory

---------

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>

Fix offload Arch instead of sm version (#7)

* refactor: updates for SYCL2020

* refactor: namespace cl::sycl to sycl

* feat: update Makefile to target common backends

* fix: namespace and initializations

* refactor: rename rsbench to RSBench to match repo name

* feat: synchronize with host using get_host_access

* feat: release dynamic memory

* fix: OFFLOAD_ARCH instead of SM_VERSION

---------

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>
---
 sycl/Makefile       |  45 ++++++++---
 sycl/init.cpp       |  11 +++
 sycl/main.cpp       |   4 +-
 sycl/rsbench.h      |   1 +
 sycl/simulation.cpp | 191 +++++++++++++++++++++-----------------------
 5 files changed, 141 insertions(+), 111 deletions(-)

diff --git a/sycl/Makefile b/sycl/Makefile
index 3b4e6df..d6363e1 100644
--- a/sycl/Makefile
+++ b/sycl/Makefile
@@ -6,12 +6,15 @@ CC        = llvm
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
+TARGET ?=
+SM_VERSION ?= 
+OFFLOAD_ARCH ?=
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
 main.cpp \
@@ -28,22 +31,44 @@ obj = $(source:.cpp=.o)
 #===============================================================================
 
 # Standard Flags
-CFLAGS := -std=c++14 -Wall
+CFLAGS := -std=c++17 -Wall 
 
 # Linker Flags
 LDFLAGS = -lm
 
-# Codeplay Compiler
-ifeq ($(CC),codeplay)
-  CC = compute++
-  CFLAGS += -sycl -sycl-driver
-  LDFLAGS += -lComputeCpp
-endif
-
+# LLVM Compiler
 ifeq ($(CC),llvm)
   CC = clang++
   CFLAGS += -fsycl
-  LDFLAGS += -lOpenCL
+endif
+
+# IntelOneAPI Compiler
+ifeq ($(CC),oneapi)
+  CC = icpx
+  CFLAGS += -fsycl
+endif
+
+# AdaptiveCpp
+ifeq ($(CC),adaptivecpp)
+  CC = acpp
+endif
+
+ifeq ($(TARGET),CUDA)
+  ifeq ($(CC),acpp)
+	CFLAGS += --acpp-targets="cuda:sm_${SM_VERSION}"
+  else
+	CFLAGS += -fsycl-targets=nvptx64-nvidia-cuda \
+			  -Xsycl-target-backend --cuda-gpu-arch=sm_${SM_VERSION}
+  endif
+endif
+
+ifeq ($(TARGET),HIP)
+  ifeq ($(CC),acpp)
+	CFLAGS += --acpp-targets="hip:${OFFLOAD_ARCH}"
+  else
+	CFLAGS += -fsycl-targets=amdgcn-amd-amdhsa \
+			  -Xsycl-target-backend --offload-arch=${OFFLOAD_ARCH}
+  endif
 endif
 
 # Debug Flags
diff --git a/sycl/init.cpp b/sycl/init.cpp
index 2445acf..f047957 100644
--- a/sycl/init.cpp
+++ b/sycl/init.cpp
@@ -36,6 +36,17 @@ SimulationData initialize_simulation( Input input )
 	return SD;
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.n_poles);
+	free(SD.n_windows);
+	free(SD.poles);
+	free(SD.windows);
+	free(SD.pseudo_K0RS);
+}
+
 int * generate_n_poles( Input input, uint64_t * seed )
 {
 	int total_resonances = input.avg_n_poles * input.n_nuclides;
diff --git a/sycl/main.cpp b/sycl/main.cpp
index 4945ef9..3c21bd5 100644
--- a/sycl/main.cpp
+++ b/sycl/main.cpp
@@ -47,7 +47,7 @@ int main(int argc, char * argv[])
 	// Run Simulation
 	start = get_time();
 
-	double kernel_init_time;
+	double kernel_init_time = 0;
 
 	// Run simulation
 	if( input.simulation_method == EVENT_BASED )
@@ -73,6 +73,8 @@ int main(int argc, char * argv[])
 
 	printf("Simulation Complete.\n");
 
+    release_memory(SD);
+
 	// =====================================================================
 	// Print / Save Results and Exit
 	// =====================================================================
diff --git a/sycl/rsbench.h b/sycl/rsbench.h
index fce78f6..8a52011 100644
--- a/sycl/rsbench.h
+++ b/sycl/rsbench.h
@@ -96,6 +96,7 @@ int * generate_n_windows( Input input ,  uint64_t * seed);
 Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
 Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
 double * generate_pseudo_K0RS( Input input, uint64_t * seed );
+void release_memory(SimulationData SD);
 
 // material.c
 int * load_num_nucs(Input input);
diff --git a/sycl/simulation.cpp b/sycl/simulation.cpp
index 7859244..885dad5 100644
--- a/sycl/simulation.cpp
+++ b/sycl/simulation.cpp
@@ -1,5 +1,4 @@
 #include "rsbench.h"
-using namespace cl::sycl;
 
 ////////////////////////////////////////////////////////////////////////////////////
 // BASELINE FUNCTIONS
@@ -13,8 +12,7 @@ using namespace cl::sycl;
 // line argument.
 ////////////////////////////////////////////////////////////////////////////////////
 
-void run_event_based_simulation(Input in, SimulationData SD, unsigned long * vhash_result, double * kernel_init_time )
-{
+void run_event_based_simulation(Input in, SimulationData SD, unsigned long * vhash_result, double * kernel_init_time ) {
 	
 	// Let's create an extra verification array to reduce manually later on
 	printf("Allocating an additional %.1lf MB of memory for verification arrays...\n", in.lookups * sizeof(int) /1024.0/1024.0);
@@ -22,107 +20,100 @@ void run_event_based_simulation(Input in, SimulationData SD, unsigned long * vha
 	
 	// Timers
 	double start = get_time();
-	double stop;
-
-	// Scope here is important, as when we exit this blocl we will automatically sync with device
-	// to ensure all work is done and that we can read from verification_host array.
-	{
-		// create a queue using the default device for the platform (cpu, gpu)
+	double stop = 0.0;
 
-		queue sycl_q{default_selector()};
-		//queue sycl_q{gpu_selector()};
-		//queue sycl_q{cpu_selector()};
-		printf("Running on: %s\n", sycl_q.get_device().get_info<cl::sycl::info::device::name>().c_str());
-		printf("Initializing device buffers and JIT compiling kernel...\n");
+	sycl::queue sycl_q{sycl::default_selector_v};
+	printf("Running on: %s\n", sycl_q.get_device().get_info<sycl::info::device::name>().c_str());
 	
+	////////////////////////////////////////////////////////////////////////////////
+	// Create Device Buffers
+	////////////////////////////////////////////////////////////////////////////////
+
+	// assign SYCL buffer to existing memory
+	printf("Initializing device buffers and JIT compiling kernel...\n");
+	sycl::buffer<int> num_nucs_d {SD.num_nucs,SD.length_num_nucs};
+	sycl::buffer<double> concs_d {SD.concs, SD.length_concs};
+	sycl::buffer<int> mats_d {SD.mats, SD.length_mats};
+	sycl::buffer<int> n_windows_d {SD.n_windows, SD.length_n_windows};
+	sycl::buffer<Pole> poles_d {SD.poles, SD.length_poles};
+	sycl::buffer<Window> windows_d {SD.windows, SD.length_windows};
+	sycl::buffer<double> pseudo_K0RS_d {SD.pseudo_K0RS, SD.length_pseudo_K0RS};
+	sycl::buffer<int> verification_d {verification_host, in.lookups};
+
+	////////////////////////////////////////////////////////////////////////////////
+	// Define Device Kernel
+	////////////////////////////////////////////////////////////////////////////////
+
+	printf("Beginning event based simulation...\n");
+
+	// queue a kernel to be run, as a lambda
+	sycl_q.submit([&](sycl::handler &cgh) {
 		////////////////////////////////////////////////////////////////////////////////
-		// Create Device Buffers
+		// Create Device Accessors for Device Buffers
 		////////////////////////////////////////////////////////////////////////////////
-
-		// assign SYCL buffer to existing memory
-		buffer<int, 1> num_nucs_d(SD.num_nucs,SD.length_num_nucs);
-		buffer<double, 1> concs_d(SD.concs, SD.length_concs);
-		buffer<int, 1> mats_d(SD.mats, SD.length_mats);
-		buffer<int, 1> n_windows_d(SD.n_windows, SD.length_n_windows);
-		buffer<Pole, 1> poles_d(SD.poles, SD.length_poles);
-		buffer<Window, 1> windows_d(SD.windows, SD.length_windows);
-		buffer<double, 1> pseudo_K0RS_d(SD.pseudo_K0RS, SD.length_pseudo_K0RS);
-		buffer<int, 1> verification_d(verification_host, in.lookups);
+		sycl::accessor num_nucs {num_nucs_d, cgh, sycl::read_only};
+		sycl::accessor concs {concs_d, cgh, sycl::read_only};
+		sycl::accessor mats {mats_d, cgh, sycl::read_only};
+		sycl::accessor n_windows {n_windows_d, cgh, sycl::read_only};
+		sycl::accessor poles {poles_d, cgh, sycl::read_only};
+		sycl::accessor windows {windows_d, cgh, sycl::read_only};
+		sycl::accessor pseudo_K0RS {pseudo_K0RS_d, cgh, sycl::read_only};
+		sycl::accessor verification {verification_d, cgh, sycl::write_only, sycl::no_init};
 		
 		////////////////////////////////////////////////////////////////////////////////
-		// Define Device Kernel
+		// XS Lookup Simulation Loop
 		////////////////////////////////////////////////////////////////////////////////
 
-		// queue a kernel to be run, as a lambda
-		sycl_q.submit([&](handler &cgh)
-				{
-				////////////////////////////////////////////////////////////////////////////////
-				// Create Device Accessors for Device Buffers
-				////////////////////////////////////////////////////////////////////////////////
-				auto num_nucs = num_nucs_d.get_access<access::mode::read>(cgh);
-				auto concs = concs_d.get_access<access::mode::read>(cgh);
-				auto mats = mats_d.get_access<access::mode::read>(cgh);
-				auto verification = verification_d.get_access<access::mode::write>(cgh);
-				auto n_windows = n_windows_d.get_access<access::mode::read>(cgh);
-				auto poles = poles_d.get_access<access::mode::read>(cgh);
-				auto windows = windows_d.get_access<access::mode::read>(cgh);
-				auto pseudo_K0RS = pseudo_K0RS_d.get_access<access::mode::read>(cgh);
-
-				////////////////////////////////////////////////////////////////////////////////
-				// XS Lookup Simulation Loop
-				////////////////////////////////////////////////////////////////////////////////
-				cgh.parallel_for<kernel>(range<1>(in.lookups), [=](id<1> idx)
-					{
-					// get the index to operate on, first dimemsion
-					size_t i = idx[0];
-
-					// Set the initial seed value
-					uint64_t seed = STARTING_SEED;	
-
-					// Forward seed to lookup index (we need 2 samples per lookup)
-					seed = fast_forward_LCG(seed, 2*i);
-
-					// Randomly pick an energy and material for the particle
-					double p_energy = LCG_random_double(&seed);
-					int mat         = pick_mat(&seed); 
-
-					// debugging
-					//printf("E = %lf mat = %d\n", p_energy, mat);
-
-					double macro_xs_vector[4] = {0};
-
-					// Perform macroscopic Cross Section Lookup
-					calculate_macro_xs( macro_xs_vector, mat, p_energy, in, num_nucs, mats, SD.max_num_nucs, concs, n_windows, pseudo_K0RS, windows, poles, SD.max_num_windows, SD.max_num_poles );
-
-					// For verification, and to prevent the compiler from optimizing
-					// all work out, we interrogate the returned macro_xs_vector array
-					// to find its maximum value index, then increment the verification
-					// value by that index. In this implementation, we store to a global
-					// array that will get tranferred back and reduced on the host.
-					double max = -DBL_MAX;
-					int max_idx = 0;
-					for(int j = 0; j < 4; j++ )
-					{
-						if( macro_xs_vector[j] > max )
-						{
-							max = macro_xs_vector[j];
-							max_idx = j;
-						}
-					}
-					verification[i] = max_idx+1;
-
-					});
-				});
-		stop = get_time();
-		printf("Kernel initialization, compilation, and launch took %.2lf seconds.\n", stop-start);
-		printf("Beginning event based simulation...\n");
-	}
+		cgh.parallel_for<sycl::kernel>(sycl::range<1>(in.lookups), [=](sycl::id<1> idx) {
+			// get the index to operate on, first dimemsion
+			size_t i = idx[0];
+
+			// Set the initial seed value
+			uint64_t seed = STARTING_SEED;	
+
+			// Forward seed to lookup index (we need 2 samples per lookup)
+			seed = fast_forward_LCG(seed, 2*i);
+
+			// Randomly pick an energy and material for the particle
+			double p_energy = LCG_random_double(&seed);
+			int mat = pick_mat(&seed); 
+
+			double macro_xs_vector[4] = {0};
+
+			// Perform macroscopic Cross Section Lookup
+			calculate_macro_xs( macro_xs_vector, mat, p_energy, in, num_nucs, mats, SD.max_num_nucs, concs, n_windows, pseudo_K0RS, windows, poles, SD.max_num_windows, SD.max_num_poles );
+
+			// For verification, and to prevent the compiler from optimizing
+			// all work out, we interrogate the returned macro_xs_vector array
+			// to find its maximum value index, then increment the verification
+			// value by that index. In this implementation, we store to a global
+			// array that will get tranferred back and reduced on the host.
+			double max = -DBL_MAX;
+			int max_idx = 0;
+			for(int j = 0; j < 4; j++ ) {
+				if(macro_xs_vector[j] > max) {
+					max = macro_xs_vector[j];
+					max_idx = j;
+				}
+			}
+			verification[i] = max_idx+1;
+		});
+	});
+
+
+	stop = get_time();
+    
+	printf("Kernel initialization, compilation, and launch took %.2lf seconds.\n", stop-start);
+
+	verification_d.get_host_access();
 
 	// Host reduces the verification array
 	unsigned long long verification_scalar = 0;
 	for( int i = 0; i < in.lookups; i++ )
 		verification_scalar += verification_host[i];
 
+	stop = get_time();
+
 	*vhash_result = verification_scalar;
 	*kernel_init_time = stop-start;
 }
@@ -196,7 +187,7 @@ void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, INT_
 		RSComplex CDUM;
 		Pole pole = poles[nuc * max_num_poles + i];
 		RSComplex t1 = {0, 1};
-		RSComplex t2 = {cl::sycl::sqrt(E), 0 };
+		RSComplex t2 = {sycl::sqrt(E), 0 };
 		PSIIKI = c_div( t1 , c_sub(pole.MP_EA,t2) );
 		RSComplex E_c = {E, 0};
 		CDUM = c_div(PSIIKI, E_c);
@@ -314,19 +305,19 @@ void calculate_sig_T( int nuc, double E, Input input, DOUBLE_T pseudo_K0RS, RSCo
 
 	for( int i = 0; i < 4; i++ )
 	{
-		phi = pseudo_K0RS[nuc * input.numL + i] * cl::sycl::sqrt(E);
+		phi = pseudo_K0RS[nuc * input.numL + i] * sycl::sqrt(E);
 
 		if( i == 1 )
-			phi -= - cl::sycl::atan( phi );
+			phi -= - sycl::atan( phi );
 		else if( i == 2 )
-			phi -= cl::sycl::atan( 3.0 * phi / (3.0 - phi*phi));
+			phi -= sycl::atan( 3.0 * phi / (3.0 - phi*phi));
 		else if( i == 3 )
-			phi -= cl::sycl::atan(phi*(15.0-phi*phi)/(15.0-6.0*phi*phi));
+			phi -= sycl::atan(phi*(15.0-phi*phi)/(15.0-6.0*phi*phi));
 
 		phi *= 2.0;
 
-		sigTfactors[i].r = cl::sycl::cos(phi);
-		sigTfactors[i].i = -cl::sycl::sin(phi);
+		sigTfactors[i].r = sycl::cos(phi);
+		sigTfactors[i].i = -sycl::sin(phi);
 	}
 }
 
@@ -506,7 +497,7 @@ RSComplex c_div( RSComplex A, RSComplex B)
 
 double c_abs( RSComplex A)
 {
-	return cl::sycl::sqrt(A.r*A.r + A.i * A.i);
+	return sycl::sqrt(A.r*A.r + A.i * A.i);
 }
 
 
@@ -537,8 +528,8 @@ RSComplex fast_cexp( RSComplex z )
 	// will use our own exponetial implementation
 	//double t1 = exp(x);
 	double t1 = fast_exp(x);
-	double t2 = cl::sycl::cos(y);
-	double t3 = cl::sycl::sin(y);
+	double t2 = sycl::cos(y);
+	double t3 = sycl::sin(y);
 	RSComplex t4 = {t2, t3};
 	RSComplex t5 = {t1, 0};
 	RSComplex result = c_mul(t5, (t4));

From 721488eb81e59ab033af09fccbdb462a5ecc4abe Mon Sep 17 00:00:00 2001
From: Pranav Sivaraman <14294205+pranav-sivaraman@users.noreply.github.com>
Date: Fri, 3 Nov 2023 21:28:14 -0400
Subject: [PATCH 13/14] Add RAJA Model

* feat: add .gitignore

* feat: RAJA model initial setup

* fix: rename executable to RSBench

* fix: Umpire link issues

* feat: replace CUDA memory calls with Umpire

* feat: release allocated device memory

* feat: release allocated memory

* feat: change global kernel to use RAJA

* feat: WIP Makefile

* feat: Makefile for RAJA CUDA

* feat: OpenMP support for RAJA model

* fix: CMake CUDA Arch and downgrade to c++14

* feat: add HIP support for RAJA model

* chore: update .gitignore

* refactor: EXECUTION_POLICY to TARGET

* refactor: remove CMake build option in favor of Makefile

* refactor: copy verificaiton array back to host and move device movement in simulation

* fix: make clean

* feat: OpenMP Backend

* style: add comments to Makefile

* feat: update README

* feat: print device info depending on backend

* chore: update .gitignore

* feat: update make clean to include cuda files

* fix: -rpath for HIP

* fix: minor adjustment to where copy occurs to match other versions

---------

Co-authored-by: Pranav Sivaraman <14294205+PerryStyle@users.noreply.github.com>
---
 .gitignore          |  10 +
 README.md           |   2 +
 raja/Makefile       | 106 +++++++++
 raja/init.cpp       | 299 +++++++++++++++++++++++++
 raja/io.cpp         | 353 ++++++++++++++++++++++++++++++
 raja/main.cpp       |  74 +++++++
 raja/material.cpp   | 133 +++++++++++
 raja/rsbench.h      | 137 ++++++++++++
 raja/simulation.cpp | 522 ++++++++++++++++++++++++++++++++++++++++++++
 raja/utils.cpp      |  31 +++
 10 files changed, 1667 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 raja/Makefile
 create mode 100644 raja/init.cpp
 create mode 100644 raja/io.cpp
 create mode 100644 raja/main.cpp
 create mode 100644 raja/material.cpp
 create mode 100644 raja/rsbench.h
 create mode 100644 raja/simulation.cpp
 create mode 100644 raja/utils.cpp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b90588c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.o
+RSBench
+*.cpp1*
+*.cpp4*
+*.ptx
+*.cubin
+*.cudafe1*
+*.fatbin*
+*.module_id
+[Bb][Uu][Ii][Ll][Dd]*/
diff --git a/README.md b/README.md
index b322fbc..8655e1e 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,8 @@ This version of RSBench is written in SYCL, and can be used for CPU, GPU, FPGA,
 5. **RSBench/hip**
 This version of RSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.
 
+6. **RSBench/RAJA**
+This version of RSBench is written using the RAJA programming model and Umpire to handle memory management. This programming model can be run on either CPU or GPU depending on how RAJA is configured. You will likely need to edit the makefile if RAJA is installed with multiple backends.
 ## Compilation
 
 To compile RSBench with default settings, navigate to your selected source directory and use the following command:
diff --git a/raja/Makefile b/raja/Makefile
new file mode 100644
index 0000000..87b674c
--- /dev/null
+++ b/raja/Makefile
@@ -0,0 +1,106 @@
+#===============================================================================
+# User Options
+#===============================================================================
+
+RAJA_PATH ?= 
+Umpire_PATH ?=
+camp_PATH ?=
+TARGET ?= # Choose Target Backend (CUDA, HIP, OpenMP) Default is Sequential
+SM_VERSION ?= # For CUDA Target
+OFFLOAD_ARCH ?= # For HIP Target
+
+OPTIMIZE ?= yes
+DEBUG ?= no
+PROFILE ?= no
+MPI ?= no
+
+RAJA_INCLUDE_DIR = $(RAJA_PATH)/include
+RAJA_LIB_DIR = $(RAJA_PATH)/lib
+
+Umpire_INCLUDE_DIR = $(Umpire_PATH)/include
+Umpire_LIB_DIR = $(Umpire_PATH)/lib
+
+camp_INCLUDE_DIR ?= $(camp_PATH)/include
+camp_LIB_DIR = $(camp_PATH)/lib
+
+#===============================================================================
+# Program name & source code list
+#===============================================================================
+
+program = RSBench
+
+source = \
+main.cpp \
+io.cpp \
+simulation.cpp \
+init.cpp \
+utils.cpp \
+material.cpp \
+
+obj = $(source:.cpp=.o)
+
+CXXFLAGS := -std=c++14
+
+# Linker Flags
+LDFLAGS = -lm
+
+# Debug Flags
+ifeq ($(DEBUG),yes)
+  CXXFLAGS += -g -G
+  LDFLAGS += -g -G
+endif
+
+# Profiling Flags
+ifeq ($(PROFILE),yes)
+  CXXFLAGS += -pg
+  LDFLAGS  += -pg
+endif
+
+# Optimization Flags
+ifeq ($(OPTIMIZE),yes)
+  CXXFLAGS += -O3
+endif
+
+ifeq ($(TARGET),CUDA)
+  CXX = nvcc
+  CUDAFLAGS = -x cu --expt-relaxed-constexpr -forward-unknown-to-host-compiler -extended-lambda -restrict -keep -arch=sm_$(SM_VERSION) 
+  CXXFLAGS += $(CUDAFLAGS)
+  LDFLAGS += -Xlinker -rpath=$(RAJA_LIB_DIR) -Xlinker -rpath=$(Umpire_LIB_DIR)
+endif  
+
+ifeq ($(TARGET),HIP)
+  CXX = hipcc
+  HIPFLAGS = --offload-arch=$(OFFLOAD_ARCH)
+  CXXFLAGS += $(HIPFLAGS)
+  LDFLAGS += -Xlinker -rpath=$(RAJA_LIB_DIR) -Xlinker -rpath=$(Umpire_LIB_DIR)
+endif
+
+ifeq ($(TARGET), OpenMP)
+  CXXFLAGS += -fopenmp
+  LDFLAGS += -fopenmp
+  LDFLAGS += -Xlinker -rpath=$(RAJA_LIB_DIR) -Xlinker -rpath=$(Umpire_LIB_DIR)
+endif
+
+default: $(program)
+
+CXXFLAGS += -I./ -I$(RAJA_INCLUDE_DIR) -I$(Umpire_INCLUDE_DIR) -I$(camp_INCLUDE_DIR)
+LDFLAGS += -lRAJA -lumpire -lcamp
+LDFLAGS += -L$(RAJA_LIB_DIR) -L$(Umpire_LIB_DIR) -L$(camp_LIB_DIR)
+
+CXXFLAGS += -w
+
+$(program): $(obj) rsbench.h Makefile
+	$(CXX) $(obj) -o $@ $(LDFLAGS)
+
+%.o: %.cpp rsbench.h Makefile
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(program) $(obj) *.cpp1* *.cpp4* *.ptx *.cubin *.cudafe1* *.fatbin* *.module_id
+
+edit:
+	vim -p $(source) rsbench.h
+
+run:
+	./rsbench
+
diff --git a/raja/init.cpp b/raja/init.cpp
new file mode 100644
index 0000000..5fed873
--- /dev/null
+++ b/raja/init.cpp
@@ -0,0 +1,299 @@
+#include "rsbench.h"
+
+// Moves all required data structures to the GPU's memory space
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD )
+{
+	printf("Allocating and moving simulation data to GPU memory space...\n");
+
+	size_t sz;
+	size_t total_sz = 0;
+
+	// Shallow copy of CPU simulation data to GPU simulation data
+	SimulationData GSD = SD;
+
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("DEVICE");
+
+	// Move data to GPU memory space
+	sz = GSD.length_num_nucs * sizeof(int);
+	GSD.num_nucs = static_cast<int*>(allocator.allocate(sz));
+	rm.copy(GSD.num_nucs, SD.num_nucs);
+	total_sz += sz;
+
+	sz = GSD.length_concs * sizeof(double);
+	GSD.concs = static_cast<double*>(allocator.allocate(sz));
+	rm.copy(GSD.concs, SD.concs);
+	total_sz += sz;
+
+	sz = GSD.length_mats * sizeof(int);
+	GSD.mats = static_cast<int*>(allocator.allocate(sz));
+	rm.copy(GSD.mats, SD.mats);
+	total_sz += sz;
+
+	sz = GSD.length_n_poles * sizeof(int);
+	GSD.n_poles = static_cast<int*>(allocator.allocate(sz));
+	rm.copy(GSD.n_poles, SD.n_poles);
+	total_sz += sz;
+
+	sz = GSD.length_n_windows * sizeof(int);
+	GSD.n_windows = static_cast<int*>(allocator.allocate(sz));
+	rm.copy(GSD.n_windows, SD.n_windows);
+	total_sz += sz;
+
+	sz = GSD.length_poles * sizeof(Pole);
+	GSD.poles = static_cast<Pole*>(allocator.allocate(sz));
+	rm.copy(GSD.poles, SD.poles);
+	total_sz += sz;
+
+	sz = GSD.length_windows * sizeof(Window);
+	GSD.windows = static_cast<Window*>(allocator.allocate(sz));
+	rm.copy(GSD.windows, SD.windows);
+	total_sz += sz;
+
+	sz = GSD.length_pseudo_K0RS * sizeof(double);
+	GSD.pseudo_K0RS = static_cast<double*>(allocator.allocate(sz));
+	rm.copy(GSD.pseudo_K0RS, SD.pseudo_K0RS);
+	total_sz += sz;
+	
+	// Allocate verification array on device. This structure is not needed on CPU, so we don't
+	// have to copy anything over.
+	sz = in.lookups * sizeof(unsigned long);
+	GSD.verification = static_cast<unsigned long*>(allocator.allocate(sz));
+	total_sz += sz;
+
+	GSD.length_verification = in.lookups;
+	
+	printf("GPU Intialization complete. Allocated %.0lf MB of data on GPU.\n", total_sz/1024.0/1024.0 );
+
+	return GSD;
+
+}
+
+SimulationData initialize_simulation(Input input)
+{
+	uint64_t seed = INITIALIZATION_SEED;
+
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+	
+	// Get material data
+	printf("Loading Hoogenboom-Martin material data...\n");
+	SimulationData SD = get_materials(input, &seed); 
+	
+	// Allocate & fill energy grids
+	printf("Generating resonance distributions...\n");
+	SD.n_poles = generate_n_poles( input, &seed );
+	SD.length_n_poles = input.n_nuclides;
+
+	// Allocate & fill Window grids
+	printf("Generating window distributions...\n");
+	SD.n_windows = generate_n_windows(input, &seed);
+	SD.length_n_windows = input.n_nuclides;
+
+	// Prepare full resonance grid
+	printf("Generating resonance parameter grid...\n");
+	SD.poles = generate_poles( input, SD.n_poles, &seed, &SD.max_num_poles );
+	SD.length_poles = input.n_nuclides * SD.max_num_poles;
+
+	// Prepare full Window grid
+	printf("Generating window parameter grid...\n");
+	SD.windows = generate_window_params(input, SD.n_windows, SD.n_poles, &seed, &SD.max_num_windows);
+	SD.length_windows = input.n_nuclides * SD.max_num_windows;
+
+	// Prepare 0K Resonances
+	printf("Generating 0K l_value data...\n");
+	SD.pseudo_K0RS = generate_pseudo_K0RS(input, &seed);
+	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
+
+	size_t sz = input.lookups * sizeof(unsigned long);
+	SD.verification = static_cast<unsigned long*>(allocator.allocate(sz));
+
+	return SD;
+}
+
+void release_memory(SimulationData SD) {
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	allocator.deallocate(SD.num_nucs);
+	allocator.deallocate(SD.concs);
+	allocator.deallocate(SD.mats);
+	allocator.deallocate(SD.n_poles);
+	allocator.deallocate(SD.n_windows);
+	allocator.deallocate(SD.poles);
+	allocator.deallocate(SD.windows);
+	allocator.deallocate(SD.pseudo_K0RS);
+}
+
+void release_device_memory(SimulationData GSD) {
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("DEVICE");
+
+	allocator.deallocate(GSD.num_nucs);
+	allocator.deallocate(GSD.concs);
+	allocator.deallocate(GSD.mats);
+	allocator.deallocate(GSD.n_poles);
+	allocator.deallocate(GSD.n_windows);
+	allocator.deallocate(GSD.poles);
+	allocator.deallocate(GSD.windows);
+	allocator.deallocate(GSD.pseudo_K0RS);
+	allocator.deallocate(GSD.verification);
+}
+
+int * generate_n_poles( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_poles * input.n_nuclides;
+
+
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	int * R = static_cast<int*>(allocator.allocate(input.n_nuclides * sizeof(int)));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+int * generate_n_windows( Input input, uint64_t * seed )
+{
+	int total_resonances = input.avg_n_windows * input.n_nuclides;
+
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	int * R = static_cast<int*>(allocator.allocate(input.n_nuclides * sizeof(int)));
+	
+	// Ensure all nuclides have at least 1 resonance
+	for( int i = 0; i < input.n_nuclides; i++ )
+		R[i] = 1;
+
+	// Sample the rest
+	for( int i = 0; i < total_resonances - input.n_nuclides; i++ )
+		R[LCG_random_int(seed) % input.n_nuclides]++;
+	
+	/* Debug	
+	for( int i = 0; i < input.n_nuclides; i++ )
+		printf("R[%d] = %d\n", i, R[i]);
+	*/
+	
+	return R;
+}
+
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles )
+{
+	// Pole Scaling Factor -- Used to bias hitting of the fast Faddeeva
+	// region to approximately 99.5% (i.e., only 0.5% of lookups should
+	// require the full eval).
+	double f = 152.5;
+	RSComplex f_c = {f, 0};
+
+	int max_poles = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_poles[i] > max_poles)
+			max_poles = n_poles[i];
+	}
+	*max_num_poles = max_poles;
+
+	// Allocating 2D matrix as a 1D contiguous vector
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	Pole * R = static_cast<Pole*>(allocator.allocate(input.n_nuclides * max_poles * sizeof(Pole)));
+	
+	// fill with data
+	for( int i = 0; i < input.n_nuclides; i++ )
+		for( int j = 0; j < n_poles[i]; j++ )
+		{
+			double r = LCG_random_double(seed);
+			double im = LCG_random_double(seed);
+			RSComplex t1 = {r, im};
+			R[i * max_poles + j].MP_EA = c_mul(f_c,t1);
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t2 = {f*r, im};
+			R[i * max_poles + j].MP_RT = t2;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t3 = {f*r, im};
+			R[i * max_poles + j].MP_RA = t3;
+			r = LCG_random_double(seed);
+			im = LCG_random_double(seed);
+			RSComplex t4 = {f*r, im};
+			R[i * max_poles + j].MP_RF = t4;
+			R[i * max_poles + j].l_value = LCG_random_int(seed) % input.numL;
+		}
+	
+	return R;
+}
+
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows )
+{
+	int max_windows = -1;
+	
+	for( int i = 0; i < input.n_nuclides; i++ )
+	{
+		if( n_windows[i] > max_windows)
+			max_windows = n_windows[i];
+	}
+	*max_num_windows = max_windows;
+
+	// Allocating 2D contiguous matrix
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	Window * R = static_cast<Window*>(allocator.allocate(input.n_nuclides * max_windows * sizeof(Window)));
+	
+	// fill with data
+	for(int i = 0; i < input.n_nuclides; i++)
+	{
+		int space = n_poles[i] / n_windows[i];
+		int remainder = n_poles[i] - space * n_windows[i];
+		int ctr = 0;
+		for(int j = 0; j < n_windows[i]; j++)
+		{
+			R[i * max_windows + j].T = LCG_random_double(seed);
+			R[i * max_windows + j].A = LCG_random_double(seed);
+			R[i * max_windows + j].F = LCG_random_double(seed);
+			R[i * max_windows + j].start = ctr; 
+			R[i * max_windows + j].end = ctr + space - 1;
+
+			ctr += space;
+
+			if (j < remainder)
+			{
+				ctr++;
+				R[i * max_windows + j].end++;
+			}
+		}
+	}
+
+	return R;
+}
+
+double * generate_pseudo_K0RS( Input input, uint64_t * seed )
+{
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	double * R = static_cast<double*>(allocator.allocate(input.n_nuclides * input.numL * sizeof(double)));
+
+	for( int i = 0; i < input.n_nuclides; i++)
+		for( int j = 0; j < input.numL; j++ )
+			R[i * input.numL + j] = LCG_random_double(seed);
+
+	return R;
+}
diff --git a/raja/io.cpp b/raja/io.cpp
new file mode 100644
index 0000000..c9be8c8
--- /dev/null
+++ b/raja/io.cpp
@@ -0,0 +1,353 @@
+#include "rsbench.h"
+
+// Prints program logo
+void logo(int version)
+{
+	border_print();
+	printf(
+"                    _____   _____ ____                  _     \n"
+"                   |  __ \\ / ____|  _ \\                | |    \n"
+"                   | |__) | (___ | |_) | ___ _ __   ___| |__  \n"
+"                   |  _  / \\___ \\|  _ < / _ \\ '_ \\ / __| '_ \\ \n"
+"                   | | \\ \\ ____) | |_) |  __/ | | | (__| | | |\n"
+"                   |_|  \\_\\_____/|____/ \\___|_| |_|\\___|_| |_|\n\n"
+	);
+	border_print();
+	center_print("Developed at Argonne National Laboratory", 79);
+	char v[100];
+	sprintf(v, "Version: %d", version);
+	center_print(v, 79);
+	border_print();
+}
+
+// Prints Section titles in center of 80 char terminal
+void center_print(const char *s, int width)
+{
+	int length = strlen(s);
+	int i;
+	for (i=0; i<=(width-length)/2; i++) {
+		fputs(" ", stdout);
+	}
+	fputs(s, stdout);
+	fputs("\n", stdout);
+}
+
+void border_print(void)
+{
+	printf(
+	"==================================================================="
+	"=============\n");
+}
+
+// Prints comma separated integers - for ease of reading
+void fancy_int( int a )
+{
+    if( a < 1000 )
+        printf("%d\n",a);
+
+    else if( a >= 1000 && a < 1000000 )
+        printf("%d,%03d\n", a / 1000, a % 1000);
+
+    else if( a >= 1000000 && a < 1000000000 )
+        printf("%d,%03d,%03d\n", a / 1000000, (a % 1000000) / 1000, a % 1000 );
+
+    else if( a >= 1000000000 )
+        printf("%d,%03d,%03d,%03d\n",
+               a / 1000000000,
+               (a % 1000000000) / 1000000,
+               (a % 1000000) / 1000,
+               a % 1000 );
+    else
+        printf("%d\n",a);
+}
+
+Input read_CLI( int argc, char * argv[] )
+{
+	Input input;
+
+	// defaults to the history based simulation method
+	input.simulation_method = HISTORY_BASED;
+	// defaults to max threads on the system	
+#if defined(RAJA_ENABLE_OPENMP)
+	input.nthreads = omp_get_num_procs();
+#else
+	input.nthreads = 1;
+#endif
+
+	// defaults to 355 (corresponding to H-M Large benchmark)
+	input.n_nuclides = 355;
+	// defaults to 300,000
+	input.particles = 300000;
+	// defaults to 34
+	input.lookups = 34;
+	// defaults to H-M Large benchmark
+	input.HM = LARGE;
+	// defaults to 3000 resonancs (avg) per nuclide
+	input.avg_n_poles = 1000;
+	// defaults to 100
+	input.avg_n_windows = 100;
+	// defaults to 4;
+	input.numL = 4;
+	// defaults to no temperature dependence (Doppler broadening)
+	input.doppler = 1;
+	// defaults to baseline simulation kernel
+	input.kernel_id = 0;
+	
+	int default_lookups = 1;
+	int default_particles = 1;
+
+	// Collect Raw Input
+	for( int i = 1; i < argc; i++ )
+	{
+		char * arg = argv[i];
+
+		// Simulation Method (-m)
+		if( strcmp(arg, "-m") == 0 )
+		{
+			char * sim_type = NULL;
+			if( ++i < argc )
+				sim_type = argv[i];
+			else
+				print_CLI_error();
+
+			if( strcmp(sim_type, "history") == 0 )
+				input.simulation_method = HISTORY_BASED;
+			else if( strcmp(sim_type, "event") == 0 )
+			{
+				input.simulation_method = EVENT_BASED;
+				// Also resets default # of lookups
+				if( default_lookups && default_particles )
+				{
+					input.lookups =  input.lookups * input.particles;
+					input.particles = 0;
+				}
+			}
+			else
+				print_CLI_error();
+		}
+		// lookups (-l)
+		else if( strcmp(arg, "-l") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.lookups = atoi(argv[i]);
+				default_lookups = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// particles (-p)
+		else if( strcmp(arg, "-p") == 0 )
+		{
+			if( ++i < argc )
+			{
+				input.particles = atoi(argv[i]);
+				default_particles = 0;
+			}
+			else
+				print_CLI_error();
+		}
+		// nuclides (-n)
+		else if( strcmp(arg, "-n") == 0 )
+		{
+			if( ++i < argc )
+				input.n_nuclides = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// HM (-s)
+		else if( strcmp(arg, "-s") == 0 )
+		{	
+			if( ++i < argc )
+			{
+				if( strcmp(argv[i], "small") == 0 )
+					input.HM = SMALL;
+				else if ( strcmp(argv[i], "large") == 0 )
+					input.HM = LARGE;
+				else
+					print_CLI_error();
+			}
+			else
+				print_CLI_error();
+		}
+		// Doppler Broadening (Temperature Dependence)
+		else if( strcmp(arg, "-d") == 0 )
+		{	
+			input.doppler = 0;
+		}
+		// Avg number of windows per nuclide (-w)
+		else if( strcmp(arg, "-W") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_windows = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Avg number of poles per nuclide (-p)
+		else if( strcmp(arg, "-P") == 0 )
+		{
+			if( ++i < argc )
+				input.avg_n_poles = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		// Kernel ID (-k)
+		else if( strcmp(arg, "-k") == 0 )
+		{
+			if( ++i < argc )
+				input.kernel_id = atoi(argv[i]);
+			else
+				print_CLI_error();
+		}
+		else
+			print_CLI_error();
+	}
+
+	// Validate Input
+
+	// Validate nthreads
+	if( input.nthreads < 1 )
+		print_CLI_error();
+	
+	// Validate n_isotopes
+	if( input.n_nuclides < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.lookups < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_poles < 1 )
+		print_CLI_error();
+	
+	// Validate lookups
+	if( input.avg_n_windows < 1 )
+		print_CLI_error();
+	
+	// Set HM size specific parameters
+	// (defaults to large)
+	if( input.HM == SMALL )
+		input.n_nuclides = 68;
+
+	// Return input struct
+	return input;
+}
+
+void print_CLI_error(void)
+{
+	printf("Usage: ./multibench <options>\n");
+	printf("Options include:\n");
+	printf("  -s <size>        Size of H-M Benchmark to run (small, large)\n");
+	printf("  -l <lookups>     Number of Cross-section (XS) lookups per particle history\n");
+	printf("  -p <particles>   Number of particle histories\n");
+	printf("  -P <poles>       Average Number of Poles per Nuclide\n");
+	printf("  -W <poles>       Average Number of Windows per Nuclide\n");
+	printf("  -d               Disables Temperature Dependence (Doppler Broadening)\n");
+	printf("Default is equivalent to: -s large -l 34 -p 300000 -P 1000 -W 100\n");
+	printf("See readme for full description of default run values\n");
+	exit(4);
+}
+
+void print_input_summary(Input input)
+{
+	printf("Programming Model:           RAJA\n");
+
+#if defined(RAJA_ENABLE_CUDA)
+	cudaDeviceProp prop;
+	int device;
+	cudaGetDevice(&device);
+	cudaGetDeviceProperties ( &prop, device );
+	printf("GPU Device:                 %s\n", prop.name); 
+#elif defined(RAJA_ENABLE_HIP)
+	hipDeviceProp_t prop;
+	int device;
+	hipGetDevice(&device);
+	hipGetDeviceProperties ( &prop, device );
+	printf("GPU Device:                  %s\n", prop.name);
+#endif
+
+
+
+	// Calculate Estimate of Memory Usage
+	size_t mem = get_mem_estimate(input);
+
+	if( input.simulation_method == EVENT_BASED )
+		printf("Simulation Method:           Event Based\n");
+	else
+		printf("Simulation Method:           History Based\n");
+	printf("Materials:                   12\n");
+	printf("H-M Benchmark Size:          ");
+	if( input.HM == 0 )
+		printf("Small\n");
+	else
+		printf("Large\n");
+	if( input.doppler == 1 )
+		printf("Temperature Dependence:      ON\n");
+	else
+		printf("Temperature Dependence:      OFF\n");
+	printf("Total Nuclides:              %d\n", input.n_nuclides);
+	printf("Avg Poles per Nuclide:       "); fancy_int(input.avg_n_poles);
+	printf("Avg Windows per Nuclide:     "); fancy_int(input.avg_n_windows);
+
+	int lookups = input.lookups;
+	if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("Particles:                   "); fancy_int(input.particles);
+		printf("XS Lookups per Particle:     "); fancy_int(input.lookups);
+		lookups *= input.particles;
+	}
+	printf("Total XS Lookups:            "); fancy_int(lookups);
+	printf("Est. Memory Usage (MB):      %.1lf\n", mem / 1024.0 / 1024.0);
+}
+
+int validate_and_print_results(Input input, double runtime, unsigned long vhash)
+{
+  printf("NOTE: Timings are estimated -- use nvprof/nsys/iprof/rocprof for formal analysis\n");
+	printf("Runtime:               %.3lf seconds\n", runtime);
+	int lookups = 0;
+	if( input.simulation_method == HISTORY_BASED )
+		lookups = input.lookups*input.particles;
+	else
+		lookups = input.lookups;
+	printf("Lookups:               "); fancy_int(lookups);
+	printf("Lookups/s:             "); fancy_int((double) lookups / (runtime));
+
+	int is_invalid = 1;
+
+	unsigned long long large = 0;
+	unsigned long long small = 0;
+	if(input.simulation_method == HISTORY_BASED )
+	{
+		large = 351485;
+		small = 879693;
+	}
+	else if( input.simulation_method == EVENT_BASED )
+	{
+		large = 358389;
+		small = 880018;
+	}
+
+	if( input.HM  == LARGE )
+	{
+		if( vhash == large )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+	else if( input.HM  == SMALL )
+	{
+		if( vhash == small )
+		{
+			printf("Verification checksum: %lu (Valid)\n", vhash);
+			is_invalid = 0;
+		}
+		else
+			printf("Verification checksum: %lu (WARNING - INAVALID CHECKSUM!)\n", vhash);
+	}
+
+	return is_invalid;
+}
diff --git a/raja/main.cpp b/raja/main.cpp
new file mode 100644
index 0000000..1816dfb
--- /dev/null
+++ b/raja/main.cpp
@@ -0,0 +1,74 @@
+#include "rsbench.h"
+
+int main(int argc, char * argv[])
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+
+	int version = 13;
+	double start, stop;
+	
+	// Process CLI Fields
+	Input input = read_CLI( argc, argv );
+	
+	// =====================================================================
+	// Print-out of Input Summary
+	// =====================================================================
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	print_input_summary(input);
+
+	// =====================================================================
+	// Intialize Simulation Data Structures
+	// =====================================================================
+	border_print();
+	center_print("INITIALIZATION", 79);
+	border_print();
+	
+	start = get_time();
+	
+	SimulationData SD = initialize_simulation(input);
+	stop = get_time();
+
+	printf("Initialization Complete. (%.2lf seconds)\n", stop-start);
+	
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation Begins
+	// =====================================================================
+	border_print();
+	center_print("SIMULATION", 79);
+	border_print();
+
+	unsigned long vhash = 0;
+	double elapsed_time = 0;
+
+	// Run Simulation
+	if( input.simulation_method == EVENT_BASED) {
+		run_event_based_simulation(input, SD, &vhash, &elapsed_time);
+	} else if( input.simulation_method == HISTORY_BASED) {
+		printf("History-based simulation not implemented in RAJA code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+	// Final hash step
+	vhash = vhash % 999983;
+
+	printf("Simulation Complete.\n");
+
+	release_memory(SD);
+
+	// =====================================================================
+	// Print / Save Results and Exit
+	// =====================================================================
+	border_print();
+	center_print("RESULTS", 79);
+	border_print();
+
+	int is_invalid = validate_and_print_results(input, elapsed_time, vhash);
+
+	border_print();
+
+	return is_invalid;
+}
diff --git a/raja/material.cpp b/raja/material.cpp
new file mode 100644
index 0000000..f5bee57
--- /dev/null
+++ b/raja/material.cpp
@@ -0,0 +1,133 @@
+#include "rsbench.h"
+
+// Handles all material creation tasks - returns Material struct
+SimulationData get_materials(Input input, uint64_t * seed)
+{
+	SimulationData SD;
+
+	SD.num_nucs = load_num_nucs(input);
+	SD.length_num_nucs = 12;
+
+	SD.mats = load_mats(input, SD.num_nucs, &SD.max_num_nucs, &SD.length_mats);
+
+	SD.concs = load_concs(SD.num_nucs, seed, SD.max_num_nucs);
+	SD.length_concs = 12 * SD.max_num_nucs; 
+
+	return SD;
+}
+
+// num_nucs represents the number of nuclides that each material contains
+int * load_num_nucs(Input input)
+{
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	int * num_nucs = static_cast<int*>(allocator.allocate(12 * sizeof(int)));
+	
+	// Material 0 is a special case (fuel). The H-M small reactor uses
+	// 34 nuclides, while H-M larges uses 300.
+	if( input.n_nuclides == 68 )
+		num_nucs[0]  = 34; // HM Small is 34, H-M Large is 321
+	else
+		num_nucs[0]  = 321; // HM Small is 34, H-M Large is 321
+
+	num_nucs[1]  = 5;
+	num_nucs[2]  = 4;
+	num_nucs[3]  = 4;
+	num_nucs[4]  = 27;
+	num_nucs[5]  = 21;
+	num_nucs[6]  = 21;
+	num_nucs[7]  = 21;
+	num_nucs[8]  = 21;
+	num_nucs[9]  = 21;
+	num_nucs[10] = 9;
+	num_nucs[11] = 9;
+
+	return num_nucs;
+}
+
+// Assigns an array of nuclide ID's to each material
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats )
+{
+	*max_num_nucs = 0;
+	int num_mats = 12;
+	for( int m = 0; m < num_mats; m++ )
+	{
+		if( num_nucs[m] > *max_num_nucs )
+			*max_num_nucs = num_nucs[m];
+	}
+
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	int * mats = static_cast<int*>(allocator.allocate(num_mats * (*max_num_nucs) * sizeof(int)));
+	*length_mats = num_mats * (*max_num_nucs);
+
+	// Small H-M has 34 fuel nuclides
+	int mats0_Sml[] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53, 
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	// Large H-M has 300 fuel nuclides
+	int mats0_Lrg[321] =  { 58, 59, 60, 61, 40, 42, 43, 44, 45, 46, 1, 2, 3, 7,
+	                 8, 9, 10, 29, 57, 47, 48, 0, 62, 15, 33, 34, 52, 53,
+	                 54, 55, 56, 18, 23, 41 }; //fuel
+	for( int i = 0; i < 321-34; i++ )
+		mats0_Lrg[34+i] = 68 + i; // H-M large adds nuclides to fuel only
+	
+	// These are the non-fuel materials	
+	int mats1[] =  { 63, 64, 65, 66, 67 }; // cladding
+	int mats2[] =  { 24, 41, 4, 5 }; // cold borated water
+	int mats3[] =  { 24, 41, 4, 5 }; // hot borated water
+	int mats4[] =  { 19, 20, 21, 22, 35, 36, 37, 38, 39, 25, 27, 28, 29,
+	                 30, 31, 32, 26, 49, 50, 51, 11, 12, 13, 14, 6, 16,
+	                 17 }; // RPV
+	int mats5[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // lower radial reflector
+	int mats6[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top reflector / plate
+	int mats7[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom plate
+	int mats8[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // bottom nozzle
+	int mats9[] =  { 24, 41, 4, 5, 19, 20, 21, 22, 35, 36, 37, 38, 39, 25,
+	                 49, 50, 51, 11, 12, 13, 14 }; // top nozzle
+	int mats10[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // top of FA's
+	int mats11[] = { 24, 41, 4, 5, 63, 64, 65, 66, 67 }; // bottom FA's
+	
+	// H-M large v small dependency
+	if( input.n_nuclides == 68 )
+		memcpy( mats,  mats0_Sml,  num_nucs[0]  * sizeof(int) );	
+	else
+		memcpy( mats,  mats0_Lrg,  num_nucs[0]  * sizeof(int) );
+	
+	// Copy other materials
+	memcpy( mats + *max_num_nucs * 1,  mats1,  num_nucs[1]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 2,  mats2,  num_nucs[2]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 3,  mats3,  num_nucs[3]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 4,  mats4,  num_nucs[4]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 5,  mats5,  num_nucs[5]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 6,  mats6,  num_nucs[6]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 7,  mats7,  num_nucs[7]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 8,  mats8,  num_nucs[8]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 9,  mats9,  num_nucs[9]  * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 10, mats10, num_nucs[10] * sizeof(int) );	
+	memcpy( mats + *max_num_nucs * 11, mats11, num_nucs[11] * sizeof(int) );	
+
+	return mats;
+}
+
+// Creates a randomized array of 'concentrations' of nuclides in each mat
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs )
+{
+	auto& rm = umpire::ResourceManager::getInstance();
+	umpire::Allocator allocator = rm.getAllocator("HOST");
+
+	double * concs = static_cast<double*>(allocator.allocate(12 * max_num_nucs * sizeof(double)));
+
+	for( int i = 0; i < 12; i++ )
+		for( int j = 0; j < num_nucs[i]; j++ )
+			concs[i * max_num_nucs + j] = LCG_random_double(seed);
+
+	return concs;
+}
+
diff --git a/raja/rsbench.h b/raja/rsbench.h
new file mode 100644
index 0000000..af99cbb
--- /dev/null
+++ b/raja/rsbench.h
@@ -0,0 +1,137 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<sys/time.h>
+#include<math.h>
+#include<string.h>
+#include<stdint.h>
+#include<float.h>
+#include<assert.h>
+#include <chrono> 
+
+#include <RAJA/RAJA.hpp>
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
+
+#define PI 3.14159265359
+
+// typedefs
+typedef enum __hm{SMALL, LARGE, XL, XXL} HM_size;
+
+#define HISTORY_BASED 1
+#define EVENT_BASED 2
+
+#define STARTING_SEED 1070
+#define INITIALIZATION_SEED 42
+
+typedef struct{
+	double r;
+	double i;
+} RSComplex;
+
+typedef struct{
+	int nthreads;
+	int n_nuclides;
+	int lookups;
+	HM_size HM;
+	int avg_n_poles;
+	int avg_n_windows;
+	int numL;
+	int doppler;
+	int particles;
+	int simulation_method;
+	int kernel_id;
+} Input;
+
+typedef struct{
+	RSComplex MP_EA;
+	RSComplex MP_RT;
+	RSComplex MP_RA;
+	RSComplex MP_RF;
+	short int l_value;
+} Pole;
+
+typedef struct{
+	double T;
+	double A;
+	double F;
+	int start;
+	int end;
+} Window;
+
+typedef struct{
+	int * n_poles;
+	unsigned long length_n_poles;
+	int * n_windows;
+	unsigned long length_n_windows;
+	Pole * poles;
+	unsigned long length_poles;
+	Window * windows;
+	unsigned long length_windows;
+	double * pseudo_K0RS;
+	unsigned long length_pseudo_K0RS;
+	int * num_nucs;
+	unsigned long length_num_nucs;
+	int * mats;
+	unsigned long length_mats;
+	double * concs;
+	unsigned long length_concs;
+	int max_num_nucs;
+	int max_num_poles;
+	int max_num_windows;
+	double * p_energy_samples;
+	unsigned long length_p_energy_samples;
+	int * mat_samples;
+	unsigned long length_mat_samples;
+	unsigned long  * verification;
+	unsigned long length_verification;
+} SimulationData;
+
+// io.c
+void logo(int version);
+void center_print(const char *s, int width);
+void border_print(void);
+void fancy_int( int a );
+Input read_CLI( int argc, char * argv[] );
+void print_CLI_error(void);
+void print_input_summary(Input input);
+int validate_and_print_results(Input input, double runtime, unsigned long vhash);
+
+// init.c
+SimulationData initialize_simulation( Input input );
+int * generate_n_poles( Input input,  uint64_t * seed );
+int * generate_n_windows( Input input ,  uint64_t * seed);
+Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_num_poles );
+Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
+double * generate_pseudo_K0RS( Input input, uint64_t * seed );
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
+void release_memory(SimulationData SD);
+void release_device_memory(SimulationData GSD);
+
+// material.c
+int * load_num_nucs(Input input);
+int * load_mats( Input input, int * num_nucs, int * max_num_nucs, unsigned long * length_mats );
+double * load_concs( int * num_nucs, uint64_t * seed, int max_num_nucs );
+SimulationData get_materials(Input input, uint64_t * seed);
+
+// utils.c
+size_t get_mem_estimate( Input input );
+double get_time(void);
+
+// simulation.c
+void run_event_based_simulation(Input input, SimulationData SD, unsigned long * vhash_result, double * elapsed_time);
+RAJA_HOST_DEVICE void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+RAJA_HOST_DEVICE void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles);
+RAJA_HOST_DEVICE void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
+RAJA_HOST_DEVICE int pick_mat( uint64_t * seed );
+RAJA_HOST_DEVICE void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors );
+RAJA_HOST_DEVICE RSComplex fast_nuclear_W( RSComplex Z );
+RAJA_HOST_DEVICE double LCG_random_double(uint64_t * seed);
+RAJA_HOST_DEVICE uint64_t LCG_random_int(uint64_t * seed);
+RAJA_HOST_DEVICE uint64_t fast_forward_LCG(uint64_t seed, uint64_t n);
+RAJA_HOST_DEVICE RSComplex c_add( RSComplex A, RSComplex B);
+RAJA_HOST_DEVICE RSComplex c_sub( RSComplex A, RSComplex B);
+RAJA_HOST_DEVICE RSComplex c_mul( RSComplex A, RSComplex B);
+RAJA_HOST_DEVICE RSComplex c_div( RSComplex A, RSComplex B);
+RAJA_HOST_DEVICE double c_abs( RSComplex A);
+RAJA_HOST_DEVICE double fast_exp(double x);
+RAJA_HOST_DEVICE RSComplex fast_cexp( RSComplex z );
diff --git a/raja/simulation.cpp b/raja/simulation.cpp
new file mode 100644
index 0000000..72a7c3e
--- /dev/null
+++ b/raja/simulation.cpp
@@ -0,0 +1,522 @@
+#include "rsbench.h"
+
+////////////////////////////////////////////////////////////////////////////////////
+// BASELINE FUNCTIONS
+////////////////////////////////////////////////////////////////////////////////////
+// All "baseline" code is at the top of this file. The baseline code is a simple
+// implementation of the algorithm, with only minor GPU optimizations in place.
+// Following these functions are a number of optimized variants,
+// which each deploy a different combination of optimizations strategies. By
+// default, RSBench will only run the baseline implementation. Optimized variants
+// must be specifically selected using the "-k <optimized variant ID>" command
+// line argument.
+////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(RAJA_ENABLE_CUDA)
+using policy = RAJA::cuda_exec<256>;
+#elif defined(RAJA_ENABLE_HIP)
+using policy = RAJA::hip_exec<256>;
+#elif defined(RAJA_ENABLE_OPENMP)
+using policy = RAJA::omp_parallel_for_exec;
+#else
+using policy = RAJA::seq_exec;
+#endif
+
+void run_event_based_simulation(Input input, SimulationData SD, unsigned long *vhash_result, double * elapsed_time) {
+	double start, stop;
+	auto& rm = umpire::ResourceManager::getInstance();
+	start = get_time();
+	////////////////////////////////////////////////////////////////////////////////
+	// Move Data to Device
+	////////////////////////////////////////////////////////////////////////////////
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+	SimulationData GSD = move_simulation_data_to_device(input, SD);
+	stop = get_time();
+	printf("Initialization Complete. (%.2lf seconds)\n", stop - start);
+#else
+	SimulationData GSD = SD;
+#endif
+
+	////////////////////////////////////////////////////////////////////////////////
+	// Configure & Launch Simulation Kernel
+	////////////////////////////////////////////////////////////////////////////////
+	printf("Running baseline event-based simulation on device...\n");
+
+	start = get_time();
+
+	int nthreads = 256;
+	int nblocks = ceil( (double) input.lookups / (double) nthreads);
+
+	RAJA::forall<policy>(RAJA::RangeSegment(0, input.lookups), [=] RAJA_HOST_DEVICE (int i) {
+		if (i < input.lookups) {
+			uint64_t seed = STARTING_SEED;
+			
+			seed = fast_forward_LCG(seed, 2*i);
+			double E = LCG_random_double(&seed);
+			int mat  = pick_mat(&seed);
+
+			double macro_xs[4] = {0};
+
+			calculate_macro_xs(macro_xs,
+							   mat,
+							   E,
+							   input,
+							   GSD.num_nucs,
+							   GSD.mats,
+							   GSD.max_num_nucs,
+							   GSD.concs,
+							   GSD.n_windows,
+							   GSD.pseudo_K0RS,
+							   GSD.windows,
+							   GSD.poles,
+							   GSD.max_num_windows,
+							   GSD.max_num_poles);
+
+			double max = -DBL_MAX;
+			int max_idx = 0;
+			for (int x = 0; x < 4; x++) {
+				if (macro_xs[x] > max) {
+					max = macro_xs[x];
+					max_idx = x;
+				}
+			}
+
+			GSD.verification[i] = max_idx+1;
+		}
+	});
+
+	rm.copy(SD.verification, GSD.verification);
+	
+	////////////////////////////////////////////////////////////////////////////////
+	// Reduce Verification Results
+	////////////////////////////////////////////////////////////////////////////////
+	printf("Reducing verification results...\n");
+
+	unsigned long long verification_scalar = 0;
+	for(int i = 0; i < input.lookups; i++ )
+		verification_scalar += SD.verification[i];
+
+	*vhash_result = verification_scalar;
+
+	stop = get_time();
+
+	*elapsed_time = stop - start;
+
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+	release_device_memory(GSD);
+#endif
+}
+
+RAJA_HOST_DEVICE void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles ) 
+{
+	// zero out macro vector
+	for( int i = 0; i < 4; i++ )
+		macro_xs[i] = 0;
+
+	// for nuclide in mat
+	for( int i = 0; i < num_nucs[mat]; i++ )
+	{
+		double micro_xs[4];
+		int nuc = mats[mat * max_num_nucs + i];
+
+		if( input.doppler == 1 )
+			calculate_micro_xs_doppler( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+		else
+			calculate_micro_xs( micro_xs, nuc, E, input, n_windows, pseudo_K0Rs, windows, poles, max_num_windows, max_num_poles);
+
+		for( int j = 0; j < 4; j++ )
+		{
+			macro_xs[j] += micro_xs[j] * concs[mat * max_num_nucs + i];
+		}
+		// Debug
+		/*
+		printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+		E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+		*/
+	}
+
+	// Debug
+	/*
+	printf("E = %.2lf, mat = %d, macro_xs[0] = %.2lf, macro_xs[1] = %.2lf, macro_xs[2] = %.2lf, macro_xs[3] = %.2lf\n",
+	E, mat, macro_xs[0], macro_xs[1], macro_xs[2], macro_xs[3] );
+	*/
+}
+
+// No Temperature dependence (i.e., 0K evaluation)
+RAJA_HOST_DEVICE void calculate_micro_xs( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles)
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		RSComplex PSIIKI;
+		RSComplex CDUM;
+		Pole pole = poles[nuc * max_num_poles + i];
+		RSComplex t1 = {0, 1};
+		RSComplex t2 = {sqrt(E), 0 };
+		PSIIKI = c_div( t1 , c_sub(pole.MP_EA,t2) );
+		RSComplex E_c = {E, 0};
+		CDUM = c_div(PSIIKI, E_c);
+		sigT += (c_mul(pole.MP_RT, c_mul(CDUM, sigTfactors[pole.l_value])) ).r;
+		sigA += (c_mul( pole.MP_RA, CDUM)).r;
+		sigF += (c_mul(pole.MP_RF, CDUM)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// Temperature Dependent Variation of Kernel
+// (This involves using the Complex Faddeeva function to
+// Doppler broaden the poles within the window)
+RAJA_HOST_DEVICE void calculate_micro_xs_doppler( double * micro_xs, int nuc, double E, Input input, int * n_windows, double * pseudo_K0RS, Window * windows, Pole * poles, int max_num_windows, int max_num_poles )
+{
+	// MicroScopic XS's to Calculate
+	double sigT;
+	double sigA;
+	double sigF;
+	double sigE;
+
+	// Calculate Window Index
+	double spacing = 1.0 / n_windows[nuc];
+	int window = (int) ( E / spacing );
+	if( window == n_windows[nuc] )
+		window--;
+
+	// Calculate sigTfactors
+	RSComplex sigTfactors[4]; // Of length input.numL, which is always 4
+	calculate_sig_T(nuc, E, input, pseudo_K0RS, sigTfactors );
+
+	// Calculate contributions from window "background" (i.e., poles outside window (pre-calculated)
+	Window w = windows[nuc * max_num_windows + window];
+	sigT = E * w.T;
+	sigA = E * w.A;
+	sigF = E * w.F;
+
+	double dopp = 0.5;
+
+	// Loop over Poles within window, add contributions
+	for( int i = w.start; i < w.end; i++ )
+	{
+		Pole pole = poles[nuc * max_num_poles + i];
+
+		// Prep Z
+		RSComplex E_c = {E, 0};
+		RSComplex dopp_c = {dopp, 0};
+		RSComplex Z = c_mul(c_sub(E_c, pole.MP_EA), dopp_c);
+
+		// Evaluate Fadeeva Function
+		RSComplex faddeeva = fast_nuclear_W( Z );
+
+		// Update W
+		sigT += (c_mul( pole.MP_RT, c_mul(faddeeva, sigTfactors[pole.l_value]) )).r;
+		sigA += (c_mul( pole.MP_RA , faddeeva)).r;
+		sigF += (c_mul( pole.MP_RF , faddeeva)).r;
+	}
+
+	sigE = sigT - sigA;
+
+	micro_xs[0] = sigT;
+	micro_xs[1] = sigA;
+	micro_xs[2] = sigF;
+	micro_xs[3] = sigE;
+}
+
+// picks a material based on a probabilistic distribution
+RAJA_HOST_DEVICE int pick_mat( uint64_t * seed )
+{
+	// I have a nice spreadsheet supporting these numbers. They are
+	// the fractions (by volume) of material in the core. Not a 
+	// *perfect* approximation of where XS lookups are going to occur,
+	// but this will do a good job of biasing the system nonetheless.
+
+	double dist[12];
+	dist[0]  = 0.140;	// fuel
+	dist[1]  = 0.052;	// cladding
+	dist[2]  = 0.275;	// cold, borated water
+	dist[3]  = 0.134;	// hot, borated water
+	dist[4]  = 0.154;	// RPV
+	dist[5]  = 0.064;	// Lower, radial reflector
+	dist[6]  = 0.066;	// Upper reflector / top plate
+	dist[7]  = 0.055;	// bottom plate
+	dist[8]  = 0.008;	// bottom nozzle
+	dist[9]  = 0.015;	// top nozzle
+	dist[10] = 0.025;	// top of fuel assemblies
+	dist[11] = 0.013;	// bottom of fuel assemblies
+
+	double roll = LCG_random_double(seed);
+
+	// makes a pick based on the distro
+	for( int i = 0; i < 12; i++ )
+	{
+		double running = 0;
+		for( int j = i; j > 0; j-- )
+			running += dist[j];
+		if( roll < running )
+			return i;
+	}
+
+	return 0;
+}
+
+RAJA_HOST_DEVICE void calculate_sig_T( int nuc, double E, Input input, double * pseudo_K0RS, RSComplex * sigTfactors )
+{
+	double phi;
+
+	for( int i = 0; i < 4; i++ )
+	{
+		phi = pseudo_K0RS[nuc * input.numL + i] * sqrt(E);
+
+		if( i == 1 )
+			phi -= - atan( phi );
+		else if( i == 2 )
+			phi -= atan( 3.0 * phi / (3.0 - phi*phi));
+		else if( i == 3 )
+			phi -= atan(phi*(15.0-phi*phi)/(15.0-6.0*phi*phi));
+
+		phi *= 2.0;
+
+		sigTfactors[i].r = cos(phi);
+		sigTfactors[i].i = -sin(phi);
+	}
+}
+
+// This function uses a combination of the Abrarov Approximation
+// and the QUICK_W three term asymptotic expansion.
+// Only expected to use Abrarov ~0.5% of the time.
+RAJA_HOST_DEVICE RSComplex fast_nuclear_W( RSComplex Z )
+{
+	// Abrarov 
+	if( c_abs(Z) < 6.0 )
+	{
+		// Precomputed parts for speeding things up
+		// (N = 10, Tm = 12.0)
+		RSComplex prefactor = {0, 8.124330e+01};
+		double an[10] = {
+			2.758402e-01,
+			2.245740e-01,
+			1.594149e-01,
+			9.866577e-02,
+			5.324414e-02,
+			2.505215e-02,
+			1.027747e-02,
+			3.676164e-03,
+			1.146494e-03,
+			3.117570e-04
+		};
+		double neg_1n[10] = {
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0,
+			-1.0,
+			1.0
+		};
+
+		double denominator_left[10] = {
+			9.869604e+00,
+			3.947842e+01,
+			8.882644e+01,
+			1.579137e+02,
+			2.467401e+02,
+			3.553058e+02,
+			4.836106e+02,
+			6.316547e+02,
+			7.994380e+02,
+			9.869604e+02
+		};
+
+		RSComplex t1 = {0, 12};
+		RSComplex t2 = {12, 0};
+		RSComplex i = {0,1};
+		RSComplex one = {1, 0};
+		RSComplex W = c_div(c_mul(i, ( c_sub(one, fast_cexp(c_mul(t1, Z))) )) , c_mul(t2, Z));
+		RSComplex sum = {0,0};
+		for( int n = 0; n < 10; n++ )
+		{
+			RSComplex t3 = {neg_1n[n], 0};
+			RSComplex top = c_sub(c_mul(t3, fast_cexp(c_mul(t1, Z))), one);
+			RSComplex t4 = {denominator_left[n], 0};
+			RSComplex t5 = {144, 0};
+			RSComplex bot = c_sub(t4, c_mul(t5,c_mul(Z,Z)));
+			RSComplex t6 = {an[n], 0};
+			sum = c_add(sum, c_mul(t6, c_div(top,bot)));
+		}
+		W = c_add(W, c_mul(prefactor, c_mul(Z, sum)));
+		return W;
+	}
+	else
+	{
+		// QUICK_2 3 Term Asymptotic Expansion (Accurate to O(1e-6)).
+		// Pre-computed parameters
+		RSComplex a = {0.512424224754768462984202823134979415014943561548661637413182,0};
+		RSComplex b = {0.275255128608410950901357962647054304017026259671664935783653, 0};
+		RSComplex c = {0.051765358792987823963876628425793170829107067780337219430904, 0};
+		RSComplex d = {2.724744871391589049098642037352945695982973740328335064216346, 0};
+
+		RSComplex i = {0,1};
+		RSComplex Z2 = c_mul(Z, Z);
+		// Three Term Asymptotic Expansion
+		RSComplex W = c_mul(c_mul(Z,i), (c_add(c_div(a,(c_sub(Z2, b))) , c_div(c,(c_sub(Z2, d))))));
+
+		return W;
+	}
+}
+
+RAJA_HOST_DEVICE double LCG_random_double(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return (double) (*seed) / (double) m;
+}	
+
+RAJA_HOST_DEVICE uint64_t LCG_random_int(uint64_t * seed)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	const uint64_t a = 2806196910506780709ULL;
+	const uint64_t c = 1ULL;
+	*seed = (a * (*seed) + c) % m;
+	return *seed;
+}	
+
+RAJA_HOST_DEVICE uint64_t fast_forward_LCG(uint64_t seed, uint64_t n)
+{
+	const uint64_t m = 9223372036854775808ULL; // 2^63
+	uint64_t a = 2806196910506780709ULL;
+	uint64_t c = 1ULL;
+
+	n = n % m;
+
+	uint64_t a_new = 1;
+	uint64_t c_new = 0;
+
+	while(n > 0) 
+	{
+		if(n & 1)
+		{
+			a_new *= a;
+			c_new = c_new * a + c;
+		}
+		c *= (a + 1);
+		a *= a;
+
+		n >>= 1;
+	}
+
+	return (a_new * seed + c_new) % m;
+}
+
+// Complex arithmetic functions
+
+RAJA_HOST_DEVICE RSComplex c_add( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r + B.r;
+	C.i = A.i + B.i;
+	return C;
+}
+
+RAJA_HOST_DEVICE RSComplex c_sub( RSComplex A, RSComplex B)
+{
+	RSComplex C;
+	C.r = A.r - B.r;
+	C.i = A.i - B.i;
+	return C;
+}
+
+RAJA_HOST_DEVICE RSComplex c_mul( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	C.r = (a*c) - (b*d);
+	C.i = (a*d) + (b*c);
+	return C;
+}
+
+RAJA_HOST_DEVICE RSComplex c_div( RSComplex A, RSComplex B)
+{
+	double a = A.r;
+	double b = A.i;
+	double c = B.r;
+	double d = B.i;
+	RSComplex C;
+	double denom = c*c + d*d;
+	C.r = ( (a*c) + (b*d) ) / denom;
+	C.i = ( (b*c) - (a*d) ) / denom;
+	return C;
+}
+
+RAJA_HOST_DEVICE double c_abs( RSComplex A)
+{
+	return sqrt(A.r*A.r + A.i * A.i);
+}
+
+
+// Fast (but inaccurate) exponential function
+// Written By "ACMer":
+// https://codingforspeed.com/using-faster-exponential-approximation/
+// We use our own to avoid small differences in compiler specific
+// exp() intrinsic implementations that make it difficult to verify
+// if the code is working correctly or not.
+RAJA_HOST_DEVICE double fast_exp(double x)
+{
+  x = 1.0 + x * 0.000244140625;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  x *= x; x *= x; x *= x; x *= x;
+  return x;
+}
+
+// Implementation based on:
+// z = x + iy
+// cexp(z) = e^x * (cos(y) + i * sin(y))
+RAJA_HOST_DEVICE RSComplex fast_cexp( RSComplex z )
+{
+	double x = z.r;
+	double y = z.i;
+
+	// For consistency across architectures, we
+	// will use our own exponetial implementation
+	//double t1 = exp(x);
+	double t1 = fast_exp(x);
+	double t2 = cos(y);
+	double t3 = sin(y);
+	RSComplex t4 = {t2, t3};
+	RSComplex t5 = {t1, 0};
+	RSComplex result = c_mul(t5, (t4));
+	return result;
+}	
diff --git a/raja/utils.cpp b/raja/utils.cpp
new file mode 100644
index 0000000..ffd966e
--- /dev/null
+++ b/raja/utils.cpp
@@ -0,0 +1,31 @@
+#include "rsbench.h"
+
+size_t get_mem_estimate( Input input )
+{
+	size_t poles = input.n_nuclides * input.avg_n_poles * sizeof(Pole) + input.n_nuclides * sizeof(Pole *);
+	size_t windows = input.n_nuclides * input.avg_n_windows * sizeof(Window) + input.n_nuclides * sizeof(Window *);
+	size_t pseudo_K0RS = input.n_nuclides * input.numL * sizeof( double ) + input.n_nuclides * sizeof(double);
+	size_t other = input.n_nuclides * 2 * sizeof(int);
+
+	size_t total = poles + windows + pseudo_K0RS + other;
+	
+	return total;
+}
+
+double get_time(void)
+{
+
+	// If using C, we can do this:
+	/* 
+	struct timeval timecheck;
+	gettimeofday(&timecheck, NULL);
+	long ms = (long)timecheck.tv_sec * 1000 + (long)timecheck.tv_usec / 1000;
+	double time = (double) ms / 1000.0;
+	return time;
+	*/
+	
+	// If using C++, we can do this:
+	unsigned long us_since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch() / std::chrono::microseconds(1);
+	return (double) us_since_epoch / 1.0e6;
+
+}

From 0922cabc6fab92d146406f5358b6eded04e407fd Mon Sep 17 00:00:00 2001
From: Josh Davis <jhdavis@udel.edu>
Date: Fri, 3 Nov 2023 18:32:25 -0700
Subject: [PATCH 14/14] Add Makefile parameters for GPU target type, update exe
 name

---
 openacc/Makefile | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/openacc/Makefile b/openacc/Makefile
index 037046d..392248a 100644
--- a/openacc/Makefile
+++ b/openacc/Makefile
@@ -6,12 +6,14 @@ COMPILER  = nvidia
 OPTIMIZE  = yes
 DEBUG     = no
 PROFILE   = no
+SM_VERSION ?= 80
+AMD_TARGET ?= gfx908
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
 main.c \
@@ -49,7 +51,7 @@ endif
 # LLVM Compiler Targeting A100 -- Change SM Level to Target Other GPUs
 ifeq ($(COMPILER),llvm)
   CC = clang
-  CFLAGS += -fopenacc -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_80
+  CFLAGS += -fopenacc -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_$(SM_VERSION)
 endif
 
 # IBM XL Compiler
@@ -61,13 +63,13 @@ endif
 # NVIDIA Compiler Targeting A100 -- Change SM Level to Target Other GPUs
 ifeq ($(COMPILER),nvidia)
   CC = nvc
-	CFLAGS += -acc -Minfo=accel -gpu=cc80 -openmp
+	CFLAGS += -acc -Minfo=accel -gpu=cc$(SM_VERSION) -openmp
 endif
 
 # AOMP Targeting MI100 -- Change march to Target Other GPUs
 ifeq ($(COMPILER),amd)
   CC = clang
-  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908
+  CFLAGS += -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$(AMD_TARGET)
 endif
 
 # Debug Flags