ANL-CESAR · joy-kitson · Oct 19, 2023 · Oct 25, 2023 · Oct 26, 2023 · Oct 26, 2023
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+*.o
+RSBench
+rsbench
+*.cpp1*
+*.cpp4*
+*.ptx
+*.cubin
+*.cudafe1*
+*.fatbin*
+*.module_id
+[Bb][Uu][Ii][Ll][Dd]*/
diff --git a/README.md b/README.md
@@ -44,6 +44,8 @@ This version of RSBench is written in SYCL, and can be used for CPU, GPU, FPGA,
 5. **RSBench/hip**
 This version of RSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.
 
+6. **RSBench/RAJA**
+This version of RSBench is written using the RAJA programming model and Umpire to handle memory management. This programming model can be run on either CPU or GPU depending on how RAJA is configured. You will likely need to edit the makefile if RAJA is installed with multiple backends.
 ## Compilation
 
 To compile RSBench with default settings, navigate to your selected source directory and use the following command:

diff --git a/cuda/Makefile b/cuda/Makefile
@@ -6,13 +6,13 @@ COMPILER    = nvidia
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
-SM_VERSION  = 80
+SM_VERSION  ?= 80
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
 main.cu \
@@ -68,7 +68,7 @@ $(program): $(obj) rsbench.cuh Makefile
 	$(CC) $(CFLAGS) -c $< -o $@
 
 clean:
-	rm -rf rsbench $(obj)
+	rm -rf $(program) $(obj)
 
 edit:
 	vim -p $(source) rsbench.cuh

diff --git a/cuda/init.cu b/cuda/init.cu
@@ -102,9 +102,34 @@ SimulationData initialize_simulation( Input input )
 	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
 	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
 
+	SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));
+
 	return SD;
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.n_poles);
+	free(SD.n_windows);
+	free(SD.poles);
+	free(SD.windows);
+	free(SD.pseudo_K0RS);
+}
+
+void release_device_memory(SimulationData GSD) {
+	cudaFree(GSD.num_nucs);
+	cudaFree(GSD.concs);
+	cudaFree(GSD.mats);
+	cudaFree(GSD.n_poles);
+	cudaFree(GSD.n_windows);
+	cudaFree(GSD.poles);
+	cudaFree(GSD.windows);
+	cudaFree(GSD.pseudo_K0RS);
+	cudaFree(GSD.verification);
+}
+
 int * generate_n_poles( Input input, uint64_t * seed )
 {
 	int total_resonances = input.avg_n_poles * input.n_nuclides;

diff --git a/cuda/main.cu b/cuda/main.cu
@@ -30,7 +30,6 @@ int main(int argc, char * argv[])
 	start = get_time();
 
 	SimulationData SD = initialize_simulation( input );
-	SimulationData GSD = move_simulation_data_to_device( input, SD );
 
 	stop = get_time();
 
@@ -44,17 +43,15 @@ int main(int argc, char * argv[])
 	border_print();
 
 	unsigned long vhash = 0;
-
-	// Run Simulation
-	start = get_time();
+	double elapsed_time = 0;
 
 	// Run simulation
 	if( input.simulation_method == EVENT_BASED )
 	{
 		if( input.kernel_id == 0 )
-			run_event_based_simulation(input, GSD, &vhash );
+			run_event_based_simulation(input, SD, &vhash, &elapsed_time);
 		else if( input.kernel_id == 1 )
-			run_event_based_simulation_optimization_1(input, GSD, &vhash );
+			run_event_based_simulation_optimization_1(input, SD, &vhash );
 		else
 		{
 			printf("Error: No kernel ID %d found!\n", input.kernel_id);
@@ -67,21 +64,22 @@ int main(int argc, char * argv[])
 		exit(1);
 	}
 
-	stop = get_time();
 
 	// Final hash step
 	vhash = vhash % 999983;
 
 	printf("Simulation Complete.\n");
 
+	release_memory(SD);
+
 	// =====================================================================
 	// Print / Save Results and Exit
 	// =====================================================================
 	border_print();
 	center_print("RESULTS", 79);
 	border_print();
 
-	int is_invalid = validate_and_print_results(input, stop-start, vhash);
+	int is_invalid = validate_and_print_results(input, elapsed_time, vhash);
 
 	border_print();
 

diff --git a/cuda/rsbench.cuh b/cuda/rsbench.cuh
@@ -115,6 +115,8 @@ Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_nu
 Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
 double * generate_pseudo_K0RS( Input input, uint64_t * seed );
 SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
+void release_memory(SimulationData SD);
+void release_device_memory(SimulationData GSD);
 
 // material.c
 int * load_num_nucs(Input input);
@@ -127,7 +129,7 @@ size_t get_mem_estimate( Input input );
 double get_time(void);
 
 // simulation.c
-void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
+void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result, double * elapsed_time);
 void run_event_based_simulation_optimization_1(Input in, SimulationData GSD, unsigned long * vhash_result);
 __global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
 __device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );

diff --git a/cuda/simulation.cu b/cuda/simulation.cu
@@ -12,30 +12,46 @@
 // line argument.
 ////////////////////////////////////////////////////////////////////////////////////
 
-void run_event_based_simulation(Input input, SimulationData GSD, unsigned long * vhash_result )
-{
+void run_event_based_simulation(Input input, SimulationData SD, unsigned long * vhash_result, double * elapsed_time) {
+	double start, stop;
+	start = get_time();
+	////////////////////////////////////////////////////////////////////////////////
+	// Move Data to Device
+	////////////////////////////////////////////////////////////////////////////////
+	SimulationData GSD = move_simulation_data_to_device(input, SD);
+
+ 	stop = get_time();
+	printf("Initialization Complete. (%.2lf seconds)\n", stop - start);
 	////////////////////////////////////////////////////////////////////////////////
 	// Configure & Launch Simulation Kernel
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Running baseline event-based simulation on device...\n");
 
+	start = get_time();
+
 	int nthreads = 256;
 	int nblocks = ceil( (double) input.lookups / (double) nthreads);
 
 	xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( input, GSD );
 	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
+	gpuErrchk(cudaMemcpy(SD.verification, GSD.verification, input.lookups * sizeof(unsigned long), cudaMemcpyDeviceToHost));
 
 	////////////////////////////////////////////////////////////////////////////////
 	// Reduce Verification Results
 	////////////////////////////////////////////////////////////////////////////////
 	printf("Reducing verification results...\n");
 
-	unsigned long verification_scalar = thrust::reduce(thrust::device, GSD.verification, GSD.verification + input.lookups, 0);
-	gpuErrchk( cudaPeekAtLastError() );
-	gpuErrchk( cudaDeviceSynchronize() );
+	unsigned long long verification_scalar = 0;
+	for(int i = 0; i < input.lookups; i++ )
+		verification_scalar += SD.verification[i];
 
 	*vhash_result = verification_scalar;
+
+    stop = get_time();
+
+	*elapsed_time = stop - start;
+
+	release_device_memory(GSD);
 }
 
 // In this kernel, we perform a single lookup with each thread. Threads within a warp

diff --git a/hip/Makefile b/hip/Makefile
@@ -6,22 +6,23 @@ COMPILER    = amd
 OPTIMIZE    = yes
 DEBUG       = no
 PROFILE     = no
+OFFLOAD_ARCH ?= gfx90a
 
 #===============================================================================
 # Program name & source code list
 #===============================================================================
 
-program = rsbench
+program = RSBench
 
 source = \
-main.hip \
-simulation.hip\
-io.hip \
-init.hip \
-material.hip \
-utils.hip
+main.cpp \
+simulation.cpp\
+io.cpp \
+init.cpp \
+material.cpp \
+utils.cpp
 
-obj = $(source:.hip=.o)
+obj = $(source:.cpp=.o)
 
 #===============================================================================
 # Sets Flags
@@ -33,7 +34,7 @@ CFLAGS :=
 # AMD
 ifeq ($(COMPILER),amd)
   CC = hipcc
-  CFLAGS += -std=c++14
+  CFLAGS += -std=c++14 --offload-arch=${OFFLOAD_ARCH}
 endif
 
 # Linker Flags
@@ -63,11 +64,11 @@ endif
 $(program): $(obj) rsbench.h Makefile
 	$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)
 
-%.o: %.hip rsbench.h Makefile
+%.o: %.cpp rsbench.h Makefile
 	$(CC) $(CFLAGS) -c $< -o $@
 
 clean:
-	rm -rf rsbench $(obj)
+	rm -rf $(program) $(obj)
 
 edit:
 	vim -p $(source) rsbench.h

diff --git a/hip/init.hip → hip/init.cpp b/hip/init.hip → hip/init.cpp
@@ -1,8 +1,7 @@
 #include "rsbench.h"
 
 // Moves all required data structures to the GPU's memory space
-SimulationData move_simulation_data_to_device( Input in, SimulationData SD )
-{
+SimulationData move_simulation_data_to_device( Input in, SimulationData SD ) {
 	printf("Allocating and moving simulation data to GPU memory space...\n");
 
 	size_t sz;
@@ -102,9 +101,34 @@ SimulationData initialize_simulation( Input input )
 	SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
 	SD.length_pseudo_K0RS = input.n_nuclides * input.numL;
 
+	SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));
+
 	return SD;
 }
 
+void release_memory(SimulationData SD) {
+	free(SD.num_nucs);
+	free(SD.concs);
+	free(SD.mats);
+	free(SD.n_poles);
+	free(SD.n_windows);
+	free(SD.poles);
+	free(SD.windows);
+	free(SD.pseudo_K0RS);
+}
+
+void release_device_memory(SimulationData GSD) {
+	hipFree(GSD.num_nucs);
+	hipFree(GSD.concs);
+	hipFree(GSD.mats);
+	hipFree(GSD.n_poles);
+	hipFree(GSD.n_windows);
+	hipFree(GSD.poles);
+	hipFree(GSD.windows);
+	hipFree(GSD.pseudo_K0RS);
+	hipFree(GSD.verification);
+}
+
 int * generate_n_poles( Input input, uint64_t * seed )
 {
 	int total_resonances = input.avg_n_poles * input.n_nuclides;

diff --git a/hip/io.hip → hip/io.cpp b/hip/io.hip → hip/io.cpp
diff --git a/hip/main.cpp b/hip/main.cpp
@@ -0,0 +1,82 @@
+#include "rsbench.h"
+
+int main(int argc, char * argv[])
+{
+	// =====================================================================
+	// Initialization & Command Line Read-In
+	// =====================================================================
+
+	int version = 13;
+	double start, stop;
+
+	// Process CLI Fields
+	Input input = read_CLI( argc, argv );
+
+	// =====================================================================
+	// Print-out of Input Summary
+	// =====================================================================
+	logo(version);
+	center_print("INPUT SUMMARY", 79);
+	border_print();
+	print_input_summary(input);
+
+	// =====================================================================
+	// Intialize Simulation Data Structures
+	// =====================================================================
+	border_print();
+	center_print("INITIALIZATION", 79);
+	border_print();
+
+
+	SimulationData SD = initialize_simulation( input );
+
+	// =====================================================================
+	// Cross Section (XS) Parallel Lookup Simulation Begins
+	// =====================================================================
+	border_print();
+	center_print("SIMULATION", 79);
+	border_print();
+
+	unsigned long vhash = 0;
+	double elapsed_time = 0;
+
+	// Run Simulation
+
+	// Run simulation
+	if( input.simulation_method == EVENT_BASED )
+	{
+		if( input.kernel_id == 0 )
+			run_event_based_simulation(input, SD, &vhash, &elapsed_time);
+		else
+		{
+			printf("Error: No kernel ID %d found!\n", input.kernel_id);
+			exit(1);
+		}
+	}
+	else if( input.simulation_method == HISTORY_BASED )
+	{
+		printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
+		exit(1);
+	}
+
+
+	// Final hash step
+	vhash = vhash % 999983;
+
+	printf("Simulation Complete.\n");
+
+	release_memory(SD);
+
+	// =====================================================================
+	// Print / Save Results and Exit
+	// =====================================================================
+	border_print();
+	center_print("RESULTS", 79);
+	border_print();
+
+	int is_invalid = validate_and_print_results(input, elapsed_time, vhash);
+
+	border_print();
+
+	return is_invalid;
+}
diff --git a/hip/material.hip → hip/material.cpp b/hip/material.hip → hip/material.cpp