Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
*.o
RSBench
rsbench
*.cpp1*
*.cpp4*
*.ptx
*.cubin
*.cudafe1*
*.fatbin*
*.module_id
[Bb][Uu][Ii][Ll][Dd]*/
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ This version of RSBench is written in SYCL, and can be used for CPU, GPU, FPGA,
5. **RSBench/hip**
This version of RSBench is written in HIP for use with GPU architectures. This version is derived from CUDA using an automatic conversion tool with only a few small manual changes.

6. **RSBench/RAJA**
This version of RSBench is written using the RAJA programming model and Umpire to handle memory management. This programming model can be run on either CPU or GPU depending on how RAJA is configured. You will likely need to edit the makefile if RAJA is installed with multiple backends.
## Compilation

To compile RSBench with default settings, navigate to your selected source directory and use the following command:
Expand Down
6 changes: 3 additions & 3 deletions cuda/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ COMPILER = nvidia
OPTIMIZE = yes
DEBUG = no
PROFILE = no
SM_VERSION = 80
SM_VERSION ?= 80

#===============================================================================
# Program name & source code list
#===============================================================================

program = rsbench
program = RSBench

source = \
main.cu \
Expand Down Expand Up @@ -68,7 +68,7 @@ $(program): $(obj) rsbench.cuh Makefile
$(CC) $(CFLAGS) -c $< -o $@

clean:
rm -rf rsbench $(obj)
rm -rf $(program) $(obj)

edit:
vim -p $(source) rsbench.cuh
Expand Down
25 changes: 25 additions & 0 deletions cuda/init.cu
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,34 @@ SimulationData initialize_simulation( Input input )
SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
SD.length_pseudo_K0RS = input.n_nuclides * input.numL;

SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));

return SD;
}

void release_memory(SimulationData SD) {
free(SD.num_nucs);
free(SD.concs);
free(SD.mats);
free(SD.n_poles);
free(SD.n_windows);
free(SD.poles);
free(SD.windows);
free(SD.pseudo_K0RS);
}

void release_device_memory(SimulationData GSD) {
cudaFree(GSD.num_nucs);
cudaFree(GSD.concs);
cudaFree(GSD.mats);
cudaFree(GSD.n_poles);
cudaFree(GSD.n_windows);
cudaFree(GSD.poles);
cudaFree(GSD.windows);
cudaFree(GSD.pseudo_K0RS);
cudaFree(GSD.verification);
}

int * generate_n_poles( Input input, uint64_t * seed )
{
int total_resonances = input.avg_n_poles * input.n_nuclides;
Expand Down
14 changes: 6 additions & 8 deletions cuda/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ int main(int argc, char * argv[])
start = get_time();

SimulationData SD = initialize_simulation( input );
SimulationData GSD = move_simulation_data_to_device( input, SD );

stop = get_time();

Expand All @@ -44,17 +43,15 @@ int main(int argc, char * argv[])
border_print();

unsigned long vhash = 0;

// Run Simulation
start = get_time();
double elapsed_time = 0;

// Run simulation
if( input.simulation_method == EVENT_BASED )
{
if( input.kernel_id == 0 )
run_event_based_simulation(input, GSD, &vhash );
run_event_based_simulation(input, SD, &vhash, &elapsed_time);
else if( input.kernel_id == 1 )
run_event_based_simulation_optimization_1(input, GSD, &vhash );
run_event_based_simulation_optimization_1(input, SD, &vhash );
else
{
printf("Error: No kernel ID %d found!\n", input.kernel_id);
Expand All @@ -67,21 +64,22 @@ int main(int argc, char * argv[])
exit(1);
}

stop = get_time();

// Final hash step
vhash = vhash % 999983;

printf("Simulation Complete.\n");

release_memory(SD);

// =====================================================================
// Print / Save Results and Exit
// =====================================================================
border_print();
center_print("RESULTS", 79);
border_print();

int is_invalid = validate_and_print_results(input, stop-start, vhash);
int is_invalid = validate_and_print_results(input, elapsed_time, vhash);

border_print();

Expand Down
4 changes: 3 additions & 1 deletion cuda/rsbench.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ Pole * generate_poles( Input input, int * n_poles, uint64_t * seed, int * max_nu
Window * generate_window_params( Input input, int * n_windows, int * n_poles, uint64_t * seed, int * max_num_windows );
double * generate_pseudo_K0RS( Input input, uint64_t * seed );
SimulationData move_simulation_data_to_device( Input in, SimulationData SD );
void release_memory(SimulationData SD);
void release_device_memory(SimulationData GSD);

// material.c
int * load_num_nucs(Input input);
Expand All @@ -127,7 +129,7 @@ size_t get_mem_estimate( Input input );
double get_time(void);

// simulation.c
void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result );
void run_event_based_simulation(Input input, SimulationData data, unsigned long * vhash_result, double * elapsed_time);
void run_event_based_simulation_optimization_1(Input in, SimulationData GSD, unsigned long * vhash_result);
__global__ void xs_lookup_kernel_baseline(Input in, SimulationData GSD );
__device__ void calculate_macro_xs( double * macro_xs, int mat, double E, Input input, int * num_nucs, int * mats, int max_num_nucs, double * concs, int * n_windows, double * pseudo_K0Rs, Window * windows, Pole * poles, int max_num_windows, int max_num_poles );
Expand Down
28 changes: 22 additions & 6 deletions cuda/simulation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,46 @@
// line argument.
////////////////////////////////////////////////////////////////////////////////////

void run_event_based_simulation(Input input, SimulationData GSD, unsigned long * vhash_result )
{
void run_event_based_simulation(Input input, SimulationData SD, unsigned long * vhash_result, double * elapsed_time) {
double start, stop;
start = get_time();
////////////////////////////////////////////////////////////////////////////////
// Move Data to Device
////////////////////////////////////////////////////////////////////////////////
SimulationData GSD = move_simulation_data_to_device(input, SD);

stop = get_time();
printf("Initialization Complete. (%.2lf seconds)\n", stop - start);
////////////////////////////////////////////////////////////////////////////////
// Configure & Launch Simulation Kernel
////////////////////////////////////////////////////////////////////////////////
printf("Running baseline event-based simulation on device...\n");

start = get_time();

int nthreads = 256;
int nblocks = ceil( (double) input.lookups / (double) nthreads);

xs_lookup_kernel_baseline<<<nblocks, nthreads>>>( input, GSD );
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
gpuErrchk(cudaMemcpy(SD.verification, GSD.verification, input.lookups * sizeof(unsigned long), cudaMemcpyDeviceToHost));

////////////////////////////////////////////////////////////////////////////////
// Reduce Verification Results
////////////////////////////////////////////////////////////////////////////////
printf("Reducing verification results...\n");

unsigned long verification_scalar = thrust::reduce(thrust::device, GSD.verification, GSD.verification + input.lookups, 0);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
unsigned long long verification_scalar = 0;
for(int i = 0; i < input.lookups; i++ )
verification_scalar += SD.verification[i];

*vhash_result = verification_scalar;

stop = get_time();

*elapsed_time = stop - start;

release_device_memory(GSD);
}

// In this kernel, we perform a single lookup with each thread. Threads within a warp
Expand Down
23 changes: 12 additions & 11 deletions hip/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@ COMPILER = amd
OPTIMIZE = yes
DEBUG = no
PROFILE = no
OFFLOAD_ARCH ?= gfx90a

#===============================================================================
# Program name & source code list
#===============================================================================

program = rsbench
program = RSBench

source = \
main.hip \
simulation.hip\
io.hip \
init.hip \
material.hip \
utils.hip
main.cpp \
simulation.cpp\
io.cpp \
init.cpp \
material.cpp \
utils.cpp

obj = $(source:.hip=.o)
obj = $(source:.cpp=.o)

#===============================================================================
# Sets Flags
Expand All @@ -33,7 +34,7 @@ CFLAGS :=
# AMD
ifeq ($(COMPILER),amd)
CC = hipcc
CFLAGS += -std=c++14
CFLAGS += -std=c++14 --offload-arch=${OFFLOAD_ARCH}
endif

# Linker Flags
Expand Down Expand Up @@ -63,11 +64,11 @@ endif
$(program): $(obj) rsbench.h Makefile
$(CC) $(CFLAGS) $(obj) -o $@ $(LDFLAGS)

%.o: %.hip rsbench.h Makefile
%.o: %.cpp rsbench.h Makefile
$(CC) $(CFLAGS) -c $< -o $@

clean:
rm -rf rsbench $(obj)
rm -rf $(program) $(obj)

edit:
vim -p $(source) rsbench.h
Expand Down
28 changes: 26 additions & 2 deletions hip/init.hip → hip/init.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#include "rsbench.h"

// Moves all required data structures to the GPU's memory space
SimulationData move_simulation_data_to_device( Input in, SimulationData SD )
{
SimulationData move_simulation_data_to_device( Input in, SimulationData SD ) {
printf("Allocating and moving simulation data to GPU memory space...\n");

size_t sz;
Expand Down Expand Up @@ -102,9 +101,34 @@ SimulationData initialize_simulation( Input input )
SD.pseudo_K0RS = generate_pseudo_K0RS( input, &seed );
SD.length_pseudo_K0RS = input.n_nuclides * input.numL;

SD.verification = (unsigned long *) malloc(input.lookups * sizeof(unsigned long));

return SD;
}

void release_memory(SimulationData SD) {
free(SD.num_nucs);
free(SD.concs);
free(SD.mats);
free(SD.n_poles);
free(SD.n_windows);
free(SD.poles);
free(SD.windows);
free(SD.pseudo_K0RS);
}

void release_device_memory(SimulationData GSD) {
hipFree(GSD.num_nucs);
hipFree(GSD.concs);
hipFree(GSD.mats);
hipFree(GSD.n_poles);
hipFree(GSD.n_windows);
hipFree(GSD.poles);
hipFree(GSD.windows);
hipFree(GSD.pseudo_K0RS);
hipFree(GSD.verification);
}

int * generate_n_poles( Input input, uint64_t * seed )
{
int total_resonances = input.avg_n_poles * input.n_nuclides;
Expand Down
File renamed without changes.
82 changes: 82 additions & 0 deletions hip/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#include "rsbench.h"

int main(int argc, char * argv[])
{
// =====================================================================
// Initialization & Command Line Read-In
// =====================================================================

int version = 13;
double start, stop;

// Process CLI Fields
Input input = read_CLI( argc, argv );

// =====================================================================
// Print-out of Input Summary
// =====================================================================
logo(version);
center_print("INPUT SUMMARY", 79);
border_print();
print_input_summary(input);

// =====================================================================
// Intialize Simulation Data Structures
// =====================================================================
border_print();
center_print("INITIALIZATION", 79);
border_print();


SimulationData SD = initialize_simulation( input );

// =====================================================================
// Cross Section (XS) Parallel Lookup Simulation Begins
// =====================================================================
border_print();
center_print("SIMULATION", 79);
border_print();

unsigned long vhash = 0;
double elapsed_time = 0;

// Run Simulation

// Run simulation
if( input.simulation_method == EVENT_BASED )
{
if( input.kernel_id == 0 )
run_event_based_simulation(input, SD, &vhash, &elapsed_time);
else
{
printf("Error: No kernel ID %d found!\n", input.kernel_id);
exit(1);
}
}
else if( input.simulation_method == HISTORY_BASED )
{
printf("History-based simulation not implemented in OpenMP offload code. Instead,\nuse the event-based method with \"-m event\" argument.\n");
exit(1);
}


// Final hash step
vhash = vhash % 999983;

printf("Simulation Complete.\n");

release_memory(SD);

// =====================================================================
// Print / Save Results and Exit
// =====================================================================
border_print();
center_print("RESULTS", 79);
border_print();

int is_invalid = validate_and_print_results(input, elapsed_time, vhash);

border_print();

return is_invalid;
}
File renamed without changes.
Loading