CrossGPUAtomics/benchmark_script at master · SajadKarim/CrossGPUAtomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash

# Announce the beginning of the benchmark run on stdout.
printf '%s\n' '>>>>>>Script started!<<<<<<'

#--------------------------------------------------------
# Test 3 - Baseline vs Mix-Atomics
# This experiment compares aggregate operations performed with the Baseline and
# Mix-Atomics techniques. Baseline performs the aggregate operation in two steps:
# in the first step it performs all the aggregate operations locally using Local
# Atomics, and in the second step it pushes all the resultant vectors (the locally
# aggregated results) to a single device (GPU) and merges the values there, again
# using Local Atomics. It does this to avoid Remote Atomic calls. The Mix-Atomics
# technique, on the other hand, performs the aggregate operation in a single call
# by utilizing Remote Atomic calls. Unlike Baseline, it does not wait to merge the
# values.
#--------------------------------------------------------
# Test 3 driver.
#
# Sweeps the total number of atomic operations over the powers of two from
# 2^10 to 2^32 and, for each size, over MaxRandomNumber values 40, 400 and
# 4000. Every combination is profiled three times with nvprof, once per
# ./sandbox mode (s2, b2, b1). Each run writes an nvprof CSV log plus a
# "_cpu.csv" file whose path is handed to ./sandbox as its last argument.
maxatomics=$((2**32))      # upper bound of the sweep: 2^32
currentatomics=$((2**10))  # lower bound of the sweep: 2^10
exp3_dir="./output/Exp3"

# Create the log directory up front in case it does not exist yet.
mkdir -p "$exp3_dir"

#######################################
# Profile one ./sandbox run of Test 3 with nvprof.
# Globals:   exp3_dir (read)
# Arguments: $1 - ./sandbox mode (s2, b2 or b1)
#            $2 - human-readable label used in the console banner
#            $3 - tag embedded in the output file names
#            $4 - grid size passed to ./sandbox (block size is fixed at 1024)
#            $5 - MaxRandomNumber passed to ./sandbox
#            $6 - base-2 exponent of the atomics count, used in file names
# Outputs:   banner lines on stdout; nvprof log and CPU CSV under exp3_dir
#######################################
run_exp3_case() {
  local mode=$1 label=$2 tag=$3 grid=$4 maxrand=$5 exponent=$6
  local stem="${exp3_dir}/Exp3_BL_2Pow${exponent}${tag}_RAND${maxrand}"

  echo ""
  echo ">>>Launching script to record benchmarks for ${label}. GridSize: ${grid}, BlockSize: 1024, MaxRandomNumber: ${maxrand}."
  # nvprof runs in the foreground, so no explicit wait is needed afterwards.
  /usr/local/cuda/bin/nvprof --normalized-time-unit s --print-gpu-summary --log-file "${stem}.csv" --csv ./sandbox "$mode" "$grid" "$maxrand" "${stem}_cpu.csv"
  echo "<<<"
  echo ""
  sleep 1
}

# currentatomics is always an exact power of two, so the exponent used in the
# file names is tracked as an integer instead of being derived from a
# floating-point logarithm.
valIn2sPow=10
while [ "$currentatomics" -le "$maxatomics" ]; do
  gridsize=$((currentatomics / 1024))

  for maxrandomnumber in 40 400 4000; do
    run_exp3_case s2 "Local+Remote Atomics" "RemotePlusLocalAtomics" "$gridsize" "$maxrandomnumber" "$valIn2sPow"
    run_exp3_case b2 "Local Atomics" "LocalAtomics" "$gridsize" "$maxrandomnumber" "$valIn2sPow"
    # The banner names the CPU-merge variant explicitly so this run can be
    # told apart from the b2 run in the console log; the label follows the
    # "LocalAtomics_CPUMerge" tag used for its output files.
    run_exp3_case b1 "Local Atomics (CPU merge)" "LocalAtomics_CPUMerge" "$gridsize" "$maxrandomnumber" "$valIn2sPow"
  done

  currentatomics=$((currentatomics * 2))
  valIn2sPow=$((valIn2sPow + 1))
done

# NOTE(review): this exit stops the script here, so nothing after this point
# in the file is executed. Presumably intentional, to run Test 3 only --
# confirm before removing.
exit
#----------------------------
# Test 2 - Thread-Count vs Block-Size
# This experiment evaluates the impact of Thread-Count and Block-Size in terms of
# execution time. The number of Atomic operations is constant throughout the
# experiment, and all of them are Local Atomics. Because the experiment is meant
# to gauge the impact of Thread-Count and Block-Size, the Atomics are performed
# under different configurations, and each configuration uses different values
# for Thread-Count and Block-Size. (Please note: Block Size refers to the
# data-structure/parameter that the GPU uses to hide its internal parallelism
# logic and to keep it abstract and uniform across all of its variants.) In this
# experiment we performed 2Pow28 Atomic operations (note: totalatomics below is
# currently set to 1073741824, i.e. 2Pow30). These Atomic operations are
# performed on a single device; GPU-0 is used for this purpose.
#----------------------------
# Test 2 driver.
#
# Keeps the total number of atomic operations fixed and profiles
# "./sandbox laex" with nvprof for every combination of thread count
# (powers of two from 8 to maxthreads) and block size (powers of two from
# 8 to 1024). One nvprof CSV log is written per combination.
totalatomics=1073741824  # 2^30; alternative value kept from the original: 268435456 (2^28)
maxthreads=1048576       # 2^20; alternative value kept from the original: 1024 (2^10)
currentthreads=8         # 2^3
exp2_dir="./output/Exp2"

# Create the log directory up front in case it does not exist yet.
mkdir -p "$exp2_dir"

# currentthreads is always an exact power of two, so the exponent used in the
# file names is tracked as an integer instead of being derived from a
# floating-point logarithm.
valIn2sPow=3
while [ "$currentthreads" -le "$maxthreads" ]; do
  for ((blocksize = 8; blocksize <= 1024; blocksize *= 2)); do
    echo ""
    echo ">>>Launching script to record benchmarks for Local Atomics. TotalAtomics: ${totalatomics}, MaxThreads: ${currentthreads}, BlockSize: ${blocksize}."
    # nvprof runs in the foreground, so no explicit wait is needed afterwards.
    # The "BlocSize" spelling is kept so existing output file names stay valid.
    /usr/local/cuda/bin/nvprof --normalized-time-unit s --print-summary-per-gpu --log-file "${exp2_dir}/Exp2_LAEX_2Pow${valIn2sPow}_BlocSize${blocksize}Atomics.csv" --csv ./sandbox laex "$totalatomics" "$currentthreads" "$blocksize"
    echo "<<<"
    echo ""
    sleep 1
  done

  currentthreads=$((currentthreads * 2))
  valIn2sPow=$((valIn2sPow + 1))
done

#---------------------------------
# Test 1 - Local vs Remote Atomics
# This experiment compares Local and Remote Atomics. In this experiment we
# performed Atomic operations ranging from 2Pow10 to 2Pow28, using two GPU
# devices. GPU-0 is used to perform Local Atomics, and GPU-1 is used to perform
# Remote Atomics. The experiment is run in two steps: 1) GPU-0 is used to perform
# Local Atomics for the aforementioned range, and 2) GPU-0 and GPU-1 are used
# together to perform Remote Atomics, where the source data is copied to GPU-0
# and the resultant Atomics data structure is maintained on GPU-1.
#---------------------------------
# Test 1 driver.
#
# Sweeps the number of atomic operations over the powers of two from 2^10 to
# 2^28. For each size, ./sandbox is profiled with nvprof twice: once in mode
# "la" (logged as Local Atomics) and once in mode "ra" (logged as Remote
# Atomics). The block size is fixed at 1024 and the grid size is the atomics
# count divided by 1024.
final=268435456  # 2^28
current=1024     # 2^10
exp1_dir="./output/Exp1"

# Create the log directory up front in case it does not exist yet.
mkdir -p "$exp1_dir"

#######################################
# Profile one ./sandbox run of Test 1 with nvprof.
# Globals:   exp1_dir (read)
# Arguments: $1 - ./sandbox mode (la or ra)
#            $2 - human-readable label used in the console banner
#            $3 - tag embedded in the log file name (LA or RA)
#            $4 - grid size passed to ./sandbox
#            $5 - base-2 exponent of the atomics count, used in the file name
# Outputs:   banner lines on stdout; nvprof CSV log under exp1_dir
#######################################
run_exp1_case() {
  local mode=$1 label=$2 tag=$3 grid=$4 exponent=$5

  echo ""
  echo ">>>Launching script to record benchmarks for ${label}. GridSize: ${grid} BlockSize: 1024."
  # The grid size is passed as an already-evaluated integer. The previous
  # "$current/1024" form handed ./sandbox the literal text (e.g. "2048/1024")
  # rather than the quotient shown in the banner.
  # nvprof runs in the foreground, so no explicit wait is needed afterwards.
  /usr/local/cuda/bin/nvprof --normalized-time-unit s --print-summary-per-gpu --log-file "${exp1_dir}/Exp1_${tag}_2Pow${exponent}Atomics.csv" --csv ./sandbox "$mode" "$grid" 1024
  echo "<<<"
  echo ""
  sleep 1
}

# current is always an exact power of two, so the exponent used in the file
# names is tracked as an integer instead of being derived from a
# floating-point logarithm.
valIn2sPow=10
while [ "$current" -le "$final" ]; do
  gridsize=$((current / 1024))

  run_exp1_case la "Local Atomics" "LA" "$gridsize" "$valIn2sPow"
  run_exp1_case ra "Remote Atomics" "RA" "$gridsize" "$valIn2sPow"

  current=$((current * 2))
  valIn2sPow=$((valIn2sPow + 1))
done


# Announce the end of the benchmark run on stdout.
printf '%s\n' '>>>>>>Script Ended!<<<<<<'