diff --git a/.gitignore b/.gitignore index fec5bad..459c0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ +.bloop/ .bsp/ +.idea/ .metals/ +.scala-build/ .vscode/ project/ target/ @@ -17,3 +20,4 @@ emulator/obj_dir **/*.orig **/*.dump **/fp-emu +**/asm-gen/ diff --git a/flexpret b/flexpret index d51bbbf..45eba52 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit d51bbbf6bde11217370080e76f5eb6efd88da94d +Subproject commit 45eba525e5b9f58676dd2dca09ca96e82ae7f654 diff --git a/programs/HelloWorld/hello.c b/programs/HelloWorld/hello.c index 91440a6..a15579c 100644 --- a/programs/HelloWorld/hello.c +++ b/programs/HelloWorld/hello.c @@ -34,4 +34,4 @@ int main2() { int main3() { _fp_print(43); -} \ No newline at end of file +} diff --git a/programs/benchmarks/noc/latency_aligned/Makefile b/programs/benchmarks/noc/latency_aligned/Makefile new file mode 100644 index 0000000..2463e1d --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm noc_latency_aligned.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/benchmarks/noc/latency_aligned/align.h b/programs/benchmarks/noc/latency_aligned/align.h new file mode 100644 index 0000000..c5f19ea --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/align.h @@ -0,0 +1,34 @@ +#define WAIT_FOR_NEXT_ZERO_MOD_1024(id) \ + "li t0, 1014\n\t" \ + "li a0, 1\n\t" \ + "li a1, 2\n\t" \ + "li a2, 3\n\t" \ + "li a3, 4\n\t" \ + "li a4, 5\n\t" \ + "li a5, 6\n\t" \ + "li t6, 7\n\t" \ + "rdcycle t1\n\t" \ + "andi t1, t1, 7\n\t" \ + "beq t1, t6, LOOP" #id "\n\t" \ + "beq t1, a5, LOOP" #id "\n\t" \ + "beq t1, a4, LOOP" #id "\n\t" \ + "beq t1, a3, LOOP" #id "\n\t" \ + "beq t1, a2, LOOP" #id "\n\t" \ + "beq t1, a1, LOOP" #id "\n\t" \ + "beq t1, a0, LOOP" #id "\n\t" \ + "beq t1, x0, LOOP" #id "\n\t" \ + /* This entire loop is 8 cycles long, so the value of t1 upon exiting is t0 plus a */ \ + /* number in 
the range [0, 7] */ \ + "LOOP" #id ":\n\t" \ + "nop\n\t" /* Delay so that loop length is a power of 2 */ \ + "nop\n\t" \ + "nop\n\t" \ + "rdcycle t1\n\t" \ + "andi t1, t1, 1023\n\t" \ + "blt t1, t0, LOOP" #id "\n\t" /* Cost of 3 cycles when taken, 1 otherwise; see page 37 https://www2.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-181.pdf */ \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c new file mode 100644 index 0000000..242d280 --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -0,0 +1,80 @@ +/** + * This program explores the absolute minimum amount of time that it can take to send one word and + * write it into a register on another core, when under the most favorable circumstances, + * and when controlling relative timing and optimizing the assembly. + */ +#include +#include +#include +#include + +#include "align.h" + +#define N 100 + +static int main_of(uint32_t core); + +static int send_main(uint32_t receiver); +static int receive_main(uint32_t sender); + +int main() { + unsigned long coreid = read_csr(CSR_COREID); + srand(coreid); + if (coreid == 0) for (int i = 0; i < 10; i++) send_main(1); + if (coreid == 1) for (int i = 0; i < 10; i++) receive_main(0); +} + +static int send_main(uint32_t receiver) { + asm volatile ( + "li t4, 0x40000000\n\t" + WAIT_FOR_NEXT_ZERO_MOD_1024(send) // clobber "a" registers, as well as t0, t1, t6 + // like noc_send, but without blocking + "li t5, 0x1\n\t" // noc destination + "sw t5, 8(t4)\n\t" + "li t5, 0x08\n\t" + "sw t5, 4(t4)\n\t" + "nop\n\t" + "nop\n\t" + "li t5, 42\n\t" // Set noc data to 42 + "sw t5, 8(t4)\n\t" // NOTE: Data must be written first. This is by design. 
+ "li t5, 0x04\n\t" + "sw t5, 4(t4)\n\t" + ); +} + +static int receive_main(uint32_t sender) { + asm volatile ( + WAIT_FOR_NEXT_ZERO_MOD_1024(receive) + // "nop\n\t" // The 9-cycle read loop is aligned optimally when the number of nops here is zero mod 9 + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + "li t4, 0x40000000\n\t" // wishbone base address + // FIXME: Why does this loop have to go through one iteration extra the first time around, compared to the number of iterations that it makes thereafter? + "CHECK_IF_RECEIVED_YET:\n\t" + // Sadly, this whole sequence -- store, wait, read, mask, beq -- must be in the loop. In particular, if the store is factored out, the read doesn't work, even though we are storing the same thing each time. + "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address + "nop\n\t" + "nop\n\t" + "lw t5, 12(t4)\n\t" // Read NoC CSR + "andi t5, t5, 2\n\t" + "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t" + "li t5, 4\n\t" // Write the address of NoC data to Wishbone read address + "sw t5, 0(t4)\n\t" + "nop\n\t" + "nop\n\t" + "lw t5, 12(t4)\n\t" // Read NoC data + "rdcycle t3\n\t" + "andi t3, t3, 1023\n\t" + "li t0, 0xbaaabaaa\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t3\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t5\n\t" + ); +} diff --git a/programs/benchmarks/noc/latency_random_sparse_send/Makefile b/programs/benchmarks/noc/latency_random_sparse_send/Makefile new file mode 100644 index 0000000..a7e3df6 --- /dev/null +++ b/programs/benchmarks/noc/latency_random_sparse_send/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm noc_latency_random_sparse_send.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c b/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c new file mode 100644 index 
0000000..119f622 --- /dev/null +++ b/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +#define N 100 +// 1 << LOG2_OF_A_LONG_TIME should be much greater than the number of cycles required to run +// one iteration of the benchmark. I think it takes less than 512 cycles to run one iteration +// of the benchmark. +#define LOG2_OF_A_LONG_TIME 11 + +static int main_of(uint32_t core); + +int main() { + unsigned long coreid = read_csr(CSR_COREID); + srand(coreid); + main_of(coreid); +} + +static int send_main(uint32_t receiver) { + for (uint32_t i = 0; i < N; i++) { + uint32_t min_delay = 1 << LOG2_OF_A_LONG_TIME; + uint32_t additional_delay = rand() & ((1 << LOG2_OF_A_LONG_TIME) - 1); + unsigned long end_time = rdcycle() + min_delay + additional_delay; + while (rdcycle() < end_time) {} + unsigned long t0 = rdcycle(); // benchmark start + noc_send(receiver, t0); + } +} + +static int receive_main(uint32_t sender) { + for (uint32_t i = 0; i < N; i++) { + uint32_t t0 = noc_receive(); + uint32_t t1 = rdcycle(); // benchmark end + _fp_print((sender + 1) * 1000000 + t1 - t0); + } +} + +static int send_receive(uint32_t partner, int first) { + first ? send_main(partner) : receive_main(partner); + !first ? 
send_main(partner) : receive_main(partner); +} + +static int main_of(uint32_t core) { + int big = core & 2; + int odd = core & 1; + send_receive((core + 1) & 3, !odd); + send_receive((core + 2) & 3, !big); + send_receive((core + 3) & 3, !odd); +} diff --git a/programs/noc/BroadcastCount/Makefile b/programs/noc/BroadcastCount/Makefile new file mode 100644 index 0000000..18ee0b3 --- /dev/null +++ b/programs/noc/BroadcastCount/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm broadcast_count_noc.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/noc/BroadcastCount/broadcast_count_noc.c b/programs/noc/BroadcastCount/broadcast_count_noc.c new file mode 100644 index 0000000..1335de9 --- /dev/null +++ b/programs/noc/BroadcastCount/broadcast_count_noc.c @@ -0,0 +1,55 @@ +#include +#include +#include + +/* | NE | _ | N | E | _ | */ + +/*********************** + * core 0 / N \ core 1 * + * W + E * + * core 2 \ S / core 3 * + ***********************/ + +int main0(); +int main1(); +int main2(); +int main3(); + +int main() { + int core_id = read_csr(CSR_COREID); + switch(core_id) { + case 0: main0(); break; + case 1: main1(); break; + case 2: main2(); break; + case 3: main3(); break; + default: _fp_print(666); //ERROR + } +} + +int main0() { + broadcast_count(0, 125); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); +} + +int main1() { + read_n_words_and_print(0, EAST_INT); + broadcast_count(1, 17); + read_n_words_and_print(2, NORTHEAST_INT); + read_n_words_and_print(3, NORTH_INT); +} + +int main2() { + read_n_words_and_print(0, NORTH_INT); + read_n_words_and_print(1, NORTHEAST_INT); + broadcast_count(2, 42); + read_n_words_and_print(3, EAST_INT); +} + +int main3() { + read_n_words_and_print(0, NORTHEAST_INT); + read_n_words_and_print(1, NORTH_INT); + read_n_words_and_print(2, EAST_INT); + broadcast_count(3, 3); +} diff --git 
a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile new file mode 100644 index 0000000..28e29a8 --- /dev/null +++ b/programs/noc/BroadcastMemory/Makefile @@ -0,0 +1,24 @@ +build: gen compile + +gen: + mkdir -p asm-gen + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-flexpret-noc-c-api-h]]]" > asm-gen/flexpret-noc-c-api.h + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-read-n-words-and-print]]]" > asm-gen/read-n-words-and-print.s + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-broadcast-count]]]" > asm-gen/broadcast-count.s + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello.rvg > asm-gen/hello.s + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello_h.rvg > asm-gen/hello.h + +compile: + riscv_compile.sh ispm asm-gen/hello.s asm-gen/read-n-words-and-print.s asm-gen/broadcast-count.s broadcast_memory_noc.c + +clean: + rm -r asm-gen + riscv_clean.sh + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/noc/BroadcastMemory/asm/ctrl.rvg b/programs/noc/BroadcastMemory/asm/ctrl.rvg new file mode 100644 index 0000000..3eb520a --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/ctrl.rvg @@ -0,0 +1,101 @@ +[mu [(program)] +[def (condition!) [assertify [lam [(x)] [[hasattr {condition}] x]] {a condition}]] +[def (make-condition) [lam [(true-branch frag!) (false-branch frag!)] [lam [(rs1 reg!) 
(rs2 reg!)] + [[addattr {condition}] + [lam [(label frag!)] + [pair { + [id true-branch] rs1 rs2 label + } { + [id false-branch] rs1 rs2 label + }]]]]]] +[def (<) [make-condition {blt} {bgeu}]] +[def (>=) [make-condition {bgeu} {blt}]] +[def (==) [make-condition {beq} {bne}]] +[def (!=) [make-condition {bne} {beq}]] +[def (branch3) [lam [(condition condition!) (instrs [cycles! {2}])] + [def (assembly) { + [id { [id [[condition {DONE}] false]] } ] + [id instrs] + DONE: + }] + [[unsafe-assert-exact-cycles {3}] assembly]]] +[def (branch~) [lam [(condition condition!) (asm block!)] { + [[condition {DONE}] false] + [id asm] + DONE: + }]] +[def (unroll) [lam [(n num!) (body [-> [lamof num!] block!])] + [[[fold-range {1} {0} n] {}] + [lam [(acc) (_)] { + [id acc] + [body n] + }]]]] +[def (repeat~) [lam [(n num!) (body block!)] + [unroll n [lam [(_)] body]]]] +[def (repeat) [lam [(n num!) (body block!)] + [[unsafe-assert-exact-cycles [* n [cycles? body]]] + [repeat~ n body]]]] +[def (for) [lam [ + (start reg!) (stop reg!) + (step num! [>! {0}]) + (body block!)] + [def (addition-and-branching-cycles) {4}] + [def (total-cycles [lamof num!]) [lam [(num-iterations)] [- + [* num-iterations + [+ [cycles? body] addition-and-branching-cycles]] + {2}]]] + [def (assembly) { + LOOP: + [id body] + addi start start step + blt start stop LOOP + }] + [pair assembly total-cycles]]] +[def (for_startup+iterations*-2) [lam [ + (counter reg!) (clobber reg!) + (start num!) (stop num! [>! start]) (step num! [>! {0}]) + (body block!)] + [def (stopreg) clobber] + [def (startup) { + li counter start + li stopreg stop + }] + [def (num-iterations) [/ [- start stop] [- {0} step]]] + [def (assembly,cycles pair!) [for counter stopreg step body]] + { + [id startup] + [[unsafe-assert-exact-cycles + [[assembly,cycles false] num-iterations]] + [assembly,cycles true]] + }]] +[def (prologue-epilogue!) [-> + [lamof [∘ num! [∘ [>=! {0}] [ max_sequence_length_reg ? 
max_sequence_length_reg +: total_num_words_to_send_reg - max_sequence_length_reg < 0 ? 0 +: total_num_words_to_send_reg - max_sequence_length_reg +@param total_num_words_to_send_reg Input +@param n_words_reg Output: Register to be set to closest integer in the range from 0 to max_sequence_length (inclusive) to +the value in total_num_words_to_send_reg minus max_sequence_length +@param max_sequence_length_reg Input +}] +[def (initialize-n-words-asm) [lam [ + (total-num-words-to-send-reg input!) + (n-words-reg output!) + (max-sequence-length-reg input!) +] { + sub n-words-reg total-num-words-to-send-reg max-sequence-length-reg + bge n-words-reg zero DONE_WITH_SMALL_CASE + add n-words-reg zero zero + nop + DONE_WITH_SMALL_CASE: + bge max-sequence-length-reg n-words-reg DONE_WITH_BIG_CASE + add n-words-reg max-sequence-length-reg zero + nop + DONE_WITH_BIG_CASE: + }]] +[def (cap-at-max-sequence-length) [lam [ + (total-num-words-to-send-reg input!) + (n-words-reg output!) + (max-sequence-length-reg output!) + (max-sequence-length-literal num!) +] { + addi max-sequence-length-reg zero max-sequence-length-literal + li n-words-reg max-sequence-length-literal + bge total-num-words-to-send-reg max-sequence-length-reg DONE + add n-words-reg total-num-words-to-send-reg zero + DONE: + }]] +[def (broadcast-count-prepare-next-send-asm) [lam [ + (countdown-reg input! output!) + (n-words-reg input! output!) + (max-sequence-length-reg clobber!) +] { + li max-sequence-length-reg 32 + addi n-words-reg countdown-reg -32 + bge max-sequence-length-reg n-words-reg DONE + li n-words-reg 32 + nop + DONE: + andi n-words-reg n-words-reg 1023 + nop + }]] + +[;; { +@brief Broadcast a countdown from countdown_reg to 1 to all other cores. +@param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO +except the one corresponding to the current core (which does not broadcast to itself). +@param countdown_reg Input: The first number in the countdown. 
+@param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. +@param noc_base_address Clobber. +}] +[def (broadcast-count) [lam [ + (sending-to-0 bool!) + (sending-to-1 bool!) + (sending-to-2 bool!) + (sending-to-3 bool!) + (countdown-reg input! clobber!) + (result-reg output!) + (noc-base-address clobber!) + (n-words-reg clobber!) + (max-sequence-length-reg clobber!) + (c1 clobber!) + (c2 clobber!) + (c3 clobber!) + (c4 clobber!) +] { + [cap-at-max-sequence-length countdown-reg n-words-reg max-sequence-length-reg {32}] + [send-n-words + n-words-reg + result-reg + true true true + sending-to-0 sending-to-1 sending-to-2 sending-to-3 + [initialize-n-words-asm countdown-reg n-words-reg max-sequence-length-reg] + [broadcast-count-send-asm n-words-reg countdown-reg noc-base-address c3] + [broadcast-count-prepare-next-send-asm countdown-reg n-words-reg c2] + noc-base-address c1 c2 c3 c4 + ] + }]] +[def (print-broadcast-count) [lam [] + [def (current-core) {a0}] + [def (start-count-at) {a1}] + [def (send-from-direction-if-applicable) [lam [(notfrom0 bool!) (notfrom1 bool!) (notfrom2 bool!) 
(notfrom3 bool!)] + [branch~ + [== {zero} current-core] + [broadcast-count + notfrom0 + notfrom1 + notfrom2 + notfrom3 + start-count-at + {t0} {t1} {t2} {t3} {t4} {t5} {t6} {a5}]]]] + [[file {broadcast_count} { + broadcast_count: + [send-from-direction-if-applicable false true true true] + addi current-core current-core -1 + [send-from-direction-if-applicable true false true true] + addi current-core current-core -1 + [send-from-direction-if-applicable true true false true] + addi current-core current-core -1 + [send-from-direction-if-applicable true true true false] + [branch~ + [!= {zero} current-core] + [fp-print-int {666} {t0} {t1}]] + jalr zero ra 0 + }]]]] +[program]]]]]]]]]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg b/programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg new file mode 100644 index 0000000..3c4bf7a --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg @@ -0,0 +1,261 @@ +[mu [(program)] +[def (northeast-int) {0 }] +[def (north-int ) {8 }] +[def (east-int ) {12}] +[def (quintet!) [-> [lamof [cycles! {2}] [cycles! {1}] [cycles! {1}] input! input!] [cycles! {5}]]] +[def (northeast-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) +] { + sw to-send 0(noc-base-address) + [id asm2cycles] + [id asm1cycle0] + [id asm1cycle1] + }]] +[def (north-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) +] { + [id asm2cycles] + sw to-send 0(noc-base-address) + [id asm1cycle0] + [id asm1cycle1] + }]] +[def (east-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) 
+] { + [id asm2cycles] + [id asm1cycle0] + sw to-send 0(noc-base-address) + [id asm1cycle1] + }]] +[def (load-noc-base-address [-> [lamof output!] [cycles! {1}]]) [lam [(r)] { + li r 0x80000000 + }]] +[;; { +The instruction immediately following `sync5` is able to store a word +into the zeroth TDM slot (northeast) in a single-threaded setting. +It takes 9-13 cycles to synchronize. The variable 0-4 cycles are fundamental and the remaining +9 cycles are overhead.}] +[def (sync5) [lam [(noc-base-address clobber!) (c0 clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!)] { + li c0 1 + li c1 2 + li c2 3 + [load-noc-base-address noc-base-address] + lw c3 32(noc-base-address) + beq c3 c0 DONE_SYNCHRONIZING + beq c3 zero DONE_SYNCHRONIZING + beq c3 c2 DONE_SYNCHRONIZING + beq c3 c1 DONE_SYNCHRONIZING + DONE_SYNCHRONIZING: + }]] +[;; { +Broadcast the value stored in reg to all other cores. The instruction immediately +following BROADCAST_SYNCED is able to store into the zeroth TDM slot. +This assumes that the current thread is synchronized to the TDM schedule.}] +[def (broadcast-synced-with-instructions) [lam [ + (noc-base-address input!) + (reg input!) + (i0 [cycles! {1}]) + (i1 [cycles! {1}])] { + sw reg 0(noc-base-address) + [id i0] + sw reg 0(noc-base-address) + sw reg 0(noc-base-address) + [id i1]}]] +[def (broadcast-synced) [lam [(noc-base-address input!) (reg input!)] + [broadcast-synced-with-instructions {nop} {nop}]]] +[def (load-noc-core-base-address) [lam [(noc-core-base-address output!) (sending-core-reg input!) (c0 clobber!)] { + [load-noc-base-address noc-core-base-address] + slli c0 sending-core-reg 2 + add noc-core-base-address c0 noc-core-base-address + }]] +[def (block-on-flit-from-core) [lam [(noc-core-base-address input!) (c0 clobber!)] { + BLOCKING_READ_POLL: + lw c0 16(noc-core-base-address) + beq zero c0 BLOCKING_READ_POLL + }]] +[def (blocking-read) [lam [(noc-core-base-address clobber!) (read-to-reg output!)
(sending-core-reg input!)] { + [load-noc-core-base-address noc-core-base-address sending-core-reg read-to-reg] + [block-on-flit-from-core noc-core-base-address read-to-reg] + lw read-to-reg 0(noc-core-base-address) + }]] +[def (or-validity-of-noc-data) [lam [ + (noc-base-address-reg input!) + (accumulator-reg input! output!) + (offset num!) + (c0 clobber!)] { + lw c0 offset(noc-base-address-reg) + or accumulator-reg accumulator-reg c0 + }]] +[def (read-and-fail-if-tag-bit-is-1) [lam [ + (noc-base-address-reg input!) + (read-to-reg output!) + (sending-core-reg input!) + (result-reg output!) + (tag-bit-mask input!) + (fail-label frag!)] { + [blocking-read noc-base-address-reg read-to-reg sending-core-reg] + and result-reg read-to-reg tag-bit-mask + bnez result-reg fail-label + }]] +[;; { +@brief Try to send the number of words specified by n_words_reg. +@param n_words_reg The number of words to send. +@param result_reg 0 if successful, 1 if one of the potential message receivers has already tried +to send a message before this even tries to send a message, 0x80000000 if one of the potential +message receivers tries to send a message after this tries to send a message (so awkward! who +gets to talk first?) +@param initialize_asm Assembly code for preparing to send words, e.g. by loading the words into +the register file. Must take 2 cycles (mod 5). +@param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! +May assume that the preceding code has already taken care of synchronization. Must preserve +synchronization. Must set n_words_reg to hold the number of words in the next contiguous sequence, +with the top bit set high if the length of prepare_for_next_send_words_asm is not exactly 11 +cycles. +@param prepare_for_next_send_words_asm Prepare for the next run of send_words_asm. +@param noc_base_address Register that will hold the NoC base address. 
No assumptions are made +about the original value held in this register (the NoC base address will be written into it +regardless). +The remaining parameters (e.g., SENDING_NORTHEAST_MACRO) must be either TRUE_MACRO or +FALSE_MACRO. +}] +[def (send-n-words) [lam [ + (n-words-reg input! clobber! [;; {only the tag bit is clobbered}]) + (result-reg output!) + (sending-northeast bool!) + (sending-north bool!) + (sending-east bool!) + (sending-to-0 bool!) + (sending-to-1 bool!) + (sending-to-2 bool!) + (sending-to-3 bool!) + (initialize-asm block!) + (send-words-asm block!) + (prepare-for-next-send-words-asm block!) + (noc-base-address clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!) (c4 clobber!) +] { + [load-noc-base-address noc-base-address] + add result-reg zero zero + [sending-to-0 [or-validity-of-noc-data noc-base-address result-reg {16} c1] {}] + [sending-to-1 [or-validity-of-noc-data noc-base-address result-reg {20} c1] {}] + [sending-to-2 [or-validity-of-noc-data noc-base-address result-reg {24} c1] {}] + [sending-to-3 [or-validity-of-noc-data noc-base-address result-reg {28} c1] {}] + bnez result-reg END ;; fail with error code 1 + [sync5 noc-base-address c1 c2 c3 c4] + li c1 0x80000000 ;; Set the top bit as a tag bit. If u wanna send so many words that this creates ambiguity, then u have a bigger problem on your hands + or n-words-reg n-words-reg c1 + [/* [def (send-num-words-or-nop) [lam [(? bool!)] [? {sw n-words-reg 0(noc-base-address)} {nop}]]]] + [send-num-words-or-nop sending-north ] + [send-num-words-or-nop sending-east ] + li c3 1 + [send-num-words-or-nop sending-northeast] + [/* [def (blocking-read-or-not) [lam [(? bool!) (receiving-core input!)] [? 
+ [read-and-fail-if-tag-bit-is-1 noc-base-address c2 receiving-core result-reg c1 {END}] + {}]]]] + [blocking-read-or-not sending-to-0 {zero}] + [blocking-read-or-not sending-to-1 c3] + li c3 2 + [blocking-read-or-not sending-to-2 c3] + li c3 3 + [blocking-read-or-not sending-to-3 c3] + [id initialize-asm] + SENDING_SYNC_WORD: + [/* [def (sync-word) [lam [(? bool!)] {sw zero 0(noc-base-address)}]]] + [sync-word sending-northeast] + nop + [sync-word sending-north ] + [sync-word sending-east ] + nop + SEND_WORDS_ASM: + [id send-words-asm] + [send-num-words-or-nop sending-northeast] + nop + [send-num-words-or-nop sending-north ] + [send-num-words-or-nop sending-east ] + beqz n-words-reg END + [id prepare-for-next-send-words-asm] + bge n-words-reg zero SEND_WORDS_ASM + jal zero SENDING_SYNC_WORD + END: + }]] +[def (mul4-2cycles [-> id [cycles! {2}]]) [lam [(in-reg input!) (out-reg output!)] { + [mul4 in-reg out-reg] + nop + }]] +[;; { +@brief Read the number of words specified by the sender. +@param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the +receiver. +@param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES Assembly that takes an in_reg and an out_reg +and sets the out_reg to the number of instructions in each 5-cycle subsequence of +receive_words_asm. +@param offset_numeric_literal 36 plus (4 times the number of instructions in +preparatory asm). This will be 48 unless one provides instructions that +cause stalls, such as loads, branches, or jumps. +@param hex_for_12_bit_distance_from_auipc_to_end 3-digit hex (no 0x prefix) for +offset_numeric_literal plus the offset of receive_words_asm. +@param sending_core_reg Input: A register specifying the sending core. Clobbered. +@param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and +exactly every 5 cycles thereafter! +@param noc_base_address Output: A register that will be set to the base address of the NoC +corresponding to the given core. 
+The remaining registers are all clobbers -- they have descriptive names, but they are not really +API except insofar as they are clobbered. t6 is also clobbered, and receive_words_asm cannot use +t6. The only clobber that receive_words_asm should use is clobber4. +}] +[def (read-n-words) [lam [ + (direction-quintet quintet!) + (mul-by-receive-words-period [-> id [cycles! {2}]]) + (offset-numeric-literal num!) + (hex-for-12-bit-distance-from-auipc-to-end frag!) + (sending-core-reg input!) + (receive-words-asm block!) + (noc-base-address output!) + (packet-size-reg clobber!) (jalr-word-reg clobber!) (replaced-instruction-reg clobber!) (c4 clobber!) +] { + [blocking-read noc-base-address c4 sending-core-reg] + [mul-by-receive-words-period c4 packet-size-reg] + slli packet-size-reg packet-size-reg 2 + [sync5 sending-core-reg {t6} jalr-word-reg replaced-instruction-reg c4] + auipc t6 0 + add packet-size-reg t6 packet-size-reg + li jalr-word-reg [id {0x}][id hex-for-12-bit-distance-from-auipc-to-end]F8067 + nop + [direction-quintet + {lw replaced-instruction-reg 52(packet-size-reg)} + {sw jalr-word-reg 52(packet-size-reg)} + {nop} + {zero} + noc-base-address] + WAIT_FOR_SYNC_WORD: + [block-on-flit-from-core noc-base-address c4] + nop + nop + RECEIVE_WORDS: + [id receive-words-asm] + sw replaced-instruction-reg 52(packet-size-reg) + nop + lw c4 0(noc-base-address) + beqz c4 END + [mul-by-receive-words-period c4 replaced-instruction-reg] + slli packet-size-reg replaced-instruction-reg 2 + add packet-size-reg t6 packet-size-reg + lw replaced-instruction-reg 52(packet-size-reg) + sw jalr-word-reg 52(packet-size-reg) + nop + blt c4 zero WAIT_FOR_SYNC_WORD + jal zero RECEIVE_WORDS + END: + }]] +[program]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret.rvg b/programs/noc/BroadcastMemory/asm/flexpret.rvg new file mode 100644 index 0000000..463f414 --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/flexpret.rvg @@ -0,0 +1,13 @@ +[mu [(program)] +[def (fp-print) [lam 
[(r input!) (c0 clobber!)] { + li c0 0xbaaabaaa + csrw 0x51e c0 + csrw 0x51e r +}]] +[def (fp-print-int) [lam [(k num!) (c0 clobber!) (c1 clobber!)] { + li c0 0xbaaabaaa + li c1 [id k] + csrw 0x51e c0 + csrw 0x51e c1 +}]] +[program]] diff --git a/programs/noc/BroadcastMemory/asm/hello.rvg b/programs/noc/BroadcastMemory/asm/hello.rvg new file mode 100644 index 0000000..482b589 --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/hello.rvg @@ -0,0 +1,8 @@ +[stdlib [mu [] [ctrl [mu [] +[[file {hello} { +hello: + li t0 0xbaaabaaa + csrw 0x51e t0 + csrw 0x51e a0 + jalr zero ra 0 +}]]]]]] diff --git a/programs/noc/BroadcastMemory/asm/hello_h.rvg b/programs/noc/BroadcastMemory/asm/hello_h.rvg new file mode 100644 index 0000000..0244fed --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/hello_h.rvg @@ -0,0 +1,3 @@ +[[print { + void hello(int n); +}]] diff --git a/programs/noc/BroadcastMemory/asm/rvgbuild b/programs/noc/BroadcastMemory/asm/rvgbuild new file mode 100644 index 0000000..b6f7a2d --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/rvgbuild @@ -0,0 +1 @@ +stdlib.rvg -> ctrl.rvg -> hello.rvg diff --git a/programs/noc/BroadcastMemory/asm/stdlib.rvg b/programs/noc/BroadcastMemory/asm/stdlib.rvg new file mode 100644 index 0000000..1937e4a --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/stdlib.rvg @@ -0,0 +1,47 @@ +[lam [(program)] + +[def (id) [lam [(x)] x]] + +[def (if) [lam [(condition) (then) (else)] + [def (selected) [condition then else]] + [selected]]] + +[def (lam!) [lam [(k)] [lam [(l)] + [if [[lam? k] l] + [lam [] l] + [fail l {Expected lam of [id k] parameters}]]]]] + +[def (->) [lam [(arg-check) (result-check)] + [lam [(l arg-check)] + [applierify-varargs [lam [(applier)] + [result-check [applier l]]]]]]] +[def (assertify) [lam [(? [lam! {1}]) (description)] [lam [(checkee)] + [if [? checkee] + [lam [] checkee] + [fail checkee {Expected [id description]}]]]]] +[def (hasattr!) 
[lam [(attr)] [assertify [lam [(x)] [[hasattr attr] x]] attr]]] +[def (num!) [assertify [lam [(x)] [num? x]] {a number}]] +[def (frag!) [assertify [lam [(x)] [frag? x]] {a fragment}]] +[def (block!) [assertify [lam [(x)] [block? x]] {an assembly block}]] +[def (reg!) [assertify [lam [(x)] [reg? x]] {a register}]] +[def (lam! [lamof num!]) lam!] + +[def (true [addattr {bool}]) [lam [(a) (b)] a]] +[def (false [addattr {bool}]) [lam [(a) (b)] b]] +[def (bool!) [hasattr! {bool}]] +[def (pair [lam! {2}]) [lam [(a) (b)] + [[addattr {pair}] + [lam [(get bool!)] + [get a b]]]]] +[def (pair!) [hasattr! {pair}]] +[def (range!) [hasattr! {range}]] +[def (println) [lam [(s)] [lam [] [[print s]] [[print { +}]]]]] +[def (∘) [lam [(a [lam! {1}]) (b)] [applierify-varargs [lam [(applier)] + [a [applier b]]]]]] +[def (;;) [lam [(doc)] id]] +[def (/*) [lam [(ignored)] {}]] +[def (input!) reg!] +[def (output!) reg!] +[def (clobber!) reg!] +[program]] diff --git a/programs/noc/BroadcastMemory/broadcast_memory_noc.c b/programs/noc/BroadcastMemory/broadcast_memory_noc.c new file mode 100644 index 0000000..7be3094 --- /dev/null +++ b/programs/noc/BroadcastMemory/broadcast_memory_noc.c @@ -0,0 +1,59 @@ +#include +#include +// #include + +// #include "asm-gen/hello.h" +#include "asm-gen/flexpret-noc-c-api.h" + +/* | NE | _ | N | E | _ | */ + +/*********************** + * core 0 / N \ core 1 * + * W + E * + * core 2 \ S / core 3 * + ***********************/ + +int main0(); +int main1(); +int main2(); +int main3(); + +int main() { + // hello(41); + int core_id = read_csr(CSR_COREID); + switch(core_id) { + case 0: main0(); break; + case 1: main1(); break; + case 2: main2(); break; + case 3: main3(); break; + default: _fp_print(666); //ERROR + } +} + +int main0() { + broadcast_count(0, 125); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); +} + +int main1() { + read_n_words_and_print(0, EAST_INT); + broadcast_count(1, 
17);
+    read_n_words_and_print(2, NORTHEAST_INT);
+    read_n_words_and_print(3, NORTH_INT);
+}
+
+int main2() {
+    read_n_words_and_print(0, NORTH_INT);
+    read_n_words_and_print(1, NORTHEAST_INT);
+    broadcast_count(2, 42);
+    read_n_words_and_print(3, EAST_INT);
+}
+
+int main3() {
+    read_n_words_and_print(0, NORTHEAST_INT);
+    read_n_words_and_print(1, NORTH_INT);
+    read_n_words_and_print(2, EAST_INT);
+    broadcast_count(3, 3);
+}
diff --git a/scripts/run.sh b/scripts/run.sh
new file mode 100755
index 0000000..1b70ba0
--- /dev/null
+++ b/scripts/run.sh
@@ -0,0 +1 @@
+sbt -Dsbt.main.class=sbt.ScriptMain $1
diff --git a/scripts/script.sc b/scripts/script.sc
new file mode 100755
index 0000000..a9ad42a
--- /dev/null
+++ b/scripts/script.sc
@@ -0,0 +1,130 @@
+/***
+scalaVersion := "2.12.10"
+
+// https://mvnrepository.com/artifact/io.github.pityka/nspl-awt
+libraryDependencies += "io.github.pityka" %% "nspl-awt" % "0.5.0"
+*/
+
+import java.util.regex.Pattern
+import java.util.regex.Matcher
+import java.lang.ProcessBuilder
+import java.nio.file.Path
+import java.io.File
+import java.io.BufferedOutputStream
+import java.io.FileOutputStream
+import java.io.InputStream
+import scala.util.{Try, Using}
+
+import org.nspl._
+import org.nspl.data.HistogramData
+import org.nspl.awtrenderer._
+
+// Reads bytes from `stream` up to (not including) the next '\n' and returns them as a String;
+// returns None when the stream is already at end-of-input.
+val readLine: InputStream => Option[String] = (stream: InputStream) => {
+  var c: Int = stream.read()
+  var ret: List[Int] = Nil
+  if (c != -1) {
+    while (c != '\n' && c != -1) {
+      ret = ret :+ c;
+      c = stream.read()
+    }
+    val arr: Array[Byte] = ret.map(_.toByte).toArray
+    Some(new String(arr))
+  } else {
+    None
+  }
+}
+// The group names here must match the names looked up in analyzeOutputLine.
+val outputLinePattern = Pattern.compile("Core\\-(?<myId>\\d):\\s+(?<theirIdPlusOne>\\d)0+(?<cycleCount>\\d+)")
+// Parses one output line into (myId, theirId, cycleCount); logs and returns None when the line
+// does not match outputLinePattern, instead of falling through to matcher.group and throwing.
+val analyzeOutputLine: String => Option[(Int, Int, Int)] = (outputLine: String) => {
+  val matcher = outputLinePattern.matcher(outputLine)
+  if (!matcher.matches()) {
+    println(s"Output line $outputLine does not match")
+    None
+  } else {
+    Some((
+      Integer.parseInt(matcher.group("myId")),
+      Integer.parseInt(matcher.group("theirIdPlusOne")) - 1,
+      Integer.parseInt(matcher.group("cycleCount"))
+    ))
+  }
+}
+// Writes `data` to `file`; the stream is closed even if a write fails.
+val writeBytesToFile = (data: Array[Byte], file: File) => {
+  val target = new BufferedOutputStream( new FileOutputStream(file) )
+  try data.foreach( target.write(_) ) finally target.close
+}
+val nocBenchmarksPath = Path.of(sys.env("FP_ROOT"), "programs", "benchmarks", "noc")
+val lrssPath = nocBenchmarksPath.resolve("latency_random_sparse_send")
+
+// Runs `make` in directory `p` and prints everything it wrote to standard output.
+val doMake = (p: Path) => {
+  val builder = new ProcessBuilder("make").directory(p.toFile())
+  val process = builder.start()
+  println(new String(process.getInputStream().readAllBytes()))
+}
+
+// Map key for a directed (sender, receiver) core pair; equality and hashing are by value.
+class CommunicationParameters(val senderCore: Int, val receiverCore: Int) {
+  override def equals(x: Any): Boolean = {
+    if (!x.isInstanceOf[CommunicationParameters]) false else (
+      x.asInstanceOf[CommunicationParameters].senderCore == senderCore
+      && x.asInstanceOf[CommunicationParameters].receiverCore == receiverCore
+    )
+  }
+  override def hashCode(): Int = senderCore * 31 + receiverCore
+  override def toString(): String = s"(sender=$senderCore, receiver=$receiverCore)"
+}
+
+type Results = Map[CommunicationParameters, Seq[Int]]
+
+// Runs `fp-emu` in directory `p` and groups the cycle counts parsed from its stderr lines by
+// (sender, receiver) pair. The Try returned by Using is discarded, so read failures are silent.
+val doSimulate: Path => Results = (p: Path) => {
+  var latencies: Results = Map()
+  val process = new ProcessBuilder("fp-emu").directory(p.toFile()).start()
+  Using(process.getErrorStream()) { stream =>
+    while (readLine(stream) match {
+      case Some(s) => {
+        analyzeOutputLine(s) match {
+          case Some((myId, theirId, cycleCount)) => {
+            // Assume reporter is receiver
+            val parameters = new CommunicationParameters(theirId, myId)
+            latencies = if (latencies.contains(parameters)) latencies.updated(
+              parameters, latencies(parameters) :+ cycleCount
+            ) else (latencies + (parameters -> (cycleCount :: Nil)))
+          }
+          case _ => None
+        }
+        true
+      }
+      case _ => false
+    }) { }
+  }
+  latencies
+}
+
+
+val makePlot = (latencies: Results) => {
+  println(latencies)
+  val someData = 
HistogramData(latencies.get(new CommunicationParameters(0, 1)).getOrElse(Nil).map(_.toDouble), 10) -> bar() + + val plot = xyplot(someData)( + par( + main="Distribution of latencies", + xlab="latency", + ylab="frequency" + ) + ) + println("Writing to " + Path.of(sys.env("FP_ROOT"), "temp.png")) + writeBytesToFile(renderToByteArray(plot.build, width=2000), Path.of(sys.env("FP_ROOT"), "temp.png").toFile()) +} + +doMake(lrssPath) +val latencies = doSimulate(lrssPath) +makePlot(latencies) + + diff --git a/soc-comm b/soc-comm index e13da1b..809e01e 160000 --- a/soc-comm +++ b/soc-comm @@ -1 +1 @@ -Subproject commit e13da1be215a222b071b104775d3ea6d05978a14 +Subproject commit 809e01e996a5d21980cf6433c56757cd0a4766df diff --git a/src/main/scala/Top.scala b/src/main/scala/Top.scala index 7db13ed..12980a0 100644 --- a/src/main/scala/Top.scala +++ b/src/main/scala/Top.scala @@ -3,7 +3,7 @@ import chisel3._ import chisel3.util.experimental.loadMemoryFromFileInline // To load program into ISpm import flexpret.core.{Core, FlexpretConfiguration, GPIO, HostIO, ISpm} -import wishbone.{S4NoCTopWB} +import s4noc.S4NoC import s4noc.Config @@ -34,14 +34,12 @@ class Top(topCfg: TopConfig) extends Module { val wbBuses = for (i <- 0 until topCfg.nCores) yield { Module(new WishboneBus( masterWidth = topCfg.coreCfgs(i).busAddrBits, - deviceWidths = Seq(4,4) // NOC width=4 and Uart width = 4 + deviceWidths = Seq(4) // Uart width = 4 )) } // NoC with n ports - val noc = Module(new S4NoCTopWB(Config(4, 2, 2, 2, 32))) - noc.io.wbPorts.map(_.setDefaults) - + val noc = Module(new S4NoC(Config(4, 2, 2, 2, 32))) // Termination and printing logic (just for simulation) val regCoreDone = RegInit(VecInit(Seq.fill(topCfg.nCores)(false.B))) @@ -54,15 +52,13 @@ class Top(topCfg: TopConfig) extends Module { cores(i).io.int_exts.foreach(_ := false.B) // Connect to wbM master cores(i).io.bus <> wbMasters(i).busIO + cores(i).io.noc <> noc.io(i) // Connect WbMaster to WbBus wbMasters(i).wbIO <> 
wbBuses(i).io.wbMaster - // Connect WbBus to NOC - wbBuses(i).io.wbDevices(0) <> noc.io.wbPorts(i) - // Connect WbBus to Uart - wbBuses(i).io.wbDevices(1) <> wbUarts(i).io.port + wbBuses(i).io.wbDevices(0) <> wbUarts(i).io.port // Connect all cores to uart input wbUarts(i).ioUart.rx := io.uart.rx @@ -80,7 +76,7 @@ class Top(topCfg: TopConfig) extends Module { } regCoreDone(i) := true.B } - + // Handle printfs when(cores(i).io.host.to_host === "hbaaabaaa".U) { regCorePrintNext(i) := true.B diff --git a/src/main/scala/WishboneMaster.scala b/src/main/scala/WishboneMaster.scala index 564d077..5fdc2f3 100644 --- a/src/main/scala/WishboneMaster.scala +++ b/src/main/scala/WishboneMaster.scala @@ -31,7 +31,8 @@ class WishboneMaster(addrBits: Int)(implicit conf: FlexpretConfiguration) extend val wDoRead = WireDefault(false.B) val wDoWrite = WireDefault(false.B) assert(!(wDoRead && wDoWrite), "Both read and write at the same time") - assert(!(busIO.enable && regState =/= sIdle), "Recevied bus request while busy") + // assert(!(busIO.enable && regState =/= sIdle), "Received bus request while busy") // How is the CPU supposed to know not to send bus request if it cannot even read the status of the WishboneMaster? + regBusRead := regStatus switch(regState) { // Idle state. Waiting for request from FlexPret Core @@ -62,7 +63,6 @@ class WishboneMaster(addrBits: Int)(implicit conf: FlexpretConfiguration) extend when(addr === MMIO_READ_DATA) { regBusRead := regReadData }.elsewhen(addr === MMIO_STATUS) { - regBusRead := regStatus regStatus := false.B }.otherwise { assert(false.B, "Tried to read from invalid address %d on wishbone bus master", addr)