From fdf8d34e43f4838dcd4c120466d6ea7ff7ba02d6 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 30 Dec 2022 01:12:02 -0800 Subject: [PATCH 01/34] Tentative start on NoC benchmarks. --- programs/benchmarks/noc/latency/Makefile | 10 ++ programs/benchmarks/noc/latency/noc_latency.c | 54 ++++++++ .../noc/latency_random_sparse_send/Makefile | 10 ++ .../noc_latency_random_sparse_send.c | 50 ++++++++ scripts/run.sh | 1 + scripts/script.sc | 119 ++++++++++++++++++ 6 files changed, 244 insertions(+) create mode 100644 programs/benchmarks/noc/latency/Makefile create mode 100644 programs/benchmarks/noc/latency/noc_latency.c create mode 100644 programs/benchmarks/noc/latency_random_sparse_send/Makefile create mode 100644 programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c create mode 100755 scripts/run.sh create mode 100755 scripts/script.sc diff --git a/programs/benchmarks/noc/latency/Makefile b/programs/benchmarks/noc/latency/Makefile new file mode 100644 index 0000000..cdfaef8 --- /dev/null +++ b/programs/benchmarks/noc/latency/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm noc_latency.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/benchmarks/noc/latency/noc_latency.c b/programs/benchmarks/noc/latency/noc_latency.c new file mode 100644 index 0000000..c880037 --- /dev/null +++ b/programs/benchmarks/noc/latency/noc_latency.c @@ -0,0 +1,54 @@ +#include +#include +#include + +#define N 3 + +static int main_of(uint32_t core); + +int main() { + main_of(read_csr(CSR_COREID)); +} + +static int send_main(uint32_t receiver) { + _fp_print(111111111); + noc_send(receiver, 0); // sync for reproducibility + int cycles_of_latency = 0; + for (uint32_t i = 0; i < N; i++) { + unsigned long t0 = rdcycle(); // benchmark start + noc_send(receiver, i); + uint32_t reply = noc_receive(); + unsigned long t1 = rdcycle(); // benchmark end + _fp_print(receiver * 1000000 + t1 - t0); + cycles_of_latency = t1 - t0; + } + for (uint32_t i = 0; i < N; i++) { + unsigned long t0 = rdcycle(); // benchmark start + noc_send(receiver, t0); + } +} + +static int receive_main(uint32_t sender) { + noc_receive(); // sync for reproducibility + for (uint32_t i = 0; i < N; i++) { + noc_send(sender, noc_receive()); + } + for (uint32_t i = 0; i < N; i++) { + uint32_t t0 = noc_receive(); + uint32_t t1 = rdcycle(); // benchmark end + _fp_print(sender * 1000000 + t1 - t0); + } +} + +static int send_receive(uint32_t partner, int first) { + first ? send_main(partner) : receive_main(partner); + !first ? send_main(partner) : receive_main(partner); +} + +static int main_of(uint32_t core) { + int big = core & 2; + int odd = core & 1; + send_receive((core + 1) & 3, !odd); + send_receive((core + 2) & 3, !big); + send_receive((core + 3) & 3, !odd); +} diff --git a/programs/benchmarks/noc/latency_random_sparse_send/Makefile b/programs/benchmarks/noc/latency_random_sparse_send/Makefile new file mode 100644 index 0000000..a7e3df6 --- /dev/null +++ b/programs/benchmarks/noc/latency_random_sparse_send/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm noc_latency_random_sparse_send.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c b/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c new file mode 100644 index 0000000..119f622 --- /dev/null +++ b/programs/benchmarks/noc/latency_random_sparse_send/noc_latency_random_sparse_send.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +#define N 100 +// 1 << LOG2_OF_A_LONG_TIME should be much greater than the number of cycles required to run +// one iteration of the benchmark. I think it takes less than 512 cycles to run one iteration +// of the benchmark. +#define LOG2_OF_A_LONG_TIME 11 + +static int main_of(uint32_t core); + +int main() { + unsigned long coreid = read_csr(CSR_COREID); + srand(coreid); + main_of(coreid); +} + +static int send_main(uint32_t receiver) { + for (uint32_t i = 0; i < N; i++) { + uint32_t min_delay = 1 << LOG2_OF_A_LONG_TIME; + uint32_t additional_delay = rand() & ((1 << LOG2_OF_A_LONG_TIME) - 1); + unsigned long end_time = rdcycle() + min_delay + additional_delay; + while (rdcycle() < end_time) {} + unsigned long t0 = rdcycle(); // benchmark start + noc_send(receiver, t0); + } +} + +static int receive_main(uint32_t sender) { + for (uint32_t i = 0; i < N; i++) { + uint32_t t0 = noc_receive(); + uint32_t t1 = rdcycle(); // benchmark end + _fp_print((sender + 1) * 1000000 + t1 - t0); + } +} + +static int send_receive(uint32_t partner, int first) { + first ? send_main(partner) : receive_main(partner); + !first ? send_main(partner) : receive_main(partner); +} + +static int main_of(uint32_t core) { + int big = core & 2; + int odd = core & 1; + send_receive((core + 1) & 3, !odd); + send_receive((core + 2) & 3, !big); + send_receive((core + 3) & 3, !odd); +} diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100755 index 0000000..1b70ba0 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1 @@ +sbt -Dsbt.main.class=sbt.ScriptMain $1 diff --git a/scripts/script.sc b/scripts/script.sc new file mode 100755 index 0000000..a9ad42a --- /dev/null +++ b/scripts/script.sc @@ -0,0 +1,119 @@ +/*** +scalaVersion := "2.12.10" + +// https://mvnrepository.com/artifact/io.github.pityka/nspl-awt +libraryDependencies += "io.github.pityka" %% "nspl-awt" % "0.5.0" +*/ + +import java.util.regex.Pattern +import java.util.regex.Matcher +import java.lang.ProcessBuilder +import java.nio.file.Path +import java.io.File +import java.io.BufferedOutputStream +import java.io.FileOutputStream +import java.io.InputStream +import scala.util.{Try, Using} + +import org.nspl._ +import org.nspl.data.HistogramData +import org.nspl.awtrenderer._ + +val readLine: InputStream => Option[String] = (stream: InputStream) => { + var c: Int = stream.read() + var ret: List[Int] = Nil + if (c != -1) { + while (c != '\n' && c != -1) { + ret = ret :+ c; + c = stream.read() + } + val arr: Array[Byte] = ret.map(_.toByte).toArray + Some(new String(arr)) + } else { + None + } +} +val outputLinePattern = Pattern.compile("Core\\-(?\\d):\\s+(?\\d)0+(?\\d+)") +val analyzeOutputLine: String => Option[(Int, Int, Int)] = (outputLine: String) => { + val matcher = outputLinePattern.matcher(outputLine) + if (!matcher.matches()) { + println(s"Output line $outputLine does not match") + None + } + Some(( + Integer.parseInt(matcher.group("myId")), + Integer.parseInt(matcher.group("theirIdPlusOne")) - 1, + Integer.parseInt(matcher.group("cycleCount")) + )) +} +val writeBytesToFile = (data: Array[Byte], file: File) => { + val target = new BufferedOutputStream( new FileOutputStream(file) ) + try data.foreach( target.write(_) ) finally target.close +} +val nocBenchmarksPath = Path.of(sys.env("FP_ROOT"), "programs", "benchmarks", "noc") +val lrssPath = nocBenchmarksPath.resolve("latency_random_sparse_send") + +val doMake = (p: Path) => { + val builder = new ProcessBuilder("make").directory(lrssPath.toFile()) + val process = builder.start() + println(new String(process.getInputStream().readAllBytes())) +} + +class CommunicationParameters(val senderCore: Int, val receiverCore: Int) { + override def equals(x: Any): Boolean = { + if (!x.isInstanceOf[CommunicationParameters]) false else ( + x.asInstanceOf[CommunicationParameters].senderCore == senderCore + && x.asInstanceOf[CommunicationParameters].receiverCore == receiverCore + ) + } + override def hashCode(): Int = senderCore * 31 + receiverCore + override def toString(): String = s"(sender=$senderCore, receiver=$receiverCore)" +} + +type Results = Map[CommunicationParameters, Seq[Int]] + +val doSimulate: Path => Results = (p: Path) => { + var latencies: Results = Map() + val process = new ProcessBuilder("fp-emu").directory(lrssPath.toFile()).start() + Using(process.getErrorStream()) { stream => + while (readLine(stream) match { + case Some(s) => { + analyzeOutputLine(s) match { + case Some((myId, theirId, cycleCount)) => { + // Assume reporter is receiver + val parameters = new CommunicationParameters(theirId, myId) + latencies = if (latencies.contains(parameters)) latencies.updated( + parameters, latencies(parameters) :+ cycleCount + ) else (latencies + (parameters -> (cycleCount :: Nil))) + } + case _ => None + } + true + } + case _ => false + }) { } + } + latencies +} + + +val makePlot = (latencies: Results) => { + println(latencies) + val someData = HistogramData(latencies.get(new CommunicationParameters(0, 1)).getOrElse(Nil).map(_.toDouble), 10) -> bar() + + val plot = xyplot(someData)( + par( + main="Distribution of latencies", + xlab="latency", + ylab="frequency" + ) + ) + println("Writing to " + Path.of(sys.env("FP_ROOT"), "temp.png")) + writeBytesToFile(renderToByteArray(plot.build, width=2000), Path.of(sys.env("FP_ROOT"), "temp.png").toFile()) +} + +doMake(lrssPath) +val latencies = doSimulate(lrssPath) +makePlot(latencies) + + From 1cff46f82c5825acea6cb18915d6b4e4cf672347 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 30 Dec 2022 20:07:52 -0800 Subject: [PATCH 02/34] Measure 11 cycles of latency by cheating. --- .../benchmarks/noc/latency_aligned/Makefile | 10 +++ .../benchmarks/noc/latency_aligned/align.h | 34 ++++++++ .../noc/latency_aligned/noc_latency_aligned.c | 80 +++++++++++++++++++ src/main/scala/WishboneMaster.scala | 4 +- 4 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 programs/benchmarks/noc/latency_aligned/Makefile create mode 100644 programs/benchmarks/noc/latency_aligned/align.h create mode 100644 programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c diff --git a/programs/benchmarks/noc/latency_aligned/Makefile b/programs/benchmarks/noc/latency_aligned/Makefile new file mode 100644 index 0000000..2463e1d --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm noc_latency_aligned.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/benchmarks/noc/latency_aligned/align.h b/programs/benchmarks/noc/latency_aligned/align.h new file mode 100644 index 0000000..c5f19ea --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/align.h @@ -0,0 +1,34 @@ +#define WAIT_FOR_NEXT_ZERO_MOD_1024(id) \ + "li t0, 1014\n\t" \ + "li a0, 1\n\t" \ + "li a1, 2\n\t" \ + "li a2, 3\n\t" \ + "li a3, 4\n\t" \ + "li a4, 5\n\t" \ + "li a5, 6\n\t" \ + "li t6, 7\n\t" \ + "rdcycle t1\n\t" \ + "andi t1, t1, 7\n\t" \ + "beq t1, t6, LOOP" #id "\n\t" \ + "beq t1, a5, LOOP" #id "\n\t" \ + "beq t1, a4, LOOP" #id "\n\t" \ + "beq t1, a3, LOOP" #id "\n\t" \ + "beq t1, a2, LOOP" #id "\n\t" \ + "beq t1, a1, LOOP" #id "\n\t" \ + "beq t1, a0, LOOP" #id "\n\t" \ + "beq t1, x0, LOOP" #id "\n\t" \ + /* This entire loop is 8 cycles long, so the value of t1 upon exiting is t0 plus a */ \ + /* number in the range [0, 7] */ \ + "LOOP" #id ":\n\t" \ + "nop\n\t" /* Delay so that loop length is a power of 2 */ \ + "nop\n\t" \ + "nop\n\t" \ + "rdcycle t1\n\t" \ + "andi t1, t1, 1023\n\t" \ + "blt t1, t0, LOOP" #id "\n\t" /* Cost of 3 cycles when taken, 1 otherwise; see page 37 https://www2.eecs.berkeley.edu/Pubs/TechRpts/2015/EECS-2015-181.pdf */ \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c new file mode 100644 index 0000000..6b1476d --- /dev/null +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -0,0 +1,80 @@ + +#include +#include +#include +#include + +#include "align.h" + +#define N 100 + +static int main_of(uint32_t core); + +static int send_main(uint32_t receiver); +static int receive_main(uint32_t sender); + +int main() { + unsigned long coreid = read_csr(CSR_COREID); + srand(coreid); + if (coreid == 0) send_main(1); + if (coreid == 1) receive_main(0); +} + +static int send_main(uint32_t receiver) { + asm volatile ( + // "li t1, 0x40000004\n\t" // wishbone address mmio address + // "li t2, 0x40000008\n\t" // wishbone data mmio address + // "li t3, 0x08\n\t" // noc destination wishbone address + // "li t4, 0x04\n\t" // noc data wishbone address + "li t4, 0x40000000\n\t" + // Set noc destination to 1 + "li t5, 0x08\n\t" // noc destination wishbone address + "sw t5, 4(t4)\n\t" + "li t5, 0x1\n\t" // noc destination + "sw t5, 8(t4)\n\t" + // Check that the wishbone has accepted the write + "nop\n\t" + "nop\n\t" + "lw t5, 16(t4)\n\t" + "li t4, 0xbaaabaaa\n\t" + "csrw 0x51e, t4\n\t" + "csrw 0x51e, t5\n\t" // should be nonzero + "li t4, 0x40000000\n\t" + WAIT_FOR_NEXT_ZERO_MOD_1024(send) // clobber "a" registers, as well as t0, t1, t6 + // like noc_send, but without blocking + "li t5, 0x04\n\t" // Set noc data to 42 + "sw t5, 4(t4)\n\t" + "li t5, 42\n\t" + "sw t5, 8(t4)\n\t" + "li t5, 0x08\n\t" // noc destination + "sw t5, 4(t4)\n\t" + "li t5, 0x1\n\t" + "sw t5, 8(t4)\n\t" + "nop\n\t" + "nop\n\t" + // Is it necessary to wait here to make sure the destination was passed down to the NoC? + // "li t5, 42\n\t" // noc value + // "sw t5, 4(x0)\n\t" // noc data + // "rdcycle t1\n\t" + // "andi t1, t1, 1023\n\t" + // "li t0, 0xbaaabaaa\n\t" + // "csrw 0x51e, t0\n\t" + // "csrw 0x51e, t1\n\t" + ); +} + +static int receive_main(uint32_t sender) { + asm volatile ( + WAIT_FOR_NEXT_ZERO_MOD_1024(receive) + "li t4, 0x40000000\n\t" // wishbone base address + "CHECK_IF_RECEIVED_YET:\n\t" + "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address + "lw t5, 12(t4)\n\t" // Read NoC CSR + "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t" + "rdcycle t3\n\t" + "andi t3, t3, 1023\n\t" + "li t0, 0xbaaabaaa\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t3\n\t" + ); +} diff --git a/src/main/scala/WishboneMaster.scala b/src/main/scala/WishboneMaster.scala index 564d077..5fdc2f3 100644 --- a/src/main/scala/WishboneMaster.scala +++ b/src/main/scala/WishboneMaster.scala @@ -31,7 +31,8 @@ class WishboneMaster(addrBits: Int)(implicit conf: FlexpretConfiguration) extend val wDoRead = WireDefault(false.B) val wDoWrite = WireDefault(false.B) assert(!(wDoRead && wDoWrite), "Both read and write at the same time") - assert(!(busIO.enable && regState =/= sIdle), "Recevied bus request while busy") + // assert(!(busIO.enable && regState =/= sIdle), "Received bus request while busy") // How is the CPU supposed to know not to send bus request if it cannot even read the status of the WishboneMaster? + regBusRead := regStatus switch(regState) { // Idle state. Waiting for request from FlexPret Core @@ -62,7 +63,6 @@ class WishboneMaster(addrBits: Int)(implicit conf: FlexpretConfiguration) extend when(addr === MMIO_READ_DATA) { regBusRead := regReadData }.elsewhen(addr === MMIO_STATUS) { - regBusRead := regStatus regStatus := false.B }.otherwise { assert(false.B, "Tried to read from invalid address %d on wishbone bus master", addr) From 841e500a0c113754c3fd414d0089134f31421f42 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 30 Dec 2022 20:19:16 -0800 Subject: [PATCH 03/34] store->nop->load -> wrong WB read. Bug? --- .../noc/latency_aligned/noc_latency_aligned.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c index 6b1476d..2aa72ec 100644 --- a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -22,10 +22,6 @@ int main() { static int send_main(uint32_t receiver) { asm volatile ( - // "li t1, 0x40000004\n\t" // wishbone address mmio address - // "li t2, 0x40000008\n\t" // wishbone data mmio address - // "li t3, 0x08\n\t" // noc destination wishbone address - // "li t4, 0x04\n\t" // noc data wishbone address "li t4, 0x40000000\n\t" // Set noc destination to 1 "li t5, 0x08\n\t" // noc destination wishbone address @@ -33,8 +29,6 @@ static int send_main(uint32_t receiver) { "li t5, 0x1\n\t" // noc destination "sw t5, 8(t4)\n\t" // Check that the wishbone has accepted the write - "nop\n\t" - "nop\n\t" "lw t5, 16(t4)\n\t" "li t4, 0xbaaabaaa\n\t" "csrw 0x51e, t4\n\t" @@ -50,16 +44,6 @@ static int send_main(uint32_t receiver) { "sw t5, 4(t4)\n\t" "li t5, 0x1\n\t" "sw t5, 8(t4)\n\t" - "nop\n\t" - "nop\n\t" - // Is it necessary to wait here to make sure the destination was passed down to the NoC? - // "li t5, 42\n\t" // noc value - // "sw t5, 4(x0)\n\t" // noc data - // "rdcycle t1\n\t" - // "andi t1, t1, 1023\n\t" - // "li t0, 0xbaaabaaa\n\t" - // "csrw 0x51e, t0\n\t" - // "csrw 0x51e, t1\n\t" ); } @@ -69,6 +53,7 @@ static int receive_main(uint32_t sender) { "li t4, 0x40000000\n\t" // wishbone base address "CHECK_IF_RECEIVED_YET:\n\t" "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address + "nop\n\t" "lw t5, 12(t4)\n\t" // Read NoC CSR "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t" "rdcycle t3\n\t" From f9afb8663d7e9925329cf4c0d62d16c42ca09b57 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Sat, 31 Dec 2022 14:06:33 -0800 Subject: [PATCH 04/34] Actually 35 cycles of latency it seems. --- .../noc/latency_aligned/noc_latency_aligned.c | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c index 2aa72ec..4bdb8c3 100644 --- a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -21,40 +21,52 @@ int main() { } static int send_main(uint32_t receiver) { + _fp_print(wb_read(NOC_CSR)); asm volatile ( "li t4, 0x40000000\n\t" // Set noc destination to 1 - "li t5, 0x08\n\t" // noc destination wishbone address - "sw t5, 4(t4)\n\t" - "li t5, 0x1\n\t" // noc destination - "sw t5, 8(t4)\n\t" - // Check that the wishbone has accepted the write - "lw t5, 16(t4)\n\t" - "li t4, 0xbaaabaaa\n\t" - "csrw 0x51e, t4\n\t" - "csrw 0x51e, t5\n\t" // should be nonzero - "li t4, 0x40000000\n\t" + // "li t5, 0x08\n\t" // noc destination wishbone address + // "sw t5, 4(t4)\n\t" + // "li t5, 0x1\n\t" // noc destination + // "sw t5, 8(t4)\n\t" + // // Check that the wishbone has accepted the write + // "lw t5, 16(t4)\n\t" + // "li t4, 0xbaaabaaa\n\t" + // "csrw 0x51e, t4\n\t" + // "csrw 0x51e, t5\n\t" // should be nonzero + // "li t4, 0x40000000\n\t" WAIT_FOR_NEXT_ZERO_MOD_1024(send) // clobber "a" registers, as well as t0, t1, t6 // like noc_send, but without blocking - "li t5, 0x04\n\t" // Set noc data to 42 - "sw t5, 4(t4)\n\t" - "li t5, 42\n\t" + "li t5, 0x1\n\t" // noc destination "sw t5, 8(t4)\n\t" - "li t5, 0x08\n\t" // noc destination + "li t5, 0x08\n\t" + "sw t5, 4(t4)\n\t" + "nop\n\t" + "nop\n\t" + "li t5, 42\n\t" // Set noc data to 42 + "sw t5, 8(t4)\n\t" // FIXME: Data must be written first? Why? Is it Hardware Bug? + "li t5, 0x04\n\t" "sw t5, 4(t4)\n\t" - "li t5, 0x1\n\t" - "sw t5, 8(t4)\n\t" ); } static int receive_main(uint32_t sender) { + // asm volatile( + // "nop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\t" + // ); + // while (1) _fp_print(wb_read(NOC_CSR)); asm volatile ( WAIT_FOR_NEXT_ZERO_MOD_1024(receive) "li t4, 0x40000000\n\t" // wishbone base address "CHECK_IF_RECEIVED_YET:\n\t" "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address "nop\n\t" + "nop\n\t" "lw t5, 12(t4)\n\t" // Read NoC CSR + // "li t0, 0xbaaabaaa\n\t" + // "csrw 0x51e, t0\n\t" + // "csrw 0x51e, t5\n\t" + "andi t5, t5, 2\n\t" "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t" "rdcycle t3\n\t" "andi t3, t3, 1023\n\t" From ec8290218dc19856bd085aa752dfe6e02caf6ebb Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Sat, 31 Dec 2022 14:42:39 -0800 Subject: [PATCH 05/34] Adjust and comment on noc_latency_aligned. --- .../noc/latency_aligned/noc_latency_aligned.c | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c index 4bdb8c3..804e68b 100644 --- a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -1,4 +1,8 @@ - +/** + * This program explores the absolute minimum amount of time that it can take to send one word and + * write it into a register on another core, when under the most favorable circumstances, + * and when controlling relative timing and optimizing the assembly. + */ #include #include #include @@ -16,25 +20,13 @@ static int receive_main(uint32_t sender); int main() { unsigned long coreid = read_csr(CSR_COREID); srand(coreid); - if (coreid == 0) send_main(1); - if (coreid == 1) receive_main(0); + if (coreid == 0) for (int i = 0; i < 10; i++) send_main(1); + if (coreid == 1) for (int i = 0; i < 10; i++) receive_main(0); } static int send_main(uint32_t receiver) { - _fp_print(wb_read(NOC_CSR)); asm volatile ( "li t4, 0x40000000\n\t" - // Set noc destination to 1 - // "li t5, 0x08\n\t" // noc destination wishbone address - // "sw t5, 4(t4)\n\t" - // "li t5, 0x1\n\t" // noc destination - // "sw t5, 8(t4)\n\t" - // // Check that the wishbone has accepted the write - // "lw t5, 16(t4)\n\t" - // "li t4, 0xbaaabaaa\n\t" - // "csrw 0x51e, t4\n\t" - // "csrw 0x51e, t5\n\t" // should be nonzero - // "li t4, 0x40000000\n\t" WAIT_FOR_NEXT_ZERO_MOD_1024(send) // clobber "a" registers, as well as t0, t1, t6 // like noc_send, but without blocking "li t5, 0x1\n\t" // noc destination @@ -51,27 +43,38 @@ static int send_main(uint32_t receiver) { } static int receive_main(uint32_t sender) { - // asm volatile( - // "nop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\t" - // ); - // while (1) _fp_print(wb_read(NOC_CSR)); asm volatile ( WAIT_FOR_NEXT_ZERO_MOD_1024(receive) + // "nop\n\t" // The 9-cycle read loop is aligned optimally when the number of nops here is zero mod 9 + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" + // "nop\n\t" "li t4, 0x40000000\n\t" // wishbone base address + // FIXME: Why does this loop have to go through one iteration extra the first time around, compared to the number of iterations that it makes thereafter? "CHECK_IF_RECEIVED_YET:\n\t" + // Sadly, this whole sequence -- store, wait, read, mask, beq -- must be in the loop. In particular, if the store is factored out, the read doesn't work, even though we are storing the same thing each time. "sw x0, 0(t4)\n\t" // Write the address of NoC CSR to Wishbone read address "nop\n\t" "nop\n\t" "lw t5, 12(t4)\n\t" // Read NoC CSR - // "li t0, 0xbaaabaaa\n\t" - // "csrw 0x51e, t0\n\t" - // "csrw 0x51e, t5\n\t" "andi t5, t5, 2\n\t" "beq x0, t5, CHECK_IF_RECEIVED_YET\n\t" + "li t5, 4\n\t" // Write the address of NoC data to Wishbone read address + "sw t5, 0(t4)\n\t" + "nop\n\t" + "nop\n\t" + "lw t5, 12(t4)\n\t" // Read NoC data "rdcycle t3\n\t" "andi t3, t3, 1023\n\t" "li t0, 0xbaaabaaa\n\t" "csrw 0x51e, t0\n\t" "csrw 0x51e, t3\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t5\n\t" ); } From 6abf1159bf6c39eaa1df0405c7aa33eff928542c Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Sun, 1 Jan 2023 23:31:17 -0800 Subject: [PATCH 06/34] Experiment with the NoC interface. --- .gitignore | 3 + flexpret | 2 +- programs/HelloWorld/hello.c | 2 +- programs/benchmarks/noc/latency/noc_latency.c | 54 --------------- .../LowLevelInterface}/Makefile | 2 +- .../low_level_interface_noc.c | 65 +++++++++++++++++++ soc-comm | 2 +- src/main/scala/Top.scala | 16 ++--- 8 files changed, 78 insertions(+), 68 deletions(-) delete mode 100644 programs/benchmarks/noc/latency/noc_latency.c rename programs/{benchmarks/noc/latency => noc/LowLevelInterface}/Makefile (62%) create mode 100644 programs/noc/LowLevelInterface/low_level_interface_noc.c diff --git a/.gitignore b/.gitignore index fec5bad..08a5683 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ +.bloop/ .bsp/ +.idea/ .metals/ +.scala-build/ .vscode/ project/ target/ diff --git a/flexpret b/flexpret index d51bbbf..5d57943 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit d51bbbf6bde11217370080e76f5eb6efd88da94d +Subproject commit 5d5794361b3c7463ed31f4b31524f66f9ce5db50 diff --git a/programs/HelloWorld/hello.c b/programs/HelloWorld/hello.c index 91440a6..a15579c 100644 --- a/programs/HelloWorld/hello.c +++ b/programs/HelloWorld/hello.c @@ -34,4 +34,4 @@ int main2() { int main3() { _fp_print(43); -} \ No newline at end of file +} diff --git a/programs/benchmarks/noc/latency/noc_latency.c b/programs/benchmarks/noc/latency/noc_latency.c deleted file mode 100644 index c880037..0000000 --- a/programs/benchmarks/noc/latency/noc_latency.c +++ /dev/null @@ -1,54 +0,0 @@ -#include -#include -#include - -#define N 3 - -static int main_of(uint32_t core); - -int main() { - main_of(read_csr(CSR_COREID)); -} - -static int send_main(uint32_t receiver) { - _fp_print(111111111); - noc_send(receiver, 0); // sync for reproducibility - int cycles_of_latency = 0; - for (uint32_t i = 0; i < N; i++) { - unsigned long t0 = rdcycle(); // benchmark start - noc_send(receiver, i); - uint32_t reply = noc_receive(); - unsigned long t1 = rdcycle(); // benchmark end - _fp_print(receiver * 1000000 + t1 - t0); - cycles_of_latency = t1 - t0; - } - for (uint32_t i = 0; i < N; i++) { - unsigned long t0 = rdcycle(); // benchmark start - noc_send(receiver, t0); - } -} - -static int receive_main(uint32_t sender) { - noc_receive(); // sync for reproducibility - for (uint32_t i = 0; i < N; i++) { - noc_send(sender, noc_receive()); - } - for (uint32_t i = 0; i < N; i++) { - uint32_t t0 = noc_receive(); - uint32_t t1 = rdcycle(); // benchmark end - _fp_print(sender * 1000000 + t1 - t0); - } -} - -static int send_receive(uint32_t partner, int first) { - first ? send_main(partner) : receive_main(partner); - !first ? send_main(partner) : receive_main(partner); -} - -static int main_of(uint32_t core) { - int big = core & 2; - int odd = core & 1; - send_receive((core + 1) & 3, !odd); - send_receive((core + 2) & 3, !big); - send_receive((core + 3) & 3, !odd); -} diff --git a/programs/benchmarks/noc/latency/Makefile b/programs/noc/LowLevelInterface/Makefile similarity index 62% rename from programs/benchmarks/noc/latency/Makefile rename to programs/noc/LowLevelInterface/Makefile index cdfaef8..4bcaa17 100644 --- a/programs/benchmarks/noc/latency/Makefile +++ b/programs/noc/LowLevelInterface/Makefile @@ -1,5 +1,5 @@ build: - riscv_compile.sh ispm noc_latency.c + riscv_compile.sh ispm low_level_interface_noc.c clean: riscv_clean.sh diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c new file mode 100644 index 0000000..e155da3 --- /dev/null +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -0,0 +1,65 @@ +#include +#include + +int main0(); +int main1(); +int main2(); +int main3(); + +int main() { + int core_id = read_csr(CSR_COREID); + switch(core_id) { + case 0: main0(); break; + case 1: main1(); break; + case 2: main1(); break; + case 3: main1(); break; + default: _fp_print(66); //ERROR + } +} + +int main0() { + asm volatile( + "li t4, 0x80000000\n\t" + "li t3, 43\n\t" + "sw t3, 0(t4)\n\t" + ); +} + +int main1() { + asm volatile( + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "li t4, 0x80000000\n\t" + "lw t3, 0(t4)\n\t" + "li t0, 0xbaaabaaa\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t3\n\t" + ); +} + +int main2() { + +} + +int main3() { + +} diff --git a/soc-comm b/soc-comm index e13da1b..c5ec6d9 160000 --- a/soc-comm +++ b/soc-comm @@ -1 +1 @@ -Subproject commit e13da1be215a222b071b104775d3ea6d05978a14 +Subproject commit c5ec6d9f613e806e40662574c8cc5718db0de204 diff --git a/src/main/scala/Top.scala b/src/main/scala/Top.scala index 7db13ed..12980a0 100644 --- a/src/main/scala/Top.scala +++ b/src/main/scala/Top.scala @@ -3,7 +3,7 @@ import chisel3._ import chisel3.util.experimental.loadMemoryFromFileInline // To load program into ISpm import flexpret.core.{Core, FlexpretConfiguration, GPIO, HostIO, ISpm} -import wishbone.{S4NoCTopWB} +import s4noc.S4NoC import s4noc.Config @@ -34,14 +34,12 @@ class Top(topCfg: TopConfig) extends Module { val wbBuses = for (i <- 0 until topCfg.nCores) yield { Module(new WishboneBus( masterWidth = topCfg.coreCfgs(i).busAddrBits, - deviceWidths = Seq(4,4) // NOC width=4 and Uart width = 4 + deviceWidths = Seq(4) // Uart width = 4 )) } // NoC with n ports - val noc = Module(new S4NoCTopWB(Config(4, 2, 2, 2, 32))) - noc.io.wbPorts.map(_.setDefaults) - + val noc = Module(new S4NoC(Config(4, 2, 2, 2, 32))) // Termination and printing logic (just for simulation) val regCoreDone = RegInit(VecInit(Seq.fill(topCfg.nCores)(false.B))) @@ -54,15 +52,13 @@ class Top(topCfg: TopConfig) extends Module { cores(i).io.int_exts.foreach(_ := false.B) // Connect to wbM master cores(i).io.bus <> wbMasters(i).busIO + cores(i).io.noc <> noc.io(i) // Connect WbMaster to WbBus wbMasters(i).wbIO <> wbBuses(i).io.wbMaster - // Connect WbBus to NOC - wbBuses(i).io.wbDevices(0) <> noc.io.wbPorts(i) - // Connect WbBus to Uart - wbBuses(i).io.wbDevices(1) <> wbUarts(i).io.port + wbBuses(i).io.wbDevices(0) <> wbUarts(i).io.port // Connect all cores to uart input wbUarts(i).ioUart.rx := io.uart.rx @@ -80,7 +76,7 @@ class Top(topCfg: TopConfig) extends Module { } regCoreDone(i) := true.B } - + // Handle printfs when(cores(i).io.host.to_host === "hbaaabaaa".U) { regCorePrintNext(i) := true.B From 828d100c8184e88567e91a1e9b326d5f779560ac Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Mon, 2 Jan 2023 14:43:24 -0800 Subject: [PATCH 07/34] More tinkering. I realize now that NetworkInterface really is needed since in order for a communication protocol to work properly the received messages must be placed in buffers corresponding to the sender cores. --- flexpret | 2 +- .../low_level_interface_noc.c | 30 +++++++++++++++++++ soc-comm | 2 +- src/main/scala/Top.scala | 4 ++- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/flexpret b/flexpret index 5d57943..2bb0d20 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 5d5794361b3c7463ed31f4b31524f66f9ce5db50 +Subproject commit 2bb0d20866bbb738dbf91be38dc43fd38ff71038 diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index e155da3..9634c0a 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -20,8 +20,21 @@ int main() { int main0() { asm volatile( "li t4, 0x80000000\n\t" + "li t2, 42\n\t" "li t3, 43\n\t" + "li a1, 44\n\t" + "li t5, 45\n\t" + "li t6, 46\n\t" + "sw t2, 0(t4)\n\t" "sw t3, 0(t4)\n\t" + "sw a1, 0(t4)\n\t" + "sw t5, 0(t4)\n\t" + "sw t6, 0(t4)\n\t" + "sw t2, 0(t4)\n\t" + "sw t3, 0(t4)\n\t" + "sw a1, 0(t4)\n\t" + "sw t5, 0(t4)\n\t" + "sw t6, 0(t4)\n\t" ); } @@ -48,6 +61,23 @@ int main1() { "nop\n\t" "nop\n\t" "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "li t4, 0x80000000\n\t" + "lw t3, 0(t4)\n\t" + "li t0, 0xbaaabaaa\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t3\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" "li t4, 0x80000000\n\t" "lw t3, 0(t4)\n\t" "li t0, 0xbaaabaaa\n\t" diff --git a/soc-comm b/soc-comm index c5ec6d9..2d673c6 160000 --- a/soc-comm +++ b/soc-comm @@ -1 +1 @@ -Subproject commit c5ec6d9f613e806e40662574c8cc5718db0de204 +Subproject commit 2d673c61408ae5e05301f223f8cfef70725661a8 diff --git a/src/main/scala/Top.scala b/src/main/scala/Top.scala index 12980a0..2be620f 100644 --- a/src/main/scala/Top.scala +++ b/src/main/scala/Top.scala @@ -52,7 +52,9 @@ class Top(topCfg: TopConfig) extends Module { cores(i).io.int_exts.foreach(_ := false.B) // Connect to wbM master cores(i).io.bus <> wbMasters(i).busIO - cores(i).io.noc <> noc.io(i) + cores(i).io.noc.in <> noc.io.channels(i).in + cores(i).io.noc.out <> noc.io.channels(i).out + noc.io.read(i) := cores(i).io.noc.read // Connect WbMaster to WbBus wbMasters(i).wbIO <> wbBuses(i).io.wbMaster From 93e16318363b90af7a4801658d521d971da7d75a Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Mon, 2 Jan 2023 20:52:23 -0800 Subject: [PATCH 08/34] Get a basic test working in simulation. On the order of 12 cycles worst case to send an integer without handshaking except checking the valid bit. --- flexpret | 2 +- .../low_level_interface_noc.c | 62 +++---------------- soc-comm | 2 +- src/main/scala/Top.scala | 4 +- 4 files changed, 12 insertions(+), 58 deletions(-) diff --git a/flexpret b/flexpret index 2bb0d20..56fc76e 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 2bb0d20866bbb738dbf91be38dc43fd38ff71038 +Subproject commit 56fc76e8ec10020aa484c6b65e610a5b311d7f0f diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 9634c0a..0181006 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -13,73 +13,29 @@ int main() { case 1: main1(); break; case 2: main1(); break; case 3: main1(); break; - default: _fp_print(66); //ERROR + default: _fp_print(666); //ERROR } } int main0() { asm volatile( "li t4, 0x80000000\n\t" - "li t2, 42\n\t" - "li t3, 43\n\t" - "li a1, 44\n\t" - "li t5, 45\n\t" - "li t6, 46\n\t" + "rdcycle t2\n\t" + "sw t2, 0(t4)\n\t" + "sw t2, 0(t4)\n\t" "sw t2, 0(t4)\n\t" - "sw t3, 0(t4)\n\t" - "sw a1, 0(t4)\n\t" - "sw t5, 0(t4)\n\t" - "sw t6, 0(t4)\n\t" "sw t2, 0(t4)\n\t" - "sw t3, 0(t4)\n\t" - "sw a1, 0(t4)\n\t" - "sw t5, 0(t4)\n\t" - "sw t6, 0(t4)\n\t" ); } int main1() { asm volatile( - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "li t4, 0x80000000\n\t" - "lw t3, 0(t4)\n\t" - "li t0, 0xbaaabaaa\n\t" - "csrw 0x51e, t0\n\t" - "csrw 0x51e, t3\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" "li t4, 0x80000000\n\t" - "lw t3, 0(t4)\n\t" + "POLL: lw t3, 16(t4)\n\t" + "beq x0, t3, POLL\n\t" + "rdcycle t3\n\t" + "lw t2, 0(t4)\n\t" + "sub t3, t3, t2\n\t" "li t0, 0xbaaabaaa\n\t" "csrw 0x51e, t0\n\t" "csrw 0x51e, t3\n\t" diff --git a/soc-comm b/soc-comm index 2d673c6..9a87d51 160000 --- a/soc-comm +++ b/soc-comm @@ -1 +1 @@ -Subproject commit 2d673c61408ae5e05301f223f8cfef70725661a8 +Subproject commit 9a87d518fffd4f4e9066bb139532bb2a962d1152 diff --git a/src/main/scala/Top.scala b/src/main/scala/Top.scala index 2be620f..12980a0 100644 --- a/src/main/scala/Top.scala +++ b/src/main/scala/Top.scala @@ -52,9 +52,7 @@ class Top(topCfg: TopConfig) extends Module { cores(i).io.int_exts.foreach(_ := false.B) // Connect to wbM master cores(i).io.bus <> wbMasters(i).busIO - cores(i).io.noc.in <> noc.io.channels(i).in - cores(i).io.noc.out <> noc.io.channels(i).out - noc.io.read(i) := cores(i).io.noc.read + cores(i).io.noc <> noc.io(i) // Connect WbMaster to WbBus wbMasters(i).wbIO <> wbBuses(i).io.wbMaster From 7e59e25d9ecb202e5e4cb4c49112076d34faad1c Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Tue, 3 Jan 2023 11:53:23 -0800 Subject: [PATCH 09/34] Failed attempt at synchronization. This uses self-modifying code. Probably more efficient to do it with a sequence of forward branches conditioned on read cycle. --- flexpret | 2 +- .../low_level_interface_noc.c | 49 ++++++++++++++----- soc-comm | 2 +- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/flexpret b/flexpret index 56fc76e..606bd52 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 56fc76e8ec10020aa484c6b65e610a5b311d7f0f +Subproject commit 606bd52c8e67735c84014d0a521cf745aad3161a diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 0181006..df68754 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -1,6 +1,11 @@ #include #include +#define NORTHEAST 12 +#define NORTH 0 +#define EAST 4 +#define PERIOD 20 + int main0(); int main1(); int main2(); @@ -9,22 +14,44 @@ int main3(); int main() { int core_id = read_csr(CSR_COREID); switch(core_id) { - case 0: main0(); break; - case 1: main1(); break; - case 2: main1(); break; + case 0: main0(NORTH); break; + // case 1: main1(); break; + // case 2: main1(); break; case 3: main1(); break; - default: _fp_print(666); //ERROR + // default: _fp_print(666); //ERROR } } -int main0() { +int main0(uint32_t direction) { + // Goal: write to slot asm volatile( "li t4, 0x80000000\n\t" - "rdcycle t2\n\t" - "sw t2, 0(t4)\n\t" - "sw t2, 0(t4)\n\t" - "sw t2, 0(t4)\n\t" - "sw t2, 0(t4)\n\t" + "li t5, 0x007ea023\n\t" // sw t2, 0(t4) + "lw t2, 32(t4)\n\t" // Get elapsed cycles mod period + "slli t2, t2, 2\n\t" // log2 instruction byte width + "addi t2, t2, 12\n\t" // = NORTHEAST + "addi t3, t2, -20\n\t" // -20 = -4 times period + "blt t3, x0, DONE_MODDING_BY_PERIOD\n\t" + "add t2, t3, x0\n\t" + "nop\n\t" // Ensure 3 cycles are required regardless of whether branch is taken + "DONE_MODDING_BY_PERIOD:" + // "li t2, 42\n\t" + "auipc t1, 0\n\t" + "add t1, t1, t2\n\t" // PC + offset + "sw t5, 28(t1)\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "li t0, 0xbaaabaaa\n\t" + "csrw 0x51e, t0\n\t" + "csrw 0x51e, t2\n\t" ); } @@ -38,7 +65,7 @@ int main1() { "sub t3, t3, t2\n\t" "li t0, 0xbaaabaaa\n\t" "csrw 0x51e, t0\n\t" - "csrw 0x51e, t3\n\t" + "csrw 0x51e, t2\n\t" ); } diff --git a/soc-comm b/soc-comm index 9a87d51..809e01e 160000 --- a/soc-comm +++ b/soc-comm @@ -1 +1 @@ -Subproject commit 9a87d518fffd4f4e9066bb139532bb2a962d1152 +Subproject commit 809e01e996a5d21980cf6433c56757cd0a4766df From d77fd4dd71457843078c2eaa874419bf11aca2d2 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Tue, 3 Jan 2023 13:08:33 -0800 Subject: [PATCH 10/34] Successful attempt at synchronization. The synchronization itself costs like 13 cycles. It's cheap. And synchronization is something that an HRTT only has to do once per rendezvous, so 13 cycles is no big deal. --- .../LowLevelInterface/low_level_interface.h | 31 ++++++++++++ .../low_level_interface_noc.c | 48 +++++++------------ 2 files changed, 48 insertions(+), 31 deletions(-) create mode 100644 programs/noc/LowLevelInterface/low_level_interface.h diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h new file mode 100644 index 0000000..31ffc5c --- /dev/null +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -0,0 +1,31 @@ +/** + * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM + * slot (northwest) in a single-threaded setting. + * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. + */ +#define SYNC5(nonce, reg0, reg1, reg2, reg3, reg4) \ + "li " #reg0 ", 1\n\t" \ + "li " #reg1 ", 2\n\t" \ + "li " #reg2 ", 3\n\t" \ + "li " #reg3 ", 0x80000000\n\t" \ + "lw " #reg4 ", 32(" #reg3 ")\n\t" /* Get elapsed cycles mod period */ \ + "beq " #reg4 ", " #reg0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #reg4 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ + "nop\n\t" \ + "beq " #reg4 ", " #reg2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #reg4 ", " #reg1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "DONE_SYNCHRONIZING" #nonce ":\n\t" + +/** + * @brief Broadcast the value stored in reg to all other cores. The instruction immediately + * following BROADCAST_SYNCED is able to store into the zeroth TDM slot. + */ +#define BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, reg, instr0, instr1) \ + "sw " #reg ", 0(t4)\n\t" \ + instr0 \ + "sw " #reg ", 0(t4)\n\t" \ + "sw " #reg ", 0(t4)\n\t" \ + instr1 \ + +#define BROADCAST_SYNCED(nonce, reg) \ + BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, reg, "nop\n\t", "nop\n\t") diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index df68754..64192f9 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -1,6 +1,8 @@ #include #include +#include "low_level_interface.h" + #define NORTHEAST 12 #define NORTH 0 #define EAST 4 @@ -13,10 +15,20 @@ int main3(); int main() { int core_id = read_csr(CSR_COREID); + // if (core_id != 0) return; + // main0(NORTH); + // asm volatile("nop\n\t"); + // main0(NORTH); + // asm volatile("nop\n\tnop\n\t"); + // main0(NORTH); + // asm volatile("nop\n\tnop\n\tnop\n\t"); + // main0(NORTH); + // asm volatile("nop\n\tnop\n\tnop\n\tnop\n\t"); + // main0(NORTH); switch(core_id) { case 0: main0(NORTH); break; - // case 1: main1(); break; - // case 2: main1(); break; + case 1: main1(); break; + case 2: main1(); break; case 3: main1(); break; // default: _fp_print(666); //ERROR } @@ -25,33 +37,9 @@ int main() { int main0(uint32_t direction) { // Goal: write to slot asm volatile( - "li t4, 0x80000000\n\t" - "li t5, 0x007ea023\n\t" // sw t2, 0(t4) - "lw t2, 32(t4)\n\t" // Get elapsed cycles mod period - "slli t2, t2, 2\n\t" // log2 instruction byte width - "addi t2, t2, 12\n\t" // = NORTHEAST - "addi t3, t2, -20\n\t" // -20 = -4 times period - "blt t3, x0, DONE_MODDING_BY_PERIOD\n\t" - "add t2, t3, x0\n\t" - "nop\n\t" // Ensure 3 cycles are required regardless of whether branch is taken - "DONE_MODDING_BY_PERIOD:" - // "li t2, 42\n\t" - "auipc t1, 0\n\t" - "add t1, t1, t2\n\t" // PC + offset - "sw t5, 28(t1)\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "li t0, 0xbaaabaaa\n\t" - "csrw 0x51e, t0\n\t" - "csrw 0x51e, t2\n\t" + "li t1, 42\n\t" + SYNC5(main0, a0, a1, a2, t4, t2) + BROADCAST_SYNCED(main0, t1) ); } @@ -60,9 +48,7 @@ int main1() { "li t4, 0x80000000\n\t" "POLL: lw t3, 16(t4)\n\t" "beq x0, t3, POLL\n\t" - "rdcycle t3\n\t" "lw t2, 0(t4)\n\t" - "sub t3, t3, t2\n\t" "li t0, 0xbaaabaaa\n\t" "csrw 0x51e, t0\n\t" "csrw 0x51e, t2\n\t" From d65c22f8c97037733384bf9a4ac84da714a9b893 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Tue, 3 Jan 2023 14:21:16 -0800 Subject: [PATCH 11/34] Factor more assembly out into macros. --- .../LowLevelInterface/low_level_interface.h | 60 ++++++++++++++----- .../low_level_interface_noc.c | 26 ++------ 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 31ffc5c..38d0a9c 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -1,31 +1,63 @@ +#define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" + /** * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM - * slot (northwest) in a single-threaded setting. + * slot (northwest) in a single-threaded setting. Clobber the given registers. * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. */ -#define SYNC5(nonce, reg0, reg1, reg2, reg3, reg4) \ +#define SYNC5(nonce, noc_base_address, reg0, reg1, reg2, reg3) \ "li " #reg0 ", 1\n\t" \ "li " #reg1 ", 2\n\t" \ "li " #reg2 ", 3\n\t" \ - "li " #reg3 ", 0x80000000\n\t" \ - "lw " #reg4 ", 32(" #reg3 ")\n\t" /* Get elapsed cycles mod period */ \ - "beq " #reg4 ", " #reg0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #reg4 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ + "li " #noc_base_address ", 0x80000000\n\t" \ + "lw " #reg3 ", 32(" #noc_base_address ")\n\t" /* Get elapsed cycles mod period */ \ + "beq " #reg3 ", " #reg0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #reg3 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ "nop\n\t" \ - "beq " #reg4 ", " #reg2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #reg4 ", " #reg1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #reg3 ", " #reg2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #reg3 ", " #reg1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ "DONE_SYNCHRONIZING" #nonce ":\n\t" /** * @brief Broadcast the value stored in reg to all other cores. The instruction immediately * following BROADCAST_SYNCED is able to store into the zeroth TDM slot. + * This assumes that the current thread is synchronized to the TDM schedule. */ -#define BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, reg, instr0, instr1) \ - "sw " #reg ", 0(t4)\n\t" \ +#define BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, instr0, instr1) \ + "sw " #reg ", 0(" #noc_base_address ")\n\t" \ + instr0 \ + "sw " #reg ", 0(" #noc_base_address ")\n\t" \ + "sw " #reg ", 0(" #noc_base_address ")\n\t" \ + instr1 + +#define BROADCAST_SYNCED(nonce, noc_base_address, reg) \ + BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, "nop\n\t", "nop\n\t") + +#define SEND_NORTHEAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, instr0, instr1, instr2, instr3) \ + "sw " #reg ", 0(" #noc_base_address ")\n\t" \ instr0 \ - "sw " #reg ", 0(t4)\n\t" \ - "sw " #reg ", 0(t4)\n\t" \ instr1 \ + instr2 \ + instr3 + +#define SEND_NORTHEAST_SYNCED(nonce, noc_base_address, reg) \ + SEND_NORTHEAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, "nop\n\t", "nop\n\t", "nop\n\t", "nop\n\t") + +/** + * @brief Do a blocking read of the message from the core at sending_core_reg. + * Set noc_base_address to the base address corresponding to sending_core_reg. + * Preserve the value of sending_core_reg. + */ +#define BLOCKING_READ(nonce, noc_base_address, read_to_reg, sending_core_reg) \ + LOAD_NOC_BASE_ADDRESS(noc_base_address) \ + "slli " #read_to_reg ", " #sending_core_reg ", 2\n\t" \ + "add " #noc_base_address ", " #read_to_reg ", " #noc_base_address "\n\t" \ + "POLL" #nonce ": lw " #read_to_reg ", 16(" #noc_base_address ")\n\t" \ + "beq x0, " #read_to_reg ", POLL" #nonce "\n\t" \ + "lw " #read_to_reg ", 0(" #noc_base_address ")\n\t" + +#define FP_PRINT_ASM(reg, clobber0) \ + "li " #clobber0 ", 0xbaaabaaa\n\t" \ + "csrw 0x51e, " #clobber0 "\n\t" \ + "csrw 0x51e, " #reg "\n\t" \ -#define BROADCAST_SYNCED(nonce, reg) \ - BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, reg, "nop\n\t", "nop\n\t") diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 64192f9..ae6e30e 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -15,22 +15,12 @@ int main3(); int main() { int core_id = read_csr(CSR_COREID); - // if (core_id != 0) return; - // main0(NORTH); - // asm volatile("nop\n\t"); - // main0(NORTH); - // asm volatile("nop\n\tnop\n\t"); - // main0(NORTH); - // asm volatile("nop\n\tnop\n\tnop\n\t"); - // main0(NORTH); - // asm volatile("nop\n\tnop\n\tnop\n\tnop\n\t"); - // main0(NORTH); switch(core_id) { case 0: main0(NORTH); break; case 1: main1(); break; case 2: main1(); break; case 3: main1(); break; - // default: _fp_print(666); //ERROR + default: _fp_print(666); //ERROR } } @@ -38,20 +28,16 @@ int main0(uint32_t direction) { // Goal: write to slot asm volatile( "li t1, 42\n\t" - SYNC5(main0, a0, a1, a2, t4, t2) - BROADCAST_SYNCED(main0, t1) + LOAD_NOC_BASE_ADDRESS(t4) + SYNC5(main0, t4, a0, a1, a2, t2) + BROADCAST_SYNCED(main0, t4, t1) ); } int main1() { asm volatile( - "li t4, 0x80000000\n\t" - "POLL: lw t3, 16(t4)\n\t" - "beq x0, t3, POLL\n\t" - "lw t2, 0(t4)\n\t" - "li t0, 0xbaaabaaa\n\t" - "csrw 0x51e, t0\n\t" - "csrw 0x51e, t2\n\t" + BLOCKING_READ(__LINE__, t4, t2, x0) + FP_PRINT_ASM(t2, a0) ); } From 8eefda3ff12c76c3dd3834b2a6175a699fd876d0 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Tue, 3 Jan 2023 17:06:04 -0800 Subject: [PATCH 12/34] First draft of the sender side of the protocol. --- programs/noc/LowLevelInterface/asm_utils.h | 22 ++++++ .../LowLevelInterface/low_level_interface.h | 74 ++++++++++++++++++- .../low_level_interface_noc.c | 1 + 3 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 programs/noc/LowLevelInterface/asm_utils.h diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h new file mode 100644 index 0000000..fbabe70 --- /dev/null +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -0,0 +1,22 @@ +#ifndef ASM_UTILS_H +#define ASM_UTILS_H + +/** + * @brief Pure assembly version of _fp_print. + */ +#define FP_PRINT_ASM(reg, clobber0) \ + "li " #clobber0 ", 0xbaaabaaa\n\t" \ + "csrw 0x51e, " #clobber0 "\n\t" \ + "csrw 0x51e, " #reg "\n\t" + +#define TRUE_MACRO(case_true, case_false) case_true +#define FALSE_MACRO(case_true, case_false) case_false + +#define REPEAT1(x) x +#define REPEAT2(x) x x +#define REPEAT3(x) x x x +#define REPEAT5(x) REPEAT2(x) REPEAT3(x) + +#define REPEAT4(x) REPEAT2(REPEAT2(x)) + +#endif // ASM_UTILS_H diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 38d0a9c..c1674ea 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -1,3 +1,5 @@ +#include "asm_utils.h" + #define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" /** @@ -47,6 +49,8 @@ * @brief Do a blocking read of the message from the core at sending_core_reg. * Set noc_base_address to the base address corresponding to sending_core_reg. * Preserve the value of sending_core_reg. + * When run on a FlexPRET core with a single thread, this is guaranteed to take three cycles + * (mod 5). */ #define BLOCKING_READ(nonce, noc_base_address, read_to_reg, sending_core_reg) \ LOAD_NOC_BASE_ADDRESS(noc_base_address) \ @@ -56,8 +60,70 @@ "beq x0, " #read_to_reg ", POLL" #nonce "\n\t" \ "lw " #read_to_reg ", 0(" #noc_base_address ")\n\t" -#define FP_PRINT_ASM(reg, clobber0) \ - "li " #clobber0 ", 0xbaaabaaa\n\t" \ - "csrw 0x51e, " #clobber0 "\n\t" \ - "csrw 0x51e, " #reg "\n\t" \ +/* Helper to SEND_N_WORDS. Accumulates valid bits. */ +#define OR_VALIDITY_OF_NOC_DATA(noc_base_address_reg, accumulator_reg, offset_literal, clobber0) \ + "lw " #clobber0 ", " offset_literal "(" #noc_base_address_reg ")\n\t" \ + "or " #accumulator_reg ", " #accumulator_reg ", " #clobber0 "\n\t" \ + +#define READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, noc_base_address_reg, read_to_reg, sending_core_reg, result_reg, tag_bit_mask) \ + BLOCKING_READ(nonce, clobber0, read_to_reg, x0) \ + "andi " #result_reg ", " #read_to_reg ", " #tag_bit_mask "\n\t" /* check tag bit */ \ + "bnez " #result_reg ", END" #nonce "\n\t" \ +/** + * @brief Try to send the number of words specified by n_words_reg. + * @param n_words_reg The number of words to send. + * @param result_reg 0 if successful, 1 if one of the potential message receivers has already tried + * to send a message before this even tries to send a message, 0x80000000 if one of the potential + * message receivers tries to send a message after this tries to send a message (so awkward! who + * gets to talk first?) + * @param load_words_asm Assembly code for preparing to send words, e.g. by loading the words into + * the register file. + * @param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! + * May assume that the preceding code has already taken care of synchronization. Must preserve + * synchronization. + * @param TIMES_TO_REPEAT_SEND_WORDS_ASM Macro that repeats send_words_asm an appropriate number of + * times. + * The remaining parameters (e.g., SENDING_NORTHEAST_MACRO) must be either TRUE_MACRO or + * FALSE_MACRO. + */ +#define SEND_N_WORDS( \ + nonce, \ + n_words_reg, \ + result_reg, \ + SENDING_NORTHEAST_MACRO, \ + SENDING_NORTH_MACRO, \ + SENDING_EAST_MACRO, \ + SENDING_TO_ZERO_MACRO, \ + SENDING_TO_ONE_MACRO, \ + SENDING_TO_TWO_MACRO, \ + SENDING_TO_THREE_MACRO, \ + load_words_asm, \ + send_words_asm, \ + TIMES_TO_REPEAT_SEND_WORDS_ASM, \ + clobber0, clobber1, clobber2, clobber3, clobber4) \ + LOAD_NOC_BASE_ADDRESS(clobber0) \ + "add " #result_reg ", x0, x0\n\t" \ + SENDING_TO_ZERO_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 16, clobber1), "") \ + SENDING_TO_ONE_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 20, clobber1), "") \ + SENDING_TO_TWO_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 24, clobber1), "") \ + SENDING_TO_THREE_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 28, clobber1), "") \ + "bnez " #result_reg ", END" #nonce "\n\t" /** Fail with error code 1 */ \ + SYNC5(nonce, clobber0, clobber1, clobber2, clobber3, clobber4) \ + "li " #clobber1 ", 0x80000000\n\t" /* Set the top bit as a tag bit. If u wanna send so many words that this creates amiguity, then u have a bigger problem on your hands */ \ + "ori " #n_words_reg ", " #n_words_reg ", " #clobber1 "\n\t" \ + SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ + SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ + "li " #clobber3 ", 1\n\t" \ + SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ + SENDING_TO_ZERO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, x0, result_reg, clobber1), "") \ + SENDING_TO_ONE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + "li " #clobber3 ", 2\n\t" \ + SENDING_TO_TWO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + "li " #clobber3 ", 3\n\t" \ + SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. By my count the TDM slot of the next instruction will be -2 mod 5, but for now I won't use that fact, preferring instead to re-synchronize, just to make the assembly easier to write (less brittle, less performant). */ \ + #load_words_asm \ + SYNC5(nonce, clobber0, clobber1, clobber2, clobber3, clobber4) \ + #TIMES_TO_REPEAT_SEND_WORDS_ASM(send_words_asm) \ + "END" #nonce ":\n\t" diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index ae6e30e..18fbc46 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -1,6 +1,7 @@ #include #include +#include "asm_utils.h" #include "low_level_interface.h" #define NORTHEAST 12 From 89b8d6538f851e3d4861161e3f362a1bd2bb91d6 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Tue, 3 Jan 2023 22:05:58 -0800 Subject: [PATCH 13/34] Initial attempt at a batch communication. It doesn't work entirely properly because although we check that all parties are ready to communicate, we do not sync precisely when the batch starts. Instead, we just sync with a blocking read, which is not precise enough. It can't be precise enough because the poll takes 5 cycles, which is as large as the interval between flits. It does "kind of" work though, which is progress. --- programs/noc/LowLevelInterface/asm_utils.h | 2 +- .../LowLevelInterface/low_level_interface.h | 64 ++++---- .../low_level_interface_noc.c | 144 +++++++++++++++++- 3 files changed, 171 insertions(+), 39 deletions(-) diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h index fbabe70..ce7a99f 100644 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -2,7 +2,7 @@ #define ASM_UTILS_H /** - * @brief Pure assembly version of _fp_print. + * @brief Pure assembly version of _fp_print. Executes in 4 cycles. */ #define FP_PRINT_ASM(reg, clobber0) \ "li " #clobber0 ", 0xbaaabaaa\n\t" \ diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index c1674ea..7a4070f 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -56,19 +56,20 @@ LOAD_NOC_BASE_ADDRESS(noc_base_address) \ "slli " #read_to_reg ", " #sending_core_reg ", 2\n\t" \ "add " #noc_base_address ", " #read_to_reg ", " #noc_base_address "\n\t" \ - "POLL" #nonce ": lw " #read_to_reg ", 16(" #noc_base_address ")\n\t" \ - "beq x0, " #read_to_reg ", POLL" #nonce "\n\t" \ + "BLOCKING_READ_POLL" #nonce ": lw " #read_to_reg ", 16(" #noc_base_address ")\n\t" \ + "beq x0, " #read_to_reg ", BLOCKING_READ_POLL" #nonce "\n\t" \ "lw " #read_to_reg ", 0(" #noc_base_address ")\n\t" -/* Helper to SEND_N_WORDS. Accumulates valid bits. */ +/** Helper to SEND_N_WORDS. Accumulates valid bits. */ #define OR_VALIDITY_OF_NOC_DATA(noc_base_address_reg, accumulator_reg, offset_literal, clobber0) \ - "lw " #clobber0 ", " offset_literal "(" #noc_base_address_reg ")\n\t" \ - "or " #accumulator_reg ", " #accumulator_reg ", " #clobber0 "\n\t" \ + "lw " #clobber0 ", " #offset_literal "(" #noc_base_address_reg ")\n\t" \ + "or " #accumulator_reg ", " #accumulator_reg ", " #clobber0 "\n\t" -#define READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, noc_base_address_reg, read_to_reg, sending_core_reg, result_reg, tag_bit_mask) \ - BLOCKING_READ(nonce, clobber0, read_to_reg, x0) \ - "andi " #result_reg ", " #read_to_reg ", " #tag_bit_mask "\n\t" /* check tag bit */ \ - "bnez " #result_reg ", END" #nonce "\n\t" \ +/** Helper to SEND_N_WORDS. */ +#define READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, noc_base_address_reg, read_to_reg, sending_core_reg, result_reg, tag_bit_mask, fail_label) \ + BLOCKING_READ(nonce, noc_base_address_reg, read_to_reg, sending_core_reg) \ + "and " #result_reg ", " #read_to_reg ", " #tag_bit_mask "\n\t" /* check tag bit */ \ + "bnez " #result_reg ", " #fail_label "\n\t" /** * @brief Try to send the number of words specified by n_words_reg. @@ -84,6 +85,9 @@ * synchronization. * @param TIMES_TO_REPEAT_SEND_WORDS_ASM Macro that repeats send_words_asm an appropriate number of * times. + * @param noc_base_address Register that will hold the NoC base address. No assumptions are made + * about the original value held in this register (the NoC base address will be written into it + * regardless). * The remaining parameters (e.g., SENDING_NORTHEAST_MACRO) must be either TRUE_MACRO or * FALSE_MACRO. */ @@ -100,30 +104,30 @@ SENDING_TO_THREE_MACRO, \ load_words_asm, \ send_words_asm, \ - TIMES_TO_REPEAT_SEND_WORDS_ASM, \ - clobber0, clobber1, clobber2, clobber3, clobber4) \ - LOAD_NOC_BASE_ADDRESS(clobber0) \ + noc_base_address, clobber1, clobber2, clobber3, clobber4 \ +) \ + LOAD_NOC_BASE_ADDRESS(noc_base_address) \ "add " #result_reg ", x0, x0\n\t" \ - SENDING_TO_ZERO_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 16, clobber1), "") \ - SENDING_TO_ONE_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 20, clobber1), "") \ - SENDING_TO_TWO_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 24, clobber1), "") \ - SENDING_TO_THREE_MACRO(OR_VALIDITY_OF_NOC_DATA(clobber0, result_reg, 28, clobber1), "") \ - "bnez " #result_reg ", END" #nonce "\n\t" /** Fail with error code 1 */ \ - SYNC5(nonce, clobber0, clobber1, clobber2, clobber3, clobber4) \ + SENDING_TO_ZERO_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 16, clobber1), "") \ + SENDING_TO_ONE_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 20, clobber1), "") \ + SENDING_TO_TWO_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 24, clobber1), "") \ + SENDING_TO_THREE_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 28, clobber1), "") \ + "bnez " #result_reg ", END_SEND_N_WORDS" #nonce "\n\t" /** Fail with error code 1 */ \ + SYNC5(nonce ## 0, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ "li " #clobber1 ", 0x80000000\n\t" /* Set the top bit as a tag bit. If u wanna send so many words that this creates amiguity, then u have a bigger problem on your hands */ \ - "ori " #n_words_reg ", " #n_words_reg ", " #clobber1 "\n\t" \ - SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ - SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ + "or " #n_words_reg ", " #n_words_reg ", " #clobber1 "\n\t" \ + SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ + SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ "li " #clobber3 ", 1\n\t" \ - SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #clobber0 ")\n\t", "nop\n\t") \ - SENDING_TO_ZERO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, x0, result_reg, clobber1), "") \ - SENDING_TO_ONE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ + SENDING_TO_ZERO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 0, noc_base_address, clobber2, x0, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ + SENDING_TO_ONE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 1, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ "li " #clobber3 ", 2\n\t" \ - SENDING_TO_TWO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + SENDING_TO_TWO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 2, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ "li " #clobber3 ", 3\n\t" \ - SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, clobber0, clobber2, clobber3, result_reg, clobber1), "") \ + SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 3, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. By my count the TDM slot of the next instruction will be -2 mod 5, but for now I won't use that fact, preferring instead to re-synchronize, just to make the assembly easier to write (less brittle, less performant). */ \ - #load_words_asm \ - SYNC5(nonce, clobber0, clobber1, clobber2, clobber3, clobber4) \ - #TIMES_TO_REPEAT_SEND_WORDS_ASM(send_words_asm) \ - "END" #nonce ":\n\t" + load_words_asm \ + SYNC5(nonce ## 1, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ + send_words_asm \ + "END_SEND_N_WORDS" #nonce ": nop\n\t" diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 18fbc46..2f29ff6 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -14,38 +14,166 @@ int main1(); int main2(); int main3(); +#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) +#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address) REPEAT64( \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "addi " #n_words_reg ", " #n_words_reg ", -1\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "beqz " #n_words_reg ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ +) +#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 15\n\t" +// #define BROADCAST_COUNT_LOAD_ASM "andi t0, t0, 15\n\t" + +/* n_words_reg assumed no greater than 15. The only part here that is specific to core 0 is where you listen for responses. */ +#define BROADCAST_COUNT_FROM_CORE_ZERO(nonce, n_words_reg, result_reg, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ + SEND_N_WORDS( \ + nonce, \ + n_words_reg, \ + result_reg, \ + TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ + FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ + BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ + BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address), \ + noc_base_address, clobber1, clobber2, clobber3, clobber4 \ + ) \ + "END_BROADCAST_COUNT_FROM_CORE_ZERO:" + int main() { int core_id = read_csr(CSR_COREID); switch(core_id) { case 0: main0(NORTH); break; case 1: main1(); break; - case 2: main1(); break; - case 3: main1(); break; + case 2: main2(); break; + case 3: main3(); break; default: _fp_print(666); //ERROR } } int main0(uint32_t direction) { - // Goal: write to slot asm volatile( - "li t1, 42\n\t" - LOAD_NOC_BASE_ADDRESS(t4) - SYNC5(main0, t4, a0, a1, a2, t2) - BROADCAST_SYNCED(main0, t4, t1) + "li t0, 7\n\t" + // Let t1 be result reg + // Let t2 be noc_base_address + BROADCAST_COUNT_FROM_CORE_ZERO(__LINE__, t0, t1, t2, t3, t4, t5, t6) ); } int main1() { asm volatile( - BLOCKING_READ(__LINE__, t4, t2, x0) + BLOCKING_READ(0, t4, t2, x0) + "andi t2, t2, 1023\n\t" FP_PRINT_ASM(t2, a0) + SYNC5(67, t4, a1, a2, a3, a4) + "nop\n\t" + "nop\n\t" + "nop\n\t" + "sw t2, 0(t4)\n\t" + "nop\n\t" + BLOCKING_READ(1, t4, t2, x0) + "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" // t4 has the base address for the noc + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" ); } int main2() { + asm volatile( + BLOCKING_READ(00, t4, t2, x0) + "andi t2, t2, 1023\n\t" + FP_PRINT_ASM(t2, a0) + SYNC5(106, t4, a1, a2, a3, a4) + "nop\n\t" + "nop\n\t" + "sw t2, 0(t4)\n\t" + "nop\n\t" + "nop\n\t" + BLOCKING_READ(10, t4, t2, x0) + "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" // t4 has the base address for the noc + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + ); } int main3() { + asm volatile( + BLOCKING_READ(000, t4, t2, x0) + "andi t2, t2, 1023\n\t" + FP_PRINT_ASM(t2, a0) + SYNC5(146, t4, a1, a2, a3, a4) + "sw t2, 0(t4)\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + BLOCKING_READ(100, t4, t2, x0) + "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" // t4 has the base address for the noc + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + "csrw 0x51e, a0\n\t" + "csrw 0x51e, t2\n\t" + "lw t2, 0(t4)\n\t" + "nop\n\t" + ); } From 085bcd8700b82cfd15162800f250e821dfc5476d Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Wed, 4 Jan 2023 09:58:01 -0800 Subject: [PATCH 14/34] Refactor the assembly a bit. --- .../LowLevelInterface/low_level_interface.h | 37 +++++++++++++++---- .../low_level_interface_noc.c | 27 ++++++++++---- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 7a4070f..34dc82c 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -45,6 +45,22 @@ #define SEND_NORTHEAST_SYNCED(nonce, noc_base_address, reg) \ SEND_NORTHEAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, "nop\n\t", "nop\n\t", "nop\n\t", "nop\n\t") +/** + * @brief Load the NoC base address corresponding to the value of sending_core_reg into + * noc_core_base_address. + * @param noc_core_base_address Output: The address of data sent from the given core. + * @param sending_core_reg Input (preserved): The number of the sending core. This register is not + * clobbered so that the optimization of passing in x0 for it can be used. + */ +#define LOAD_NOC_CORE_BASE_ADDRESS(noc_core_base_address, sending_core_reg, clobber0) \ + LOAD_NOC_BASE_ADDRESS(noc_core_base_address) \ + "slli " #clobber0 ", " #sending_core_reg ", 2\n\t" \ + "add " #noc_core_base_address ", " #clobber0 ", " #noc_core_base_address "\n\t" + +#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) \ + "BLOCKING_READ_POLL" #nonce ": lw " #clobber0 ", 16(" #noc_core_base_address ")\n\t" \ + "beq x0, " #clobber0 ", BLOCKING_READ_POLL" #nonce "\n\t" \ + /** * @brief Do a blocking read of the message from the core at sending_core_reg. * Set noc_base_address to the base address corresponding to sending_core_reg. @@ -52,13 +68,10 @@ * When run on a FlexPRET core with a single thread, this is guaranteed to take three cycles * (mod 5). */ -#define BLOCKING_READ(nonce, noc_base_address, read_to_reg, sending_core_reg) \ - LOAD_NOC_BASE_ADDRESS(noc_base_address) \ - "slli " #read_to_reg ", " #sending_core_reg ", 2\n\t" \ - "add " #noc_base_address ", " #read_to_reg ", " #noc_base_address "\n\t" \ - "BLOCKING_READ_POLL" #nonce ": lw " #read_to_reg ", 16(" #noc_base_address ")\n\t" \ - "beq x0, " #read_to_reg ", BLOCKING_READ_POLL" #nonce "\n\t" \ - "lw " #read_to_reg ", 0(" #noc_base_address ")\n\t" +#define BLOCKING_READ(nonce, noc_core_base_address, read_to_reg, sending_core_reg) \ + LOAD_NOC_CORE_BASE_ADDRESS(noc_core_base_address, sending_core_reg, read_to_reg) \ + BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, read_to_reg) \ + "lw " #read_to_reg ", 0(" #noc_core_base_address ")\n\t" /** Helper to SEND_N_WORDS. Accumulates valid bits. */ #define OR_VALIDITY_OF_NOC_DATA(noc_base_address_reg, accumulator_reg, offset_literal, clobber0) \ @@ -131,3 +144,13 @@ SYNC5(nonce ## 1, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ send_words_asm \ "END_SEND_N_WORDS" #nonce ": nop\n\t" + +// #define READ_N_WORDS( \ +// nonce, \ +// sending_core_reg, \ +// prepare_receive_words_asm, \ +// receive_words_asm, \ +// noc_base_address, clobber0, clobber1, clobber2, clobber3, clobber4 \ +// ) \ +// BLOCKING_READ(nonce ## 1, noc_base_address, clobber0, sending_core_reg) \ +// SYNC5(nonce ## 2, noc_base_address, ) diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 2f29ff6..9186e8f 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -9,18 +9,29 @@ #define EAST 4 #define PERIOD 20 +/*********************** + * core 0 / N \ core 1 * + * W + E * + * core 2 \ S / core 3 * + ***********************/ + int main0(); int main1(); int main2(); int main3(); #define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) -#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address) REPEAT64( \ +#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "addi " #n_words_reg ", " #n_words_reg ", -1\n\t" \ + "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "beqz " #n_words_reg ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "blt " #clobber0 ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "blt " #n_words_reg ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ ) #define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 15\n\t" // #define BROADCAST_COUNT_LOAD_ASM "andi t0, t0, 15\n\t" @@ -34,7 +45,7 @@ int main3(); TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ - BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address), \ + BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ noc_base_address, clobber1, clobber2, clobber3, clobber4 \ ) \ "END_BROADCAST_COUNT_FROM_CORE_ZERO:" @@ -52,6 +63,7 @@ int main() { int main0(uint32_t direction) { asm volatile( + "li t0, 7\n\t" // Let t1 be result reg // Let t2 be noc_base_address @@ -63,7 +75,7 @@ int main1() { asm volatile( BLOCKING_READ(0, t4, t2, x0) "andi t2, t2, 1023\n\t" - FP_PRINT_ASM(t2, a0) + "li a0, 0xbaaabaaa\n\t" SYNC5(67, t4, a1, a2, a3, a4) "nop\n\t" "nop\n\t" @@ -102,7 +114,7 @@ int main2() { asm volatile( BLOCKING_READ(00, t4, t2, x0) "andi t2, t2, 1023\n\t" - FP_PRINT_ASM(t2, a0) + "li a0, 0xbaaabaaa\n\t" SYNC5(106, t4, a1, a2, a3, a4) "nop\n\t" "nop\n\t" @@ -135,14 +147,13 @@ int main2() { "lw t2, 0(t4)\n\t" "nop\n\t" ); - } int main3() { asm volatile( BLOCKING_READ(000, t4, t2, x0) "andi t2, t2, 1023\n\t" - FP_PRINT_ASM(t2, a0) + "li a0, 0xbaaabaaa\n\t" SYNC5(146, t4, a1, a2, a3, a4) "sw t2, 0(t4)\n\t" "nop\n\t" From 5ecc536a57db435ff9a744d1b4cbfdfc9afd8ebf Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Wed, 4 Jan 2023 16:27:22 -0800 Subject: [PATCH 15/34] More assembly refactoring. --- programs/noc/LowLevelInterface/asm_utils.h | 9 +- .../LowLevelInterface/low_level_interface.h | 28 +++- .../low_level_interface_noc.c | 155 ++++++------------ 3 files changed, 82 insertions(+), 110 deletions(-) diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h index ce7a99f..e9d335c 100644 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -1,6 +1,10 @@ #ifndef ASM_UTILS_H #define ASM_UTILS_H +#define TOKENPASTE2(x, y) x ## y +#define TOKENPASTE1(x, y) TOKENPASTE2(x, y) +#define TOKENPASTE(x, y) TOKENPASTE1(x, y) + /** * @brief Pure assembly version of _fp_print. Executes in 4 cycles. */ @@ -15,8 +19,9 @@ #define REPEAT1(x) x #define REPEAT2(x) x x #define REPEAT3(x) x x x -#define REPEAT5(x) REPEAT2(x) REPEAT3(x) - #define REPEAT4(x) REPEAT2(REPEAT2(x)) +#define REPEAT5(x) REPEAT2(x) REPEAT3(x) +#define REPEAT7(x) REPEAT2(x) REPEAT5(x) +#define REPEAT11(x) REPEAT7(x) REPEAT4(x) #endif // ASM_UTILS_H diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 34dc82c..d2cd6be 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -1,5 +1,23 @@ #include "asm_utils.h" +#define NORTHEAST_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ + "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ + asm2cycles \ + asm1cycle0 \ + asm1cycle1 + +#define NORTH_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ + asm2cycles \ + "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ + asm1cycle0 \ + asm1cycle1 + +#define EAST_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ + asm2cycles \ + asm1cycle0 \ + "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ + asm1cycle1 + #define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" /** @@ -57,16 +75,16 @@ "slli " #clobber0 ", " #sending_core_reg ", 2\n\t" \ "add " #noc_core_base_address ", " #clobber0 ", " #noc_core_base_address "\n\t" -#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) \ +#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) // This indirection is necessary for the preprocessor to expand the macro arg __LINE__. :eye_roll: +#define BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) \ "BLOCKING_READ_POLL" #nonce ": lw " #clobber0 ", 16(" #noc_core_base_address ")\n\t" \ "beq x0, " #clobber0 ", BLOCKING_READ_POLL" #nonce "\n\t" \ /** * @brief Do a blocking read of the message from the core at sending_core_reg. - * Set noc_base_address to the base address corresponding to sending_core_reg. - * Preserve the value of sending_core_reg. - * When run on a FlexPRET core with a single thread, this is guaranteed to take three cycles - * (mod 5). + * @param noc_core_base_address Output: The base address corresponding to sending_core_reg. + * @param read_to_reg Output: The result of the blocking read. + * @param sending_core_reg Input (preserved): The number of the sending core. */ #define BLOCKING_READ(nonce, noc_core_base_address, read_to_reg, sending_core_reg) \ LOAD_NOC_CORE_BASE_ADDRESS(noc_core_base_address, sending_core_reg, read_to_reg) \ diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 9186e8f..fc9f20b 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -4,10 +4,7 @@ #include "asm_utils.h" #include "low_level_interface.h" -#define NORTHEAST 12 -#define NORTH 0 -#define EAST 4 -#define PERIOD 20 +/* | NE | _ | N | E | _ | */ /*********************** * core 0 / N \ core 1 * @@ -22,38 +19,48 @@ int main3(); #define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) #define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "blt " #clobber0 ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "blt " #n_words_reg ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "blt " #clobber0 ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "blt " #n_words_reg ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ ) #define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 15\n\t" -// #define BROADCAST_COUNT_LOAD_ASM "andi t0, t0, 15\n\t" /* n_words_reg assumed no greater than 15. The only part here that is specific to core 0 is where you listen for responses. */ #define BROADCAST_COUNT_FROM_CORE_ZERO(nonce, n_words_reg, result_reg, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ - SEND_N_WORDS( \ - nonce, \ - n_words_reg, \ - result_reg, \ - TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ - FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ - BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ - BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ - noc_base_address, clobber1, clobber2, clobber3, clobber4 \ - ) \ + SEND_N_WORDS( \ + nonce, \ + n_words_reg, \ + result_reg, \ + TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ + FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ + BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ + BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ + noc_base_address, clobber1, clobber2, clobber3, clobber4 \ + ) \ "END_BROADCAST_COUNT_FROM_CORE_ZERO:" +#define RECEIVER_BODY(endlabel) REPEAT7( \ + "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ + "rdcycle t0\n\t" \ + "bge t0, t3, " #endlabel "\n\t" \ + "csrw 0x51e, a0\n\t" \ + "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ + "rdcycle t0\n\t" \ + "bge t0, t3, " #endlabel "\n\t" \ + "csrw 0x51e, t2\n\t" \ +) + int main() { int core_id = read_csr(CSR_COREID); switch(core_id) { - case 0: main0(NORTH); break; + case 0: main0(); break; case 1: main1(); break; case 2: main2(); break; case 3: main3(); break; @@ -61,7 +68,7 @@ int main() { } } -int main0(uint32_t direction) { +int main0() { asm volatile( "li t0, 7\n\t" @@ -77,36 +84,18 @@ int main1() { "andi t2, t2, 1023\n\t" "li a0, 0xbaaabaaa\n\t" SYNC5(67, t4, a1, a2, a3, a4) - "nop\n\t" - "nop\n\t" - "nop\n\t" - "sw t2, 0(t4)\n\t" - "nop\n\t" - BLOCKING_READ(1, t4, t2, x0) - "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" // t4 has the base address for the noc - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" + EAST_QUINTET( + "slli t3, t2, 2\n\t" "add t3, t3, t2\n\t", /* multiplying by 5 */ + "nop\n\t", + "nop\n\t", + t2, + t4 + ) + BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) + "rdcycle t2\n\t" + "add t3, t2, t3\n\t" // end cycle + RECEIVER_BODY(END_MAIN1) + "END_MAIN1:" ); } @@ -121,31 +110,11 @@ int main2() { "sw t2, 0(t4)\n\t" "nop\n\t" "nop\n\t" - BLOCKING_READ(10, t4, t2, x0) - "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" // t4 has the base address for the noc - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" + BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" "nop\n\t" + RECEIVER_BODY(END_MAIN2) + "END_MAIN2:" ); } @@ -160,31 +129,11 @@ int main3() { "nop\n\t" "nop\n\t" "nop\n\t" - BLOCKING_READ(100, t4, t2, x0) - "csrw 0x51e, a0\n\t" // a0 has baaabaaa, the "print to host" number - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" // t4 has the base address for the noc - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" - "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" + BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) "nop\n\t" - "csrw 0x51e, a0\n\t" - "csrw 0x51e, t2\n\t" - "lw t2, 0(t4)\n\t" "nop\n\t" + RECEIVER_BODY(END_MAIN3) + "END_MAIN3:" ); } From 614a56fe38cc2456fa84c8b5db8e5d71398676ff Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Wed, 4 Jan 2023 21:01:32 -0800 Subject: [PATCH 16/34] First draft of receive words macro. --- programs/noc/LowLevelInterface/asm_utils.h | 12 ++- .../LowLevelInterface/low_level_interface.h | 97 +++++++++++++++---- 2 files changed, 86 insertions(+), 23 deletions(-) diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h index e9d335c..dfbaed5 100644 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -1,8 +1,8 @@ #ifndef ASM_UTILS_H #define ASM_UTILS_H -#define TOKENPASTE2(x, y) x ## y -#define TOKENPASTE1(x, y) TOKENPASTE2(x, y) +#define TOKENPASTE2__(x, y) x ## y +#define TOKENPASTE1__(x, y) TOKENPASTE2__(x, y) #define TOKENPASTE(x, y) TOKENPASTE1(x, y) /** @@ -24,4 +24,12 @@ #define REPEAT7(x) REPEAT2(x) REPEAT5(x) #define REPEAT11(x) REPEAT7(x) REPEAT4(x) +#define MUL4(in_reg, out_reg) \ + "slli " #out_reg ", " #in_reg ", 2\n\t" + +#define MUL5(in_reg, out_reg) \ + MUL4(in_reg, out_reg) \ + "add " #out_reg ", " #out_reg ", " #in_reg "\n\t" + + #endif // ASM_UTILS_H diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index d2cd6be..7a73330 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -20,23 +20,33 @@ #define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" +/** + * @brief The same as SYNC5, except with the opportunity to pack one extra single-cycle instruction + * in. + * @param instr An instruction (presumably unrelated to the behavior of SYNC5) that does not clobber + * clobber1, clobber2, or clobber3, but that is permitted to clobber clobber0. This instruction must + * execute in exactly one cycle. + */ +#define SYNC5_PACKED(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3, extra_instr) \ + "li " #clobber0 ", 1\n\t" \ + "li " #clobber1 ", 2\n\t" \ + "li " #clobber2 ", 3\n\t" \ + "li " #noc_base_address ", 0x80000000\n\t" \ + "lw " #clobber3 ", 32(" #noc_base_address ")\n\t" /* Get elapsed cycles mod period */ \ + "beq " #clobber3 ", " #clobber0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #clobber3 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ + extra_instr \ + "beq " #clobber3 ", " #clobber2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "beq " #clobber3 ", " #clobber1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ + "DONE_SYNCHRONIZING" #nonce ":\n\t" + /** * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM * slot (northwest) in a single-threaded setting. Clobber the given registers. * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. */ -#define SYNC5(nonce, noc_base_address, reg0, reg1, reg2, reg3) \ - "li " #reg0 ", 1\n\t" \ - "li " #reg1 ", 2\n\t" \ - "li " #reg2 ", 3\n\t" \ - "li " #noc_base_address ", 0x80000000\n\t" \ - "lw " #reg3 ", 32(" #noc_base_address ")\n\t" /* Get elapsed cycles mod period */ \ - "beq " #reg3 ", " #reg0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #reg3 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ - "nop\n\t" \ - "beq " #reg3 ", " #reg2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #reg3 ", " #reg1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "DONE_SYNCHRONIZING" #nonce ":\n\t" +#define SYNC5(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3) \ + SYNC5_PACKED(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3, "nop\n\t") /** * @brief Broadcast the value stored in reg to all other cores. The instruction immediately @@ -163,12 +173,57 @@ send_words_asm \ "END_SEND_N_WORDS" #nonce ": nop\n\t" -// #define READ_N_WORDS( \ -// nonce, \ -// sending_core_reg, \ -// prepare_receive_words_asm, \ -// receive_words_asm, \ -// noc_base_address, clobber0, clobber1, clobber2, clobber3, clobber4 \ -// ) \ -// BLOCKING_READ(nonce ## 1, noc_base_address, clobber0, sending_core_reg) \ -// SYNC5(nonce ## 2, noc_base_address, ) +#define MUL4_2CYCLES_2INSTRS(in_reg, out_reg) \ + MUL4(in_reg, out_reg) "nop\n\t" + +#define MUL5_2CYCLES_2INSTRS(in_reg, out_reg) MUL5(in_reg, out_reg) + +/** + * @brief Read the number of words specified by the sender. + * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the + * receiver. + * @param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS Assembly consisting of two instructions that + * takes an in_reg and an out_reg and sets the out_reg to the number of instructions in each 5-cycle + * subsequence of receive_words_asm. See MUL4_2CYCLES_2INSTRS, MUL5_2CYCLES_2INSTRS for examples. + * @param preparatory_asm_4cycles Assembly taking 4 cycles that may prepare for the + * receive_words_asm that follows. The number of instructions used (not just cycles) matters! + * @param offset_numeric_literal 28 plus (4 times the number of instructions in + * preparatory_asm_4cycles). This will be 44 unless the 4-cycle sequence includes instructions that + * cause stalls, such as loads, branches, or jumps. + * @param sending_core_reg Input: A register specifying the sending core. + * @param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and + * exactly every 5 cycles thereafter! + * @param noc_base_address Output: A register that will be set to the base address of the NoC + * corresponding to the given core. + */ +#define READ_N_WORDS( \ + nonce, \ + DIRECTION_QUINTET_MACRO, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS, \ + preparatory_asm_4cycles, \ + offset_numeric_literal, \ + sending_core_reg, \ + receive_words_asm, \ + noc_base_address, clobber0, clobber1, clobber2, clobber3, clobber4 \ +) \ + BLOCKING_READ(nonce ## 1, noc_base_address, clobber0, sending_core_reg) \ + /* "andi " #clobber0 ", " #clobber0 ", 1023\n\t" /* 1023 is the largest 12-bit number that doesn't sign-extend to have a 1 in its top bit */ \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS(clobber0, clobber0) \ + SYNC5_PACKED(nonce ## 2, noc_base_address, clobber1, clobber2, clobber3, clobber4, "slli " #clobber0 ", " #clobber0 ", 2\n\t") \ + "jal " #clobber1 ", CONTINUING_READ_N_WORDS" #nonce "\n\t" \ + "jal x0, READ_N_WORDS_END" #nonce "\n\t" /* This instruction will not be executed in its present position. It's just here so we can copy it somewhere else. */ \ + "CONTINUING_READ_N_WORDS" #nonce ":\n\t" \ + "lw " #clobber2 ", 0(" #clobber1 ")\n\t" /* clobber2 := "jump to end" instruction */ \ + "add " #clobber1 ", " #clobber1 ", " #clobber0 "\n\t" \ + preparatory_asm_4cycles \ + DIRECTION_QUINTET_MACRO( \ + "lw " #clobber3 ", " #offset_numeric_literal "(" #clobber1 ")\n\t", /* clobber3 := instruction to be replaced (receive_words_asm must not clobber clobber3!) */ \ + "sw " #clobber2 ", " #offset_numeric_literal "(" #clobber1 ")\n\t", \ + "nop\n\t", \ + clobber0, \ + noc_base_address, \ + ) \ + receive_words_asm \ + "READ_N_WORDS_END" #nonce ":\n\t" \ + "sw " #clobber3 ", " #offset_numeric_literal "(" #clobber1 ")\n\t" /* restore the modified imem entry */ + From 8a49d6422610cf4fbf82933161529e582aa18b57 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 01:17:56 -0800 Subject: [PATCH 17/34] Receive a sequence of words correctly. --- flexpret | 2 +- .../LowLevelInterface/low_level_interface.h | 103 ++++++++++-------- .../low_level_interface_noc.c | 73 +++---------- 3 files changed, 77 insertions(+), 101 deletions(-) diff --git a/flexpret b/flexpret index 606bd52..fc487e0 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 606bd52c8e67735c84014d0a521cf745aad3161a +Subproject commit fc487e0c03871f4d4aaa1a94c3f06b7bdce0a966 diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 7a73330..4820bdf 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -21,13 +21,11 @@ #define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" /** - * @brief The same as SYNC5, except with the opportunity to pack one extra single-cycle instruction - * in. - * @param instr An instruction (presumably unrelated to the behavior of SYNC5) that does not clobber - * clobber1, clobber2, or clobber3, but that is permitted to clobber clobber0. This instruction must - * execute in exactly one cycle. + * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM + * slot (northwest) in a single-threaded setting. Clobber the given registers. + * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. */ -#define SYNC5_PACKED(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3, extra_instr) \ +#define SYNC5(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3) \ "li " #clobber0 ", 1\n\t" \ "li " #clobber1 ", 2\n\t" \ "li " #clobber2 ", 3\n\t" \ @@ -35,19 +33,11 @@ "lw " #clobber3 ", 32(" #noc_base_address ")\n\t" /* Get elapsed cycles mod period */ \ "beq " #clobber3 ", " #clobber0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ "beq " #clobber3 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ - extra_instr \ + "nop\n\t" \ "beq " #clobber3 ", " #clobber2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ "beq " #clobber3 ", " #clobber1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ "DONE_SYNCHRONIZING" #nonce ":\n\t" -/** - * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM - * slot (northwest) in a single-threaded setting. Clobber the given registers. - * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. - */ -#define SYNC5(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3) \ - SYNC5_PACKED(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3, "nop\n\t") - /** * @brief Broadcast the value stored in reg to all other cores. The instruction immediately * following BROADCAST_SYNCED is able to store into the zeroth TDM slot. @@ -170,60 +160,83 @@ /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. By my count the TDM slot of the next instruction will be -2 mod 5, but for now I won't use that fact, preferring instead to re-synchronize, just to make the assembly easier to write (less brittle, less performant). */ \ load_words_asm \ SYNC5(nonce ## 1, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ + /* The following 5 cycles are spent to break the receivers from their polling loop. */ \ + SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "nop\n\t" \ + SENDING_NORTH_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "nop\n\t" \ send_words_asm \ "END_SEND_N_WORDS" #nonce ": nop\n\t" -#define MUL4_2CYCLES_2INSTRS(in_reg, out_reg) \ - MUL4(in_reg, out_reg) "nop\n\t" - -#define MUL5_2CYCLES_2INSTRS(in_reg, out_reg) MUL5(in_reg, out_reg) - /** * @brief Read the number of words specified by the sender. * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the * receiver. - * @param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS Assembly consisting of two instructions that - * takes an in_reg and an out_reg and sets the out_reg to the number of instructions in each 5-cycle - * subsequence of receive_words_asm. See MUL4_2CYCLES_2INSTRS, MUL5_2CYCLES_2INSTRS for examples. - * @param preparatory_asm_4cycles Assembly taking 4 cycles that may prepare for the - * receive_words_asm that follows. The number of instructions used (not just cycles) matters! - * @param offset_numeric_literal 28 plus (4 times the number of instructions in - * preparatory_asm_4cycles). This will be 44 unless the 4-cycle sequence includes instructions that + * @param MUL_BY_RECEIVE_WORDS_PERIOD Assembly that takes an in_reg and an out_reg and sets the + * out_reg to the number of instructions in each 5-cycle subsequence of receive_words_asm. + * @param offset_numeric_literal 36 plus (4 times the number of instructions in + * preparatory asm). This will be 48 unless one provides instructions that * cause stalls, such as loads, branches, or jumps. + * @param hex_for_12_bit_distance_from_auipc_to_end 3-digit hex (no 0x prefix) for + * offset_numeric_literal plus the offset of receive_words_asm. * @param sending_core_reg Input: A register specifying the sending core. * @param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and * exactly every 5 cycles thereafter! * @param noc_base_address Output: A register that will be set to the base address of the NoC * corresponding to the given core. + * The remaining registers are all clobbers -- they have descriptive names, but they are not really + * API except insofar as they are clobbered. */ #define READ_N_WORDS( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS, \ - preparatory_asm_4cycles, \ + MUL_BY_RECEIVE_WORDS_PERIOD, \ + offset_numeric_literal, \ + hex_for_12_bit_distance_from_auipc_to_end, \ + sending_core_reg, \ + receive_words_asm, \ + noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ +) READ_N_WORDS__( \ + nonce, \ + DIRECTION_QUINTET_MACRO, \ + MUL_BY_RECEIVE_WORDS_PERIOD, \ + offset_numeric_literal, \ + hex_for_12_bit_distance_from_auipc_to_end, \ + sending_core_reg, \ + receive_words_asm, \ + noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ +) + +#define READ_N_WORDS__( \ + nonce, \ + DIRECTION_QUINTET_MACRO, \ + MUL_BY_RECEIVE_WORDS_PERIOD, \ offset_numeric_literal, \ + hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ receive_words_asm, \ - noc_base_address, clobber0, clobber1, clobber2, clobber3, clobber4 \ + noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ ) \ - BLOCKING_READ(nonce ## 1, noc_base_address, clobber0, sending_core_reg) \ - /* "andi " #clobber0 ", " #clobber0 ", 1023\n\t" /* 1023 is the largest 12-bit number that doesn't sign-extend to have a 1 in its top bit */ \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES_2INSTRS(clobber0, clobber0) \ - SYNC5_PACKED(nonce ## 2, noc_base_address, clobber1, clobber2, clobber3, clobber4, "slli " #clobber0 ", " #clobber0 ", 2\n\t") \ - "jal " #clobber1 ", CONTINUING_READ_N_WORDS" #nonce "\n\t" \ - "jal x0, READ_N_WORDS_END" #nonce "\n\t" /* This instruction will not be executed in its present position. It's just here so we can copy it somewhere else. */ \ - "CONTINUING_READ_N_WORDS" #nonce ":\n\t" \ - "lw " #clobber2 ", 0(" #clobber1 ")\n\t" /* clobber2 := "jump to end" instruction */ \ - "add " #clobber1 ", " #clobber1 ", " #clobber0 "\n\t" \ - preparatory_asm_4cycles \ + BLOCKING_READ(nonce ## 1, noc_base_address, clobber4, sending_core_reg) \ + MUL_BY_RECEIVE_WORDS_PERIOD(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ + "slli " #packet_size_reg ", " #packet_size_reg ", 2\n\t" \ + SYNC5(nonce ## 2, noc_base_address, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ + "auipc t6, 0\n\t" \ + "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ + "li " #jalr_word_reg ", 0x" #hex_for_12_bit_distance_from_auipc_to_end "F8067\n\t" /* This hardcodes a write to the address stored at t6 = x31 with the given offset. Note also that li is two instructions and takes two cycles. */ \ + LOAD_NOC_BASE_ADDRESS(clobber4) /* It might be wise to preserve jalr_word_reg for use in a future iteration. t6 can also be preserved, although we will need to add/subtract from it, according to the new sub-packet length, presumably using a preserved value of packet_size_reg. */ \ DIRECTION_QUINTET_MACRO( \ - "lw " #clobber3 ", " #offset_numeric_literal "(" #clobber1 ")\n\t", /* clobber3 := instruction to be replaced (receive_words_asm must not clobber clobber3!) */ \ - "sw " #clobber2 ", " #offset_numeric_literal "(" #clobber1 ")\n\t", \ + "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t", /* replaced_instruction_reg := instruction to be replaced (receive_words_asm must not clobber replaced_instruction_reg!) */ \ + "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t", \ "nop\n\t", \ - clobber0, \ - noc_base_address, \ + x0, /* It doesn't really matter what you send, as long as the tag bit is zero */ \ + noc_base_address \ ) \ + BLOCK_ON_FLIT_FROM_CORE(nonce, noc_base_address, clobber4) /* 2 instructions, -2 cycles (mod 5) */ \ + "nop\n\t" \ + "nop\n\t" \ receive_words_asm \ "READ_N_WORDS_END" #nonce ":\n\t" \ - "sw " #clobber3 ", " #offset_numeric_literal "(" #clobber1 ")\n\t" /* restore the modified imem entry */ + "sw " #replaced_instruction_reg ", 52(t6)\n\t" /* restore the modified imem entry */ diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index fc9f20b..82e5295 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -23,12 +23,12 @@ int main3(); "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "blt " #clobber0 ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "bge x0, " #clobber0 ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "blt " #n_words_reg ", x0, END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ + "bge x0, " #n_words_reg ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ ) #define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 15\n\t" @@ -46,16 +46,19 @@ int main3(); ) \ "END_BROADCAST_COUNT_FROM_CORE_ZERO:" -#define RECEIVER_BODY(endlabel) REPEAT7( \ - "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ - "rdcycle t0\n\t" \ - "bge t0, t3, " #endlabel "\n\t" \ - "csrw 0x51e, a0\n\t" \ - "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ - "rdcycle t0\n\t" \ - "bge t0, t3, " #endlabel "\n\t" \ - "csrw 0x51e, t2\n\t" \ -) +#define RECEIVER_BODY REPEAT2(REPEAT4(REPEAT4( \ + "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ + "rdcycle t0\n\t" \ + "csrw 0x51e, a0\n\t" \ + "csrw 0x51e, t2\n\t" \ + ))) \ + REPEAT2(REPEAT4(REPEAT4( \ + "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ + "li t0, 42\n\t" \ + "csrw 0x51e, a0\n\t" \ + "csrw 0x51e, t0\n\t" \ + ))) \ + int main() { int core_id = read_csr(CSR_COREID); @@ -70,7 +73,6 @@ int main() { int main0() { asm volatile( - "li t0, 7\n\t" // Let t1 be result reg // Let t2 be noc_base_address @@ -80,60 +82,21 @@ int main0() { int main1() { asm volatile( - BLOCKING_READ(0, t4, t2, x0) - "andi t2, t2, 1023\n\t" "li a0, 0xbaaabaaa\n\t" - SYNC5(67, t4, a1, a2, a3, a4) - EAST_QUINTET( - "slli t3, t2, 2\n\t" "add t3, t3, t2\n\t", /* multiplying by 5 */ - "nop\n\t", - "nop\n\t", - t2, - t4 - ) - BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) - "rdcycle t2\n\t" - "add t3, t2, t3\n\t" // end cycle - RECEIVER_BODY(END_MAIN1) - "END_MAIN1:" + READ_N_WORDS(__LINE__, EAST_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); } int main2() { asm volatile( - BLOCKING_READ(00, t4, t2, x0) - "andi t2, t2, 1023\n\t" "li a0, 0xbaaabaaa\n\t" - SYNC5(106, t4, a1, a2, a3, a4) - "nop\n\t" - "nop\n\t" - "sw t2, 0(t4)\n\t" - "nop\n\t" - "nop\n\t" - BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) - "nop\n\t" - "nop\n\t" - RECEIVER_BODY(END_MAIN2) - "END_MAIN2:" + READ_N_WORDS(__LINE__, NORTH_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); } int main3() { asm volatile( - BLOCKING_READ(000, t4, t2, x0) - "andi t2, t2, 1023\n\t" "li a0, 0xbaaabaaa\n\t" - SYNC5(146, t4, a1, a2, a3, a4) - "sw t2, 0(t4)\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - "nop\n\t" - BLOCK_ON_FLIT_FROM_CORE(__LINE__, t4, t2) - "nop\n\t" - "nop\n\t" - RECEIVER_BODY(END_MAIN3) - "END_MAIN3:" + READ_N_WORDS(__LINE__, NORTHEAST_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); - } From b9ee3ad330de449e4d8ea15e46816a28fb963e6a Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 12:09:26 -0800 Subject: [PATCH 18/34] Send packets of length up to 64. Much longer is impractical because the code size would be excessive. A little FPGA cannot have a giant IMEM scratchpad :( --- .../low_level_interface_noc.c | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 82e5295..767d144 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -30,9 +30,9 @@ int main3(); "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "bge x0, " #n_words_reg ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ ) -#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 15\n\t" +#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" -/* n_words_reg assumed no greater than 15. The only part here that is specific to core 0 is where you listen for responses. */ +/* n_words_reg assumed no greater than 1023. The only part here that is specific to core 0 is where you listen for responses. */ #define BROADCAST_COUNT_FROM_CORE_ZERO(nonce, n_words_reg, result_reg, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ SEND_N_WORDS( \ nonce, \ @@ -46,18 +46,12 @@ int main3(); ) \ "END_BROADCAST_COUNT_FROM_CORE_ZERO:" -#define RECEIVER_BODY REPEAT2(REPEAT4(REPEAT4( \ - "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ - "rdcycle t0\n\t" \ - "csrw 0x51e, a0\n\t" \ - "csrw 0x51e, t2\n\t" \ - ))) \ - REPEAT2(REPEAT4(REPEAT4( \ - "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ - "li t0, 42\n\t" \ - "csrw 0x51e, a0\n\t" \ - "csrw 0x51e, t0\n\t" \ - ))) \ +#define RECEIVER_BODY REPEAT64( \ + "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ + "rdcycle t0\n\t" \ + "csrw 0x51e, a0\n\t" \ + "csrw 0x51e, t2\n\t" \ +) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ int main() { @@ -73,7 +67,7 @@ int main() { int main0() { asm volatile( - "li t0, 7\n\t" + "li t0, 64\n\t" // Let t1 be result reg // Let t2 be noc_base_address BROADCAST_COUNT_FROM_CORE_ZERO(__LINE__, t0, t1, t2, t3, t4, t5, t6) @@ -83,20 +77,20 @@ int main0() { int main1() { asm volatile( "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, EAST_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) + READ_N_WORDS(__LINE__, EAST_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); } int main2() { asm volatile( "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, NORTH_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) + READ_N_WORDS(__LINE__, NORTH_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); } int main3() { asm volatile( "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, NORTHEAST_QUINTET, MUL4, 48, 434, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) + READ_N_WORDS(__LINE__, NORTHEAST_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) ); } From 132d4c7470d77556c5387932c14a7c315f7b5b6d Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 13:38:12 -0800 Subject: [PATCH 19/34] Add C API for read_n_words_and_print. --- programs/noc/LowLevelInterface/asm_utils.h | 1 + .../LowLevelInterface/low_level_interface.h | 43 ++++++++++++++++++- .../low_level_interface_noc.c | 23 ++-------- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h index dfbaed5..f22d96c 100644 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -23,6 +23,7 @@ #define REPEAT5(x) REPEAT2(x) REPEAT3(x) #define REPEAT7(x) REPEAT2(x) REPEAT5(x) #define REPEAT11(x) REPEAT7(x) REPEAT4(x) +#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) #define MUL4(in_reg, out_reg) \ "slli " #out_reg ", " #in_reg ", 2\n\t" diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 4820bdf..d48aae2 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -1,5 +1,9 @@ #include "asm_utils.h" +#define NORTHEAST_INT 0 +#define NORTH_INT 8 +#define EAST_INT 12 + #define NORTHEAST_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ asm2cycles \ @@ -186,7 +190,8 @@ * @param noc_base_address Output: A register that will be set to the base address of the NoC * corresponding to the given core. * The remaining registers are all clobbers -- they have descriptive names, but they are not really - * API except insofar as they are clobbered. + * API except insofar as they are clobbered. t6 is also clobbered, and receive_words_asm cannot use + * t6. The only clobber that receive_words_asm should use is clobber4. */ #define READ_N_WORDS( \ nonce, \ @@ -240,3 +245,39 @@ "READ_N_WORDS_END" #nonce ":\n\t" \ "sw " #replaced_instruction_reg ", 52(t6)\n\t" /* restore the modified imem entry */ +#define LOAD_AND_PRINT_RECEIVER_BODY(noc_base_address, clobber0, clobber1) REPEAT64( \ + "lw " #clobber0 ", 0(" #noc_base_address ")\n\t" /* t4 has the base address for the noc */ \ + "nop\n\t" \ + "csrw 0x51e, " #clobber1 "\n\t" \ + "csrw 0x51e, " #clobber0 "\n\t" \ +) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ + +/** + * @brief Block and print up to 64 words that were sent by another core using the protocol of + * SEND_N_WORDS. + * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the + * receiver. + * @param sender_reg A register containing the core ID of the sender. + * The remaining registers are all clobbers. + */ +#define READ_N_WORDS_AND_PRINT(nonce, DIRECTION_QUINTET_MACRO, sender_reg, clobber0, clobber1, clobber2, clobber3, clobber4, clobber5) \ + READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) + +#define READ_N_WORDS_AND_PRINT_HELPER(DIRECTION_QUINTET_MACRO) \ + asm volatile( \ + "li a4, 0xbaaabaaa\n\t" \ + READ_N_WORDS_AND_PRINT(__LINE__, DIRECTION_QUINTET_MACRO, %[sender_reg], t4, t5, a5, a2, a3, a4) \ + : /* no outputs */ \ + : [sender_reg] "r" (sending_core) \ + : "t4", "t5", "a5", "a2", "a3", "a4" \ + ) + +/** @brief See READ_N_WORDS_AND_PRINT for documentation. */ +void read_n_words_and_print(int sending_core, int direction) { + switch (direction) { + case NORTHEAST_INT: READ_N_WORDS_AND_PRINT_HELPER(NORTHEAST_QUINTET); break; + case NORTH_INT: READ_N_WORDS_AND_PRINT_HELPER(NORTH_QUINTET); break; + case EAST_INT: READ_N_WORDS_AND_PRINT_HELPER(EAST_QUINTET); break; + default: *((int*) 0) = 0; // crash with NPE + } +} diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 767d144..29eefee 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -17,7 +17,6 @@ int main1(); int main2(); int main3(); -#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) #define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ @@ -46,13 +45,6 @@ int main3(); ) \ "END_BROADCAST_COUNT_FROM_CORE_ZERO:" -#define RECEIVER_BODY REPEAT64( \ - "lw t2, 0(t4)\n\t" /* t4 has the base address for the noc */ \ - "rdcycle t0\n\t" \ - "csrw 0x51e, a0\n\t" \ - "csrw 0x51e, t2\n\t" \ -) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ - int main() { int core_id = read_csr(CSR_COREID); @@ -75,22 +67,13 @@ int main0() { } int main1() { - asm volatile( - "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, EAST_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) - ); + read_n_words_and_print(0, EAST_INT); } int main2() { - asm volatile( - "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, NORTH_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) - ); + read_n_words_and_print(0, NORTH_INT); } int main3() { - asm volatile( - "li a0, 0xbaaabaaa\n\t" - READ_N_WORDS(__LINE__, NORTHEAST_QUINTET, MUL4, 48, 438, x0, RECEIVER_BODY, t4, t5, a1, a2, a3) - ); + read_n_words_and_print(0, NORTHEAST_INT); } From a8d065dc1b759afffcfafe6a163dfeb86c54a250 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 14:29:35 -0800 Subject: [PATCH 20/34] Add C API for broadcast_count. --- programs/noc/LowLevelInterface/c_api.h | 135 ++++++++++++++++++ .../LowLevelInterface/low_level_interface.h | 40 +----- .../low_level_interface_noc.c | 37 +---- 3 files changed, 141 insertions(+), 71 deletions(-) create mode 100644 programs/noc/LowLevelInterface/c_api.h diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h new file mode 100644 index 0000000..0369a9b --- /dev/null +++ b/programs/noc/LowLevelInterface/c_api.h @@ -0,0 +1,135 @@ +#ifndef C_API_H +#define C_API_H + +#include + +#include "low_level_interface.h" + +void we_are_bedeviled() { + _fp_print(666); +} + +/** code related to READ_N_WORDS_AND_PRINT ********************************************************/ + +#define LOAD_AND_PRINT_RECEIVER_BODY(noc_base_address, clobber0, clobber1) REPEAT64( \ + "lw " #clobber0 ", 0(" #noc_base_address ")\n\t" /* t4 has the base address for the noc */ \ + "nop\n\t" \ + "csrw 0x51e, " #clobber1 "\n\t" \ + "csrw 0x51e, " #clobber0 "\n\t" \ +) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ + +/** + * @brief Block and print up to 64 words that were sent by another core using the protocol of + * SEND_N_WORDS. + * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the + * receiver. + * @param sender_reg A register containing the core ID of the sender. + * The remaining registers are all clobbers. + */ +#define READ_N_WORDS_AND_PRINT(nonce, DIRECTION_QUINTET_MACRO, sender_reg, clobber0, clobber1, clobber2, clobber3, clobber4, clobber5) \ + READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) + +#define read_n_words_and_print_HELPER(DIRECTION_QUINTET_MACRO) \ + asm volatile( \ + "li a4, 0xbaaabaaa\n\t" \ + READ_N_WORDS_AND_PRINT(__LINE__, DIRECTION_QUINTET_MACRO, %[sender_reg], t4, t5, a5, a2, a3, a4) \ + : /* no outputs */ \ + : [sender_reg] "r" (sending_core) \ + : "t4", "t5", "a5", "a2", "a3", "a4" \ + ) + +/** @brief See READ_N_WORDS_AND_PRINT for documentation. */ +void read_n_words_and_print(int sending_core, int direction) { + switch (direction) { + case NORTHEAST_INT: read_n_words_and_print_HELPER(NORTHEAST_QUINTET); break; + case NORTH_INT: read_n_words_and_print_HELPER(NORTH_QUINTET); break; + case EAST_INT: read_n_words_and_print_HELPER(EAST_QUINTET); break; + default: we_are_bedeviled(); + } +} + +/** code related to BROADCAST_COUNT ***************************************************************/ + +#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ + "bge x0, " #clobber0 ", END_BROADCAST_COUNT" #nonce "\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ + "bge x0, " #n_words_reg ", END_BROADCAST_COUNT" #nonce "\n\t" \ +) +#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" + +/** + * @brief Broadcast a countdown from n_words_reg to 1 to all other cores. + * @param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO + * except the one corresponding to the current core (which does not broadcast to itself). + * @param n_words_reg Input: The number of words to send; also, the first number in the countdown. + * Must be less than 128! May need to be even smaller depending on capability of receiver. + * @param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. + * @param noc_base_address Clobber. + */ +#define BROADCAST_COUNT( \ + nonce, \ + SENDING_TO_ZERO_MACRO, \ + SENDING_TO_ONE_MACRO, \ + SENDING_TO_TWO_MACRO, \ + SENDING_TO_THREE_MACRO, \ + n_words_reg, \ + result_reg, \ + noc_base_address, \ + clobber1, \ + clobber2, \ + clobber3, \ + clobber4 \ +) \ + SEND_N_WORDS( \ + nonce, \ + n_words_reg, \ + result_reg, \ + TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ + SENDING_TO_ZERO_MACRO, SENDING_TO_ONE_MACRO, SENDING_TO_TWO_MACRO, SENDING_TO_THREE_MACRO, \ + BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ + BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ + noc_base_address, clobber1, clobber2, clobber3, clobber4 \ + ) \ + "END_BROADCAST_COUNT" #nonce ":" + +#define broadcast_count_HELPER( \ + nonce, \ + SENDING_TO_ZERO_MACRO, \ + SENDING_TO_ONE_MACRO, \ + SENDING_TO_TWO_MACRO, \ + SENDING_TO_THREE_MACRO \ +) \ + asm volatile( \ + BROADCAST_COUNT( \ + nonce, \ + SENDING_TO_ZERO_MACRO, \ + SENDING_TO_ONE_MACRO, \ + SENDING_TO_TWO_MACRO, \ + SENDING_TO_THREE_MACRO, \ + %[n_words_reg], \ + t1, t2, t3, t4, t5, t6 \ + ) \ + : /* no outputs */ \ + : [n_words_reg] "r" (start_count_at) \ + : "t1", "t2", "t3", "t4", "t5", "t6" \ + ) + +/** @brief See BROADCAST_COUNT for documentation. start_count_at corresponds to n_words_reg. */ +void broadcast_count(int current_core, int start_count_at) { + switch(current_core) { + case 0: broadcast_count_HELPER(__LINE__, FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO); break; + case 1: broadcast_count_HELPER(__LINE__, TRUE_MACRO, FALSE_MACRO, TRUE_MACRO, TRUE_MACRO); break; + case 2: broadcast_count_HELPER(__LINE__, TRUE_MACRO, TRUE_MACRO, FALSE_MACRO, TRUE_MACRO); break; + case 3: broadcast_count_HELPER(__LINE__, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, FALSE_MACRO); break; + default: we_are_bedeviled(); + } +} + +#endif // C_API_H diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index d48aae2..e0d4f0b 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -1,3 +1,6 @@ +#ifndef LOW_LEVEL_INTERFACE_H +#define LOW_LEVEL_INTERFACE_H + #include "asm_utils.h" #define NORTHEAST_INT 0 @@ -245,39 +248,4 @@ "READ_N_WORDS_END" #nonce ":\n\t" \ "sw " #replaced_instruction_reg ", 52(t6)\n\t" /* restore the modified imem entry */ -#define LOAD_AND_PRINT_RECEIVER_BODY(noc_base_address, clobber0, clobber1) REPEAT64( \ - "lw " #clobber0 ", 0(" #noc_base_address ")\n\t" /* t4 has the base address for the noc */ \ - "nop\n\t" \ - "csrw 0x51e, " #clobber1 "\n\t" \ - "csrw 0x51e, " #clobber0 "\n\t" \ -) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ - -/** - * @brief Block and print up to 64 words that were sent by another core using the protocol of - * SEND_N_WORDS. - * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the - * receiver. - * @param sender_reg A register containing the core ID of the sender. - * The remaining registers are all clobbers. - */ -#define READ_N_WORDS_AND_PRINT(nonce, DIRECTION_QUINTET_MACRO, sender_reg, clobber0, clobber1, clobber2, clobber3, clobber4, clobber5) \ - READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) - -#define READ_N_WORDS_AND_PRINT_HELPER(DIRECTION_QUINTET_MACRO) \ - asm volatile( \ - "li a4, 0xbaaabaaa\n\t" \ - READ_N_WORDS_AND_PRINT(__LINE__, DIRECTION_QUINTET_MACRO, %[sender_reg], t4, t5, a5, a2, a3, a4) \ - : /* no outputs */ \ - : [sender_reg] "r" (sending_core) \ - : "t4", "t5", "a5", "a2", "a3", "a4" \ - ) - -/** @brief See READ_N_WORDS_AND_PRINT for documentation. */ -void read_n_words_and_print(int sending_core, int direction) { - switch (direction) { - case NORTHEAST_INT: READ_N_WORDS_AND_PRINT_HELPER(NORTHEAST_QUINTET); break; - case NORTH_INT: READ_N_WORDS_AND_PRINT_HELPER(NORTH_QUINTET); break; - case EAST_INT: READ_N_WORDS_AND_PRINT_HELPER(EAST_QUINTET); break; - default: *((int*) 0) = 0; // crash with NPE - } -} +#endif // LOW_LEVEL_INTERFACE_H diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 29eefee..3e89300 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -3,6 +3,7 @@ #include "asm_utils.h" #include "low_level_interface.h" +#include "c_api.h" /* | NE | _ | N | E | _ | */ @@ -17,35 +18,6 @@ int main1(); int main2(); int main3(); -#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #clobber0 ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #n_words_reg ", END_BROADCAST_COUNT_FROM_CORE_ZERO\n\t" \ -) -#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" - -/* n_words_reg assumed no greater than 1023. The only part here that is specific to core 0 is where you listen for responses. */ -#define BROADCAST_COUNT_FROM_CORE_ZERO(nonce, n_words_reg, result_reg, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ - SEND_N_WORDS( \ - nonce, \ - n_words_reg, \ - result_reg, \ - TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ - FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ - BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ - BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ - noc_base_address, clobber1, clobber2, clobber3, clobber4 \ - ) \ - "END_BROADCAST_COUNT_FROM_CORE_ZERO:" - - int main() { int core_id = read_csr(CSR_COREID); switch(core_id) { @@ -58,12 +30,7 @@ int main() { } int main0() { - asm volatile( - "li t0, 64\n\t" - // Let t1 be result reg - // Let t2 be noc_base_address - BROADCAST_COUNT_FROM_CORE_ZERO(__LINE__, t0, t1, t2, t3, t4, t5, t6) - ); + broadcast_count(0, 64); } int main1() { From fa062b21abae84202842f0203b6b33d73d730b5e Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 15:17:29 -0800 Subject: [PATCH 21/34] Get broadcast to work from each core in turn. The interleaving of the prints looks funny because the print from one of the cores appears to happen a couple cycles later than the prints from the other two cores. However, all 126 numbers are correctly received, in the correct order, by all receiving cores. That's 126 numbers broadcasted in roughly 1233 cycles. 126*5=630 of those cycles are necessarily required by the TDM schedule, 115*4=460 of those cycles are synchronization overhead at the start of sending a packet, and the remaining 143 or so cycles are probably the C glue code. Obviously the proportion that is overhead will vary depending on the packet size -- smaller packets -> more packets for the same amount of data -> more overhead. In this case 4 packets were sent. By my measurement, the synchronization overhead incurred before sending a packet -- and this is the assembly, not the C which is probably doing a function call, saving things on the stack, etc. -- is 115 cycles. Multiple measurements all gave something like 114.5 cycles. --- programs/noc/LowLevelInterface/low_level_interface.h | 6 +++--- .../noc/LowLevelInterface/low_level_interface_noc.c | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index e0d4f0b..f7389b5 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -187,7 +187,7 @@ * cause stalls, such as loads, branches, or jumps. * @param hex_for_12_bit_distance_from_auipc_to_end 3-digit hex (no 0x prefix) for * offset_numeric_literal plus the offset of receive_words_asm. - * @param sending_core_reg Input: A register specifying the sending core. + * @param sending_core_reg Input: A register specifying the sending core. Clobbered. * @param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and * exactly every 5 cycles thereafter! * @param noc_base_address Output: A register that will be set to the base address of the NoC @@ -229,11 +229,11 @@ BLOCKING_READ(nonce ## 1, noc_base_address, clobber4, sending_core_reg) \ MUL_BY_RECEIVE_WORDS_PERIOD(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ "slli " #packet_size_reg ", " #packet_size_reg ", 2\n\t" \ - SYNC5(nonce ## 2, noc_base_address, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ + SYNC5(nonce ## 2, sending_core_reg, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ "auipc t6, 0\n\t" \ "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ "li " #jalr_word_reg ", 0x" #hex_for_12_bit_distance_from_auipc_to_end "F8067\n\t" /* This hardcodes a write to the address stored at t6 = x31 with the given offset. Note also that li is two instructions and takes two cycles. */ \ - LOAD_NOC_BASE_ADDRESS(clobber4) /* It might be wise to preserve jalr_word_reg for use in a future iteration. t6 can also be preserved, although we will need to add/subtract from it, according to the new sub-packet length, presumably using a preserved value of packet_size_reg. */ \ + "nop\n\t" /* It might be wise to preserve jalr_word_reg for use in a future iteration. t6 can also be preserved, although we will need to add/subtract from it, according to the new sub-packet length, presumably using a preserved value of packet_size_reg. */ \ DIRECTION_QUINTET_MACRO( \ "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t", /* replaced_instruction_reg := instruction to be replaced (receive_words_asm must not clobber replaced_instruction_reg!) */ \ "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t", \ diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 3e89300..f25916a 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -31,16 +31,28 @@ int main() { int main0() { broadcast_count(0, 64); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); } int main1() { read_n_words_and_print(0, EAST_INT); + broadcast_count(1, 17); + read_n_words_and_print(2, NORTHEAST_INT); + read_n_words_and_print(3, NORTH_INT); } int main2() { read_n_words_and_print(0, NORTH_INT); + read_n_words_and_print(1, NORTHEAST_INT); + broadcast_count(2, 42); + read_n_words_and_print(3, EAST_INT); } int main3() { read_n_words_and_print(0, NORTHEAST_INT); + read_n_words_and_print(1, NORTH_INT); + read_n_words_and_print(2, EAST_INT); + broadcast_count(3, 3); } From 5fea13b1e109e7ca1c5d8a4670fdf72b2dfa0e03 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 17:58:08 -0800 Subject: [PATCH 22/34] Start extending the protocol. I want to have multiple bursts, to amortize the initial hundred-cycle synchronization overhead while still allowing enough time between bursts for the sender to either jump back to the start of an unrolled loop or for the sender to load words from main memory before sending them from the register file. In this commit the program in low_level_interface_noc still seems to work. --- programs/noc/LowLevelInterface/c_api.h | 9 ++-- .../LowLevelInterface/low_level_interface.h | 41 +++++++++++++------ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h index 0369a9b..e16fc7f 100644 --- a/programs/noc/LowLevelInterface/c_api.h +++ b/programs/noc/LowLevelInterface/c_api.h @@ -55,13 +55,13 @@ void read_n_words_and_print(int sending_core, int direction) { "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #clobber0 ", END_BROADCAST_COUNT" #nonce "\n\t" \ + "bge x0, " #clobber0 ", END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #n_words_reg ", END_BROADCAST_COUNT" #nonce "\n\t" \ -) + "bge x0, " #n_words_reg ", END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ +) "END_BROADCAST_COUNT_SEND_ASM" #nonce ": nop\n\t" "nop\n\t" "nop\n\t" #define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" /** @@ -96,8 +96,7 @@ void read_n_words_and_print(int sending_core, int direction) { BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ noc_base_address, clobber1, clobber2, clobber3, clobber4 \ - ) \ - "END_BROADCAST_COUNT" #nonce ":" + ) #define broadcast_count_HELPER( \ nonce, \ diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index f7389b5..f396836 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -174,14 +174,20 @@ SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ "nop\n\t" \ send_words_asm \ - "END_SEND_N_WORDS" #nonce ": nop\n\t" + SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "nop\n\t" \ + SENDING_NORTH_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "nop\n\t" \ + "END_SEND_N_WORDS" #nonce ":\n\t" /** * @brief Read the number of words specified by the sender. * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the * receiver. - * @param MUL_BY_RECEIVE_WORDS_PERIOD Assembly that takes an in_reg and an out_reg and sets the - * out_reg to the number of instructions in each 5-cycle subsequence of receive_words_asm. + * @param MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES Assembly that takes an in_reg and an out_reg + * and sets the out_reg to the number of instructions in each 5-cycle subsequence of + * receive_words_asm. * @param offset_numeric_literal 36 plus (4 times the number of instructions in * preparatory asm). This will be 48 unless one provides instructions that * cause stalls, such as loads, branches, or jumps. @@ -199,7 +205,7 @@ #define READ_N_WORDS( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -208,7 +214,7 @@ ) READ_N_WORDS__( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -219,7 +225,7 @@ #define READ_N_WORDS__( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -227,7 +233,7 @@ noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ ) \ BLOCKING_READ(nonce ## 1, noc_base_address, clobber4, sending_core_reg) \ - MUL_BY_RECEIVE_WORDS_PERIOD(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ + MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ "slli " #packet_size_reg ", " #packet_size_reg ", 2\n\t" \ SYNC5(nonce ## 2, sending_core_reg, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ "auipc t6, 0\n\t" \ @@ -236,16 +242,27 @@ "nop\n\t" /* It might be wise to preserve jalr_word_reg for use in a future iteration. t6 can also be preserved, although we will need to add/subtract from it, according to the new sub-packet length, presumably using a preserved value of packet_size_reg. */ \ DIRECTION_QUINTET_MACRO( \ "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t", /* replaced_instruction_reg := instruction to be replaced (receive_words_asm must not clobber replaced_instruction_reg!) */ \ - "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t", \ + "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t", \ "nop\n\t", \ x0, /* It doesn't really matter what you send, as long as the tag bit is zero */ \ noc_base_address \ ) \ - BLOCK_ON_FLIT_FROM_CORE(nonce, noc_base_address, clobber4) /* 2 instructions, -2 cycles (mod 5) */ \ + "READ_N_WORDS_WAIT_FOR_SYNC_WORD" #nonce ": " BLOCK_ON_FLIT_FROM_CORE(nonce, noc_base_address, clobber4) /* 2 instructions, -2 cycles (mod 5) */ \ + "nop\n\t" \ "nop\n\t" \ + "READ_N_WORDS_RECEIVE_WORDS" #nonce ": " receive_words_asm /* Upon jumping out of this block (at 0 cycles mod 5) we should be at -2 cycles (mod 5) */ \ + "sw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" /* restore the modified imem entry */ \ "nop\n\t" \ - receive_words_asm \ - "READ_N_WORDS_END" #nonce ":\n\t" \ - "sw " #replaced_instruction_reg ", 52(t6)\n\t" /* restore the modified imem entry */ + "lw " #clobber4 ", 0(" #noc_base_address ")\n\t" /* cycle 0 mod 5 */ \ + "beqz " #clobber4 ", READ_N_WORDS_END" #nonce " \n\t" \ + MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES(clobber4, replaced_instruction_reg) \ + "slli " #packet_size_reg ", " #replaced_instruction_reg ", 2\n\t" /* cycle 0 mod 5 */ \ + "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ + "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" \ + "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t" \ + "nop\n\t" /* cycle 0 mod 5 */ \ + "blt " #clobber4 ", x0, READ_N_WORDS_WAIT_FOR_SYNC_WORD" #nonce " \n\t" /* Interpret a tag bit as a request from the sender to have some extra time to prepare for the next burst of data. No need to keep the same alignment mod 5 as before. Coarse (within 5 of cycle-accurate) but adequate synchronization will be done in the busy loop. */ \ + "jal x0, READ_N_WORDS_RECEIVE_WORDS" #nonce "\n\t" /* Jump straight in without re-synchronizing, after incurring 20 (= 0 (mod 5)) cycles of overhead. */ \ + "READ_N_WORDS_END" #nonce ": " \ #endif // LOW_LEVEL_INTERFACE_H From 7ba40170f73d3aad5905c5779cc2635d2e7504c6 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 5 Jan 2023 22:50:05 -0800 Subject: [PATCH 23/34] Make small modifications. It took some time to get the assembly to work as it did in the previous commit. It is getting hard to manage register allocation. --- programs/noc/LowLevelInterface/asm_utils.h | 22 ++++++ programs/noc/LowLevelInterface/c_api.h | 74 ++++++++++++++----- .../LowLevelInterface/low_level_interface.h | 28 ++++--- .../low_level_interface_noc.c | 26 +++---- 4 files changed, 107 insertions(+), 43 deletions(-) diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h index f22d96c..688232d 100644 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ b/programs/noc/LowLevelInterface/asm_utils.h @@ -25,6 +25,28 @@ #define REPEAT11(x) REPEAT7(x) REPEAT4(x) #define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) +#define COUNT_TO_20(MACRO, param0, param1, param2) \ + MACRO(param0, param1, param2, 0) \ + MACRO(param0, param1, param2, 1) \ + MACRO(param0, param1, param2, 2) \ + MACRO(param0, param1, param2, 3) \ + MACRO(param0, param1, param2, 4) \ + MACRO(param0, param1, param2, 5) \ + MACRO(param0, param1, param2, 6) \ + MACRO(param0, param1, param2, 7) \ + MACRO(param0, param1, param2, 8) \ + MACRO(param0, param1, param2, 9) \ + MACRO(param0, param1, param2, 10) \ + MACRO(param0, param1, param2, 11) \ + MACRO(param0, param1, param2, 12) \ + MACRO(param0, param1, param2, 13) \ + MACRO(param0, param1, param2, 14) \ + MACRO(param0, param1, param2, 15) \ + MACRO(param0, param1, param2, 16) \ + MACRO(param0, param1, param2, 17) \ + MACRO(param0, param1, param2, 18) \ + MACRO(param0, param1, param2, 19) + #define MUL4(in_reg, out_reg) \ "slli " #out_reg ", " #in_reg ", 2\n\t" diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h index e16fc7f..02d6e94 100644 --- a/programs/noc/LowLevelInterface/c_api.h +++ b/programs/noc/LowLevelInterface/c_api.h @@ -50,25 +50,56 @@ void read_n_words_and_print(int sending_core, int direction) { /** code related to BROADCAST_COUNT ***************************************************************/ -#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber0) REPEAT64( \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "addi " #clobber0 ", " #n_words_reg ", -1\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "sw " #n_words_reg ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #clobber0 ", END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ +#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber0) REPEAT64( \ + "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ + "addi " #clobber0 ", " #countdown_reg ", -1\n\t" \ + "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ + "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ + "bge x0, " #clobber0 ", END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce "\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "addi " #n_words_reg ", " #clobber0 ", -1\n\t" \ + "addi " #countdown_reg ", " #clobber0 ", -1\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #n_words_reg ", END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ -) "END_BROADCAST_COUNT_SEND_ASM" #nonce ": nop\n\t" "nop\n\t" "nop\n\t" -#define BROADCAST_COUNT_LOAD_ASM(n_words_reg) "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" + "bge x0, " #countdown_reg ", END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce "\n\t" \ +) \ + "jal x0, END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ + "END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce ":\n\t" \ + "nop\n\t" \ + "add " #n_words_reg ", x0, x0\n\t" \ + "nop\n\t END_BROADCAST_COUNT_SEND_ASM" #nonce ": " + +#define BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, constant_128) \ + "li " #constant_128 ", 128\n\t" \ + "add " #n_words_reg ", " #countdown_reg ", x0\n\t" \ + "blt " #n_words_reg ", " #constant_128 ", BROADCAST_COUNT_DONE_INITIALIZING" #nonce "\n\t" \ + "addi " #n_words_reg ", " #countdown_reg ", -128\n\t" \ + "nop\n\t" \ + "BROADCAST_COUNT_DONE_INITIALIZING" #nonce ":\n\t" \ + "li a4, 0xbaaabaaa\n\t" \ + "csrw 0x51e, a4\n\t" \ + "csrw 0x51e, " #countdown_reg "\n\t" \ + "csrw 0x51e, a4\n\t" \ + "csrw 0x51e, " #n_words_reg "\n\t" + +#define BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, constant_128) \ + "li " #constant_128 ", 128\n\t" \ + "addi " #n_words_reg ", " #countdown_reg ", -128\n\t" \ + "bge " #constant_128 ", " #n_words_reg ", BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce "\n\t" \ + "li " #n_words_reg ", 128\n\t" \ + "nop\n\t" \ + "BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce ":\n\t" \ + "li a4, 0xbaaabaaa\n\t" \ + "csrw 0x51e, a4\n\t" \ + "csrw 0x51e, " #countdown_reg "\n\t" \ + "csrw 0x51e, a4\n\t" \ + "csrw 0x51e, " #n_words_reg "\n\t" \ + "nop\n\t" /** - * @brief Broadcast a countdown from n_words_reg to 1 to all other cores. + * @brief Broadcast a countdown from countdown_reg to 1 to all other cores. * @param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO * except the one corresponding to the current core (which does not broadcast to itself). - * @param n_words_reg Input: The number of words to send; also, the first number in the countdown. + * @param countdown_reg Input: The first number in the countdown. * Must be less than 128! May need to be even smaller depending on capability of receiver. * @param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. * @param noc_base_address Clobber. @@ -79,22 +110,25 @@ void read_n_words_and_print(int sending_core, int direction) { SENDING_TO_ONE_MACRO, \ SENDING_TO_TWO_MACRO, \ SENDING_TO_THREE_MACRO, \ - n_words_reg, \ + countdown_reg, \ result_reg, \ noc_base_address, \ + n_words_reg, \ clobber1, \ clobber2, \ clobber3, \ clobber4 \ ) \ + "add " #n_words_reg ", " #countdown_reg ", x0\n\t" \ SEND_N_WORDS( \ nonce, \ n_words_reg, \ result_reg, \ TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ SENDING_TO_ZERO_MACRO, SENDING_TO_ONE_MACRO, SENDING_TO_TWO_MACRO, SENDING_TO_THREE_MACRO, \ - BROADCAST_COUNT_LOAD_ASM(n_words_reg), \ - BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, noc_base_address, clobber1), \ + BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, clobber2), \ + BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber3), \ + BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, clobber2), \ noc_base_address, clobber1, clobber2, clobber3, clobber4 \ ) @@ -112,15 +146,15 @@ void read_n_words_and_print(int sending_core, int direction) { SENDING_TO_ONE_MACRO, \ SENDING_TO_TWO_MACRO, \ SENDING_TO_THREE_MACRO, \ - %[n_words_reg], \ - t1, t2, t3, t4, t5, t6 \ + %[countdown_reg], \ + t0, t1, t2, t3, t4, t5, t6 \ ) \ : /* no outputs */ \ - : [n_words_reg] "r" (start_count_at) \ - : "t1", "t2", "t3", "t4", "t5", "t6" \ + : [countdown_reg] "r" (start_count_at) \ + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" \ ) -/** @brief See BROADCAST_COUNT for documentation. start_count_at corresponds to n_words_reg. */ +/** @brief See BROADCAST_COUNT for documentation. start_count_at corresponds to countdown_reg. */ void broadcast_count(int current_core, int start_count_at) { switch(current_core) { case 0: broadcast_count_HELPER(__LINE__, FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO); break; diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index f396836..d7f782c 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -116,11 +116,14 @@ * to send a message before this even tries to send a message, 0x80000000 if one of the potential * message receivers tries to send a message after this tries to send a message (so awkward! who * gets to talk first?) - * @param load_words_asm Assembly code for preparing to send words, e.g. by loading the words into + * @param initialize_asm Assembly code for preparing to send words, e.g. by loading the words into * the register file. * @param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! * May assume that the preceding code has already taken care of synchronization. Must preserve - * synchronization. + * synchronization. Must set n_words_reg to hold the number of words in the next contiguous sequence, + * with the top bit set high if the length of prepare_for_next_send_words_asm is not exactly 11 + * cycles. + * @param prepare_for_next_send_words_asm Prepare for the next run of send_words_asm. * @param TIMES_TO_REPEAT_SEND_WORDS_ASM Macro that repeats send_words_asm an appropriate number of * times. * @param noc_base_address Register that will hold the NoC base address. No assumptions are made @@ -140,8 +143,9 @@ SENDING_TO_ONE_MACRO, \ SENDING_TO_TWO_MACRO, \ SENDING_TO_THREE_MACRO, \ - load_words_asm, \ + initialize_asm, \ send_words_asm, \ + prepare_for_next_send_words_asm, \ noc_base_address, clobber1, clobber2, clobber3, clobber4 \ ) \ LOAD_NOC_BASE_ADDRESS(noc_base_address) \ @@ -165,20 +169,24 @@ "li " #clobber3 ", 3\n\t" \ SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 3, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. By my count the TDM slot of the next instruction will be -2 mod 5, but for now I won't use that fact, preferring instead to re-synchronize, just to make the assembly easier to write (less brittle, less performant). */ \ - load_words_asm \ + initialize_asm \ SYNC5(nonce ## 1, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ /* The following 5 cycles are spent to break the receivers from their polling loop. */ \ + "SEND_N_WORDS_SENDING_SYNC_WORD" #nonce ": \n\t" \ SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ "nop\n\t" \ SENDING_NORTH_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ "nop\n\t" \ - send_words_asm \ - SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "nop\n\t" \ - SENDING_NORTH_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ - SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "SEND_N_WORDS_SEND_WORDS_ASM" #nonce ": " send_words_asm \ + SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ "nop\n\t" \ + SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ + SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ + "beqz " #n_words_reg ", END_SEND_N_WORDS" #nonce "\n\t" \ + prepare_for_next_send_words_asm \ + "blt x0, " #n_words_reg ", SEND_N_WORDS_SEND_WORDS_ASM" #nonce "\n\t" \ + "jal x0, SEND_N_WORDS_SENDING_SYNC_WORD" #nonce "\n\t" \ "END_SEND_N_WORDS" #nonce ":\n\t" /** @@ -253,7 +261,7 @@ "READ_N_WORDS_RECEIVE_WORDS" #nonce ": " receive_words_asm /* Upon jumping out of this block (at 0 cycles mod 5) we should be at -2 cycles (mod 5) */ \ "sw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" /* restore the modified imem entry */ \ "nop\n\t" \ - "lw " #clobber4 ", 0(" #noc_base_address ")\n\t" /* cycle 0 mod 5 */ \ + "lw " #clobber4 ", 0(" #noc_base_address ")\n\t" /* cycle 0 mod 5 */ \ "beqz " #clobber4 ", READ_N_WORDS_END" #nonce " \n\t" \ MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES(clobber4, replaced_instruction_reg) \ "slli " #packet_size_reg ", " #replaced_instruction_reg ", 2\n\t" /* cycle 0 mod 5 */ \ diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index f25916a..000672b 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -30,29 +30,29 @@ int main() { } int main0() { - broadcast_count(0, 64); - read_n_words_and_print(1, EAST_INT); - read_n_words_and_print(2, NORTH_INT); - read_n_words_and_print(3, NORTHEAST_INT); + broadcast_count(0, 17); + // read_n_words_and_print(1, EAST_INT); + // read_n_words_and_print(2, NORTH_INT); + // read_n_words_and_print(3, NORTHEAST_INT); } int main1() { read_n_words_and_print(0, EAST_INT); - broadcast_count(1, 17); - read_n_words_and_print(2, NORTHEAST_INT); - read_n_words_and_print(3, NORTH_INT); + // broadcast_count(1, 17); + // read_n_words_and_print(2, NORTHEAST_INT); + // read_n_words_and_print(3, NORTH_INT); } int main2() { read_n_words_and_print(0, NORTH_INT); - read_n_words_and_print(1, NORTHEAST_INT); - broadcast_count(2, 42); - read_n_words_and_print(3, EAST_INT); + // read_n_words_and_print(1, NORTHEAST_INT); + // broadcast_count(2, 42); + // read_n_words_and_print(3, EAST_INT); } int main3() { read_n_words_and_print(0, NORTHEAST_INT); - read_n_words_and_print(1, NORTH_INT); - read_n_words_and_print(2, EAST_INT); - broadcast_count(3, 3); + // read_n_words_and_print(1, NORTH_INT); + // read_n_words_and_print(2, EAST_INT); + // broadcast_count(3, 3); } From 8b863f91fbbcbd423d414b5f106fc015e450ad6d Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 6 Jan 2023 00:32:19 -0800 Subject: [PATCH 24/34] Get the extended protocol to work properly. --- programs/noc/LowLevelInterface/c_api.h | 37 +++++++++---------- .../LowLevelInterface/low_level_interface.h | 15 +++++--- .../low_level_interface_noc.c | 2 +- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h index 02d6e94..00c27ef 100644 --- a/programs/noc/LowLevelInterface/c_api.h +++ b/programs/noc/LowLevelInterface/c_api.h @@ -27,7 +27,7 @@ void we_are_bedeviled() { * The remaining registers are all clobbers. */ #define READ_N_WORDS_AND_PRINT(nonce, DIRECTION_QUINTET_MACRO, sender_reg, clobber0, clobber1, clobber2, clobber3, clobber4, clobber5) \ - READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) + READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4_2CYCLES, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) #define read_n_words_and_print_HELPER(DIRECTION_QUINTET_MACRO) \ asm volatile( \ @@ -50,7 +50,7 @@ void read_n_words_and_print(int sending_core, int direction) { /** code related to BROADCAST_COUNT ***************************************************************/ -#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber0) REPEAT64( \ +#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber0) REPEAT4(REPEAT4( \ "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ "addi " #clobber0 ", " #countdown_reg ", -1\n\t" \ "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ @@ -61,18 +61,20 @@ void read_n_words_and_print(int sending_core, int direction) { "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ "bge x0, " #countdown_reg ", END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce "\n\t" \ -) \ +)) \ + "nop\n\t" \ + "nop\n\t" \ "jal x0, END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ "END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce ":\n\t" \ "nop\n\t" \ "add " #n_words_reg ", x0, x0\n\t" \ - "nop\n\t END_BROADCAST_COUNT_SEND_ASM" #nonce ": " + "nop\n\tEND_BROADCAST_COUNT_SEND_ASM" #nonce ": " -#define BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, constant_128) \ - "li " #constant_128 ", 128\n\t" \ +#define BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, constant_32) \ + "li " #constant_32 ", 32\n\t" \ "add " #n_words_reg ", " #countdown_reg ", x0\n\t" \ - "blt " #n_words_reg ", " #constant_128 ", BROADCAST_COUNT_DONE_INITIALIZING" #nonce "\n\t" \ - "addi " #n_words_reg ", " #countdown_reg ", -128\n\t" \ + "blt " #n_words_reg ", " #constant_32 ", BROADCAST_COUNT_DONE_INITIALIZING" #nonce "\n\t" \ + "li " #n_words_reg ", 32\n\t" \ "nop\n\t" \ "BROADCAST_COUNT_DONE_INITIALIZING" #nonce ":\n\t" \ "li a4, 0xbaaabaaa\n\t" \ @@ -81,18 +83,14 @@ void read_n_words_and_print(int sending_core, int direction) { "csrw 0x51e, a4\n\t" \ "csrw 0x51e, " #n_words_reg "\n\t" -#define BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, constant_128) \ - "li " #constant_128 ", 128\n\t" \ - "addi " #n_words_reg ", " #countdown_reg ", -128\n\t" \ - "bge " #constant_128 ", " #n_words_reg ", BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce "\n\t" \ - "li " #n_words_reg ", 128\n\t" \ +#define BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, constant_32) \ + "li " #constant_32 ", 32\n\t" \ + "addi " #n_words_reg ", " #countdown_reg ", -32\n\t" \ + "bge " #constant_32 ", " #n_words_reg ", BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce "\n\t" \ + "li " #n_words_reg ", 32\n\t" \ "nop\n\t" \ "BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce ":\n\t" \ - "li a4, 0xbaaabaaa\n\t" \ - "csrw 0x51e, a4\n\t" \ - "csrw 0x51e, " #countdown_reg "\n\t" \ - "csrw 0x51e, a4\n\t" \ - "csrw 0x51e, " #n_words_reg "\n\t" \ + "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" /* Don't let it go negative. If it goes negative, we're basically done, so it's OK for it to become garbage. But it mustn't go negative. */ \ "nop\n\t" /** @@ -100,7 +98,6 @@ void read_n_words_and_print(int sending_core, int direction) { * @param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO * except the one corresponding to the current core (which does not broadcast to itself). * @param countdown_reg Input: The first number in the countdown. - * Must be less than 128! May need to be even smaller depending on capability of receiver. * @param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. * @param noc_base_address Clobber. */ @@ -119,7 +116,7 @@ void read_n_words_and_print(int sending_core, int direction) { clobber3, \ clobber4 \ ) \ - "add " #n_words_reg ", " #countdown_reg ", x0\n\t" \ + BROADCAST_COUNT_INITIALIZE_ASM(nonce ## 123, countdown_reg, n_words_reg, clobber2) \ SEND_N_WORDS( \ nonce, \ n_words_reg, \ diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index d7f782c..532ed26 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -189,11 +189,14 @@ "jal x0, SEND_N_WORDS_SENDING_SYNC_WORD" #nonce "\n\t" \ "END_SEND_N_WORDS" #nonce ":\n\t" +#define MUL4_2CYCLES(in_reg, out_reg) \ + MUL4(in_reg, out_reg) "nop\n\t" + /** * @brief Read the number of words specified by the sender. * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the * receiver. - * @param MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES Assembly that takes an in_reg and an out_reg + * @param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES Assembly that takes an in_reg and an out_reg * and sets the out_reg to the number of instructions in each 5-cycle subsequence of * receive_words_asm. * @param offset_numeric_literal 36 plus (4 times the number of instructions in @@ -213,7 +216,7 @@ #define READ_N_WORDS( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -222,7 +225,7 @@ ) READ_N_WORDS__( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -233,7 +236,7 @@ #define READ_N_WORDS__( \ nonce, \ DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES, \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ offset_numeric_literal, \ hex_for_12_bit_distance_from_auipc_to_end, \ sending_core_reg, \ @@ -241,7 +244,7 @@ noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ ) \ BLOCKING_READ(nonce ## 1, noc_base_address, clobber4, sending_core_reg) \ - MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ "slli " #packet_size_reg ", " #packet_size_reg ", 2\n\t" \ SYNC5(nonce ## 2, sending_core_reg, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ "auipc t6, 0\n\t" \ @@ -263,7 +266,7 @@ "nop\n\t" \ "lw " #clobber4 ", 0(" #noc_base_address ")\n\t" /* cycle 0 mod 5 */ \ "beqz " #clobber4 ", READ_N_WORDS_END" #nonce " \n\t" \ - MUL_BY_RECEIVE_WORDS_PERIOD_2INSTRS_2CYCLES(clobber4, replaced_instruction_reg) \ + MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES(clobber4, replaced_instruction_reg) \ "slli " #packet_size_reg ", " #replaced_instruction_reg ", 2\n\t" /* cycle 0 mod 5 */ \ "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" \ diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index 000672b..ab48e8b 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -30,7 +30,7 @@ int main() { } int main0() { - broadcast_count(0, 17); + broadcast_count(0, 130); // read_n_words_and_print(1, EAST_INT); // read_n_words_and_print(2, NORTH_INT); // read_n_words_and_print(3, NORTHEAST_INT); From 309dad60f173d23618a92219881c24e8b2d1fca9 Mon Sep 17 00:00:00 2001 From: Peter Donovan <33707478+petervdonovan@users.noreply.github.com> Date: Fri, 20 Jan 2023 12:52:11 -0800 Subject: [PATCH 25/34] Update programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c Co-authored-by: Marten Lohstroh --- programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c index 804e68b..242d280 100644 --- a/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c +++ b/programs/benchmarks/noc/latency_aligned/noc_latency_aligned.c @@ -36,7 +36,7 @@ static int send_main(uint32_t receiver) { "nop\n\t" "nop\n\t" "li t5, 42\n\t" // Set noc data to 42 - "sw t5, 8(t4)\n\t" // FIXME: Data must be written first? Why? Is it Hardware Bug? + "sw t5, 8(t4)\n\t" // NOTE: Data must be written first. This is by design. "li t5, 0x04\n\t" "sw t5, 4(t4)\n\t" ); From b8fdadbdf2a2f10037871eba625cdc4a85febee8 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 6 Jan 2023 01:05:21 -0800 Subject: [PATCH 26/34] This sends 1023 words in 5867 cycles. That is 87% the maximum possible on this NoC, and by sending more data we can amortize even further. --- programs/noc/LowLevelInterface/low_level_interface_noc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index ab48e8b..e606d79 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -30,7 +30,7 @@ int main() { } int main0() { - broadcast_count(0, 130); + broadcast_count(0, 1023); // read_n_words_and_print(1, EAST_INT); // read_n_words_and_print(2, NORTH_INT); // read_n_words_and_print(3, NORTHEAST_INT); From 8e70867684b05dc1ff84e3fa0e7a24dc7a46e48c Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 6 Jan 2023 16:56:23 -0800 Subject: [PATCH 27/34] Optimize out a SYNC5. This is 14 cycles out of 115 -- not a big performance difference. More importantly, it avoids clobbering a few registers, which the initialize_asm can now write to without their data being lost. --- programs/noc/LowLevelInterface/c_api.h | 7 ++----- .../noc/LowLevelInterface/low_level_interface.h | 17 ++++++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h index 00c27ef..80cee3f 100644 --- a/programs/noc/LowLevelInterface/c_api.h +++ b/programs/noc/LowLevelInterface/c_api.h @@ -77,11 +77,8 @@ void read_n_words_and_print(int sending_core, int direction) { "li " #n_words_reg ", 32\n\t" \ "nop\n\t" \ "BROADCAST_COUNT_DONE_INITIALIZING" #nonce ":\n\t" \ - "li a4, 0xbaaabaaa\n\t" \ - "csrw 0x51e, a4\n\t" \ - "csrw 0x51e, " #countdown_reg "\n\t" \ - "csrw 0x51e, a4\n\t" \ - "csrw 0x51e, " #n_words_reg "\n\t" + "nop\n\t" \ + "nop\n\t" #define BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, constant_32) \ "li " #constant_32 ", 32\n\t" \ diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h index 532ed26..06cb38f 100644 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ b/programs/noc/LowLevelInterface/low_level_interface.h @@ -25,6 +25,7 @@ "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ asm1cycle1 +/** Load 0x80000000 in 1 cycle. */ #define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" /** @@ -72,7 +73,7 @@ /** * @brief Load the NoC base address corresponding to the value of sending_core_reg into - * noc_core_base_address. + * noc_core_base_address. Takes 3 cycles. * @param noc_core_base_address Output: The address of data sent from the given core. * @param sending_core_reg Input (preserved): The number of the sending core. This register is not * clobbered so that the optimization of passing in x0 for it can be used. @@ -82,13 +83,16 @@ "slli " #clobber0 ", " #sending_core_reg ", 2\n\t" \ "add " #noc_core_base_address ", " #clobber0 ", " #noc_core_base_address "\n\t" -#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) // This indirection is necessary for the preprocessor to expand the macro arg __LINE__. :eye_roll: +/** @brief Block until a flit comes from the core withthe given base address. Takes 3 cycles (mod 5). */ +#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) +// This indirection is necessary for the preprocessor to expand the macro arg __LINE__. :eye_roll: #define BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) \ "BLOCKING_READ_POLL" #nonce ": lw " #clobber0 ", 16(" #noc_core_base_address ")\n\t" \ "beq x0, " #clobber0 ", BLOCKING_READ_POLL" #nonce "\n\t" \ /** - * @brief Do a blocking read of the message from the core at sending_core_reg. + * @brief Do a blocking read of the message from the core at sending_core_reg. Takes 3 cycles + * (mod 5). * @param noc_core_base_address Output: The base address corresponding to sending_core_reg. * @param read_to_reg Output: The result of the blocking read. * @param sending_core_reg Input (preserved): The number of the sending core. @@ -103,7 +107,7 @@ "lw " #clobber0 ", " #offset_literal "(" #noc_base_address_reg ")\n\t" \ "or " #accumulator_reg ", " #accumulator_reg ", " #clobber0 "\n\t" -/** Helper to SEND_N_WORDS. */ +/** Helper to SEND_N_WORDS. Takes 0 cycles (mod 5) if successful. */ #define READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, noc_base_address_reg, read_to_reg, sending_core_reg, result_reg, tag_bit_mask, fail_label) \ BLOCKING_READ(nonce, noc_base_address_reg, read_to_reg, sending_core_reg) \ "and " #result_reg ", " #read_to_reg ", " #tag_bit_mask "\n\t" /* check tag bit */ \ @@ -117,7 +121,7 @@ * message receivers tries to send a message after this tries to send a message (so awkward! who * gets to talk first?) * @param initialize_asm Assembly code for preparing to send words, e.g. by loading the words into - * the register file. + * the register file. Must take 2 cycles (mod 5). * @param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! * May assume that the preceding code has already taken care of synchronization. Must preserve * synchronization. Must set n_words_reg to hold the number of words in the next contiguous sequence, @@ -168,9 +172,8 @@ SENDING_TO_TWO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 2, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ "li " #clobber3 ", 3\n\t" \ SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 3, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ - /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. By my count the TDM slot of the next instruction will be -2 mod 5, but for now I won't use that fact, preferring instead to re-synchronize, just to make the assembly easier to write (less brittle, less performant). */ \ + /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. Cycle is 2 (mod 5). */ \ initialize_asm \ - SYNC5(nonce ## 1, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ /* The following 5 cycles are spent to break the receivers from their polling loop. */ \ "SEND_N_WORDS_SENDING_SYNC_WORD" #nonce ": \n\t" \ SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ From d5983a3a813cbc8e034205ff4a7e2fdd6a2e1ddf Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 6 Jan 2023 18:54:44 -0800 Subject: [PATCH 28/34] Bugfix; move header-only lib to flexpret. --- flexpret | 2 +- programs/noc/LowLevelInterface/asm_utils.h | 58 ---- programs/noc/LowLevelInterface/c_api.h | 162 ---------- .../LowLevelInterface/low_level_interface.h | 282 ------------------ .../low_level_interface_noc.c | 31 +- 5 files changed, 15 insertions(+), 520 deletions(-) delete mode 100644 programs/noc/LowLevelInterface/asm_utils.h delete mode 100644 programs/noc/LowLevelInterface/c_api.h delete mode 100644 programs/noc/LowLevelInterface/low_level_interface.h diff --git a/flexpret b/flexpret index fc487e0..29446f6 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit fc487e0c03871f4d4aaa1a94c3f06b7bdce0a966 +Subproject commit 29446f644e9becd09996be19a6225a4e6a3acbbd diff --git a/programs/noc/LowLevelInterface/asm_utils.h b/programs/noc/LowLevelInterface/asm_utils.h deleted file mode 100644 index 688232d..0000000 --- a/programs/noc/LowLevelInterface/asm_utils.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef ASM_UTILS_H -#define ASM_UTILS_H - -#define TOKENPASTE2__(x, y) x ## y -#define TOKENPASTE1__(x, y) TOKENPASTE2__(x, y) -#define TOKENPASTE(x, y) TOKENPASTE1(x, y) - -/** - * @brief Pure assembly version of _fp_print. Executes in 4 cycles. - */ -#define FP_PRINT_ASM(reg, clobber0) \ - "li " #clobber0 ", 0xbaaabaaa\n\t" \ - "csrw 0x51e, " #clobber0 "\n\t" \ - "csrw 0x51e, " #reg "\n\t" - -#define TRUE_MACRO(case_true, case_false) case_true -#define FALSE_MACRO(case_true, case_false) case_false - -#define REPEAT1(x) x -#define REPEAT2(x) x x -#define REPEAT3(x) x x x -#define REPEAT4(x) REPEAT2(REPEAT2(x)) -#define REPEAT5(x) REPEAT2(x) REPEAT3(x) -#define REPEAT7(x) REPEAT2(x) REPEAT5(x) -#define REPEAT11(x) REPEAT7(x) REPEAT4(x) -#define REPEAT64(x) REPEAT4(REPEAT4(REPEAT4(x))) - -#define COUNT_TO_20(MACRO, param0, param1, param2) \ - MACRO(param0, param1, param2, 0) \ - MACRO(param0, param1, param2, 1) \ - MACRO(param0, param1, param2, 2) \ - MACRO(param0, param1, param2, 3) \ - MACRO(param0, param1, param2, 4) \ - MACRO(param0, param1, param2, 5) \ - MACRO(param0, param1, param2, 6) \ - MACRO(param0, param1, param2, 7) \ - MACRO(param0, param1, param2, 8) \ - MACRO(param0, param1, param2, 9) \ - MACRO(param0, param1, param2, 10) \ - MACRO(param0, param1, param2, 11) \ - MACRO(param0, param1, param2, 12) \ - MACRO(param0, param1, param2, 13) \ - MACRO(param0, param1, param2, 14) \ - MACRO(param0, param1, param2, 15) \ - MACRO(param0, param1, param2, 16) \ - MACRO(param0, param1, param2, 17) \ - MACRO(param0, param1, param2, 18) \ - MACRO(param0, param1, param2, 19) - -#define MUL4(in_reg, out_reg) \ - "slli " #out_reg ", " #in_reg ", 2\n\t" - -#define MUL5(in_reg, out_reg) \ - MUL4(in_reg, out_reg) \ - "add " #out_reg ", " #out_reg ", " #in_reg "\n\t" - - -#endif // ASM_UTILS_H diff --git a/programs/noc/LowLevelInterface/c_api.h b/programs/noc/LowLevelInterface/c_api.h deleted file mode 100644 index 80cee3f..0000000 --- a/programs/noc/LowLevelInterface/c_api.h +++ /dev/null @@ -1,162 +0,0 @@ -#ifndef C_API_H -#define C_API_H - -#include - -#include "low_level_interface.h" - -void we_are_bedeviled() { - _fp_print(666); -} - -/** code related to READ_N_WORDS_AND_PRINT ********************************************************/ - -#define LOAD_AND_PRINT_RECEIVER_BODY(noc_base_address, clobber0, clobber1) REPEAT64( \ - "lw " #clobber0 ", 0(" #noc_base_address ")\n\t" /* t4 has the base address for the noc */ \ - "nop\n\t" \ - "csrw 0x51e, " #clobber1 "\n\t" \ - "csrw 0x51e, " #clobber0 "\n\t" \ -) "nop\n\t" /* Do not let the code self-modification kill a line that we actually need in the special case that the packet length is exactly 64. */ - -/** - * @brief Block and print up to 64 words that were sent by another core using the protocol of - * SEND_N_WORDS. - * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the - * receiver. - * @param sender_reg A register containing the core ID of the sender. - * The remaining registers are all clobbers. - */ -#define READ_N_WORDS_AND_PRINT(nonce, DIRECTION_QUINTET_MACRO, sender_reg, clobber0, clobber1, clobber2, clobber3, clobber4, clobber5) \ - READ_N_WORDS(nonce, DIRECTION_QUINTET_MACRO, MUL4_2CYCLES, 48, 438, sender_reg, LOAD_AND_PRINT_RECEIVER_BODY(clobber0, clobber4, clobber5), clobber0, clobber1, clobber2, clobber3, clobber4) - -#define read_n_words_and_print_HELPER(DIRECTION_QUINTET_MACRO) \ - asm volatile( \ - "li a4, 0xbaaabaaa\n\t" \ - READ_N_WORDS_AND_PRINT(__LINE__, DIRECTION_QUINTET_MACRO, %[sender_reg], t4, t5, a5, a2, a3, a4) \ - : /* no outputs */ \ - : [sender_reg] "r" (sending_core) \ - : "t4", "t5", "a5", "a2", "a3", "a4" \ - ) - -/** @brief See READ_N_WORDS_AND_PRINT for documentation. */ -void read_n_words_and_print(int sending_core, int direction) { - switch (direction) { - case NORTHEAST_INT: read_n_words_and_print_HELPER(NORTHEAST_QUINTET); break; - case NORTH_INT: read_n_words_and_print_HELPER(NORTH_QUINTET); break; - case EAST_INT: read_n_words_and_print_HELPER(EAST_QUINTET); break; - default: we_are_bedeviled(); - } -} - -/** code related to BROADCAST_COUNT ***************************************************************/ - -#define BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber0) REPEAT4(REPEAT4( \ - "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ - "addi " #clobber0 ", " #countdown_reg ", -1\n\t" \ - "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ - "sw " #countdown_reg ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #clobber0 ", END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce "\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "addi " #countdown_reg ", " #clobber0 ", -1\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "sw " #clobber0 ", 0(" #noc_base_address ")\n\t" \ - "bge x0, " #countdown_reg ", END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce "\n\t" \ -)) \ - "nop\n\t" \ - "nop\n\t" \ - "jal x0, END_BROADCAST_COUNT_SEND_ASM" #nonce "\n\t" \ - "END_BROADCAST_COUNT_SEND_ASM_BODY" #nonce ":\n\t" \ - "nop\n\t" \ - "add " #n_words_reg ", x0, x0\n\t" \ - "nop\n\tEND_BROADCAST_COUNT_SEND_ASM" #nonce ": " - -#define BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, constant_32) \ - "li " #constant_32 ", 32\n\t" \ - "add " #n_words_reg ", " #countdown_reg ", x0\n\t" \ - "blt " #n_words_reg ", " #constant_32 ", BROADCAST_COUNT_DONE_INITIALIZING" #nonce "\n\t" \ - "li " #n_words_reg ", 32\n\t" \ - "nop\n\t" \ - "BROADCAST_COUNT_DONE_INITIALIZING" #nonce ":\n\t" \ - "nop\n\t" \ - "nop\n\t" - -#define BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, constant_32) \ - "li " #constant_32 ", 32\n\t" \ - "addi " #n_words_reg ", " #countdown_reg ", -32\n\t" \ - "bge " #constant_32 ", " #n_words_reg ", BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce "\n\t" \ - "li " #n_words_reg ", 32\n\t" \ - "nop\n\t" \ - "BROADCAST_COUNT_DONE_PREPARING_NEXT" #nonce ":\n\t" \ - "andi " #n_words_reg ", " #n_words_reg ", 1023\n\t" /* Don't let it go negative. If it goes negative, we're basically done, so it's OK for it to become garbage. But it mustn't go negative. */ \ - "nop\n\t" - -/** - * @brief Broadcast a countdown from countdown_reg to 1 to all other cores. - * @param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO - * except the one corresponding to the current core (which does not broadcast to itself). - * @param countdown_reg Input: The first number in the countdown. - * @param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. - * @param noc_base_address Clobber. - */ -#define BROADCAST_COUNT( \ - nonce, \ - SENDING_TO_ZERO_MACRO, \ - SENDING_TO_ONE_MACRO, \ - SENDING_TO_TWO_MACRO, \ - SENDING_TO_THREE_MACRO, \ - countdown_reg, \ - result_reg, \ - noc_base_address, \ - n_words_reg, \ - clobber1, \ - clobber2, \ - clobber3, \ - clobber4 \ -) \ - BROADCAST_COUNT_INITIALIZE_ASM(nonce ## 123, countdown_reg, n_words_reg, clobber2) \ - SEND_N_WORDS( \ - nonce, \ - n_words_reg, \ - result_reg, \ - TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, \ - SENDING_TO_ZERO_MACRO, SENDING_TO_ONE_MACRO, SENDING_TO_TWO_MACRO, SENDING_TO_THREE_MACRO, \ - BROADCAST_COUNT_INITIALIZE_ASM(nonce, countdown_reg, n_words_reg, clobber2), \ - BROADCAST_COUNT_SEND_ASM(nonce, n_words_reg, countdown_reg, noc_base_address, clobber3), \ - BROADCAST_COUNT_PREPARE_NEXT_SEND_ASM(nonce, countdown_reg, n_words_reg, clobber2), \ - noc_base_address, clobber1, clobber2, clobber3, clobber4 \ - ) - -#define broadcast_count_HELPER( \ - nonce, \ - SENDING_TO_ZERO_MACRO, \ - SENDING_TO_ONE_MACRO, \ - SENDING_TO_TWO_MACRO, \ - SENDING_TO_THREE_MACRO \ -) \ - asm volatile( \ - BROADCAST_COUNT( \ - nonce, \ - SENDING_TO_ZERO_MACRO, \ - SENDING_TO_ONE_MACRO, \ - SENDING_TO_TWO_MACRO, \ - SENDING_TO_THREE_MACRO, \ - %[countdown_reg], \ - t0, t1, t2, t3, t4, t5, t6 \ - ) \ - : /* no outputs */ \ - : [countdown_reg] "r" (start_count_at) \ - : "t0", "t1", "t2", "t3", "t4", "t5", "t6" \ - ) - -/** @brief See BROADCAST_COUNT for documentation. start_count_at corresponds to countdown_reg. */ -void broadcast_count(int current_core, int start_count_at) { - switch(current_core) { - case 0: broadcast_count_HELPER(__LINE__, FALSE_MACRO, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO); break; - case 1: broadcast_count_HELPER(__LINE__, TRUE_MACRO, FALSE_MACRO, TRUE_MACRO, TRUE_MACRO); break; - case 2: broadcast_count_HELPER(__LINE__, TRUE_MACRO, TRUE_MACRO, FALSE_MACRO, TRUE_MACRO); break; - case 3: broadcast_count_HELPER(__LINE__, TRUE_MACRO, TRUE_MACRO, TRUE_MACRO, FALSE_MACRO); break; - default: we_are_bedeviled(); - } -} - -#endif // C_API_H diff --git a/programs/noc/LowLevelInterface/low_level_interface.h b/programs/noc/LowLevelInterface/low_level_interface.h deleted file mode 100644 index 06cb38f..0000000 --- a/programs/noc/LowLevelInterface/low_level_interface.h +++ /dev/null @@ -1,282 +0,0 @@ -#ifndef LOW_LEVEL_INTERFACE_H -#define LOW_LEVEL_INTERFACE_H - -#include "asm_utils.h" - -#define NORTHEAST_INT 0 -#define NORTH_INT 8 -#define EAST_INT 12 - -#define NORTHEAST_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ - "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ - asm2cycles \ - asm1cycle0 \ - asm1cycle1 - -#define NORTH_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ - asm2cycles \ - "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ - asm1cycle0 \ - asm1cycle1 - -#define EAST_QUINTET(asm2cycles, asm1cycle0, asm1cycle1, to_send_reg, noc_base_address_reg) \ - asm2cycles \ - asm1cycle0 \ - "sw " #to_send_reg ", 0(" #noc_base_address_reg ")\n\t" \ - asm1cycle1 - -/** Load 0x80000000 in 1 cycle. */ -#define LOAD_NOC_BASE_ADDRESS(reg) "li " #reg ", 0x80000000\n\t" - -/** - * @brief The instruction immediately following SYNC5 is able to store a word into the zeroth TDM - * slot (northwest) in a single-threaded setting. Clobber the given registers. - * It 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining 9 cycles are overhead. - */ -#define SYNC5(nonce, noc_base_address, clobber0, clobber1, clobber2, clobber3) \ - "li " #clobber0 ", 1\n\t" \ - "li " #clobber1 ", 2\n\t" \ - "li " #clobber2 ", 3\n\t" \ - "li " #noc_base_address ", 0x80000000\n\t" \ - "lw " #clobber3 ", 32(" #noc_base_address ")\n\t" /* Get elapsed cycles mod period */ \ - "beq " #clobber3 ", " #clobber0 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #clobber3 ", x0, DONE_SYNCHRONIZING" #nonce "\n\t" \ - "nop\n\t" \ - "beq " #clobber3 ", " #clobber2 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "beq " #clobber3 ", " #clobber1 ", DONE_SYNCHRONIZING" #nonce "\n\t" \ - "DONE_SYNCHRONIZING" #nonce ":\n\t" - -/** - * @brief Broadcast the value stored in reg to all other cores. The instruction immediately - * following BROADCAST_SYNCED is able to store into the zeroth TDM slot. - * This assumes that the current thread is synchronized to the TDM schedule. - */ -#define BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, instr0, instr1) \ - "sw " #reg ", 0(" #noc_base_address ")\n\t" \ - instr0 \ - "sw " #reg ", 0(" #noc_base_address ")\n\t" \ - "sw " #reg ", 0(" #noc_base_address ")\n\t" \ - instr1 - -#define BROADCAST_SYNCED(nonce, noc_base_address, reg) \ - BROADCAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, "nop\n\t", "nop\n\t") - -#define SEND_NORTHEAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, instr0, instr1, instr2, instr3) \ - "sw " #reg ", 0(" #noc_base_address ")\n\t" \ - instr0 \ - instr1 \ - instr2 \ - instr3 - -#define SEND_NORTHEAST_SYNCED(nonce, noc_base_address, reg) \ - SEND_NORTHEAST_SYNCED_WITH_INSTRUCTIONS(nonce, noc_base_address, reg, "nop\n\t", "nop\n\t", "nop\n\t", "nop\n\t") - -/** - * @brief Load the NoC base address corresponding to the value of sending_core_reg into - * noc_core_base_address. Takes 3 cycles. - * @param noc_core_base_address Output: The address of data sent from the given core. - * @param sending_core_reg Input (preserved): The number of the sending core. This register is not - * clobbered so that the optimization of passing in x0 for it can be used. - */ -#define LOAD_NOC_CORE_BASE_ADDRESS(noc_core_base_address, sending_core_reg, clobber0) \ - LOAD_NOC_BASE_ADDRESS(noc_core_base_address) \ - "slli " #clobber0 ", " #sending_core_reg ", 2\n\t" \ - "add " #noc_core_base_address ", " #clobber0 ", " #noc_core_base_address "\n\t" - -/** @brief Block until a flit comes from the core withthe given base address. Takes 3 cycles (mod 5). */ -#define BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, clobber0) BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) -// This indirection is necessary for the preprocessor to expand the macro arg __LINE__. :eye_roll: -#define BLOCK_ON_FLIT_FROM_CORE__(nonce, noc_core_base_address, clobber0) \ - "BLOCKING_READ_POLL" #nonce ": lw " #clobber0 ", 16(" #noc_core_base_address ")\n\t" \ - "beq x0, " #clobber0 ", BLOCKING_READ_POLL" #nonce "\n\t" \ - -/** - * @brief Do a blocking read of the message from the core at sending_core_reg. Takes 3 cycles - * (mod 5). - * @param noc_core_base_address Output: The base address corresponding to sending_core_reg. - * @param read_to_reg Output: The result of the blocking read. - * @param sending_core_reg Input (preserved): The number of the sending core. - */ -#define BLOCKING_READ(nonce, noc_core_base_address, read_to_reg, sending_core_reg) \ - LOAD_NOC_CORE_BASE_ADDRESS(noc_core_base_address, sending_core_reg, read_to_reg) \ - BLOCK_ON_FLIT_FROM_CORE(nonce, noc_core_base_address, read_to_reg) \ - "lw " #read_to_reg ", 0(" #noc_core_base_address ")\n\t" - -/** Helper to SEND_N_WORDS. Accumulates valid bits. */ -#define OR_VALIDITY_OF_NOC_DATA(noc_base_address_reg, accumulator_reg, offset_literal, clobber0) \ - "lw " #clobber0 ", " #offset_literal "(" #noc_base_address_reg ")\n\t" \ - "or " #accumulator_reg ", " #accumulator_reg ", " #clobber0 "\n\t" - -/** Helper to SEND_N_WORDS. Takes 0 cycles (mod 5) if successful. */ -#define READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce, noc_base_address_reg, read_to_reg, sending_core_reg, result_reg, tag_bit_mask, fail_label) \ - BLOCKING_READ(nonce, noc_base_address_reg, read_to_reg, sending_core_reg) \ - "and " #result_reg ", " #read_to_reg ", " #tag_bit_mask "\n\t" /* check tag bit */ \ - "bnez " #result_reg ", " #fail_label "\n\t" - -/** - * @brief Try to send the number of words specified by n_words_reg. - * @param n_words_reg The number of words to send. - * @param result_reg 0 if successful, 1 if one of the potential message receivers has already tried - * to send a message before this even tries to send a message, 0x80000000 if one of the potential - * message receivers tries to send a message after this tries to send a message (so awkward! who - * gets to talk first?) - * @param initialize_asm Assembly code for preparing to send words, e.g. by loading the words into - * the register file. Must take 2 cycles (mod 5). - * @param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! - * May assume that the preceding code has already taken care of synchronization. Must preserve - * synchronization. Must set n_words_reg to hold the number of words in the next contiguous sequence, - * with the top bit set high if the length of prepare_for_next_send_words_asm is not exactly 11 - * cycles. - * @param prepare_for_next_send_words_asm Prepare for the next run of send_words_asm. - * @param TIMES_TO_REPEAT_SEND_WORDS_ASM Macro that repeats send_words_asm an appropriate number of - * times. - * @param noc_base_address Register that will hold the NoC base address. No assumptions are made - * about the original value held in this register (the NoC base address will be written into it - * regardless). - * The remaining parameters (e.g., SENDING_NORTHEAST_MACRO) must be either TRUE_MACRO or - * FALSE_MACRO. - */ -#define SEND_N_WORDS( \ - nonce, \ - n_words_reg, \ - result_reg, \ - SENDING_NORTHEAST_MACRO, \ - SENDING_NORTH_MACRO, \ - SENDING_EAST_MACRO, \ - SENDING_TO_ZERO_MACRO, \ - SENDING_TO_ONE_MACRO, \ - SENDING_TO_TWO_MACRO, \ - SENDING_TO_THREE_MACRO, \ - initialize_asm, \ - send_words_asm, \ - prepare_for_next_send_words_asm, \ - noc_base_address, clobber1, clobber2, clobber3, clobber4 \ -) \ - LOAD_NOC_BASE_ADDRESS(noc_base_address) \ - "add " #result_reg ", x0, x0\n\t" \ - SENDING_TO_ZERO_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 16, clobber1), "") \ - SENDING_TO_ONE_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 20, clobber1), "") \ - SENDING_TO_TWO_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 24, clobber1), "") \ - SENDING_TO_THREE_MACRO(OR_VALIDITY_OF_NOC_DATA(noc_base_address, result_reg, 28, clobber1), "") \ - "bnez " #result_reg ", END_SEND_N_WORDS" #nonce "\n\t" /** Fail with error code 1 */ \ - SYNC5(nonce ## 0, noc_base_address, clobber1, clobber2, clobber3, clobber4) \ - "li " #clobber1 ", 0x80000000\n\t" /* Set the top bit as a tag bit. If u wanna send so many words that this creates amiguity, then u have a bigger problem on your hands */ \ - "or " #n_words_reg ", " #n_words_reg ", " #clobber1 "\n\t" \ - SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "li " #clobber3 ", 1\n\t" \ - SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - SENDING_TO_ZERO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 0, noc_base_address, clobber2, x0, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ - SENDING_TO_ONE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 1, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ - "li " #clobber3 ", 2\n\t" \ - SENDING_TO_TWO_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 2, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ - "li " #clobber3 ", 3\n\t" \ - SENDING_TO_THREE_MACRO(READ_AND_FAIL_IF_TAG_BIT_IS_1(nonce ## 3, noc_base_address, clobber2, clobber3, result_reg, clobber1, END_SEND_N_WORDS ## nonce), "") \ - /* At this point, all prospective message receivers have agreed that they are ready to receive the given number of words by replying using responses that have zero as their tag bit. Cycle is 2 (mod 5). */ \ - initialize_asm \ - /* The following 5 cycles are spent to break the receivers from their polling loop. */ \ - "SEND_N_WORDS_SENDING_SYNC_WORD" #nonce ": \n\t" \ - SENDING_NORTHEAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "nop\n\t" \ - SENDING_NORTH_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ - SENDING_EAST_MACRO("sw x0, 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "nop\n\t" \ - "SEND_N_WORDS_SEND_WORDS_ASM" #nonce ": " send_words_asm \ - SENDING_NORTHEAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "nop\n\t" \ - SENDING_NORTH_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - SENDING_EAST_MACRO("sw " #n_words_reg ", 0(" #noc_base_address ")\n\t", "nop\n\t") \ - "beqz " #n_words_reg ", END_SEND_N_WORDS" #nonce "\n\t" \ - prepare_for_next_send_words_asm \ - "blt x0, " #n_words_reg ", SEND_N_WORDS_SEND_WORDS_ASM" #nonce "\n\t" \ - "jal x0, SEND_N_WORDS_SENDING_SYNC_WORD" #nonce "\n\t" \ - "END_SEND_N_WORDS" #nonce ":\n\t" - -#define MUL4_2CYCLES(in_reg, out_reg) \ - MUL4(in_reg, out_reg) "nop\n\t" - -/** - * @brief Read the number of words specified by the sender. - * @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the - * receiver. - * @param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES Assembly that takes an in_reg and an out_reg - * and sets the out_reg to the number of instructions in each 5-cycle subsequence of - * receive_words_asm. - * @param offset_numeric_literal 36 plus (4 times the number of instructions in - * preparatory asm). This will be 48 unless one provides instructions that - * cause stalls, such as loads, branches, or jumps. - * @param hex_for_12_bit_distance_from_auipc_to_end 3-digit hex (no 0x prefix) for - * offset_numeric_literal plus the offset of receive_words_asm. - * @param sending_core_reg Input: A register specifying the sending core. Clobbered. - * @param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and - * exactly every 5 cycles thereafter! - * @param noc_base_address Output: A register that will be set to the base address of the NoC - * corresponding to the given core. - * The remaining registers are all clobbers -- they have descriptive names, but they are not really - * API except insofar as they are clobbered. t6 is also clobbered, and receive_words_asm cannot use - * t6. The only clobber that receive_words_asm should use is clobber4. - */ -#define READ_N_WORDS( \ - nonce, \ - DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ - offset_numeric_literal, \ - hex_for_12_bit_distance_from_auipc_to_end, \ - sending_core_reg, \ - receive_words_asm, \ - noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ -) READ_N_WORDS__( \ - nonce, \ - DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ - offset_numeric_literal, \ - hex_for_12_bit_distance_from_auipc_to_end, \ - sending_core_reg, \ - receive_words_asm, \ - noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ -) - -#define READ_N_WORDS__( \ - nonce, \ - DIRECTION_QUINTET_MACRO, \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES, \ - offset_numeric_literal, \ - hex_for_12_bit_distance_from_auipc_to_end, \ - sending_core_reg, \ - receive_words_asm, \ - noc_base_address, packet_size_reg, jalr_word_reg, replaced_instruction_reg, clobber4 \ -) \ - BLOCKING_READ(nonce ## 1, noc_base_address, clobber4, sending_core_reg) \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES(clobber4, packet_size_reg) /* This kills the tag bit, as desired */ \ - "slli " #packet_size_reg ", " #packet_size_reg ", 2\n\t" \ - SYNC5(nonce ## 2, sending_core_reg, t6, jalr_word_reg, replaced_instruction_reg, clobber4) \ - "auipc t6, 0\n\t" \ - "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ - "li " #jalr_word_reg ", 0x" #hex_for_12_bit_distance_from_auipc_to_end "F8067\n\t" /* This hardcodes a write to the address stored at t6 = x31 with the given offset. Note also that li is two instructions and takes two cycles. */ \ - "nop\n\t" /* It might be wise to preserve jalr_word_reg for use in a future iteration. t6 can also be preserved, although we will need to add/subtract from it, according to the new sub-packet length, presumably using a preserved value of packet_size_reg. */ \ - DIRECTION_QUINTET_MACRO( \ - "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t", /* replaced_instruction_reg := instruction to be replaced (receive_words_asm must not clobber replaced_instruction_reg!) */ \ - "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t", \ - "nop\n\t", \ - x0, /* It doesn't really matter what you send, as long as the tag bit is zero */ \ - noc_base_address \ - ) \ - "READ_N_WORDS_WAIT_FOR_SYNC_WORD" #nonce ": " BLOCK_ON_FLIT_FROM_CORE(nonce, noc_base_address, clobber4) /* 2 instructions, -2 cycles (mod 5) */ \ - "nop\n\t" \ - "nop\n\t" \ - "READ_N_WORDS_RECEIVE_WORDS" #nonce ": " receive_words_asm /* Upon jumping out of this block (at 0 cycles mod 5) we should be at -2 cycles (mod 5) */ \ - "sw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" /* restore the modified imem entry */ \ - "nop\n\t" \ - "lw " #clobber4 ", 0(" #noc_base_address ")\n\t" /* cycle 0 mod 5 */ \ - "beqz " #clobber4 ", READ_N_WORDS_END" #nonce " \n\t" \ - MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES(clobber4, replaced_instruction_reg) \ - "slli " #packet_size_reg ", " #replaced_instruction_reg ", 2\n\t" /* cycle 0 mod 5 */ \ - "add " #packet_size_reg ", t6, " #packet_size_reg "\n\t" \ - "lw " #replaced_instruction_reg ", 52(" #packet_size_reg ")\n\t" \ - "sw " #jalr_word_reg ", 52(" #packet_size_reg ")\n\t" \ - "nop\n\t" /* cycle 0 mod 5 */ \ - "blt " #clobber4 ", x0, READ_N_WORDS_WAIT_FOR_SYNC_WORD" #nonce " \n\t" /* Interpret a tag bit as a request from the sender to have some extra time to prepare for the next burst of data. No need to keep the same alignment mod 5 as before. Coarse (within 5 of cycle-accurate) but adequate synchronization will be done in the busy loop. */ \ - "jal x0, READ_N_WORDS_RECEIVE_WORDS" #nonce "\n\t" /* Jump straight in without re-synchronizing, after incurring 20 (= 0 (mod 5)) cycles of overhead. */ \ - "READ_N_WORDS_END" #nonce ": " \ - -#endif // LOW_LEVEL_INTERFACE_H diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/LowLevelInterface/low_level_interface_noc.c index e606d79..1335de9 100644 --- a/programs/noc/LowLevelInterface/low_level_interface_noc.c +++ b/programs/noc/LowLevelInterface/low_level_interface_noc.c @@ -1,9 +1,6 @@ #include #include - -#include "asm_utils.h" -#include "low_level_interface.h" -#include "c_api.h" +#include /* | NE | _ | N | E | _ | */ @@ -30,29 +27,29 @@ int main() { } int main0() { - broadcast_count(0, 1023); - // read_n_words_and_print(1, EAST_INT); - // read_n_words_and_print(2, NORTH_INT); - // read_n_words_and_print(3, NORTHEAST_INT); + broadcast_count(0, 125); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); } int main1() { read_n_words_and_print(0, EAST_INT); - // broadcast_count(1, 17); - // read_n_words_and_print(2, NORTHEAST_INT); - // read_n_words_and_print(3, NORTH_INT); + broadcast_count(1, 17); + read_n_words_and_print(2, NORTHEAST_INT); + read_n_words_and_print(3, NORTH_INT); } int main2() { read_n_words_and_print(0, NORTH_INT); - // read_n_words_and_print(1, NORTHEAST_INT); - // broadcast_count(2, 42); - // read_n_words_and_print(3, EAST_INT); + read_n_words_and_print(1, NORTHEAST_INT); + broadcast_count(2, 42); + read_n_words_and_print(3, EAST_INT); } int main3() { read_n_words_and_print(0, NORTHEAST_INT); - // read_n_words_and_print(1, NORTH_INT); - // read_n_words_and_print(2, EAST_INT); - // broadcast_count(3, 3); + read_n_words_and_print(1, NORTH_INT); + read_n_words_and_print(2, EAST_INT); + broadcast_count(3, 3); } From 4c646944e08291501d5790f21f22161f3cd80f83 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 2 Feb 2023 21:47:21 -0800 Subject: [PATCH 29/34] Start creating a BroadcastMemory program. The relevant functions are commented out because GCC inline asm apparently does not permit clobbering caller-saved registers. --- flexpret | 2 +- .../Makefile | 2 +- .../broadcast_count_noc.c} | 0 programs/noc/BroadcastMemory/Makefile | 10 ++++ .../BroadcastMemory/broadcast_memory_noc.c | 55 +++++++++++++++++++ 5 files changed, 67 insertions(+), 2 deletions(-) rename programs/noc/{LowLevelInterface => BroadcastCount}/Makefile (62%) rename programs/noc/{LowLevelInterface/low_level_interface_noc.c => BroadcastCount/broadcast_count_noc.c} (100%) create mode 100644 programs/noc/BroadcastMemory/Makefile create mode 100644 programs/noc/BroadcastMemory/broadcast_memory_noc.c diff --git a/flexpret b/flexpret index 29446f6..8465589 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 29446f644e9becd09996be19a6225a4e6a3acbbd +Subproject commit 846558974d25e54e2fe04522e9f37a7e165f264a diff --git a/programs/noc/LowLevelInterface/Makefile b/programs/noc/BroadcastCount/Makefile similarity index 62% rename from programs/noc/LowLevelInterface/Makefile rename to programs/noc/BroadcastCount/Makefile index 4bcaa17..18ee0b3 100644 --- a/programs/noc/LowLevelInterface/Makefile +++ b/programs/noc/BroadcastCount/Makefile @@ -1,5 +1,5 @@ build: - riscv_compile.sh ispm low_level_interface_noc.c + riscv_compile.sh ispm broadcast_count_noc.c clean: riscv_clean.sh diff --git a/programs/noc/LowLevelInterface/low_level_interface_noc.c b/programs/noc/BroadcastCount/broadcast_count_noc.c similarity index 100% rename from programs/noc/LowLevelInterface/low_level_interface_noc.c rename to programs/noc/BroadcastCount/broadcast_count_noc.c diff --git a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile new file mode 100644 index 0000000..6ae9aeb --- /dev/null +++ b/programs/noc/BroadcastMemory/Makefile @@ -0,0 +1,10 @@ +build: + riscv_compile.sh ispm broadcast_memory_noc.c + +clean: + riscv_clean.sh + + +rebuild: clean build + +PHONY: build clean rebuild diff --git a/programs/noc/BroadcastMemory/broadcast_memory_noc.c b/programs/noc/BroadcastMemory/broadcast_memory_noc.c new file mode 100644 index 0000000..1335de9 --- /dev/null +++ b/programs/noc/BroadcastMemory/broadcast_memory_noc.c @@ -0,0 +1,55 @@ +#include +#include +#include + +/* | NE | _ | N | E | _ | */ + +/*********************** + * core 0 / N \ core 1 * + * W + E * + * core 2 \ S / core 3 * + ***********************/ + +int main0(); +int main1(); +int main2(); +int main3(); + +int main() { + int core_id = read_csr(CSR_COREID); + switch(core_id) { + case 0: main0(); break; + case 1: main1(); break; + case 2: main2(); break; + case 3: main3(); break; + default: _fp_print(666); //ERROR + } +} + +int main0() { + broadcast_count(0, 125); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); +} + +int main1() { + read_n_words_and_print(0, EAST_INT); + broadcast_count(1, 17); + read_n_words_and_print(2, NORTHEAST_INT); + read_n_words_and_print(3, NORTH_INT); +} + +int main2() { + read_n_words_and_print(0, NORTH_INT); + read_n_words_and_print(1, NORTHEAST_INT); + broadcast_count(2, 42); + read_n_words_and_print(3, EAST_INT); +} + +int main3() { + read_n_words_and_print(0, NORTHEAST_INT); + read_n_words_and_print(1, NORTH_INT); + read_n_words_and_print(2, EAST_INT); + broadcast_count(3, 3); +} From b18fd087b3346a8f7e9189234bf5ff0450139940 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Thu, 2 Feb 2023 23:05:30 -0800 Subject: [PATCH 30/34] Assembly generation "hello world". --- .gitignore | 1 + programs/noc/BroadcastMemory/Makefile | 12 ++- programs/noc/BroadcastMemory/asm/ctrl.rvg | 75 +++++++++++++++++++ programs/noc/BroadcastMemory/asm/hello.rvg | 11 +++ programs/noc/BroadcastMemory/asm/rvgbuild | 1 + programs/noc/BroadcastMemory/asm/stdlib.rvg | 43 +++++++++++ .../BroadcastMemory/broadcast_memory_noc.c | 71 +++++++++--------- 7 files changed, 176 insertions(+), 38 deletions(-) create mode 100644 programs/noc/BroadcastMemory/asm/ctrl.rvg create mode 100644 programs/noc/BroadcastMemory/asm/hello.rvg create mode 100644 programs/noc/BroadcastMemory/asm/rvgbuild create mode 100644 programs/noc/BroadcastMemory/asm/stdlib.rvg diff --git a/.gitignore b/.gitignore index 08a5683..459c0e8 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ emulator/obj_dir **/*.orig **/*.dump **/fp-emu +**/asm-gen/ diff --git a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile index 6ae9aeb..89668c4 100644 --- a/programs/noc/BroadcastMemory/Makefile +++ b/programs/noc/BroadcastMemory/Makefile @@ -1,10 +1,16 @@ -build: - riscv_compile.sh ispm broadcast_memory_noc.c +build: gen compile + +gen: + mkdir -p asm-gen + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello.rvg > asm-gen/hello2.s + +compile: + riscv_compile.sh ispm asm-gen/hello2.s broadcast_memory_noc.c clean: + rm -r asm-gen riscv_clean.sh - rebuild: clean build PHONY: build clean rebuild diff --git a/programs/noc/BroadcastMemory/asm/ctrl.rvg b/programs/noc/BroadcastMemory/asm/ctrl.rvg new file mode 100644 index 0000000..08c7384 --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/ctrl.rvg @@ -0,0 +1,75 @@ +[mu [(program)] +[def (condition!) [assertify [lam [(x)] [[hasattr {condition}] x]] {a condition}]] +[def (<) [lam [(rs1 reg!) (rs2 reg!)] + [[addattr {condition}] + [lam [(label frag!)] + [pair { + blt rs1 rs2 label + } { + bgeu rs1 rs2 label + }]]]]] +[def (branch3) [lam [(condition condition!) (instrs block!)] + [def (assembly) { + [id { [id [[condition {DONE}] false]] } ] + [id instrs] + DONE: + }] + [[unsafe-assert-exact-cycles {3}] assembly]]] +[def (unroll) [lam [(n num!) (body block!)] + [[unsafe-assert-exact-cycles [* n [cycles? body]]] + [[[fold_range {1} {0} n] {}] + [lam [(acc) (_)] { + [id acc] + [id body] + }]]]]] +[def (for) [lam [ + (start reg!) (stop reg!) + (step num! [>! {0}]) + (body block!)] + [def (addition-and-branching-cycles) {4}] + [def (total-cycles [lamof num!]) [lam [(num-iterations)] [- + [* num-iterations + [+ [cycles? body] addition-and-branching-cycles]] + {2}]]] + [def (assembly) { + LOOP: + [id body] + addi start start step + blt start stop LOOP + }] + [pair assembly total-cycles]]] +[def (for_startup+iterations*-2) [lam [ + (counter reg!) (clobber reg!) + (start num!) (stop num! [>! start]) (step num! [>! {0}]) + (body block!)] + [def (stopreg) clobber] + [def (startup) { + li counter start + li stopreg stop + }] + [def (num-iterations) [/ [- start stop] [- {0} step]]] + [def (assembly,cycles pair!) [for counter stopreg step body]] + { + [id startup] + [[unsafe-assert-exact-cycles + [[assembly,cycles false] num-iterations]] + [assembly,cycles true]] + }]] +[def (prologue-epilogue!) [-> + [lamof [∘ num! [∘ [>=! {0}] [ ctrl.rvg -> hello.rvg diff --git a/programs/noc/BroadcastMemory/asm/stdlib.rvg b/programs/noc/BroadcastMemory/asm/stdlib.rvg new file mode 100644 index 0000000..29aa47b --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/stdlib.rvg @@ -0,0 +1,43 @@ +[lam [(program)] + +[def (id) [lam [(x)] x]] + +[def (if) [lam [(condition) (then) (else)] + [def (selected) [condition then else]] + [selected]]] + +[def (lam!) [lam [(k)] [lam [(l)] + [if [[lam? k] l] + [lam [] l] + [fail l {Expected lam of [id k] parameters}]]]]] + +[def (->) [lam [(arg-check) (result-check)] + [lam [(l arg-check)] + [applierify-varargs [lam [(applier)] + [result-check [applier l]]]]]]] +[def (assertify) [lam [(? [lam! {1}]) (description)] [lam [(checkee)] + [if [? checkee] + [lam [] checkee] + [fail checkee {Expected [id description]}]]]]] +[def (hasattr!) [lam [(attr)] [assertify [lam [(x)] [[hasattr attr] x]] attr]]] +[def (num!) [assertify [lam [(x)] [num? x]] {a number}]] +[def (frag!) [assertify [lam [(x)] [frag? x]] {a fragment}]] +[def (block!) [assertify [lam [(x)] [block? x]] {an assembly block}]] +[def (reg!) [assertify [lam [(x)] [reg? x]] {a register}]] +[def (lam! [lamof num!]) lam!] + +[def (true [addattr {bool}]) [lam [(a) (b)] a]] +[def (false [addattr {bool}]) [lam [(a) (b)] b]] +[def (bool!) [hasattr! {bool}]] +[def (pair [lam! {2}]) [lam [(a) (b)] + [[addattr {pair}] + [lam [(get bool!)] + [get a b]]]]] +[def (pair!) [hasattr! {pair}]] +[def (range!) [hasattr! {range}]] +[def (println) [lam [(s)] [lam [] [[print s]] [[print { +}]]]]] +[def (∘) [lam [(a [lam! {1}]) (b)] [applierify-varargs [lam [(applier)] + [a [applier b]]]]]] +[def (;;) [lam [(doc)] id]] +[program]] diff --git a/programs/noc/BroadcastMemory/broadcast_memory_noc.c b/programs/noc/BroadcastMemory/broadcast_memory_noc.c index 1335de9..38f1b99 100644 --- a/programs/noc/BroadcastMemory/broadcast_memory_noc.c +++ b/programs/noc/BroadcastMemory/broadcast_memory_noc.c @@ -16,40 +16,41 @@ int main2(); int main3(); int main() { - int core_id = read_csr(CSR_COREID); - switch(core_id) { - case 0: main0(); break; - case 1: main1(); break; - case 2: main2(); break; - case 3: main3(); break; - default: _fp_print(666); //ERROR - } + hello(); + // int core_id = read_csr(CSR_COREID); + // switch(core_id) { + // case 0: main0(); break; + // case 1: main1(); break; + // case 2: main2(); break; + // case 3: main3(); break; + // default: _fp_print(666); //ERROR + // } } -int main0() { - broadcast_count(0, 125); - read_n_words_and_print(1, EAST_INT); - read_n_words_and_print(2, NORTH_INT); - read_n_words_and_print(3, NORTHEAST_INT); -} - -int main1() { - read_n_words_and_print(0, EAST_INT); - broadcast_count(1, 17); - read_n_words_and_print(2, NORTHEAST_INT); - read_n_words_and_print(3, NORTH_INT); -} - -int main2() { - read_n_words_and_print(0, NORTH_INT); - read_n_words_and_print(1, NORTHEAST_INT); - broadcast_count(2, 42); - read_n_words_and_print(3, EAST_INT); -} - -int main3() { - read_n_words_and_print(0, NORTHEAST_INT); - read_n_words_and_print(1, NORTH_INT); - read_n_words_and_print(2, EAST_INT); - broadcast_count(3, 3); -} +// int main0() { +// broadcast_count(0, 125); +// read_n_words_and_print(1, EAST_INT); +// read_n_words_and_print(2, NORTH_INT); +// read_n_words_and_print(3, NORTHEAST_INT); +// } + +// int main1() { +// read_n_words_and_print(0, EAST_INT); +// broadcast_count(1, 17); +// read_n_words_and_print(2, NORTHEAST_INT); +// read_n_words_and_print(3, NORTH_INT); +// } + +// int main2() { +// read_n_words_and_print(0, NORTH_INT); +// read_n_words_and_print(1, NORTHEAST_INT); +// broadcast_count(2, 42); +// read_n_words_and_print(3, EAST_INT); +// } + +// int main3() { +// read_n_words_and_print(0, NORTHEAST_INT); +// read_n_words_and_print(1, NORTH_INT); +// read_n_words_and_print(2, EAST_INT); +// broadcast_count(3, 3); +// } From 23d365939db1baccace939d575ed88c756da23e0 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Fri, 3 Feb 2023 18:05:02 -0800 Subject: [PATCH 31/34] Start porting assembly to rvg. This is close to 600 lines of assembly all written in a very brittle way. It is not tested yet. Also note that the result of porting it will not really take advantage of all the code generator's dynamic checks etc. because it is translated very directly from C macros. --- flexpret | 2 +- programs/noc/BroadcastMemory/asm/ctrl.rvg | 13 + programs/noc/BroadcastMemory/asm/flexpret.rvg | 7 + .../asm/flexpret_noc_c_api.rvg | 0 .../asm/flexpret_noc_low_level_interface.rvg | 244 ++++++++++++++++++ programs/noc/BroadcastMemory/asm/hello.rvg | 9 +- programs/noc/BroadcastMemory/asm/stdlib.rvg | 4 + 7 files changed, 272 insertions(+), 7 deletions(-) create mode 100644 programs/noc/BroadcastMemory/asm/flexpret.rvg create mode 100644 programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg create mode 100644 programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg diff --git a/flexpret b/flexpret index 8465589..45eba52 160000 --- a/flexpret +++ b/flexpret @@ -1 +1 @@ -Subproject commit 846558974d25e54e2fe04522e9f37a7e165f264a +Subproject commit 45eba525e5b9f58676dd2dca09ca96e82ae7f654 diff --git a/programs/noc/BroadcastMemory/asm/ctrl.rvg b/programs/noc/BroadcastMemory/asm/ctrl.rvg index 08c7384..49e0725 100644 --- a/programs/noc/BroadcastMemory/asm/ctrl.rvg +++ b/programs/noc/BroadcastMemory/asm/ctrl.rvg @@ -70,6 +70,19 @@ [id acc] lw [id s|t|a][id k] [* {4} k](sp) }]]]]] +[def (file) [lam [(function-names frag!) (body block!)] + [mu [] + [[println {.global [id function-names]}]] + [[println {.section .text}]] + [[println body]] + [[println {.end}]]]]] [def (prologue prologue-epilogue!) [stackpush {s}]] [def (epilogue prologue-epilogue!) [stackpop {s}]] +[def (mul4) [lam [(rd output!) (rs1 input!)] { + slli rd rs1 2 +}]] +[def (mul5) [lam [(rd output!) (rs1 input!)] { + [mul4 rd rs1] + add rd rd rs1 +}]] [program]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret.rvg b/programs/noc/BroadcastMemory/asm/flexpret.rvg new file mode 100644 index 0000000..7f633b6 --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/flexpret.rvg @@ -0,0 +1,7 @@ +[mu [(program)] +[def (fp-print) [lam [(r input!) (c0 clobber!)] { + li c0 0xbaaabaaa + csrw 0x51e c0 + csrw 0x51e r +}]] +[program]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg b/programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg new file mode 100644 index 0000000..e69de29 diff --git a/programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg b/programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg new file mode 100644 index 0000000..9f8231b --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg @@ -0,0 +1,244 @@ +[mu [(program)] +[def (quintet!) [-> [lamof [cycles! {2}] [cycles! {1}] [cycles! {1}] input! input!] [cycles! {5}]]] +[def (northeast-quintet quintet) + [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { + sw to-send 0(noc-base-address-reg) + asm2cycles + asm1cycle0 + asm1cycle1 + }] +[def (north-quintet quintet!) + [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { + asm2cycles + sw to-send 0(noc-base-address-reg) + asm1cycle0 + asm1cycle1 + }] +[def (east-quintet quintet!) + [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { + asm2cycles + asm1cycle0 + sw to-send 0(noc-base-address-reg) + asm1cycle1 + }] +[def (load-noc-base-address [-> [lamof output!] [cycles! {1}]) [lam [(r)] { + li r 0x80000000 + }]] +[;; { +The instruction immediately following `sync5` is able to store a word +into the zeroth TDM slot (northwest) in a single-threaded setting. +It takes 9-13 cycles to synchronize. The 0-4 is fundamental and the remaining +9 cycles are overhead.}] +[def (sync5) [lam [(noc-base-address clobber!) (c0 clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!)] { + li c0 1 + li c1 2 + li c2 3 + [load-noc-base-address noc-base-address] + lw c3 32(noc-base-address) + beq c3 c0 DONE_SYNCHRONIZING + beq c3 zero DONE_SYNCHRONIZING + beq c3 c2 DONE_SYNCHRONIZING + beq c3 c1 DONE_SYNCHRONIZING + DONE_SYNCHRONIZING: + }]] +[;; { +Broadcast the value stored in reg to all other cores. The instruction immediately +following BROADCAST_SYNCED is able to store into the zeroth TDM slot. +This assumes that the current thread is synchronized to the TDM schedule.}] +[def (broadcast-synced-with-instructions) [lam [ + (noc-base-address input!) + (reg input!) + (i0 [cycles! {1}]) + (i1 [cycles! {1}])] { + sw reg 0(noc-base-address) + [id i0] + sw reg 0(noc-base-address) + sw reg 0(noc-base-address) + [id i1]}]] +[def (broadcast-synced) [lam [(noc-base-address input!) (reg input!)] + [broadcast-synced-with-instructions {nop} {nop}]]] +[def (load-noc-core-base-address) [lam [(noc-core-base-address output!) (sending-core-reg input!) (c0 clobber!)]] { + [load-noc-base-address noc-core-base-address] + slli c0 sending-core-reg 2 + add noc-core-base-address c0 noc-core-base-address + }] +[def (block-on-flit-from-core) [lam [(noc-core-base-address input!) (c0 clobber!)] { + BLOCKING_READ_POLL: + lw c0 16(noc-core-base-address) + beq zero c0 BLOCKING_READ_POLL + }]] +[def (blocking-read) [lam [(noc-core-base-address clobber!) (read-to-reg output!) (sending-core-reg input!)] { + [load-noc-core-base-address noc-core-base-address sending-core-reg read-to-reg] + [block-on-flit-from-core noc-core-base-address read-to-reg] + lw read-to-reg 0(noc-core-base-address) + }]] +[def (or-validity-of-noc-data) [lam [ + (noc-base-address-reg input!) + (accumulator-reg input! output!) + (offset num!) + (c0 clobber!)] { + lw c0 offset(noc-base-address-reg) + or accumulator-reg accumulator-reg c0 + }]] +[def (read-and-fail-if-tag-bit-is-1) [lam [ + (noc-base-address-reg input!) + (read-to-reg output!) + (sending-core-reg input!) + (result-reg output!) + (tag-bit-mask input!) + (fail-label frag!)] { + [blocking-read noc-base-address-reg read-to-reg sending-core-reg] + and result-reg read-to-reg tag-bit-mask + bnez result-reg fail-label + }]] +[;; { +@brief Try to send the number of words specified by n_words_reg. +@param n_words_reg The number of words to send. +@param result_reg 0 if successful, 1 if one of the potential message receivers has already tried +to send a message before this even tries to send a message, 0x80000000 if one of the potential +message receivers tries to send a message after this tries to send a message (so awkward! who +gets to talk first?) +@param initialize_asm Assembly code for preparing to send words, e.g. by loading the words into +the register file. Must take 2 cycles (mod 5). +@param send_words_asm Assembly code for sending the words rapidly. Must never miss a TDM slot! +May assume that the preceding code has already taken care of synchronization. Must preserve +synchronization. Must set n_words_reg to hold the number of words in the next contiguous sequence, +with the top bit set high if the length of prepare_for_next_send_words_asm is not exactly 11 +cycles. +@param prepare_for_next_send_words_asm Prepare for the next run of send_words_asm. +@param noc_base_address Register that will hold the NoC base address. No assumptions are made +about the original value held in this register (the NoC base address will be written into it +regardless). +The remaining parameters (e.g., SENDING_NORTHEAST_MACRO) must be either TRUE_MACRO or +FALSE_MACRO. +}] +[def (send-n-words) [lam [ + (n-words-reg input! clobber! [;; {only the tag bit is clobbered}]) + (result-reg output!) + (sending-northeast bool!) + (sending-north bool!) + (sending-east bool!) + (sending-to-0 bool!) + (sending-to-1 bool!) + (sending-to-2 bool!) + (sending-to-3 bool!) + (initialize-asm block!) + (send-words-asm block!) + (prepare-for-next-send-words-asm block!) + (noc-base-address clobber!) + (c0 clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!) (c4 clobber!) +] { + [load-noc-base-address noc-base-address] + add result-reg zero zero + [sending-to-0 [or-validity-of-noc-data noc-base-address result-reg {16} c1]] + [sending-to-1 [or-validity-of-noc-data noc-base-address result-reg {20} c1]] + [sending-to-2 [or-validity-of-noc-data noc-base-address result-reg {24} c1]] + [sending-to-3 [or-validity-of-noc-data noc-base-address result-reg {28} c1]] + bnez result-reg END ;; fail with error code 1 + [sync5 noc-base-address c1 c2 c3 c4] + li c1 0x80000000 ;; Set the top bit as a tag bit. If u wanna send so many words that this creates ambiguity, then u have a bigger problem on your hands + or n-words-reg n-words-reg c1 + [/* [def (send-num-words-or-nop) [lam [(? bool!)] [? {sw n-words-reg 0(noc-base-address)} {nop}]]]] + [send-num-words-or-nop sending-north ] + [send-num-words-or-nop sending-east ] + li c3 1 + [send-num-words-or-nop sending-northeast] + [/* [def (blocking-read-or-not) [lam [(? bool!) (receiving-core input!)] [? + [read-and-fail-if-tag-bit-is-1 noc-base-address c2 receiving-core result-reg c1 {END}] + {}]]]] + [blocking-read-or-not sending-to-0 {zero}] + [blocking-read-or-not sending-to-1 c3] + li c3 2 + [blocking-read-or-not sending-to-2 c3] + li c3 3 + [blocking-read-or-not sending-to-3 c3] + [id initialize-asm] + SENDING_SYNC_WORD: + [/* [def (sync-word) [lam [(? bool!)] {sw zero 0(noc-base-address)}]]] + [sync-word sending-northeast] + nop + [sync-word sending-north ] + [sync-word sending-east ] + nop + SEND_WORDS_ASM: + [id send-words-asm] + [send-num-words-or-nop sending-northeast] + nop + [send-num-words-or-nop sending-north ] + [send-num-words-or-nop sending-east ] + beqz n-words-reg END + [id prepare-for-next-send-words-asm] + bge n-words-reg zero SEND_WORDS_ASM + jal zero SENDING_SYNC_WORD + END: + }]] +[def (mul4-2cycles [cycles! {2}]) [lam [(in-reg input!) (out-reg output)]] { + [mul4 in-reg out-reg] + nop + }] +[;; { +@brief Read the number of words specified by the sender. +@param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the +receiver. +@param MUL_BY_RECEIVE_WORDS_PERIOD_2CYCLES Assembly that takes an in_reg and an out_reg +and sets the out_reg to the number of instructions in each 5-cycle subsequence of +receive_words_asm. +@param offset_numeric_literal 36 plus (4 times the number of instructions in +preparatory asm). This will be 48 unless one provides instructions that +cause stalls, such as loads, branches, or jumps. +@param hex_for_12_bit_distance_from_auipc_to_end 3-digit hex (no 0x prefix) for +offset_numeric_literal plus the offset of receive_words_asm. +@param sending_core_reg Input: A register specifying the sending core. Clobbered. +@param receive_words_asm Assembly that receives the sent words. Must read in its first cycle, and +exactly every 5 cycles thereafter! +@param noc_base_address Output: A register that will be set to the base address of the NoC +corresponding to the given core. +The remaining registers are all clobbers -- they have descriptive names, but they are not really +API except insofar as they are clobbered. t6 is also clobbered, and receive_words_asm cannot use +t6. The only clobber that receive_words_asm should use is clobber4. +}] +[def (read-n-words) [lam [ + (direction-quintet quintet!) + (mul-by-receive-words-period [cycles! {2}]) + (offset-numeric-literal num!) + (hex-for-12-bit-distance-from-auipc-to-end frag!) + (sending-core-reg input!) + (receive-words-asm block!) + (noc-base-address output!) + (packet-size-reg clobber!) (jalr-word-reg clobber!) (replaced-instruction-reg clobber!) (c4 clobber!) +] { + [blocking-read noc-base-address c4 sending-core-reg] + [mul-by-receive-words-period c4 packet-size-reg] + slli packet-size-reg packet-size-reg 2 + [sync5 sending-core-reg t6 jalr-word-reg replaced-instrucction-reg c4] + auipc t6 0 + add packet-size-reg t6 packet-size-reg + li jalr-word-reg 0x[id hex-for-12-bit-distance-from-auipc-to-end]F8067 + nop + [direction-quintet + {lw replaced-instruction-reg 52(packet-size-reg)} + {sw jalr-word-reg 52(packet-size-reg)} + {nop} + {zero} + noc-base-address] + WAIT_FOR_SYNC_WORD: + [block-on-flit-from-core noc-base-address c4] + nop + nop + RECEIVE_WORDS: + [id receive-words-asm] + sw replaced-instruction-reg 52(packet-size-reg) + nop + lw c4 0(noc-base-address) + beqz c4 END + [mul-by-receive-words-period c4 replaced-instruction-reg] + slli packet-size-reg replaced-instruction-reg 2 + add packet-size-reg t6 packet-size-reg + lw replaced-instruction-reg 52(packet-size-reg) + sw jalr-word-reg 52(packet-size-reg) + nop + blt c4 zero WAIT_FOR_SYNC_WORD + jal zero RECEIVE_WORDS + END: + }]] +[program]] diff --git a/programs/noc/BroadcastMemory/asm/hello.rvg b/programs/noc/BroadcastMemory/asm/hello.rvg index 29abbfa..926065a 100644 --- a/programs/noc/BroadcastMemory/asm/hello.rvg +++ b/programs/noc/BroadcastMemory/asm/hello.rvg @@ -1,11 +1,8 @@ -[stdlib [mu [] -[[println {.global hello}]] -[[println {.section .text}]] -[[println { +[stdlib [mu [] [ctrl [mu [] +[[file {hello} { hello: li t1 42 li t0 0xbaaabaaa csrw 0x51e t0 csrw 0x51e t1 -}]] -[[println {.end}]]]] +}]]]]]] diff --git a/programs/noc/BroadcastMemory/asm/stdlib.rvg b/programs/noc/BroadcastMemory/asm/stdlib.rvg index 29aa47b..1937e4a 100644 --- a/programs/noc/BroadcastMemory/asm/stdlib.rvg +++ b/programs/noc/BroadcastMemory/asm/stdlib.rvg @@ -40,4 +40,8 @@ [def (∘) [lam [(a [lam! {1}]) (b)] [applierify-varargs [lam [(applier)] [a [applier b]]]]]] [def (;;) [lam [(doc)] id]] +[def (/*) [lam [(ignored)] {}]] +[def (input!) reg!] +[def (output!) reg!] +[def (clobber!) reg!] [program]] From 6827fcc7f98c0322df4f3eac7bb1eea4176ddba4 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Sun, 5 Feb 2023 23:28:36 -0800 Subject: [PATCH 32/34] Top-level definitions parse for BroadcastCount. --- programs/noc/BroadcastMemory/Makefile | 6 +- programs/noc/BroadcastMemory/asm/ctrl.rvg | 25 ++- .../asm/flexpret-noc-c-api.rvg | 210 ++++++++++++++++++ ...g => flexpret-noc-low-level-interface.rvg} | 25 ++- programs/noc/BroadcastMemory/asm/flexpret.rvg | 6 + .../asm/flexpret_noc_c_api.rvg | 0 programs/noc/BroadcastMemory/asm/hello.rvg | 4 +- programs/noc/BroadcastMemory/asm/hello_h.rvg | 3 + .../BroadcastMemory/broadcast_memory_noc.c | 3 +- 9 files changed, 259 insertions(+), 23 deletions(-) create mode 100644 programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg rename programs/noc/BroadcastMemory/asm/{flexpret_noc_low_level_interface.rvg => flexpret-noc-low-level-interface.rvg} (95%) delete mode 100644 programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg create mode 100644 programs/noc/BroadcastMemory/asm/hello_h.rvg diff --git a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile index 89668c4..f0f325b 100644 --- a/programs/noc/BroadcastMemory/Makefile +++ b/programs/noc/BroadcastMemory/Makefile @@ -2,10 +2,12 @@ build: gen compile gen: mkdir -p asm-gen - rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello.rvg > asm-gen/hello2.s + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-flexpret-noc-c-api-h]]]" > asm-gen/flexpret-noc-c-api.h + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello.rvg > asm-gen/hello.s + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello_h.rvg > asm-gen/hello.h compile: - riscv_compile.sh ispm asm-gen/hello2.s broadcast_memory_noc.c + riscv_compile.sh ispm asm-gen/hello.s broadcast_memory_noc.c clean: rm -r asm-gen diff --git a/programs/noc/BroadcastMemory/asm/ctrl.rvg b/programs/noc/BroadcastMemory/asm/ctrl.rvg index 49e0725..80c2f8f 100644 --- a/programs/noc/BroadcastMemory/asm/ctrl.rvg +++ b/programs/noc/BroadcastMemory/asm/ctrl.rvg @@ -1,27 +1,38 @@ [mu [(program)] [def (condition!) [assertify [lam [(x)] [[hasattr {condition}] x]] {a condition}]] -[def (<) [lam [(rs1 reg!) (rs2 reg!)] +[def (make-condition) [lam [(true-branch frag!) (false-branch frag!)] [lam [(rs1 reg!) (rs2 reg!)] [[addattr {condition}] [lam [(label frag!)] [pair { - blt rs1 rs2 label + [true-branch] rs1 rs2 label } { - bgeu rs1 rs2 label - }]]]]] -[def (branch3) [lam [(condition condition!) (instrs block!)] + [false-branch] rs1 rs2 label + }]]]]]] +[def (<) [make-condition {blt} {bgeu}]] +[def (>=) [make-condition {bgeu} {blt}]] +[def (==) [make-condition {beq} {bne}]] +[def (!=) [make-condition {bne} {beq}]] +[def (branch3) [lam [(condition condition!) (instrs [cycles! {2}])] [def (assembly) { [id { [id [[condition {DONE}] false]] } ] [id instrs] DONE: }] [[unsafe-assert-exact-cycles {3}] assembly]]] -[def (unroll) [lam [(n num!) (body block!)] +[def (branch~) [lam [(condition condition!) (asm block!)] { + [[condition {DONE}] false] + [id asm] + DONE: + }]] +[def (unroll) [lam [(n num!) (body [-> [lamof num!] block!])] [[unsafe-assert-exact-cycles [* n [cycles? body]]] [[[fold_range {1} {0} n] {}] [lam [(acc) (_)] { [id acc] - [id body] + [body n] }]]]]] +[def (repeat) [lam [(n num!) (body block!)] + [unroll [lam [(_)] body]]]] [def (for) [lam [ (start reg!) (stop reg!) (step num! [>! {0}]) diff --git a/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg b/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg new file mode 100644 index 0000000..5f7c801 --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg @@ -0,0 +1,210 @@ +[stdlib [mu [] [ctrl [mu [] [flexpret [mu [] [flexpret-noc-low-level-interface [mu [] [lam [(program)] + +[def (northeast-int) {0}] +[def (north-int) {8}] +[def (east-int) {12}] + +[;; {code related to READ_N_WORDS_AND_PRINT *****************************************************/}] + +[def (load-and-print-receiver-body) [lam [(noc-base-address input!) (c0 clobber!) (c1 clobber!)] { + [unroll {64} { + lw c0 0(noc-base-address) + nop + csrw 0x51e c1 + csrw 0x51e c0 + }] + nop + }]] + +[;; { +@brief Block and print up to 64 words that were sent by another core using the protocol of +SEND_N_WORDS. +@param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the +receiver. +@param sender_reg A register containing the core ID of the sender. +The remaining registers are all clobbers. +}] +[def (read-n-words-and-print) [lam [ + (sender-quintet quintet!) + (sender-reg input!) + (c0 clobber!) + (c1 clobber!) + (c2 clobber!) + (c3 clobber!) + (c4 clobber!) + (c5 clobber!) +] + [def (body) [load-and-print-receiver-body c0 c4 c5]] + [read-n-words sender-quintet mul4-2cycles 48, 438, sender-reg body c0 c1 c2 c3 c4]]] + +[def (print-flexpret-noc-c-api-h) [print { + #define NORTHEAST [id northeast-int] + #define NORTH [id north-int] + #define EAST [id east-int] + }]] + +[def (print-read-n-words-and-print) [lam [] + [def (sender-reg) {a0}] + [def (sender-quintet) {a1}] + [def (receive-from-direction-if-applicable) [lam [(direction quintet!)] + [branch~ + [== x0 sender-quintet] + [read-n-words-and-print direction sender-reg {t0} {t1} {t2} {t3} {t4} {t5}]]]] + [[file {read_n_words_and_print} { + read_n_words_and_print: + [receive-from-direction-if-applicable northeast-quintet] + addi sender-quintet [- north-int northeast-int] + [receive-from-direction-if-applicable north-quintet] + addi sender-quintet [- east-int north-int] + [receive-from-direction-if-applicable east-quintet] + [branch~ + [!= x0 sender-quintet] + [fp-print-int {666} {t0} {t1}]] + jalr zero ra 0 + }]]]] + +[;; {code related to BROADCAST_COUNT ************************************************************/}] + +[def (broadcast-count-send-asm) [lam [ + (n-words-reg output!) + (countdown-reg input! output!) + (noc-base-address input!) + (c0 clobber!) +] { + [repeat {16} { + sw countdown-reg 0(noc-base-address) + addi c0 countdown-reg -1 + sw countdown-reg 0(noc-base-address) + sw countdown-reg 0(noc-base-address) + bge zero c0 END_BODY + sw c0 0(noc-base-address) + addi countdown-reg c0 -1 + sw c0 0(noc-base-address) + sw c0 0(noc-base-address) + bge zero countdown-reg END_BODY + nop + nop}] + jal zero END + END_BODY: + nop + add n-words-reg zero zero + nop + END: + }]] +[;; { +@brief n_words_reg = total_num_words_to_send_reg - max_sequence_length_reg +> max_sequence_length_reg ? max_sequence_length_reg +: total_num_words_to_send_reg - max_sequence_length_reg < 0 ? 0 +: total_num_words_to_send_reg - max_sequence_length_reg +@param total_num_words_to_send_reg Input +@param n_words_reg Output: Register to be set to closest integer in the range from 0 to max_sequence_length (inclusive) to +the value in total_num_words_to_send_reg minus max_sequence_length +@param max_sequence_length_reg Input +}] +[def (initialize-n-words-asm) [lam [ + (total-num-words-to-send-reg input!) + (n-words-reg output!) + (max-sequence-length-reg input!) +] { + sub n-words-reg total-num-words-to-send-reg max-sequence-length-reg + bge n-words-reg zero DONE_WITH_SMALL_CASE + add n-words-reg zero zero + nop + DONE_WITH_SMALL_CASE: + bge max-sequence-length-reg n-words-reg DONE_WITH_BIG_CASE + add n-words-reg max-sequence-length-reg zero + nop + DONE_WITH_BIG_CASE: + }]] +[def (cap-at-max-sequence-length) [lam [ + (total-num-words-to-send-reg input!) + (n-words-reg output!) + (max-sequence-length-reg output!) + (max-sequence-length-literal num!) +] { + addi max-sequence-length-reg zero max-sequence-length-literal + li n-words-reg max-sequence-length-literal + bge total-num-words-to-send-reg max-sequence-length-reg DONE + add n-words-reg total-num-words-to-send-reg zero + DONE + }]] +[def (broadcast-count-prepare-next-send-asm) [lam [ + (countdown-reg input! output!) + (n-words-reg input! output!) + (max-sequence-length-reg clobber!) +] { + li max-sequence-length-reg 32 + addi n-words-reg countdown-reg -32 + bge max-sequence-length-reg n-words-reg DONE + li n-words-reg 32 + nop + DONE: + andi n-words-reg n-words-reg 1023 + nop + }]] + +[;; { +@brief Broadcast a countdown from countdown_reg to 1 to all other cores. +@param SENDING_TO_ZERO_MACRO, ..., SENDING_TO_THREE_MACRO: All of these should be TRUE_MACRO +except the one corresponding to the current core (which does not broadcast to itself). +@param countdown_reg Input: The first number in the countdown. +@param result_reg Output: Indicates whether operation succeeded. See SEND_N_WORDS for details. +@param noc_base_address Clobber. +}] +[def (broadcast-count) [lam [ + (sending-to-0 bool!) + (sending-to-1 bool!) + (sending-to-2 bool!) + (sending-to-3 bool!) + (countdown-reg input! clobber!) + (result-reg output!) + (noc-base-address clobber!) + (n-words-reg clobber!) + (max-sequence-length-reg clobber!) + (c1 clobber!) + (c2 clobber!) + (c3 clobber!) + (c4 clobber!) +] { + [cap-at-max-sequence-length countdown-reg n-words-reg max-sequence-length-reg {32}] + [send-n-words + n-words-reg + result-reg + true true true + sending-to-0 sending-to-1 sending-to-2 sending-to-3 + [initialize-n-words-asm countdown-reg n-words-reg max-sequence-length-reg] + [broadcast-count-send-asm n-words-reg countdown-reg noc-base-address clobber3] + [broadcast-count-prepare-next-send-asm countdown-reg n-words-reg c2] + noc-base-address c1 c2 c3 c4 + ] + }]] +[def (print-broadcast-count) [lam [] + [def (current-core) {a0}] + [def (start-count-at) {a1}] + [def (send-from-direction-if-applicable) [lam [(notfrom0 bool!) (notfrom1 bool!) (notfrom2 bool!) (notfrom3 bool!)] + [branch~ + [== x0 current-core] + [broadcast-count + notfrom0 + notfrom1 + notfrom2 + notfrom3 + start-count-at + t0 t1 t2 t3 t4 t5 t6 a5]]]] + [[file {broadcast_count} { + broadcast_count: + [receive-from-direction-if-applicable northeast-quintet] + addi sender-quintet [- north-int northeast-int] + [receive-from-direction-if-applicable north-quintet] + addi sender-quintet [- east-int north-int] + [receive-from-direction-if-applicable east-quintet] + [branch~ + [!= x0 sender-quintet] + [fp-print-int {666}]] + jalr zero ra zero + [send-from-direction-if-applicable false true true true] + [send-from-direction-if-applicable true false true true] + [send-from-direction-if-applicable true true false true] + [send-from-direction-if-applicable true true true false] + }]]]] +[program]]]]]]]]]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg b/programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg similarity index 95% rename from programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg rename to programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg index 9f8231b..3559ca3 100644 --- a/programs/noc/BroadcastMemory/asm/flexpret_noc_low_level_interface.rvg +++ b/programs/noc/BroadcastMemory/asm/flexpret-noc-low-level-interface.rvg @@ -1,27 +1,30 @@ [mu [(program)] +[def (northeast-int) {0 }] +[def (north-int ) {8 }] +[def (east-int ) {12}] [def (quintet!) [-> [lamof [cycles! {2}] [cycles! {1}] [cycles! {1}] input! input!] [cycles! {5}]]] -[def (northeast-quintet quintet) +[def (northeast-quintet quintet!) [lam [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { sw to-send 0(noc-base-address-reg) asm2cycles asm1cycle0 asm1cycle1 - }] -[def (north-quintet quintet!) + }]] +[def (north-quintet quintet!) [lam [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { asm2cycles sw to-send 0(noc-base-address-reg) asm1cycle0 asm1cycle1 - }] -[def (east-quintet quintet!) + }]] +[def (east-quintet quintet!) [lam [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { asm2cycles asm1cycle0 sw to-send 0(noc-base-address-reg) asm1cycle1 - }] -[def (load-noc-base-address [-> [lamof output!] [cycles! {1}]) [lam [(r)] { + }]] +[def (load-noc-base-address [-> [lamof output!] [cycles! {1}]]) [lam [(r)] { li r 0x80000000 }]] [;; { @@ -57,11 +60,11 @@ This assumes that the current thread is synchronized to the TDM schedule.}] [id i1]}]] [def (broadcast-synced) [lam [(noc-base-address input!) (reg input!)] [broadcast-synced-with-instructions {nop} {nop}]]] -[def (load-noc-core-base-address) [lam [(noc-core-base-address output!) (sending-core-reg input!) (c0 clobber!)]] { +[def (load-noc-core-base-address) [lam [(noc-core-base-address output!) (sending-core-reg input!) (c0 clobber!)] { [load-noc-base-address noc-core-base-address] slli c0 sending-core-reg 2 add noc-core-base-address c0 noc-core-base-address - }] + }]] [def (block-on-flit-from-core) [lam [(noc-core-base-address input!) (c0 clobber!)] { BLOCKING_READ_POLL: lw c0 16(noc-core-base-address) @@ -172,10 +175,10 @@ FALSE_MACRO. jal zero SENDING_SYNC_WORD END: }]] -[def (mul4-2cycles [cycles! {2}]) [lam [(in-reg input!) (out-reg output)]] { +[def (mul4-2cycles [-> id [cycles! {2}]]) [lam [(in-reg input!) (out-reg output!)] { [mul4 in-reg out-reg] nop - }] + }]] [;; { @brief Read the number of words specified by the sender. @param DIRECTION_QUINTET_MACRO The macro corresponding to the direction of the sender from the diff --git a/programs/noc/BroadcastMemory/asm/flexpret.rvg b/programs/noc/BroadcastMemory/asm/flexpret.rvg index 7f633b6..463f414 100644 --- a/programs/noc/BroadcastMemory/asm/flexpret.rvg +++ b/programs/noc/BroadcastMemory/asm/flexpret.rvg @@ -4,4 +4,10 @@ csrw 0x51e c0 csrw 0x51e r }]] +[def (fp-print-int) [lam [(k num!) (c0 clobber!) (c1 clobber!)] { + li c0 0xbaaabaaa + li c1 [id k] + csrw 0x51e c0 + csrw 0x51e c1 +}]] [program]] diff --git a/programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg b/programs/noc/BroadcastMemory/asm/flexpret_noc_c_api.rvg deleted file mode 100644 index e69de29..0000000 diff --git a/programs/noc/BroadcastMemory/asm/hello.rvg b/programs/noc/BroadcastMemory/asm/hello.rvg index 926065a..482b589 100644 --- a/programs/noc/BroadcastMemory/asm/hello.rvg +++ b/programs/noc/BroadcastMemory/asm/hello.rvg @@ -1,8 +1,8 @@ [stdlib [mu [] [ctrl [mu [] [[file {hello} { hello: - li t1 42 li t0 0xbaaabaaa csrw 0x51e t0 - csrw 0x51e t1 + csrw 0x51e a0 + jalr zero ra 0 }]]]]]] diff --git a/programs/noc/BroadcastMemory/asm/hello_h.rvg b/programs/noc/BroadcastMemory/asm/hello_h.rvg new file mode 100644 index 0000000..0244fed --- /dev/null +++ b/programs/noc/BroadcastMemory/asm/hello_h.rvg @@ -0,0 +1,3 @@ +[[print { + void hello(int n); +}]] diff --git a/programs/noc/BroadcastMemory/broadcast_memory_noc.c b/programs/noc/BroadcastMemory/broadcast_memory_noc.c index 38f1b99..f52b0cb 100644 --- a/programs/noc/BroadcastMemory/broadcast_memory_noc.c +++ b/programs/noc/BroadcastMemory/broadcast_memory_noc.c @@ -1,6 +1,7 @@ #include #include #include +#include "asm-gen/hello.h" /* | NE | _ | N | E | _ | */ @@ -16,7 +17,7 @@ int main2(); int main3(); int main() { - hello(); + hello(41); // int core_id = read_csr(CSR_COREID); // switch(core_id) { // case 0: main0(); break; From 9bf0027b18808f79fa466dc0bc567b95ff6f5701 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Mon, 6 Feb 2023 02:24:13 -0800 Subject: [PATCH 33/34] BroadcastCount assembly is generated. I have not verified that the assembly is _correct_... --- programs/noc/BroadcastMemory/Makefile | 6 ++ programs/noc/BroadcastMemory/asm/ctrl.rvg | 18 ++--- .../asm/flexpret-noc-c-api.rvg | 38 +++++------ .../asm/flexpret-noc-low-level-interface.rvg | 68 +++++++++++-------- 4 files changed, 75 insertions(+), 55 deletions(-) diff --git a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile index f0f325b..8ef9946 100644 --- a/programs/noc/BroadcastMemory/Makefile +++ b/programs/noc/BroadcastMemory/Makefile @@ -2,7 +2,13 @@ build: gen compile gen: mkdir -p asm-gen + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-flexpret-noc-c-api-h]]]" > asm-gen/flexpret-noc-c-api.h + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-read-n-words-and-print]]]" > asm-gen/read-n-words-and-print.s + + rvg asm/stdlib.rvg asm/ctrl.rvg asm/flexpret.rvg asm/flexpret-noc-low-level-interface.rvg asm/flexpret-noc-c-api.rvg "flexpret-noc-c-api.rvg=[flexpret-noc-c-api [mu [] [print-broadcast-count]]]" > asm-gen/broadcast-count.s + rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello.rvg > asm-gen/hello.s rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello_h.rvg > asm-gen/hello.h diff --git a/programs/noc/BroadcastMemory/asm/ctrl.rvg b/programs/noc/BroadcastMemory/asm/ctrl.rvg index 80c2f8f..3eb520a 100644 --- a/programs/noc/BroadcastMemory/asm/ctrl.rvg +++ b/programs/noc/BroadcastMemory/asm/ctrl.rvg @@ -4,9 +4,9 @@ [[addattr {condition}] [lam [(label frag!)] [pair { - [true-branch] rs1 rs2 label + [id true-branch] rs1 rs2 label } { - [false-branch] rs1 rs2 label + [id false-branch] rs1 rs2 label }]]]]]] [def (<) [make-condition {blt} {bgeu}]] [def (>=) [make-condition {bgeu} {blt}]] @@ -25,14 +25,16 @@ DONE: }]] [def (unroll) [lam [(n num!) (body [-> [lamof num!] block!])] - [[unsafe-assert-exact-cycles [* n [cycles? body]]] - [[[fold_range {1} {0} n] {}] + [[[fold-range {1} {0} n] {}] [lam [(acc) (_)] { [id acc] [body n] - }]]]]] + }]]]] +[def (repeat~) [lam [(n num!) (body block!)] + [unroll n [lam [(_)] body]]]] [def (repeat) [lam [(n num!) (body block!)] - [unroll [lam [(_)] body]]]] + [[unsafe-assert-exact-cycles [* n [cycles? body]]] + [repeat~ n body]]]] [def (for) [lam [ (start reg!) (stop reg!) (step num! [>! {0}]) @@ -70,13 +72,13 @@ [lamof [∘ num! [∘ [>=! {0}] [ [lamof [cycles! {2}] [cycles! {1}] [cycles! {1}] input! input!] [cycles! {5}]]] -[def (northeast-quintet quintet!) [lam - [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { - sw to-send 0(noc-base-address-reg) - asm2cycles - asm1cycle0 - asm1cycle1 +[def (northeast-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) +] { + sw to-send 0(noc-base-address) + [id asm2cycles] + [id asm1cycle0] + [id asm1cycle1] }]] -[def (north-quintet quintet!) [lam - [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { - asm2cycles - sw to-send 0(noc-base-address-reg) - asm1cycle0 - asm1cycle1 +[def (north-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) +] { + [id asm2cycles] + sw to-send 0(noc-base-address) + [id asm1cycle0] + [id asm1cycle1] }]] -[def (east-quintet quintet!) [lam - [(asm2cycles) (asm1cycle0) (asm1cycle0) (to-send-reg) (noc-base-address)] { - asm2cycles - asm1cycle0 - sw to-send 0(noc-base-address-reg) - asm1cycle1 +[def (east-quintet quintet!) [lam [ + (asm2cycles [cycles! {2}]) + (asm1cycle0 [cycles! {1}]) + (asm1cycle1 [cycles! {1}]) + (to-send input!) + (noc-base-address input!) +] { + [id asm2cycles] + [id asm1cycle0] + sw to-send 0(noc-base-address) + [id asm1cycle1] }]] [def (load-noc-base-address [-> [lamof output!] [cycles! {1}]]) [lam [(r)] { li r 0x80000000 @@ -128,15 +143,14 @@ FALSE_MACRO. (initialize-asm block!) (send-words-asm block!) (prepare-for-next-send-words-asm block!) - (noc-base-address clobber!) - (c0 clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!) (c4 clobber!) + (noc-base-address clobber!) (c1 clobber!) (c2 clobber!) (c3 clobber!) (c4 clobber!) ] { [load-noc-base-address noc-base-address] add result-reg zero zero - [sending-to-0 [or-validity-of-noc-data noc-base-address result-reg {16} c1]] - [sending-to-1 [or-validity-of-noc-data noc-base-address result-reg {20} c1]] - [sending-to-2 [or-validity-of-noc-data noc-base-address result-reg {24} c1]] - [sending-to-3 [or-validity-of-noc-data noc-base-address result-reg {28} c1]] + [sending-to-0 [or-validity-of-noc-data noc-base-address result-reg {16} c1] {}] + [sending-to-1 [or-validity-of-noc-data noc-base-address result-reg {20} c1] {}] + [sending-to-2 [or-validity-of-noc-data noc-base-address result-reg {24} c1] {}] + [sending-to-3 [or-validity-of-noc-data noc-base-address result-reg {28} c1] {}] bnez result-reg END ;; fail with error code 1 [sync5 noc-base-address c1 c2 c3 c4] li c1 0x80000000 ;; Set the top bit as a tag bit. If u wanna send so many words that this creates ambiguity, then u have a bigger problem on your hands @@ -202,7 +216,7 @@ t6. The only clobber that receive_words_asm should use is clobber4. }] [def (read-n-words) [lam [ (direction-quintet quintet!) - (mul-by-receive-words-period [cycles! {2}]) + (mul-by-receive-words-period [-> id [cycles! {2}]]) (offset-numeric-literal num!) (hex-for-12-bit-distance-from-auipc-to-end frag!) (sending-core-reg input!) @@ -213,10 +227,10 @@ t6. The only clobber that receive_words_asm should use is clobber4. [blocking-read noc-base-address c4 sending-core-reg] [mul-by-receive-words-period c4 packet-size-reg] slli packet-size-reg packet-size-reg 2 - [sync5 sending-core-reg t6 jalr-word-reg replaced-instrucction-reg c4] + [sync5 sending-core-reg {t6} jalr-word-reg replaced-instruction-reg c4] auipc t6 0 add packet-size-reg t6 packet-size-reg - li jalr-word-reg 0x[id hex-for-12-bit-distance-from-auipc-to-end]F8067 + li jalr-word-reg [id {0x}][id hex-for-12-bit-distance-from-auipc-to-end]F8067 nop [direction-quintet {lw replaced-instruction-reg 52(packet-size-reg)} From f066f422ee09c32968776f741aad5aee58b62ee3 Mon Sep 17 00:00:00 2001 From: Peter Donovan Date: Mon, 6 Feb 2023 21:14:44 -0800 Subject: [PATCH 34/34] Struggle to get assembly to work. I stopped working on this 20 hours ago; just checkpointing old work here. --- programs/noc/BroadcastMemory/Makefile | 2 +- .../asm/flexpret-noc-c-api.rvg | 12 +-- .../BroadcastMemory/broadcast_memory_noc.c | 78 ++++++++++--------- 3 files changed, 47 insertions(+), 45 deletions(-) diff --git a/programs/noc/BroadcastMemory/Makefile b/programs/noc/BroadcastMemory/Makefile index 8ef9946..28e29a8 100644 --- a/programs/noc/BroadcastMemory/Makefile +++ b/programs/noc/BroadcastMemory/Makefile @@ -13,7 +13,7 @@ gen: rvg asm/stdlib.rvg asm/ctrl.rvg asm/hello_h.rvg > asm-gen/hello.h compile: - riscv_compile.sh ispm asm-gen/hello.s broadcast_memory_noc.c + riscv_compile.sh ispm asm-gen/hello.s asm-gen/read-n-words-and-print.s asm-gen/broadcast-count.s broadcast_memory_noc.c clean: rm -r asm-gen diff --git a/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg b/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg index 00dcbc2..e0ffc28 100644 --- a/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg +++ b/programs/noc/BroadcastMemory/asm/flexpret-noc-c-api.rvg @@ -38,9 +38,11 @@ The remaining registers are all clobbers. [read-n-words sender-quintet mul4-2cycles {48} {438} sender-reg body c0 c1 c2 c3 c4]]] [def (print-flexpret-noc-c-api-h) [print { - #define NORTHEAST [id northeast-int] - #define NORTH [id north-int] - #define EAST [id east-int] + #define NORTHEAST_INT [id northeast-int] + #define NORTH_INT [id north-int] + #define EAST_INT [id east-int] + void read_n_words_and_print(int sender, int sender_direction); + void broadcast_count(int current_core, int start_count_at); }]] [def (print-read-n-words-and-print) [lam [] @@ -52,14 +54,12 @@ The remaining registers are all clobbers. [read-n-words-and-print direction sender-reg {t0} {t1} {t2} {t3} {t4} {t5}]]]] [[file {read_n_words_and_print} { read_n_words_and_print: + [fp-print sender-quintet {t0}] [receive-from-direction-if-applicable northeast-quintet] addi sender-quintet sender-quintet [- north-int northeast-int] [receive-from-direction-if-applicable north-quintet] addi sender-quintet sender-quintet [- east-int north-int] [receive-from-direction-if-applicable east-quintet] - [branch~ - [!= {zero} sender-quintet] - [fp-print-int {666} {t0} {t1}]] jalr zero ra 0 }]]]] diff --git a/programs/noc/BroadcastMemory/broadcast_memory_noc.c b/programs/noc/BroadcastMemory/broadcast_memory_noc.c index f52b0cb..7be3094 100644 --- a/programs/noc/BroadcastMemory/broadcast_memory_noc.c +++ b/programs/noc/BroadcastMemory/broadcast_memory_noc.c @@ -1,7 +1,9 @@ #include #include -#include -#include "asm-gen/hello.h" +// #include + +// #include "asm-gen/hello.h" +#include "asm-gen/flexpret-noc-c-api.h" /* | NE | _ | N | E | _ | */ @@ -17,41 +19,41 @@ int main2(); int main3(); int main() { - hello(41); - // int core_id = read_csr(CSR_COREID); - // switch(core_id) { - // case 0: main0(); break; - // case 1: main1(); break; - // case 2: main2(); break; - // case 3: main3(); break; - // default: _fp_print(666); //ERROR - // } + // hello(41); + int core_id = read_csr(CSR_COREID); + switch(core_id) { + case 0: main0(); break; + case 1: main1(); break; + case 2: main2(); break; + case 3: main3(); break; + default: _fp_print(666); //ERROR + } +} + +int main0() { + broadcast_count(0, 125); + read_n_words_and_print(1, EAST_INT); + read_n_words_and_print(2, NORTH_INT); + read_n_words_and_print(3, NORTHEAST_INT); +} + +int main1() { + read_n_words_and_print(0, EAST_INT); + broadcast_count(1, 17); + read_n_words_and_print(2, NORTHEAST_INT); + read_n_words_and_print(3, NORTH_INT); +} + +int main2() { + read_n_words_and_print(0, NORTH_INT); + read_n_words_and_print(1, NORTHEAST_INT); + broadcast_count(2, 42); + read_n_words_and_print(3, EAST_INT); } -// int main0() { -// broadcast_count(0, 125); -// read_n_words_and_print(1, EAST_INT); -// read_n_words_and_print(2, NORTH_INT); -// read_n_words_and_print(3, NORTHEAST_INT); -// } - -// int main1() { -// read_n_words_and_print(0, EAST_INT); -// broadcast_count(1, 17); -// read_n_words_and_print(2, NORTHEAST_INT); -// read_n_words_and_print(3, NORTH_INT); -// } - -// int main2() { -// read_n_words_and_print(0, NORTH_INT); -// read_n_words_and_print(1, NORTHEAST_INT); -// broadcast_count(2, 42); -// read_n_words_and_print(3, EAST_INT); -// } - -// int main3() { -// read_n_words_and_print(0, NORTHEAST_INT); -// read_n_words_and_print(1, NORTH_INT); -// read_n_words_and_print(2, EAST_INT); -// broadcast_count(3, 3); -// } +int main3() { + read_n_words_and_print(0, NORTHEAST_INT); + read_n_words_and_print(1, NORTH_INT); + read_n_words_and_print(2, EAST_INT); + broadcast_count(3, 3); +}