From 0f9d45663815ff69b8414979ed0649da17444caa Mon Sep 17 00:00:00 2001 From: Adam Bien Date: Sat, 11 Apr 2026 17:08:50 +0200 Subject: [PATCH 1/3] Updated to Java 25 and TornadoVM 4.0.0 --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 111a66aa..cfa80baa 100644 --- a/pom.xml +++ b/pom.xml @@ -39,12 +39,12 @@ 0.4.0 - 3.0.0 + 4.0.0 ${tornadovm.base.version} - 21 - 21 + 25 + 25 UTF-8 true true From 10caa33e84df671f9bf0182524b003c33c58c642 Mon Sep 17 00:00:00 2001 From: Adam Bien Date: Sat, 11 Apr 2026 17:15:47 +0200 Subject: [PATCH 2/3] Java 25 Script with metal support implemented --- llamaTornado | 364 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100755 llamaTornado diff --git a/llamaTornado b/llamaTornado new file mode 100755 index 00000000..6cc303d5 --- /dev/null +++ b/llamaTornado @@ -0,0 +1,364 @@ +#!/usr/bin/env -S java --source 25 + +import module java.logging; + +String name = MethodHandles.lookup().lookupClass().getName(); +String version = "2026-04-11.1"; + +enum Backend { OPENCL, PTX, METAL } + +record Config( + String modelPath, String prompt, String systemPrompt, + double temperature, double topP, long seed, int maxTokens, + boolean stream, boolean echo, boolean interactive, boolean instruct, + boolean useGpu, Backend backend, String gpuMemory, + String heapMin, String heapMax, + boolean debug, boolean profiler, String profilerDumpDir, + boolean printBytecodes, boolean threads, boolean printKernel, + boolean fullDump, boolean verboseInit, + boolean showCommand, boolean executeAfterShow, + String openclFlags, int maxWaitEvents, boolean verbose +) {} + +Config parseArgs(String[] args) { + String modelPath = null; + String prompt = null; + String systemPrompt = null; + double temperature = 0.1; + double topP = 0.95; + long seed = System.currentTimeMillis() / 1000; + int maxTokens = 512; + boolean stream = true; + boolean echo = false; + boolean 
interactive = false; + boolean instruct = true; + boolean useGpu = false; + Backend backend = Backend.OPENCL; + String gpuMemory = "14GB"; + String heapMin = "20g"; + String heapMax = "20g"; + boolean debug = false; + boolean profiler = false; + String profilerDumpDir = null; + boolean printBytecodes = false; + boolean threads = false; + boolean printKernel = false; + boolean fullDump = false; + boolean verboseInit = false; + boolean showCommand = false; + boolean executeAfterShow = false; + String openclFlags = "-cl-denorms-are-zero -cl-no-signed-zeros -cl-finite-math-only"; + int maxWaitEvents = 32000; + boolean verbose = false; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--model", "-m" -> modelPath = args[++i]; + case "--prompt", "-p" -> prompt = args[++i]; + case "--system-prompt", "-sp" -> systemPrompt = args[++i]; + case "--temperature" -> temperature = Double.parseDouble(args[++i]); + case "--top-p" -> topP = Double.parseDouble(args[++i]); + case "--seed" -> seed = Long.parseLong(args[++i]); + case "--max-tokens", "-n" -> maxTokens = Integer.parseInt(args[++i]); + case "--stream" -> stream = Boolean.parseBoolean(args[++i]); + case "--echo" -> echo = Boolean.parseBoolean(args[++i]); + case "-i", "--interactive" -> { interactive = true; instruct = false; } + case "--instruct" -> instruct = true; + case "--gpu" -> useGpu = true; + case "--opencl" -> backend = Backend.OPENCL; + case "--ptx" -> backend = Backend.PTX; + case "--metal" -> backend = Backend.METAL; + case "--gpu-memory" -> gpuMemory = args[++i]; + case "--heap-min" -> heapMin = args[++i]; + case "--heap-max" -> heapMax = args[++i]; + case "--debug" -> debug = true; + case "--profiler" -> profiler = true; + case "--profiler-dump-dir" -> profilerDumpDir = args[++i]; + case "--print-bytecodes" -> printBytecodes = true; + case "--print-threads" -> threads = true; + case "--print-kernel" -> printKernel = true; + case "--full-dump" -> fullDump = true; + case "--verbose-init" -> 
verboseInit = true; + case "--show-command" -> showCommand = true; + case "--execute-after-show" -> executeAfterShow = true; + case "--opencl-flags" -> openclFlags = args[++i]; + case "--max-wait-events" -> maxWaitEvents = Integer.parseInt(args[++i]); + case "--verbose", "-v" -> verbose = true; + default -> { + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + } + } + + if (modelPath == null) { + System.err.println("Error: --model is required"); + printUsage(); + System.exit(1); + } + + if (profilerDumpDir == null) { + profilerDumpDir = System.getenv("LLAMA_ROOT") + "/profiler-log.json"; + } + + return new Config(modelPath, prompt, systemPrompt, temperature, topP, seed, maxTokens, + stream, echo, interactive, instruct, useGpu, backend, gpuMemory, heapMin, heapMax, + debug, profiler, profilerDumpDir, printBytecodes, threads, printKernel, fullDump, + verboseInit, showCommand, executeAfterShow, openclFlags, maxWaitEvents, verbose); +} + +void printUsage() { + IO.println(""" + Usage: %s --model [options] + + GPU-accelerated LLM runner using TornadoVM + + Required: + --model, -m Path to the LLM gguf file + + LLaMA Configuration: + --prompt, -p Input prompt + --system-prompt, -sp System prompt + --temperature Sampling temperature (default: 0.1) + --top-p Top-p sampling (default: 0.95) + --seed Random seed (default: current timestamp) + --max-tokens, -n Max tokens to generate (default: 512) + --stream Enable streaming (default: true) + --echo Echo input prompt (default: false) + + Mode: + -i, --interactive Interactive/chat mode + --instruct Instruction mode (default) + + Hardware: + --gpu Enable GPU acceleration + --opencl Use OpenCL backend (default) + --ptx Use PTX/CUDA backend + --metal Use Metal backend (macOS) + --gpu-memory GPU memory allocation (default: 14GB) + --heap-min Min JVM heap (default: 20g) + --heap-max Max JVM heap (default: 20g) + + Debug: + --debug Enable debug output + --profiler Enable TornadoVM profiler + --profiler-dump-dir 
Profiler output directory + --print-bytecodes Print bytecodes + --print-threads Print thread info + --print-kernel Print kernel info + --full-dump Full debug dump + --verbose-init TornadoVM init timing + --show-command Display the full Java command + --execute-after-show Execute after showing command + --verbose, -v Verbose output + + -help Show this help + -version Show version + """.formatted(name)); +} + +String findLlamaJar(String llamaRoot) { + var targetDir = Path.of(llamaRoot, "target"); + try (var stream = Files.newDirectoryStream(targetDir, "gpu-llama3-*-SNAPSHOT.jar")) { + var jars = new ArrayList(); + stream.forEach(jars::add); + if (jars.isEmpty()) { + try (var fallback = Files.newDirectoryStream(targetDir, "gpu-llama3-*.jar")) { + fallback.forEach(jars::add); + } + } + if (jars.isEmpty()) { + System.err.println("Error: No gpu-llama3 JAR found in " + targetDir); + System.exit(1); + } + jars.sort(Comparator.reverseOrder()); + return jars.getFirst().toString(); + } catch (IOException e) { + System.err.println("Error searching for JAR: " + e.getMessage()); + System.exit(1); + return null; + } +} + +String modulePath(String tornadoSdk) { + var sep = System.getProperty("os.name").toLowerCase().contains("win") ? ";" : ":"; + return "." 
+ sep + tornadoSdk + "/share/java/tornado"; +} + +List buildCommand(Config cfg, String javaHome, String tornadoSdk, String llamaRoot) { + var cmd = new ArrayList(); + + cmd.addAll(List.of( + javaHome + "/bin/java", + "-server", + "-XX:+UnlockExperimentalVMOptions", + "-XX:+EnableJVMCI", + "-Xms" + cfg.heapMin(), + "-Xmx" + cfg.heapMax(), + "--enable-preview", + "-Djava.library.path=" + tornadoSdk + "/lib", + "-Djdk.module.showModuleResolution=false", + "--module-path", modulePath(tornadoSdk) + )); + + // TornadoVM configuration + cmd.addAll(List.of( + "-Dtornado.load.api.implementation=uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph", + "-Dtornado.load.runtime.implementation=uk.ac.manchester.tornado.runtime.TornadoCoreRuntime", + "-Dtornado.load.tornado.implementation=uk.ac.manchester.tornado.runtime.common.Tornado", + "-Dtornado.load.annotation.implementation=uk.ac.manchester.tornado.annotation.ASMClassVisitor", + "-Dtornado.load.annotation.parallel=uk.ac.manchester.tornado.api.annotations.Parallel", + "-Dtornado.tvm.maxbytecodesize=65536" + )); + + if (cfg.useGpu()) cmd.add("-Duse.tornadovm=true"); + if (cfg.verboseInit()) cmd.add("-Dllama.EnableTimingForTornadoVMInit=true"); + + // Debug flags + cmd.add("-Dtornado.debug=" + cfg.debug()); + cmd.add("-Dtornado.threadInfo=" + cfg.threads()); + cmd.add("-Dtornado.fullDebug=" + cfg.fullDump()); + cmd.add("-Dtornado.printKernel=" + cfg.printKernel()); + cmd.add("-Dtornado.print.bytecodes=" + cfg.printBytecodes()); + + // Runtime configuration + cmd.addAll(List.of( + "-Dtornado.device.memory=" + cfg.gpuMemory(), + "-Dtornado.profiler=" + cfg.profiler(), + "-Dtornado.log.profiler=false", + "-Dtornado.profiler.dump.dir=" + cfg.profilerDumpDir(), + "-Dtornado.enable.fastMathOptimizations=true", + "-Dtornado.enable.mathOptimizations=false", + "-Dtornado.enable.nativeFunctions=true", + "-Dtornado.loop.interchange=true", + "-Dtornado.eventpool.maxwaitevents=" + cfg.maxWaitEvents() + )); + + if (cfg.backend() == 
Backend.OPENCL) { + cmd.add("-Dtornado.opencl.compiler.flags=" + cfg.openclFlags()); + } + + // Module configuration + cmd.addAll(List.of( + "--upgrade-module-path", tornadoSdk + "/share/java/graalJars", + "@" + tornadoSdk + "/etc/exportLists/common-exports" + )); + + switch (cfg.backend()) { + case OPENCL -> { + cmd.add("@" + tornadoSdk + "/etc/exportLists/opencl-exports"); + cmd.addAll(List.of("--add-modules", + "ALL-SYSTEM,jdk.incubator.vector,tornado.runtime,tornado.annotation,tornado.drivers.common,tornado.drivers.opencl")); + } + case PTX -> { + cmd.add("@" + tornadoSdk + "/etc/exportLists/ptx-exports"); + cmd.addAll(List.of("--add-modules", + "ALL-SYSTEM,jdk.incubator.vector,tornado.runtime,tornado.annotation,tornado.drivers.common,tornado.drivers.ptx")); + } + case METAL -> { + cmd.add("@" + tornadoSdk + "/etc/exportLists/metal-exports"); + cmd.addAll(List.of("--add-modules", + "ALL-SYSTEM,jdk.incubator.vector,tornado.runtime,tornado.annotation,tornado.drivers.common,tornado.drivers.metal")); + } + } + + cmd.addAll(List.of("-cp", findLlamaJar(llamaRoot), "org.beehive.gpullama3.LlamaApp")); + + // LLaMA arguments + cmd.addAll(List.of( + "-m", cfg.modelPath(), + "--temperature", String.valueOf(cfg.temperature()), + "--top-p", String.valueOf(cfg.topP()), + "--seed", String.valueOf(cfg.seed()), + "--max-tokens", String.valueOf(cfg.maxTokens()), + "--stream", String.valueOf(cfg.stream()), + "--echo", String.valueOf(cfg.echo()) + )); + + if (cfg.prompt() != null) cmd.addAll(List.of("-p", cfg.prompt())); + if (cfg.systemPrompt() != null) cmd.addAll(List.of("-sp", cfg.systemPrompt())); + if (cfg.interactive()) cmd.add("--interactive"); + else if (cfg.instruct()) cmd.add("--instruct"); + + return cmd; +} + +String resolveLlamaRoot() { + var envRoot = System.getenv("LLAMA_ROOT"); + if (envRoot != null && !envRoot.isBlank()) return envRoot; + + // Derive from the script's own location, same as set_paths does + try { + var scriptPath = 
Path.of(MethodHandles.lookup().lookupClass() + .getProtectionDomain().getCodeSource().getLocation().toURI()); + return scriptPath.getParent().toString(); + } catch (Exception e) { + System.err.println("Error: LLAMA_ROOT not set and could not determine script location"); + System.err.println("Note: check set_path in root dir -> source set_path"); + System.exit(1); + return null; + } +} + +String requireEnv(String key) { + var value = System.getenv(key); + if (value == null || value.isBlank()) { + System.err.println("Error: " + key + " is not set"); + System.err.println("Please ensure JAVA_HOME and TORNADOVM_HOME are defined"); + System.exit(1); + } + if (!Files.exists(Path.of(value))) { + System.err.println("Error: " + key + " path does not exist: " + value); + System.exit(1); + } + return value; +} + +void main(String... args) { + if (args.length == 0 || args[0].equals("-help")) { + printUsage(); + return; + } + if (args[0].equals("-version")) { + IO.println(name + " " + version); + return; + } + + var javaHome = requireEnv("JAVA_HOME"); + var tornadoSdk = requireEnv("TORNADOVM_HOME"); + var llamaRoot = resolveLlamaRoot(); + + var cfg = parseArgs(args); + var cmd = buildCommand(cfg, javaHome, tornadoSdk, llamaRoot); + + if (cfg.showCommand()) { + IO.println("Full Java command:"); + IO.println("-".repeat(80)); + IO.println(String.join(" ", cmd)); + IO.println("-".repeat(80)); + IO.println(); + if (!cfg.executeAfterShow()) { + IO.println("Command built successfully. 
Use --execute-after-show to run after displaying."); + return; + } + } + + if (cfg.verbose()) { + IO.println("Executing command:"); + cmd.forEach(arg -> IO.println(" " + arg)); + IO.println(); + } + + try { + var process = new ProcessBuilder(cmd) + .inheritIO() + .start(); + System.exit(process.waitFor()); + } catch (InterruptedException e) { + System.err.println("\nOperation cancelled"); + System.exit(130); + } catch (IOException e) { + System.err.println("Error: " + e.getMessage()); + System.exit(1); + } +} From 9642299eaebff4d238cdc8a8715f3a3c87c09c34 Mon Sep 17 00:00:00 2001 From: Adam Bien Date: Sat, 11 Apr 2026 17:16:24 +0200 Subject: [PATCH 3/3] README updated with llamaTornado usage example --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0e5ce5de..50eabca0 100644 --- a/README.md +++ b/README.md @@ -119,11 +119,9 @@ We are at the early stages of Java entering the AI world with features added to | | M4 Pro | 16.77 tokens/s | 8.56 tokens/s | (WIP) | | **AMD / OpenCL** | Radeon RX | (WIP) | (WIP) | (WIP) | -##### ⚠️ Note on Apple Silicon Performance +##### Apple Silicon Support -TornadoVM currently runs on Apple Silicon via [OpenCL](https://developer.apple.com/opencl/), which has been officially deprecated since macOS 10.14. - -Despite being deprecated, OpenCL can still run on Apple Silicon; albeit, with older drivers which do not support all optimizations of TornadoVM. Therefore, the performance is not optimal since TornadoVM does not have a Metal backend yet (it currently has OpenCL, PTX, and SPIR-V backends). We recommend using Apple silicon for development and for performance testing to use OpenCL/PTX compatible Nvidia GPUs for the time being (until we add a Metal backend to TornadoVM and start optimizing it). +TornadoVM 4.0 includes a native [Metal](https://developer.apple.com/metal/) backend, enabling GPU-accelerated inference on Apple Silicon. 
----------- ## 📦 Maven Dependency @@ -313,6 +311,14 @@ Enable GPU acceleration with Q8_0 quantization: ./llama-tornado --gpu --verbose-init --model beehive-llama-3.2-1b-instruct-fp16.gguf --prompt "tell me a joke" ``` +#### Running with `llamaTornado` (Java 25 single-file script) + +`llamaTornado` is a zero-dependency Java 25 single-file script that replaces the Python launcher. It requires Java 25 or later on your `PATH`: + +```bash +./llamaTornado --gpu --verbose-init --metal --model path/to/Mistral-7B-Instruct-v0.3.Q8_0.gguf --prompt "what is java" +``` + ----------- ## 🐳 Docker