diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
index 0d411801..562050ca 100644
--- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
@@ -3,22 +3,41 @@
 /**
  * Record to store metrics from the last model run.
  * @param totalTokens The total number of tokens processed
- * @param totalSeconds The total time in seconds
+ * @param totalNanos The total time in nanoseconds
+ * @param promptEvalCount Number of tokens in the prompt
+ * @param promptNanos Time to process the prompt in nanoseconds
+ * @param inferenceEvalCount Number of tokens in the model's response
+ * @param inferenceNanos Time to output the response in nanoseconds
+ * @param tornadoCompileNanos Time to compile the tornado task graph in nanoseconds
+ * @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds
  */
-public record LastRunMetrics(int totalTokens, double totalSeconds) {
+public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long tornadoCompileNanos, long tornadoWarmupNanos) {
     /**
      * Singleton instance to store the latest metrics
      */
     private static LastRunMetrics latestMetrics;
+    // Load duration is static because it only occurs once
+    private static long loadDurationNanos = 0;
 
     /**
      * Sets the metrics for the latest run
-     *
-     * @param tokens The total number of tokens processed
-     * @param seconds The total time in seconds
+     * @param totalTokens The total number of tokens processed
+     * @param totalNanos The total time in nanoseconds
+     * @param promptEvalCount Number of tokens in the prompt
+     * @param promptNanos Time to process the prompt in nanoseconds
+     * @param inferenceEvalCount Number of tokens in the model's response
+     * @param inferenceNanos Time to output the response in nanoseconds
+     * @param tornadoCompileNanos Time to compile the tornado task graph in nanoseconds
+     * @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds
      */
-    public static void setMetrics(int tokens, double seconds) {
-        latestMetrics = new LastRunMetrics(tokens, seconds);
+    public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long tornadoCompileNanos, long tornadoWarmupNanos) {
+        latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
+    }
+    /**
+     * @param nanos Time it takes to load the model in nanoseconds; only changes when new model is loaded
+     */
+    public static void setModelLoadDuration(long nanos) {
+        loadDurationNanos = nanos;
     }
 
     /**
@@ -26,8 +45,33 @@ public static void setMetrics(int tokens, double seconds) {
      */
     public static void printMetrics() {
         if (latestMetrics != null) {
-            double tokensPerSecond = latestMetrics.totalTokens() / latestMetrics.totalSeconds();
-            System.err.printf("\n\nachieved tok/s: %.2f. Tokens: %d, seconds: %.2f\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalSeconds());
+
+            // If statements to only print model load once, and only print TornadoVM metrics if using GPU
+            if (loadDurationNanos > 0) {
+                double loadSeconds = loadDurationNanos / 1_000_000_000.0;
+                System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadSeconds);
+            }
+            if (latestMetrics.tornadoCompileNanos() > 0) {
+                double compileSeconds = latestMetrics.tornadoCompileNanos() / 1_000_000_000.0;
+                System.err.printf("TornadoVM Compile Time: %d ns (%.2f s)\n", latestMetrics.tornadoCompileNanos(), compileSeconds);
+            }
+            if (latestMetrics.tornadoWarmupNanos() > 0) {
+                double warmupSeconds = latestMetrics.tornadoWarmupNanos() / 1_000_000_000.0;
+                System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.tornadoWarmupNanos(), warmupSeconds);
+            }
+
+            double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0;
+            // Time to First Token = Load Time + Prompt Eval time (which includes first decode step)
+            long ttftNanos = loadDurationNanos + latestMetrics.promptNanos();
+            double ttftSeconds = ttftNanos / 1_000_000_000.0;
+            double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0;
+            double prefillThroughput = latestMetrics.promptEvalCount() / promptSeconds;
+            double inferenceSeconds = latestMetrics.inferenceNanos() / 1_000_000_000.0;
+            double decodeThroughput = latestMetrics.inferenceEvalCount() / inferenceSeconds;
+            double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds;
+            System.err.printf("\n\nAchieved tok/s: %.2f. Total tokens: %d, Total time: %d ns (%.2f s), Time to first Token: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, ttftNanos, ttftSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds);
+
+            loadDurationNanos = 0;
         }
     }
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
index a9c65223..9e254318 100644
--- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
+++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
@@ -132,10 +132,18 @@ public static List<Integer> generateTokensLlama(Model model, State state, int st
 
         // Calculate and print performance metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -213,10 +221,18 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st
 
         // Calculate and print performance metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -225,6 +241,7 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
             IntConsumer onTokenGenerated) {
         long startNanos = System.nanoTime();
+        long inferenceStartNanos = 0;
 
         if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
             maxTokens = model.configuration().contextLength();
         }
@@ -245,6 +262,9 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
                 System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
             }
         } else {
+            if (inferenceStartNanos == 0) {
+                inferenceStartNanos = System.nanoTime();
+            }
             nextToken = sampler.sampleToken(state.logits);
             if (echo) {
                 // log inferred token
@@ -266,10 +286,18 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
 
         // Calculate and print performance metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
 
@@ -354,13 +382,28 @@ public static List<Integer> generateTokensGPULlama(Model model, State state, int
             pos++;
         }
 
-        // === Performance Metrics ===
+        // Calculate and Print Performance Metrics
         long endNanos = System.nanoTime();
-        double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        // Set metrics for tokens achieved
-        LastRunMetrics.setMetrics(totalTokens, totalSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        // If statement to prevent inadvertent crashes from future features
+        if (tornadoVMPlan != null) {
+            tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
+            tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
+            // Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
+            tornadoVMPlan.setCompileDurationNanos(0);
+            tornadoVMPlan.setWarmupDurationNanos(0);
+        }
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -447,12 +490,28 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
             state.latestToken = currentToken = nextToken;
         }
 
-        // Calculate and print performance metrics
+        // Calculate and Print Performance Metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        // If statement to prevent inadvertent crashes from future features
+        if (tornadoVMPlan != null) {
+            tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
+            tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
+            // Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
+            tornadoVMPlan.setCompileDurationNanos(0);
+            tornadoVMPlan.setWarmupDurationNanos(0);
+        }
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -522,12 +581,28 @@ public static List<Integer> generateTokensGPUPhi3(Model model, State state, int
             pos++;
         }
 
-        // Calculate and print performance metrics
+        // Calculate and Print Performance Metrics
        long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        // If statement to prevent inadvertent crashes from future features
+        if (tornadoVMPlan != null) {
+            tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
+            tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
+            // Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
+            tornadoVMPlan.setCompileDurationNanos(0);
+            tornadoVMPlan.setWarmupDurationNanos(0);
+        }
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -590,11 +665,20 @@ public static List<Integer> generateTokensGranite(Model model, State state, int
             pos++;
         }
 
+        // Calculate and Print Performance Metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
@@ -605,7 +689,7 @@ public static List<Integer> generateTokensGranite(Model model, State state, int
      */
    public static List<Integer> generateTokensGPUGranite(Model model, State state, int startPosition, List<Integer> promptTokens, Set<Integer> stopTokens,
            int maxTokens, Sampler sampler, boolean echo,
-            IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMMasterPlan) {
+            IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
         long startNanos = System.nanoTime();
         long inferenceStartNanos = 0;
 
@@ -623,7 +707,7 @@ public static List<Integer> generateTokensGPUGranite(Model model, State state, i
 
         while (pos < maxTokens) {
             // Call TornadoVM forward pass (same as Llama for now)
-            logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMMasterPlan);
+            logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMPlan);
 
             if (promptIndex < promptTokens.size()) {
                 nextToken = promptTokens.get(promptIndex++);
@@ -657,11 +741,28 @@ public static List<Integer> generateTokensGPUGranite(Model model, State state, i
             pos++;
         }
 
+        // Calculate and Print Performance Metrics
         long endNanos = System.nanoTime();
-        double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
+        if (inferenceStartNanos == 0) {
+            inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
+        }
         int totalTokens = promptIndex + generatedTokens.size();
-
-        LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+        long totalNanos = (endNanos - startNanos);
+        int promptEvalCount = promptIndex;
+        long promptNanos = inferenceStartNanos - startNanos;
+        int inferenceEvalCount = generatedTokens.size();
+        long inferenceNanos = endNanos - inferenceStartNanos;
+        long tornadoCompileNanos = 0;
+        long tornadoWarmupNanos = 0;
+        // If statement to prevent inadvertent crashes from future features
+        if (tornadoVMPlan != null) {
+            tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
+            tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
+            // Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
+            tornadoVMPlan.setCompileDurationNanos(0);
+            tornadoVMPlan.setWarmupDurationNanos(0);
+        }
+        LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
 
         return generatedTokens;
     }
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
index 392113be..c85bfe49 100644
--- a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
+++ b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
@@ -11,6 +11,7 @@
 import org.beehive.gpullama3.tensor.tornado.FP32TornadoTensor;
 import org.beehive.gpullama3.tensor.tornado.Q8_0TornadoTensor;
 import org.beehive.gpullama3.tensor.tornado.TornadoTensor;
+import org.beehive.gpullama3.auxiliary.LastRunMetrics;
 
 import uk.ac.manchester.tornado.api.types.HalfFloat;
 import uk.ac.manchester.tornado.api.types.arrays.*;
@@ -85,6 +86,9 @@ private static ModelType detectModelType(Map<String, Object> metadata) {
      * @throws IllegalStateException if AOT loading is enabled but the preloaded model is unavailable
      */
     public static Model loadModel(Options options) throws IOException {
+        // Keep track of load time for performance metrics
+        long startLoadNanos = System.nanoTime();
+
         Path ggufPath = options.modelPath();
         int contextLength = options.maxTokens();
         boolean useTornadovm = options.useTornadovm();
@@ -94,19 +98,36 @@ public static Model loadModel(Options options) throws IOException {
         // detect model type
         ModelType modelType = detectModelType(gguf.getMetadata());
         // model type-specific load
-        return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+        Model loadedModel = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+
+        // Calculate load time and send to LastRunMetrics
+        long endLoadNanos = System.nanoTime();
+        long loadNanos = (endLoadNanos - startLoadNanos);
+        LastRunMetrics.setModelLoadDuration(loadNanos);
+
+        return loadedModel;
     }
 
     /**
      * For compatibility with langchain4j and quarkus.
      */
     public static Model loadModel(Path ggufPath, int contextLength, boolean loadWeights, boolean useTornadovm) throws IOException {
+        // Keep track of load time for performance metrics
+        long startLoadNanos = System.nanoTime();
+
         // initial load of metadata from gguf file
         GGUF gguf = GGUF.loadGGUFMetadata(ggufPath);
         // detect model type
         ModelType modelType = detectModelType(gguf.getMetadata());
         // model type-specific load
-        return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+        Model loadedModel = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+
+        // Calculate load time and send to LastRunMetrics
+        long endLoadNanos = System.nanoTime();
+        long loadNanos = (endLoadNanos - startLoadNanos);
+        LastRunMetrics.setModelLoadDuration(loadNanos);
+
+        return loadedModel;
     }
 
     /**
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
index a42dc310..0717e61b 100644
--- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
+++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlan.java
@@ -17,6 +17,24 @@ public class TornadoVMMasterPlan {
     private final Configuration config;
     public TornadoExecutionPlan executionPlan;
     GenericLayerPlanner tornadoVMLayerPlanner;
+
+    // Performance variables and their corresponding methods to be accessed from LastRunMetrics
+    private long compileDurationNanos = 0;
+    private long warmupDurationNanos = 0;
+
+    public long getCompileDurationNanos() {
+        return compileDurationNanos;
+    }
+    public void setCompileDurationNanos(long nanos) {
+        this.compileDurationNanos = nanos;
+    }
+
+    public long getWarmupDurationNanos() {
+        return warmupDurationNanos;
+    }
+    public void setWarmupDurationNanos(long nanos) {
+        this.warmupDurationNanos = nanos;
+    }
 
     public TornadoVMMasterPlan(State state, Model model) {
         this.tornadoVMLayerPlanner = createPlanner(state, model);
@@ -36,10 +54,8 @@ public TornadoVMMasterPlan(State state, Model model) {
      * @return The initialized TornadoVMMasterPlan ready for inference
      */
     public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model model) {
-        // Initialize timing variables outside conditional blocks to avoid scope issues
+        // Record Start Time for Performance metrics
         long startTime = System.nanoTime();
-        long planCreationTime = 0;
-        long warmupTime = 0;
 
         // Start a timing message if enabled
         if (ENABLE_TORNADOVM_INIT_TIME) {
@@ -49,27 +65,34 @@ public static TornadoVMMasterPlan initializeTornadoVMPlan(State state, Model mod
 
         // 1. Pre-allocate the TornadoVM plan
         TornadoVMMasterPlan tornadoVMPlan = new TornadoVMMasterPlan(state, model);
 
+        // Calculate and Set Compile Time
+        long planCreationTime = System.nanoTime();
+        tornadoVMPlan.setCompileDurationNanos(planCreationTime - startTime);
+
         // Record time after plan creation
         if (ENABLE_TORNADOVM_INIT_TIME) {
-            planCreationTime = System.nanoTime();
             System.err.printf("TornadoVM GPU execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
         }
 
         // 2. Perform warmup with extra iterations to ensure JIT compilation is complete
         tornadoVMPlan.executionPlan.withPreCompilation(); // Force JIT compilation from Java to GPU code
+        long warmupTime = System.nanoTime();
+
         // Record time after warmup
         if (ENABLE_TORNADOVM_INIT_TIME) {
-            warmupTime = System.nanoTime();
             System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
         }
 
         // 3. Perform copy-in of read-only weights and objects
         tornadoVMPlan.forceCopyInReadOnlyDataLayered(); // Force copy-in read-only weights
+
+        // Calculate and Set Total Warmup Time
+        long copyTime = System.nanoTime();
+        tornadoVMPlan.setWarmupDurationNanos(copyTime - planCreationTime);
 
         // Record final timing information
         if (ENABLE_TORNADOVM_INIT_TIME) {
-            long copyTime = System.nanoTime();
             System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
             System.err.printf("Finished TornadoVM initialization...\n \n");
         }