62 changes: 53 additions & 9 deletions src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
@@ -3,31 +3,75 @@
/**
* Record to store metrics from the last model run.
* @param totalTokens The total number of tokens processed
* @param totalSeconds The total time in seconds
* @param totalNanos The total time in nanoseconds
* @param promptEvalCount Number of tokens in the prompt
* @param promptNanos Time to process the prompt in nanoseconds
* @param inferenceEvalCount Number of tokens in the model's response
* @param inferenceNanos Time to output the response in nanoseconds
* @param tornadoCompileNanos Time to compile the TornadoVM task graph in nanoseconds
* @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds
*/
public record LastRunMetrics(int totalTokens, double totalSeconds) {
public record LastRunMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long tornadoCompileNanos, long tornadoWarmupNanos) {
/**
* Singleton instance to store the latest metrics
*/
private static LastRunMetrics latestMetrics;
// Load duration is static because the model is loaded only once
private static long loadDurationNanos = 0;

/**
* Sets the metrics for the latest run
*
* @param tokens The total number of tokens processed
* @param seconds The total time in seconds
* @param totalTokens The total number of tokens processed
* @param totalNanos The total time in nanoseconds
* @param promptEvalCount Number of tokens in the prompt
* @param promptNanos Time to process the prompt in nanoseconds
* @param inferenceEvalCount Number of tokens in the model's response
* @param inferenceNanos Time to output the response in nanoseconds
* @param tornadoCompileNanos Time to compile the TornadoVM task graph in nanoseconds
* @param tornadoWarmupNanos Time spent warming up until steady state in nanoseconds
*/
public static void setMetrics(int tokens, double seconds) {
latestMetrics = new LastRunMetrics(tokens, seconds);
public static void setMetrics(int totalTokens, long totalNanos, int promptEvalCount, long promptNanos, int inferenceEvalCount, long inferenceNanos, long tornadoCompileNanos, long tornadoWarmupNanos) {
latestMetrics = new LastRunMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
}
/**
* @param nanos Time it takes to load the model in nanoseconds; only changes when a new model is loaded
*/
public static void setModelLoadDuration(long nanos) {
loadDurationNanos = nanos;
}

/**
* Prints the metrics from the latest run to stderr
*/
public static void printMetrics() {
if (latestMetrics != null) {
double tokensPerSecond = latestMetrics.totalTokens() / latestMetrics.totalSeconds();
System.err.printf("\n\nachieved tok/s: %.2f. Tokens: %d, seconds: %.2f\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalSeconds());

// Print the model load time only once, and TornadoVM metrics only when running on the GPU
if (loadDurationNanos > 0) {
double loadSeconds = loadDurationNanos / 1_000_000_000.0;
System.err.printf("Model load time: %d ns (%.2f s)\n", loadDurationNanos, loadSeconds);
}
if (latestMetrics.tornadoCompileNanos() > 0) {
double compileSeconds = latestMetrics.tornadoCompileNanos() / 1_000_000_000.0;
System.err.printf("TornadoVM Compile Time: %d ns (%.2f s)\n", latestMetrics.tornadoCompileNanos(), compileSeconds);
}
if (latestMetrics.tornadoWarmupNanos() > 0) {
double warmupSeconds = latestMetrics.tornadoWarmupNanos() / 1_000_000_000.0;
System.err.printf("TornadoVM Warmup Time: %d ns (%.2f s)\n", latestMetrics.tornadoWarmupNanos(), warmupSeconds);
}

double totalSeconds = latestMetrics.totalNanos() / 1_000_000_000.0;
// Time to First Token = Load Time + Prompt Eval time (which includes first decode step)
long ttftNanos = loadDurationNanos + latestMetrics.promptNanos();
double ttftSeconds = ttftNanos / 1_000_000_000.0;
double promptSeconds = latestMetrics.promptNanos() / 1_000_000_000.0;
double prefillThroughput = latestMetrics.promptEvalCount() / promptSeconds;
double inferenceSeconds = latestMetrics.inferenceNanos() / 1_000_000_000.0;
double decodeThroughput = latestMetrics.inferenceEvalCount() / inferenceSeconds;
double tokensPerSecond = latestMetrics.totalTokens() / totalSeconds;
System.err.printf("\n\nAchieved tok/s: %.2f. Total tokens: %d, Total time: %d ns (%.2f s), Time to first Token: %d ns (%.2f s)\nPrefill throughput: %.2f tok/s, Prompt tokens: %d, Prompt time: %d ns (%.2f s)\nDecode throughput: %.2f tok/s, Inference tokens: %d, Inference time: %d ns (%.2f s)\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalNanos(), totalSeconds, ttftNanos, ttftSeconds, prefillThroughput, latestMetrics.promptEvalCount(), latestMetrics.promptNanos(), promptSeconds, decodeThroughput, latestMetrics.inferenceEvalCount(), latestMetrics.inferenceNanos(), inferenceSeconds);
Comment on lines +63 to +72

Copilot AI Apr 12, 2026

printMetrics() computes throughputs via division by promptSeconds, inferenceSeconds, and totalSeconds without guarding for zero durations. Several legitimate flows can yield promptNanos == 0 or inferenceNanos == 0 (e.g., when no inference tokens are generated and inferenceStartNanos is forced to endNanos), which will result in Infinity/NaN being printed. Add explicit checks (e.g., duration > 0 and count > 0) and print 0 or "N/A" when the corresponding duration is zero.
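A minimal sketch of the guard this comment asks for; safeThroughput is a hypothetical helper, not code from this PR:

// Hypothetical guard: return "N/A" instead of dividing by a zero duration,
// so printMetrics never emits Infinity or NaN.
static String safeThroughput(int tokenCount, long durationNanos) {
    if (durationNanos <= 0 || tokenCount <= 0) {
        return "N/A";
    }
    double seconds = durationNanos / 1_000_000_000.0;
    return String.format("%.2f tok/s", tokenCount / seconds);
}

Each throughput argument such as prefillThroughput would then become safeThroughput(promptEvalCount, promptNanos), with the corresponding %.2f format specifier changed to %s.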

loadDurationNanos = 0;
}
}
}
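Taken together, the new API is used in two phases: record the one-time load duration, then per-run metrics, then print. A minimal usage sketch; the call shape comes from this diff, but the timing values are invented for illustration:

long loadStart = System.nanoTime();
// ... load the model ...
LastRunMetrics.setModelLoadDuration(System.nanoTime() - loadStart);

LastRunMetrics.setMetrics(
        128,                // totalTokens
        4_000_000_000L,     // totalNanos (4 s)
        32,                 // promptEvalCount
        1_000_000_000L,     // promptNanos (1 s)
        96,                 // inferenceEvalCount
        3_000_000_000L,     // inferenceNanos (3 s)
        0L, 0L);            // tornadoCompileNanos, tornadoWarmupNanos (CPU path)
LastRunMetrics.printMetrics(); // ~32 tok/s overall, 32 tok/s prefill, 32 tok/s decode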
161 changes: 131 additions & 30 deletions src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
@@ -132,10 +132,18 @@ public static List<Integer> generateTokensLlama(Model model, State state, int st

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;
}
@@ -213,10 +221,18 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
Comment on lines 222 to +235

Copilot AI Apr 12, 2026

In the Qwen3 CPU path, promptNanos is derived from inferenceStartNanos - startNanos, but inferenceStartNanos is only set in the else branch of the main loop. The first generated token is produced in the same iteration when the last prompt token is ingested (the branch that does position++), so inferenceStartNanos ends up being recorded after at least one decode step has already happened. This will misattribute some decode time into promptNanos and undercount inferenceNanos; consider capturing inferenceStartNanos immediately before sampling the first generated token when prompt ingestion completes.
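A minimal sketch of the capture point the comment suggests, assuming a loop of this general shape (simplified; not the PR's actual loop body — the key point is where the timestamp is taken):

if (promptIndex < promptTokens.size()) {
    nextToken = promptTokens.get(promptIndex++);
    if (promptIndex == promptTokens.size()) {
        // Prompt ingestion just completed; the first decode step follows
        // immediately, so capture the inference start before sampling.
        inferenceStartNanos = System.nanoTime();
    }
} else {
    nextToken = sampler.sampleToken(state.logits);
}

This way the first decode step is charged to inferenceNanos rather than promptNanos.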

return generatedTokens;
}
@@ -225,6 +241,7 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
IntConsumer onTokenGenerated) {

long startNanos = System.nanoTime();
long inferenceStartNanos = 0;
if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
maxTokens = model.configuration().contextLength();
}
@@ -245,6 +262,9 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
}
} else {
if (inferenceStartNanos == 0) {
inferenceStartNanos = System.nanoTime();
}
nextToken = sampler.sampleToken(state.logits);
if (echo) {
// log inferred token
@@ -266,10 +286,18 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;

@@ -354,13 +382,28 @@ public static List<Integer> generateTokensGPULlama(Model model, State state, int
pos++;
}

// === Performance Metrics ===
// Calculate and Print Performance Metrics
long endNanos = System.nanoTime();
double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

// Set metrics for tokens achieved
LastRunMetrics.setMetrics(totalTokens, totalSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
// Null check so future call paths without a TornadoVM plan cannot crash here
if (tornadoVMPlan != null) {
tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
// Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
tornadoVMPlan.setCompileDurationNanos(0);
tornadoVMPlan.setWarmupDurationNanos(0);
}
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;
}
@@ -447,12 +490,28 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int
state.latestToken = currentToken = nextToken;
}

// Calculate and print performance metrics
// Calculate and Print Performance Metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
// Null check so future call paths without a TornadoVM plan cannot crash here
if (tornadoVMPlan != null) {
tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
// Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
tornadoVMPlan.setCompileDurationNanos(0);
tornadoVMPlan.setWarmupDurationNanos(0);
}
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);
Comment on lines +493 to +514

Copilot AI Apr 12, 2026

In the Qwen3 GPU path, promptNanos is derived from inferenceStartNanos - startNanos, but inferenceStartNanos is only set when entering the post-prompt else branch. As with the CPU variant, the first generated token is produced in the iteration that consumes the final prompt token (the branch that increments position), so inferenceStartNanos is recorded after generation has already started. This skews promptNanos/inferenceNanos and derived throughput/TTFT; capture inferenceStartNanos right before sampling the first generated token when prompt ingestion completes.
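The same restructuring sketched after the CPU-path comment above applies here as well: once promptIndex reaches promptTokens.size(), capture inferenceStartNanos immediately before the first sampler call, so the first decode step is charged to inferenceNanos instead of promptNanos.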

return generatedTokens;
}
@@ -522,12 +581,28 @@ public static List<Integer> generateTokensGPUPhi3(Model model, State state, int
pos++;
}

// Calculate and print performance metrics
// Calculate and Print Performance Metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
// Null check so future call paths without a TornadoVM plan cannot crash here
if (tornadoVMPlan != null) {
tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
// Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
tornadoVMPlan.setCompileDurationNanos(0);
tornadoVMPlan.setWarmupDurationNanos(0);
}
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;
}
@@ -590,11 +665,20 @@ public static List<Integer> generateTokensGranite(Model model, State state, int
pos++;
}

// Calculate and Print Performance Metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;
}
@@ -605,7 +689,7 @@ public static List<Integer> generateTokensGranite(Model model, State state, int
*/
public static List<Integer> generateTokensGPUGranite(Model model, State state, int startPosition,
List<Integer> promptTokens, Set<Integer> stopTokens, int maxTokens, Sampler sampler, boolean echo,
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMMasterPlan) {
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
long startNanos = System.nanoTime();
long inferenceStartNanos = 0;

@@ -623,7 +707,7 @@ public static List<Integer> generateTokensGPUGranite(Model model, State state, i

while (pos < maxTokens) {
// Call TornadoVM forward pass (same as Llama for now)
logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMMasterPlan);
logits = InferenceCore.forwardTornadoVM(model, state, currentToken, pos, tornadoVMPlan);

if (promptIndex < promptTokens.size()) {
nextToken = promptTokens.get(promptIndex++);
@@ -657,11 +741,28 @@ public static List<Integer> generateTokensGPUGranite(Model model, State state, i
pos++;
}

// Calculate and Print Performance Metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
if (inferenceStartNanos == 0) {
inferenceStartNanos = endNanos; // Prevents negative time if no tokens were generated
}
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long totalNanos = (endNanos - startNanos);
int promptEvalCount = promptIndex;
long promptNanos = inferenceStartNanos - startNanos;
int inferenceEvalCount = generatedTokens.size();
long inferenceNanos = endNanos - inferenceStartNanos;
long tornadoCompileNanos = 0;
long tornadoWarmupNanos = 0;
// Null check so future call paths without a TornadoVM plan cannot crash here
if (tornadoVMPlan != null) {
tornadoCompileNanos = tornadoVMPlan.getCompileDurationNanos();
tornadoWarmupNanos = tornadoVMPlan.getWarmupDurationNanos();
// Reset values so they are only output if they are changed in TornadoVMMasterPlan.java
tornadoVMPlan.setCompileDurationNanos(0);
tornadoVMPlan.setWarmupDurationNanos(0);
}
LastRunMetrics.setMetrics(totalTokens, totalNanos, promptEvalCount, promptNanos, inferenceEvalCount, inferenceNanos, tornadoCompileNanos, tornadoWarmupNanos);

return generatedTokens;
}