leehack · leehack · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026
@@ -1,3 +1,9 @@
+## Unreleased
+
+* Added `LlamaStructuredOutput` and `LlamaEngine.createStructuredJson(...)`
+  helpers for strict JSON-object / JSON-schema generation with final-output
+  validation and typed decoding.
+
 ## 0.8.12
 
 * Updated the default LiteRT-LM native runtime pin to

diff --git a/README.md b/README.md
@@ -960,7 +960,61 @@ void main() async {
 }
 ```
 
-### 2. Advanced Usage (ChatSession)
+### 2. Structured JSON Output
+
+On grammar-capable backends, use `LlamaStructuredOutput` for strict JSON
+generation with final validation and typed decoding.
+
+```dart
+class Contact {
+  Contact({required this.name, required this.email});
+
+  final String name;
+  final String email;
+
+  static Contact fromJson(Map<String, dynamic> json) {
+    return Contact(
+      name: json['name'] as String,
+      email: json['email'] as String,
+    );
+  }
+}
+
+final output = LlamaStructuredOutput<Contact>.jsonSchema(
+  schema: const {
+    'type': 'object',
+    'properties': {
+      'name': {'type': 'string'},
+      'email': {'type': 'string'},
+    },
+    'required': ['name', 'email'],
+    'additionalProperties': false,
+  },
+  decoder: Contact.fromJson,
+);
+
+final contact = await engine.createStructuredJson(
+  const [
+    LlamaChatMessage.fromText(
+      role: LlamaChatRole.user,
+      text: 'Extract a contact from: Ada Lovelace <ada@example.com>',
+    ),
+  ],
+  output: output,
+  params: const GenerationParams(temp: 0, maxTokens: 96),
+);
+```
+
+Streaming callers can pass `responseFormat: output.responseFormat` to
+`engine.create(...)`, render chunks live, then await
+`stream.parseStructuredJson(output)` after the stream completes. Supported
+schemas cover the practical JSON-schema-to-GBNF subset:
+primitive types, objects, arrays, `enum`/`const`, local `$ref`, `anyOf`,
+`oneOf`, `allOf`, string length, and array item-count bounds. Annotation
+metadata such as `title`, `description`, and `default` is preserved but not
+enforced as a decoding constraint.
+
+### 3. Advanced Usage (ChatSession)
 
 Use `ChatSession` for most chat applications. It automatically manages conversation history, system prompts, and handles context window limits.
 
@@ -989,7 +1043,7 @@ void main() async {
 }
 ```
 
-### 3. Tool Calling
+### 4. Tool Calling
 
 `llamadart` supports intelligent tool calling where the model can use external functions to help it answer questions.
 
@@ -1026,7 +1080,7 @@ Notes:
 - Some handlers use lazy grammar activation (triggered when a tool-call prefix appears) to match llama.cpp behavior.
 - If you implement a custom handler grammar, prefer Dart raw strings (`r'''...'''`) for GBNF blocks to avoid escaping bugs.
 
-### 3.5 Template Routing (Strict llama.cpp parity)
+### 5. Template Routing (Strict llama.cpp parity)
 
 Template/render/parse routing is intentionally strict to match llama.cpp:
 
@@ -1053,7 +1107,7 @@ final result = await engine.chatTemplate(
 print(result.prompt);
 ```
 
-### 3.6 Logging Control
+### 6. Logging Control
 
 Use separate log levels for Dart and native output when debugging:
 
@@ -1072,7 +1126,7 @@ await engine.setNativeLogLevel(LlamaLogLevel.warn);
 await engine.setLogLevel(LlamaLogLevel.none);
 ```
 
-### 4. Multimodal Usage (Vision/Audio)
+### 7. Multimodal Usage (Vision/Audio)
 
 `llamadart` supports multimodal models (vision and audio) using `LlamaChatMessage.withContent`.
 

diff --git a/lib/llamadart.dart b/lib/llamadart.dart
@@ -59,6 +59,7 @@ export 'src/backends/backend.dart'
 // Models - Inference
 export 'src/core/models/inference/model_params.dart';
 export 'src/core/models/inference/generation_params.dart';
+export 'src/core/models/inference/structured_output.dart';
 export 'src/core/models/inference/tool_choice.dart';
 
 // Models - Sources, resolution, and downloads

diff --git a/lib/src/core/engine/engine.dart b/lib/src/core/engine/engine.dart
@@ -18,6 +18,7 @@ import '../llama_logger.dart';
 
 import '../models/inference/model_params.dart';
 import '../models/inference/generation_params.dart';
+import '../models/inference/structured_output.dart';
 import '../models/inference/tool_choice.dart';
 import '../models/model_load_options.dart';
 import '../models/model_resolver.dart';
@@ -401,6 +402,8 @@ class LlamaEngine {
   /// grammar-constrained decoding on compatible backends. Supported shapes are:
   /// - `{'type': 'json_object'}`
   /// - `{'type': 'json_schema', 'json_schema': {'schema': <JSON schema>}}`
+  ///
+  /// [createStructuredJson] and [LlamaStructuredOutput] add typed validation.
   ///
   /// Backends without grammar-constrained decoding, including LiteRT-LM native
   /// and web today, throw [LlamaUnsupportedException] for strict
@@ -512,6 +515,41 @@ class LlamaEngine {
     );
   }
 
+  /// Produces a typed value from a strict structured-JSON completion.
+  ///
+  /// Calls [create] with `output.responseFormat`, then lets
+  /// `parseStructuredJson` gather the streamed content, validate the finished
+  /// JSON, and run [output]'s decoder. To render tokens live, call [create]
+  /// directly and finish with `await stream.parseStructuredJson(output)`.
+  Future<T> createStructuredJson<T>(
+    List<LlamaChatMessage> messages, {
+    required LlamaStructuredOutput<T> output,
+    GenerationParams? params,
+    List<ToolDefinition>? tools,
+    ToolChoice? toolChoice,
+    bool parallelToolCalls = false,
+    bool enableThinking = true,
+    String? sourceLangCode,
+    String? targetLangCode,
+    Map<String, dynamic>? chatTemplateKwargs,
+    DateTime? templateNow,
+  }) {
+    final completion = create(
+      messages,
+      responseFormat: output.responseFormat,
+      params: params,
+      enableThinking: enableThinking,
+      tools: tools,
+      toolChoice: toolChoice,
+      parallelToolCalls: parallelToolCalls,
+      sourceLangCode: sourceLangCode,
+      targetLangCode: targetLangCode,
+      chatTemplateKwargs: chatTemplateKwargs,
+      templateNow: templateNow,
+    );
+    return completion.parseStructuredJson(output);
+  }
+
   /// Formats a list of [messages] into a prompt string using the model's template.
   ///
   /// This is useful for preparing messages before calling [generate] directly,
@@ -522,6 +560,8 @@ class LlamaEngine {
   /// Supported shapes are:
   /// - `{'type': 'json_object'}`
   /// - `{'type': 'json_schema', 'json_schema': {'schema': <JSON schema>}}`
+  ///
+  /// Prefer [LlamaStructuredOutput.responseFormat] over hand-written maps.
   ///
   /// [jsonSchema] is a legacy shortcut for
   /// `responseFormat: {'type': 'json_schema', 'json_schema': {'schema': ...}}`.