plastic-labs · phuongvm · May 29, 2026 · May 29, 2026
diff --git a/.env.template b/.env.template
@@ -76,19 +76,24 @@ AUTH_USE_AUTH=false
 # section you want to route through that endpoint.
 # Models must support tool calling (function calling).
 #
-# Supported transports: openai, anthropic, gemini
+# Supported transports: openai, anthropic, gemini, lmstudio
 # Each transport picks up its API key from the corresponding LLM_*_API_KEY.
 # Base URLs are set per-module via MODEL_CONFIG__OVERRIDES__BASE_URL.
 #
 LLM_OPENAI_API_KEY=your-api-key-here
 # LLM_ANTHROPIC_API_KEY=
 # LLM_GEMINI_API_KEY=
+# LMStudio (OpenAI-compatible local server). API key can be any non-empty
+# placeholder (e.g. "lm-studio") if the local server doesn't require auth.
+# LLM_LMSTUDIO_API_KEY=lm-studio
+# LLM_LMSTUDIO_BASE_URL=http://localhost:1234/v1
 
 # =============================================================================
 # LLM Configuration
 # =============================================================================
 # Global LLM settings
 # LLM_DEFAULT_MAX_TOKENS=2500
+# LLM_DEFAULT_TIMEOUT=180.0  # HTTP timeout (seconds) for all LLM provider clients
 # LLM_MAX_TOOL_OUTPUT_CHARS=10000  # Max chars for tool output (~2500 tokens)
 # LLM_MAX_MESSAGE_CONTENT_CHARS=2000  # Max chars per message in tool results
 
@@ -119,8 +124,8 @@ LLM_OPENAI_API_KEY=your-api-key-here
 # DERIVER_FLUSH_ENABLED=false  # Bypass batch token threshold, process work immediately
 # DERIVER_MODEL_CONFIG__FALLBACK__MODEL=
 # DERIVER_MODEL_CONFIG__FALLBACK__TRANSPORT=
-# DERIVER_MODEL_CONFIG__OVERRIDES__BASE_URL=
-# DERIVER_MODEL_CONFIG__OVERRIDES__API_KEY_ENV=
+# DERIVER_MODEL_CONFIG__FALLBACK__OVERRIDES__BASE_URL=
+# DERIVER_MODEL_CONFIG__FALLBACK__OVERRIDES__API_KEY_ENV=
 
 # =============================================================================
 # Peer Card
@@ -168,6 +173,14 @@ LLM_OPENAI_API_KEY=your-api-key-here
 # Optional backup per level (must set both or neither):
 # DIALECTIC_LEVELS__max__MODEL_CONFIG__FALLBACK__MODEL=gemini-2.5-pro
 # DIALECTIC_LEVELS__max__MODEL_CONFIG__FALLBACK__TRANSPORT=gemini
+# DIALECTIC_LEVELS__high__MODEL_CONFIG__FALLBACK__MODEL=
+# DIALECTIC_LEVELS__high__MODEL_CONFIG__FALLBACK__TRANSPORT=
+# DIALECTIC_LEVELS__medium__MODEL_CONFIG__FALLBACK__MODEL=
+# DIALECTIC_LEVELS__medium__MODEL_CONFIG__FALLBACK__TRANSPORT=
+# DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__MODEL=
+# DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__TRANSPORT=
+# DIALECTIC_LEVELS__minimal__MODEL_CONFIG__FALLBACK__MODEL=
+# DIALECTIC_LEVELS__minimal__MODEL_CONFIG__FALLBACK__TRANSPORT=
 
 # =============================================================================
 # Summary
@@ -186,6 +199,9 @@ LLM_OPENAI_API_KEY=your-api-key-here
 # SUMMARY_MAX_TOKENS_SHORT=1000
 # SUMMARY_MAX_TOKENS_LONG=4000
 # SUMMARY_MODEL_CONFIG__FALLBACK__MODEL=
+# SUMMARY_MODEL_CONFIG__FALLBACK__TRANSPORT=
+# SUMMARY_MODEL_CONFIG__FALLBACK__OVERRIDES__BASE_URL=
+# SUMMARY_MODEL_CONFIG__FALLBACK__OVERRIDES__API_KEY_ENV=
 
 # =============================================================================
 # Dream
@@ -201,6 +217,10 @@ LLM_OPENAI_API_KEY=your-api-key-here
 # DREAM_DEDUCTION_MODEL_CONFIG__OVERRIDES__BASE_URL=https://openrouter.ai/api/v1
 # DREAM_INDUCTION_MODEL_CONFIG__MODEL=your-model-here
 # DREAM_INDUCTION_MODEL_CONFIG__OVERRIDES__BASE_URL=https://openrouter.ai/api/v1
+# DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__MODEL=
+# DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=
+# DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__MODEL=
+# DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=
 # DREAM_DOCUMENT_THRESHOLD=50
 # DREAM_IDLE_TIMEOUT_MINUTES=60
 # DREAM_MIN_HOURS_BETWEEN_DREAMS=8

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -297,3 +297,55 @@ src/
 ### Notes
 
 - Always use `uv run` or `uv` to prefix any commands related to python to ensure you use the virtual environment
+
+## LLM Model Fallback
+
+Honcho supports automatic fallback to a secondary LLM model when the primary model fails with a retryable error (rate limit, timeout, connection error, or server error).
+
+### How It Works
+
+1. **First-failure trigger**: When a call to the primary model fails with a retryable error (429, 5xx, timeout, connection error), the system immediately switches to the fallback model on the next attempt — it does NOT wait for all retries to be exhausted.
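+2. **Per-agent configuration**: Each agent (Deriver, Dialectic, Dreamer, Summary) has independent fallback configuration; Dialectic is configured per reasoning level, and Dream separately for its deduction and induction models.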
+3. **Cross-provider support**: The fallback model can use a different provider (e.g., primary=openai, fallback=lmstudio).
+4. **Backward compatible**: If no fallback is configured, behavior is identical to before.
+
+### Configuration
+
+Set these environment variables for each agent:
+
+```bash
+# Deriver
+DERIVER_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
+DERIVER_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
+
+# Dialectic (per reasoning level)
+DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
+DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
+
+# Summary
+SUMMARY_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
+SUMMARY_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
+
+# Dream
+DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
+DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
+DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
+DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
+```
+
+### Observability
+
+- **Logs**: WARNING-level log emitted when fallback is activated, including primary→fallback provider/model info
+- **Langfuse**: Generation span metadata includes `is_fallback: true` when fallback model is used
+- **No separate span**: Only the successful attempt's span is kept (failed attempts do not create separate spans)
+
+### Retryable Errors
+
+The following errors trigger fast fallback:
+- HTTP 429 (Too Many Requests / Rate Limit)
+- HTTP 5xx (Server Errors)
+- `TimeoutError`
+- `ConnectionError`
+- SDK-specific: `APIConnectionError`, `APITimeoutError`, `InternalServerError`, `ServiceUnavailableError`, `RateLimitError`
+
+Non-retryable errors (e.g., HTTP 400, 422, or synchronous `ValueError`) do NOT trigger fallback — they are surfaced immediately and not retried.
diff --git a/src/config.py b/src/config.py
@@ -22,7 +22,7 @@
 
 logger = logging.getLogger(__name__)
 
-ModelTransport = Literal["anthropic", "openai", "gemini"]
+ModelTransport = Literal["anthropic", "openai", "gemini", "lmstudio"]
 EmbeddingTransport = Literal["openai", "gemini"]
 EmbeddingDimensionsMode = Literal["auto", "always", "never"]
 
@@ -95,7 +95,7 @@ def _normalize_model_transport(data: Any) -> Any:
     transport_value = update.get("transport")
     if isinstance(model_value, str) and "/" in model_value and transport_value is None:
         prefix, bare_model = model_value.split("/", 1)
-        if prefix in {"anthropic", "openai", "gemini"}:
+        if prefix in {"anthropic", "openai", "gemini", "lmstudio"}:
             update["transport"] = prefix
             update["model"] = bare_model
     return update
@@ -654,6 +654,8 @@ class LLMSettings(HonchoSettings):
     ANTHROPIC_API_KEY: str | None = None
     OPENAI_API_KEY: str | None = None
     GEMINI_API_KEY: str | None = None
+    LMSTUDIO_API_KEY: str | None = None
+    LMSTUDIO_BASE_URL: str | None = None
 
     # Base URLs for LLM providers (for OpenAI-compatible proxies like
     # OpenRouter, vLLM, Together, Anyscale, self-hosted, etc.)
@@ -664,6 +666,11 @@ class LLMSettings(HonchoSettings):
     # General LLM settings
     DEFAULT_MAX_TOKENS: Annotated[int, Field(default=1000, gt=0, le=100_000)] = 2500
 
+    # Default timeout in seconds for all LLM provider HTTP clients.
+    # Set explicitly here (rather than relying on each SDK's own default or
+    # hardcoding in registry.py) to accommodate slower local models (e.g. LMStudio fallback).
+    DEFAULT_TIMEOUT: Annotated[float, Field(default=180.0, gt=0)] = 180.0
+
     # Maximum characters for tool output to prevent token explosion.
     # Set to 10,000 chars (~2,500 tokens at 4 chars/token) to stay well under
     # typical context limits while providing substantial tool output.