Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -76,19 +76,24 @@ AUTH_USE_AUTH=false
# section you want to route through that endpoint.
# Models must support tool calling (function calling).
#
# Supported transports: openai, anthropic, gemini
# Supported transports: openai, anthropic, gemini, lmstudio
# Each transport picks up its API key from the corresponding LLM_*_API_KEY.
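# Base URLs are set per-module via <MODULE>_MODEL_CONFIG__OVERRIDES__BASE_URL
# (e.g. DREAM_DEDUCTION_MODEL_CONFIG__OVERRIDES__BASE_URL). LMStudio additionally
# has a global LLM_LMSTUDIO_BASE_URL setting (see below).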
#
LLM_OPENAI_API_KEY=your-api-key-here
# LLM_ANTHROPIC_API_KEY=
# LLM_GEMINI_API_KEY=
# LMStudio (OpenAI-compatible local server). API key can be any non-empty
# placeholder (e.g. "lm-studio") if the local server doesn't require auth.
# LLM_LMSTUDIO_API_KEY=lm-studio
# LLM_LMSTUDIO_BASE_URL=http://localhost:1234/v1

# =============================================================================
# LLM Configuration
# =============================================================================
# Global LLM settings
# LLM_DEFAULT_MAX_TOKENS=2500
# LLM_DEFAULT_TIMEOUT=180.0 # HTTP timeout (seconds) for all LLM provider clients
# LLM_MAX_TOOL_OUTPUT_CHARS=10000 # Max chars for tool output (~2500 tokens)
# LLM_MAX_MESSAGE_CONTENT_CHARS=2000 # Max chars per message in tool results

Expand Down Expand Up @@ -119,8 +124,8 @@ LLM_OPENAI_API_KEY=your-api-key-here
# DERIVER_FLUSH_ENABLED=false # Bypass batch token threshold, process work immediately
# DERIVER_MODEL_CONFIG__FALLBACK__MODEL=
# DERIVER_MODEL_CONFIG__FALLBACK__TRANSPORT=
# DERIVER_MODEL_CONFIG__OVERRIDES__BASE_URL=
# DERIVER_MODEL_CONFIG__OVERRIDES__API_KEY_ENV=
# DERIVER_MODEL_CONFIG__FALLBACK__OVERRIDES__BASE_URL=
# DERIVER_MODEL_CONFIG__FALLBACK__OVERRIDES__API_KEY_ENV=

# =============================================================================
# Peer Card
Expand Down Expand Up @@ -168,6 +173,14 @@ LLM_OPENAI_API_KEY=your-api-key-here
# Optional fallback per level (set both MODEL and TRANSPORT, or neither):
# DIALECTIC_LEVELS__max__MODEL_CONFIG__FALLBACK__MODEL=gemini-2.5-pro
# DIALECTIC_LEVELS__max__MODEL_CONFIG__FALLBACK__TRANSPORT=gemini
# DIALECTIC_LEVELS__high__MODEL_CONFIG__FALLBACK__MODEL=
# DIALECTIC_LEVELS__high__MODEL_CONFIG__FALLBACK__TRANSPORT=
# DIALECTIC_LEVELS__medium__MODEL_CONFIG__FALLBACK__MODEL=
# DIALECTIC_LEVELS__medium__MODEL_CONFIG__FALLBACK__TRANSPORT=
# DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__MODEL=
# DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__TRANSPORT=
# DIALECTIC_LEVELS__minimal__MODEL_CONFIG__FALLBACK__MODEL=
# DIALECTIC_LEVELS__minimal__MODEL_CONFIG__FALLBACK__TRANSPORT=

# =============================================================================
# Summary
Expand All @@ -186,6 +199,9 @@ LLM_OPENAI_API_KEY=your-api-key-here
# SUMMARY_MAX_TOKENS_SHORT=1000
# SUMMARY_MAX_TOKENS_LONG=4000
# SUMMARY_MODEL_CONFIG__FALLBACK__MODEL=
# SUMMARY_MODEL_CONFIG__FALLBACK__TRANSPORT=
# SUMMARY_MODEL_CONFIG__FALLBACK__OVERRIDES__BASE_URL=
# SUMMARY_MODEL_CONFIG__FALLBACK__OVERRIDES__API_KEY_ENV=

# =============================================================================
# Dream
Expand All @@ -201,6 +217,10 @@ LLM_OPENAI_API_KEY=your-api-key-here
# DREAM_DEDUCTION_MODEL_CONFIG__OVERRIDES__BASE_URL=https://openrouter.ai/api/v1
# DREAM_INDUCTION_MODEL_CONFIG__MODEL=your-model-here
# DREAM_INDUCTION_MODEL_CONFIG__OVERRIDES__BASE_URL=https://openrouter.ai/api/v1
# DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__MODEL=
# DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=
# DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__MODEL=
# DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=
# DREAM_DOCUMENT_THRESHOLD=50
# DREAM_IDLE_TIMEOUT_MINUTES=60
# DREAM_MIN_HOURS_BETWEEN_DREAMS=8
Expand Down
52 changes: 52 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,55 @@ src/
### Notes

- Always prefix Python-related commands with `uv run` (or invoke them through `uv`) so that they run inside the project's virtual environment

## LLM Model Fallback

Honcho supports automatic fallback to a secondary LLM model when the primary model fails with a retryable error (rate limit, timeout, connection error, or 5xx server error).

### How It Works

1. **First-failure trigger**: When the primary model returns a retryable error (429, 5xx, timeout, connection error), the system immediately switches to the fallback model on the next attempt — it does NOT wait for the primary model's retries to be exhausted.
2. **Per-agent configuration**: Each agent (Deriver, Dialectic, Dreamer, Summary) has independent fallback configuration.
3. **Cross-provider support**: The fallback model can use a different provider (e.g., primary=openai, fallback=lmstudio).
4. **Backward compatible**: If no fallback is configured, behavior is identical to before.

### Configuration

Set these environment variables for each agent:

```bash
# Deriver
DERIVER_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
DERIVER_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b

# Dialectic (per reasoning level)
DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
DIALECTIC_LEVELS__low__MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b

# Summary
SUMMARY_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
SUMMARY_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b

# Dream
DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
DREAM_DEDUCTION_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__TRANSPORT=lmstudio
DREAM_INDUCTION_MODEL_CONFIG__FALLBACK__MODEL=qwen/qwen3.5-9b
```

### Observability

- **Logs**: WARNING-level log emitted when fallback is activated, including primary→fallback provider/model info
- **Langfuse**: Generation span metadata includes `is_fallback: true` when fallback model is used
- **No separate span**: Only the successful attempt's span is kept (failed attempts do not create separate spans)

### Retryable Errors

The following errors trigger fast fallback:
- HTTP 429 (Too Many Requests / Rate Limit)
- HTTP 5xx (Server Errors)
- `TimeoutError`
- `ConnectionError`
- SDK-specific: `APIConnectionError`, `APITimeoutError`, `InternalServerError`, `ServiceUnavailableError`, `RateLimitError`

Non-retryable errors (e.g., HTTP 400, 422, or synchronous `ValueError`) do NOT trigger fallback — they are surfaced immediately and not retried.
11 changes: 9 additions & 2 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

logger = logging.getLogger(__name__)

ModelTransport = Literal["anthropic", "openai", "gemini"]
ModelTransport = Literal["anthropic", "openai", "gemini", "lmstudio"]
EmbeddingTransport = Literal["openai", "gemini"]
EmbeddingDimensionsMode = Literal["auto", "always", "never"]

Expand Down Expand Up @@ -95,7 +95,7 @@ def _normalize_model_transport(data: Any) -> Any:
transport_value = update.get("transport")
if isinstance(model_value, str) and "/" in model_value and transport_value is None:
prefix, bare_model = model_value.split("/", 1)
if prefix in {"anthropic", "openai", "gemini"}:
if prefix in {"anthropic", "openai", "gemini", "lmstudio"}:
update["transport"] = prefix
update["model"] = bare_model
return update
Expand Down Expand Up @@ -654,6 +654,8 @@ class LLMSettings(HonchoSettings):
ANTHROPIC_API_KEY: str | None = None
OPENAI_API_KEY: str | None = None
GEMINI_API_KEY: str | None = None
LMSTUDIO_API_KEY: str | None = None
LMSTUDIO_BASE_URL: str | None = None

# Base URLs for LLM providers (for OpenAI-compatible proxies like
# OpenRouter, vLLM, Together, Anyscale, self-hosted, etc.)
Expand All @@ -664,6 +666,11 @@ class LLMSettings(HonchoSettings):
# General LLM settings
DEFAULT_MAX_TOKENS: Annotated[int, Field(default=1000, gt=0, le=100_000)] = 2500

# Default timeout in seconds for all LLM provider HTTP clients.
# Set explicitly (and configurable via LLM_DEFAULT_TIMEOUT) rather than relying
# on each SDK's own default, so slower local models (e.g. LMStudio fallback)
# can be accommodated without hardcoding a value in registry.py.
DEFAULT_TIMEOUT: Annotated[float, Field(default=180.0, gt=0)] = 180.0

# Maximum characters for tool output to prevent token explosion.
# Set to 10,000 chars (~2,500 tokens at 4 chars/token) to stay well under
# typical context limits while providing substantial tool output.
Expand Down
Loading