From 51f8f662b93a527f7cd1c7e3827eae92cd90ae88 Mon Sep 17 00:00:00 2001 From: Albertchamberlain <47343901+Albertchamberlain@users.noreply.github.com> Date: Thu, 18 Jun 2026 14:37:40 +0800 Subject: [PATCH] Add configurable LLM provider layer Introduce a provider adapter so FsExplorer can use Google Gemini or OpenAI-compatible APIs (SiliconFlow, OpenAI) via FS_EXPLORER_LLM_PROVIDER and related env vars. Google remains the default with native JSON schema output. OpenAI-compatible providers use response_format=json_object plus flexible action parsing for imperfect model output. Includes PROVIDERS.md, updated tests, and optional live integration test gated on SILICONFLOW_API_KEY. --- .env.example | 21 ++- PROVIDERS.md | 64 +++++++++ README.md | 17 ++- pyproject.toml | 1 + src/fs_explorer/agent.py | 202 ++++++++++++++++----------- src/fs_explorer/llm/__init__.py | 14 ++ src/fs_explorer/llm/action_parser.py | 109 +++++++++++++++ src/fs_explorer/llm/base.py | 42 ++++++ src/fs_explorer/llm/config.py | 87 ++++++++++++ src/fs_explorer/llm/factory.py | 41 ++++++ src/fs_explorer/llm/google_client.py | 63 +++++++++ src/fs_explorer/llm/openai_client.py | 75 ++++++++++ src/fs_explorer/llm/prompts.py | 28 ++++ src/fs_explorer/ui.html | 2 +- tests/conftest.py | 83 ++++------- tests/test_agent.py | 113 ++++++++------- tests/test_llm_config.py | 51 +++++++ tests/test_llm_integration.py | 29 ++++ 18 files changed, 842 insertions(+), 200 deletions(-) create mode 100644 PROVIDERS.md create mode 100644 src/fs_explorer/llm/__init__.py create mode 100644 src/fs_explorer/llm/action_parser.py create mode 100644 src/fs_explorer/llm/base.py create mode 100644 src/fs_explorer/llm/config.py create mode 100644 src/fs_explorer/llm/factory.py create mode 100644 src/fs_explorer/llm/google_client.py create mode 100644 src/fs_explorer/llm/openai_client.py create mode 100644 src/fs_explorer/llm/prompts.py create mode 100644 tests/test_llm_config.py create mode 100644 tests/test_llm_integration.py diff --git a/.env.example b/.env.example index a68315a..9740d4c 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,23 @@ -# Google Gemini API Key +# LLM provider selection: google | siliconflow | openai +FS_EXPLORER_LLM_PROVIDER=google + +# Optional model override (provider-specific default if unset) +# FS_EXPLORER_LLM_MODEL=gemini-3-flash-preview +# FS_EXPLORER_LLM_MODEL=Qwen/Qwen2.5-72B-Instruct + +# Optional base URL override for OpenAI-compatible providers +# FS_EXPLORER_LLM_BASE_URL=https://api.siliconflow.cn/v1 + +# --- Google Gemini --- # Get yours at: https://aistudio.google.com/apikey -GOOGLE_API_KEY=your_api_key_here +GOOGLE_API_KEY=your_google_api_key_here + +# --- SiliconFlow (OpenAI-compatible) --- +# Get yours at: https://cloud.siliconflow.cn/account/ak +SILICONFLOW_API_KEY=your_siliconflow_api_key_here + +# --- OpenAI (optional) --- +# OPENAI_API_KEY=your_openai_api_key_here # Optional: dedicated key for langextract metadata mode. # If unset, indexing will fall back to GOOGLE_API_KEY. diff --git a/PROVIDERS.md b/PROVIDERS.md new file mode 100644 index 0000000..b8448b3 --- /dev/null +++ b/PROVIDERS.md @@ -0,0 +1,64 @@ +# LLM Provider Configuration + +FsExplorer supports multiple LLM backends through a small provider adapter layer. + +## Quick Start + +### Google Gemini (default) + +```bash +export FS_EXPLORER_LLM_PROVIDER=google +export GOOGLE_API_KEY=your_google_api_key +``` + +### SiliconFlow (OpenAI-compatible) + +```bash +export FS_EXPLORER_LLM_PROVIDER=siliconflow +export SILICONFLOW_API_KEY=your_siliconflow_api_key +# Optional overrides +export FS_EXPLORER_LLM_MODEL=Qwen/Qwen2.5-72B-Instruct +export FS_EXPLORER_LLM_BASE_URL=https://api.siliconflow.cn/v1 +# International endpoint: https://api.siliconflow.com/v1 +``` + +Get a SiliconFlow API key at https://cloud.siliconflow.cn/account/ak + +### OpenAI + +```bash +export FS_EXPLORER_LLM_PROVIDER=openai +export OPENAI_API_KEY=your_openai_api_key +export FS_EXPLORER_LLM_MODEL=gpt-4o-mini +export FS_EXPLORER_LLM_BASE_URL=https://api.openai.com/v1 +``` + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `FS_EXPLORER_LLM_PROVIDER` | `google`, `siliconflow`, or `openai` (default: `google`) | +| `FS_EXPLORER_LLM_MODEL` | Model id override | +| `FS_EXPLORER_LLM_BASE_URL` | Base URL for OpenAI-compatible providers | +| `GOOGLE_API_KEY` | Google Gemini API key | +| `SILICONFLOW_API_KEY` | SiliconFlow API key | +| `OPENAI_API_KEY` | OpenAI API key | + +## Architecture + +``` +FsExplorerAgent + -> llm.create_llm_client() + -> GoogleGeminiClient (native JSON schema) + -> OpenAICompatibleClient (SiliconFlow, OpenAI, ...) +``` + +Google Gemini uses native structured JSON output. OpenAI-compatible providers use `response_format=json_object` plus the Action JSON schema embedded in the system prompt, with flexible parsing for imperfect model output. + +## Embeddings / Indexing + +Vector indexing (`explore index --with-embeddings`) still uses Google Gemini embeddings by default via `GOOGLE_API_KEY`. Chat provider selection is independent of embedding configuration. + +## Security + +Never commit `.env` or real API keys. Use `.env.example` as a template only. diff --git a/README.md b/README.md index a04f857..af0aa3f 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ This video explains the architecture of the project and how to run it. - ๐ **6 Tools**: `scan_folder`, `preview_file`, `parse_file`, `read`, `grep`, `glob` - ๐ **Document Support**: PDF, DOCX, PPTX, XLSX, HTML, Markdown (via Docling) -- ๐ค **Powered by**: Google Gemini 3 Flash with structured JSON output +- ๐ค **Multi-LLM**: Google Gemini, SiliconFlow, OpenAI-compatible APIs - ๐ฐ **Cost Efficient**: ~$0.001 per query with token tracking - ๐ **Web UI**: Real-time WebSocket streaming interface - ๐ **Citations**: Answers include source references @@ -45,13 +45,19 @@ pip install . ## Configuration -Create a `.env` file in the project root: +Create a `.env` file in the project root (see `.env.example`): ```bash +# Google Gemini (default) +FS_EXPLORER_LLM_PROVIDER=google GOOGLE_API_KEY=your_api_key_here + +# SiliconFlow (OpenAI-compatible) +# FS_EXPLORER_LLM_PROVIDER=siliconflow +# SILICONFLOW_API_KEY=your_api_key_here ``` -Get your API key from [Google AI Studio](https://aistudio.google.com/apikey). +See [PROVIDERS.md](PROVIDERS.md) for all supported backends and model overrides. ## Usage @@ -125,7 +131,7 @@ uv run explore --task "Look in data/large_acquisition/. What happens to employee | Component | Technology | |-----------|------------| -| LLM | Google Gemini 3 Flash | +| LLM | Google Gemini / SiliconFlow / OpenAI-compatible | | Document Parsing | Docling (local, open-source) | | Orchestration | LlamaIndex Workflows | | CLI | Typer + Rich | @@ -136,7 +142,8 @@ uv run explore --task "Look in data/large_acquisition/. What happens to employee ``` src/fs_explorer/ -โโโ agent.py # Gemini client, token tracking +โโโ agent.py # Agent + tool registry +โโโ llm/ # Multi-provider LLM adapters โโโ workflow.py # LlamaIndex workflow engine โโโ fs.py # File tools: scan, parse, grep โโโ models.py # Pydantic models for actions diff --git a/pyproject.toml b/pyproject.toml index 9dea965..8488dd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "duckdb>=1.0.0", "fastapi>=0.115.0", "google-genai>=1.55.0", + "openai>=1.60.0", "langextract>=1.0.0", "llama-index-workflows>=2.11.5", "python-dotenv>=1.0.0", diff --git a/src/fs_explorer/agent.py b/src/fs_explorer/agent.py index ce0e25a..850d68a 100644 --- a/src/fs_explorer/agent.py +++ b/src/fs_explorer/agent.py @@ -1,8 +1,7 @@ """ -FsExplorer Agent for filesystem exploration using Google Gemini. +FsExplorer Agent for filesystem exploration using configurable LLM providers. -This module contains the agent that interacts with the Gemini AI model -to make decisions about filesystem exploration actions. +Supports Google Gemini, SiliconFlow, and other OpenAI-compatible APIs. """ import os @@ -12,9 +11,9 @@ from dataclasses import dataclass from dotenv import load_dotenv -from google.genai.types import Content, HttpOptions, Part -from google.genai import Client as GenAIClient +from .llm import ChatMessage, LLMClient, create_llm_client, load_llm_config +from .llm.action_parser import parse_action_json from .models import Action, ActionType, ToolCallAction, Tools from .fs import ( read_file, @@ -43,9 +42,12 @@ # Token Usage Tracking # ============================================================================= -# Gemini Flash pricing (per million tokens) -GEMINI_FLASH_INPUT_COST_PER_MILLION = 0.075 -GEMINI_FLASH_OUTPUT_COST_PER_MILLION = 0.30 +# Reference pricing (USD per million tokens) for optional cost estimates. +_PROVIDER_PRICING: dict[str, tuple[float, float]] = { + "google": (0.075, 0.30), + "siliconflow": (0.59, 0.59), + "openai": (0.15, 0.60), +} @dataclass @@ -53,26 +55,39 @@ class TokenUsage: """ Track token usage and costs across the session. - Maintains running totals of API calls, token counts, and provides - cost estimates based on Gemini Flash pricing. + Maintains running totals of API calls and token counts. Cost estimates + are best-effort and depend on the active provider. """ prompt_tokens: int = 0 completion_tokens: int = 0 total_tokens: int = 0 api_calls: int = 0 + provider_name: str = "google" + model_name: str = "" # Track content sizes tool_result_chars: int = 0 documents_parsed: int = 0 documents_scanned: int = 0 - def add_api_call(self, prompt_tokens: int, completion_tokens: int) -> None: + def add_api_call( + self, + prompt_tokens: int, + completion_tokens: int, + *, + provider_name: str | None = None, + model_name: str | None = None, + ) -> None: """Record token usage from an API call.""" self.prompt_tokens += prompt_tokens self.completion_tokens += completion_tokens self.total_tokens += prompt_tokens + completion_tokens self.api_calls += 1 + if provider_name: + self.provider_name = provider_name + if model_name: + self.model_name = model_name def add_tool_result(self, result: str, tool_name: str) -> None: """Record metrics from a tool execution.""" @@ -85,24 +100,36 @@ def add_tool_result(self, result: str, tool_name: str) -> None: elif tool_name == "preview_file": self.documents_parsed += 1 - def _calculate_cost(self) -> tuple[float, float, float]: - """Calculate estimated costs based on Gemini Flash pricing.""" - input_cost = ( - self.prompt_tokens / 1_000_000 - ) * GEMINI_FLASH_INPUT_COST_PER_MILLION - output_cost = ( - self.completion_tokens / 1_000_000 - ) * GEMINI_FLASH_OUTPUT_COST_PER_MILLION + def _calculate_cost(self) -> tuple[float, float, float] | None: + """Calculate estimated costs when provider pricing is known.""" + pricing = _PROVIDER_PRICING.get(self.provider_name) + if pricing is None: + return None + input_rate, output_rate = pricing + input_cost = (self.prompt_tokens / 1_000_000) * input_rate + output_cost = (self.completion_tokens / 1_000_000) * output_rate return input_cost, output_cost, input_cost + output_cost def summary(self) -> str: """Generate a formatted summary of token usage and costs.""" - input_cost, output_cost, total_cost = self._calculate_cost() + costs = self._calculate_cost() + cost_block = "" + if costs is not None: + input_cost, output_cost, total_cost = costs + cost_block = ( + f"โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n" + f" Est. Cost ({self.provider_name} / {self.model_name or 'default'}):\n" + f" Input: ${input_cost:.4f}\n" + f" Output: ${output_cost:.4f}\n" + f" Total: ${total_cost:.4f}\n" + ) return f""" โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ TOKEN USAGE SUMMARY โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + Provider: {self.provider_name} + Model: {self.model_name or "n/a"} API Calls: {self.api_calls} Prompt Tokens: {self.prompt_tokens:,} Completion Tokens: {self.completion_tokens:,} @@ -111,12 +138,7 @@ def summary(self) -> str: Documents Scanned: {self.documents_scanned} Documents Parsed: {self.documents_parsed} Tool Result Chars: {self.tool_result_chars:,} -โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ - Est. Cost (Gemini Flash): - Input: ${input_cost:.4f} - Output: ${output_cost:.4f} - Total: ${total_cost:.4f} -โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ +{cost_block}โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ """ @@ -540,7 +562,7 @@ def _build_system_prompt(enable_semantic: bool, enable_metadata: bool) -> str: class FsExplorerAgent: """ - AI agent for exploring filesystems using Google Gemini. + AI agent for exploring filesystems using a configurable LLM provider. The agent maintains a conversation history with the LLM and uses structured JSON output to make decisions about which actions to take. @@ -549,31 +571,42 @@ class FsExplorerAgent: token_usage: Tracks API call statistics and costs. """ - def __init__(self, api_key: str | None = None) -> None: + def __init__( + self, + api_key: str | None = None, + *, + llm_client: LLMClient | None = None, + ) -> None: """ - Initialize the agent with Google API credentials. + Initialize the agent with provider credentials from env or args. Args: - api_key: Google API key. If not provided, reads from - GOOGLE_API_KEY environment variable. + api_key: Optional API key override for the active provider. + llm_client: Optional pre-built LLM client (used in tests). Raises: - ValueError: If no API key is available. + ValueError: If provider configuration or API key is missing. """ - if api_key is None: - api_key = os.getenv("GOOGLE_API_KEY") - if api_key is None: - raise ValueError( - "GOOGLE_API_KEY not found within the current environment: " - "please export it or provide it to the class constructor." - ) - - self._client = GenAIClient( - api_key=api_key, - http_options=HttpOptions(api_version="v1beta"), + if llm_client is not None: + self._llm = llm_client + else: + self._llm = create_llm_client(api_key=api_key) + + config = ( + load_llm_config(api_key=api_key) + if llm_client is None + else None + ) + self._chat_history: list[ChatMessage] = [] + self.token_usage = TokenUsage( + provider_name=self._llm.provider_name, + model_name=config.model if config else self._llm.model_name, ) - self._chat_history: list[Content] = [] - self.token_usage = TokenUsage() + + @property + def llm_client(self) -> LLMClient: + """Return the active LLM backend.""" + return self._llm def configure_task(self, task: str) -> None: """ @@ -582,51 +615,53 @@ def configure_task(self, task: str) -> None: Args: task: The task or context to add to the conversation. """ - self._chat_history.append( - Content(role="user", parts=[Part.from_text(text=task)]) - ) + self._chat_history.append(ChatMessage(role="user", content=task)) async def take_action(self) -> tuple[Action, ActionType] | None: """ Request the next action from the AI model. - Sends the current conversation history to Gemini and receives - a structured JSON response indicating the next action to take. + Sends the current conversation history to the configured provider and + receives a structured JSON response indicating the next action. Returns: A tuple of (Action, ActionType) if successful, None otherwise. """ - response = await self._client.aio.models.generate_content( - model="gemini-3-flash-preview", - contents=self._chat_history, # type: ignore - config={ - "system_instruction": _build_system_prompt(_ENABLE_SEMANTIC, _ENABLE_METADATA), - "response_mime_type": "application/json", - "response_schema": Action, - }, - ) - - # Track token usage from response metadata - if response.usage_metadata: - self.token_usage.add_api_call( - prompt_tokens=response.usage_metadata.prompt_token_count or 0, - completion_tokens=response.usage_metadata.candidates_token_count or 0, + try: + response_text, usage = await self._llm.generate_action_json( + messages=self._chat_history, + system_instruction=_build_system_prompt( + _ENABLE_SEMANTIC, _ENABLE_METADATA + ), ) + except Exception as exc: + print(f"LLM request failed ({self._llm.provider_name}): {exc}") + return None + + self.token_usage.add_api_call( + prompt_tokens=usage.prompt_tokens, + completion_tokens=usage.completion_tokens, + provider_name=self._llm.provider_name, + model_name=self._llm.model_name, + ) - if response.candidates is not None: - if response.candidates[0].content is not None: - self._chat_history.append(response.candidates[0].content) - if response.text is not None: - action = Action.model_validate_json(response.text) - if action.to_action_type() == "toolcall": - toolcall = cast(ToolCallAction, action.action) - self.call_tool( - tool_name=toolcall.tool_name, - tool_input=toolcall.to_fn_args(), - ) - return action, action.to_action_type() + self._chat_history.append( + ChatMessage(role="assistant", content=response_text) + ) - return None + try: + action, action_type = parse_action_json(response_text) + except ValueError as exc: + print(f"Failed to parse LLM response: {exc}") + return None + + if action_type == "toolcall": + toolcall = cast(ToolCallAction, action.action) + self.call_tool( + tool_name=toolcall.tool_name, + tool_input=toolcall.to_fn_args(), + ) + return action, action_type def call_tool(self, tool_name: Tools, tool_input: dict[str, Any]) -> None: """ @@ -648,15 +683,16 @@ def call_tool(self, tool_name: Tools, tool_input: dict[str, Any]) -> None: self.token_usage.add_tool_result(result, tool_name) self._chat_history.append( - Content( + ChatMessage( role="user", - parts=[ - Part.from_text(text=f"Tool result for {tool_name}:\n\n{result}") - ], + content=f"Tool result for {tool_name}:\n\n{result}", ) ) def reset(self) -> None: """Reset the agent's conversation history and token tracking.""" self._chat_history.clear() - self.token_usage = TokenUsage() + self.token_usage = TokenUsage( + provider_name=self._llm.provider_name, + model_name=self._llm.model_name, + ) diff --git a/src/fs_explorer/llm/__init__.py b/src/fs_explorer/llm/__init__.py new file mode 100644 index 0000000..47fda1d --- /dev/null +++ b/src/fs_explorer/llm/__init__.py @@ -0,0 +1,14 @@ +"""LLM provider adapters for FsExplorer.""" + +from .config import LLMConfig, load_llm_config +from .factory import create_llm_client +from .base import ChatMessage, LLMClient, LLMUsage + +__all__ = [ + "ChatMessage", + "LLMClient", + "LLMUsage", + "LLMConfig", + "load_llm_config", + "create_llm_client", +] diff --git a/src/fs_explorer/llm/action_parser.py b/src/fs_explorer/llm/action_parser.py new file mode 100644 index 0000000..6624ac1 --- /dev/null +++ b/src/fs_explorer/llm/action_parser.py @@ -0,0 +1,109 @@ +"""Parse agent actions from LLM JSON with flexible recovery.""" + +from __future__ import annotations + +import json +from typing import cast + +from ..models import ( + Action, + ActionType, + AskHumanAction, + GoDeeperAction, + StopAction, + ToolCallAction, + ToolCallArg, + Tools, +) + + +def parse_action_json(json_str: str) -> tuple[Action, ActionType]: + """Parse an action from JSON, with flexible recovery for weaker models.""" + try: + action = Action.model_validate_json(json_str) + return action, action.to_action_type() + except Exception: + pass + + try: + raw = json.loads(json_str) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON: {json_str[:200]}") from exc + + action_data = raw.get("action", {}) + reason = raw.get("reason", "") + + if isinstance(action_data, dict) and "final_result" in action_data: + return Action( + action=StopAction(final_result=str(action_data["final_result"])), + reason=str(reason), + ), "stop" + + if isinstance(action_data, dict) and action_data.get("tool_name") == "final_result": + answer = reason or str(action_data.get("tool_input", "")) + return Action( + action=StopAction(final_result=answer), + reason="Recovered: tool_name was final_result", + ), "stop" + + if isinstance(action_data, dict) and "directory" in action_data and "tool_name" not in action_data: + return Action( + action=GoDeeperAction(directory=str(action_data["directory"])), + reason=str(reason), + ), "godeeper" + + if isinstance(action_data, dict) and "question" in action_data: + return Action( + action=AskHumanAction(question=str(action_data["question"])), + reason=str(reason), + ), "askhuman" + + if not isinstance(action_data, dict): + raise ValueError(f"Unsupported action payload: {json_str[:200]}") + + tool_name = action_data.get("tool_name") + tool_input_raw = action_data.get("tool_input", []) + if not tool_name: + raise ValueError(f"Missing tool_name in action: {json_str[:200]}") + + args = _normalize_tool_args(tool_input_raw) + tool = cast(Tools, tool_name) + return Action( + action=ToolCallAction(tool_name=tool, tool_input=args), + reason=str(reason), + ), "toolcall" + + +def _normalize_tool_args(tool_input_raw: object) -> list[ToolCallArg]: + if isinstance(tool_input_raw, dict): + return [ + ToolCallArg(parameter_name=str(key), parameter_value=value) + for key, value in tool_input_raw.items() + ] + + if not isinstance(tool_input_raw, list): + return [] + + args: list[ToolCallArg] = [] + for item in tool_input_raw: + if isinstance(item, ToolCallArg): + args.append(item) + elif isinstance(item, dict): + if "parameter_name" in item and "parameter_value" in item: + args.append( + ToolCallArg( + parameter_name=str(item["parameter_name"]), + parameter_value=item["parameter_value"], + ) + ) + elif len(item) == 1: + key, value = next(iter(item.items())) + args.append( + ToolCallArg(parameter_name=str(key), parameter_value=value) + ) + else: + for key, value in item.items(): + args.append( + ToolCallArg(parameter_name=str(key), parameter_value=value) + ) + return args diff --git a/src/fs_explorer/llm/base.py b/src/fs_explorer/llm/base.py new file mode 100644 index 0000000..51d7ef9 --- /dev/null +++ b/src/fs_explorer/llm/base.py @@ -0,0 +1,42 @@ +"""Shared types for LLM provider adapters.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol + + +@dataclass(frozen=True) +class ChatMessage: + """Provider-neutral chat message.""" + + role: str + content: str + + +@dataclass(frozen=True) +class LLMUsage: + """Token usage from a single LLM call.""" + + prompt_tokens: int = 0 + completion_tokens: int = 0 + + +class LLMClient(Protocol): + """Interface implemented by all LLM backends.""" + + @property + def provider_name(self) -> str: + """Human-readable provider id, e.g. google or siliconflow.""" + + @property + def model_name(self) -> str: + """Model id sent to the provider API.""" + + async def generate_action_json( + self, + *, + messages: list[ChatMessage], + system_instruction: str, + ) -> tuple[str, LLMUsage]: + """Return structured action JSON and token usage.""" diff --git a/src/fs_explorer/llm/config.py b/src/fs_explorer/llm/config.py new file mode 100644 index 0000000..10fd1da --- /dev/null +++ b/src/fs_explorer/llm/config.py @@ -0,0 +1,87 @@ +"""Environment-based LLM provider configuration.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Literal + +LLMProviderName = Literal["google", "siliconflow", "openai"] + +_DEFAULT_MODELS: dict[LLMProviderName, str] = { + "google": "gemini-3-flash-preview", + "siliconflow": "Qwen/Qwen2.5-72B-Instruct", + "openai": "gpt-4o-mini", +} + +_DEFAULT_BASE_URLS: dict[LLMProviderName, str] = { + "siliconflow": "https://api.siliconflow.cn/v1", + "openai": "https://api.openai.com/v1", +} + + +@dataclass(frozen=True) +class LLMConfig: + """Resolved LLM settings for the active provider.""" + + provider: LLMProviderName + api_key: str + model: str + base_url: str | None = None + + +def _resolve_provider() -> LLMProviderName: + raw = os.getenv("FS_EXPLORER_LLM_PROVIDER", "google").strip().lower() + aliases = { + "gemini": "google", + "google": "google", + "siliconflow": "siliconflow", + "silicon-flow": "siliconflow", + "sf": "siliconflow", + "openai": "openai", + } + provider = aliases.get(raw) + if provider is None: + supported = ", ".join(sorted(set(aliases.values()))) + raise ValueError( + f"Unsupported FS_EXPLORER_LLM_PROVIDER={raw!r}. " + f"Supported values: {supported}" + ) + return provider # type: ignore[return-value] + + +def _resolve_api_key(provider: LLMProviderName, explicit_key: str | None) -> str: + if explicit_key: + return explicit_key + + env_keys: dict[LLMProviderName, tuple[str, ...]] = { + "google": ("GOOGLE_API_KEY",), + "siliconflow": ("SILICONFLOW_API_KEY", "OPENAI_API_KEY"), + "openai": ("OPENAI_API_KEY",), + } + for env_name in env_keys[provider]: + value = os.getenv(env_name) + if value: + return value + + expected = " or ".join(env_keys[provider]) + raise ValueError( + f"No API key found for provider {provider!r}. " + f"Set {expected}, or pass api_key to FsExplorerAgent." + ) + + +def load_llm_config(*, api_key: str | None = None) -> LLMConfig: + """Load provider settings from environment variables.""" + provider = _resolve_provider() + resolved_key = _resolve_api_key(provider, api_key) + model = os.getenv("FS_EXPLORER_LLM_MODEL", _DEFAULT_MODELS[provider]) + base_url = os.getenv("FS_EXPLORER_LLM_BASE_URL") + if base_url is None and provider in _DEFAULT_BASE_URLS: + base_url = _DEFAULT_BASE_URLS[provider] + return LLMConfig( + provider=provider, + api_key=resolved_key, + model=model, + base_url=base_url, + ) diff --git a/src/fs_explorer/llm/factory.py b/src/fs_explorer/llm/factory.py new file mode 100644 index 0000000..4712d32 --- /dev/null +++ b/src/fs_explorer/llm/factory.py @@ -0,0 +1,41 @@ +"""Factory for LLM provider clients.""" + +from __future__ import annotations + +from .base import LLMClient +from .config import LLMConfig, load_llm_config +from .google_client import GoogleGeminiClient +from .openai_client import OpenAICompatibleClient + + +from .prompts import action_schema_instructions + + +def create_llm_client(*, api_key: str | None = None, config: LLMConfig | None = None) -> LLMClient: + """Instantiate the configured LLM backend.""" + resolved = config or load_llm_config(api_key=api_key) + + if resolved.provider == "google": + return GoogleGeminiClient(api_key=resolved.api_key, model=resolved.model) + + if resolved.provider == "siliconflow": + if not resolved.base_url: + raise ValueError("FS_EXPLORER_LLM_BASE_URL is required for siliconflow") + return OpenAICompatibleClient( + api_key=resolved.api_key, + model=resolved.model, + base_url=resolved.base_url, + provider_name="siliconflow", + ) + + if resolved.provider == "openai": + if not resolved.base_url: + raise ValueError("FS_EXPLORER_LLM_BASE_URL is required for openai") + return OpenAICompatibleClient( + api_key=resolved.api_key, + model=resolved.model, + base_url=resolved.base_url, + provider_name="openai", + ) + + raise ValueError(f"Unsupported provider: {resolved.provider}") diff --git a/src/fs_explorer/llm/google_client.py b/src/fs_explorer/llm/google_client.py new file mode 100644 index 0000000..47d1d3d --- /dev/null +++ b/src/fs_explorer/llm/google_client.py @@ -0,0 +1,63 @@ +"""Google Gemini LLM client.""" + +from __future__ import annotations + +from google.genai import Client as GenAIClient +from google.genai.types import Content, HttpOptions, Part + +from ..models import Action +from .base import ChatMessage, LLMUsage + + +class GoogleGeminiClient: + """Gemini backend with native JSON schema support.""" + + def __init__(self, *, api_key: str, model: str) -> None: + self._client = GenAIClient( + api_key=api_key, + http_options=HttpOptions(api_version="v1beta"), + ) + self._model = model + + @property + def provider_name(self) -> str: + return "google" + + @property + def model_name(self) -> str: + return self._model + + async def generate_action_json( + self, + *, + messages: list[ChatMessage], + system_instruction: str, + ) -> tuple[str, LLMUsage]: + contents = [ + Content( + role="user" if message.role == "user" else "model", + parts=[Part.from_text(text=message.content)], + ) + for message in messages + ] + response = await self._client.aio.models.generate_content( + model=self._model, + contents=contents, # type: ignore[arg-type] + config={ + "system_instruction": system_instruction, + "response_mime_type": "application/json", + "response_schema": Action, + }, + ) + + usage = LLMUsage() + if response.usage_metadata: + usage = LLMUsage( + prompt_tokens=response.usage_metadata.prompt_token_count or 0, + completion_tokens=response.usage_metadata.candidates_token_count or 0, + ) + + if response.text is None: + raise RuntimeError("Gemini returned an empty response") + + return response.text, usage diff --git a/src/fs_explorer/llm/openai_client.py b/src/fs_explorer/llm/openai_client.py new file mode 100644 index 0000000..d4a02cc --- /dev/null +++ b/src/fs_explorer/llm/openai_client.py @@ -0,0 +1,75 @@ +"""OpenAI-compatible LLM client (SiliconFlow, OpenAI, etc.).""" + +from __future__ import annotations + +from openai import AsyncOpenAI + +from .base import ChatMessage, LLMUsage +from .prompts import action_schema_instructions + + +class OpenAICompatibleClient: + """Chat-completions backend for OpenAI-compatible APIs.""" + + def __init__( + self, + *, + api_key: str, + model: str, + base_url: str, + provider_name: str, + ) -> None: + self._client = AsyncOpenAI(api_key=api_key, base_url=base_url) + self._model = model + self._provider_name = provider_name + + @property + def provider_name(self) -> str: + return self._provider_name + + @property + def model_name(self) -> str: + return self._model + + async def generate_action_json( + self, + *, + messages: list[ChatMessage], + system_instruction: str, + ) -> tuple[str, LLMUsage]: + schema_hint = ( + f"{system_instruction}\n\n{action_schema_instructions()}" + ) + payload = [ + {"role": "system", "content": schema_hint}, + *[ + {"role": message.role, "content": message.content} + for message in messages + ], + ] + response = await self._client.chat.completions.create( + model=self._model, + messages=payload, # type: ignore[arg-type] + response_format={"type": "json_object"}, + temperature=0.2, + ) + + content = response.choices[0].message.content or "" + usage = LLMUsage() + if response.usage is not None: + usage = LLMUsage( + prompt_tokens=response.usage.prompt_tokens or 0, + completion_tokens=response.usage.completion_tokens or 0, + ) + + json_str = content.strip() + if not json_str.startswith("{"): + start = json_str.find("{") + end = json_str.rfind("}") + 1 + if start >= 0 and end > start: + json_str = json_str[start:end] + + if not json_str: + raise RuntimeError(f"{self._provider_name} returned an empty response") + + return json_str, usage diff --git a/src/fs_explorer/llm/prompts.py b/src/fs_explorer/llm/prompts.py new file mode 100644 index 0000000..1b07261 --- /dev/null +++ b/src/fs_explorer/llm/prompts.py @@ -0,0 +1,28 @@ +"""Compact action-format instructions for OpenAI-compatible models.""" + + +def action_schema_instructions() -> str: + """Return JSON action format guidance for chat-completions backends.""" + return """ +You MUST respond with exactly one JSON object containing `action` and `reason`. + +Stop when you have the final answer: +{"action": {"final_result": "Your answer with citations"}, "reason": "Why you are done"} + +Call a tool: +{"action": {"tool_name": "scan_folder", "tool_input": [{"parameter_name": "directory", "parameter_value": "/path"}]}, "reason": "Why this tool"} + +Navigate into a subdirectory: +{"action": {"directory": "/path/to/subdir"}, "reason": "Why go deeper"} + +Ask the user a clarifying question: +{"action": {"question": "Your question"}, "reason": "Why you need input"} + +Allowed tool_name values: +read, grep, glob, scan_folder, preview_file, parse_file, semantic_search, get_document, list_indexed_documents + +Rules: +- Output JSON only. No markdown fences. +- tool_input must be a list of {"parameter_name": ..., "parameter_value": ...} objects. +- Use stop only when you can answer the user's task. +""" diff --git a/src/fs_explorer/ui.html b/src/fs_explorer/ui.html index 3143c23..ed28d34 100644 --- a/src/fs_explorer/ui.html +++ b/src/fs_explorer/ui.html @@ -1050,7 +1050,7 @@