From 83fd8120532e3f7849c6fdfe78f1a3b881329df8 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Mon, 11 May 2026 15:42:06 +0300
Subject: [PATCH 1/9] fix(cuga_lite): cap bind_tools count and shortlist over
 the limit (#202)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`resolve_model_with_bind_tools` blindly passed every candidate tool to
`LLM.bind_tools()`, which fails hard on Groq/OpenAI (max 128 tools) for
benchmarks like m3-hockey (206 tools).

Add `cuga_lite_bind_tools_max_count` (default 128). When the candidate list
exceeds the cap, run the existing LLM shortlister against the first user
message and bind the top-K most relevant tools (reserving 1 slot for
find_tools when `include_find_tools=True`). When shortlisting is impossible
(no query / shortlister failure / empty ranking), raise an actionable
RuntimeError naming the count, cap, and remedy — silent truncation would
corrupt research comparing native tool-calling against text-mode.

WatsonX / LiteLLM-to-Anthropic users on permissive backends can disable the
cap via `DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT=0`.

Closes #202

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../nodes/cuga_lite/cuga_lite_graph.py        | 191 +++++++++++++++++-
 .../nodes/cuga_lite/prompt_utils.py           |  92 +++++++++
 src/cuga/config.py                            |   1 +
 tests/unit/test_cuga_lite_bind_tools.py       | 165 ++++++++++++++-
 4 files changed, 439 insertions(+), 10 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 41f36395..e30c0512 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -164,6 +164,24 @@ def _extract_code_from_response_tool_calls(response: object) -> str | None:
     return f"```python\nresult = await {name}({args_str})\nprint(result)\n```"
 
 
+def _bind_tools_max_count_from_settings() -> int:
+    """Provider-safe cap on the number of tools passed to ``LLM.bind_tools``.
+
+    Default 128 matches the strictest common provider limit (Groq, OpenAI). Set
+    ``DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT=0`` (or negative)
+    to disable the cap entirely — useful for permissive backends like WatsonX or
+    LiteLLM routing to Anthropic.
+    """
+    try:
+        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_max_count", 128)
+    except Exception:
+        return 128
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return 128
+
+
 def _bind_tools_mode_from_settings() -> str:
     try:
         m = getattr(settings.advanced_features, "cuga_lite_bind_tools_mode", None)
@@ -283,6 +301,123 @@ async def _indexed_tools_for_native_bind(
     return by_name
 
 
+async def _apply_bind_tools_cap_and_merge(
+    bound: List[StructuredTool],
+    *,
+    query: Optional[str],
+    tool_provider: Optional[ToolProviderInterface],
+    llm: Optional[BaseChatModel],
+    max_count: int,
+    include_find_tools: bool,
+    tools_context_ref: Optional[Dict[str, Any]],
+    mode: str,
+) -> List[StructuredTool]:
+    """Enforce the provider-safe ``max_count`` and optionally merge ``find_tools``.
+
+    Under cap → merge ``find_tools`` (when ``include_find_tools``) and return. Over cap →
+    run the existing LLM shortlister (see :meth:`PromptUtils.shortlist_tool_names`) against
+    ``query``, take top-K (reserving 1 slot for ``find_tools`` when applicable), and return
+    the ranked subset.
+
+    Raises ``RuntimeError`` with an actionable message when the cap is exceeded but
+    shortlisting is impossible — no user query, shortlister failure, or empty ranking.
+    Failing loudly is intentional: silent truncation would corrupt research/benchmark
+    results that compare native tool-calling against text-mode.
+    """
+    find_tools_tool = None
+    if include_find_tools and tools_context_ref:
+        candidate = tools_context_ref.get("_lc_bind_tools_find_tools")
+        if candidate is not None:
+            find_tools_tool = candidate
+
+    def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
+        if find_tools_tool is None:
+            return tools
+        ft_name = getattr(find_tools_tool, "name", None) or ""
+        if not ft_name:
+            return tools
+        if ft_name in {getattr(t, "name", "") for t in tools}:
+            return tools
+        return [*tools, find_tools_tool]
+
+    cap_disabled = max_count <= 0
+    if cap_disabled or len(bound) <= max_count:
+        return _append_find_tools(bound)
+
+    query_text = (query or "").strip()
+    if not query_text:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools_mode={mode!r} produced {len(bound)} tools but the "
+            f"provider-safe cap (cuga_lite_bind_tools_max_count) is {max_count}. "
+            f"Shortlisting requires a non-empty user query, but none was provided. Options: "
+            f"(a) ensure the first user message is non-empty so the shortlister can run, "
+            f"(b) raise the cap via DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT "
+            f"for permissive backends (WatsonX, Anthropic via LiteLLM), or "
+            f"(c) set the cap to 0 to disable (Groq/OpenAI will reject)."
+        )
+
+    reserve = 1 if find_tools_tool is not None else 0
+    target_k = max_count - reserve
+    if target_k <= 0:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools_max_count={max_count} is too small to fit even find_tools "
+            f"(reserve={reserve}). Raise the cap."
+        )
+
+    all_apps: List[Any] = []
+    if tool_provider is not None:
+        try:
+            all_apps = await tool_provider.get_apps()
+        except Exception as e:
+            logger.warning("bind_tools cap: tool_provider.get_apps() failed: %s", e)
+
+    logger.info(
+        "bind_tools cap exceeded: mode=%s candidates=%d cap=%d → LLM shortlister to top %d "
+        "(reserve=%d for find_tools)",
+        mode,
+        len(bound),
+        max_count,
+        target_k,
+        reserve,
+    )
+    try:
+        ranked_names = await PromptUtils.shortlist_tool_names(
+            query=query_text,
+            all_tools=bound,
+            all_apps=all_apps,
+            llm=llm,
+            top_k=target_k,
+        )
+    except Exception as e:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister failed reducing {len(bound)} tools to top "
+            f"{target_k} (cap={max_count}): {e!r}. Raise the cap or fix the shortlister LLM."
+        ) from e
+
+    if not ranked_names:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister returned 0 tools for {len(bound)} candidates "
+            f"(cap={max_count}, query={query_text!r}). Cannot proceed safely; raise the cap "
+            f"or refine the query."
+        )
+
+    by_name = {getattr(t, "name", ""): t for t in bound}
+    shortlisted: List[StructuredTool] = []
+    for n in ranked_names:
+        t = by_name.get(n)
+        if t is not None:
+            shortlisted.append(t)
+
+    shortlisted = _append_find_tools(shortlisted)
+    logger.info(
+        "bind_tools cap: shortlisted to %d tools (mode=%s, cap=%d)",
+        len(shortlisted),
+        mode,
+        max_count,
+    )
+    return shortlisted
+
+
 async def resolve_model_with_bind_tools(
     active_model: BaseChatModel,
     *,
@@ -290,6 +425,7 @@ async def resolve_model_with_bind_tools(
     tools_context_ref: Optional[Dict[str, Any]],
     tool_provider: Optional[ToolProviderInterface],
     model_name: Optional[str] = None,
+    query: Optional[str] = None,
 ) -> BaseChatModel:
     """Optionally wrap ``active_model`` with ``bind_tools`` for native tool-calling tests.
 
@@ -299,6 +435,10 @@ async def resolve_model_with_bind_tools(
     - ``cuga_lite_bind_tools_apps``: list of app names (``mode=apps`` or ``apps_and_tools``)
     - ``cuga_lite_bind_tools_tool_names``: StructuredTool ``name`` values (``mode=tools`` or ``apps_and_tools``)
     - ``cuga_lite_bind_tools_include_find_tools``: merge ``find_tools`` into ``all`` / ``apps`` / ``tools`` / ``apps_and_tools``
+    - ``cuga_lite_bind_tools_max_count``: provider-safe cap on the number of tools sent to
+      ``bind_tools``. Default 128 (matches Groq/OpenAI). Set 0 to disable. When the
+      candidate list exceeds the cap, the LLM shortlister picks the top-K most relevant
+      tools for ``query`` (typically the first user message).
 
     Profile ``gpt-oss-20b``: see ``model_runtime_profile.GPT_OSS_20B_RUNTIME_DEFAULTS``.
     """
@@ -317,6 +457,7 @@ async def resolve_model_with_bind_tools(
         settings_tool_names_fn=_bind_tools_tool_names_from_settings,
         settings_include_fn=lambda: _bind_include_find_tools_from_config({}),
     )
+    max_count = _bind_tools_max_count_from_settings()
 
     if mode in ("", "none", "false", "0", "off"):
         if include_find_tools:
@@ -342,9 +483,15 @@ async def resolve_model_with_bind_tools(
                 return active_model
             by_name = await _indexed_tools_for_native_bind(tool_provider, tools_context_ref)
             bound = list(by_name.values())
-            seen: Set[str] = {n for n in by_name}
-            _merge_find_tools_into_bound(
-                bound, seen, include_find_tools=include_find_tools, tools_context_ref=tools_context_ref
+            bound = await _apply_bind_tools_cap_and_merge(
+                bound,
+                query=query,
+                tool_provider=tool_provider,
+                llm=active_model,
+                max_count=max_count,
+                include_find_tools=include_find_tools,
+                tools_context_ref=tools_context_ref,
+                mode=mode,
             )
             if not bound:
                 return active_model
@@ -399,8 +546,15 @@ async def resolve_model_with_bind_tools(
                         missing,
                     )
 
-            _merge_find_tools_into_bound(
-                bound, seen_names, include_find_tools=include_find_tools, tools_context_ref=tools_context_ref
+            bound = await _apply_bind_tools_cap_and_merge(
+                bound,
+                query=query,
+                tool_provider=tool_provider,
+                llm=active_model,
+                max_count=max_count,
+                include_find_tools=include_find_tools,
+                tools_context_ref=tools_context_ref,
+                mode=mode,
             )
             if not bound:
                 return active_model
@@ -432,8 +586,15 @@ async def resolve_model_with_bind_tools(
                             bound.append(t)
                 except Exception as e:
                     logger.warning("bind_tools apps: get_tools(%s) failed: %s", app_name, e)
-            _merge_find_tools_into_bound(
-                bound, seen, include_find_tools=include_find_tools, tools_context_ref=tools_context_ref
+            bound = await _apply_bind_tools_cap_and_merge(
+                bound,
+                query=query,
+                tool_provider=tool_provider,
+                llm=active_model,
+                max_count=max_count,
+                include_find_tools=include_find_tools,
+                tools_context_ref=tools_context_ref,
+                mode=mode,
             )
             if not bound:
                 return active_model
@@ -472,8 +633,15 @@ async def resolve_model_with_bind_tools(
                     "cuga_lite_bind_tools_tool_names not found among provider tools (skipped): %s",
                     missing,
                 )
-            _merge_find_tools_into_bound(
-                bound, seen, include_find_tools=include_find_tools, tools_context_ref=tools_context_ref
+            bound = await _apply_bind_tools_cap_and_merge(
+                bound,
+                query=query,
+                tool_provider=tool_provider,
+                llm=active_model,
+                max_count=max_count,
+                include_find_tools=include_find_tools,
+                tools_context_ref=tools_context_ref,
+                mode=mode,
             )
             if not bound:
                 return active_model
@@ -483,6 +651,10 @@ async def resolve_model_with_bind_tools(
             "Unknown cuga_lite_bind_tools_mode: %s (use none|find_tools|all|apps|tools|apps_and_tools)",
             mode,
         )
+    except RuntimeError:
+        # Actionable cap/shortlist errors from _apply_bind_tools_cap_and_merge are intentional —
+        # surfacing them is required so research/benchmark runs don't silently degrade.
+        raise
     except Exception as e:
         logger.warning("resolve_model_with_bind_tools failed: %s", e)
     return active_model
@@ -1955,6 +2127,7 @@ async def call_model(state: CugaLiteState, config: Optional[RunnableConfig] = No
                     tools_context_ref=tools_context_ref,
                     tool_provider=base_tool_provider,
                     model_name=_runtime_model_name,
+                    query=_first_user_message_text(state.chat_messages),
                 )
 
                 response = await invoke_model.ainvoke(
diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
index 71563c54..87ed02ab 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
@@ -397,6 +397,98 @@ async def find_tools(
 
         return "\n".join(markdown_lines)
 
+    @staticmethod
+    async def shortlist_tool_names(
+        query: str,
+        all_tools: List[StructuredTool],
+        all_apps: List[AppDefinition],
+        llm: Optional[Any] = None,
+        top_k: int = 4,
+        instructions: Optional[str] = None,
+    ) -> List[str]:
+        """Rank tools by relevance to ``query`` and return up to ``top_k`` names (best-first).
+
+        Wraps the same shortlister LLM chain as :meth:`find_tools` but exposes the
+        ranked ``APIDetails.name`` list directly. Used by bind-time shortlisting in
+        ``resolve_model_with_bind_tools`` when the candidate tool count exceeds the
+        configured provider cap.
+        """
+        if top_k <= 0 or not all_tools:
+            return []
+
+        from cuga.backend.llm.models import LLMManager
+        from cuga.backend.cuga_graph.nodes.api.shortlister_agent.prompts.load_prompt import (
+            ShortListerOutputLite,
+        )
+        from cuga.backend.cuga_graph.nodes.shared.base_agent import BaseAgent
+
+        effective_instructions = (
+            instructions
+            if instructions is not None
+            else (
+                f"Return the {top_k} most relevant tools (or fewer if not enough are relevant), "
+                "ordered best-first by relevance. Do not exceed this count."
+            )
+        )
+
+        prompt = create_chat_prompt_from_templates(
+            system_path='./prompts/shortlister/system.jinja2',
+            message_templates=[
+                (
+                    'human',
+                    """
+                Current Apps: {all_apps}
+                Current Available Tools: {all_tools}
+                """,
+                ),
+                ('ai', 'Sure, now give me the intent'),
+                ('human', '{input}'),
+            ],
+        )
+        tools_as_dict: Dict[str, Any] = {}
+        for tool in all_tools:
+            tool_dict = tool.model_dump()
+            if hasattr(tool, 'args_schema') and tool.args_schema:
+                try:
+                    if hasattr(tool.args_schema, 'schema'):
+                        tool_dict['args_schema'] = tool.args_schema.schema()
+                    elif hasattr(tool.args_schema, 'model_json_schema'):
+                        tool_dict['args_schema'] = tool.args_schema.model_json_schema()
+                    else:
+                        tool_dict['args_schema'] = {}
+                except Exception:
+                    tool_dict['args_schema'] = {}
+            else:
+                tool_dict['args_schema'] = {}
+            tools_as_dict[tool.name] = tool_dict
+
+        apps_as_dict = {app.name: app.model_dump() for app in all_apps}
+
+        llm_manager = LLMManager()
+        model = llm or llm_manager.get_model(settings.agent.code.model)
+        chain = BaseAgent.get_chain(prompt, model, ShortListerOutputLite)
+        response = await chain.ainvoke(
+            {
+                "input": query,
+                "all_apps": apps_as_dict,
+                "all_tools": tools_as_dict,
+                "instructions": effective_instructions,
+            }
+        )
+
+        valid_names = {t.name for t in all_tools}
+        ranked: List[str] = []
+        seen: set = set()
+        for api_detail in getattr(response, "result", None) or []:
+            name = getattr(api_detail, "name", None)
+            if not name or name in seen or name not in valid_names:
+                continue
+            seen.add(name)
+            ranked.append(name)
+            if len(ranked) >= top_k:
+                break
+        return ranked
+
     @staticmethod
     def create_find_tools_bound(all_tools: List[StructuredTool], all_apps: List[AppDefinition]):
         """Create a bound version of find_tools with all_tools and all_apps pre-bound.
diff --git a/src/cuga/config.py b/src/cuga/config.py
index 83f2877b..e08d1bbb 100644
--- a/src/cuga/config.py
+++ b/src/cuga/config.py
@@ -181,6 +181,7 @@ def get_all_paths(config, parent_key=""):
     Validator("skills.enabled", default=False),
     Validator("advanced_features.builtin_tools", default=["knowledge"]),
     Validator("advanced_features.cuga_lite_bind_tools_tool_names", default=[]),
+    Validator("advanced_features.cuga_lite_bind_tools_max_count", default=128),
     # Evolve integration
     Validator("evolve.enabled", default=False),
     Validator("evolve.url", default="http://127.0.0.1:8201/sse"),
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index a85c2183..0d293626 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -1,6 +1,6 @@
 """CugaLite native bind_tools resolution (mode=tools by tool name)."""
 
-from unittest.mock import AsyncMock, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 from langchain_core.tools import StructuredTool
@@ -109,3 +109,166 @@ async def test_bind_tools_overlay_includes_shell_tools_not_on_registry():
     model.bind_tools.assert_called_once()
     (bound,), _kwargs = model.bind_tools.call_args
     assert [t.name for t in bound] == ["run_command"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_mode_all_shortlists_when_over_cap():
+    """When mode=all candidate count > max_count, run the LLM shortlister and bind top-K."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        return [t.name for t in all_tools[: min(top_k, 3)]]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=3,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=fake_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query="find me a hockey scorer",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert [t.name for t in bound] == ["tool_000", "tool_001", "tool_002"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_mode_all_raises_when_over_cap_without_query():
+    """Failing loudly is required to avoid silently corrupting benchmark results."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    with patch(
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        return_value=3,
+    ):
+        with pytest.raises(RuntimeError, match="provider-safe cap"):
+            await resolve_model_with_bind_tools(
+                model,
+                configurable={"cuga_lite_bind_tools_mode": "all"},
+                tools_context_ref={},
+                tool_provider=provider,
+                query=None,
+            )
+    model.bind_tools.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_mode_all_no_cap_when_under_threshold():
+    """Under the cap, no shortlister is invoked and all candidates are bound."""
+    tools = [_stub_tool(f"tool_{i}") for i in range(3)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    shortlist_calls = MagicMock()
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=128,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=shortlist_calls,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query="anything",
+        )
+
+    shortlist_calls.assert_not_called()
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert sorted(t.name for t in bound) == ["tool_0", "tool_1", "tool_2"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_mode_all_disabled_cap_binds_everything():
+    """max_count <= 0 disables the cap entirely (WatsonX/permissive backends)."""
+    tools = [_stub_tool(f"tool_{i}") for i in range(50)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    with patch(
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        return_value=0,
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query=None,
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert len(bound) == 50
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_reserves_slot_for_find_tools():
+    """When include_find_tools is on, the cap reserves 1 slot for find_tools."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    captured_top_k = {}
+
+    async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        captured_top_k["value"] = top_k
+        return [t.name for t in all_tools[:top_k]]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=4,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=fake_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                "cuga_lite_bind_tools_include_find_tools": True,
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    assert captured_top_k["value"] == 3
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert [t.name for t in bound] == ["tool_000", "tool_001", "tool_002", "find_tools"]

From b4b154cd219869be8d319a3c4b5b712812cd9320 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Mon, 11 May 2026 16:11:21 +0300
Subject: [PATCH 2/9] fix(cuga_lite): make bind_tools cap pad opt-in and fix
 loguru formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups discovered while exercising the cap path on the m3 hockey
benchmark (206 tools, Groq llama-3, mode=all):

1. Loguru log strings used %-style placeholders so the cap exceeded /
   shortlisted-to messages printed literal "%s" / "%d". Switch to {}.

2. The existing shortlister LLM system prompt is biased toward "the most
   relevant" tools and typically returns 1-4 entries even when asked for
   K=128. Padding the result with the remaining tools (provider order) does
   fill the cap, but binding ~128 native tools pushes the Groq llama model
   into native tool_calls mode, which cuga_lite's code-execution flow does
   not fully exercise — measured regression from 5-7 tool calls / pass to
   0 tool calls / fail on the same task.

   Add `cuga_lite_bind_tools_pad_to_cap` (default False) and keep the
   shortlister's natural ranked prefix as the bound set. Result on the m3
   hockey task: 0/1 to 1/1 pass, 17s, 4 tool calls in code-mode.

Users explicitly chasing "bind every slot the provider allows" can flip the
setting on for that experiment.
---
 .../nodes/cuga_lite/cuga_lite_graph.py        | 54 +++++++++++-
 src/cuga/config.py                            |  1 +
 tests/unit/test_cuga_lite_bind_tools.py       | 82 +++++++++++++++++++
 3 files changed, 133 insertions(+), 4 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index e30c0512..46d57635 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -182,6 +182,29 @@ def _bind_tools_max_count_from_settings() -> int:
         return 128
 
 
+def _bind_tools_pad_to_cap_from_settings() -> bool:
+    """Whether to pad the shortlister output with the remaining tools to fill the cap.
+
+    Default ``False`` — bind only the tools the shortlister deemed relevant (often 1-4
+    on the existing system prompt). cuga_lite is a code-execution agent and exhibits
+    measurable regressions in code-emission when many tools are bound natively (the
+    model tends to switch to native ``tool_calls`` mode, which the code-mode flow
+    doesn't fully exercise).
+
+    Set ``True`` for research scenarios where the user explicitly wants ``mode=all``
+    to bind as many tools as the provider will accept.
+    """
+    try:
+        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_pad_to_cap", False)
+    except Exception:
+        return False
+    if isinstance(raw, bool):
+        return raw
+    if isinstance(raw, str):
+        return raw.strip().lower() in ("true", "1", "yes", "on")
+    return bool(raw)
+
+
 def _bind_tools_mode_from_settings() -> str:
     try:
         m = getattr(settings.advanced_features, "cuga_lite_bind_tools_mode", None)
@@ -372,8 +395,8 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
             logger.warning("bind_tools cap: tool_provider.get_apps() failed: %s", e)
 
     logger.info(
-        "bind_tools cap exceeded: mode=%s candidates=%d cap=%d → LLM shortlister to top %d "
-        "(reserve=%d for find_tools)",
+        "bind_tools cap exceeded: mode={} candidates={} cap={} → LLM shortlister to top {} "
+        "(reserve={} for find_tools)",
         mode,
         len(bound),
         max_count,
@@ -403,17 +426,40 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
 
     by_name = {getattr(t, "name", ""): t for t in bound}
     shortlisted: List[StructuredTool] = []
+    seen_short: Set[str] = set()
     for n in ranked_names:
         t = by_name.get(n)
-        if t is not None:
+        if t is not None and n not in seen_short:
+            seen_short.add(n)
+            shortlisted.append(t)
+
+    # Pad-to-cap is opt-in (off by default) because cuga_lite is a code-execution agent
+    # and binding many tools natively pushes the model toward `tool_calls` mode, which the
+    # code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding
+    # on the m3 hockey benchmark). Users explicitly chasing "true mode=all" can opt in.
+    padded_count = 0
+    if _bind_tools_pad_to_cap_from_settings() and len(shortlisted) < target_k:
+        for t in bound:
+            name = getattr(t, "name", "") or ""
+            if not name or name in seen_short:
+                continue
+            seen_short.add(name)
             shortlisted.append(t)
+            padded_count += 1
+            if len(shortlisted) >= target_k:
+                break
 
     shortlisted = _append_find_tools(shortlisted)
     logger.info(
-        "bind_tools cap: shortlisted to %d tools (mode=%s, cap=%d)",
+        "bind_tools cap: shortlisted to {} tools (mode={}, cap={}, ranked={}, padded={}, "
+        "include_find_tools={}, top_ranked={})",
         len(shortlisted),
         mode,
         max_count,
+        len(ranked_names),
+        padded_count,
+        find_tools_tool is not None,
+        ranked_names[:5],
     )
     return shortlisted
 
diff --git a/src/cuga/config.py b/src/cuga/config.py
index e08d1bbb..c15e512d 100644
--- a/src/cuga/config.py
+++ b/src/cuga/config.py
@@ -182,6 +182,7 @@ def get_all_paths(config, parent_key=""):
     Validator("advanced_features.builtin_tools", default=["knowledge"]),
     Validator("advanced_features.cuga_lite_bind_tools_tool_names", default=[]),
     Validator("advanced_features.cuga_lite_bind_tools_max_count", default=128),
+    Validator("advanced_features.cuga_lite_bind_tools_pad_to_cap", default=False),
     # Evolve integration
     Validator("evolve.enabled", default=False),
     Validator("evolve.url", default="http://127.0.0.1:8201/sse"),
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index 0d293626..fabdf0c2 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -231,6 +231,88 @@ async def test_bind_tools_mode_all_disabled_cap_binds_everything():
     assert len(bound) == 50
 
 
+@pytest.mark.asyncio
+async def test_bind_tools_cap_does_not_pad_by_default():
+    """By default, only the shortlister's ranked tools are bound — cuga_lite is a code-agent
+    and binding many native tools regresses code-emission (see commit context)."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def stingy_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        return ["tool_007"]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=5,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_pad_to_cap_from_settings",
+            return_value=False,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=stingy_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query="anything",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert [t.name for t in bound] == ["tool_007"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_pads_when_opt_in():
+    """When pad_to_cap=True, the shortlist is filled with the remaining tools up to target_k."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def stingy_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        return ["tool_007"]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=5,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_pad_to_cap_from_settings",
+            return_value=True,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=stingy_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query="anything",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    names = [t.name for t in bound]
+    assert len(names) == 5
+    assert names[0] == "tool_007"
+    assert names[1:] == ["tool_000", "tool_001", "tool_002", "tool_003"]
+
+
 @pytest.mark.asyncio
 async def test_bind_tools_cap_reserves_slot_for_find_tools():
     """When include_find_tools is on, the cap reserves 1 slot for find_tools."""

From 6af343166a7893432ed874807ed9c922fd5a35db Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Mon, 11 May 2026 19:21:57 +0300
Subject: [PATCH 3/9] fix(cuga_lite): count find_tools correctly when applying
 bind_tools cap

Per coderabbit review on #203: the under-cap fast path and the over-cap
reserve logic treated find_tools inconsistently.

- When len(bound) == max_count and include_find_tools=True with a
  non-overlapping find_tools, the fast path returned max_count+1 tools and
  the provider rejected the request.
- When the overlay path had already placed find_tools into bound and
  include_find_tools=True, reserve=1 dropped a real shortlist slot even
  though find_tools was already counted in bound.

Track whether find_tools is already in bound, compute an effective_count
for the under-cap check, and only set reserve=1 when find_tools genuinely
needs to be appended.

Add two regression tests for the boundary cases.
---
 .../nodes/cuga_lite/cuga_lite_graph.py        | 21 +++--
 tests/unit/test_cuga_lite_bind_tools.py       | 93 +++++++++++++++++++
 2 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 46d57635..31dbd164 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -353,18 +353,25 @@ async def _apply_bind_tools_cap_and_merge(
         if candidate is not None:
             find_tools_tool = candidate
 
+    # The overlay path (`_indexed_tools_for_native_bind`) can place ``find_tools`` directly
+    # into ``bound``. If it's already there, treat it as in-band so we don't double-count
+    # it in the cap or reserve an extra slot we don't need.
+    find_tools_name = getattr(find_tools_tool, "name", "") or ""
+    find_tools_already_in_bound = bool(find_tools_name) and any(
+        getattr(t, "name", "") == find_tools_name for t in bound
+    )
+    need_to_append_find_tools = find_tools_tool is not None and not find_tools_already_in_bound
+
     def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
-        if find_tools_tool is None:
-            return tools
-        ft_name = getattr(find_tools_tool, "name", None) or ""
-        if not ft_name:
+        if not need_to_append_find_tools:
             return tools
-        if ft_name in {getattr(t, "name", "") for t in tools}:
+        if find_tools_name in {getattr(t, "name", "") for t in tools}:
             return tools
         return [*tools, find_tools_tool]
 
     cap_disabled = max_count <= 0
-    if cap_disabled or len(bound) <= max_count:
+    effective_count = len(bound) + (1 if need_to_append_find_tools else 0)
+    if cap_disabled or effective_count <= max_count:
         return _append_find_tools(bound)
 
     query_text = (query or "").strip()
@@ -379,7 +386,7 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
             f"(c) set the cap to 0 to disable (Groq/OpenAI will reject)."
         )
 
-    reserve = 1 if find_tools_tool is not None else 0
+    reserve = 1 if need_to_append_find_tools else 0
     target_k = max_count - reserve
     if target_k <= 0:
         raise RuntimeError(
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index fabdf0c2..aa709582 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -313,6 +313,99 @@ async def stingy_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instru
     assert names[1:] == ["tool_000", "tool_001", "tool_002", "tool_003"]
 
 
+@pytest.mark.asyncio
+async def test_bind_tools_cap_not_violated_when_at_boundary_with_find_tools():
+    """Boundary case from coderabbit: len(bound) == max_count and include_find_tools=True.
+
+    Without the effective-count check, the under-cap fast path would append find_tools and
+    return max_count+1 tools — provider rejects.
+    """
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(5)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    captured_top_k = {}
+
+    async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        captured_top_k["value"] = top_k
+        return [t.name for t in all_tools[:top_k]]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=5,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=fake_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                "cuga_lite_bind_tools_include_find_tools": True,
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert len(bound) == 5, f"cap violated: bound has {len(bound)} tools (max=5)"
+    assert captured_top_k["value"] == 4, "expected 1 slot reserved for find_tools"
+    assert bound[-1].name == "find_tools"
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_does_not_double_count_find_tools_in_bound():
+    """find_tools is already in `bound` via the overlay path — don't reserve a second slot."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(5)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    # Overlay path puts find_tools into bound directly.
+    provider.get_all_tools = AsyncMock(return_value=tools + [find_tools_tool])
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    captured_top_k = {}
+
+    async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        captured_top_k["value"] = top_k
+        return [t.name for t in all_tools[:top_k]]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=4,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=fake_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                "cuga_lite_bind_tools_include_find_tools": True,
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert len(bound) <= 4
+    # find_tools was already in bound; no slot reservation should happen, so top_k == cap.
+    assert captured_top_k["value"] == 4
+
+
 @pytest.mark.asyncio
 async def test_bind_tools_cap_reserves_slot_for_find_tools():
     """When include_find_tools is on, the cap reserves 1 slot for find_tools."""

From acdbf18b55563117b9f5e65833f7c8abf5a69698 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Mon, 11 May 2026 19:38:06 +0300
Subject: [PATCH 4/9] refactor(prompt_utils): share shortlister payload builder

Per coderabbit nitpick on #203: shortlist_tool_names duplicated the tool
serialization from find_tools but dropped _response_schemas and
_param_constraints, so bind-time ranking could diverge from runtime find_tools
ranking on the same tool set.

Extract PromptUtils._build_shortlister_payload as the single source of truth
for serializing (all_tools, all_apps) into the prompt-friendly dicts. Both
callers now produce identical payloads including the response-schema and
param-constraint fields the LLM uses to rank.
---
 .../nodes/cuga_lite/prompt_utils.py           | 92 +++++++++----------
 1 file changed, 43 insertions(+), 49 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
index 87ed02ab..923a7ce8 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
@@ -205,6 +205,47 @@ def get_tool_docs(tool: StructuredTool) -> tuple[str, str]:
 
         return params_doc, response_doc
 
+    @staticmethod
+    def _build_shortlister_payload(
+        all_tools: List[StructuredTool],
+        all_apps: List[AppDefinition],
+    ) -> tuple[Dict[str, Any], Dict[str, Any]]:
+        """Serialize ``all_tools`` and ``all_apps`` for the shortlister LLM prompt.
+
+        Shared by :meth:`find_tools` (runtime tool discovery) and
+        :meth:`shortlist_tool_names` (bind-time cap reduction). Per coderabbit on
+        cuga-agent#203, keeping a single payload builder prevents the two callers
+        from drifting — both must include ``args_schema``, ``_response_schemas``,
+        and ``_param_constraints`` for the LLM to rank tools consistently.
+        """
+        tools_as_dict: Dict[str, Any] = {}
+        for tool in all_tools:
+            tool_dict = tool.model_dump()
+            if hasattr(tool, 'args_schema') and tool.args_schema:
+                try:
+                    if hasattr(tool.args_schema, 'schema'):
+                        tool_dict['args_schema'] = tool.args_schema.schema()
+                    elif hasattr(tool.args_schema, 'model_json_schema'):
+                        tool_dict['args_schema'] = tool.args_schema.model_json_schema()
+                    else:
+                        tool_dict['args_schema'] = {}
+                except Exception as e:
+                    logger.debug(f"Failed to serialize args_schema for tool {tool.name}: {e}")
+                    tool_dict['args_schema'] = {}
+            else:
+                tool_dict['args_schema'] = {}
+
+            if hasattr(tool, 'func'):
+                if hasattr(tool.func, '_response_schemas'):
+                    tool_dict['_response_schemas'] = tool.func._response_schemas
+                if hasattr(tool.func, '_param_constraints'):
+                    tool_dict['_param_constraints'] = tool.func._param_constraints
+
+            tools_as_dict[tool.name] = tool_dict
+
+        apps_as_dict = {app.name: app.model_dump() for app in all_apps}
+        return tools_as_dict, apps_as_dict
+
     @staticmethod
     async def find_tools(
         query: str,
@@ -246,37 +287,7 @@ async def find_tools(
                 ('human', '{input}'),
             ],
         )
-        # Serialize tools properly, converting args_schema class to dict
-        tools_as_dict = {}
-        for tool in all_tools:
-            tool_dict = tool.model_dump()
-            # Extract and convert args_schema from the tool object (it's an attribute, not in model_dump)
-            if hasattr(tool, 'args_schema') and tool.args_schema:
-                try:
-                    # Try schema() method (Pydantic v1)
-                    if hasattr(tool.args_schema, 'schema'):
-                        tool_dict['args_schema'] = tool.args_schema.schema()
-                    # Try model_json_schema() method (Pydantic v2)
-                    elif hasattr(tool.args_schema, 'model_json_schema'):
-                        tool_dict['args_schema'] = tool.args_schema.model_json_schema()
-                    else:
-                        tool_dict['args_schema'] = {}
-                except Exception as e:
-                    logger.debug(f"Failed to serialize args_schema for tool {tool.name}: {e}")
-                    tool_dict['args_schema'] = {}
-            else:
-                tool_dict['args_schema'] = {}
-
-            # Also ensure response_schemas and param_constraints are included if they exist
-            if hasattr(tool, 'func'):
-                if hasattr(tool.func, '_response_schemas'):
-                    tool_dict['_response_schemas'] = tool.func._response_schemas
-                if hasattr(tool.func, '_param_constraints'):
-                    tool_dict['_param_constraints'] = tool.func._param_constraints
-
-            tools_as_dict[tool.name] = tool_dict
-
-        apps_as_dict = {app.name: app.model_dump() for app in all_apps}
+        tools_as_dict, apps_as_dict = PromptUtils._build_shortlister_payload(all_tools, all_apps)
         from cuga.backend.llm.models import LLMManager
         from cuga.backend.cuga_graph.nodes.api.shortlister_agent.prompts.load_prompt import (
             ShortListerOutputLite,
@@ -445,24 +456,7 @@ async def shortlist_tool_names(
                 ('human', '{input}'),
             ],
         )
-        tools_as_dict: Dict[str, Any] = {}
-        for tool in all_tools:
-            tool_dict = tool.model_dump()
-            if hasattr(tool, 'args_schema') and tool.args_schema:
-                try:
-                    if hasattr(tool.args_schema, 'schema'):
-                        tool_dict['args_schema'] = tool.args_schema.schema()
-                    elif hasattr(tool.args_schema, 'model_json_schema'):
-                        tool_dict['args_schema'] = tool.args_schema.model_json_schema()
-                    else:
-                        tool_dict['args_schema'] = {}
-                except Exception:
-                    tool_dict['args_schema'] = {}
-            else:
-                tool_dict['args_schema'] = {}
-            tools_as_dict[tool.name] = tool_dict
-
-        apps_as_dict = {app.name: app.model_dump() for app in all_apps}
+        tools_as_dict, apps_as_dict = PromptUtils._build_shortlister_payload(all_tools, all_apps)
 
         llm_manager = LLMManager()
         model = llm or llm_manager.get_model(settings.agent.code.model)

From ded870ca00b84325ede4f73f347bd8e21525761e Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Tue, 12 May 2026 06:51:08 +0300
Subject: [PATCH 5/9] fix(cuga_lite): address all open coderabbit findings on
 #203
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four findings addressed in one commit:

1. (major) include_find_tools=True did not guarantee find_tools survived
   shortlisting when the overlay had already placed it into bound. The
   shortlister LLM is free to drop any tool from its ranking input. Fix:
   pull find_tools out of the ranking pool when present, always reserve a
   cap slot when include_find_tools=True, and append find_tools back after
   ranking. As a bonus, max_count=1 + include_find_tools=True now binds
   just find_tools instead of raising.

2. (minor) logger.warning("...get_apps() failed: %s", e) used printf-style
   formatting, which loguru does not interpolate — the actual error reason
   was lost. Switch to "{}".

3. (minor) shortlist_tool_names did not guard whitespace-only queries; the
   guard at the top only checked top_k and all_tools. A whitespace query
   would still hit the LLM and produce arbitrary rankings, bypassing the
   "no query" failure path in the caller. Add `if not query.strip(): return []`.

4. (nit) _build_shortlister_payload caught blind Exception when
   serializing args_schema, masking unexpected bugs. Narrow to
   (AttributeError, TypeError, ValueError).

New regression tests:
- find_tools survives shortlisting when in overlay bound (cap path)
- max_count=1 + include_find_tools=True binds only find_tools
---
 .../nodes/cuga_lite/cuga_lite_graph.py        | 50 +++++++++-------
 .../nodes/cuga_lite/prompt_utils.py           |  8 ++-
 tests/unit/test_cuga_lite_bind_tools.py       | 58 ++++++++++++++++---
 3 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 31dbd164..675db87f 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -353,26 +353,34 @@ async def _apply_bind_tools_cap_and_merge(
         if candidate is not None:
             find_tools_tool = candidate
 
-    # The overlay path (`_indexed_tools_for_native_bind`) can place ``find_tools`` directly
-    # into ``bound``. If it's already there, treat it as in-band so we don't double-count
-    # it in the cap or reserve an extra slot we don't need.
+    # ``find_tools`` may arrive via the overlay (already inside ``bound``) or out-of-band
+    # (only in ``tools_context_ref``). Either way, when ``include_find_tools=True`` we must
+    # *guarantee* it survives shortlisting — the LLM ranker is free to drop any tool from
+    # the ranking pool, so the only safe move is to pull find_tools OUT of the pool, reserve
+    # a cap slot for it, and append it back at the end. (Per coderabbit on #203.)
     find_tools_name = getattr(find_tools_tool, "name", "") or ""
     find_tools_already_in_bound = bool(find_tools_name) and any(
         getattr(t, "name", "") == find_tools_name for t in bound
     )
-    need_to_append_find_tools = find_tools_tool is not None and not find_tools_already_in_bound
+    keep_find_tools = include_find_tools and find_tools_tool is not None
+    # Tools the shortlister actually ranks — strip find_tools out so the ranker can't evict it.
+    ranking_pool: List[StructuredTool] = (
+        [t for t in bound if getattr(t, "name", "") != find_tools_name]
+        if keep_find_tools and find_tools_already_in_bound
+        else bound
+    )
 
     def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
-        if not need_to_append_find_tools:
+        if not keep_find_tools or find_tools_tool is None:
             return tools
         if find_tools_name in {getattr(t, "name", "") for t in tools}:
             return tools
         return [*tools, find_tools_tool]
 
     cap_disabled = max_count <= 0
-    effective_count = len(bound) + (1 if need_to_append_find_tools else 0)
+    effective_count = len(ranking_pool) + (1 if keep_find_tools else 0)
     if cap_disabled or effective_count <= max_count:
-        return _append_find_tools(bound)
+        return _append_find_tools(ranking_pool)
 
     query_text = (query or "").strip()
     if not query_text:
@@ -386,26 +394,24 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
             f"(c) set the cap to 0 to disable (Groq/OpenAI will reject)."
         )
 
-    reserve = 1 if need_to_append_find_tools else 0
+    reserve = 1 if keep_find_tools else 0
     target_k = max_count - reserve
     if target_k <= 0:
-        raise RuntimeError(
-            f"cuga_lite_bind_tools_max_count={max_count} is too small to fit even find_tools "
-            f"(reserve={reserve}). Raise the cap."
-        )
+        # ``max_count=1`` with ``include_find_tools=True``: bind only find_tools.
+        return _append_find_tools([])
 
     all_apps: List[Any] = []
     if tool_provider is not None:
         try:
             all_apps = await tool_provider.get_apps()
         except Exception as e:
-            logger.warning("bind_tools cap: tool_provider.get_apps() failed: %s", e)
+            logger.warning("bind_tools cap: tool_provider.get_apps() failed: {}", e)
 
     logger.info(
         "bind_tools cap exceeded: mode={} candidates={} cap={} → LLM shortlister to top {} "
         "(reserve={} for find_tools)",
         mode,
-        len(bound),
+        len(ranking_pool),
         max_count,
         target_k,
         reserve,
@@ -413,25 +419,25 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
     try:
         ranked_names = await PromptUtils.shortlist_tool_names(
             query=query_text,
-            all_tools=bound,
+            all_tools=ranking_pool,
             all_apps=all_apps,
             llm=llm,
             top_k=target_k,
         )
     except Exception as e:
         raise RuntimeError(
-            f"cuga_lite_bind_tools shortlister failed reducing {len(bound)} tools to top "
-            f"{target_k} (cap={max_count}): {e!r}. Raise the cap or fix the shortlister LLM."
+            f"cuga_lite_bind_tools shortlister failed reducing {len(ranking_pool)} tools to "
+            f"top {target_k} (cap={max_count}): {e!r}. Raise the cap or fix the shortlister LLM."
         ) from e
 
     if not ranked_names:
         raise RuntimeError(
-            f"cuga_lite_bind_tools shortlister returned 0 tools for {len(bound)} candidates "
-            f"(cap={max_count}, query={query_text!r}). Cannot proceed safely; raise the cap "
-            f"or refine the query."
+            f"cuga_lite_bind_tools shortlister returned 0 tools for {len(ranking_pool)} "
+            f"candidates (cap={max_count}, query={query_text!r}). Cannot proceed safely; "
+            f"raise the cap or refine the query."
         )
 
-    by_name = {getattr(t, "name", ""): t for t in bound}
+    by_name = {getattr(t, "name", ""): t for t in ranking_pool}
     shortlisted: List[StructuredTool] = []
     seen_short: Set[str] = set()
     for n in ranked_names:
@@ -446,7 +452,7 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
     # on the m3 hockey benchmark). Users explicitly chasing "true mode=all" can opt in.
     padded_count = 0
     if _bind_tools_pad_to_cap_from_settings() and len(shortlisted) < target_k:
-        for t in bound:
+        for t in ranking_pool:
             name = getattr(t, "name", "") or ""
             if not name or name in seen_short:
                 continue
diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
index 923a7ce8..fceec538 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/prompt_utils.py
@@ -229,7 +229,9 @@ def _build_shortlister_payload(
                         tool_dict['args_schema'] = tool.args_schema.model_json_schema()
                     else:
                         tool_dict['args_schema'] = {}
-                except Exception as e:
+                except (AttributeError, TypeError, ValueError) as e:
+                    # Narrow to expected serialization failures so unexpected bugs propagate
+                    # instead of silently stripping schema (coderabbit on #203).
                     logger.debug(f"Failed to serialize args_schema for tool {tool.name}: {e}")
                     tool_dict['args_schema'] = {}
             else:
@@ -426,6 +428,10 @@ async def shortlist_tool_names(
         """
         if top_k <= 0 or not all_tools:
             return []
+        # A whitespace-only query would otherwise invoke the LLM and produce arbitrary
+        # rankings, defeating the "no query" failure path in the caller (coderabbit on #203).
+        if not query or not query.strip():
+            return []
 
         from cuga.backend.llm.models import LLMManager
         from cuga.backend.cuga_graph.nodes.api.shortlister_agent.prompts.load_prompt import (
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index aa709582..6a0843c8 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -362,20 +362,26 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
 
 @pytest.mark.asyncio
-async def test_bind_tools_cap_does_not_double_count_find_tools_in_bound():
-    """find_tools is already in `bound` via the overlay path — don't reserve a second slot."""
+async def test_bind_tools_cap_guarantees_find_tools_when_in_overlay_bound():
+    """If `find_tools` is in `bound` via the overlay and `include_find_tools=True`, it must
+    survive shortlisting — the LLM ranker is allowed to drop any tool from its ranking input,
+    so we pull find_tools out of the ranking pool and reserve a cap slot for it.
+
+    Regression test for coderabbit comment on #203 (`include_find_tools` contract).
+    """
     tools = [_stub_tool(f"tool_{i:03d}") for i in range(5)]
     find_tools_tool = _stub_tool("find_tools")
     provider = AsyncMock()
-    # Overlay path puts find_tools into bound directly.
     provider.get_all_tools = AsyncMock(return_value=tools + [find_tools_tool])
     provider.get_apps = AsyncMock(return_value=[])
     model = MagicMock()
 
-    captured_top_k = {}
+    captured = {}
 
     async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
-        captured_top_k["value"] = top_k
+        captured["top_k"] = top_k
+        captured["pool_names"] = [t.name for t in all_tools]
+        # Adversarial: rank find_tools out if it appears in the input. With the fix it shouldn't.
         return [t.name for t in all_tools[:top_k]]
 
     with (
@@ -401,9 +407,45 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     model.bind_tools.assert_called_once()
     (bound,), _kwargs = model.bind_tools.call_args
-    assert len(bound) <= 4
-    # find_tools was already in bound; no slot reservation should happen, so top_k == cap.
-    assert captured_top_k["value"] == 4
+    assert len(bound) == 4, f"cap violated: bound has {len(bound)} (cap=4)"
+    # Reserve 1 slot for find_tools, so the shortlister sees top_k = cap - 1.
+    assert captured["top_k"] == 3
+    # find_tools must not be exposed to the ranker (it can't be evicted that way).
+    assert "find_tools" not in captured["pool_names"]
+    # find_tools must end up bound regardless of how the ranker votes.
+    assert "find_tools" in {t.name for t in bound}
+    assert bound[-1].name == "find_tools"
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_binds_only_find_tools_when_max_count_is_one():
+    """`max_count=1` + `include_find_tools=True` should still succeed by binding only
+    find_tools, instead of raising as "cap too small to fit even find_tools"."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(5)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    with patch(
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        return_value=1,
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                "cuga_lite_bind_tools_include_find_tools": True,
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert [t.name for t in bound] == ["find_tools"]
 
 
 @pytest.mark.asyncio

From 11feaced74b04c1b2098702f080bb9b945e64783 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Tue, 12 May 2026 09:36:21 +0300
Subject: [PATCH 6/9] fix(cuga_lite): two more bind_tools cap edge cases
 (coderabbit on #203)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A. Overlay-injected find_tools must respect include_find_tools=False.
   `_indexed_tools_for_native_bind` can place find_tools into `bound`
   independently of config. Previously, when `include_find_tools=False`,
   the code never even *read* the find_tools candidate from
   tools_context_ref, so the overlay-injected copy slipped through —
   consuming a capped slot, leaking through the shortlister, and
   defeating the user's explicit opt-out. Fix: resolve the find_tools
   candidate unconditionally; if `include_find_tools=False` and it's
   already in `bound`, strip it before any cap math runs.

B. LLM-hallucinated shortlist names must fail loudly.
   The existing guard catches `ranked_names == []` but not the case
   where the LLM returns names that don't exist in `ranking_pool` (zero
   matches after the dictionary lookup). Without the new check, we'd
   silently pad with arbitrary `ranking_pool` prefix or return just
   find_tools — recreating exactly the silent degradation this cap path
   exists to prevent. Fix: raise RuntimeError when `shortlisted` is
   empty after the lookup loop, with diagnostic context.

Two regression tests added:
- overlay-injected find_tools stripped when include_find_tools=False
- shortlister returning hallucinated names raises before pad / append

16/16 tests pass.
---
 .../nodes/cuga_lite/cuga_lite_graph.py        | 34 ++++++--
 tests/unit/test_cuga_lite_bind_tools.py       | 87 +++++++++++++++++++
 2 files changed, 115 insertions(+), 6 deletions(-)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 675db87f..0964281a 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -347,21 +347,30 @@ async def _apply_bind_tools_cap_and_merge(
     Failing loudly is intentional: silent truncation would corrupt research/benchmark
     results that compare native tool-calling against text-mode.
     """
+    # Resolve the find_tools candidate *unconditionally* — the overlay path
+    # (``_indexed_tools_for_native_bind``) can inject it into ``bound`` independently of
+    # ``include_find_tools``, so we have to detect it either way to honor an explicit
+    # opt-out. (Coderabbit on #203.)
     find_tools_tool = None
-    if include_find_tools and tools_context_ref:
+    if tools_context_ref:
         candidate = tools_context_ref.get("_lc_bind_tools_find_tools")
         if candidate is not None:
             find_tools_tool = candidate
 
-    # ``find_tools`` may arrive via the overlay (already inside ``bound``) or out-of-band
-    # (only in ``tools_context_ref``). Either way, when ``include_find_tools=True`` we must
-    # *guarantee* it survives shortlisting — the LLM ranker is free to drop any tool from
-    # the ranking pool, so the only safe move is to pull find_tools OUT of the pool, reserve
-    # a cap slot for it, and append it back at the end. (Per coderabbit on #203.)
     find_tools_name = getattr(find_tools_tool, "name", "") or ""
     find_tools_already_in_bound = bool(find_tools_name) and any(
         getattr(t, "name", "") == find_tools_name for t in bound
     )
+    # If the user disabled find_tools but the overlay injected it anyway, strip it from
+    # ``bound`` so it can't consume a capped slot or sneak into the shortlister's input.
+    if not include_find_tools and find_tools_already_in_bound:
+        bound = [t for t in bound if getattr(t, "name", "") != find_tools_name]
+        find_tools_already_in_bound = False
+
+    # When ``include_find_tools=True`` we must *guarantee* find_tools survives shortlisting
+    # — the LLM ranker is free to drop any tool from the ranking pool, so the only safe move
+    # is to pull find_tools OUT of the pool, reserve a cap slot for it, and append it back
+    # at the end.
     keep_find_tools = include_find_tools and find_tools_tool is not None
     # Tools the shortlister actually ranks — strip find_tools out so the ranker can't evict it.
     ranking_pool: List[StructuredTool] = (
@@ -446,6 +455,19 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
             seen_short.add(n)
             shortlisted.append(t)
 
+    # The empty-``ranked_names`` case is caught above. This catches the LLM-hallucinated-names
+    # case: ranked_names is non-empty but none match a tool in ranking_pool. Without this
+    # raise, we'd silently pad (or return just find_tools), recreating the silent degradation
+    # the cap path is meant to prevent. (Coderabbit on #203.)
+    if not shortlisted:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister returned {len(ranked_names)} names but none "
+            f"matched the {len(ranking_pool)} candidates (cap={max_count}, "
+            f"query={query_text!r}, sample_ranked={ranked_names[:5]}). Shortlister LLM "
+            f"hallucinated tool names — raise the cap, fix the shortlister prompt, or "
+            f"refine the query."
+        )
+
     # Pad-to-cap is opt-in (off by default) because cuga_lite is a code-execution agent
     # and binding many tools natively pushes the model toward `tool_calls` mode, which the
     # code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index 6a0843c8..15590038 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -417,6 +417,93 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
     assert bound[-1].name == "find_tools"
 
 
+@pytest.mark.asyncio
+async def test_bind_tools_include_find_tools_false_strips_overlay_find_tools():
+    """Overlay can inject `find_tools` into `bound` regardless of `include_find_tools`.
+
+    When the user sets `include_find_tools=False`, the overlay-injected tool must be stripped
+    so it can't consume a cap slot or be ranked. Regression test for coderabbit on #203.
+    """
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(5)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    # Overlay path puts find_tools into bound directly.
+    provider.get_all_tools = AsyncMock(return_value=tools + [find_tools_tool])
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    captured = {}
+
+    async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        captured["pool_names"] = [t.name for t in all_tools]
+        return [t.name for t in all_tools[:top_k]]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=3,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=fake_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                # NB: include_find_tools is False (default in configurable here)
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    bound_names = {t.name for t in bound}
+    assert "find_tools" not in bound_names, (
+        f"find_tools leaked through overlay despite include_find_tools=False: {bound_names}"
+    )
+    # The shortlister must not have seen find_tools either.
+    assert "find_tools" not in captured["pool_names"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_raises_when_shortlist_names_dont_match_pool():
+    """LLM-hallucinated shortlist names (non-empty list, zero matches in pool) must fail
+    loudly. Without the guard we'd silently pad or bind just find_tools, recreating the
+    silent degradation the cap path exists to prevent. Coderabbit on #203."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(10)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def hallucinating_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        return ["nonexistent_tool_a", "nonexistent_tool_b"]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=3,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=hallucinating_shortlist,
+        ),
+    ):
+        with pytest.raises(RuntimeError, match="hallucinated"):
+            await resolve_model_with_bind_tools(
+                model,
+                configurable={"cuga_lite_bind_tools_mode": "all"},
+                tools_context_ref={},
+                tool_provider=provider,
+                query="hockey",
+            )
+    model.bind_tools.assert_not_called()
+
+
 @pytest.mark.asyncio
 async def test_bind_tools_cap_binds_only_find_tools_when_max_count_is_one():
     """`max_count=1` + `include_find_tools=True` should still succeed by binding only

From ae4215c1134650298b59e4d8e4da4eb07049ed52 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Tue, 12 May 2026 11:02:26 +0300
Subject: [PATCH 7/9] fix(cuga_lite): clamp shortlisted output to target_k
 (coderabbit on #203)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Defense-in-depth: the build-shortlisted loop in _apply_bind_tools_cap_and_merge
trusted PromptUtils.shortlist_tool_names to truncate its own output at top_k.
That trust is fragile — a custom shortlister, a future refactor, or a mocked
path could return more names than requested, in which case shortlisted grew
past target_k, the pad block was skipped (len already ≥ target_k), and the
final bound list exceeded max_count, re-triggering the provider 400 the cap
exists to prevent.

Add an explicit `if len(shortlisted) >= target_k: break` so the call site
enforces its own cap contract regardless of the shortlister's compliance.

Two regression tests added:
- overlong shortlister output clamped to target_k (mode=all, no find_tools)
- same clamp with include_find_tools=True reserving 1 slot
---
 .../nodes/cuga_lite/cuga_lite_graph.py        |  7 ++
 tests/unit/test_cuga_lite_bind_tools.py       | 82 +++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 0964281a..7ff1f5ea 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -454,6 +454,13 @@ def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
         if t is not None and n not in seen_short:
             seen_short.add(n)
             shortlisted.append(t)
+            # Defense-in-depth: enforce the cap at the call site too, in case the
+            # shortlister returns more names than ``top_k`` (custom shortlister, future
+            # refactor, or a mocked path). Without this clamp the bound list could exceed
+            # ``max_count`` and re-trigger the provider 400 the cap exists to prevent.
+            # (Coderabbit on #203.)
+            if len(shortlisted) >= target_k:
+                break
 
     # The empty-``ranked_names`` case is caught above. This catches the LLM-hallucinated-names
     # case: ranked_names is non-empty but none match a tool in ranking_pool. Without this
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index 15590038..0da7d30c 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -504,6 +504,88 @@ async def hallucinating_shortlist(query, all_tools, all_apps, llm=None, top_k=4,
     model.bind_tools.assert_not_called()
 
 
+@pytest.mark.asyncio
+async def test_bind_tools_cap_clamps_shortlist_when_llm_returns_too_many():
+    """If the shortlister (e.g. a non-compliant custom impl or future refactor) returns more
+    valid names than ``top_k``, the call site must still enforce ``target_k`` so the bound
+    list never exceeds the provider-safe cap. Regression test for coderabbit on #203."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(20)]
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def overlong_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        # Deliberately ignore top_k — return every pool name.
+        return [t.name for t in all_tools]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=4,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=overlong_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={"cuga_lite_bind_tools_mode": "all"},
+            tools_context_ref={},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert len(bound) == 4, f"cap violated: bound has {len(bound)} (max=4)"
+    # First four ranked names — earlier names win via the in-order break.
+    assert [t.name for t in bound] == ["tool_000", "tool_001", "tool_002", "tool_003"]
+
+
+@pytest.mark.asyncio
+async def test_bind_tools_cap_clamps_shortlist_with_find_tools_slot():
+    """Same clamp must hold when ``include_find_tools=True`` reserves a slot:
+    ``target_k = max_count - 1`` is the upper bound on shortlisted entries."""
+    tools = [_stub_tool(f"tool_{i:03d}") for i in range(20)]
+    find_tools_tool = _stub_tool("find_tools")
+    provider = AsyncMock()
+    provider.get_all_tools = AsyncMock(return_value=tools)
+    provider.get_apps = AsyncMock(return_value=[])
+    model = MagicMock()
+
+    async def overlong_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instructions=None):
+        return [t.name for t in all_tools]
+
+    with (
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            return_value=4,
+        ),
+        patch(
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.PromptUtils.shortlist_tool_names",
+            side_effect=overlong_shortlist,
+        ),
+    ):
+        await resolve_model_with_bind_tools(
+            model,
+            configurable={
+                "cuga_lite_bind_tools_mode": "all",
+                "cuga_lite_bind_tools_include_find_tools": True,
+            },
+            tools_context_ref={"_lc_bind_tools_find_tools": find_tools_tool},
+            tool_provider=provider,
+            query="hockey",
+        )
+
+    model.bind_tools.assert_called_once()
+    (bound,), _kwargs = model.bind_tools.call_args
+    assert len(bound) == 4, f"cap violated: bound has {len(bound)} (max=4)"
+    # 3 shortlisted + 1 find_tools at the end.
+    assert [t.name for t in bound] == ["tool_000", "tool_001", "tool_002", "find_tools"]
+
+
 @pytest.mark.asyncio
 async def test_bind_tools_cap_binds_only_find_tools_when_max_count_is_one():
     """`max_count=1` + `include_find_tools=True` should still succeed by binding only

From c20968bab28fcc89a9ca2629ae241a3494653218 Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Thu, 14 May 2026 15:11:07 +0300
Subject: [PATCH 8/9] refactor(cuga_lite): extract bind_tools cap/shortlist to
 subpackage (#203)

Addresses Sami's review on #203 (modularity + DRY + docs).

- New subpackage src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/
  - cap.py: holds bind_tools_max_count_from_settings,
    bind_tools_pad_to_cap_from_settings, apply_bind_tools_cap_and_merge,
    plus small private helpers (_resolve_find_tools_overlay,
    _build_ranking_pool, _run_shortlister, _materialize_shortlist,
    _maybe_pad_to_cap).
  - __init__.py: re-exports only what cuga_lite_graph.py needs.
- cuga_lite_graph.py: resolve_model_with_bind_tools stays as orchestration.
  The four mode={all,apps,tools,apps_and_tools} branches now share a local
  _cap_merge_bound(bound) closure (DRY ask), and import the helpers from
  bind_tools/.
- Docstring on resolve_model_with_bind_tools now documents
  cuga_lite_bind_tools_pad_to_cap and adds an Operational cost paragraph
  (one extra LLM round-trip per call_model when cap exceeded; intentional
  RuntimeError when shortlisting cannot run safely).
- settings.toml: documents both cuga_lite_bind_tools_max_count and
  cuga_lite_bind_tools_pad_to_cap inline with the other bind_tools_* knobs
  (env overrides + latency/RuntimeError semantics).
- Tests: patch paths updated for the new module layout; pad_to_cap patches
  now target bind_tools.cap directly since the call site moved with the
  function. 18 tests pass.
---
 .../nodes/cuga_lite/bind_tools/__init__.py    |  17 +
 .../nodes/cuga_lite/bind_tools/cap.py         | 350 ++++++++++++++++++
 .../nodes/cuga_lite/cuga_lite_graph.py        | 303 ++-------------
 src/cuga/settings.toml                        |  14 +
 tests/unit/test_cuga_lite_bind_tools.py       |  32 +-
 5 files changed, 434 insertions(+), 282 deletions(-)
 create mode 100644 src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/__init__.py
 create mode 100644 src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/cap.py

diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/__init__.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/__init__.py
new file mode 100644
index 00000000..036cf0e6
--- /dev/null
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/__init__.py
@@ -0,0 +1,17 @@
+"""bind_tools cap + shortlist machinery for cuga_lite.
+
+Keeps cuga_lite_graph.py focused on orchestration. See :mod:`.cap` for the
+provider-safe cap and shortlister flow.
+"""
+
+from cuga.backend.cuga_graph.nodes.cuga_lite.bind_tools.cap import (
+    apply_bind_tools_cap_and_merge,
+    bind_tools_max_count_from_settings,
+    bind_tools_pad_to_cap_from_settings,
+)
+
+__all__ = [
+    "apply_bind_tools_cap_and_merge",
+    "bind_tools_max_count_from_settings",
+    "bind_tools_pad_to_cap_from_settings",
+]
diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/cap.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/cap.py
new file mode 100644
index 00000000..0365dc6b
--- /dev/null
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/bind_tools/cap.py
@@ -0,0 +1,350 @@
+"""Provider-safe cap and LLM shortlisting for ``bind_tools`` candidate lists.
+
+Strict providers (Groq, OpenAI) reject ``bind_tools`` calls with more than ~128
+tools per request. This module reads the cap from settings, and — when the
+candidate list exceeds it — defers to the same LLM shortlister that runtime
+tool-discovery uses (:meth:`PromptUtils.shortlist_tool_names`) to pick the
+top-K most relevant tools for the user query.
+
+Note: when the cap is exceeded, applying it incurs a single shortlister LLM
+round-trip per ``call_model`` invocation. This is intentional — silent
+truncation would corrupt benchmark results comparing native tool-calling vs
+text-mode. Permissive backends (WatsonX, Anthropic via LiteLLM) can disable
+the cap with ``cuga_lite_bind_tools_max_count=0``.
+"""
+
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from loguru import logger
+from langchain_core.language_models import BaseChatModel
+from langchain_core.tools import StructuredTool
+
+from cuga.backend.cuga_graph.nodes.cuga_lite.prompt_utils import PromptUtils
+from cuga.backend.cuga_graph.nodes.cuga_lite.tool_provider_interface import ToolProviderInterface
+from cuga.config import settings
+
+
+__all__ = [
+    "apply_bind_tools_cap_and_merge",
+    "bind_tools_max_count_from_settings",
+    "bind_tools_pad_to_cap_from_settings",
+]
+
+
+def bind_tools_max_count_from_settings() -> int:
+    """Provider-safe cap on the number of tools passed to ``LLM.bind_tools``.
+
+    Default 128 matches the strictest common provider limit (Groq, OpenAI). Set
+    ``DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT=0`` (or negative)
+    to disable the cap entirely — useful for permissive backends like WatsonX or
+    LiteLLM routing to Anthropic.
+    """
+    try:
+        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_max_count", 128)
+    except Exception:
+        return 128
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return 128
+
+
+def bind_tools_pad_to_cap_from_settings() -> bool:
+    """Whether to pad the shortlister output with the remaining tools to fill the cap.
+
+    Default ``False`` — bind only the tools the shortlister deemed relevant (often 1-4
+    on the existing system prompt). cuga_lite is a code-execution agent and exhibits
+    measurable regressions in code-emission when many tools are bound natively (the
+    model tends to switch to native ``tool_calls`` mode, which the code-mode flow
+    doesn't fully exercise).
+
+    Set ``True`` for research scenarios where the user explicitly wants ``mode=all``
+    to bind as many tools as the provider will accept.
+    """
+    try:
+        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_pad_to_cap", False)
+    except Exception:
+        return False
+    if isinstance(raw, bool):
+        return raw
+    if isinstance(raw, str):
+        return raw.strip().lower() in ("true", "1", "yes", "on")
+    return bool(raw)
+
+
+def _resolve_find_tools_overlay(
+    bound: List[StructuredTool],
+    *,
+    include_find_tools: bool,
+    tools_context_ref: Optional[Dict[str, Any]],
+) -> Tuple[Optional[StructuredTool], str, bool, List[StructuredTool]]:
+    """Resolve the ``find_tools`` overlay candidate and reconcile against ``bound``.
+
+    Returns ``(find_tools_tool, find_tools_name, find_tools_already_in_bound, bound)``.
+
+    The overlay path (``_indexed_tools_for_native_bind``) can inject ``find_tools``
+    into ``bound`` independently of ``include_find_tools``, so we detect it either
+    way to honor an explicit opt-out. If the user disabled it but the overlay
+    injected it anyway, strip it from ``bound`` so it can't consume a capped slot
+    or sneak into the shortlister's input. (Coderabbit on #203.)
+    """
+    find_tools_tool: Optional[StructuredTool] = None
+    if tools_context_ref:
+        candidate = tools_context_ref.get("_lc_bind_tools_find_tools")
+        if candidate is not None:
+            find_tools_tool = candidate
+
+    find_tools_name = getattr(find_tools_tool, "name", "") or ""
+    find_tools_already_in_bound = bool(find_tools_name) and any(
+        getattr(t, "name", "") == find_tools_name for t in bound
+    )
+    if not include_find_tools and find_tools_already_in_bound:
+        bound = [t for t in bound if getattr(t, "name", "") != find_tools_name]
+        find_tools_already_in_bound = False
+    return find_tools_tool, find_tools_name, find_tools_already_in_bound, bound
+
+
+def _build_ranking_pool(
+    bound: List[StructuredTool],
+    *,
+    keep_find_tools: bool,
+    find_tools_name: str,
+    find_tools_already_in_bound: bool,
+) -> List[StructuredTool]:
+    """Strip ``find_tools`` from the ranking pool when we must guarantee it survives.
+
+    When ``include_find_tools=True`` the LLM ranker is free to drop any tool from
+    the ranking pool — pulling find_tools out and appending it back is the only
+    safe way to guarantee it.
+    """
+    if keep_find_tools and find_tools_already_in_bound:
+        return [t for t in bound if getattr(t, "name", "") != find_tools_name]
+    return bound
+
+
+async def _run_shortlister(
+    query_text: str,
+    *,
+    ranking_pool: List[StructuredTool],
+    tool_provider: Optional[ToolProviderInterface],
+    llm: Optional[BaseChatModel],
+    top_k: int,
+    mode: str,
+    max_count: int,
+) -> List[str]:
+    """Run :meth:`PromptUtils.shortlist_tool_names` and validate the result.
+
+    Raises ``RuntimeError`` on shortlister failure or empty ranking — silent
+    truncation would corrupt benchmark results comparing native vs text mode.
+    """
+    all_apps: List[Any] = []
+    if tool_provider is not None:
+        try:
+            all_apps = await tool_provider.get_apps()
+        except Exception as e:
+            logger.warning("bind_tools cap: tool_provider.get_apps() failed: {}", e)
+
+    logger.info(
+        "bind_tools cap exceeded: mode={} candidates={} cap={} → LLM shortlister to top {}",
+        mode,
+        len(ranking_pool),
+        max_count,
+        top_k,
+    )
+    try:
+        ranked_names = await PromptUtils.shortlist_tool_names(
+            query=query_text,
+            all_tools=ranking_pool,
+            all_apps=all_apps,
+            llm=llm,
+            top_k=top_k,
+        )
+    except Exception as e:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister failed reducing {len(ranking_pool)} tools to "
+            f"top {top_k} (cap={max_count}): {e!r}. Raise the cap or fix the shortlister LLM."
+        ) from e
+
+    if not ranked_names:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister returned 0 tools for {len(ranking_pool)} "
+            f"candidates (cap={max_count}, query={query_text!r}). Cannot proceed safely; "
+            f"raise the cap or refine the query."
+        )
+    return ranked_names
+
+
+def _materialize_shortlist(
+    ranked_names: List[str],
+    *,
+    ranking_pool: List[StructuredTool],
+    target_k: int,
+    query_text: str,
+    max_count: int,
+) -> Tuple[List[StructuredTool], Set[str]]:
+    """Map ranker output back to ``StructuredTool`` objects, clamped to ``target_k``.
+
+    Defense-in-depth clamp: enforce ``target_k`` at the call site too, in case the
+    shortlister returns more names than ``top_k`` (custom shortlister, future
+    refactor, or a mocked path). Without this clamp the bound list could exceed
+    ``max_count`` and re-trigger the provider 400 the cap exists to prevent.
+    Raises ``RuntimeError`` if the ranker hallucinated names that don't match any
+    candidate. (Coderabbit on #203.)
+    """
+    by_name = {getattr(t, "name", ""): t for t in ranking_pool}
+    shortlisted: List[StructuredTool] = []
+    seen_short: Set[str] = set()
+    for n in ranked_names:
+        t = by_name.get(n)
+        if t is not None and n not in seen_short:
+            seen_short.add(n)
+            shortlisted.append(t)
+            if len(shortlisted) >= target_k:
+                break
+
+    if not shortlisted:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools shortlister returned {len(ranked_names)} names but none "
+            f"matched the {len(ranking_pool)} candidates (cap={max_count}, "
+            f"query={query_text!r}, sample_ranked={ranked_names[:5]}). Shortlister LLM "
+            f"hallucinated tool names — raise the cap, fix the shortlister prompt, or "
+            f"refine the query."
+        )
+    return shortlisted, seen_short
+
+
+def _maybe_pad_to_cap(
+    shortlisted: List[StructuredTool],
+    *,
+    ranking_pool: List[StructuredTool],
+    seen_short: Set[str],
+    target_k: int,
+) -> int:
+    """Opt-in padding (off by default) — measured regressions on m3 hockey otherwise.
+
+    Padding pushes the model toward native ``tool_calls`` mode, which the code-mode
+    flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding).
+    Users explicitly chasing "true mode=all" can opt in.
+    """
+    if not bind_tools_pad_to_cap_from_settings() or len(shortlisted) >= target_k:
+        return 0
+    padded_count = 0
+    for t in ranking_pool:
+        name = getattr(t, "name", "") or ""
+        if not name or name in seen_short:
+            continue
+        seen_short.add(name)
+        shortlisted.append(t)
+        padded_count += 1
+        if len(shortlisted) >= target_k:
+            break
+    return padded_count
+
+
+async def apply_bind_tools_cap_and_merge(
+    bound: List[StructuredTool],
+    *,
+    query: Optional[str],
+    tool_provider: Optional[ToolProviderInterface],
+    llm: Optional[BaseChatModel],
+    max_count: int,
+    include_find_tools: bool,
+    tools_context_ref: Optional[Dict[str, Any]],
+    mode: str,
+) -> List[StructuredTool]:
+    """Enforce the provider-safe ``max_count`` and optionally merge ``find_tools``.
+
+    Under cap → merge ``find_tools`` (when ``include_find_tools``) and return. Over cap →
+    run the existing LLM shortlister (see :meth:`PromptUtils.shortlist_tool_names`) against
+    ``query``, take top-K (reserving 1 slot for ``find_tools`` when applicable), and return
+    the ranked subset.
+
+    Raises ``RuntimeError`` with an actionable message when the cap is exceeded but
+    shortlisting is impossible — no user query, shortlister failure, or empty ranking.
+    Failing loudly is intentional: silent truncation would corrupt research/benchmark
+    results that compare native tool-calling against text-mode.
+    """
+    bound_in_len = len(bound)
+    (
+        find_tools_tool,
+        find_tools_name,
+        find_tools_already_in_bound,
+        bound,
+    ) = _resolve_find_tools_overlay(
+        bound,
+        include_find_tools=include_find_tools,
+        tools_context_ref=tools_context_ref,
+    )
+
+    keep_find_tools = include_find_tools and find_tools_tool is not None
+    ranking_pool = _build_ranking_pool(
+        bound,
+        keep_find_tools=keep_find_tools,
+        find_tools_name=find_tools_name,
+        find_tools_already_in_bound=find_tools_already_in_bound,
+    )
+
+    def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
+        if not keep_find_tools or find_tools_tool is None:
+            return tools
+        if find_tools_name in {getattr(t, "name", "") for t in tools}:
+            return tools
+        return [*tools, find_tools_tool]
+
+    cap_disabled = max_count <= 0
+    effective_count = len(ranking_pool) + (1 if keep_find_tools else 0)
+    if cap_disabled or effective_count <= max_count:
+        return _append_find_tools(ranking_pool)
+
+    query_text = (query or "").strip()
+    if not query_text:
+        raise RuntimeError(
+            f"cuga_lite_bind_tools_mode={mode!r} produced {bound_in_len} tools but the "
+            f"provider-safe cap (cuga_lite_bind_tools_max_count) is {max_count}. "
+            f"Shortlisting requires a non-empty user query, but none was provided. Options: "
+            f"(a) ensure the first user message is non-empty so the shortlister can run, "
+            f"(b) raise the cap via DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT "
+            f"for permissive backends (WatsonX, Anthropic via LiteLLM), or "
+            f"(c) set the cap to 0 to disable (Groq/OpenAI will reject)."
+        )
+
+    reserve = 1 if keep_find_tools else 0
+    target_k = max_count - reserve
+    if target_k <= 0:
+        return _append_find_tools([])
+
+    ranked_names = await _run_shortlister(
+        query_text,
+        ranking_pool=ranking_pool,
+        tool_provider=tool_provider,
+        llm=llm,
+        top_k=target_k,
+        mode=mode,
+        max_count=max_count,
+    )
+    shortlisted, seen_short = _materialize_shortlist(
+        ranked_names,
+        ranking_pool=ranking_pool,
+        target_k=target_k,
+        query_text=query_text,
+        max_count=max_count,
+    )
+    padded_count = _maybe_pad_to_cap(
+        shortlisted,
+        ranking_pool=ranking_pool,
+        seen_short=seen_short,
+        target_k=target_k,
+    )
+    shortlisted = _append_find_tools(shortlisted)
+    logger.info(
+        "bind_tools cap: shortlisted to {} tools (mode={}, cap={}, ranked={}, padded={}, "
+        "include_find_tools={}, top_ranked={})",
+        len(shortlisted),
+        mode,
+        max_count,
+        len(ranked_names),
+        padded_count,
+        find_tools_tool is not None,
+        ranked_names[:5],
+    )
+    return shortlisted
diff --git a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
index 7ff1f5ea..14735d45 100644
--- a/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
+++ b/src/cuga/backend/cuga_graph/nodes/cuga_lite/cuga_lite_graph.py
@@ -91,6 +91,10 @@
     resolved_runtime_model_name,
     resolve_bind_tools_fields,
 )
+from cuga.backend.cuga_graph.nodes.cuga_lite.bind_tools import (
+    apply_bind_tools_cap_and_merge,
+    bind_tools_max_count_from_settings,
+)
 from cuga.backend.cuga_graph.nodes.cuga_lite.nl_auto_continue_classifier import (
     classify_nl_auto_continue,
     normalize_assistant_text,
@@ -164,47 +168,6 @@ def _extract_code_from_response_tool_calls(response: object) -> str | None:
     return f"```python\nresult = await {name}({args_str})\nprint(result)\n```"
 
 
-def _bind_tools_max_count_from_settings() -> int:
-    """Provider-safe cap on the number of tools passed to ``LLM.bind_tools``.
-
-    Default 128 matches the strictest common provider limit (Groq, OpenAI). Set
-    ``DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT=0`` (or negative)
-    to disable the cap entirely — useful for permissive backends like WatsonX or
-    LiteLLM routing to Anthropic.
-    """
-    try:
-        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_max_count", 128)
-    except Exception:
-        return 128
-    try:
-        return int(raw)
-    except (TypeError, ValueError):
-        return 128
-
-
-def _bind_tools_pad_to_cap_from_settings() -> bool:
-    """Whether to pad the shortlister output with the remaining tools to fill the cap.
-
-    Default ``False`` — bind only the tools the shortlister deemed relevant (often 1-4
-    on the existing system prompt). cuga_lite is a code-execution agent and exhibits
-    measurable regressions in code-emission when many tools are bound natively (the
-    model tends to switch to native ``tool_calls`` mode, which the code-mode flow
-    doesn't fully exercise).
-
-    Set ``True`` for research scenarios where the user explicitly wants ``mode=all``
-    to bind as many tools as the provider will accept.
-    """
-    try:
-        raw = getattr(settings.advanced_features, "cuga_lite_bind_tools_pad_to_cap", False)
-    except Exception:
-        return False
-    if isinstance(raw, bool):
-        return raw
-    if isinstance(raw, str):
-        return raw.strip().lower() in ("true", "1", "yes", "on")
-    return bool(raw)
-
-
 def _bind_tools_mode_from_settings() -> str:
     try:
         m = getattr(settings.advanced_features, "cuga_lite_bind_tools_mode", None)
@@ -324,188 +287,6 @@ async def _indexed_tools_for_native_bind(
     return by_name
 
 
-async def _apply_bind_tools_cap_and_merge(
-    bound: List[StructuredTool],
-    *,
-    query: Optional[str],
-    tool_provider: Optional[ToolProviderInterface],
-    llm: Optional[BaseChatModel],
-    max_count: int,
-    include_find_tools: bool,
-    tools_context_ref: Optional[Dict[str, Any]],
-    mode: str,
-) -> List[StructuredTool]:
-    """Enforce the provider-safe ``max_count`` and optionally merge ``find_tools``.
-
-    Under cap → merge ``find_tools`` (when ``include_find_tools``) and return. Over cap →
-    run the existing LLM shortlister (see :meth:`PromptUtils.shortlist_tool_names`) against
-    ``query``, take top-K (reserving 1 slot for ``find_tools`` when applicable), and return
-    the ranked subset.
-
-    Raises ``RuntimeError`` with an actionable message when the cap is exceeded but
-    shortlisting is impossible — no user query, shortlister failure, or empty ranking.
-    Failing loudly is intentional: silent truncation would corrupt research/benchmark
-    results that compare native tool-calling against text-mode.
-    """
-    # Resolve the find_tools candidate *unconditionally* — the overlay path
-    # (``_indexed_tools_for_native_bind``) can inject it into ``bound`` independently of
-    # ``include_find_tools``, so we have to detect it either way to honor an explicit
-    # opt-out. (Coderabbit on #203.)
-    find_tools_tool = None
-    if tools_context_ref:
-        candidate = tools_context_ref.get("_lc_bind_tools_find_tools")
-        if candidate is not None:
-            find_tools_tool = candidate
-
-    find_tools_name = getattr(find_tools_tool, "name", "") or ""
-    find_tools_already_in_bound = bool(find_tools_name) and any(
-        getattr(t, "name", "") == find_tools_name for t in bound
-    )
-    # If the user disabled find_tools but the overlay injected it anyway, strip it from
-    # ``bound`` so it can't consume a capped slot or sneak into the shortlister's input.
-    if not include_find_tools and find_tools_already_in_bound:
-        bound = [t for t in bound if getattr(t, "name", "") != find_tools_name]
-        find_tools_already_in_bound = False
-
-    # When ``include_find_tools=True`` we must *guarantee* find_tools survives shortlisting
-    # — the LLM ranker is free to drop any tool from the ranking pool, so the only safe move
-    # is to pull find_tools OUT of the pool, reserve a cap slot for it, and append it back
-    # at the end.
-    keep_find_tools = include_find_tools and find_tools_tool is not None
-    # Tools the shortlister actually ranks — strip find_tools out so the ranker can't evict it.
-    ranking_pool: List[StructuredTool] = (
-        [t for t in bound if getattr(t, "name", "") != find_tools_name]
-        if keep_find_tools and find_tools_already_in_bound
-        else bound
-    )
-
-    def _append_find_tools(tools: List[StructuredTool]) -> List[StructuredTool]:
-        if not keep_find_tools or find_tools_tool is None:
-            return tools
-        if find_tools_name in {getattr(t, "name", "") for t in tools}:
-            return tools
-        return [*tools, find_tools_tool]
-
-    cap_disabled = max_count <= 0
-    effective_count = len(ranking_pool) + (1 if keep_find_tools else 0)
-    if cap_disabled or effective_count <= max_count:
-        return _append_find_tools(ranking_pool)
-
-    query_text = (query or "").strip()
-    if not query_text:
-        raise RuntimeError(
-            f"cuga_lite_bind_tools_mode={mode!r} produced {len(bound)} tools but the "
-            f"provider-safe cap (cuga_lite_bind_tools_max_count) is {max_count}. "
-            f"Shortlisting requires a non-empty user query, but none was provided. Options: "
-            f"(a) ensure the first user message is non-empty so the shortlister can run, "
-            f"(b) raise the cap via DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT "
-            f"for permissive backends (WatsonX, Anthropic via LiteLLM), or "
-            f"(c) set the cap to 0 to disable (Groq/OpenAI will reject)."
-        )
-
-    reserve = 1 if keep_find_tools else 0
-    target_k = max_count - reserve
-    if target_k <= 0:
-        # ``max_count=1`` with ``include_find_tools=True``: bind only find_tools.
-        return _append_find_tools([])
-
-    all_apps: List[Any] = []
-    if tool_provider is not None:
-        try:
-            all_apps = await tool_provider.get_apps()
-        except Exception as e:
-            logger.warning("bind_tools cap: tool_provider.get_apps() failed: {}", e)
-
-    logger.info(
-        "bind_tools cap exceeded: mode={} candidates={} cap={} → LLM shortlister to top {} "
-        "(reserve={} for find_tools)",
-        mode,
-        len(ranking_pool),
-        max_count,
-        target_k,
-        reserve,
-    )
-    try:
-        ranked_names = await PromptUtils.shortlist_tool_names(
-            query=query_text,
-            all_tools=ranking_pool,
-            all_apps=all_apps,
-            llm=llm,
-            top_k=target_k,
-        )
-    except Exception as e:
-        raise RuntimeError(
-            f"cuga_lite_bind_tools shortlister failed reducing {len(ranking_pool)} tools to "
-            f"top {target_k} (cap={max_count}): {e!r}. Raise the cap or fix the shortlister LLM."
-        ) from e
-
-    if not ranked_names:
-        raise RuntimeError(
-            f"cuga_lite_bind_tools shortlister returned 0 tools for {len(ranking_pool)} "
-            f"candidates (cap={max_count}, query={query_text!r}). Cannot proceed safely; "
-            f"raise the cap or refine the query."
-        )
-
-    by_name = {getattr(t, "name", ""): t for t in ranking_pool}
-    shortlisted: List[StructuredTool] = []
-    seen_short: Set[str] = set()
-    for n in ranked_names:
-        t = by_name.get(n)
-        if t is not None and n not in seen_short:
-            seen_short.add(n)
-            shortlisted.append(t)
-            # Defense-in-depth: enforce the cap at the call site too, in case the
-            # shortlister returns more names than ``top_k`` (custom shortlister, future
-            # refactor, or a mocked path). Without this clamp the bound list could exceed
-            # ``max_count`` and re-trigger the provider 400 the cap exists to prevent.
-            # (Coderabbit on #203.)
-            if len(shortlisted) >= target_k:
-                break
-
-    # The empty-``ranked_names`` case is caught above. This catches the LLM-hallucinated-names
-    # case: ranked_names is non-empty but none match a tool in ranking_pool. Without this
-    # raise, we'd silently pad (or return just find_tools), recreating the silent degradation
-    # the cap path is meant to prevent. (Coderabbit on #203.)
-    if not shortlisted:
-        raise RuntimeError(
-            f"cuga_lite_bind_tools shortlister returned {len(ranked_names)} names but none "
-            f"matched the {len(ranking_pool)} candidates (cap={max_count}, "
-            f"query={query_text!r}, sample_ranked={ranked_names[:5]}). Shortlister LLM "
-            f"hallucinated tool names — raise the cap, fix the shortlister prompt, or "
-            f"refine the query."
-        )
-
-    # Pad-to-cap is opt-in (off by default) because cuga_lite is a code-execution agent
-    # and binding many tools natively pushes the model toward `tool_calls` mode, which the
-    # code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding
-    # on the m3 hockey benchmark). Users explicitly chasing "true mode=all" can opt in.
-    padded_count = 0
-    if _bind_tools_pad_to_cap_from_settings() and len(shortlisted) < target_k:
-        for t in ranking_pool:
-            name = getattr(t, "name", "") or ""
-            if not name or name in seen_short:
-                continue
-            seen_short.add(name)
-            shortlisted.append(t)
-            padded_count += 1
-            if len(shortlisted) >= target_k:
-                break
-
-    shortlisted = _append_find_tools(shortlisted)
-    logger.info(
-        "bind_tools cap: shortlisted to {} tools (mode={}, cap={}, ranked={}, padded={}, "
-        "include_find_tools={}, top_ranked={})",
-        len(shortlisted),
-        mode,
-        max_count,
-        len(ranked_names),
-        padded_count,
-        find_tools_tool is not None,
-        ranked_names[:5],
-    )
-    return shortlisted
-
-
 async def resolve_model_with_bind_tools(
     active_model: BaseChatModel,
     *,
@@ -527,6 +308,19 @@ async def resolve_model_with_bind_tools(
       ``bind_tools``. Default 128 (matches Groq/OpenAI). Set 0 to disable. When the
       candidate list exceeds the cap, the LLM shortlister picks the top-K most relevant
       tools for ``query`` (typically the first user message).
+    - ``cuga_lite_bind_tools_pad_to_cap``: opt-in padding (default ``False``). When the
+      shortlister returns fewer than the cap allows, pad with remaining candidates to fill
+      the cap. Off by default because padding pushes the model toward native ``tool_calls``
+      mode, which the code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7
+      without padding on the m3 hockey benchmark).
+
+    Operational cost: when the cap is exceeded, applying it incurs **one extra LLM
+    round-trip** (the shortlister) per ``call_model`` invocation. Permissive backends
+    (WatsonX, Anthropic via LiteLLM) can avoid this round-trip entirely by setting
+    ``cuga_lite_bind_tools_max_count=0``. Silent truncation is **not** an option — when
+    shortlisting cannot run safely (no user query, shortlister failure, or hallucinated
+    names that don't match any candidate), a ``RuntimeError`` is raised so research/
+    benchmark runs comparing native tool-calling vs text-mode don't silently degrade.
 
     Profile ``gpt-oss-20b``: see ``model_runtime_profile.GPT_OSS_20B_RUNTIME_DEFAULTS``.
     """
@@ -545,7 +339,21 @@ async def resolve_model_with_bind_tools(
         settings_tool_names_fn=_bind_tools_tool_names_from_settings,
         settings_include_fn=lambda: _bind_include_find_tools_from_config({}),
     )
-    max_count = _bind_tools_max_count_from_settings()
+    max_count = bind_tools_max_count_from_settings()
+
+    async def _cap_merge_bound(bound: List[StructuredTool]) -> List[StructuredTool]:
+        # Closes over query, tool_provider, llm, max_count, include_find_tools,
+        # tools_context_ref, mode — the four mode branches all pass the same kwargs.
+        return await apply_bind_tools_cap_and_merge(
+            bound,
+            query=query,
+            tool_provider=tool_provider,
+            llm=active_model,
+            max_count=max_count,
+            include_find_tools=include_find_tools,
+            tools_context_ref=tools_context_ref,
+            mode=mode,
+        )
 
     if mode in ("", "none", "false", "0", "off"):
         if include_find_tools:
@@ -570,17 +378,7 @@ async def resolve_model_with_bind_tools(
                 logger.warning("cuga_lite_bind_tools_mode=all but tool_provider is missing")
                 return active_model
             by_name = await _indexed_tools_for_native_bind(tool_provider, tools_context_ref)
-            bound = list(by_name.values())
-            bound = await _apply_bind_tools_cap_and_merge(
-                bound,
-                query=query,
-                tool_provider=tool_provider,
-                llm=active_model,
-                max_count=max_count,
-                include_find_tools=include_find_tools,
-                tools_context_ref=tools_context_ref,
-                mode=mode,
-            )
+            bound = await _cap_merge_bound(list(by_name.values()))
             if not bound:
                 return active_model
             return active_model.bind_tools(bound)
@@ -634,16 +432,7 @@ async def resolve_model_with_bind_tools(
                         missing,
                     )
 
-            bound = await _apply_bind_tools_cap_and_merge(
-                bound,
-                query=query,
-                tool_provider=tool_provider,
-                llm=active_model,
-                max_count=max_count,
-                include_find_tools=include_find_tools,
-                tools_context_ref=tools_context_ref,
-                mode=mode,
-            )
+            bound = await _cap_merge_bound(bound)
             if not bound:
                 return active_model
             return active_model.bind_tools(bound)
@@ -674,16 +463,7 @@ async def resolve_model_with_bind_tools(
                             bound.append(t)
                 except Exception as e:
                     logger.warning("bind_tools apps: get_tools(%s) failed: %s", app_name, e)
-            bound = await _apply_bind_tools_cap_and_merge(
-                bound,
-                query=query,
-                tool_provider=tool_provider,
-                llm=active_model,
-                max_count=max_count,
-                include_find_tools=include_find_tools,
-                tools_context_ref=tools_context_ref,
-                mode=mode,
-            )
+            bound = await _cap_merge_bound(bound)
             if not bound:
                 return active_model
             return active_model.bind_tools(bound)
@@ -721,16 +501,7 @@ async def resolve_model_with_bind_tools(
                     "cuga_lite_bind_tools_tool_names not found among provider tools (skipped): %s",
                     missing,
                 )
-            bound = await _apply_bind_tools_cap_and_merge(
-                bound,
-                query=query,
-                tool_provider=tool_provider,
-                llm=active_model,
-                max_count=max_count,
-                include_find_tools=include_find_tools,
-                tools_context_ref=tools_context_ref,
-                mode=mode,
-            )
+            bound = await _cap_merge_bound(bound)
             if not bound:
                 return active_model
             return active_model.bind_tools(bound)
@@ -740,7 +511,7 @@ async def resolve_model_with_bind_tools(
             mode,
         )
     except RuntimeError:
-        # Actionable cap/shortlist errors from _apply_bind_tools_cap_and_merge are intentional —
+        # Actionable cap/shortlist errors from apply_bind_tools_cap_and_merge are intentional —
         # surfacing them is required so research/benchmark runs don't silently degrade.
         raise
     except Exception as e:
diff --git a/src/cuga/settings.toml b/src/cuga/settings.toml
index 15d330a9..a83b54d2 100644
--- a/src/cuga/settings.toml
+++ b/src/cuga/settings.toml
@@ -54,6 +54,20 @@ cuga_lite_bind_tools_mode = "none"
 cuga_lite_bind_tools_apps = []  # mode=apps or apps_and_tools
 cuga_lite_bind_tools_tool_names = []  # mode=tools or apps_and_tools (StructuredTool.name)
 cuga_lite_bind_tools_include_find_tools = false  # Also bind find_tools alongside all/apps/tools/apps_and_tools
+# Provider-safe cap on the number of tools sent to LLM.bind_tools. Default 128 matches the strictest common
+# provider limit (Groq, OpenAI). When the candidate list exceeds the cap, the LLM shortlister picks the top-K
+# most relevant tools for the first user message — costs one extra LLM round-trip per call_model invocation.
+# When shortlisting can't run safely (no user query, shortlister failure, hallucinated names) a RuntimeError
+# is raised rather than silently truncating — silent truncation would corrupt benchmark/research results.
+# Set 0 (or negative) to disable the cap entirely on permissive backends (WatsonX, Anthropic via LiteLLM).
+# Env override: DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT.
+cuga_lite_bind_tools_max_count = 128
+# Whether to pad the shortlister output with additional candidates to fill the cap. Default false because
+# cuga_lite is a code-execution agent and padding pushes the model toward native tool_calls mode, which the
+# code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding on m3 hockey).
+# Set true for research scenarios that explicitly want mode=all to bind as many tools as the provider accepts.
+# Env override: DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_PAD_TO_CAP.
+cuga_lite_bind_tools_pad_to_cap = false
 cuga_lite_nl_auto_continue = false  # When model returns NL with no code, LLM-classify interim vs final; if interim, simulate user "continue" and re-call model
 path_segment_index = 1  # Which path segment to use for operation naming (1 = first, 2 = second, 3 = third)
 force_autonomous_mode = false
diff --git a/tests/unit/test_cuga_lite_bind_tools.py b/tests/unit/test_cuga_lite_bind_tools.py
index 0da7d30c..271dc035 100644
--- a/tests/unit/test_cuga_lite_bind_tools.py
+++ b/tests/unit/test_cuga_lite_bind_tools.py
@@ -125,7 +125,7 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=3,
         ),
         patch(
@@ -156,7 +156,7 @@ async def test_bind_tools_mode_all_raises_when_over_cap_without_query():
     model = MagicMock()
 
     with patch(
-        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
         return_value=3,
     ):
         with pytest.raises(RuntimeError, match="provider-safe cap"):
@@ -183,7 +183,7 @@ async def test_bind_tools_mode_all_no_cap_when_under_threshold():
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=128,
         ),
         patch(
@@ -215,7 +215,7 @@ async def test_bind_tools_mode_all_disabled_cap_binds_everything():
     model = MagicMock()
 
     with patch(
-        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
         return_value=0,
     ):
         await resolve_model_with_bind_tools(
@@ -246,11 +246,11 @@ async def stingy_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instru
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=5,
         ),
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_pad_to_cap_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.bind_tools.cap.bind_tools_pad_to_cap_from_settings",
             return_value=False,
         ),
         patch(
@@ -285,11 +285,11 @@ async def stingy_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instru
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=5,
         ),
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_pad_to_cap_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.bind_tools.cap.bind_tools_pad_to_cap_from_settings",
             return_value=True,
         ),
         patch(
@@ -335,7 +335,7 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=5,
         ),
         patch(
@@ -386,7 +386,7 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=4,
         ),
         patch(
@@ -440,7 +440,7 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=3,
         ),
         patch(
@@ -485,7 +485,7 @@ async def hallucinating_shortlist(query, all_tools, all_apps, llm=None, top_k=4,
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=3,
         ),
         patch(
@@ -521,7 +521,7 @@ async def overlong_shortlist(query, all_tools, all_apps, llm=None, top_k=4, inst
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=4,
         ),
         patch(
@@ -560,7 +560,7 @@ async def overlong_shortlist(query, all_tools, all_apps, llm=None, top_k=4, inst
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=4,
         ),
         patch(
@@ -598,7 +598,7 @@ async def test_bind_tools_cap_binds_only_find_tools_when_max_count_is_one():
     model = MagicMock()
 
     with patch(
-        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+        "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
         return_value=1,
     ):
         await resolve_model_with_bind_tools(
@@ -635,7 +635,7 @@ async def fake_shortlist(query, all_tools, all_apps, llm=None, top_k=4, instruct
 
     with (
         patch(
-            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph._bind_tools_max_count_from_settings",
+            "cuga.backend.cuga_graph.nodes.cuga_lite.cuga_lite_graph.bind_tools_max_count_from_settings",
             return_value=4,
         ),
         patch(

From 31b6b84844a47e916372e04a2be6843c840b790a Mon Sep 17 00:00:00 2001
From: Harold Ship <harold@il.ibm.com>
Date: Sun, 17 May 2026 10:32:43 +0300
Subject: [PATCH 9/9] fix(cuga_lite): address sami's review comments on #203

- run_tests.sh: include tests/unit/test_cuga_lite_bind_tools.py in unit run
- settings.toml: trim verbose comments around bind_tools cap/pad flags
---
 src/cuga/settings.toml   | 14 ++------------
 src/scripts/run_tests.sh |  2 ++
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/cuga/settings.toml b/src/cuga/settings.toml
index a83b54d2..a9676f9a 100644
--- a/src/cuga/settings.toml
+++ b/src/cuga/settings.toml
@@ -54,19 +54,9 @@ cuga_lite_bind_tools_mode = "none"
 cuga_lite_bind_tools_apps = []  # mode=apps or apps_and_tools
 cuga_lite_bind_tools_tool_names = []  # mode=tools or apps_and_tools (StructuredTool.name)
 cuga_lite_bind_tools_include_find_tools = false  # Also bind find_tools alongside all/apps/tools/apps_and_tools
-# Provider-safe cap on the number of tools sent to LLM.bind_tools. Default 128 matches the strictest common
-# provider limit (Groq, OpenAI). When the candidate list exceeds the cap, the LLM shortlister picks the top-K
-# most relevant tools for the first user message — costs one extra LLM round-trip per call_model invocation.
-# When shortlisting can't run safely (no user query, shortlister failure, hallucinated names) a RuntimeError
-# is raised rather than silently truncating — silent truncation would corrupt benchmark/research results.
-# Set 0 (or negative) to disable the cap entirely on permissive backends (WatsonX, Anthropic via LiteLLM).
-# Env override: DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_MAX_COUNT.
+# Cap on tools sent to LLM.bind_tools (0 disables). Over-cap triggers an LLM shortlister; failure raises.
 cuga_lite_bind_tools_max_count = 128
-# Whether to pad the shortlister output with additional candidates to fill the cap. Default false because
-# cuga_lite is a code-execution agent and padding pushes the model toward native tool_calls mode, which the
-# code-mode flow doesn't fully exercise (measured: 0 tool calls vs 5-7 without padding on m3 hockey).
-# Set true for research scenarios that explicitly want mode=all to bind as many tools as the provider accepts.
-# Env override: DYNACONF_ADVANCED_FEATURES__CUGA_LITE_BIND_TOOLS_PAD_TO_CAP.
+# Pad shortlister output to fill the cap. Off by default for code-execution mode.
 cuga_lite_bind_tools_pad_to_cap = false
 cuga_lite_nl_auto_continue = false  # When model returns NL with no code, LLM-classify interim vs final; if interim, simulate user "continue" and re-call model
 path_segment_index = 1  # Which path segment to use for operation naming (1 = first, 2 = second, 3 = third)
diff --git a/src/scripts/run_tests.sh b/src/scripts/run_tests.sh
index 78997716..32ff9f9a 100755
--- a/src/scripts/run_tests.sh
+++ b/src/scripts/run_tests.sh
@@ -80,6 +80,8 @@ run_pytest \
     tests/unit/test_chat_knowledge_mode.py \
     tests/unit/test_chat_agent_knowledge_toggle.py \
     tests/integration/test_knowledge_integration.py
+echo "Running cuga_lite bind_tools tests..."
+run_pytest tests/unit/test_cuga_lite_bind_tools.py
 echo "✅ All unit tests passed!"
 
 # Check for test type flag