From 00dd6e5a1feb952af88a474abffd07f5015ceee6 Mon Sep 17 00:00:00 2001
From: zhaizhiqiang <584508161@qq.com>
Date: Fri, 26 Jun 2026 03:32:33 +0000
Subject: [PATCH 1/3] support mini-swe-agent and claude-code blackbox agent
training recipes
---
.../claude_code/Dockerfile.claude-code-tool | 21 +
.../claude_code/claude_code_runner.py | 232 +++++++++
.../claude_code/config/claude_code.yaml | 1 +
.../Dockerfile.mini-swe-agent-tool | 45 ++
.../blackbox_recipes/mini_swe_agent/README.md | 269 +++++++++++
.../mini_swe_agent/__init__.py | 0
.../mini_swe_agent/config/agent_config.yaml | 36 ++
.../config/agent_config_openyuanrong.yaml | 37 ++
.../mini_swe_agent/config/parallel_infer.yaml | 31 ++
.../config/swe_agent_blackbox.yaml | 123 +++++
.../swe_agent_blackbox_megatron_async.yaml | 162 +++++++
.../swe_agent_blackbox_megatron_sync.yaml | 129 +++++
.../mini_swe_agent/dataset.py | 34 ++
.../mini_swe_agent/framework.py | 105 ++++
.../mini_swe_agent/mini_swe_agent_runner.py | 227 +++++++++
.../mini_swe_agent/parallel_infer.py | 447 ++++++++++++++++++
.../blackbox_recipes/mini_swe_agent/reward.py | 74 +++
.../mini_swe_agent/run_agent.py | 106 +++++
.../mini_swe_agent/subprocess_runner.py | 61 +++
examples/blackbox_recipes/sandbox/sandbox.py | 10 +
.../blackbox_recipes/scripts/build_tool.sh | 75 +++
.../blackbox_recipes/scripts/run_infer.sh | 66 +++
.../blackbox_recipes/scripts/run_train.sh | 122 +++++
.../scripts/run_train_megatron_async.sh | 199 ++++++++
.../scripts/run_train_megatron_sync.sh | 138 ++++++
25 files changed, 2750 insertions(+)
create mode 100644 examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
create mode 100644 examples/blackbox_recipes/claude_code/claude_code_runner.py
create mode 100644 examples/blackbox_recipes/claude_code/config/claude_code.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool
create mode 100644 examples/blackbox_recipes/mini_swe_agent/README.md
create mode 100644 examples/blackbox_recipes/mini_swe_agent/__init__.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
create mode 100644 examples/blackbox_recipes/mini_swe_agent/dataset.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/framework.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/reward.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/run_agent.py
create mode 100644 examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
create mode 100644 examples/blackbox_recipes/sandbox/sandbox.py
create mode 100755 examples/blackbox_recipes/scripts/build_tool.sh
create mode 100755 examples/blackbox_recipes/scripts/run_infer.sh
create mode 100755 examples/blackbox_recipes/scripts/run_train.sh
create mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_async.sh
create mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
diff --git a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
new file mode 100644
index 00000000..3d12af4c
--- /dev/null
+++ b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
@@ -0,0 +1,21 @@
+# Claude Code sidecar tool image.
+#
+# Mounted at /opt/claude-code inside the SWE-bench sandbox.
+
+FROM node:20-bookworm-slim AS builder
+
+ARG TOOL_VERSION="latest"
+ARG NPM_REGISTRY=""
+
+ENV DISABLE_AUTOUPDATER=1 \
+ IS_SANDBOX=1 \
+ npm_config_audit=false \
+ npm_config_fund=false \
+ npm_config_update_notifier=false
+
+RUN if [ -n "${NPM_REGISTRY}" ]; then npm config set registry "${NPM_REGISTRY}"; fi \
+ && npm install -g --prefix /opt/claude-code "@anthropic-ai/claude-code@${TOOL_VERSION}" \
+ && /opt/claude-code/bin/claude --version
+
+FROM scratch
+COPY --from=builder /opt/claude-code /
diff --git a/examples/blackbox_recipes/claude_code/claude_code_runner.py b/examples/blackbox_recipes/claude_code/claude_code_runner.py
new file mode 100644
index 00000000..bee41aaf
--- /dev/null
+++ b/examples/blackbox_recipes/claude_code/claude_code_runner.py
@@ -0,0 +1,232 @@
+"""Claude Code runner for the blackbox SWE-agent recipe."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import shlex
+import time
+
+from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_TOOL_IMAGE = "claude-code-tool:latest"
+TOOL_TARGET = "/opt/claude-code"
+
+
+def extract_task(raw_prompt) -> str:
+    """Return the task text carried by ``raw_prompt``.
+
+    A string prompt is returned unchanged. Otherwise ``raw_prompt`` is iterated
+    and the ``content`` of the first dict whose ``role`` is ``"user"`` is
+    returned; when no such entry exists, ``str(raw_prompt)`` is the fallback.
+    """
+    if isinstance(raw_prompt, str):
+        return raw_prompt
+    for message in raw_prompt:
+        if isinstance(message, dict) and message.get("role") == "user":
+            return message["content"]
+    return str(raw_prompt)
+
+
+def _extract_issue_text(task: str) -> str:
+    """Return only the issue description embedded in a rendered task prompt.
+
+    Lookup order:
+      1. the text between a known ``<tag>`` / ``</tag>`` pair,
+      2. everything before the "Follow these steps ..." instruction marker,
+      3. the whole task, stripped.
+    """
+    # NOTE(review): the tag literals in the original hunk were empty strings
+    # (most likely markup stripped in transit), so ``find("")`` returned 0 for
+    # both and the tag branch could never run. The tag names below are assumed
+    # from common SWE prompt templates -- confirm against the dataset template.
+    for tag in ("issue_description", "github_issue", "pr_description", "issue"):
+        opening, closing = f"<{tag}>", f"</{tag}>"
+        start = task.find(opening)
+        if start < 0:
+            continue
+        body_start = start + len(opening)
+        end = task.find(closing, body_start)
+        if end >= 0:
+            return task[body_start:end].strip()
+    marker = "\nFollow these steps to resolve the issue:"
+    if marker in task:
+        return task.split(marker, 1)[0].strip()
+    return task.strip()
+
+
+def _decode_metadata_list(value) -> list[str]:
+    """Normalize a metadata field into a list of strings.
+
+    Falsy input yields ``[]``. A list is stringified element-wise. A string is
+    parsed as JSON: a JSON list is stringified element-wise, invalid JSON yields
+    ``[value]``. Anything else (including non-list JSON) yields ``[str(value)]``.
+    """
+    if not value:
+        return []
+    items = value
+    if isinstance(value, str):
+        try:
+            items = json.loads(value)
+        except json.JSONDecodeError:
+            # Not JSON at all: treat the raw string as a single entry.
+            return [value]
+    if isinstance(items, list):
+        return [str(entry) for entry in items]
+    return [str(value)]
+
+
+def build_claude_task(raw_prompt, tools_kwargs: dict | None = None) -> str:
+    """Render the instruction prompt handed to Claude Code for one sample.
+
+    The issue text is ``tools_kwargs["reward"]["metadata"]["problem_statement"]``
+    when present; otherwise it is extracted from ``raw_prompt``. The tests to
+    run are the ``FAIL_TO_PASS`` entries, falling back to the first three
+    ``PASS_TO_PASS`` entries, and finally to a generic instruction line.
+    """
+    tools_kwargs = tools_kwargs or {}
+    task = extract_task(raw_prompt)
+    metadata = (tools_kwargs.get("reward") or {}).get("metadata") or {}
+    issue = metadata.get("problem_statement") or _extract_issue_text(task)
+    tests = _decode_metadata_list(metadata.get("FAIL_TO_PASS"))
+    if not tests:
+        tests = _decode_metadata_list(metadata.get("PASS_TO_PASS"))[:3]
+    tests_block = "\n".join(f"- {test}" for test in tests) if tests else "- Run the closest relevant tests you identify."
+
+    # The rules must stay task-agnostic: this prompt is rendered for every
+    # sample, so a rule naming one repository's function (the former
+    # `is_separable` line) has been dropped.
+    return (
+        "You are fixing a SWE-bench task in /testbed.\n\n"
+        "Issue:\n"
+        f"{issue}\n\n"
+        "Rules:\n"
+        "- Edit source files only. Do not modify tests.\n"
+        "- The development environment is already installed; do not install packages unless a test command proves it is necessary.\n"
+        "- There is no submit tool in this environment. Do not try to submit.\n"
+        "- Do not create extra edge-case test files after the relevant tests pass.\n"
+        "- Do not run `pytest --collect-only`, `git log`, or any other command that does not directly validate the fix.\n"
+        "- Do not run additional ad-hoc verification after the listed relevant pytest command passes.\n"
+        "- Do not commit.\n"
+        "- After the minimal fix is applied and a relevant pytest command passes, print a one-line summary and exit immediately.\n\n"
+        "Relevant tests to run after the fix:\n"
+        f"{tests_block}\n"
+    )
+
+
+def build_claude_command(
+    *,
+    task: str,
+    base_url: str,
+    max_turns: int,
+    model: str = "default",
+    permission_mode: str = "bypassPermissions",
+    conda_env: str | None = "testbed",
+    disable_web_tools: bool = True,
+    disable_slash_commands: bool = True,
+) -> str:
+    """Build the shell command line that launches the Claude Code CLI.
+
+    The returned string unsets the proxy variables, changes into ``/testbed``,
+    and runs ``/opt/claude-code/bin/claude -p <task>`` with the environment
+    passed as ``NAME=value`` prefixes. Values and CLI arguments are
+    shell-quoted with ``shlex``.
+
+    Args:
+        task: Prompt passed to ``-p``.
+        base_url: Value for ``ANTHROPIC_BASE_URL``.
+        max_turns: Value for ``--max-turns``.
+        model: Model name used for ``--model`` and every model-related variable.
+        permission_mode: Value for ``--permission-mode``.
+        conda_env: When truthy, adds ``CONDA_DEFAULT_ENV``/``CONDA_PREFIX`` and
+            prepends that env's ``bin`` directory to ``PATH``.
+        disable_web_tools: When true, appends ``--disallowedTools`` with
+            ``Agent``, ``Task``, ``WebFetch`` and ``WebSearch``.
+        disable_slash_commands: When true, appends ``--disable-slash-commands``.
+    """
+    base_env = {
+        "ANTHROPIC_BASE_URL": base_url,
+        "ANTHROPIC_API_KEY": "not-needed",
+        "ANTHROPIC_MODEL": model,
+        "ANTHROPIC_DEFAULT_HAIKU_MODEL": model,
+        "ANTHROPIC_DEFAULT_SONNET_MODEL": model,
+        "ANTHROPIC_DEFAULT_OPUS_MODEL": model,
+        "ANTHROPIC_SMALL_FAST_MODEL": model,
+        "CLAUDE_CODE_DISABLE_BACKGROUND_TASKS": "1",
+        "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
+        "CLAUDE_CODE_FORK_SUBAGENT": "0",
+        "CLAUDE_CODE_SUBAGENT_MODEL": model,
+        "DISABLE_AUTOUPDATER": "1",
+        "IS_SANDBOX": "1",
+    }
+    assignments = [f"{name}={shlex.quote(val)}" for name, val in base_env.items()]
+    if conda_env:
+        prefix_dir = f"/opt/miniconda3/envs/{conda_env}"
+        assignments += [
+            f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)}",
+            f"CONDA_PREFIX={shlex.quote(prefix_dir)}",
+            # $PATH is left unquoted on purpose so the shell expands it.
+            f"PATH={shlex.quote(prefix_dir + '/bin')}:/opt/miniconda3/bin:$PATH",
+        ]
+
+    cli_args = [
+        "/opt/claude-code/bin/claude",
+        "-p",
+        task,
+        "--model",
+        model,
+        "--max-turns",
+        str(max_turns),
+        "--permission-mode",
+        permission_mode,
+    ]
+    if disable_slash_commands:
+        cli_args += ["--disable-slash-commands"]
+    if disable_web_tools:
+        cli_args += ["--disallowedTools", "Agent", "Task", "WebFetch", "WebSearch"]
+
+    shell_parts = [
+        "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy;",
+        "cd /testbed;",
+        " ".join(assignments),
+        shlex.join(cli_args),
+    ]
+    return " ".join(shell_parts)
+
+
+async def _create_claude_sandbox(
+    *,
+    image: str,
+    sidecar_image: str,
+    gateway_url: str,
+):
+    """Create a ``YRSandbox`` for ``image`` with the tool image mounted at ``TOOL_TARGET``.
+
+    The sandbox ``upstream`` is derived from ``gateway_url`` via
+    ``extract_upstream``; an empty ``gateway_url`` yields an empty upstream.
+    """
+    # Imported lazily, as in the rest of this module.
+    from examples.swe_agent_blackbox.sandbox import YRSandbox, extract_upstream
+
+    if gateway_url:
+        upstream = extract_upstream(gateway_url)
+    else:
+        upstream = ""
+    sandbox = await YRSandbox.create(
+        image=image,
+        sidecar_image=sidecar_image,
+        sidecar_target=TOOL_TARGET,
+        upstream=upstream,
+    )
+    return sandbox
+
+
+async def claude_code_runner(
+    *,
+    raw_prompt,
+    session: SessionHandle,
+    sample_index: int,
+    session_runtime: SessionRuntime,
+    tools_kwargs: dict | None = None,
+    tool_image: str = DEFAULT_TOOL_IMAGE,
+    run_timeout: int = 7200,
+    **kwargs,
+) -> None:
+    """Run Claude Code on one sample inside a sandbox and report its reward.
+
+    Steps: build the task prompt, create the sandbox with ``tool_image`` as the
+    sidecar image, run the optional ``post_setup_cmd``, run the Claude Code CLI,
+    evaluate the result in the same sandbox, then call
+    ``session_runtime.complete_session`` with the reward info. The sandbox is
+    cleaned up in all cases once it has been created.
+
+    Args:
+        raw_prompt: Prompt (string or message list) for the sample.
+        session: Session handle; ``base_url`` is the gateway URL and
+            ``session_id`` identifies the session to complete.
+        sample_index: Index used in log and error messages.
+        session_runtime: Runtime used to complete the session.
+        tools_kwargs: Per-sample config; ``env`` and ``reward`` are read here.
+        tool_image: Sidecar tool image to mount.
+        run_timeout: Timeout passed to the agent ``sandbox.run`` call.
+        **kwargs: Accepted and ignored.
+
+    Raises:
+        ValueError: If no image is found in ``tools_kwargs["env"]`` or
+            ``session.base_url`` is empty.
+    """
+    from examples.swe_agent_blackbox.dataset import extract_image
+    from examples.swe_agent_blackbox.mini_swe_agent_runner import SandboxEnvForReward
+    from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env
+    from examples.swe_agent_blackbox.sandbox import rewrite_gateway_url
+
+    tools_kwargs = tools_kwargs or {}
+    task = build_claude_task(raw_prompt, tools_kwargs)
+    env_config = tools_kwargs.get("env", {})
+    image = extract_image(env_config)
+    if not image:
+        raise ValueError(f"No Docker image found in tools_kwargs.env for sample {sample_index}")
+
+    gateway_url = session.base_url
+    if not gateway_url:
+        raise ValueError(f"gateway_url is empty for sample {sample_index}")
+
+    sandbox = await _create_claude_sandbox(
+        image=image,
+        sidecar_image=tool_image,
+        gateway_url=gateway_url,
+    )
+
+    try:
+        post_setup_cmd = env_config.get("post_setup_cmd", "")
+        if post_setup_cmd:
+            setup_result = await sandbox.run(post_setup_cmd, timeout=120)
+            if setup_result.exit_code != 0:
+                # Guard against None streams, matching the agent-failure log
+                # below; a failed setup is only warned about, never fatal.
+                setup_output = (setup_result.stdout or "") + (setup_result.stderr or "")
+                logger.warning("post_setup_cmd failed rc=%s: %.300s", setup_result.exit_code, setup_output)
+
+        claude_base_url = rewrite_gateway_url(gateway_url, strip_v1=True)
+        max_turns = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100"))
+        agent_cmd = build_claude_command(
+            task=task,
+            base_url=claude_base_url,
+            max_turns=max_turns,
+        )
+
+        started_at = time.perf_counter()
+        result = await sandbox.run(agent_cmd, timeout=int(run_timeout))
+        elapsed = time.perf_counter() - started_at
+        logger.info("[sample %d] claude-code finished rc=%s elapsed=%.1fs", sample_index, result.exit_code, elapsed)
+        if result.exit_code != 0:
+            logger.warning(
+                "[sample %d] claude-code failed stdout_tail=%r stderr_tail=%r",
+                sample_index,
+                (result.stdout or "")[-4000:],
+                (result.stderr or "")[-4000:],
+            )
+
+        # The reward is evaluated even when the agent exits non-zero.
+        metadata, eval_timeout = build_reward_context(tools_kwargs)
+        score, eval_result = await evaluate_in_env(SandboxEnvForReward(sandbox), metadata, eval_timeout)
+        logger.info("[sample %d] reward done score=%s resolved=%s", sample_index, score, eval_result.get("resolved"))
+
+        reward_info = {
+            "reward_score": score,
+            "claude_code_exit_code": result.exit_code,
+            **eval_result,
+        }
+        await session_runtime.complete_session(session.session_id, reward_info=reward_info)
+    finally:
+        await sandbox.cleanup()
diff --git a/examples/blackbox_recipes/claude_code/config/claude_code.yaml b/examples/blackbox_recipes/claude_code/config/claude_code.yaml
new file mode 100644
index 00000000..503fa1da
--- /dev/null
+++ b/examples/blackbox_recipes/claude_code/config/claude_code.yaml
@@ -0,0 +1 @@
+#TODO
\ No newline at end of file
diff --git a/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool b/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool
new file mode 100644
index 00000000..a2fba565
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/Dockerfile.mini-swe-agent-tool
@@ -0,0 +1,45 @@
+# Mini-swe-agent sidecar tool image.
+#
+# Contains a self-contained Python venv at /opt/mini-swe-agent with
+# mini-swe-agent + litellm installed. When mounted into a sandbox at
+# /opt/mini-swe-agent, the agent can be invoked via:
+#
+# /opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py ...
+#
+# Uses python-build-standalone for maximum portability across different
+# glibc versions (built against older glibc, forward-compatible).
+#
+# Build:
+# docker build -f Dockerfile.mini-swe-agent-tool -t mini-swe-agent-tool:latest .
+#
+
+FROM debian:bullseye-slim AS builder
+
+ARG PBS_RELEASE="20260602"
+ARG PBS_PYTHON="3.12.13"
+ARG PIP_INDEX_URL=""
+
+# Download and extract python-build-standalone (stripped, 32MB)
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates wget \
+ && rm -rf /var/lib/apt/lists/* \
+ && wget -q \
+ "https://github.com/astral-sh/python-build-standalone/releases/download/${PBS_RELEASE}/cpython-${PBS_PYTHON}%2B${PBS_RELEASE}-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz" \
+ -O /tmp/python.tar.gz \
+ && mkdir -p /opt/mini-swe-agent \
+ && tar -xzf /tmp/python.tar.gz -C /opt/mini-swe-agent --strip-components=1 \
+ && rm /tmp/python.tar.gz
+
+# Install mini-swe-agent + litellm
+RUN /opt/mini-swe-agent/bin/pip install --no-cache-dir \
+ ${PIP_INDEX_URL:+-i ${PIP_INDEX_URL}} \
+ "mini-swe-agent==2.2.8" \
+ "litellm==1.81.7"
+
+# Copy the in-sandbox runner script
+COPY run_agent.py /opt/mini-swe-agent/bin/run_agent.py
+
+# Final scratch image: files are at the image root level so that when
+# akernel_sdk.Mount(target="/opt/mini-swe-agent") overlays this image,
+# the files appear at /opt/mini-swe-agent/bin/python etc.
+FROM scratch
+COPY --from=builder /opt/mini-swe-agent /
diff --git a/examples/blackbox_recipes/mini_swe_agent/README.md b/examples/blackbox_recipes/mini_swe_agent/README.md
new file mode 100644
index 00000000..b32a637a
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/README.md
@@ -0,0 +1,269 @@
+# Mini-SWE-Agent In-Sandbox Execution
+
+## Overview
+
+`mini_swe` and `claude_code` both run inside the SWE-bench sandbox through a
+sidecar tool image. The external runner creates the sandbox, mounts the selected
+tool image, starts the agent process, and evaluates the reward in the same
+sandbox.
+
+For `mini_swe`, the agent executes commands through `LocalEnvironment` (local
+bash) inside the sandbox and calls the LLM through the gateway URL passed in via
+stdin. For `claude_code`, the runner starts the Claude Code CLI from the sidecar
+image and points it at the same Anthropic-compatible gateway.
+
+The `mini_swe` tool image uses
+[python-build-standalone](https://github.com/astral-sh/python-build-standalone)
+to build an isolated Python environment. The Claude Code tool image uses a Node
+builder to install the Claude Code npm package. Both images use a minimal
+`FROM scratch` final stage, so the sandbox base image does not need to provide
+Python, Node, or npm for the sidecar tool runtime.
+
+**Supported runners:**
+
+| runner | Description |
+|--------|-------------|
+| `uniagent` | Original SWE-agent runner |
+| `mini_swe` | mini-swe-agent sidecar runner |
+| `claude_code` | Claude Code sidecar runner; reward is returned through `complete_session(reward_info)` without writing a separate reward JSON file |
+
+**Supported sandbox types:**
+
+| Type | Description |
+|------|-------------|
+| OpenYuanRong (`"openyuanrong"`) | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` |
+
+At runtime, the selected runner depends directly on its tool image. The tool
+image does not need to be extracted into a host directory ahead of time.
+
+## Architecture
+
+```text
+[Rollouter Host: mini_swe_agent_runner / claude_code_runner]
+ |
+ |-- _create_sandbox(image, sidecar_image)
+ | `-- openyuanrong: Sandbox(mounts=[Mount(target="/opt/<tool>", ...)])
+ |
+ |-- sandbox.run("<agent command>")
+ | `-- [Inside Sandbox]
+ | /opt/mini-swe-agent/bin/python3.12 or /opt/claude-code/bin/claude
+ | stdin <- task config JSON (task, gateway_url, agent)
+ | commands run inside the SWE-bench sandbox
+ | stdout -> runner-specific execution result
+ |
+ |-- parse agent result
+ |-- SandboxEnvForReward(sandbox) -> evaluate_in_env()
+ `-- session_runtime.complete_session(reward_info)
+```
+
+## Prerequisites
+
+1. **OpenYuanRong** - set `OPENYUANRONG_SERVER_ADDRESS` and `OPENYUANRONG_TOKEN`.
+2. **Runner tool image** - build the selected tool image and push it to a remote
+ registry if the sandbox service cannot access local Docker images.
+
+## 1. Build Tool Image
+
+`mini_swe` and `claude_code` are both injected into the SWE-bench sandbox as
+sidecar tool images, but they differ in image contents, mount paths, and
+accelerator/mirror options. Use `build_tool.sh` for both runners, and select the
+target runner with `--tool` or `TOOL_KIND`.
+
+| runner | Default tool image | Dockerfile | Sandbox mount path | Image contents | Mirror option |
+|--------|--------------------|------------|--------------------|----------------|---------------|
+| `mini_swe` | `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` | `--pip-index` / `PIP_INDEX_URL` |
+| `claude_code` | `claude-code-tool:latest` | `Dockerfile.claude-code-tool` | `/opt/claude-code` | Claude Code npm package installed by a Node 20 builder | `--npm-registry` / `NPM_REGISTRY` |
+
+### mini_swe Tool Image
+
+`mini_swe` is the default build target:
+
+```bash
+# Use the default PyPI source.
+bash examples/swe_agent_blackbox/build_tool.sh
+
+# Use a custom PyPI mirror.
+bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
+
+# Build and push to a remote registry.
+bash examples/swe_agent_blackbox/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
+```
+
+The `mini_swe` image uses `python-build-standalone` to build an isolated Python
+runtime. The final `FROM scratch` image contains only the files needed under
+`/opt/mini-swe-agent`, and it does not depend on the Python version installed in
+the sandbox base image.
+
+After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`:
+
+```bash
+SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
+RUNNER=mini_swe \
+bash examples/swe_agent_blackbox/scripts/run_infer.sh
+```
+
+### Claude Code Tool Image
+
+Claude Code must be selected explicitly with `--tool claude_code`:
+
+```bash
+# Use the default npm registry.
+bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code
+
+# Use a custom npm registry.
+bash examples/swe_agent_blackbox/build_tool.sh \
+ --tool claude_code \
+ --npm-registry https://registry.npmmirror.com
+
+# Select the Claude Code npm package version.
+bash examples/swe_agent_blackbox/build_tool.sh \
+ --tool claude_code \
+ --tool-version latest
+
+# Build and push the Claude Code sidecar image.
+bash examples/swe_agent_blackbox/build_tool.sh \
+ --tool claude_code \
+ --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
+```
+
+The Claude Code image uses `node:20-bookworm-slim` as the builder stage and
+installs `@anthropic-ai/claude-code` into `/opt/claude-code`. The final image is
+also a `FROM scratch` sidecar image. At runtime, the runner mounts it into the
+sandbox at `/opt/claude-code` and invokes `/opt/claude-code/bin/claude`.
+
+After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`:
+
+```bash
+SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \
+RUNNER=claude_code \
+bash examples/swe_agent_blackbox/scripts/run_infer.sh
+```
+
+### Combined Build Options
+
+`--tool`, image tags, mirrors, and registries can be combined:
+
+```bash
+bash examples/swe_agent_blackbox/build_tool.sh \
+ --tool mini_swe \
+ --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ \
+ --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
+```
+
+The build script:
+
+1. Selects the Dockerfile and default image name from `--tool`:
+ - `mini_swe` -> `mini-swe-agent-tool:latest`
+ - `claude_code` -> `claude-code-tool:latest`
+2. Tags and pushes the image when `--registry` is provided.
+
+Both tool images are sidecar runtime dependencies, not SWE-bench task base
+images. The `mini_swe` Python runtime is fully isolated from the sandbox
+container's Python. The `claude_code` Node/npm dependencies live only under
+`/opt/claude-code`, so the sandbox base image does not need Node installed.
+
+### Build Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TOOL_IMAGE` | `mini-swe-agent-tool` / `claude-code-tool` | Image name; the default changes with `TOOL_KIND` |
+| `TOOL_TAG` | `latest` | Image tag |
+| `TOOL_VERSION` | `latest` | Tool package version; for `claude_code`, this selects the `@anthropic-ai/claude-code` npm package version |
+| `PIP_INDEX_URL` | unset, use PyPI | pip index URL; equivalent to `--pip-index` |
+| `TOOL_KIND` | `mini_swe` | Tool kind: `mini_swe` or `claude_code` |
+| `NPM_REGISTRY` | unset, use npm default | npm registry URL; equivalent to `--npm-registry` |
+
+## 2. Inference With OpenYuanRong Sandbox
+
+### Using run_infer.sh
+
+```bash
+cd "$(git rev-parse --show-toplevel)"
+
+RUNNER=mini_swe \
+SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
+MODEL_PATH=$HOME/models/Qwen3.5-9B \
+DATA_PATH=$HOME/data/swe_agent/r2e_gym.parquet \
+MAX_SAMPLES=1 \
+TP=1 \
+bash examples/swe_agent_blackbox/scripts/run_infer.sh
+```
+
+### Calling Python Directly
+
+```bash
+python examples/swe_agent_blackbox/parallel_infer.py \
+ --model-path ~/models/Qwen3.5-9B \
+ --data-path ~/data/swe_agent/r2e_gym.parquet \
+ --max-samples 1 \
+ --runner mini_swe \
+ --max-turns 100 \
+ --tensor-parallel-size 1
+```
+
+## 3. Inference
+
+### Environment Variables
+
+```bash
+export OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888"
+export OPENYUANRONG_TOKEN=""
+export DEPLOYMENT=openyuanrong
+```
+
+### Run mini_swe
+
+```bash
+RUNNER=mini_swe \
+OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
+OPENYUANRONG_TOKEN="" \
+DEPLOYMENT=openyuanrong \
+SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
+bash examples/swe_agent_blackbox/scripts/run_infer.sh
+```
+
+### Run Claude Code
+
+```bash
+RUNNER=claude_code \
+OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
+OPENYUANRONG_TOKEN="" \
+DEPLOYMENT=openyuanrong \
+SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \
+SWE_AGENT_MAX_TURNS=50 \
+SWE_AGENT_RUN_TIMEOUT=7200 \
+bash examples/swe_agent_blackbox/scripts/run_infer.sh
+```
+
+## 4. Training (Fully Async)
+
+```bash
+OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
+OPENYUANRONG_TOKEN="" \
+MODEL_PATH=~/models/Qwen3.5-9B \
+bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh
+```
+
+The training YAML keeps `mini_swe` as the default runner:
+
+```yaml
+agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
+```
+
+To run training with Claude Code, keep the YAML unchanged and override the runner
+FQN from the launch command:
+
+```bash
+python3 -m verl.experimental.fully_async_policy.fully_async_main \
+ --config-path examples/swe_agent_blackbox/config \
+ --config-name swe_agent_blackbox_megatron_async \
+ actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=examples.swe_agent_blackbox.claude_code_runner.claude_code_runner
+```
+
+## 5. Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SWE_AGENT_MAX_TURNS` | `100` | Max agent steps |
+| `SWE_AGENT_TOOL_IMAGE` | `swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest` | Sidecar tool image |
+| `DEBUG_MODE` | (unset) | Set to 1 to enable debug logging |
diff --git a/examples/blackbox_recipes/mini_swe_agent/__init__.py b/examples/blackbox_recipes/mini_swe_agent/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
new file mode 100644
index 00000000..b7352b72
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
@@ -0,0 +1,36 @@
+- name: swe_agent
+
+ _target_: uni_agent.agent_loop.UniAgentLoop
+ concurrency: 64
+ log_dir: /tmp/swebench_qwen3_coder
+ mask_abnormal_exit_traj: false
+
+ interaction:
+ action_timeout: 300
+ max_turns: 100
+
+ env:
+ deployment:
+ type: local
+ command: /usr/bin/python3 -m swerex.server --auth-token {token}
+ timeout: 600
+ startup_timeout: 600
+ container_runtime: docker
+ env_variables:
+ PIP_PROGRESS_BAR: "off"
+ PIP_CACHE_DIR: "~/.cache/pip"
+ PAGER: "cat"
+ MANPAGER: "cat"
+ LESS: "-R"
+ TQDM_DISABLE: "1"
+ GIT_PAGER: "cat"
+
+ tool_parser: qwen3_coder
+
+ tools:
+ - name: str_replace_editor
+ - name: execute_bash
+ - name: submit
+
+ reward:
+ eval_timeout: 600
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
new file mode 100644
index 00000000..b298c676
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
@@ -0,0 +1,37 @@
+- name: swe_agent
+
+ _target_: uni_agent.agent_loop.UniAgentLoop
+ concurrency: 64
+ log_dir: /tmp/swebench_qwen3_coder
+ mask_abnormal_exit_traj: false
+
+ interaction:
+ action_timeout: 300
+ max_turns: 100
+
+ env:
+ deployment:
+ type: openyuanrong
+ command: /opt/swe-rex/bin/python /opt/swe-rex/bin/swerex-remote --host 0.0.0.0 --port {port} --auth-token {token}
+ timeout: 600
+ startup_timeout: 600
+ swerex_runtime_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/swerex-runtime:1.4.0
+ swerex_runtime_target: /opt/swe-rex
+ env_variables:
+ PIP_PROGRESS_BAR: "off"
+ PIP_CACHE_DIR: "~/.cache/pip"
+ PAGER: "cat"
+ MANPAGER: "cat"
+ LESS: "-R"
+ TQDM_DISABLE: "1"
+ GIT_PAGER: "cat"
+
+ tool_parser: qwen3_coder
+
+ tools:
+ - name: str_replace_editor
+ - name: execute_bash
+ - name: submit
+
+ reward:
+ eval_timeout: 600
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
new file mode 100644
index 00000000..0829fdcd
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
@@ -0,0 +1,31 @@
+# Parallel inference config for the blackbox SWE-agent recipe.
+# Composes verl's base configs with inference-specific overrides.
+
+defaults:
+ - model_engine: dp
+ - actor@actor_rollout_ref.actor: ${model_engine}_actor
+ - rollout@actor_rollout_ref.rollout: rollout
+ - model@actor_rollout_ref.model: hf_model
+ - reward: reward
+ - _self_
+
+hydra:
+ searchpath:
+ - pkg://verl.trainer.config
+
+actor_rollout_ref:
+ hybrid_engine: true
+ nccl_timeout: 600
+ model: {}
+ rollout:
+ agent: {}
+
+trainer:
+ nnodes: 1
+ n_gpus_per_node: 8
+ logger:
+ - console
+ device: cuda
+ total_epochs: 1
+ total_training_steps: null
+ balance_batch: false
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
new file mode 100644
index 00000000..62b73da1
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
@@ -0,0 +1,123 @@
+# PPO trainer config for the blackbox SWE-agent recipe (v2).
+# Uses the generic AgentFrameworkRolloutAdapter + SWEAgentFramework subclass.
+
+hydra:
+ searchpath:
+ - pkg://verl.trainer.config
+
+defaults:
+ - ppo_trainer
+ - _self_
+
+actor_rollout_ref:
+ hybrid_engine: true
+ nccl_timeout: 600
+
+ model:
+ path: ???
+ enable_gradient_checkpointing: true
+
+ rollout:
+ name: vllm
+ mode: async
+ prompt_length: 4096
+ response_length: 131072
+ max_model_len: 135168
+ temperature: 1.0
+ top_p: 1.0
+ n: 8
+ tensor_model_parallel_size: 4
+ gpu_memory_utilization: 0.7
+ calculate_log_probs: true
+ enable_sleep_mode: true
+ free_cache_engine: true
+
+ multi_turn:
+ enable: true
+ max_assistant_turns: 1
+ max_parallel_calls: 1
+ format: qwen3_coder
+
+ agent:
+ num_workers: 8
+ agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter
+
+ custom:
+ agent_framework:
+ framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
+ agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
+ gateway_count: 1
+ completion_timeout_seconds: 600
+ max_concurrent_sessions: 32
+ agent_runner_kwargs:
+ agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
+
+ actor:
+ use_dynamic_bsz: true
+ ppo_mini_batch_size: 16
+ use_kl_loss: false
+ kl_loss_coef: 0.0
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.28
+ loss_agg_mode: token-mean
+ optim:
+ lr: 1e-6
+ weight_decay: 0.1
+ clip_grad: 1.0
+ fsdp_config:
+ param_offload: true
+ optimizer_offload: true
+ grad_offload: true
+
+data:
+ train_files: ???
+ val_files: ???
+ max_prompt_length: 4096
+ max_response_length: 131072
+ train_batch_size: 128
+ val_batch_size: 128
+ return_raw_chat: true
+ trust_remote_code: true
+ custom_cls:
+ path: pkg://examples.swe_agent_blackbox.dataset
+ name: SWEBenchDataset
+
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ use_kl_in_reward: false
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.0
+
+reward:
+ custom_reward_function:
+ path: pkg://examples.swe_agent_blackbox.reward
+ name: compute_score
+
+trainer:
+ use_legacy_worker_impl: disable
+ nnodes: 1
+ n_gpus_per_node: 8
+ total_epochs: 10
+ project_name: swe_agent_blackbox
+ experiment_name: swe_agent
+ logger:
+ - console
+ device: cuda
+ balance_batch: false
+ val_before_train: true
+ val_only: false
+ save_freq: 10
+ test_freq: 10
+ default_local_dir: checkpoints/swe_agent_blackbox
+ resume_mode: disable
+
+ray_kwargs:
+ ray_init:
+ runtime_env:
+ env_vars:
+ TRANSFER_QUEUE_ENABLE: ""
+ NCCL_P2P_DISABLE: "1"
+ NCCL_SHM_DISABLE: "1"
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml
new file mode 100644
index 00000000..d25fcce5
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml
@@ -0,0 +1,162 @@
+# Megatron + TQ fully-async training config for the blackbox SWE-agent recipe.
+# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend.
+#
+# Entry point: python3 -m verl.experimental.fully_async_policy.fully_async_main
+# Requires: transfer_queue.enable=true (selects TQ path in FullyAsyncTaskRunner)
+
+hydra:
+ searchpath:
+ - pkg://verl.trainer.config
+
+defaults:
+ - ppo_megatron_trainer
+ - _self_
+
+actor_rollout_ref:
+ hybrid_engine: false
+ nccl_timeout: 9600
+
+ model:
+ path: ???
+
+ rollout:
+ name: vllm
+ mode: async
+ prompt_length: 4096
+ response_length: 131072
+ max_model_len: 135168
+ temperature: 1.0
+ top_p: 1.0
+ n: 8
+ tensor_model_parallel_size: 2
+ gpu_memory_utilization: 0.7
+ calculate_log_probs: true
+ enable_sleep_mode: true
+ free_cache_engine: true
+ enable_chunked_prefill: true
+ max_num_batched_tokens: 135168
+ checkpoint_engine:
+ backend: nccl
+
+ multi_turn:
+ enable: true
+ max_assistant_turns: 1
+ max_parallel_calls: 1
+ format: qwen3_coder
+
+ agent:
+ num_workers: 8
+ agent_loop_manager_class: uni_agent.trainer.framework.entry.FullyAsyncAgentFrameworkRolloutAdapter
+
+ custom:
+ agent_framework:
+ framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
+ agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
+ gateway_count: 1
+ completion_timeout_seconds: 600
+ max_concurrent_sessions: 32
+ agent_runner_kwargs:
+ agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
+
+ actor:
+ use_dynamic_bsz: true
+ use_rollout_log_probs: true
+ ppo_mini_batch_size: 16
+ ppo_micro_batch_size_per_gpu: 1
+ use_kl_loss: false
+ kl_loss_coef: 0.0
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.28
+ clip_ratio_c: 10.0
+ loss_agg_mode: token-mean
+ entropy_coeff: 0
+ optim:
+ lr: 1e-6
+ weight_decay: 0.1
+ lr_decay_style: constant
+ megatron:
+ param_offload: true
+ grad_offload: true
+ optimizer_offload: true
+ tensor_model_parallel_size: 8
+ pipeline_model_parallel_size: 1
+ context_parallel_size: 1
+ use_mbridge: true
+ use_remove_padding: false
+
+ ref:
+ megatron:
+ param_offload: false
+ tensor_model_parallel_size: 8
+ pipeline_model_parallel_size: 1
+ context_parallel_size: 1
+
+data:
+ train_files: ???
+ val_files: ???
+ prompt_key: prompt
+ truncation: left
+ max_prompt_length: 4096
+ max_response_length: 131072
+ train_batch_size: 0
+ gen_batch_size: 1
+ return_raw_chat: true
+ trust_remote_code: true
+ custom_cls:
+ path: pkg://examples.swe_agent_blackbox.dataset
+ name: SWEBenchDataset
+
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ use_kl_in_reward: false
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.0
+ rollout_correction:
+ bypass_mode: true
+
+reward:
+ custom_reward_function:
+ path: pkg://examples.swe_agent_blackbox.reward
+ name: compute_score
+
+trainer:
+ nnodes: 1
+ n_gpus_per_node: 8
+ total_epochs: 10
+ project_name: swe_agent_blackbox
+ experiment_name: swe_agent
+ logger:
+ - console
+ device: cuda
+ val_before_train: true
+ val_only: false
+ save_freq: 10
+ test_freq: 10
+ default_local_dir: checkpoints/swe_agent_blackbox
+ resume_mode: auto
+
+rollout:
+ nnodes: 1
+ n_gpus_per_node: 8
+ total_rollout_steps: 100000
+
+async_training:
+ use_trainer_do_validate: false
+ staleness_threshold: 1.0
+ trigger_parameter_sync_step: 4
+ require_batches: 1
+ partial_rollout: true
+
+transfer_queue:
+ enable: true
+
+ray_kwargs:
+ ray_init:
+ runtime_env:
+ env_vars:
+ TRANSFER_QUEUE_ENABLE: ""
+ NCCL_P2P_DISABLE: "1"
+ NCCL_SHM_DISABLE: "1"
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
new file mode 100644
index 00000000..65b09b1a
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
@@ -0,0 +1,129 @@
+# Megatron sync training config for the blackbox SWE-agent recipe.
+# Uses main_ppo_sync + Megatron backend, same blackbox infrastructure as FSDP.
+#
+# Entry point: python3 -m verl.trainer.main_ppo_sync
+
+hydra:
+ searchpath:
+ - pkg://verl.trainer.config
+
+defaults:
+ - ppo_megatron_trainer
+ - _self_
+
+actor_rollout_ref:
+ hybrid_engine: true
+ nccl_timeout: 600
+
+ model:
+ path: ???
+ enable_gradient_checkpointing: true
+
+ rollout:
+ name: vllm
+ mode: async
+ prompt_length: 4096
+ response_length: 131072
+ max_model_len: 135168
+ temperature: 1.0
+ top_p: 1.0
+ n: 8
+ tensor_model_parallel_size: 4
+ gpu_memory_utilization: 0.7
+ calculate_log_probs: true
+ enable_sleep_mode: true
+ free_cache_engine: true
+
+ multi_turn:
+ enable: true
+ max_assistant_turns: 1
+ max_parallel_calls: 1
+ format: qwen3_coder
+
+ agent:
+ num_workers: 8
+ agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter
+
+ custom:
+ agent_framework:
+ framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
+ agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
+ gateway_count: 1
+ completion_timeout_seconds: 600
+ max_concurrent_sessions: 32
+ agent_runner_kwargs:
+ agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
+
+ actor:
+ use_dynamic_bsz: true
+ ppo_mini_batch_size: 16
+ use_kl_loss: false
+ kl_loss_coef: 0.0
+ clip_ratio_low: 0.2
+ clip_ratio_high: 0.28
+ loss_agg_mode: token-mean
+ optim:
+ lr: 1e-6
+ weight_decay: 0.1
+ clip_grad: 1.0
+ megatron:
+ param_offload: true
+ grad_offload: true
+ optimizer_offload: true
+ tensor_model_parallel_size: 8
+ pipeline_model_parallel_size: 1
+ context_parallel_size: 1
+ use_mbridge: true
+
+data:
+ train_files: ???
+ val_files: ???
+ max_prompt_length: 4096
+ max_response_length: 131072
+ train_batch_size: 128
+ val_batch_size: 128
+ return_raw_chat: true
+ trust_remote_code: true
+ custom_cls:
+ path: pkg://examples.swe_agent_blackbox.dataset
+ name: SWEBenchDataset
+
+algorithm:
+ gamma: 1.0
+ lam: 1.0
+ adv_estimator: grpo
+ use_kl_in_reward: false
+ kl_ctrl:
+ type: fixed
+ kl_coef: 0.0
+
+reward:
+ custom_reward_function:
+ path: pkg://examples.swe_agent_blackbox.reward
+ name: compute_score
+
+trainer:
+ use_legacy_worker_impl: disable
+ nnodes: 1
+ n_gpus_per_node: 8
+ total_epochs: 10
+ project_name: swe_agent_blackbox
+ experiment_name: swe_agent
+ logger:
+ - console
+ device: cuda
+ balance_batch: false
+ val_before_train: true
+ val_only: false
+ save_freq: 10
+ test_freq: 10
+ default_local_dir: checkpoints/swe_agent_blackbox
+ resume_mode: disable
+
+ray_kwargs:
+ ray_init:
+ runtime_env:
+ env_vars:
+ TRANSFER_QUEUE_ENABLE: ""
+ NCCL_P2P_DISABLE: "1"
+ NCCL_SHM_DISABLE: "1"
diff --git a/examples/blackbox_recipes/mini_swe_agent/dataset.py b/examples/blackbox_recipes/mini_swe_agent/dataset.py
new file mode 100644
index 00000000..89d65129
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/dataset.py
@@ -0,0 +1,34 @@
+"""SWEBench-specific dataset that injects verl-standard reward fields."""
+
+from verl.utils.dataset.rl_dataset import RLHFDataset
+
+
+def extract_image(env_config: dict) -> str:
+    """Return the Docker image named in *env_config*, or ``""`` when none is set.
+
+    Two layouts are recognised, checked in this order:
+
+    * flat:   ``env_config["image"]``
+    * nested: ``env_config["deployment"]["image"]``
+    """
+    flat_image = env_config.get("image")
+    if flat_image:
+        return flat_image
+
+    nested = env_config.get("deployment")
+    # Only a dict-shaped deployment entry can carry an image; anything else is ignored.
+    nested_image = nested.get("image") if isinstance(nested, dict) else None
+    return nested_image or ""
+
+
+class SWEBenchDataset(RLHFDataset):
+    """RLHFDataset variant that guarantees ``data_source`` and ``reward_model`` on every row."""
+
+    def __getitem__(self, item):
+        """Return the parent's row with reward defaults filled in.
+
+        ``data_source`` defaults to ``extra_info.tools_kwargs.reward.name``
+        (or ``"unknown"``), and ``reward_model`` to an empty ground truth.
+        Values already present on the row are left untouched.
+        """
+        row_dict = super().__getitem__(item)
+        # ``or {}`` (rather than a ``.get`` default) also covers keys that are
+        # present but null, which would otherwise make the chained ``.get`` fail.
+        extra_info = row_dict.get("extra_info") or {}
+        tools_kwargs = extra_info.get("tools_kwargs") or {}
+        reward_config = tools_kwargs.get("reward") or {}
+
+        row_dict.setdefault("data_source", reward_config.get("name", "unknown"))
+        row_dict.setdefault("reward_model", {"ground_truth": {}})
+
+        return row_dict
diff --git a/examples/blackbox_recipes/mini_swe_agent/framework.py b/examples/blackbox_recipes/mini_swe_agent/framework.py
new file mode 100644
index 00000000..7c5c027c
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/framework.py
@@ -0,0 +1,105 @@
+"""SWE-agent specific framework subclass.
+
+Injects reward_info (from agent_runner's complete_session call)
+into sample_fields["extra_info"] so the reward worker's
+compute_score can access it via extra_info.
+
+Overrides _run_session to execute agent_runner in a separate Ray worker
+process, preventing blocking operations from stalling the event loop.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import functools
+import logging
+from dataclasses import replace
+from uuid import uuid4
+
+import ray
+
+from uni_agent.trainer.framework.framework import OpenAICompatibleAgentFramework
+
+from examples.swe_agent_blackbox.subprocess_runner import remote_agent_run
+
+logger = logging.getLogger(__name__)
+
+
+class SWEAgentFramework(OpenAICompatibleAgentFramework):
+    """Agent framework subclass used by the blackbox SWE-agent recipe.
+
+    It changes two things relative to the base class: the last trajectory's
+    ``reward_info`` is merged into ``extra_info`` before scoring, and the
+    agent runner is dispatched through ``remote_agent_run`` (a Ray remote
+    call) instead of being invoked directly.
+    """
+
+    async def _score_trajectories(self, session_trajectories, sample_fields):
+        """Score trajectories after folding the last trajectory's reward_info into extra_info.
+
+        ``sample_fields`` is copied rather than mutated; on key collisions the
+        values from ``reward_info`` win over the existing ``extra_info`` entries.
+        """
+        if session_trajectories and session_trajectories[-1].reward_info:
+            reward_info = session_trajectories[-1].reward_info
+            extra_info = dict(sample_fields.get("extra_info") or {})
+            sample_fields = {**sample_fields, "extra_info": {**extra_info, **reward_info}}
+        return await super()._score_trajectories(session_trajectories, sample_fields)
+
+    def _resolve_runner(self) -> tuple[str, dict]:
+        """Extract FQN and pre-bound kwargs from self.agent_runner.
+
+        self.agent_runner may be a functools.partial (from_config wraps it),
+        so we unpack the original function and its keywords.
+
+        Returns:
+            ``(fqn, kwargs)`` where ``fqn`` is ``"<module>.<qualname>"`` of the
+            underlying callable and ``kwargs`` is a copy of the partial's
+            keywords (empty when the runner is not a partial).
+        """
+        fn = self.agent_runner
+        kwargs = {}
+        if isinstance(fn, functools.partial):
+            kwargs = dict(fn.keywords)
+            fn = fn.func
+        fqn = f"{fn.__module__}.{fn.__qualname__}"
+        return fqn, kwargs
+
+    async def _run_session(
+        self,
+        *,
+        prompts,
+        raw_prompt,
+        sample_index: int,
+        session_id: str | None = None,
+        runner_kwargs: dict | None = None,
+    ):
+        """Run agent_runner in a Ray worker process instead of in-process.
+
+        Returns:
+            ``(trajectories, sample_fields)``. Trajectories are returned
+            unscored when there are no reward workers or no trajectories;
+            otherwise each one is a copy carrying ``reward_score`` and a
+            ``reward_extra_info`` entry in ``extra_fields``.
+
+        Raises:
+            Any exception from the remote run or session finalisation, after
+            the session has been aborted.
+        """
+        session_id = session_id or f"session-{sample_index}-0-{uuid4().hex}"
+        sample_fields = self._extract_sample_fields(prompts=prompts, sample_index=sample_index)
+        session = await self.session_runtime.create_session(session_id)
+        agent_runner_fqn, resolved_kwargs = self._resolve_runner()
+
+        try:
+            # Per-call kwargs override the ones pre-bound on the runner.
+            if runner_kwargs:
+                resolved_kwargs = {**resolved_kwargs, **runner_kwargs}
+
+            ref = remote_agent_run.remote(
+                agent_runner_fqn=agent_runner_fqn,
+                raw_prompt=raw_prompt,
+                session_id=session_id,
+                base_url=session.base_url,
+                sample_index=sample_index,
+                runner_kwargs=resolved_kwargs,
+            )
+            # ray.get blocks, so it is pushed to the default executor to keep
+            # this event loop free for other sessions.
+            loop = asyncio.get_running_loop()
+            reward_info = await loop.run_in_executor(None, ray.get, ref)
+
+            await self.session_runtime.complete_session(
+                session_id, reward_info=reward_info,
+            )
+            session_trajectories = await self.session_runtime.finalize_session(session_id)
+
+        except Exception as e:
+            logger.error("_run_session failed: session=%s, sample=%d, runner=%s: %s",
+                         session_id, sample_index, agent_runner_fqn, e, exc_info=True)
+            # Abort the session before propagating the failure.
+            await self.session_runtime.abort_session(session_id)
+            raise
+
+        # Nothing to score with, or nothing to score.
+        if not self.reward_loop_worker_handles or not session_trajectories:
+            return session_trajectories, sample_fields
+
+        annotations = await self._score_trajectories(session_trajectories, sample_fields)
+        scored_trajectories = []
+        # strict=True: a length mismatch between trajectories and scores raises.
+        for traj, (score, extra) in zip(session_trajectories, annotations, strict=True):
+            scored_trajectories.append(
+                replace(
+                    traj,
+                    reward_score=score,
+                    extra_fields={**traj.extra_fields, "reward_extra_info": extra},
+                )
+            )
+        return scored_trajectories, sample_fields
diff --git a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
new file mode 100644
index 00000000..33882bc8
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
@@ -0,0 +1,227 @@
+"""Mini-swe-agent runner for the blackbox SWE-agent recipe.
+
+Agent runs inside an OpenYuanRong remote sandbox via sidecar tool image mount.
+The runner creates the sandbox, pipes task config via stdin, parses
+the result from stdout, and evaluates reward in the same sandbox.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import os
+import shlex
+import time
+from pathlib import Path
+
+from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime
+
+from examples.swe_agent_blackbox.dataset import extract_image
+from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env
+from examples.swe_agent_blackbox.sandbox import CommandResult, YRSandbox, extract_upstream, rewrite_gateway_url
+
+logger = logging.getLogger(__name__)
+if os.environ.get("DEBUG_MODE"):
+ logger.setLevel(logging.DEBUG)
+
+DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest"
+
+
+class SandboxEnvForReward:
+    """Adapts :class:`YRSandbox` to the async env interface used by
+    reward specs (``communicate``, ``write_file``, ``read_file``).
+    """
+
+    def __init__(self, sandbox):
+        self._sandbox = sandbox
+
+    async def communicate(self, input: str, timeout=600, check="ignore", error_msg="Command failed") -> str:
+        """Run *input* in the sandbox and return its stdout.
+
+        Raises:
+            RuntimeError: only when ``check == "raise"`` and the command exits
+                non-zero; the message carries the first 200 chars of stdout.
+        """
+        result = await self._sandbox.run(input, timeout=int(timeout))
+        if check == "raise" and result.exit_code != 0:
+            raise RuntimeError(f"{error_msg}: {result.stdout[:200]}")
+        return result.stdout
+
+    async def write_file(self, path: str | Path, content: str) -> None:
+        """Write *content* to *path* inside the sandbox.
+
+        The content travels base64-encoded so it needs no shell escaping; the
+        path is shell-quoted so spaces or metacharacters cannot split or
+        inject into the command.
+        """
+        encoded = base64.b64encode(content.encode()).decode()
+        target = shlex.quote(str(path))
+        await self.communicate(f"echo {encoded} | base64 -d > {target}", check="raise", error_msg=f"write {path}")
+
+    async def read_file(self, path: str | Path, **_) -> str:
+        """Return the stdout of ``cat`` on *path* (shell-quoted); extra keyword arguments are ignored."""
+        return await self.communicate(f"cat {shlex.quote(str(path))}")
+
+
+def _extract_task(raw_prompt) -> str:
+    """Return the task text carried by *raw_prompt*.
+
+    A string is returned as-is. Otherwise the content of the first dict
+    message whose role is ``"user"`` is used, falling back to
+    ``str(raw_prompt)`` when no such message exists.
+    """
+    if isinstance(raw_prompt, str):
+        return raw_prompt
+    for message in raw_prompt:
+        if isinstance(message, dict) and message.get("role") == "user":
+            return message["content"]
+    return str(raw_prompt)
+
+
+def _build_task_config(
+    *,
+    task: str,
+    gateway_url: str,
+) -> dict:
+    """Assemble the JSON-serialisable config that run_agent.py reads from stdin.
+
+    The gateway URL is passed through ``rewrite_gateway_url`` and the agent's
+    step limit comes from ``SWE_AGENT_MAX_TURNS`` (default 100).
+    """
+    rewritten_url = rewrite_gateway_url(gateway_url)
+    max_steps = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100"))
+    config = {"task": task, "gateway_url": rewritten_url}
+    config["agent"] = {"step_limit": max_steps}
+    return config
+
+
+def build_agent_command(
+    *,
+    config_b64: str,
+    conda_env: str = "testbed",
+) -> str:
+    """Build the command that runs run_agent.py inside the sandbox.
+
+    The command unsets proxy variables, decodes the base64 task config and
+    pipes it into run_agent.py's stdin.
+
+    Args:
+        config_b64: Base64-encoded JSON task config.
+        conda_env: Name of the conda environment under ``/opt/miniconda3/envs``
+            whose ``bin`` directory is put first on the agent's ``PATH``.
+
+    Returns:
+        A single shell command line.
+    """
+    conda_prefix = f"/opt/miniconda3/envs/{conda_env}"
+    env_prefix = (
+        f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)} "
+        f"CONDA_PREFIX={shlex.quote(conda_prefix)} "
+        f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH"
+    )
+    # A ``VAR=value`` prefix applies only to the simple command it precedes,
+    # not to the whole pipeline. It must therefore sit on the python stage;
+    # in front of ``echo`` the agent process would never see these variables.
+    return (
+        "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; "
+        f"echo {shlex.quote(config_b64)} | base64 -d | "
+        f"{env_prefix} "
+        "/opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py"
+    )
+
+
+async def mini_swe_agent_runner(
+    *,
+    raw_prompt,
+    session: SessionHandle,
+    sample_index: int,
+    session_runtime: SessionRuntime,
+    tools_kwargs: dict | None = None,
+    tool_image: str = DEFAULT_TOOL_IMAGE,
+    run_timeout: int = 7200,
+    conda_env: str = "testbed",
+    **kwargs,
+) -> None:
+    """Run mini-swe-agent inside a sandbox with sidecar tool mount.
+
+    Flow:
+      1. Create OpenYuanRong remote sandbox with mini-swe-agent sidecar
+      2. Pipe task config to run_agent.py via stdin
+      3. Parse agent result from stdout
+      4. Evaluate reward in the same sandbox
+      5. Complete session with reward_info
+
+    Args:
+        raw_prompt: Task prompt, either a string or a list of chat messages.
+        session: Handle providing ``base_url`` and ``session_id``.
+        sample_index: Index used in log messages and error text.
+        session_runtime: Runtime on which the session is completed.
+        tools_kwargs: Per-sample settings; ``env`` supplies the sandbox image
+            and optional ``post_setup_cmd``.
+        tool_image: Sidecar image that carries the agent.
+        run_timeout: Timeout in seconds for the agent command.
+        conda_env: Conda environment activated for the agent command.
+        **kwargs: Accepted and ignored.
+
+    Raises:
+        ValueError: if no sandbox image or gateway URL is available.
+        Exception: any failure inside the sandbox phase is logged and re-raised.
+    """
+    tools_kwargs = tools_kwargs or {}
+    logger.info("mini_swe_agent_runner called, sample_index=%d", sample_index)
+
+    # Extract task text and sandbox config (image from parquet)
+    task = _extract_task(raw_prompt)
+    logger.info("task extracted, %d chars", len(task))
+
+    env_config = tools_kwargs.get("env", {})
+    image = extract_image(env_config)
+    if not image:
+        raise ValueError(f"No sandbox image found in tools_kwargs.env for sample {sample_index}")
+
+    # Gateway URL — extract upstream for OpenYuanRong tunnel
+    gateway_url = session.base_url
+    if not gateway_url:
+        raise ValueError(f"gateway_url is empty for sample {sample_index}")
+
+    upstream = extract_upstream(gateway_url)
+    sandbox = await YRSandbox.create(
+        image=image, sidecar_image=tool_image, upstream=upstream,
+    )
+    sandbox_id = sandbox.sandbox_id
+    logger.info("Sandbox created (image=%s, sandbox_id=%s)", image, sandbox_id)
+
+    # From here on the sandbox exists, so every step that can fail lives
+    # inside the try block and the ``finally`` clause always releases it.
+    try:
+        # Build task config (gateway URL rewritten to sandbox-internal tunnel)
+        task_config = _build_task_config(
+            task=task,
+            gateway_url=gateway_url,
+        )
+
+        # Run post_setup_cmd if provided (e.g. git checkout correct commit)
+        post_setup_cmd = env_config.get("post_setup_cmd", "")
+        if post_setup_cmd:
+            logger.info("Running post_setup_cmd (%d chars)...", len(post_setup_cmd))
+            r = await sandbox.run(post_setup_cmd, timeout=600)
+            if r.exit_code != 0:
+                # A failed setup is logged but does not stop the run.
+                logger.warning("post_setup_cmd failed (rc=%d): %s", r.exit_code, r.stdout[:200])
+            else:
+                logger.info("post_setup_cmd done")
+
+        # Run agent inside sandbox — pipe config via base64-encoded stdin.
+        config_b64 = base64.b64encode(json.dumps(task_config).encode()).decode()
+        agent_cmd = build_agent_command(config_b64=config_b64, conda_env=conda_env)
+        logger.debug("[sample %d] starting agent inside sandbox", sample_index)
+        t0 = time.perf_counter()
+        agent_result = await sandbox.run(agent_cmd, timeout=int(run_timeout))
+        elapsed = time.perf_counter() - t0
+        logger.debug(
+            "[sample %d] agent process finished: rc=%d (%.1fs)",
+            sample_index, agent_result.exit_code, elapsed,
+        )
+
+        # Parse agent result from stdout
+        agent_info = _parse_agent_result(agent_result.stdout, sample_index)
+        logger.info(
+            "[sample %d] agent: exit_status=%s, submission=%d chars",
+            sample_index, agent_info.get("exit_status"),
+            len(agent_info.get("submission", "")),
+        )
+
+        # Evaluate reward in the same sandbox
+        metadata, eval_timeout = build_reward_context(tools_kwargs)
+        t0 = time.perf_counter()
+        reward_env = SandboxEnvForReward(sandbox)
+        score, eval_result = await evaluate_in_env(reward_env, metadata, eval_timeout)
+        logger.debug(
+            "[sample %d] reward done: score=%s, resolved=%s (%.1fs)",
+            sample_index, score, eval_result.get("resolved"), time.perf_counter() - t0,
+        )
+
+        reward_info = {"reward_score": score, **eval_result}
+        await session_runtime.complete_session(session.session_id, reward_info=reward_info)
+
+    except Exception as e:
+        logger.warning("Mini-swe-agent runner failed for sample %d (sandbox_id=%s): %s", sample_index, sandbox_id, e)
+        raise
+    finally:
+        # Cleanup stays best-effort so it never masks the primary outcome,
+        # but a failure is logged rather than silently dropped.
+        try:
+            await sandbox.cleanup()
+        except Exception as cleanup_error:
+            logger.warning("Sandbox cleanup failed (sandbox_id=%s): %s", sandbox_id, cleanup_error)
+
+
+def _parse_agent_result(stdout: str, sample_index: int) -> dict:
+    """Parse agent result JSON from run_agent.py stdout.
+
+    litellm may print error messages to stdout, polluting the output.
+    The last line starting with '{' is the result JSON.
+
+    Args:
+        stdout: Raw stdout captured from the agent process.
+        sample_index: Index used only in the warning log.
+
+    Returns:
+        The parsed JSON object. When stdout is empty or holds no JSON object,
+        ``{"exit_status": "error", "submission": ""}`` is returned so callers
+        can always treat the result as a dict.
+    """
+    stdout = stdout.strip()
+    if not stdout:
+        return {"exit_status": "error", "submission": ""}
+
+    # Try the last line that looks like JSON first
+    candidates = [line.strip() for line in stdout.split("\n") if line.strip()]
+    for line in reversed(candidates):
+        if not line.startswith("{"):
+            continue
+        try:
+            parsed = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):
+            return parsed
+
+    # Fallback: try entire stdout (covers a pretty-printed, multi-line object)
+    try:
+        parsed = json.loads(stdout)
+    except json.JSONDecodeError:
+        parsed = None
+    # Valid JSON that is not an object (a list, number, string) is unusable
+    # here: the caller calls ``.get`` on the result.
+    if isinstance(parsed, dict):
+        return parsed
+
+    logger.warning("[sample %d] Failed to parse agent result (full stdout): %s", sample_index, stdout[:1000])
+    return {"exit_status": "error", "submission": ""}
diff --git a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
new file mode 100644
index 00000000..c74765e0
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
@@ -0,0 +1,447 @@
+"""Parallel inference runner for the blackbox SWE-agent recipe (v2).
+
+Creates an LLM server, GatewayServingRuntime, and SWEAgentFramework,
+then runs agent sessions in parallel and reports resolve rate.
+
+Usage (CLI):
+ python examples/swe_agent_blackbox/parallel_infer.py \
+ --model-path ~/models/Qwen3-Coder-30B-A3B-Instruct \
+ --data-path ~/data/swe_agent/swe_bench_verified.parquet \
+ --max-samples 10
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from functools import partial
+from typing import Any
+from uuid import uuid4
+
+import numpy as np
+import ray
+
+from verl import DataProto
+from verl.protocol import pad_dataproto_to_divisor
+from verl.utils import hf_tokenizer
+from verl.utils.transferqueue_utils import tq as _tq_mock
+from verl.workers.rollout.llm_server import LLMServerManager
+
+from uni_agent.trainer.gateway.runtime import GatewayServingRuntime
+
+from examples.swe_agent_blackbox.framework import SWEAgentFramework
+from examples.swe_agent_blackbox.agent_runner import swe_agent_runner
+from examples.swe_agent_blackbox.claude_code_runner import claude_code_runner
+
+try:
+ from examples.swe_agent_blackbox.mini_swe_agent_runner import mini_swe_agent_runner
+except ImportError:
+ mini_swe_agent_runner = None
+
+logging.basicConfig(
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ level=os.getenv("VERL_LOGGING_LEVEL", "INFO"),
+ force=True,
+)
+logger = logging.getLogger(__name__)
+
+
+# =====================================================================
+# Dataset loading (inlined from dataset.py — only used here)
+# =====================================================================
+
+
+def _remap_image_to_local(image_name: str) -> str:
+    """Map an image reference to its local ``<name>:latest`` form.
+
+    When the first ``/``-separated segment contains a dot, only the final
+    segment is kept; otherwise the whole reference is used. ``_1776_`` is
+    then replaced by ``__`` and any tag after the last ``:`` is swapped for
+    ``latest``.
+    """
+    segments = image_name.split("/")
+    has_dotted_prefix = len(segments) > 1 and "." in segments[0]
+    local_name = segments[-1] if has_dotted_prefix else image_name
+    local_name = local_name.replace("_1776_", "__")
+    if ":" in local_name:
+        local_name = local_name.rpartition(":")[0]
+    return local_name + ":latest"
+
+
+def _remap_sample_images(sample: dict[str, Any]) -> dict[str, Any]:
+    """Rewrite ``extra_info.tools_kwargs.env.image`` of *sample* to its local name.
+
+    The sample is modified in place and also returned. Samples without
+    ``extra_info`` or without an image are returned unchanged.
+    """
+    extra_info = sample.get("extra_info")
+    if not extra_info:
+        return sample
+
+    env = extra_info.get("tools_kwargs", {}).get("env", {})
+    image = env.get("image")
+    if image:
+        local_image = _remap_image_to_local(image)
+        if local_image != image:
+            logger.debug("Remapping image: %s -> %s", image, local_image)
+            env["image"] = local_image
+    return sample
+
+
+def _inject_reward_fields(sample: dict[str, Any]) -> None:
+    """Inject verl-standard data_source and reward_model from extra_info.tools_kwargs.reward.
+
+    Existing ``data_source`` / ``reward_model`` values are kept. Missing or
+    null ``extra_info``, ``tools_kwargs`` or ``reward`` entries are treated as
+    empty, matching how ``_remap_sample_images`` tolerates a null
+    ``extra_info`` on the same samples.
+    """
+    # ``or {}`` also covers keys that are present with a None value, which a
+    # ``.get(key, {})`` default would not.
+    extra_info = sample.get("extra_info") or {}
+    tools_kwargs = extra_info.get("tools_kwargs") or {}
+    reward_config = tools_kwargs.get("reward") or {}
+    sample.setdefault("data_source", reward_config.get("name", "unknown"))
+    sample.setdefault("reward_model", {"ground_truth": {}})
+
+
+def load_swe_dataset(data_path: str | list[str], max_samples: int = -1) -> list[dict[str, Any]]:
+    """Read one or more parquet files into a list of prepared sample dicts.
+
+    Every sample has its sandbox image remapped to the local name and its
+    reward fields injected. A positive *max_samples* keeps only the first
+    ``max_samples`` rows; ``~`` in paths is expanded.
+    """
+    import pyarrow.parquet as pq
+
+    logger.info("Loading dataset from: %s", data_path)
+    if isinstance(data_path, list):
+        import pyarrow as pa
+
+        expanded = [os.path.expanduser(p) for p in data_path]
+        table = pa.concat_tables([pq.read_table(p) for p in expanded])
+    else:
+        table = pq.read_table(os.path.expanduser(data_path))
+
+    samples = [_remap_sample_images(row) for row in table.to_pylist()]
+    for row in samples:
+        _inject_reward_fields(row)
+
+    if max_samples > 0:
+        samples = samples[:max_samples]
+        logger.info("Using first %d samples (max_samples=%d)", len(samples), max_samples)
+
+    logger.info("Loaded %d samples from %s", len(samples), data_path)
+    return samples
+
+
+class _MockReplayBuffer:
+    """No-op replay buffer for inference runs, where nothing is trained."""
+
+    def add(self, partition_id, items):
+        """Accept *items* for *partition_id* and discard them."""
+        return None
+
+
+def run_inference(
+    *,
+    model_path: str,
+    data_path: str,
+    prompt_length: int = 4096,
+    response_length: int = 65536,
+    temperature: float = 0.8,
+    top_p: float = 0.9,
+    n: int = 1,
+    max_samples: int = -1,
+    engine: str = "vllm",
+    nnodes: int = 1,
+    n_gpus_per_node: int = 8,
+    tensor_parallel_size: int = 4,
+    gateway_count: int = 1,
+    max_concurrent_sessions: int = 2,
+    completion_timeout: float = 600.0,
+    tool_parser: str | None = None,
+    agent_config_path: str | None = None,
+    runner: str = "uniagent",
+    tool_image: str | None = None,
+    run_timeout: int = 7200,
+) -> dict[str, Any]:
+    """Run parallel SWE-agent inference using the blackbox framework.
+
+    Selects the agent runner, builds the config, LLM server, gateway runtime,
+    reward worker and framework, runs every sample ``n`` times, and averages
+    the scores captured from the stubbed transfer-queue writes per sample.
+
+    Args:
+        runner: ``"mini_swe"`` or ``"claude_code"`` pick the matching runner;
+            any other value falls back to ``swe_agent_runner``.
+        tool_image: Sidecar image override; each runner has its own default.
+        agent_config_path: Forwarded in ``tools_kwargs`` only for the
+            ``"uniagent"`` runner.
+
+    Returns:
+        Dict with ``stats`` (the result of ``generate_sequences``, or ``{}``
+        when it raised ``RuntimeError``), ``mean_score`` and
+        ``per_sample_scores``.
+
+    Raises:
+        ImportError: ``runner == "mini_swe"`` but the runner could not be imported.
+        ValueError: the dataset yielded no samples.
+    """
+    if runner == "mini_swe":
+        if mini_swe_agent_runner is None:
+            raise ImportError("mini-swe-agent is required for --runner mini_swe. Install with: pip install mini-swe-agent")
+        _agent_runner = partial(
+            mini_swe_agent_runner,
+            tool_image=tool_image or "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest",
+            run_timeout=run_timeout,
+        )
+    elif runner == "claude_code":
+        _agent_runner = partial(
+            claude_code_runner,
+            tool_image=tool_image or "claude-code-tool:latest",
+            run_timeout=run_timeout,
+        )
+    else:
+        _agent_runner = swe_agent_runner
+
+    if not ray.is_initialized():
+        ray.init()
+
+    # 1. Init Hydra config
+    config = _init_hydra_config(
+        model_path=model_path,
+        engine=engine,
+        prompt_length=prompt_length,
+        response_length=response_length,
+        temperature=temperature,
+        top_p=top_p,
+        n=n,
+        nnodes=nnodes,
+        n_gpus_per_node=n_gpus_per_node,
+        tensor_parallel_size=tensor_parallel_size,
+    )
+
+    # 2. Load dataset
+    samples = load_swe_dataset(data_path, max_samples=max_samples)
+    logger.info(
+        "Loaded %d samples, %d rollout(s) each, runner=%s, gateway_count=%d, max_concurrent_sessions=%d",
+        len(samples),
+        n,
+        runner,
+        gateway_count,
+        max_concurrent_sessions,
+    )
+
+    if not samples:
+        raise ValueError("No samples to process")
+
+    # 3. Create LLM server
+    logger.info("Initializing LLM server manager...")
+    llm_server_manager = LLMServerManager.create(config=config)
+
+    # 4. Create GatewayServingRuntime
+    logger.info("Using tool_parser=%r", tool_parser)
+
+    llm_client = llm_server_manager.get_client()
+    gateway_actor_kwargs = {
+        "tokenizer": hf_tokenizer(os.path.expanduser(model_path)),
+        "base_sampling_params": {"temperature": temperature, "top_p": top_p, "max_tokens": response_length},
+    }
+    if tool_parser:
+        gateway_actor_kwargs["tool_parser_name"] = tool_parser
+
+    gateway_runtime = GatewayServingRuntime(
+        llm_client=llm_client,
+        gateway_count=gateway_count,
+        gateway_actor_kwargs=gateway_actor_kwargs,
+    )
+
+    # 5. Create RewardLoopWorker for compute_score
+    from verl.experimental.reward_loop.reward_loop import RewardLoopWorker
+    reward_worker = ray.remote(RewardLoopWorker).remote(config, None)
+
+    # 6. Create framework
+    framework = SWEAgentFramework(
+        session_runtime=gateway_runtime,
+        agent_runner=_agent_runner,
+        replay_buffer=_MockReplayBuffer(),
+        rollout_config={"n": n, "val_kwargs": {"n": n}},
+        completion_timeout=completion_timeout,
+        wait_for_completion_after_agent_run=True,
+        max_concurrent_sessions=max_concurrent_sessions,
+        reward_loop_worker_handles=[reward_worker],
+    )
+
+    # 7. Build batch data and run
+    _tools_kwargs_list = []
+    for sample in samples:
+        tk = (sample.get("extra_info") or {}).get("tools_kwargs", {})
+        if runner == "uniagent" and agent_config_path:
+            tk["agent_config_path"] = agent_config_path
+        tk["model_path"] = os.path.expanduser(model_path)
+        _tools_kwargs_list.append(tk)
+
+    from tensordict import TensorDict
+    from verl.utils import tensordict_utils as _tu
+
+    raw_prompts = [sample["prompt"] for sample in samples]
+    # One fresh uid per sample; scores are later grouped back by this uid.
+    uids = [str(uuid4()) for _ in samples]
+    td = TensorDict({"uid": uids, "global_steps": [0] * len(samples)}, batch_size=[len(samples)])
+    _tu.assign_non_tensor_stack(td, "raw_prompt", raw_prompts)
+    _tu.assign_non_tensor_stack(td, "tools_kwargs", _tools_kwargs_list)
+    _tu.assign_non_tensor_stack(td, "data_source", [sample["data_source"] for sample in samples])
+    _tu.assign_non_tensor_stack(td, "reward_model", [sample["reward_model"] for sample in samples])
+
+    batch = DataProto(batch=td, meta_info={}).repeat(n)
+
+    # Pad so the batch length is a multiple of the gateway count.
+    # NOTE(review): pad_size is never used afterwards.
+    size_divisor = gateway_count
+    batch_padded, pad_size = pad_dataproto_to_divisor(batch, size_divisor)
+    logger.info("Starting %d agent session(s)...", len(batch_padded))
+
+    # In-memory capture of whatever would be written to the transfer queue.
+    _tq_store: dict[str, Any] = {}
+
+    async def _dummy_kv_put(key, partition_id=None, tag=None, **kwargs):
+        _tq_store[key] = tag
+
+    async def _dummy_kv_batch_put(keys=None, fields=None, tags=None, partition_id=None, **kwargs):
+        for i, key in enumerate(keys):
+            _tq_store[key] = {"fields": fields, "tag": tags[i] if tags else None}
+
+    # Replace the transfer-queue writers with the local stubs above.
+    _tq_mock.async_kv_put = _dummy_kv_put
+    _tq_mock.async_kv_batch_put = _dummy_kv_batch_put
+
+    async def _generate():
+        return await framework.generate_sequences(batch_padded.batch)
+
+    try:
+        stats = asyncio.run(_generate())
+    except RuntimeError as e:
+        # Only RuntimeError is tolerated; scoring then proceeds with whatever
+        # was captured in _tq_store.
+        logger.warning("generate_sequences failed: %s", e)
+        stats = {}
+
+    # 8. Collect scores
+    uid_to_sample_idx = {uid: i for i, uid in enumerate(uids)}
+    per_sample_scores = [0.0] * len(samples)
+    sample_trajectory_counts = [0] * len(samples)
+    for key, value in _tq_store.items():
+        # Entries written by _dummy_kv_put hold a bare tag and carry no fields.
+        if not isinstance(value, dict) or "fields" not in value:
+            continue
+        fields = value["fields"]
+        rm_scores = fields.get("rm_scores", None)
+        if rm_scores is None:
+            continue
+        # Key format: {uid}_{session_index}_{index}
+        uid = key.rsplit("_", 2)[0]
+        sample_idx = uid_to_sample_idx.get(uid)
+        if sample_idx is None:
+            continue
+        # The score is read from the last row / last column of rm_scores.
+        score = float(rm_scores.float()[-1, -1].item())
+        per_sample_scores[sample_idx] += score
+        sample_trajectory_counts[sample_idx] += 1
+
+    # Turn per-sample sums into means; samples with no trajectory stay at 0.0.
+    for i in range(len(samples)):
+        if sample_trajectory_counts[i] > 0:
+            per_sample_scores[i] /= sample_trajectory_counts[i]
+
+    resolved_count = sum(1 for s in per_sample_scores if s > 0)
+    overall_mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0
+    logger.info(
+        "Resolved %d / %d samples (%.2f%%), mean score: %.4f",
+        resolved_count, len(samples), 100.0 * resolved_count / max(len(samples), 1), overall_mean,
+    )
+
+    # 9. Cleanup
+    # NOTE(review): not reached if anything above raises (no try/finally).
+    asyncio.run(gateway_runtime.shutdown())
+
+    return {
+        "stats": stats,
+        "mean_score": overall_mean,
+        "per_sample_scores": per_sample_scores,
+    }
+
+
+# =====================================================================
+# Helpers
+# =====================================================================
+
+
+def _init_hydra_config(
+    *,
+    model_path: str,
+    engine: str,
+    prompt_length: int,
+    response_length: int,
+    temperature: float,
+    top_p: float,
+    n: int,
+    nnodes: int,
+    n_gpus_per_node: int,
+    tensor_parallel_size: int,
+) -> Any:
+    """Initialize Hydra config with rollout/model settings.
+
+    Composes the ``parallel_infer`` config and overrides model, rollout,
+    trainer and reward fields from the arguments. GPU memory utilisation and
+    eager mode are taken from the ``ROLLOUT_GPU_MEM_UTIL`` (default 0.5) and
+    ``ROLLOUT_ENFORCE_EAGER`` environment variables.
+
+    Returns:
+        The composed and patched config object.
+    """
+    from hydra import compose, initialize_config_dir
+    from omegaconf import OmegaConf
+
+    # Relative path: resolved against the current working directory.
+    config_dir = os.path.abspath("examples/swe_agent_blackbox/config")
+    with initialize_config_dir(config_dir=config_dir, version_base=None):
+        config = compose(config_name="parallel_infer")
+
+    config.actor_rollout_ref.model.path = os.path.expanduser(model_path)
+    config.actor_rollout_ref.rollout.name = engine
+    config.actor_rollout_ref.rollout.mode = "async"
+    config.actor_rollout_ref.rollout.prompt_length = prompt_length
+    config.actor_rollout_ref.rollout.response_length = response_length
+    # Context window = prompt + response plus 1024 tokens of headroom.
+    config.actor_rollout_ref.rollout.max_model_len = prompt_length + response_length + 1024
+    config.actor_rollout_ref.rollout.n = n
+    config.actor_rollout_ref.rollout.tensor_model_parallel_size = tensor_parallel_size
+    config.actor_rollout_ref.rollout.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.5"))
+    config.actor_rollout_ref.rollout.temperature = temperature
+    config.actor_rollout_ref.rollout.top_p = top_p
+    config.actor_rollout_ref.rollout.val_kwargs.temperature = temperature
+    config.actor_rollout_ref.rollout.val_kwargs.top_p = top_p
+    config.actor_rollout_ref.rollout.calculate_log_probs = True
+    config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns = 100
+    config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1
+    config.actor_rollout_ref.rollout.nnodes = nnodes
+    config.actor_rollout_ref.rollout.n_gpus_per_node = n_gpus_per_node
+    config.trainer.nnodes = nnodes
+    config.trainer.n_gpus_per_node = n_gpus_per_node
+
+    config.reward.custom_reward_function.path = "pkg://examples.swe_agent_blackbox.reward"
+    config.reward.custom_reward_function.name = "compute_score"
+    config.reward.num_workers = 1
+
+    # Struct mode is lifted around these two assignments and restored after;
+    # presumably the keys are not in the composed schema — TODO confirm.
+    OmegaConf.set_struct(config.actor_rollout_ref.rollout, False)
+    config.actor_rollout_ref.rollout.enable_sleep_mode = False
+    config.actor_rollout_ref.rollout.enforce_eager = os.getenv("ROLLOUT_ENFORCE_EAGER", "0") == "1"
+    OmegaConf.set_struct(config.actor_rollout_ref.rollout, True)
+    return config
+
+
+# =====================================================================
+# CLI entry point
+# =====================================================================
+
+
+def main():
+    """Parse CLI options, export the turn limit and hand the rest to ``run_inference``.
+
+    ``--max-turns`` is published through the ``SWE_AGENT_MAX_TURNS``
+    environment variable; every other option maps one-to-one onto a
+    ``run_inference`` keyword argument.
+    """
+    parser = argparse.ArgumentParser(description="SWE-Agent Blackbox Parallel Inference")
+    add = parser.add_argument
+    add("--data-path", type=str, default="~/data/swe_agent/swe_bench_verified.parquet")
+    add("--model-path", "--model", type=str, default="~/models/Qwen3-Coder-30B-A3B-Instruct")
+    add("--max-turns", type=int, default=100)
+    add("--prompt-length", type=int, default=4096)
+    add("--response-length", type=int, default=65536)
+    add("--temperature", type=float, default=0.8)
+    add("--top-p", type=float, default=0.9)
+    add("--n", type=int, default=1)
+    add("--max-samples", type=int, default=-1)
+    add("--engine", type=str, default="vllm", choices=["vllm", "sglang"])
+    add("--nnodes", type=int, default=1)
+    add("--n-gpus-per-node", type=int, default=8)
+    add("--tensor-parallel-size", "--tp", type=int, default=4)
+    add("--gateway-count", type=int, default=1)
+    add("--max-concurrent-sessions", type=int, default=2)
+    add("--tool-parser", type=str, default="qwen3_coder")
+    add("--tool-image", type=str, default=None)
+    add("--run-timeout", type=int, default=7200)
+    add(
+        "--runner", type=str, default="uniagent", choices=["uniagent", "mini_swe", "claude_code"],
+        help="Agent runner: 'uniagent', 'mini_swe', or 'claude_code'.",
+    )
+    add(
+        "--agent-config-path", type=str,
+        default="examples/swe_agent_blackbox/config/agent_config.yaml",
+        help="Path to agent config YAML.",
+    )
+
+    options = vars(parser.parse_args())
+    # The turn limit travels via the environment, not as a keyword argument.
+    os.environ["SWE_AGENT_MAX_TURNS"] = str(options.pop("max_turns"))
+
+    run_inference(**options)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/blackbox_recipes/mini_swe_agent/reward.py b/examples/blackbox_recipes/mini_swe_agent/reward.py
new file mode 100644
index 00000000..61da218b
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/reward.py
@@ -0,0 +1,74 @@
+"""Reward utilities for the blackbox SWE-agent recipe.
+
+Contains:
+- build_reward_context: extract reward metadata + eval_timeout from tools_kwargs
+- compute_score: thin reward function that reads reward_score from extra_info
+- evaluate_in_env: run reward evaluation in Docker env (shared by both runners)
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def build_reward_context(tools_kwargs: dict) -> tuple[dict[str, Any], int]:
+    """Extract reward metadata and eval_timeout from tools_kwargs; missing or None entries use defaults."""
+    reward_config = tools_kwargs.get("reward") or {}  # key may be present but None
+    metadata = {
+        "data_source": reward_config.get("name") or "unknown",
+        "reward_model": reward_config.get("metadata") or {},
+    }
+    eval_timeout = int(os.environ.get("SWE_AGENT_EVAL_TIMEOUT", "600"))  # env override, default 600
+    return metadata, eval_timeout
+
+
+def compute_score(data_source: str, solution_str: str, ground_truth: str, extra_info=None) -> dict:
+    """Return ``{"score": <float>}`` taken from ``extra_info["reward_score"]``, or 0.0 when it is absent."""
+    if not extra_info or "reward_score" not in extra_info:
+        return {"score": 0.0}
+    reward = float(extra_info["reward_score"])
+    return {"score": reward}
+
+
+def _get_reward_spec(data_source: str):
+ """Load reward spec class by data_source name."""
+ from uni_agent.reward.registry import REWARD_SPEC_REGISTRY, _load_reward_spec_module
+
+ if data_source not in REWARD_SPEC_REGISTRY:
+ _load_reward_spec_module(data_source)
+ cls = REWARD_SPEC_REGISTRY.get(data_source)
+ if cls is None:
+ raise ValueError(f"Unknown data_source: {data_source}. Available: {list(REWARD_SPEC_REGISTRY.keys())}")
+ return cls
+
+
+async def evaluate_in_env(
+ env,
+ metadata: dict[str, Any],
+ eval_timeout: int = 600,
+) -> tuple[float, dict]:
+ """Run reward evaluation in the Docker env.
+
+ Returns (score, eval_result) where score is 1.0/0.0 and
+ eval_result contains details (eval_completed, resolved, etc.).
+ """
+ data_source = metadata.get("data_source", "unknown")
+ reward_model = metadata.get("reward_model", {})
+
+ spec_cls = _get_reward_spec(data_source)
+ spec_metadata = reward_model.get("ground_truth", reward_model)
+
+ spec = spec_cls(
+ run_id="swe_bb_eval",
+ metadata=spec_metadata,
+ env=env,
+ eval_timeout=eval_timeout,
+ )
+
+ resolved, result = await spec.compute_reward()
+ score = 1.0 if resolved else 0.0
+ return score, result
diff --git a/examples/blackbox_recipes/mini_swe_agent/run_agent.py b/examples/blackbox_recipes/mini_swe_agent/run_agent.py
new file mode 100644
index 00000000..c5a4b165
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/run_agent.py
@@ -0,0 +1,106 @@
+#!/opt/mini-swe-agent/bin/python
+"""Run mini-swe-agent inside the sandbox.
+
+Input: task config JSON from **stdin**
+ - task: str — the issue description for the agent to solve
+ - gateway_url: str — LLM gateway endpoint (tunnel URL for OpenYuanRong sandbox)
+ - agent: dict — agent config (e.g. step_limit)
+
+Output: agent result JSON to **stdout**, or error JSON on failure
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+DEFAULT_ACTION_TIMEOUT = 600
+
+
+def _fail(msg: str, exit_status: str = "error") -> None:
+    """Write an error result as one JSON line to stdout (does not terminate the process)."""
+    sys.stdout.write(json.dumps({"exit_status": exit_status, "submission": "", "error": msg}))
+    sys.stdout.write("\n")
+    sys.stdout.flush()
+
+
+def main() -> None:
+ try:
+ # 1. Read task config from stdin
+ config = json.load(sys.stdin)
+ task = config["task"]
+ gateway_url = config["gateway_url"]
+
+ # 2. Load swebench defaults
+ from minisweagent.config import builtin_config_dir, get_config_from_spec
+
+ swebench_cfg = get_config_from_spec(str(builtin_config_dir / "benchmarks" / "swebench.yaml"))
+
+ # 3. Create LocalEnvironment (use swebench defaults)
+ from minisweagent.environments.local import LocalEnvironment
+
+ env_cfg = dict(swebench_cfg.get("environment", {}))
+ env_cfg.pop("environment_class", None)
+ env_cfg["timeout"] = DEFAULT_ACTION_TIMEOUT
+ env_cfg.setdefault("env", {})
+ env_cfg["env"].setdefault("GIT_PAGER", "cat")
+ for key in ("image", "container_timeout", "run_args", "executable", "pull_timeout",
+ "forward_env", "interpreter"):
+ env_cfg.pop(key, None)
+ env = LocalEnvironment(**env_cfg)
+
+ # 4. Create LitellmModel pointing at gateway
+ from minisweagent.models.litellm_model import LitellmModel
+
+ model_defaults = dict(swebench_cfg.get("model", {}))
+ model_defaults.pop("model_name", None)
+ model_defaults.pop("model_kwargs", None)
+ model_cfg = model_defaults
+ model_cfg.update({
+ "model_name": "openai/default",
+ "model_kwargs": {
+ "api_base": gateway_url,
+ "api_key": "not-needed",
+ "drop_params": True,
+ },
+ "cost_tracking": "ignore_errors",
+ })
+ model = LitellmModel(**model_cfg)
+
+ # 5. Create DefaultAgent
+ from minisweagent.agents.default import DefaultAgent
+
+ agent_defaults = dict(swebench_cfg.get("agent", {}))
+ agent_overrides = config.get("agent", {})
+ agent_defaults.update(agent_overrides)
+ agent_cfg = agent_defaults
+ step_limit = agent_cfg.get("step_limit", 100)
+ agent_cfg["step_limit"] = step_limit
+ agent = DefaultAgent(model, env, **agent_cfg)
+
+ # 6. Run agent
+ try:
+ info = agent.run(task=task)
+ except Exception as e:
+ info = {"exit_status": type(e).__name__, "submission": str(e)}
+
+ # 7. Write result to stdout
+ result = {
+ "exit_status": info.get("exit_status", "unknown"),
+ "submission": info.get("submission", ""),
+ "model_stats": {
+ "instance_cost": agent.cost,
+ "api_calls": agent.n_calls,
+ },
+ }
+ sys.stdout.write(json.dumps(result, ensure_ascii=False))
+ sys.stdout.write("\n")
+ sys.stdout.flush()
+
+ except Exception as e:
+ _fail(str(e), exit_status=type(e).__name__)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
new file mode 100644
index 00000000..b03dc7d7
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
@@ -0,0 +1,61 @@
+"""Ray-based subprocess runner for agent_runner execution.
+
+Launches agent_runner in a separate Ray worker process to prevent blocking
+operations (sleep, sync I/O, etc.) from stalling the framework's event loop.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any
+
+import ray
+
+from uni_agent.trainer.framework.types import SessionHandle
+
+logger = logging.getLogger(__name__)
+
+
+class _StubSessionRuntime:
+ """Captures reward_info from agent_runner's complete_session call."""
+
+ def __init__(self):
+ self.reward_info: dict[str, Any] | None = None
+
+ async def complete_session(self, session_id: str, reward_info: dict[str, Any] | None = None):
+ self.reward_info = reward_info
+
+
+@ray.remote(num_cpus=0)
+def remote_agent_run(
+ agent_runner_fqn: str,
+ raw_prompt,
+ session_id: str,
+ base_url: str,
+ sample_index: int,
+ runner_kwargs: dict,
+) -> dict[str, Any] | None:
+ """Run agent_runner in a dedicated Ray worker process."""
+ from verl.utils.import_utils import load_class_from_fqn
+
+ agent_runner = load_class_from_fqn(agent_runner_fqn)
+ stub_runtime = _StubSessionRuntime()
+ handle = SessionHandle(session_id=session_id, base_url=base_url)
+
+ async def _run():
+ try:
+ await agent_runner(
+ raw_prompt=raw_prompt,
+ session=handle,
+ sample_index=sample_index,
+ session_runtime=stub_runtime,
+ **runner_kwargs,
+ )
+ return stub_runtime.reward_info
+ except Exception as e:
+ logger.error("remote_agent_run failed: session_id=%s, sample=%d, error=%s",
+ session_id, sample_index, e, exc_info=True)
+ raise
+
+ return asyncio.run(_run())
diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox/sandbox.py
new file mode 100644
index 00000000..fb21ac94
--- /dev/null
+++ b/examples/blackbox_recipes/sandbox/sandbox.py
@@ -0,0 +1,10 @@
+"""OpenYuanRong (AKernel) remote sandbox command execution.
+
+Uses ``akernel_sdk.Sandbox`` with sidecar ``Mount`` to inject the
+mini-swe-agent tool image. Supports upstream tunnel so the agent
+inside the sandbox can reach the gateway via ``http://127.0.0.1:<port>``.
+"""
+
+#TODO
+
+
diff --git a/examples/blackbox_recipes/scripts/build_tool.sh b/examples/blackbox_recipes/scripts/build_tool.sh
new file mode 100755
index 00000000..e5158629
--- /dev/null
+++ b/examples/blackbox_recipes/scripts/build_tool.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Build a SWE blackbox sidecar tool image.
+#
+# Usage:
+#   bash examples/blackbox_recipes/scripts/build_tool.sh
+#   bash examples/blackbox_recipes/scripts/build_tool.sh --tool claude_code
+#   bash examples/blackbox_recipes/scripts/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
+#   bash examples/blackbox_recipes/scripts/build_tool.sh --npm-registry https://registry.npmmirror.com
+#   bash examples/blackbox_recipes/scripts/build_tool.sh --tool-version latest
+#   bash examples/blackbox_recipes/scripts/build_tool.sh --registry reg.antgroup-inc.cn/myrepo
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+TOOL_KIND="${TOOL_KIND:-mini_swe}"
+IMAGE_TAG="${TOOL_TAG:-latest}"
+TOOL_VERSION="${TOOL_VERSION:-latest}"
+
+# Parse args
+REGISTRY=""
+PIP_INDEX_URL="${PIP_INDEX_URL:-}"
+NPM_REGISTRY="${NPM_REGISTRY:-}"
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --tool) TOOL_KIND="$2"; shift 2 ;;
+ --registry) REGISTRY="$2"; shift 2 ;;
+ --pip-index) PIP_INDEX_URL="$2"; shift 2 ;;
+ --npm-registry) NPM_REGISTRY="$2"; shift 2 ;;
+ --tool-version) TOOL_VERSION="$2"; shift 2 ;;
+ *) echo "Unknown arg: $1"; exit 1 ;;
+ esac
+done
+
+BUILD_ARGS=()
+# Dockerfiles sit in the per-tool recipe directories next to scripts/, not in scripts/ itself.
+DOCKERFILE="${SCRIPT_DIR}/../mini_swe_agent/Dockerfile.mini-swe-agent-tool"
+[[ "${TOOL_KIND}" == "claude" ]] && TOOL_KIND="claude_code"  # accept the short alias
+if [[ "${TOOL_KIND}" == "claude_code" ]]; then
+    IMAGE_NAME="${TOOL_IMAGE:-claude-code-tool}"
+    DOCKERFILE="${SCRIPT_DIR}/../claude_code/Dockerfile.claude-code-tool"
+    BUILD_ARGS+=(--build-arg "TOOL_VERSION=${TOOL_VERSION}")
+    if [[ -n "${NPM_REGISTRY}" ]]; then
+        BUILD_ARGS+=(--build-arg "NPM_REGISTRY=${NPM_REGISTRY}")
+    fi
+elif [[ "${TOOL_KIND}" == "mini_swe" ]]; then
+    IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}"
+    if [[ -n "${PIP_INDEX_URL}" ]]; then
+        BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}")
+    fi
+else
+    echo "Unknown tool: ${TOOL_KIND}; expected mini_swe or claude_code" >&2
+    exit 1
+fi
+
+echo "==> Building ${TOOL_KIND} tool image: ${IMAGE_NAME}:${IMAGE_TAG}"
+# Guarded expansion: an empty array under `set -u` aborts on bash < 4.4; context is the Dockerfile's directory.
+docker build \
+    -f "${DOCKERFILE}" \
+    -t "${IMAGE_NAME}:${IMAGE_TAG}" \
+    ${BUILD_ARGS[@]+"${BUILD_ARGS[@]}"} \
+    "$(dirname "${DOCKERFILE}")"
+
+if [[ -n "${REGISTRY}" ]]; then
+ FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
+ echo "==> Tagging and pushing: ${FULL_TAG}"
+ docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}"
+ docker push "${FULL_TAG}"
+ echo " Pushed."
+fi
+
+echo ""
+echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}"
+if [[ -n "${REGISTRY}" ]]; then
+ echo " Remote sandbox: ${FULL_TAG}"
+fi
diff --git a/examples/blackbox_recipes/scripts/run_infer.sh b/examples/blackbox_recipes/scripts/run_infer.sh
new file mode 100755
index 00000000..d5703aa6
--- /dev/null
+++ b/examples/blackbox_recipes/scripts/run_infer.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Parallel inference for the blackbox SWE-agent recipe.
+#
+# Usage:
+#   bash examples/blackbox_recipes/scripts/run_infer.sh
+
+set -euo pipefail
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}"
+DATA_PATH="${DATA_PATH:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
+
+# ── Inference parameters ─────────────────────────────────────────────────
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-65536}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+TOP_P="${TOP_P:-1.0}"
+N="${N:-8}"
+ENGINE="${ENGINE:-vllm}"
+TP="${TP:-4}"
+N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}"
+GATEWAY_COUNT="${GATEWAY_COUNT:-1}"
+MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-2}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+RUNNER="${RUNNER:-uniagent}"
+AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
+export SWE_AGENT_MAX_TURNS="${SWE_AGENT_MAX_TURNS:-100}"
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-}"
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+
+# ── Logging ──────────────────────────────────────────────────────────────
+export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
+export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.5}"
+
+echo "=== SWE-Agent Blackbox Inference ==="
+echo "Model: ${MODEL_PATH}"
+echo "Data: ${DATA_PATH}"
+echo "Max samples: ${MAX_SAMPLES}"
+echo "Engine: ${ENGINE} (TP=${TP})"
+echo "Runner: ${RUNNER}"
+echo "Gateway count: ${GATEWAY_COUNT}"
+echo "Max concurrent sessions: ${MAX_CONCURRENT_SESSIONS}"
+echo "====================================="
+
+python examples/swe_agent_blackbox/parallel_infer.py \
+ --model-path "${MODEL_PATH}" \
+ --data-path "${DATA_PATH}" \
+ --max-samples "${MAX_SAMPLES}" \
+ --prompt-length "${PROMPT_LENGTH}" \
+ --response-length "${RESPONSE_LENGTH}" \
+ --temperature "${TEMPERATURE}" \
+ --top-p "${TOP_P}" \
+ --n "${N}" \
+ --engine "${ENGINE}" \
+ --tensor-parallel-size "${TP}" \
+ --max-turns "${SWE_AGENT_MAX_TURNS}" \
+ --runner "${RUNNER}" \
+ --agent-config-path "${AGENT_CONFIG_PATH}" \
+ --n-gpus-per-node "${N_GPUS_PER_NODE}" \
+ --gateway-count "${GATEWAY_COUNT}" \
+ --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \
+ --tool-image "${SWE_AGENT_TOOL_IMAGE}" \
+ --run-timeout "${SWE_AGENT_RUN_TIMEOUT}"
diff --git a/examples/blackbox_recipes/scripts/run_train.sh b/examples/blackbox_recipes/scripts/run_train.sh
new file mode 100755
index 00000000..cf08005d
--- /dev/null
+++ b/examples/blackbox_recipes/scripts/run_train.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# Training launch script for the blackbox SWE-agent recipe.
+#
+# Uses GRPO + AgentFrameworkRolloutAdapter with reward computed in-process
+# by the agent runner, then passed through the reward worker's compute_score.
+#
+# Usage:
+#   bash examples/blackbox_recipes/scripts/run_train.sh
+#
+# All configurable via environment variables (see defaults below).
+
+set -euo pipefail
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3-Coder-30B-A3B-Instruct}"
+TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
+VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
+
+# ── Hardware ─────────────────────────────────────────────────────────────
+NNODES="${NNODES:-1}"
+NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
+
+# ── Training parameters ─────────────────────────────────────────────────
+TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}"
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
+ACTOR_LR="${ACTOR_LR:-1e-6}"
+TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}"
+SAVE_FREQ="${SAVE_FREQ:-10}"
+TEST_FREQ="${TEST_FREQ:-10}"
+
+# ── Rollout parameters ──────────────────────────────────────────────────
+ENGINE="${ENGINE:-vllm}"
+TP="${TP:-4}"
+ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
+N="${N:-8}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+RUNNER="${RUNNER:-mini_swe}"
+MAX_TURNS="${MAX_TURNS:-100}"
+AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
+COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
+if [[ "${RUNNER}" == "claude_code" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
+elif [[ "${RUNNER}" == "mini_swe" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
+elif [[ "${RUNNER}" == "uniagent" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE=""
+else
+ echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
+ exit 1
+fi
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+RUNNER_ARGS=(
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
+)
+if [[ "${RUNNER}" != "uniagent" ]]; then
+ RUNNER_ARGS+=(
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
+ )
+fi
+
+# ── Logging ──────────────────────────────────────────────────────────────
+PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
+EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
+VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
+
+export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+export VERL_LOGGING_LEVEL
+
+# ── Environment for NCCL ─────────────────────────────────────────────────
+export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}"
+export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}"
+
+echo "=== SWE-Agent Blackbox Training ==="
+echo "Model: ${MODEL_PATH}"
+echo "Train data: ${TRAIN_DATA}"
+echo "Val data: ${VAL_DATA}"
+echo "Engine: ${ENGINE} (TP=${TP})"
+echo "Runner: ${RUNNER}"
+echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}"
+echo "Epochs: ${TOTAL_EPOCHS}"
+echo "====================================="
+
+python3 -m verl.trainer.main_ppo_sync \
+ --config-name=swe_agent_blackbox \
+ --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ data.train_files="['${TRAIN_DATA}']" \
+ data.val_files="['${VAL_DATA}']" \
+ data.train_batch_size=${TRAIN_BATCH_SIZE} \
+ data.max_prompt_length=${PROMPT_LENGTH} \
+ data.max_response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.name=${ENGINE} \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
+ actor_rollout_ref.rollout.n=${N} \
+ actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
+ actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
+ actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH + 1024)) \
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
+ actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
+ actor_rollout_ref.rollout.nnodes=${NNODES} \
+ actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
+ trainer.nnodes=${NNODES} \
+ trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
+ trainer.total_epochs=${TOTAL_EPOCHS} \
+ trainer.save_freq=${SAVE_FREQ} \
+ trainer.test_freq=${TEST_FREQ} \
+ trainer.project_name=${PROJECT_NAME} \
+ trainer.experiment_name=${EXPERIMENT_NAME} \
+ actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
+ actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
+ "${RUNNER_ARGS[@]}" \
+ "$@"
diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh
new file mode 100755
index 00000000..db3a8264
--- /dev/null
+++ b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh
@@ -0,0 +1,199 @@
+#!/usr/bin/env bash
+# Megatron + TQ fully-async training for the blackbox SWE-agent recipe.
+#
+# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend.
+# Data flows through TransferQueue (zero-copy) with ReplayBuffer flow control.
+#
+# Usage:
+#   bash examples/blackbox_recipes/scripts/run_train_megatron_async.sh
+#
+# All configurable via environment variables (see defaults below).
+
+set -euo pipefail
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}"
+TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}"
+VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}"
+RUNTIME_ENV="${RUNTIME_ENV:-}"
+
+# ── Hardware ─────────────────────────────────────────────────────────────
+NNODES_TRAIN="${NNODES_TRAIN:-1}"
+NNODES_ROLLOUT="${NNODES_ROLLOUT:-1}"
+NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
+
+# ── Algorithm ────────────────────────────────────────────────────────────
+CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}"
+CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}"
+ACTOR_LR="${ACTOR_LR:-1e-6}"
+
+# ── Sequence lengths ─────────────────────────────────────────────────────
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
+MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH))
+
+# ── Rollout parameters ───────────────────────────────────────────────────
+ENGINE="${ENGINE:-vllm}"
+GEN_TP="${GEN_TP:-2}"
+N="${N:-8}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
+
+# ── Megatron training parallelism ────────────────────────────────────────
+TRAIN_TP="${TRAIN_TP:-8}"
+TRAIN_PP="${TRAIN_PP:-1}"
+TRAIN_CP="${TRAIN_CP:-1}"
+OFFLOAD="${OFFLOAD:-True}"
+OPTIMIZER_OFFLOAD_FRACTION="${OPTIMIZER_OFFLOAD_FRACTION:-${OFFLOAD_FRACTION:-1.0}}"  # own name first; legacy OFFLOAD_FRACTION still honored
+USE_MBRIDGE="${USE_MBRIDGE:-True}"
+PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+RUNNER="${RUNNER:-mini_swe}"
+MAX_TURNS="${MAX_TURNS:-100}"
+AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
+COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
+if [[ "${RUNNER}" == "claude_code" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
+elif [[ "${RUNNER}" == "mini_swe" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
+elif [[ "${RUNNER}" == "uniagent" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE=""
+else
+ echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
+ exit 1
+fi
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+CONDA_ENV="${CONDA_ENV:-testbed}"
+RUNNER_ARGS=(
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
+)
+if [[ "${RUNNER}" != "uniagent" ]]; then
+ RUNNER_ARGS+=(
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.conda_env=${CONDA_ENV}"
+ )
+fi
+
+# ── OpenYuanRong (YR remote sandbox) ─────────────────────────────────────
+OPENYUANRONG_SERVER_ADDRESS="${OPENYUANRONG_SERVER_ADDRESS:-}"
+OPENYUANRONG_TOKEN="${OPENYUANRONG_TOKEN:-}"
+OPENYUANRONG_TUNNEL_SSL_VERIFY="${OPENYUANRONG_TUNNEL_SSL_VERIFY:-0}"
+
+# ── Async training ───────────────────────────────────────────────────────
+TOTAL_ROLLOUT_STEPS="${TOTAL_ROLLOUT_STEPS:-100000}"
+STALENESS_THRESHOLD="${STALENESS_THRESHOLD:-1.0}"
+TRIGGER_SYNC_STEP="${TRIGGER_SYNC_STEP:-4}"
+PARTIAL_ROLLOUT="${PARTIAL_ROLLOUT:-True}"
+
+# ── Logging & checkpointing ──────────────────────────────────────────────
+PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
+EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
+SAVE_FREQ="${SAVE_FREQ:-10}"
+TEST_FREQ="${TEST_FREQ:-10}"
+CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}"
+
+export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+export OPENYUANRONG_SERVER_ADDRESS
+export OPENYUANRONG_TOKEN
+export OPENYUANRONG_TUNNEL_SSL_VERIFY
+
+echo "=== SWE-Agent Blackbox Megatron Async Training ==="
+echo "Model: ${MODEL_PATH}"
+echo "Train data: ${TRAIN_DATA}"
+echo "Val data: ${VAL_DATA}"
+echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})"
+echo "Runner: ${RUNNER}"
+echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}"
+echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}"
+echo "Nodes: train=${NNODES_TRAIN}, rollout=${NNODES_ROLLOUT}"
+echo "==================================================="
+
+# ── Compute derived parameters ───────────────────────────────────────────
+ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
+INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
+
+RUNTIME_ENV_ARGS=()
+if [ -n "${RUNTIME_ENV}" ]; then
+ RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}")
+fi
+
+# ── Ensure Ray is running ────────────────────────────────────────────────
+TOTAL_GPUS=$(( (NNODES_TRAIN + NNODES_ROLLOUT) * NGPUS_PER_NODE ))
+if ! ray status &>/dev/null; then
+ echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..."
+ ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats
+else
+ echo "Ray cluster already running."
+fi
+
+# ── Launch ────────────────────────────────────────────────────────────────
+WORKING_DIR="${WORKING_DIR:-$(pwd)}"
+
+ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" \
+ -- python3 -m verl.experimental.fully_async_policy.fully_async_main \
+ --config-name=swe_agent_blackbox_megatron_async \
+ --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
+ hydra.searchpath=[pkg://verl.trainer.config] \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ data.train_files="['${TRAIN_DATA}']" \
+ data.val_files="['${VAL_DATA}']" \
+ data.max_prompt_length=${PROMPT_LENGTH} \
+ data.max_response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.n=${N} \
+ actor_rollout_ref.rollout.name=${ENGINE} \
+ actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
+ actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \
+ actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \
+ actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
+ actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
+ actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
+ "${RUNNER_ARGS[@]}" \
+ actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \
+ actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \
+ actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
+ actor_rollout_ref.actor.optim.lr_decay_steps=${TOTAL_ROLLOUT_STEPS} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
+ actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \
+ actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
+ actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
+ actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
+ trainer.project_name="${PROJECT_NAME}" \
+ trainer.experiment_name="${EXPERIMENT_NAME}" \
+ trainer.save_freq=${SAVE_FREQ} \
+ trainer.test_freq=${TEST_FREQ} \
+ trainer.default_local_dir="${CKPTS_DIR}" \
+ trainer.nnodes=${NNODES_TRAIN} \
+ trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
+ rollout.nnodes=${NNODES_ROLLOUT} \
+ rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
+ rollout.total_rollout_steps=${TOTAL_ROLLOUT_STEPS} \
+ async_training.staleness_threshold=${STALENESS_THRESHOLD} \
+ async_training.trigger_parameter_sync_step=${TRIGGER_SYNC_STEP} \
+ async_training.partial_rollout=${PARTIAL_ROLLOUT} \
+ "$@"
diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
new file mode 100755
index 00000000..1a0c19d3
--- /dev/null
+++ b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# Megatron sync training for the blackbox SWE-agent recipe.
+#
+# Uses main_ppo_sync + Megatron backend with the same blackbox agent infrastructure
+# (AgentFrameworkRolloutAdapter, subprocess_runner, SWEAgentFramework).
+#
+# Usage:
+# bash examples/swe_agent_blackbox/scripts/run_train_megatron_sync.sh
+#
+# All configurable via environment variables (see defaults below).
+
+set -euo pipefail
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}"
+TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_rebench_filtered.parquet}"
+VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
+
+# ── Hardware ─────────────────────────────────────────────────────────────
+NNODES="${NNODES:-1}"
+NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
+
+# ── Training parameters ─────────────────────────────────────────────────
+TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}"
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
+ACTOR_LR="${ACTOR_LR:-1e-6}"
+TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}"
+SAVE_FREQ="${SAVE_FREQ:-10}"
+TEST_FREQ="${TEST_FREQ:-10}"
+PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}"
+
+# ── Rollout parameters ──────────────────────────────────────────────────
+ENGINE="${ENGINE:-vllm}"
+TP="${TP:-4}"
+ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
+N="${N:-8}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+
+# ── Megatron parallelism ────────────────────────────────────────────────
+TRAIN_TP="${TRAIN_TP:-8}"
+TRAIN_PP="${TRAIN_PP:-1}"
+TRAIN_CP="${TRAIN_CP:-1}"
+OFFLOAD="${OFFLOAD:-true}"
+USE_MBRIDGE="${USE_MBRIDGE:-true}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+RUNNER="${RUNNER:-mini_swe}"
+MAX_TURNS="${MAX_TURNS:-100}"
+AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
+COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
+if [[ "${RUNNER}" == "claude_code" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
+elif [[ "${RUNNER}" == "mini_swe" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
+elif [[ "${RUNNER}" == "uniagent" ]]; then
+ AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE=""
+else
+ echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
+ exit 1
+fi
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+RUNNER_ARGS=(
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
+)
+if [[ "${RUNNER}" != "uniagent" ]]; then
+ RUNNER_ARGS+=(
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
+ "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
+ )
+fi
+
+# ── Logging ──────────────────────────────────────────────────────────────
+PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
+EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
+VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
+
+export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+export VERL_LOGGING_LEVEL
+
+# ── Environment for NCCL ────────────────────────────────────────────────
+export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}"
+export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}"
+
+echo "=== SWE-Agent Blackbox Megatron Sync Training ==="
+echo "Model: ${MODEL_PATH}"
+echo "Train data: ${TRAIN_DATA}"
+echo "Val data: ${VAL_DATA}"
+echo "Engine: ${ENGINE} (gen_tp=${TP}, train_tp=${TRAIN_TP})"
+echo "Runner: ${RUNNER}"
+echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}"
+echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}"
+echo "==============================================="
+
+python3 -m verl.trainer.main_ppo_sync \
+ --config-name=swe_agent_blackbox_megatron_sync \
+ --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
+ hydra.searchpath=[pkg://verl.trainer.config] \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ data.train_files="['${TRAIN_DATA}']" \
+ data.val_files="['${VAL_DATA}']" \
+ data.train_batch_size=${TRAIN_BATCH_SIZE} \
+ data.max_prompt_length=${PROMPT_LENGTH} \
+ data.max_response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.name=${ENGINE} \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
+ actor_rollout_ref.rollout.n=${N} \
+ actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
+ actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
+ actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH)) \
+ actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
+ actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
+ actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \
+ actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
+ actor_rollout_ref.rollout.nnodes=${NNODES} \
+ actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
+ trainer.nnodes=${NNODES} \
+ trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
+ trainer.total_epochs=${TOTAL_EPOCHS} \
+ trainer.save_freq=${SAVE_FREQ} \
+ trainer.test_freq=${TEST_FREQ} \
+ trainer.project_name=${PROJECT_NAME} \
+ trainer.experiment_name=${EXPERIMENT_NAME} \
+ actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
+ actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
+ "${RUNNER_ARGS[@]}" \
+ "$@"
From a28540a4452be30d2d5284b2c71c7c0b3f3354e9 Mon Sep 17 00:00:00 2001
From: sheng
Date: Fri, 26 Jun 2026 14:18:43 +0800
Subject: [PATCH 2/3] feat: add openyuanrong sandbox
---
examples/blackbox_recipes/sandbox/sandbox.py | 180 ++++++++++++++++++-
1 file changed, 179 insertions(+), 1 deletion(-)
diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox/sandbox.py
index fb21ac94..e6e46a8d 100644
--- a/examples/blackbox_recipes/sandbox/sandbox.py
+++ b/examples/blackbox_recipes/sandbox/sandbox.py
@@ -5,6 +5,184 @@
inside the sandbox can reach the gateway via ``http://127.0.0.1:``.
"""
-#TODO
+from __future__ import annotations
+import asyncio
+import logging
+import os
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import urlparse
+
+@dataclass
+class CommandResult:
+ """Result of a command executed inside a sandbox."""
+
+ stdout: str
+ stderr: str
+ exit_code: int
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_PROXY_PORT = 38197
+
+
+def _configure_akernel_env() -> None:
+ """Map OPENYUANRONG_* env vars to AKERNEL_* before importing akernel_sdk."""
+ server = os.getenv("OPENYUANRONG_SERVER_ADDRESS")
+ token = os.getenv("OPENYUANRONG_TOKEN")
+ tunnel_ssl_verify = os.getenv("OPENYUANRONG_TUNNEL_SSL_VERIFY", "0")
+ if not server or not token:
+ raise ValueError(
+ "OPENYUANRONG_SERVER_ADDRESS and OPENYUANRONG_TOKEN "
+ "environment variables must be set for YR sandbox"
+ )
+ os.environ["AKERNEL_SERVER_ADDRESS"] = server
+ os.environ["AKERNEL_TOKEN"] = token
+ os.environ["TUNNEL_SSL_VERIFY"] = tunnel_ssl_verify
+
+
+def extract_upstream(gateway_url: str) -> str:
+ """Extract host:port from a gateway URL for upstream tunnel config.
+
+ Example: "http://8.92.9.155:40169/sessions/abc/v1" -> "8.92.9.155:40169"
+ """
+ parsed = urlparse(gateway_url)
+ return f"{parsed.hostname}:{parsed.port}"
+
+
+def rewrite_gateway_url(
+ gateway_url: str,
+ proxy_port: int = DEFAULT_PROXY_PORT,
+ *,
+ strip_v1: bool = False,
+) -> str:
+ """Rewrite gateway URL to use the sandbox-internal tunnel.
+
+ Replaces host:port with 127.0.0.1:, keeps path intact.
+
+ Example:
+ "http://8.92.9.155:40169/sessions/abc/v1"
+ -> "http://127.0.0.1:8766/sessions/abc/v1"
+ """
+ parsed = urlparse(gateway_url)
+ path = parsed.path.removesuffix("/v1") if strip_v1 else parsed.path
+ return f"http://127.0.0.1:{proxy_port}{path}"
+
+
+class YRSandbox:
+ """Command execution via OpenYuanRong (AKernel) remote sandbox."""
+
+ def __init__(self, sandbox: Any) -> None:
+ self._sandbox = sandbox
+
+ @property
+ def sandbox_id(self) -> str:
+ return getattr(self._sandbox, "sandbox_id", "unknown")
+
+
+ @classmethod
+ async def create(
+ cls,
+ *,
+ image: str,
+ sidecar_image: str,
+ upstream: str = "",
+ proxy_port: int = DEFAULT_PROXY_PORT,
+ env: dict[str, str] | None = None,
+ cpu: int = 1000,
+ memory: int = 2048,
+ cpu_limit: int = 4000,
+ mem_limit: int = 8192,
+ idle_timeout: int = 7200,
+ sidecar_target: str = "/opt/mini-swe-agent",
+ max_retries: int = 5,
+ **sandbox_kwargs: Any,
+ ) -> "YRSandbox":
+ """Create an OpenYuanRong sandbox with sidecar tool mounted.
+
+ The sidecar image is mounted at ``sidecar_target`` inside the
+ sandbox via ``akernel_sdk.Mount``.
+
+ If ``upstream`` is provided, a tunnel is set up so the sandbox can
+ reach the local gateway via ``http://127.0.0.1:``.
+ """
+ _configure_akernel_env()
+ from akernel_sdk import Mount, Sandbox
+
+ sb_kwargs: dict[str, Any] = {
+ "image": image,
+ "cpu": cpu,
+ "memory": memory,
+ "cpu_limit": cpu_limit,
+ "mem_limit": mem_limit,
+ "idle_timeout": idle_timeout,
+ "mounts": [
+ Mount(target=sidecar_target, image_url=sidecar_image),
+ ],
+ }
+ if upstream:
+ sb_kwargs["upstream"] = upstream
+ sb_kwargs["proxy_port"] = proxy_port
+ if env:
+ sb_kwargs["env"] = env
+ sb_kwargs.update(sandbox_kwargs)
+
+ logger.info(
+ "Creating YR sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s)",
+ image, cpu, memory, sidecar_image, sidecar_target, upstream or "none",
+ )
+ last_error: Exception | None = None
+ for retry in range(max_retries):
+ sandbox = None
+ try:
+ sandbox = await asyncio.to_thread(lambda: Sandbox(**sb_kwargs))
+ logger.info("YR sandbox created: %s", getattr(sandbox, "sandbox_id", "?"))
+ return cls(sandbox=sandbox)
+ except Exception as exc:
+ last_error = exc
+ sandbox_id = getattr(sandbox, "sandbox_id", None)
+ logger.critical(
+ "Failed to create YR sandbox (sandbox_id=%s): %s",
+ sandbox_id or "n/a", exc,
+ )
+ if sandbox is not None:
+ try:
+ await asyncio.to_thread(sandbox.kill)
+ except Exception:
+ pass
+ if retry < max_retries - 1:
+ sleep_time = min(30, 2 ** retry)
+ logger.info("Retrying YR sandbox creation in %d seconds...", sleep_time)
+ await asyncio.sleep(sleep_time)
+
+ raise RuntimeError(f"Failed to create YR sandbox after {max_retries} retries") from last_error
+
+ async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult:
+ """Execute *cmd* inside the OpenYuanRong sandbox via ``sandbox.commands.run``."""
+ try:
+ result = await asyncio.to_thread(
+ self._sandbox.commands.run, cmd, timeout=timeout,
+ )
+ return CommandResult(
+ stdout=getattr(result, "stdout", ""),
+ stderr=getattr(result, "stderr", ""),
+ exit_code=getattr(result, "exit_code", -1),
+ )
+ except Exception as e:
+ return CommandResult(stdout="", stderr=str(e), exit_code=-1)
+
+ async def cleanup(self) -> None:
+ """Kill the OpenYuanRong sandbox if still running."""
+ if self._sandbox is not None:
+ sandbox_id = getattr(self._sandbox, "sandbox_id", "?")
+ try:
+ if self._sandbox.is_running():
+ await asyncio.to_thread(self._sandbox.kill)
+ logger.info("YR sandbox %s killed", sandbox_id)
+ else:
+ logger.info("YR sandbox %s already stopped", sandbox_id)
+ except Exception as e:
+ logger.warning("Failed to kill YR sandbox %s: %s", sandbox_id, e)
+ self._sandbox = None
From 6de203b8a6a1fe1b07137c6dce8c81e73508267e Mon Sep 17 00:00:00 2001
From: zhaizhiqiang <584508161@qq.com>
Date: Mon, 29 Jun 2026 03:30:36 +0000
Subject: [PATCH 3/3] refactor: make mini_swe_agent recipe self-contained, drop claude_code and shared scripts (9235bf603757c85c41deef22fe93490a1b5f0921)
---
.../claude_code/Dockerfile.claude-code-tool | 21 -
.../claude_code/claude_code_runner.py | 232 ---------
.../claude_code/config/claude_code.yaml | 1 -
.../blackbox_recipes/mini_swe_agent/README.md | 248 ++-------
.../mini_swe_agent/build_tool.sh | 56 ++
.../mini_swe_agent/config/agent_config.yaml | 36 --
.../config/agent_config_openyuanrong.yaml | 37 --
.../mini_swe_agent/config/parallel_infer.yaml | 31 --
.../config/swe_agent_blackbox.yaml | 123 -----
.../swe_agent_blackbox_megatron_sync.yaml | 129 -----
...ml => swe_agent_blackbox_megatron_v1.yaml} | 70 +--
.../mini_swe_agent/dataset.py | 1 -
.../mini_swe_agent/framework.py | 105 ----
.../mini_swe_agent/mini_swe_agent_runner.py | 69 ++-
.../mini_swe_agent/parallel_infer.py | 486 ++++++++----------
.../blackbox_recipes/mini_swe_agent/reward.py | 2 +-
.../mini_swe_agent/run_agent.py | 34 +-
.../mini_swe_agent/run_infer.sh | 80 +++
.../mini_swe_agent/run_train.sh | 300 +++++++++++
.../mini_swe_agent/subprocess_runner.py | 61 ---
.../{sandbox/sandbox.py => sandbox_client.py} | 85 +--
.../blackbox_recipes/scripts/build_tool.sh | 75 ---
.../blackbox_recipes/scripts/run_infer.sh | 66 ---
.../blackbox_recipes/scripts/run_train.sh | 122 -----
.../scripts/run_train_megatron_async.sh | 199 -------
.../scripts/run_train_megatron_sync.sh | 138 -----
.../r2e_gym_subset_filtered.py | 7 +
.../data_preprocess/swe_bench_verified.py | 9 +
examples/data_preprocess/swe_rebench.py | 8 +
uni_agent/gateway/session/codec.py | 4 +-
uni_agent/interaction/model.py | 4 +-
verl | 2 +-
32 files changed, 881 insertions(+), 1960 deletions(-)
delete mode 100644 examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
delete mode 100644 examples/blackbox_recipes/claude_code/claude_code_runner.py
delete mode 100644 examples/blackbox_recipes/claude_code/config/claude_code.yaml
create mode 100755 examples/blackbox_recipes/mini_swe_agent/build_tool.sh
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
rename examples/blackbox_recipes/mini_swe_agent/config/{swe_agent_blackbox_megatron_async.yaml => swe_agent_blackbox_megatron_v1.yaml} (62%)
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/framework.py
create mode 100755 examples/blackbox_recipes/mini_swe_agent/run_infer.sh
create mode 100755 examples/blackbox_recipes/mini_swe_agent/run_train.sh
delete mode 100644 examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
rename examples/blackbox_recipes/{sandbox/sandbox.py => sandbox_client.py} (66%)
delete mode 100755 examples/blackbox_recipes/scripts/build_tool.sh
delete mode 100755 examples/blackbox_recipes/scripts/run_infer.sh
delete mode 100755 examples/blackbox_recipes/scripts/run_train.sh
delete mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_async.sh
delete mode 100755 examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
diff --git a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool b/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
deleted file mode 100644
index 3d12af4c..00000000
--- a/examples/blackbox_recipes/claude_code/Dockerfile.claude-code-tool
+++ /dev/null
@@ -1,21 +0,0 @@
-# Claude Code sidecar tool image.
-#
-# Mounted at /opt/claude-code inside the SWE-bench sandbox.
-
-FROM node:20-bookworm-slim AS builder
-
-ARG TOOL_VERSION="latest"
-ARG NPM_REGISTRY=""
-
-ENV DISABLE_AUTOUPDATER=1 \
- IS_SANDBOX=1 \
- npm_config_audit=false \
- npm_config_fund=false \
- npm_config_update_notifier=false
-
-RUN if [ -n "${NPM_REGISTRY}" ]; then npm config set registry "${NPM_REGISTRY}"; fi \
- && npm install -g --prefix /opt/claude-code "@anthropic-ai/claude-code@${TOOL_VERSION}" \
- && /opt/claude-code/bin/claude --version
-
-FROM scratch
-COPY --from=builder /opt/claude-code /
diff --git a/examples/blackbox_recipes/claude_code/claude_code_runner.py b/examples/blackbox_recipes/claude_code/claude_code_runner.py
deleted file mode 100644
index bee41aaf..00000000
--- a/examples/blackbox_recipes/claude_code/claude_code_runner.py
+++ /dev/null
@@ -1,232 +0,0 @@
-"""Claude Code runner for the blackbox SWE-agent recipe."""
-
-from __future__ import annotations
-
-import json
-import logging
-import os
-import shlex
-import time
-
-from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_TOOL_IMAGE = "claude-code-tool:latest"
-TOOL_TARGET = "/opt/claude-code"
-
-
-def extract_task(raw_prompt) -> str:
- if isinstance(raw_prompt, str):
- return raw_prompt
- return next(
- (m["content"] for m in raw_prompt if isinstance(m, dict) and m.get("role") == "user"),
- str(raw_prompt),
- )
-
-
-def _extract_issue_text(task: str) -> str:
- start = task.find("")
- end = task.find("")
- if start >= 0 and end > start:
- return task[start + len(""):end].strip()
- marker = "\nFollow these steps to resolve the issue:"
- if marker in task:
- return task.split(marker, 1)[0].strip()
- return task.strip()
-
-
-def _decode_metadata_list(value) -> list[str]:
- if not value:
- return []
- if isinstance(value, list):
- return [str(item) for item in value]
- if isinstance(value, str):
- try:
- parsed = json.loads(value)
- except json.JSONDecodeError:
- return [value]
- if isinstance(parsed, list):
- return [str(item) for item in parsed]
- return [str(value)]
-
-
-def build_claude_task(raw_prompt, tools_kwargs: dict | None = None) -> str:
- tools_kwargs = tools_kwargs or {}
- task = extract_task(raw_prompt)
- metadata = ((tools_kwargs.get("reward") or {}).get("metadata") or {})
- issue = metadata.get("problem_statement") or _extract_issue_text(task)
- tests = _decode_metadata_list(metadata.get("FAIL_TO_PASS"))
- if not tests:
- tests = _decode_metadata_list(metadata.get("PASS_TO_PASS"))[:3]
- tests_block = "\n".join(f"- {test}" for test in tests) if tests else "- Run the closest relevant tests you identify."
-
- return (
- "You are fixing a SWE-bench task in /testbed.\n\n"
- "Issue:\n"
- f"{issue}\n\n"
- "Rules:\n"
- "- Edit source files only. Do not modify tests.\n"
- "- The development environment is already installed; do not install packages unless a test command proves it is necessary.\n"
- "- There is no submit tool in this environment. Do not try to submit.\n"
- "- Do not create extra edge-case test files after the relevant tests pass.\n"
- "- Do not run `pytest --collect-only`, `git log`, or any other command that does not directly validate the fix.\n"
- "- Do not analyze unrelated `is_separable` behavior.\n"
- "- Do not run additional ad-hoc verification after the listed relevant pytest command passes.\n"
- "- Do not commit.\n"
- "- After the minimal fix is applied and a relevant pytest command passes, print a one-line summary and exit immediately.\n\n"
- "Relevant tests to run after the fix:\n"
- f"{tests_block}\n"
- )
-
-
-def build_claude_command(
- *,
- task: str,
- base_url: str,
- max_turns: int,
- model: str = "default",
- permission_mode: str = "bypassPermissions",
- conda_env: str | None = "testbed",
- disable_web_tools: bool = True,
- disable_slash_commands: bool = True,
-) -> str:
- env = {
- "ANTHROPIC_BASE_URL": base_url,
- "ANTHROPIC_API_KEY": "not-needed",
- "ANTHROPIC_MODEL": model,
- "ANTHROPIC_DEFAULT_HAIKU_MODEL": model,
- "ANTHROPIC_DEFAULT_SONNET_MODEL": model,
- "ANTHROPIC_DEFAULT_OPUS_MODEL": model,
- "ANTHROPIC_SMALL_FAST_MODEL": model,
- "CLAUDE_CODE_DISABLE_BACKGROUND_TASKS": "1",
- "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
- "CLAUDE_CODE_FORK_SUBAGENT": "0",
- "CLAUDE_CODE_SUBAGENT_MODEL": model,
- "DISABLE_AUTOUPDATER": "1",
- "IS_SANDBOX": "1",
- }
- env_assignments = [f"{key}={shlex.quote(value)}" for key, value in env.items()]
- if conda_env:
- conda_prefix = f"/opt/miniconda3/envs/{conda_env}"
- env_assignments.extend(
- [
- f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)}",
- f"CONDA_PREFIX={shlex.quote(conda_prefix)}",
- f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH",
- ]
- )
- env_prefix = " ".join(env_assignments)
- argv = [
- "/opt/claude-code/bin/claude",
- "-p",
- task,
- "--model",
- model,
- "--max-turns",
- str(max_turns),
- "--permission-mode",
- permission_mode,
- ]
- if disable_slash_commands:
- argv.append("--disable-slash-commands")
- if disable_web_tools:
- argv.extend(["--disallowedTools", "Agent", "Task", "WebFetch", "WebSearch"])
- return (
- "unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; "
- "cd /testbed; "
- f"{env_prefix} "
- + shlex.join(argv)
- )
-
-
-async def _create_claude_sandbox(
- *,
- image: str,
- sidecar_image: str,
- gateway_url: str,
-):
- from examples.swe_agent_blackbox.sandbox import YRSandbox, extract_upstream
-
- upstream = extract_upstream(gateway_url) if gateway_url else ""
- return await YRSandbox.create(
- image=image,
- sidecar_image=sidecar_image,
- sidecar_target=TOOL_TARGET,
- upstream=upstream,
- )
-
-
-async def claude_code_runner(
- *,
- raw_prompt,
- session: SessionHandle,
- sample_index: int,
- session_runtime: SessionRuntime,
- tools_kwargs: dict | None = None,
- tool_image: str = DEFAULT_TOOL_IMAGE,
- run_timeout: int = 7200,
- **kwargs,
-) -> None:
- from examples.swe_agent_blackbox.dataset import extract_image
- from examples.swe_agent_blackbox.mini_swe_agent_runner import SandboxEnvForReward
- from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env
-
- tools_kwargs = tools_kwargs or {}
- task = build_claude_task(raw_prompt, tools_kwargs)
- env_config = tools_kwargs.get("env", {})
- image = extract_image(env_config)
- if not image:
- raise ValueError(f"No Docker image found in tools_kwargs.env for sample {sample_index}")
-
- gateway_url = session.base_url
- if not gateway_url:
- raise ValueError(f"gateway_url is empty for sample {sample_index}")
-
- sandbox = await _create_claude_sandbox(
- image=image,
- sidecar_image=tool_image,
- gateway_url=gateway_url,
- )
-
- try:
- post_setup_cmd = env_config.get("post_setup_cmd", "")
- if post_setup_cmd:
- setup_result = await sandbox.run(post_setup_cmd, timeout=120)
- if setup_result.exit_code != 0:
- logger.warning("post_setup_cmd failed rc=%s: %.300s", setup_result.exit_code, setup_result.stdout + setup_result.stderr)
-
- from examples.swe_agent_blackbox.sandbox import rewrite_gateway_url
-
- claude_base_url = rewrite_gateway_url(gateway_url, strip_v1=True)
- max_turns = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100"))
- agent_cmd = build_claude_command(
- task=task,
- base_url=claude_base_url,
- max_turns=max_turns,
- )
-
- started_at = time.perf_counter()
- result = await sandbox.run(agent_cmd, timeout=int(run_timeout))
- elapsed = time.perf_counter() - started_at
- logger.info("[sample %d] claude-code finished rc=%s elapsed=%.1fs", sample_index, result.exit_code, elapsed)
- if result.exit_code != 0:
- logger.warning(
- "[sample %d] claude-code failed stdout_tail=%r stderr_tail=%r",
- sample_index,
- (result.stdout or "")[-4000:],
- (result.stderr or "")[-4000:],
- )
-
- metadata, eval_timeout = build_reward_context(tools_kwargs)
- score, eval_result = await evaluate_in_env(SandboxEnvForReward(sandbox), metadata, eval_timeout)
- logger.info("[sample %d] reward done score=%s resolved=%s", sample_index, score, eval_result.get("resolved"))
-
- reward_info = {
- "reward_score": score,
- "claude_code_exit_code": result.exit_code,
- **eval_result,
- }
- await session_runtime.complete_session(session.session_id, reward_info=reward_info)
- finally:
- await sandbox.cleanup()
diff --git a/examples/blackbox_recipes/claude_code/config/claude_code.yaml b/examples/blackbox_recipes/claude_code/config/claude_code.yaml
deleted file mode 100644
index 503fa1da..00000000
--- a/examples/blackbox_recipes/claude_code/config/claude_code.yaml
+++ /dev/null
@@ -1 +0,0 @@
-#TODO
\ No newline at end of file
diff --git a/examples/blackbox_recipes/mini_swe_agent/README.md b/examples/blackbox_recipes/mini_swe_agent/README.md
index b32a637a..436d3c65 100644
--- a/examples/blackbox_recipes/mini_swe_agent/README.md
+++ b/examples/blackbox_recipes/mini_swe_agent/README.md
@@ -2,268 +2,118 @@
## Overview
-`mini_swe` and `claude_code` both run inside the SWE-bench sandbox through a
-sidecar tool image. The external runner creates the sandbox, mounts the selected
-tool image, starts the agent process, and evaluates the reward in the same
-sandbox.
-
-For `mini_swe`, the agent executes commands through `LocalEnvironment` (local
-bash) inside the sandbox and calls the LLM through the gateway URL passed in via
-stdin. For `claude_code`, the runner starts the Claude Code CLI from the sidecar
-image and points it at the same Anthropic-compatible gateway.
-
-The `mini_swe` tool image uses
+`mini-swe-agent` runs inside the SWE-bench sandbox through a sidecar tool image.
+The external runner creates the sandbox, mounts the tool image at
+`/opt/mini-swe-agent`, starts the agent process, and evaluates the reward in the
+same sandbox.
+
+The agent executes commands through `LocalEnvironment` (local bash) inside the
+sandbox and calls the LLM through the gateway URL passed in via stdin. The
+`mini_swe` tool image uses
[python-build-standalone](https://github.com/astral-sh/python-build-standalone)
-to build an isolated Python environment. The Claude Code tool image uses a Node
-builder to install the Claude Code npm package. Both images use a minimal
+to build an isolated Python environment, then copies the result into a minimal
`FROM scratch` final stage, so the sandbox base image does not need to provide
-Python, Node, or npm for the sidecar tool runtime.
+Python for the sidecar tool runtime.
+
+**This recipe is self-contained.** It shares only
+[`../sandbox_client.py`](../sandbox_client.py) with the claude-code recipe;
+everything else (`dataset.py`, `reward.py`, `run_agent.py`, `build_tool.sh`,
+`run_train.sh`, config) lives in this directory and does not depend on
+`claude_code/`.
**Supported runners:**
| runner | Description |
|--------|-------------|
-| `uniagent` | Original SWE-agent runner |
| `mini_swe` | mini-swe-agent sidecar runner |
-| `claude_code` | Claude Code sidecar runner; reward is returned through `complete_session(reward_info)` without writing a separate reward JSON file |
**Supported sandbox types:**
| Type | Description |
|------|-------------|
-| OpenYuanRong (`"openyuanrong"`) | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` |
-
-At runtime, the selected runner depends directly on its tool image. The tool
-image does not need to be extracted into a host directory ahead of time.
+| openyuanrong | Uses `akernel_sdk.Mount` and `sandbox.commands.run()` |
## Architecture
```text
-[Rollouter Host: mini_swe_agent_runner / claude_code_runner]
+[Rollouter Host: mini_swe_agent_runner]
|
- |-- _create_sandbox(image, sidecar_image)
- | `-- openyuanrong: Sandbox(mounts=[Mount(target="/opt/", ...)])
+ |-- SandboxClient.create(image, sidecar_image, sidecar_target="/opt/mini-swe-agent")
+ | `-- akernel: Sandbox(mounts=[Mount(target="/opt/mini-swe-agent", ...)])
|
|-- sandbox.run("")
| `-- [Inside Sandbox]
- | /opt/mini-swe-agent/bin/python3.12 or /opt/claude-code/bin/claude
+ | /opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py
| stdin <- task config JSON (task, gateway_url, agent)
| commands run inside the SWE-bench sandbox
- | stdout -> runner-specific execution result
+ | stdout -> agent execution result JSON
|
|-- parse agent result
|-- SandboxEnvForReward(sandbox) -> evaluate_in_env()
- `-- session_runtime.complete_session(reward_info)
+ `-- POST session.reward_info_url
```
## Prerequisites
-1. **OpenYuanRong** - set `OPENYUANRONG_SERVER_ADDRESS` and `OPENYUANRONG_TOKEN`.
-2. **Runner tool image** - build the selected tool image and push it to a remote
+1. **AKernel** — set `AKERNEL_SERVER_ADDRESS` and `AKERNEL_TOKEN`.
+2. **Tool image** — build the mini-swe-agent tool image and push it to a remote
registry if the sandbox service cannot access local Docker images.
## 1. Build Tool Image
-`mini_swe` and `claude_code` are both injected into the SWE-bench sandbox as
-sidecar tool images, but they differ in image contents, mount paths, and
-accelerator/mirror options. Use `build_tool.sh` for both runners, and select the
-target runner with `--tool` or `TOOL_KIND`.
+`mini_swe` is injected into the SWE-bench sandbox as a sidecar tool image. Use
+`build_tool.sh` to build it.
-| runner | Default tool image | Dockerfile | Sandbox mount path | Image contents | Mirror option |
-|--------|--------------------|------------|--------------------|----------------|---------------|
-| `mini_swe` | `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` | `--pip-index` / `PIP_INDEX_URL` |
-| `claude_code` | `claude-code-tool:latest` | `Dockerfile.claude-code-tool` | `/opt/claude-code` | Claude Code npm package installed by a Node 20 builder | `--npm-registry` / `NPM_REGISTRY` |
-
-### mini_swe Tool Image
-
-`mini_swe` is the default build target:
+| Default tool image | Dockerfile | Sandbox mount path | Image contents |
+|--------------------|------------|--------------------|----------------|
+| `mini-swe-agent-tool:latest` | `Dockerfile.mini-swe-agent-tool` | `/opt/mini-swe-agent` | Standalone Python 3.12, `mini-swe-agent`, `litellm`, and `run_agent.py` |
```bash
# Use the default PyPI source.
-bash examples/swe_agent_blackbox/build_tool.sh
+bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh
# Use a custom PyPI mirror.
-bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
+bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
# Build and push to a remote registry.
-bash examples/swe_agent_blackbox/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
-```
-
-The `mini_swe` image uses `python-build-standalone` to build an isolated Python
-runtime. The final `FROM scratch` image contains only the files needed under
-`/opt/mini-swe-agent`, and it does not depend on the Python version installed in
-the sandbox base image.
-
-After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`:
-
-```bash
-SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
-RUNNER=mini_swe \
-bash examples/swe_agent_blackbox/scripts/run_infer.sh
-```
-
-### Claude Code Tool Image
-
-Claude Code must be selected explicitly with `--tool claude_code`:
-
-```bash
-# Use the default npm registry.
-bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code
-
-# Use a custom npm registry.
-bash examples/swe_agent_blackbox/build_tool.sh \
- --tool claude_code \
- --npm-registry https://registry.npmmirror.com
-
-# Select the Claude Code npm package version.
-bash examples/swe_agent_blackbox/build_tool.sh \
- --tool claude_code \
- --tool-version latest
-
-# Build and push the Claude Code sidecar image.
-bash examples/swe_agent_blackbox/build_tool.sh \
- --tool claude_code \
- --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
-```
-
-The Claude Code image uses `node:20-bookworm-slim` as the builder stage and
-installs `@anthropic-ai/claude-code` into `/opt/claude-code`. The final image is
-also a `FROM scratch` sidecar image. At runtime, the runner mounts it into the
-sandbox at `/opt/claude-code` and invokes `/opt/claude-code/bin/claude`.
-
-After pushing the image, point runtime inference at it with `SWE_AGENT_TOOL_IMAGE`:
-
-```bash
-SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \
-RUNNER=claude_code \
-bash examples/swe_agent_blackbox/scripts/run_infer.sh
-```
-
-### Combined Build Options
-
-`--tool`, image tags, mirrors, and registries can be combined:
-
-```bash
-bash examples/swe_agent_blackbox/build_tool.sh \
- --tool mini_swe \
- --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/ \
- --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
+bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
```
-The build script:
-
-1. Selects the Dockerfile and default image name from `--tool`:
- - `mini_swe` -> `mini-swe-agent-tool:latest`
- - `claude_code` -> `claude-code-tool:latest`
-2. Tags and pushes the image when `--registry` is provided.
-
-Both tool images are sidecar runtime dependencies, not SWE-bench task base
-images. The `mini_swe` Python runtime is fully isolated from the sandbox
-container's Python. The `claude_code` Node/npm dependencies live only under
-`/opt/claude-code`, so the sandbox base image does not need Node installed.
+The `mini_swe` Python runtime is fully isolated from the sandbox container's
+Python.
### Build Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
-| `TOOL_IMAGE` | `mini-swe-agent-tool` / `claude-code-tool` | Image name; the default changes with `TOOL_KIND` |
+| `TOOL_IMAGE` | `mini-swe-agent-tool` | Image name |
| `TOOL_TAG` | `latest` | Image tag |
-| `TOOL_VERSION` | `latest` | Tool package version; for `claude_code`, this selects the `@anthropic-ai/claude-code` npm package version |
-| `PIP_INDEX_URL` | unset, use PyPI | pip index URL; equivalent to `--pip-index` |
-| `TOOL_KIND` | `mini_swe` | Tool kind: `mini_swe` or `claude_code` |
-| `NPM_REGISTRY` | unset, use npm default | npm registry URL; equivalent to `--npm-registry` |
+| `PIP_INDEX_URL` | unset, use PyPI | pip index URL (`--pip-index`) |
-## 2. Inference With OpenYuanRong Sandbox
+After pushing, point training at it with `SWE_AGENT_TOOL_IMAGE`.
-### Using run_infer.sh
+## 2. Training (Fully Async)
```bash
-cd "$(git rev-parse --show-toplevel)"
-
-RUNNER=mini_swe \
+AKERNEL_SERVER_ADDRESS="6.2.179.37:8888" \
+AKERNEL_TOKEN="" \
SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
-MODEL_PATH=$HOME/models/Qwen3.5-9B \
-DATA_PATH=$HOME/data/swe_agent/r2e_gym.parquet \
-MAX_SAMPLES=1 \
-TP=1 \
-bash examples/swe_agent_blackbox/scripts/run_infer.sh
-```
-
-### Calling Python Directly
-
-```bash
-python examples/swe_agent_blackbox/parallel_infer.py \
- --model-path ~/models/Qwen3.5-9B \
- --data-path ~/data/swe_agent/r2e_gym.parquet \
- --max-samples 1 \
- --runner mini_swe \
- --max-turns 100 \
- --tensor-parallel-size 1
-```
-
-## 3. Inference
-
-### Environment Variables
-
-```bash
-export OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888"
-export OPENYUANRONG_TOKEN=""
-export DEPLOYMENT=openyuanrong
-```
-
-### Run mini_swe
-
-```bash
-RUNNER=mini_swe \
-OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
-OPENYUANRONG_TOKEN="" \
-DEPLOYMENT=openyuanrong \
-SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest \
-bash examples/swe_agent_blackbox/scripts/run_infer.sh
-```
-
-### Run Claude Code
-
-```bash
-RUNNER=claude_code \
-OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
-OPENYUANRONG_TOKEN="" \
-DEPLOYMENT=openyuanrong \
-SWE_AGENT_TOOL_IMAGE=swr.cn-east-3.myhuaweicloud.com/openyuanrong/claude-code-tool:latest \
-SWE_AGENT_MAX_TURNS=50 \
-SWE_AGENT_RUN_TIMEOUT=7200 \
-bash examples/swe_agent_blackbox/scripts/run_infer.sh
-```
-
-## 4. Training (Fully Async)
-
-```bash
-OPENYUANRONG_SERVER_ADDRESS="6.2.179.37:8888" \
-OPENYUANRONG_TOKEN="" \
MODEL_PATH=~/models/Qwen3.5-9B \
-bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh
+bash examples/blackbox_recipes/mini_swe_agent/run_train.sh
```
-The training YAML keeps `mini_swe` as the default runner:
+The training YAML keeps `mini_swe` as the only runner:
```yaml
-agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
-```
-
-To run training with Claude Code, keep the YAML unchanged and override the runner
-FQN from the launch command:
-
-```bash
-python3 -m verl.experimental.fully_async_policy.fully_async_main \
- --config-path examples/swe_agent_blackbox/config \
- --config-name swe_agent_blackbox_megatron_async \
- actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=examples.swe_agent_blackbox.claude_code_runner.claude_code_runner
+runner_fqn: examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner  # under agent_runners.swe_agent
```
-## 5. Configuration
+## 3. Configuration
| Variable | Default | Description |
|----------|---------|-------------|
-| `SWE_AGENT_MAX_TURNS` | `100` | Max agent steps |
+| `AGENT_MAX_TURNS` | `100` | mini-swe-agent `step_limit` (the agent's turn budget); read by the runner from the `AGENT_MAX_TURNS` env var |
+| `SWE_AGENT_EVAL_TIMEOUT` | `600` | Reward evaluation timeout (seconds) |
+| `SWE_AGENT_RUN_TIMEOUT` | `7200` | Max wall time for the agent process in the sandbox |
| `SWE_AGENT_TOOL_IMAGE` | `swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest` | Sidecar tool image |
-| `DEBUG_MODE` | (unset) | Set to 1 to enable debug logging |
+| `CONDA_ENV` | `testbed` | Conda env activated inside the sandbox before running the agent |
diff --git a/examples/blackbox_recipes/mini_swe_agent/build_tool.sh b/examples/blackbox_recipes/mini_swe_agent/build_tool.sh
new file mode 100755
index 00000000..7bcdbe1f
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/build_tool.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Build the mini-swe-agent sidecar tool image.
+#
+# The image uses python-build-standalone to build an isolated Python runtime
+# with mini-swe-agent + litellm + run_agent.py, copied into a minimal
+# `FROM scratch` final stage rooted at /opt/mini-swe-agent. It is mounted into
+# the SWE-bench sandbox at /opt/mini-swe-agent, so the sandbox base image does
+# not need Python for the sidecar tool runtime.
+#
+# Usage:
+# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh
+# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
+# bash examples/blackbox_recipes/mini_swe_agent/build_tool.sh --registry swr.cn-east-3.myhuaweicloud.com/openyuanrong
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}"
+IMAGE_TAG="${TOOL_TAG:-latest}"
+
+# Parse args
+REGISTRY=""
+PIP_INDEX_URL="${PIP_INDEX_URL:-}"
+while [[ $# -gt 0 ]]; do
+ case "$1" in
+ --registry) REGISTRY="$2"; shift 2 ;;
+ --pip-index) PIP_INDEX_URL="$2"; shift 2 ;;
+ *) echo "Unknown arg: $1"; exit 1 ;;
+ esac
+done
+
+BUILD_ARGS=()
+if [[ -n "${PIP_INDEX_URL}" ]]; then
+ BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}")
+fi
+
+echo "==> Building mini_swe tool image: ${IMAGE_NAME}:${IMAGE_TAG}"
+docker build \
+ -f "${SCRIPT_DIR}/Dockerfile.mini-swe-agent-tool" \
+ -t "${IMAGE_NAME}:${IMAGE_TAG}" \
+ "${BUILD_ARGS[@]}" \
+ "${SCRIPT_DIR}/"
+
+if [[ -n "${REGISTRY}" ]]; then
+ FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
+ echo "==> Tagging and pushing: ${FULL_TAG}"
+ docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}"
+ docker push "${FULL_TAG}"
+ echo " Pushed."
+fi
+
+echo ""
+echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}"
+if [[ -n "${REGISTRY}" ]]; then
+ echo " Remote sandbox: ${FULL_TAG}"
+fi
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
deleted file mode 100644
index b7352b72..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/config/agent_config.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-- name: swe_agent
-
- _target_: uni_agent.agent_loop.UniAgentLoop
- concurrency: 64
- log_dir: /tmp/swebench_qwen3_coder
- mask_abnormal_exit_traj: false
-
- interaction:
- action_timeout: 300
- max_turns: 100
-
- env:
- deployment:
- type: local
- command: /usr/bin/python3 -m swerex.server --auth-token {token}
- timeout: 600
- startup_timeout: 600
- container_runtime: docker
- env_variables:
- PIP_PROGRESS_BAR: "off"
- PIP_CACHE_DIR: "~/.cache/pip"
- PAGER: "cat"
- MANPAGER: "cat"
- LESS: "-R"
- TQDM_DISABLE: "1"
- GIT_PAGER: "cat"
-
- tool_parser: qwen3_coder
-
- tools:
- - name: str_replace_editor
- - name: execute_bash
- - name: submit
-
- reward:
- eval_timeout: 600
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml b/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
deleted file mode 100644
index b298c676..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/config/agent_config_openyuanrong.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-- name: swe_agent
-
- _target_: uni_agent.agent_loop.UniAgentLoop
- concurrency: 64
- log_dir: /tmp/swebench_qwen3_coder
- mask_abnormal_exit_traj: false
-
- interaction:
- action_timeout: 300
- max_turns: 100
-
- env:
- deployment:
- type: openyuanrong
- command: /opt/swe-rex/bin/python /opt/swe-rex/bin/swerex-remote --host 0.0.0.0 --port {port} --auth-token {token}
- timeout: 600
- startup_timeout: 600
- swerex_runtime_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/swerex-runtime:1.4.0
- swerex_runtime_target: /opt/swe-rex
- env_variables:
- PIP_PROGRESS_BAR: "off"
- PIP_CACHE_DIR: "~/.cache/pip"
- PAGER: "cat"
- MANPAGER: "cat"
- LESS: "-R"
- TQDM_DISABLE: "1"
- GIT_PAGER: "cat"
-
- tool_parser: qwen3_coder
-
- tools:
- - name: str_replace_editor
- - name: execute_bash
- - name: submit
-
- reward:
- eval_timeout: 600
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml b/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
deleted file mode 100644
index 0829fdcd..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/config/parallel_infer.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-# Parallel inference config for the blackbox SWE-agent recipe.
-# Composes verl's base configs with inference-specific overrides.
-
-defaults:
- - model_engine: dp
- - actor@actor_rollout_ref.actor: ${model_engine}_actor
- - rollout@actor_rollout_ref.rollout: rollout
- - model@actor_rollout_ref.model: hf_model
- - reward: reward
- - _self_
-
-hydra:
- searchpath:
- - pkg://verl.trainer.config
-
-actor_rollout_ref:
- hybrid_engine: true
- nccl_timeout: 600
- model: {}
- rollout:
- agent: {}
-
-trainer:
- nnodes: 1
- n_gpus_per_node: 8
- logger:
- - console
- device: cuda
- total_epochs: 1
- total_training_steps: null
- balance_batch: false
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
deleted file mode 100644
index 62b73da1..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox.yaml
+++ /dev/null
@@ -1,123 +0,0 @@
-# PPO trainer config for the blackbox SWE-agent recipe (v2).
-# Uses the generic AgentFrameworkRolloutAdapter + SWEAgentFramework subclass.
-
-hydra:
- searchpath:
- - pkg://verl.trainer.config
-
-defaults:
- - ppo_trainer
- - _self_
-
-actor_rollout_ref:
- hybrid_engine: true
- nccl_timeout: 600
-
- model:
- path: ???
- enable_gradient_checkpointing: true
-
- rollout:
- name: vllm
- mode: async
- prompt_length: 4096
- response_length: 131072
- max_model_len: 135168
- temperature: 1.0
- top_p: 1.0
- n: 8
- tensor_model_parallel_size: 4
- gpu_memory_utilization: 0.7
- calculate_log_probs: true
- enable_sleep_mode: true
- free_cache_engine: true
-
- multi_turn:
- enable: true
- max_assistant_turns: 1
- max_parallel_calls: 1
- format: qwen3_coder
-
- agent:
- num_workers: 8
- agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter
-
- custom:
- agent_framework:
- framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
- agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
- gateway_count: 1
- completion_timeout_seconds: 600
- max_concurrent_sessions: 32
- agent_runner_kwargs:
- agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
-
- actor:
- use_dynamic_bsz: true
- ppo_mini_batch_size: 16
- use_kl_loss: false
- kl_loss_coef: 0.0
- clip_ratio_low: 0.2
- clip_ratio_high: 0.28
- loss_agg_mode: token-mean
- optim:
- lr: 1e-6
- weight_decay: 0.1
- clip_grad: 1.0
- fsdp_config:
- param_offload: true
- optimizer_offload: true
- grad_offload: true
-
-data:
- train_files: ???
- val_files: ???
- max_prompt_length: 4096
- max_response_length: 131072
- train_batch_size: 128
- val_batch_size: 128
- return_raw_chat: true
- trust_remote_code: true
- custom_cls:
- path: pkg://examples.swe_agent_blackbox.dataset
- name: SWEBenchDataset
-
-algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
- use_kl_in_reward: false
- kl_ctrl:
- type: fixed
- kl_coef: 0.0
-
-reward:
- custom_reward_function:
- path: pkg://examples/swe_agent_blackbox.reward
- name: compute_score
-
-trainer:
- use_legacy_worker_impl: disable
- nnodes: 1
- n_gpus_per_node: 8
- total_epochs: 10
- project_name: swe_agent_blackbox
- experiment_name: swe_agent
- logger:
- - console
- device: cuda
- balance_batch: false
- val_before_train: true
- val_only: false
- save_freq: 10
- test_freq: 10
- default_local_dir: checkpoints/swe_agent_blackbox
- resume_mode: disable
-
-ray_kwargs:
- ray_init:
- runtime_env:
- env_vars:
- TRANSFER_QUEUE_ENABLE: ""
- NCCL_P2P_DISABLE: "1"
- NCCL_SHM_DISABLE: "1"
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
deleted file mode 100644
index 65b09b1a..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_sync.yaml
+++ /dev/null
@@ -1,129 +0,0 @@
-# Megatron sync training config for the blackbox SWE-agent recipe.
-# Uses main_ppo_sync + Megatron backend, same blackbox infrastructure as FSDP.
-#
-# Entry point: python3 -m verl.trainer.main_ppo_sync
-
-hydra:
- searchpath:
- - pkg://verl.trainer.config
-
-defaults:
- - ppo_megatron_trainer
- - _self_
-
-actor_rollout_ref:
- hybrid_engine: true
- nccl_timeout: 600
-
- model:
- path: ???
- enable_gradient_checkpointing: true
-
- rollout:
- name: vllm
- mode: async
- prompt_length: 4096
- response_length: 131072
- max_model_len: 135168
- temperature: 1.0
- top_p: 1.0
- n: 8
- tensor_model_parallel_size: 4
- gpu_memory_utilization: 0.7
- calculate_log_probs: true
- enable_sleep_mode: true
- free_cache_engine: true
-
- multi_turn:
- enable: true
- max_assistant_turns: 1
- max_parallel_calls: 1
- format: qwen3_coder
-
- agent:
- num_workers: 8
- agent_loop_manager_class: uni_agent.trainer.framework.entry.AgentFrameworkRolloutAdapter
-
- custom:
- agent_framework:
- framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
- agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
- gateway_count: 1
- completion_timeout_seconds: 600
- max_concurrent_sessions: 32
- agent_runner_kwargs:
- agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
-
- actor:
- use_dynamic_bsz: true
- ppo_mini_batch_size: 16
- use_kl_loss: false
- kl_loss_coef: 0.0
- clip_ratio_low: 0.2
- clip_ratio_high: 0.28
- loss_agg_mode: token-mean
- optim:
- lr: 1e-6
- weight_decay: 0.1
- clip_grad: 1.0
- megatron:
- param_offload: true
- grad_offload: true
- optimizer_offload: true
- tensor_model_parallel_size: 8
- pipeline_model_parallel_size: 1
- context_parallel_size: 1
- use_mbridge: true
-
-data:
- train_files: ???
- val_files: ???
- max_prompt_length: 4096
- max_response_length: 131072
- train_batch_size: 128
- val_batch_size: 128
- return_raw_chat: true
- trust_remote_code: true
- custom_cls:
- path: pkg://examples.swe_agent_blackbox.dataset
- name: SWEBenchDataset
-
-algorithm:
- gamma: 1.0
- lam: 1.0
- adv_estimator: grpo
- use_kl_in_reward: false
- kl_ctrl:
- type: fixed
- kl_coef: 0.0
-
-reward:
- custom_reward_function:
- path: pkg://examples.swe_agent_blackbox.reward
- name: compute_score
-
-trainer:
- use_legacy_worker_impl: disable
- nnodes: 1
- n_gpus_per_node: 8
- total_epochs: 10
- project_name: swe_agent_blackbox
- experiment_name: swe_agent
- logger:
- - console
- device: cuda
- balance_batch: false
- val_before_train: true
- val_only: false
- save_freq: 10
- test_freq: 10
- default_local_dir: checkpoints/swe_agent_blackbox
- resume_mode: disable
-
-ray_kwargs:
- ray_init:
- runtime_env:
- env_vars:
- TRANSFER_QUEUE_ENABLE: ""
- NCCL_P2P_DISABLE: "1"
- NCCL_SHM_DISABLE: "1"
diff --git a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml
similarity index 62%
rename from examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml
rename to examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml
index d25fcce5..ad2c719d 100644
--- a/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_async.yaml
+++ b/examples/blackbox_recipes/mini_swe_agent/config/swe_agent_blackbox_megatron_v1.yaml
@@ -1,8 +1,8 @@
-# Megatron + TQ fully-async training config for the blackbox SWE-agent recipe.
-# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend.
+# Megatron + V1 unified trainer config for the blackbox mini-swe recipe.
#
-# Entry point: python3 -m verl.experimental.fully_async_policy.fully_async_main
-# Requires: transfer_queue.enable=true (selects TQ path in FullyAsyncTaskRunner)
+# Entry point: python3 -m verl.trainer.main_ppo
+# Default trainer mode is separate_async. On a single 8-GPU node this recipe
+# uses 4 GPUs for trainer and 4 GPUs for standalone rollout.
hydra:
searchpath:
@@ -13,7 +13,7 @@ defaults:
- _self_
actor_rollout_ref:
- hybrid_engine: false
+ hybrid_engine: true
nccl_timeout: 9600
model:
@@ -22,13 +22,16 @@ actor_rollout_ref:
rollout:
name: vllm
mode: async
+ nnodes: 1
+ n_gpus_per_node: 4
prompt_length: 4096
response_length: 131072
max_model_len: 135168
temperature: 1.0
top_p: 1.0
+ top_k: -1
n: 8
- tensor_model_parallel_size: 2
+ tensor_model_parallel_size: 4
gpu_memory_utilization: 0.7
calculate_log_probs: true
enable_sleep_mode: true
@@ -37,26 +40,29 @@ actor_rollout_ref:
max_num_batched_tokens: 135168
checkpoint_engine:
backend: nccl
+ update_weights_bucket_megabytes: 2048
multi_turn:
enable: true
- max_assistant_turns: 1
max_parallel_calls: 1
format: qwen3_coder
agent:
num_workers: 8
- agent_loop_manager_class: uni_agent.trainer.framework.entry.FullyAsyncAgentFrameworkRolloutAdapter
+ agent_loop_manager_class: uni_agent.framework.entry.AgentFrameworkRolloutAdapter
custom:
agent_framework:
- framework_class_fqn: examples.swe_agent_blackbox.framework.SWEAgentFramework
- agent_runner_fqn: examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner
gateway_count: 1
- completion_timeout_seconds: 600
- max_concurrent_sessions: 32
- agent_runner_kwargs:
- agent_config_path: examples/swe_agent_blackbox/config/agent_config.yaml
+ agent_runners:
+ swe_agent:
+ runner_fqn: examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner
+ dispatch_mode: ray_task
+ max_concurrent_sessions: 32
+ runner_kwargs:
+ tool_image: swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest
+ run_timeout: 3600
+ conda_env: testbed
actor:
use_dynamic_bsz: true
@@ -78,16 +84,17 @@ actor_rollout_ref:
param_offload: true
grad_offload: true
optimizer_offload: true
- tensor_model_parallel_size: 8
+ tensor_model_parallel_size: 4
pipeline_model_parallel_size: 1
context_parallel_size: 1
use_mbridge: true
use_remove_padding: false
ref:
+ log_prob_micro_batch_size_per_gpu: 1
megatron:
param_offload: false
- tensor_model_parallel_size: 8
+ tensor_model_parallel_size: 4
pipeline_model_parallel_size: 1
context_parallel_size: 1
@@ -98,12 +105,14 @@ data:
truncation: left
max_prompt_length: 4096
max_response_length: 131072
- train_batch_size: 0
+ train_batch_size: 1
+ val_batch_size: 1
gen_batch_size: 1
return_raw_chat: true
trust_remote_code: true
+ dataloader_num_workers: 0
custom_cls:
- path: pkg://examples.swe_agent_blackbox.dataset
+ path: pkg://examples.blackbox_recipes.mini_swe_agent.dataset
name: SWEBenchDataset
algorithm:
@@ -119,13 +128,14 @@ algorithm:
reward:
custom_reward_function:
- path: pkg://examples.swe_agent_blackbox.reward
+ path: pkg://examples.blackbox_recipes.mini_swe_agent.reward
name: compute_score
trainer:
nnodes: 1
- n_gpus_per_node: 8
+ n_gpus_per_node: 4
total_epochs: 10
+ total_training_steps: null
project_name: swe_agent_blackbox
experiment_name: swe_agent
logger:
@@ -137,18 +147,14 @@ trainer:
test_freq: 10
default_local_dir: checkpoints/swe_agent_blackbox
resume_mode: auto
-
-rollout:
- nnodes: 1
- n_gpus_per_node: 8
- total_rollout_steps: 100000
-
-async_training:
- use_trainer_do_validate: false
- staleness_threshold: 1.0
- trigger_parameter_sync_step: 4
- require_batches: 1
- partial_rollout: true
+ use_v1: true
+ v1:
+ trainer_mode: separate_async
+ colocate_async:
+ num_warmup_batches: 1
+ separate_async:
+ num_warmup_batches: 4
+ parameter_sync_step: 4
transfer_queue:
enable: true
diff --git a/examples/blackbox_recipes/mini_swe_agent/dataset.py b/examples/blackbox_recipes/mini_swe_agent/dataset.py
index 89d65129..e7781c03 100644
--- a/examples/blackbox_recipes/mini_swe_agent/dataset.py
+++ b/examples/blackbox_recipes/mini_swe_agent/dataset.py
@@ -21,7 +21,6 @@ def extract_image(env_config: dict) -> str:
class SWEBenchDataset(RLHFDataset):
-
def __getitem__(self, item):
row_dict = super().__getitem__(item)
extra_info = row_dict.get("extra_info", {})
diff --git a/examples/blackbox_recipes/mini_swe_agent/framework.py b/examples/blackbox_recipes/mini_swe_agent/framework.py
deleted file mode 100644
index 7c5c027c..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/framework.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""SWE-agent specific framework subclass.
-
-Injects reward_info (from agent_runner's complete_session call)
-into sample_fields["extra_info"] so the reward worker's
-compute_score can access it via extra_info.
-
-Overrides _run_session to execute agent_runner in a separate Ray worker
-process, preventing blocking operations from stalling the event loop.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import functools
-import logging
-from dataclasses import replace
-from uuid import uuid4
-
-import ray
-
-from uni_agent.trainer.framework.framework import OpenAICompatibleAgentFramework
-
-from examples.swe_agent_blackbox.subprocess_runner import remote_agent_run
-
-logger = logging.getLogger(__name__)
-
-
-class SWEAgentFramework(OpenAICompatibleAgentFramework):
-
- async def _score_trajectories(self, session_trajectories, sample_fields):
- if session_trajectories and session_trajectories[-1].reward_info:
- reward_info = session_trajectories[-1].reward_info
- extra_info = dict(sample_fields.get("extra_info") or {})
- sample_fields = {**sample_fields, "extra_info": {**extra_info, **reward_info}}
- return await super()._score_trajectories(session_trajectories, sample_fields)
-
- def _resolve_runner(self) -> tuple[str, dict]:
- """Extract FQN and pre-bound kwargs from self.agent_runner.
-
- self.agent_runner may be a functools.partial (from_config wraps it),
- so we unpack the original function and its keywords.
- """
- fn = self.agent_runner
- kwargs = {}
- if isinstance(fn, functools.partial):
- kwargs = dict(fn.keywords)
- fn = fn.func
- fqn = f"{fn.__module__}.{fn.__qualname__}"
- return fqn, kwargs
-
- async def _run_session(
- self,
- *,
- prompts,
- raw_prompt,
- sample_index: int,
- session_id: str | None = None,
- runner_kwargs: dict | None = None,
- ):
- """Run agent_runner in a Ray worker process instead of in-process."""
- session_id = session_id or f"session-{sample_index}-0-{uuid4().hex}"
- sample_fields = self._extract_sample_fields(prompts=prompts, sample_index=sample_index)
- session = await self.session_runtime.create_session(session_id)
- agent_runner_fqn, resolved_kwargs = self._resolve_runner()
-
- try:
- if runner_kwargs:
- resolved_kwargs = {**resolved_kwargs, **runner_kwargs}
-
- ref = remote_agent_run.remote(
- agent_runner_fqn=agent_runner_fqn,
- raw_prompt=raw_prompt,
- session_id=session_id,
- base_url=session.base_url,
- sample_index=sample_index,
- runner_kwargs=resolved_kwargs,
- )
- loop = asyncio.get_running_loop()
- reward_info = await loop.run_in_executor(None, ray.get, ref)
-
- await self.session_runtime.complete_session(
- session_id, reward_info=reward_info,
- )
- session_trajectories = await self.session_runtime.finalize_session(session_id)
-
- except Exception as e:
- logger.error("_run_session failed: session=%s, sample=%d, runner=%s: %s",
- session_id, sample_index, agent_runner_fqn, e, exc_info=True)
- await self.session_runtime.abort_session(session_id)
- raise
-
- if not self.reward_loop_worker_handles or not session_trajectories:
- return session_trajectories, sample_fields
-
- annotations = await self._score_trajectories(session_trajectories, sample_fields)
- scored_trajectories = []
- for traj, (score, extra) in zip(session_trajectories, annotations, strict=True):
- scored_trajectories.append(
- replace(
- traj,
- reward_score=score,
- extra_fields={**traj.extra_fields, "reward_extra_info": extra},
- )
- )
- return scored_trajectories, sample_fields
diff --git a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
index 33882bc8..2b16099a 100644
--- a/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
+++ b/examples/blackbox_recipes/mini_swe_agent/mini_swe_agent_runner.py
@@ -1,6 +1,6 @@
"""Mini-swe-agent runner for the blackbox SWE-agent recipe.
-Agent runs inside a OpenYuanRong remote sandbox via sidecar tool image mount.
+Agent runs inside a remote sandbox via sidecar tool image mount.
The runner creates the sandbox, pipes task config via stdin, parses
the result from stdout, and evaluates reward in the same sandbox.
"""
@@ -15,21 +15,24 @@
import time
from pathlib import Path
-from uni_agent.trainer.framework.types import SessionHandle, SessionRuntime
+import httpx
-from examples.swe_agent_blackbox.dataset import extract_image
-from examples.swe_agent_blackbox.reward import build_reward_context, evaluate_in_env
-from examples.swe_agent_blackbox.sandbox import CommandResult, YRSandbox, extract_upstream, rewrite_gateway_url
+from examples.blackbox_recipes.mini_swe_agent.dataset import extract_image
+from examples.blackbox_recipes.mini_swe_agent.reward import build_reward_context, evaluate_in_env
+from examples.blackbox_recipes.sandbox_client import (
+ SandboxClient,
+ extract_upstream,
+ rewrite_gateway_url,
+)
+from uni_agent.gateway.session import SessionHandle
logger = logging.getLogger(__name__)
-if os.environ.get("DEBUG_MODE"):
- logger.setLevel(logging.DEBUG)
DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest"
class SandboxEnvForReward:
- """Adapts :class:`YRSandbox` to the async env interface used by
+ """Adapts :class:`SandboxClient` to the async env interface used by
reward specs (``communicate``, ``write_file``, ``read_file``).
"""
@@ -67,7 +70,7 @@ def _build_task_config(
) -> dict:
"""Build the task config passed to run_agent.py via stdin."""
agent_gateway_url = rewrite_gateway_url(gateway_url)
- step_limit = int(os.environ.get("SWE_AGENT_MAX_TURNS", "100"))
+ step_limit = int(os.environ.get("AGENT_MAX_TURNS", "100"))
return {
"task": task,
"gateway_url": agent_gateway_url,
@@ -84,15 +87,20 @@ def build_agent_command(
) -> str:
"""Build the command that runs run_agent.py inside the sandbox."""
conda_prefix = f"/opt/miniconda3/envs/{conda_env}"
- env_prefix = (
+ run_agent_env = (
f"CONDA_DEFAULT_ENV={shlex.quote(conda_env)} "
f"CONDA_PREFIX={shlex.quote(conda_prefix)} "
- f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH"
+ f"PATH={shlex.quote(conda_prefix + '/bin')}:/opt/miniconda3/bin:$PATH "
+ "PIP_DISABLE_PIP_VERSION_CHECK=1 "
+ "PIP_PROGRESS_BAR=off"
)
return (
"unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy NO_PROXY no_proxy; "
- f"{env_prefix} "
- f"echo {config_b64} | base64 -d | "
+ f"env {run_agent_env} sh -c 'echo \"[mini_swe] shell env: CONDA_DEFAULT_ENV=$CONDA_DEFAULT_ENV "
+ 'CONDA_PREFIX=$CONDA_PREFIX PATH=$PATH" >&2; '
+ 'echo "[mini_swe] python=$(command -v python) pip=$(command -v pip)" >&2\' ; '
+ f"printf %s {shlex.quote(config_b64)} | base64 -d | "
+ f"env {run_agent_env} "
"/opt/mini-swe-agent/bin/python /opt/mini-swe-agent/bin/run_agent.py"
)
@@ -102,21 +110,21 @@ async def mini_swe_agent_runner(
raw_prompt,
session: SessionHandle,
sample_index: int,
- session_runtime: SessionRuntime,
tools_kwargs: dict | None = None,
tool_image: str = DEFAULT_TOOL_IMAGE,
run_timeout: int = 7200,
conda_env: str = "testbed",
+ sandbox_max_retries: int = 10,
**kwargs,
) -> None:
"""Run mini-swe-agent inside a sandbox with sidecar tool mount.
Flow:
- 1. Create OpenYuanRong remote sandbox with mini-swe-agent sidecar
+ 1. Create remote sandbox with mini-swe-agent sidecar
2. Pipe task config to run_agent.py via stdin
3. Parse agent result from stdout
4. Evaluate reward in the same sandbox
- 5. Complete session with reward_info
+ 5. POST reward_info to ``session.reward_info_url`` for the framework reward path
"""
tools_kwargs = tools_kwargs or {}
logger.info("mini_swe_agent_runner called, sample_index=%d", sample_index)
@@ -130,14 +138,17 @@ async def mini_swe_agent_runner(
if not image:
raise ValueError(f"No sandbox image found in tools_kwargs.env for sample {sample_index}")
- # Gateway URL — extract upstream for OpenYuanRong tunnel
+ # Gateway URL — extract upstream for tunnel
gateway_url = session.base_url
if not gateway_url:
raise ValueError(f"gateway_url is empty for sample {sample_index}")
upstream = extract_upstream(gateway_url)
- sandbox = await YRSandbox.create(
- image=image, sidecar_image=tool_image, upstream=upstream,
+ sandbox = await SandboxClient.create(
+ image=image,
+ sidecar_image=tool_image,
+ upstream=upstream,
+ max_retries=int(sandbox_max_retries),
)
sandbox_id = sandbox.sandbox_id
logger.info("Sandbox created (image=%s, sandbox_id=%s)", image, sandbox_id)
@@ -168,14 +179,17 @@ async def mini_swe_agent_runner(
elapsed = time.perf_counter() - t0
logger.debug(
"[sample %d] agent process finished: rc=%d (%.1fs)",
- sample_index, agent_result.exit_code, elapsed,
+ sample_index,
+ agent_result.exit_code,
+ elapsed,
)
# Parse agent result from stdout
agent_info = _parse_agent_result(agent_result.stdout, sample_index)
logger.info(
"[sample %d] agent: exit_status=%s, submission=%d chars",
- sample_index, agent_info.get("exit_status"),
+ sample_index,
+ agent_info.get("exit_status"),
len(agent_info.get("submission", "")),
)
@@ -186,11 +200,18 @@ async def mini_swe_agent_runner(
score, eval_result = await evaluate_in_env(reward_env, metadata, eval_timeout)
logger.debug(
"[sample %d] reward done: score=%s, resolved=%s (%.1fs)",
- sample_index, score, eval_result.get("resolved"), time.perf_counter() - t0,
+ sample_index,
+ score,
+ eval_result.get("resolved"),
+ time.perf_counter() - t0,
)
reward_info = {"reward_score": score, **eval_result}
- await session_runtime.complete_session(session.session_id, reward_info=reward_info)
+ if not session.reward_info_url:
+ raise ValueError(f"reward_info_url is empty for session {session.session_id}")
+ async with httpx.AsyncClient(timeout=30.0) as client:
+ response = await client.post(session.reward_info_url, json={"reward_info": reward_info})
+ response.raise_for_status()
except Exception as e:
logger.warning("Mini-swe-agent runner failed for sample %d (sandbox_id=%s): %s", sample_index, sandbox_id, e)
@@ -212,7 +233,7 @@ def _parse_agent_result(stdout: str, sample_index: int) -> dict:
if not stdout:
return {"exit_status": "error", "submission": ""}
# Try the last line that looks like JSON first
- lines = [l.strip() for l in stdout.split("\n") if l.strip()]
+ lines = [ln.strip() for ln in stdout.split("\n") if ln.strip()]
for line in reversed(lines):
if line.startswith("{"):
try:
diff --git a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
index c74765e0..cd792cd5 100644
--- a/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
+++ b/examples/blackbox_recipes/mini_swe_agent/parallel_infer.py
@@ -1,11 +1,16 @@
-"""Parallel inference runner for the blackbox SWE-agent recipe (v2).
+"""Standalone inference runner for the blackbox mini-swe-agent recipe.
-Creates an LLM server, GatewayServingRuntime, and SWEAgentFramework,
-then runs agent sessions in parallel and reports resolve rate.
+Spins up the rollout engine (vLLM or SGLang) + gateway + a reward worker, runs agent sessions
+in parallel, and reports resolve rate. Does NOT start the Megatron trainer.
-Usage (CLI):
- python examples/swe_agent_blackbox/parallel_infer.py \
- --model-path ~/models/Qwen3-Coder-30B-A3B-Instruct \
+Reuses the recipe's existing training config
+(config/swe_agent_blackbox_megatron_v1.yaml); its megatron/optimizer sections
+are inert here since this driver never builds the actor worker group — only
+the rollout, agent_framework, model, and reward sections are read.
+
+Usage:
+ python examples/blackbox_recipes/mini_swe_agent/parallel_infer.py \
+ --model-path ~/models/Qwen3.5-9B \
--data-path ~/data/swe_agent/swe_bench_verified.parquet \
--max-samples 10
"""
@@ -14,32 +19,20 @@
import argparse
import asyncio
-import json
import logging
import os
-from functools import partial
from typing import Any
from uuid import uuid4
import numpy as np
import ray
-from verl import DataProto
-from verl.protocol import pad_dataproto_to_divisor
-from verl.utils import hf_tokenizer
-from verl.utils.transferqueue_utils import tq as _tq_mock
+from verl.experimental.reward_loop.reward_loop import RewardLoopWorker
+from verl.utils import tensordict_utils as tu
+from verl.utils.transferqueue_utils import tq
from verl.workers.rollout.llm_server import LLMServerManager
-from uni_agent.trainer.gateway.runtime import GatewayServingRuntime
-
-from examples.swe_agent_blackbox.framework import SWEAgentFramework
-from examples.swe_agent_blackbox.agent_runner import swe_agent_runner
-from examples.swe_agent_blackbox.claude_code_runner import claude_code_runner
-
-try:
- from examples.swe_agent_blackbox.mini_swe_agent_runner import mini_swe_agent_runner
-except ImportError:
- mini_swe_agent_runner = None
+from uni_agent.framework.entry import build_agent_framework, build_gateway_manager
logging.basicConfig(
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -48,9 +41,14 @@
)
logger = logging.getLogger(__name__)
+# ── Recipe-specific constants (only _CONFIG_NAME and _DEFAULT_TOOL_IMAGE differ between recipes) ──
+_CONFIG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config")
+_CONFIG_NAME = "swe_agent_blackbox_megatron_v1"
+_DEFAULT_TOOL_IMAGE = "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest"
+
# =====================================================================
-# Dataset loading (inlined from dataset.py — only used here)
+# Dataset loading (inlined; keeps the driver self-contained)
# =====================================================================
@@ -83,7 +81,6 @@ def _remap_sample_images(sample: dict[str, Any]) -> dict[str, Any]:
def _inject_reward_fields(sample: dict[str, Any]) -> None:
- """Inject verl-standard data_source and reward_model from extra_info.tools_kwargs.reward."""
extra_info = sample.get("extra_info", {})
tools_kwargs = extra_info.get("tools_kwargs", {})
reward_config = tools_kwargs.get("reward", {})
@@ -91,334 +88,272 @@ def _inject_reward_fields(sample: dict[str, Any]) -> None:
sample.setdefault("reward_model", {"ground_truth": {}})
-def load_swe_dataset(data_path: str | list[str], max_samples: int = -1) -> list[dict[str, Any]]:
+def load_swe_dataset(data_path: str, max_samples: int = -1) -> list[dict[str, Any]]:
import pyarrow.parquet as pq
- if isinstance(data_path, list):
- paths = [os.path.expanduser(p) for p in data_path]
- else:
- paths = os.path.expanduser(data_path)
-
- logger.info("Loading dataset from: %s", data_path)
- if isinstance(paths, list):
- import pyarrow as pa
- tables = [pq.read_table(p) for p in paths]
- table = pa.concat_tables(tables)
- else:
- table = pq.read_table(paths)
- samples = table.to_pylist()
-
+ path = os.path.expanduser(data_path)
+ logger.info("Loading dataset from: %s", path)
+ samples = pq.read_table(path).to_pylist()
for i, sample in enumerate(samples):
samples[i] = _remap_sample_images(sample)
_inject_reward_fields(samples[i])
-
if max_samples > 0:
samples = samples[:max_samples]
- logger.info("Using first %d samples (max_samples=%d)", len(samples), max_samples)
-
- logger.info("Loaded %d samples from %s", len(samples), data_path)
+ logger.info("Loaded %d samples", len(samples))
return samples
-class _MockReplayBuffer:
- """Minimal replay buffer for inference mode (no actual training)."""
-
- def add(self, partition_id, items):
- pass
+# =====================================================================
+# Config
+# =====================================================================
-def run_inference(
+def _load_config(
*,
model_path: str,
- data_path: str,
- prompt_length: int = 4096,
- response_length: int = 65536,
- temperature: float = 0.8,
- top_p: float = 0.9,
- n: int = 1,
- max_samples: int = -1,
- engine: str = "vllm",
- nnodes: int = 1,
- n_gpus_per_node: int = 8,
- tensor_parallel_size: int = 4,
- gateway_count: int = 1,
- max_concurrent_sessions: int = 2,
- completion_timeout: float = 600.0,
- tool_parser: str | None = None,
- agent_config_path: str | None = None,
- runner: str = "uniagent",
- tool_image: str | None = None,
- run_timeout: int = 7200,
-) -> dict[str, Any]:
- """Run parallel SWE-agent inference using the blackbox framework."""
- if runner == "mini_swe":
- if mini_swe_agent_runner is None:
- raise ImportError("mini-swe-agent is required for --runner mini_swe. Install with: pip install mini-swe-agent")
- _agent_runner = partial(
- mini_swe_agent_runner,
- tool_image=tool_image or "swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest",
- run_timeout=run_timeout,
- )
- elif runner == "claude_code":
- _agent_runner = partial(
- claude_code_runner,
- tool_image=tool_image or "claude-code-tool:latest",
- run_timeout=run_timeout,
- )
- else:
- _agent_runner = swe_agent_runner
+ engine: str,
+ prompt_length: int,
+ response_length: int,
+ temperature: float,
+ top_p: float,
+ n: int,
+ nnodes: int,
+ n_gpus_per_node: int,
+ tensor_parallel_size: int,
+ gateway_count: int,
+ max_concurrent_sessions: int,
+ tool_image: str | None,
+ run_timeout: int,
+) -> Any:
+ """Compose the recipe's training config and override inference fields.
- if not ray.is_initialized():
- ray.init()
+ The megatron/actor/optimizer sections are left untouched and never read.
+ """
+ from hydra import compose, initialize_config_dir
+ from omegaconf import OmegaConf
- # 1. Init Hydra config
- config = _init_hydra_config(
- model_path=model_path,
- engine=engine,
- prompt_length=prompt_length,
- response_length=response_length,
- temperature=temperature,
- top_p=top_p,
- n=n,
- nnodes=nnodes,
- n_gpus_per_node=n_gpus_per_node,
- tensor_parallel_size=tensor_parallel_size,
- )
+ with initialize_config_dir(config_dir=_CONFIG_DIR, version_base=None):
+ config = compose(config_name=_CONFIG_NAME)
- # 2. Load dataset
- samples = load_swe_dataset(data_path, max_samples=max_samples)
- logger.info(
- "Loaded %d samples, %d rollout(s) each, runner=%s, gateway_count=%d, max_concurrent_sessions=%d",
- len(samples),
- n,
- runner,
- gateway_count,
- max_concurrent_sessions,
- )
+ OmegaConf.set_struct(config, False)
- if not samples:
- raise ValueError("No samples to process")
+ config.actor_rollout_ref.model.path = os.path.expanduser(model_path)
- # 3. Create LLM server
- logger.info("Initializing LLM server manager...")
- llm_server_manager = LLMServerManager.create(config=config)
+ ro = config.actor_rollout_ref.rollout
+ ro.name = engine
+ ro.mode = "async"
+ ro.prompt_length = prompt_length
+ ro.response_length = response_length
+ ro.max_model_len = prompt_length + response_length + 1024
+ ro.max_num_batched_tokens = ro.max_model_len
+ ro.n = n
+ ro.temperature = temperature
+ ro.top_p = top_p
+ ro.tensor_model_parallel_size = tensor_parallel_size
+ ro.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.7"))
+ ro.nnodes = nnodes
+ ro.n_gpus_per_node = n_gpus_per_node
+ ro.calculate_log_probs = True
+ ro.enable_sleep_mode = False
+
+ af = ro.custom.agent_framework
+ af.gateway_count = gateway_count
+ runner_name = next(iter(af.agent_runners.keys()))
+ runner_cfg = af.agent_runners[runner_name]
+ runner_cfg.max_concurrent_sessions = max_concurrent_sessions
+ if tool_image:
+ runner_cfg.runner_kwargs.tool_image = tool_image
+ runner_cfg.runner_kwargs.run_timeout = run_timeout
- # 4. Create GatewayServingRuntime
- logger.info("Using tool_parser=%r", tool_parser)
+ config.trainer.nnodes = nnodes
+ config.trainer.n_gpus_per_node = n_gpus_per_node
- llm_client = llm_server_manager.get_client()
- gateway_actor_kwargs = {
- "tokenizer": hf_tokenizer(os.path.expanduser(model_path)),
- "base_sampling_params": {"temperature": temperature, "top_p": top_p, "max_tokens": response_length},
- }
- if tool_parser:
- gateway_actor_kwargs["tool_parser_name"] = tool_parser
-
- gateway_runtime = GatewayServingRuntime(
- llm_client=llm_client,
- gateway_count=gateway_count,
- gateway_actor_kwargs=gateway_actor_kwargs,
- )
+ OmegaConf.set_struct(config, True)
+ return config
- # 5. Create RewardLoopWorker for compute_score
- from verl.experimental.reward_loop.reward_loop import RewardLoopWorker
- reward_worker = ray.remote(RewardLoopWorker).remote(config, None)
- # 6. Create framework
- framework = SWEAgentFramework(
- session_runtime=gateway_runtime,
- agent_runner=_agent_runner,
- replay_buffer=_MockReplayBuffer(),
- rollout_config={"n": n, "val_kwargs": {"n": n}},
- completion_timeout=completion_timeout,
- wait_for_completion_after_agent_run=True,
- max_concurrent_sessions=max_concurrent_sessions,
- reward_loop_worker_handles=[reward_worker],
- )
-
- # 6. Build batch data and run
- _tools_kwargs_list = []
- for sample in samples:
- tk = (sample.get("extra_info") or {}).get("tools_kwargs", {})
- if runner == "uniagent" and agent_config_path:
- tk["agent_config_path"] = agent_config_path
- tk["model_path"] = os.path.expanduser(model_path)
- _tools_kwargs_list.append(tk)
+# =====================================================================
+# Batch + score capture
+# =====================================================================
- from tensordict import TensorDict
- from verl.utils import tensordict_utils as _tu
+def _build_prompts(samples: list[dict[str, Any]]) -> tuple[Any, list[str]]:
raw_prompts = [sample["prompt"] for sample in samples]
uids = [str(uuid4()) for _ in samples]
- td = TensorDict({"uid": uids, "global_steps": [0] * len(samples)}, batch_size=[len(samples)])
- _tu.assign_non_tensor_stack(td, "raw_prompt", raw_prompts)
- _tu.assign_non_tensor_stack(td, "tools_kwargs", _tools_kwargs_list)
- _tu.assign_non_tensor_stack(td, "data_source", [sample["data_source"] for sample in samples])
- _tu.assign_non_tensor_stack(td, "reward_model", [sample["reward_model"] for sample in samples])
+ tools_kwargs_list = [dict((sample.get("extra_info") or {}).get("tools_kwargs", {})) for sample in samples]
+ prompts = tu.get_tensordict(
+ tensor_dict={
+ "raw_prompt": raw_prompts,
+ "uid": uids,
+ "data_source": [sample["data_source"] for sample in samples],
+ "reward_model": [sample["reward_model"] for sample in samples],
+ "tools_kwargs": tools_kwargs_list,
+ },
+ non_tensor_dict={"global_steps": 0},
+ )
+ return prompts, uids
- batch = DataProto(batch=td, meta_info={}).repeat(n)
- size_divisor = gateway_count
- batch_padded, pad_size = pad_dataproto_to_divisor(batch, size_divisor)
- logger.info("Starting %d agent session(s)...", len(batch_padded))
+def _install_tq_capture() -> tuple[dict[str, float], dict[str, str]]:
+ """Monkeypatch the process-local TransferQueue to capture rm_scores in-memory.
- _tq_store: dict[str, Any] = {}
+ Runner dispatch is a Ray task, but session finalize/score/TQ-writes happen
+ in this driver process, so patching ``tq`` here captures every write.
+ """
+ captured_scores: dict[str, float] = {}
+ uid_status: dict[str, str] = {}
- async def _dummy_kv_put(key, partition_id=None, tag=None, **kwargs):
- _tq_store[key] = tag
+ async def _fake_put(*, key, partition_id=None, tag=None, **kwargs):
+ if isinstance(tag, dict) and "status" in tag:
+ uid_status[str(key)] = str(tag["status"])
- async def _dummy_kv_batch_put(keys=None, fields=None, tags=None, partition_id=None, **kwargs):
+ async def _fake_batch_put(*, keys=None, fields=None, tags=None, partition_id=None, **kwargs):
+ if fields is None or keys is None or "rm_scores" not in fields:
+ return
+ rm = fields["rm_scores"] # nested tensor; rm[i] is trajectory i's response scores
for i, key in enumerate(keys):
- _tq_store[key] = {"fields": fields, "tag": tags[i] if tags else None}
+ row = rm[i]
+ captured_scores[str(key)] = float(row[-1].item()) if row.numel() else 0.0
- _tq_mock.async_kv_put = _dummy_kv_put
- _tq_mock.async_kv_batch_put = _dummy_kv_batch_put
+ tq.async_kv_put = _fake_put
+ tq.async_kv_batch_put = _fake_batch_put
+ return captured_scores, uid_status
- async def _generate():
- return await framework.generate_sequences(batch_padded.batch)
- try:
- stats = asyncio.run(_generate())
- except RuntimeError as e:
- logger.warning("generate_sequences failed: %s", e)
- stats = {}
-
- # 7. Collect scores
- uid_to_sample_idx = {uid: i for i, uid in enumerate(uids)}
- per_sample_scores = [0.0] * len(samples)
- sample_trajectory_counts = [0] * len(samples)
- for key, value in _tq_store.items():
- if not isinstance(value, dict) or "fields" not in value:
- continue
- fields = value["fields"]
- rm_scores = fields.get("rm_scores", None)
- if rm_scores is None:
- continue
- # Key format: {uid}_{session_index}_{index}
+def _report(samples, uids, captured_scores) -> dict[str, Any]:
+ uid_to_index = {uid: i for i, uid in enumerate(uids)}
+ per_sample_sum = [0.0] * len(samples)
+ per_sample_cnt = [0] * len(samples)
+ for key, score in captured_scores.items():
+ # key format: {uid}_{session_index}_{index}
uid = key.rsplit("_", 2)[0]
- sample_idx = uid_to_sample_idx.get(uid)
- if sample_idx is None:
+ idx = uid_to_index.get(uid)
+ if idx is None:
continue
- score = float(rm_scores.float()[-1, -1].item())
- per_sample_scores[sample_idx] += score
- sample_trajectory_counts[sample_idx] += 1
-
- for i in range(len(samples)):
- if sample_trajectory_counts[i] > 0:
- per_sample_scores[i] /= sample_trajectory_counts[i]
-
- resolved_count = sum(1 for s in per_sample_scores if s > 0)
- overall_mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0
+ per_sample_sum[idx] += score
+ per_sample_cnt[idx] += 1
+ per_sample_scores = [
+ per_sample_sum[i] / per_sample_cnt[i] if per_sample_cnt[i] else 0.0 for i in range(len(samples))
+ ]
+ resolved = sum(1 for s in per_sample_scores if s > 0)
+ mean = float(np.mean(per_sample_scores)) if per_sample_scores else 0.0
logger.info(
"Resolved %d / %d samples (%.2f%%), mean score: %.4f",
- resolved_count, len(samples), 100.0 * resolved_count / max(len(samples), 1), overall_mean,
+ resolved, len(samples), 100.0 * resolved / max(len(samples), 1), mean,
)
-
- # 8. Cleanup
- asyncio.run(gateway_runtime.shutdown())
-
- return {
- "stats": stats,
- "mean_score": overall_mean,
- "per_sample_scores": per_sample_scores,
- }
+ return {"resolved": resolved, "total": len(samples), "mean_score": mean, "per_sample_scores": per_sample_scores}
# =====================================================================
-# Helpers
+# Runner
# =====================================================================
-def _init_hydra_config(
+def run_inference(
*,
model_path: str,
- engine: str,
+ data_path: str,
prompt_length: int,
response_length: int,
temperature: float,
top_p: float,
n: int,
+ max_samples: int,
+ engine: str,
nnodes: int,
n_gpus_per_node: int,
tensor_parallel_size: int,
-) -> Any:
- """Initialize Hydra config with rollout/model settings."""
- from hydra import compose, initialize_config_dir
- from omegaconf import OmegaConf
+ gateway_count: int,
+ max_concurrent_sessions: int,
+ tool_image: str | None,
+ run_timeout: int,
+) -> dict[str, Any]:
+ if not ray.is_initialized():
+ ray.init()
- config_dir = os.path.abspath("examples/swe_agent_blackbox/config")
- with initialize_config_dir(config_dir=config_dir, version_base=None):
- config = compose(config_name="parallel_infer")
+ config = _load_config(
+ model_path=model_path,
+ engine=engine,
+ prompt_length=prompt_length,
+ response_length=response_length,
+ temperature=temperature,
+ top_p=top_p,
+ n=n,
+ nnodes=nnodes,
+ n_gpus_per_node=n_gpus_per_node,
+ tensor_parallel_size=tensor_parallel_size,
+ gateway_count=gateway_count,
+ max_concurrent_sessions=max_concurrent_sessions,
+ tool_image=tool_image,
+ run_timeout=run_timeout,
+ )
- config.actor_rollout_ref.model.path = os.path.expanduser(model_path)
- config.actor_rollout_ref.rollout.name = engine
- config.actor_rollout_ref.rollout.mode = "async"
- config.actor_rollout_ref.rollout.prompt_length = prompt_length
- config.actor_rollout_ref.rollout.response_length = response_length
- config.actor_rollout_ref.rollout.max_model_len = prompt_length + response_length + 1024
- config.actor_rollout_ref.rollout.n = n
- config.actor_rollout_ref.rollout.tensor_model_parallel_size = tensor_parallel_size
- config.actor_rollout_ref.rollout.gpu_memory_utilization = float(os.getenv("ROLLOUT_GPU_MEM_UTIL", "0.5"))
- config.actor_rollout_ref.rollout.temperature = temperature
- config.actor_rollout_ref.rollout.top_p = top_p
- config.actor_rollout_ref.rollout.val_kwargs.temperature = temperature
- config.actor_rollout_ref.rollout.val_kwargs.top_p = top_p
- config.actor_rollout_ref.rollout.calculate_log_probs = True
- config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns = 100
- config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1
- config.actor_rollout_ref.rollout.nnodes = nnodes
- config.actor_rollout_ref.rollout.n_gpus_per_node = n_gpus_per_node
- config.trainer.nnodes = nnodes
- config.trainer.n_gpus_per_node = n_gpus_per_node
+ samples = load_swe_dataset(data_path, max_samples=max_samples)
+ if not samples:
+ raise ValueError("No samples to process")
- config.reward.custom_reward_function.path = "pkg://examples.swe_agent_blackbox.reward"
- config.reward.custom_reward_function.name = "compute_score"
- config.reward.num_workers = 1
+ logger.info("Initializing LLM server manager...")
+ llm_server_manager = LLMServerManager.create(config=config)
+ llm_client = llm_server_manager.get_client()
- OmegaConf.set_struct(config.actor_rollout_ref.rollout, False)
- config.actor_rollout_ref.rollout.enable_sleep_mode = False
- config.actor_rollout_ref.rollout.enforce_eager = os.getenv("ROLLOUT_ENFORCE_EAGER", "0") == "1"
- OmegaConf.set_struct(config.actor_rollout_ref.rollout, True)
- return config
+ gateway_manager = build_gateway_manager(config=config, llm_client=llm_client)
+ reward_worker = ray.remote(RewardLoopWorker).remote(config, None)
+ framework = build_agent_framework(
+ config=config,
+ gateway_manager=gateway_manager,
+ reward_loop_worker_handles=[reward_worker],
+ )
+
+ prompts, uids = _build_prompts(samples)
+ captured_scores, _uid_status = _install_tq_capture()
+
+ logger.info("Starting %d sample(s), %d session(s) each...", len(samples), n)
+ try:
+ asyncio.run(framework.generate_sequences(prompts))
+ except RuntimeError as exc:
+ logger.warning("generate_sequences failed: %s", exc)
+
+ if not captured_scores:
+ logger.warning(
+ "No trajectory scores captured — all rollouts may have failed (see the "
+ "generate_sequences summary above), or the TransferQueue monkeypatch did not "
+ "reach the writer; resolve rate will be reported as 0."
+ )
+
+ result = _report(samples, uids, captured_scores)
+
+ asyncio.run(gateway_manager.shutdown())
+ return result
# =====================================================================
-# CLI entry point
+# CLI
# =====================================================================
def main():
- parser = argparse.ArgumentParser(description="SWE-Agent Blackbox Parallel Inference")
+ parser = argparse.ArgumentParser(description="Blackbox mini-swe-agent standalone inference")
+ parser.add_argument("--model-path", "--model", type=str, default="~/models/Qwen3.5-9B")
parser.add_argument("--data-path", type=str, default="~/data/swe_agent/swe_bench_verified.parquet")
- parser.add_argument("--model-path", "--model", type=str, default="~/models/Qwen3-Coder-30B-A3B-Instruct")
- parser.add_argument("--max-turns", type=int, default=100)
+ parser.add_argument("--max-samples", type=int, default=-1)
parser.add_argument("--prompt-length", type=int, default=4096)
- parser.add_argument("--response-length", type=int, default=65536)
- parser.add_argument("--temperature", type=float, default=0.8)
- parser.add_argument("--top-p", type=float, default=0.9)
+ parser.add_argument("--response-length", type=int, default=131072)
+ parser.add_argument("--temperature", type=float, default=1.0)
+ parser.add_argument("--top-p", type=float, default=1.0)
parser.add_argument("--n", type=int, default=1)
- parser.add_argument("--max-samples", type=int, default=-1)
parser.add_argument("--engine", type=str, default="vllm", choices=["vllm", "sglang"])
+ parser.add_argument("--tensor-parallel-size", "--tp", type=int, default=4)
parser.add_argument("--nnodes", type=int, default=1)
parser.add_argument("--n-gpus-per-node", type=int, default=8)
- parser.add_argument("--tensor-parallel-size", "--tp", type=int, default=4)
parser.add_argument("--gateway-count", type=int, default=1)
- parser.add_argument("--max-concurrent-sessions", type=int, default=2)
- parser.add_argument("--tool-parser", type=str, default="qwen3_coder")
- parser.add_argument("--tool-image", type=str, default=None)
+ parser.add_argument("--max-concurrent-sessions", type=int, default=8)
+ parser.add_argument("--tool-image", type=str, default=_DEFAULT_TOOL_IMAGE)
parser.add_argument("--run-timeout", type=int, default=7200)
- parser.add_argument(
- "--runner", type=str, default="uniagent", choices=["uniagent", "mini_swe", "claude_code"],
- help="Agent runner: 'uniagent', 'mini_swe', or 'claude_code'.",
- )
- parser.add_argument(
- "--agent-config-path", type=str,
- default="examples/swe_agent_blackbox/config/agent_config.yaml",
- help="Path to agent config YAML.",
- )
+ parser.add_argument("--max-turns", type=int, default=100)
args = parser.parse_args()
- os.environ["SWE_AGENT_MAX_TURNS"] = str(args.max_turns)
+ # Set before ray.init so runner Ray tasks inherit it.
+ os.environ["AGENT_MAX_TURNS"] = str(args.max_turns)
run_inference(
model_path=args.model_path,
@@ -435,9 +370,6 @@ def main():
tensor_parallel_size=args.tensor_parallel_size,
gateway_count=args.gateway_count,
max_concurrent_sessions=args.max_concurrent_sessions,
- tool_parser=args.tool_parser,
- agent_config_path=args.agent_config_path,
- runner=args.runner,
tool_image=args.tool_image,
run_timeout=args.run_timeout,
)
diff --git a/examples/blackbox_recipes/mini_swe_agent/reward.py b/examples/blackbox_recipes/mini_swe_agent/reward.py
index 61da218b..267cfea5 100644
--- a/examples/blackbox_recipes/mini_swe_agent/reward.py
+++ b/examples/blackbox_recipes/mini_swe_agent/reward.py
@@ -27,7 +27,7 @@ def build_reward_context(tools_kwargs: dict) -> tuple[dict[str, Any], int]:
def compute_score(data_source: str, solution_str: str, ground_truth: str, extra_info=None) -> dict:
- """Read reward_score from extra_info, injected by SWEAgentFramework."""
+ """Read reward_score from extra_info, injected by the agent runner."""
score = 0.0
if extra_info and "reward_score" in extra_info:
score = float(extra_info["reward_score"])
diff --git a/examples/blackbox_recipes/mini_swe_agent/run_agent.py b/examples/blackbox_recipes/mini_swe_agent/run_agent.py
index c5a4b165..68406803 100644
--- a/examples/blackbox_recipes/mini_swe_agent/run_agent.py
+++ b/examples/blackbox_recipes/mini_swe_agent/run_agent.py
@@ -3,7 +3,7 @@
Input: task config JSON from **stdin**
- task: str — the issue description for the agent to solve
- - gateway_url: str — LLM gateway endpoint (tunnel URL for OpenYuanRong sandbox)
+ - gateway_url: str — LLM gateway endpoint (tunnel URL for remote sandbox)
- agent: dict — agent config (e.g. step_limit)
Output: agent result JSON to **stdout**, or error JSON on failure
@@ -12,7 +12,6 @@
from __future__ import annotations
import json
-import os
import sys
DEFAULT_ACTION_TIMEOUT = 600
@@ -45,8 +44,15 @@ def main() -> None:
env_cfg["timeout"] = DEFAULT_ACTION_TIMEOUT
env_cfg.setdefault("env", {})
env_cfg["env"].setdefault("GIT_PAGER", "cat")
- for key in ("image", "container_timeout", "run_args", "executable", "pull_timeout",
- "forward_env", "interpreter"):
+ for key in (
+ "image",
+ "container_timeout",
+ "run_args",
+ "executable",
+ "pull_timeout",
+ "forward_env",
+ "interpreter",
+ ):
env_cfg.pop(key, None)
env = LocalEnvironment(**env_cfg)
@@ -57,15 +63,17 @@ def main() -> None:
model_defaults.pop("model_name", None)
model_defaults.pop("model_kwargs", None)
model_cfg = model_defaults
- model_cfg.update({
- "model_name": "openai/default",
- "model_kwargs": {
- "api_base": gateway_url,
- "api_key": "not-needed",
- "drop_params": True,
- },
- "cost_tracking": "ignore_errors",
- })
+ model_cfg.update(
+ {
+ "model_name": "openai/default",
+ "model_kwargs": {
+ "api_base": gateway_url,
+ "api_key": "not-needed",
+ "drop_params": True,
+ },
+ "cost_tracking": "ignore_errors",
+ }
+ )
model = LitellmModel(**model_cfg)
# 5. Create DefaultAgent
diff --git a/examples/blackbox_recipes/mini_swe_agent/run_infer.sh b/examples/blackbox_recipes/mini_swe_agent/run_infer.sh
new file mode 100755
index 00000000..2dfa997d
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/run_infer.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# Standalone inference for the blackbox mini-swe-agent recipe.
+# Runs rollout + reward only (no Megatron trainer) and reports resolve rate.
+#
+# Usage:
+# bash examples/blackbox_recipes/mini_swe_agent/run_infer.sh
+#
+# All configurable via environment variables (see defaults below).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../.." && pwd)}"
+cd "${REPO_ROOT}"
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}"
+DATA_PATH="${DATA_PATH:-${HOME}/data/swe_agent/swe_bench_verified.parquet}"
+
+# ── Inference parameters ─────────────────────────────────────────────────
+MAX_SAMPLES="${MAX_SAMPLES:--1}"
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+TOP_P="${TOP_P:-1.0}"
+N="${N:-1}"
+ENGINE="${ENGINE:-vllm}"
+TP="${TP:-4}"
+NNODES="${NNODES:-1}"
+N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}"
+GATEWAY_COUNT="${GATEWAY_COUNT:-1}"
+MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-8}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+AGENT_MAX_TURNS="${AGENT_MAX_TURNS:-100}"
+SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+
+# ── AKernel (remote sandbox) ─────────────────────────────────────────────
+export AKERNEL_SERVER_ADDRESS="${AKERNEL_SERVER_ADDRESS:-}"
+export AKERNEL_TOKEN="${AKERNEL_TOKEN:-}"
+export AKERNEL_TUNNEL_SSL_VERIFY="${AKERNEL_TUNNEL_SSL_VERIFY:-0}"
+
+# ── Logging & env ────────────────────────────────────────────────────────
+export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
+export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
+export AGENT_MAX_TURNS
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+export PYTHONPATH="${REPO_ROOT}:${REPO_ROOT}/verl:${PYTHONPATH:-}"
+
+echo "=== Mini-SWE-Agent Blackbox Inference ==="
+echo "Model: ${MODEL_PATH}"
+echo "Data: ${DATA_PATH}"
+echo "Max samples: ${MAX_SAMPLES}"
+echo "Engine: ${ENGINE} (TP=${TP})"
+echo "Tool image: ${SWE_AGENT_TOOL_IMAGE}"
+echo "Batch: n=${N}, gateway=${GATEWAY_COUNT}, max_sessions=${MAX_CONCURRENT_SESSIONS}"
+if [[ -n "${GATEWAY_MESSAGE_JSONL_PATH:-}" ]]; then  # optional var; ':-' avoids an unbound-variable abort under 'set -u'
+ echo "Messages: ${GATEWAY_MESSAGE_JSONL_PATH}"
+fi
+echo "========================================="
+
+python examples/blackbox_recipes/mini_swe_agent/parallel_infer.py \
+ --model-path "${MODEL_PATH}" \
+ --data-path "${DATA_PATH}" \
+ --max-samples "${MAX_SAMPLES}" \
+ --prompt-length "${PROMPT_LENGTH}" \
+ --response-length "${RESPONSE_LENGTH}" \
+ --temperature "${TEMPERATURE}" \
+ --top-p "${TOP_P}" \
+ --n "${N}" \
+ --engine "${ENGINE}" \
+ --tensor-parallel-size "${TP}" \
+ --nnodes "${NNODES}" \
+ --n-gpus-per-node "${N_GPUS_PER_NODE}" \
+ --gateway-count "${GATEWAY_COUNT}" \
+ --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \
+ --tool-image "${SWE_AGENT_TOOL_IMAGE}" \
+ --run-timeout "${SWE_AGENT_RUN_TIMEOUT}" \
+ --max-turns "${AGENT_MAX_TURNS}"
diff --git a/examples/blackbox_recipes/mini_swe_agent/run_train.sh b/examples/blackbox_recipes/mini_swe_agent/run_train.sh
new file mode 100755
index 00000000..75a042ba
--- /dev/null
+++ b/examples/blackbox_recipes/mini_swe_agent/run_train.sh
@@ -0,0 +1,300 @@
+#!/usr/bin/env bash
+# Megatron + V1 async training for the blackbox mini-swe recipe.
+#
+# Uses verl.trainer.main_ppo with the V1 unified trainer. The default mode is
+# separate_async, which uses separate trainer and rollout GPU pools.
+#
+# Usage:
+# bash examples/blackbox_recipes/mini_swe_agent/run_train.sh
+#
+# All configurable via environment variables (see defaults below).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../.." && pwd)}"
+cd "${REPO_ROOT}"
+
+# ── Model & data ─────────────────────────────────────────────────────────
+MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}"
+TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}"
+VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}"
+RUNTIME_ENV="${RUNTIME_ENV:-}"
+
+# ── V1 trainer ───────────────────────────────────────────────────────────
+TRAINER_MODE="${TRAINER_MODE:-separate_async}"
+NUM_WARMUP_BATCHES="${NUM_WARMUP_BATCHES:-1}"
+SEPARATE_NUM_WARMUP_BATCHES="${SEPARATE_NUM_WARMUP_BATCHES:-${NUM_WARMUP_BATCHES}}"
+PARAMETER_SYNC_STEP="${PARAMETER_SYNC_STEP:-4}"
+RAY_SUBMIT_MODE="${RAY_SUBMIT_MODE:-job}"
+RAY_INIT_ADDRESS="${RAY_INIT_ADDRESS:-auto}"
+RAY_STATUS_TIMEOUT="${RAY_STATUS_TIMEOUT:-5}"
+CONFIG_NAME="${CONFIG_NAME:-swe_agent_blackbox_megatron_v1}"
+
+# ── Hardware ─────────────────────────────────────────────────────────────
+NNODES="${NNODES:-${NNODES_TRAIN:-1}}"
+PHYSICAL_GPUS_PER_NODE="${PHYSICAL_GPUS_PER_NODE:-8}"
+if [[ "${TRAINER_MODE}" == "separate_async" ]]; then
+ N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-${TRAIN_NGPUS_PER_NODE:-4}}"
+ ROLLOUT_NNODES="${ROLLOUT_NNODES:-${NNODES_ROLLOUT:-${NNODES}}}"
+ ROLLOUT_NGPUS_PER_NODE="${ROLLOUT_NGPUS_PER_NODE:-${NGPUS_PER_NODE_ROLLOUT:-4}}"
+else
+ N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-${TRAIN_NGPUS_PER_NODE:-${PHYSICAL_GPUS_PER_NODE}}}"
+ ROLLOUT_NNODES="${ROLLOUT_NNODES:-${NNODES_ROLLOUT:-0}}"
+ ROLLOUT_NGPUS_PER_NODE="${ROLLOUT_NGPUS_PER_NODE:-${NGPUS_PER_NODE_ROLLOUT:-${N_GPUS_PER_NODE}}}"
+fi
+
+# ── Algorithm ────────────────────────────────────────────────────────────
+CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}"
+CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}"
+ACTOR_LR="${ACTOR_LR:-1e-6}"
+
+# ── Sequence lengths ─────────────────────────────────────────────────────
+PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
+RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
+MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH))
+
+# ── Rollout parameters ───────────────────────────────────────────────────
+ENGINE="${ENGINE:-vllm}"
+if [[ "${TRAINER_MODE}" == "separate_async" ]]; then
+ GEN_TP="${GEN_TP:-${TP:-${ROLLOUT_NGPUS_PER_NODE}}}"
+else
+ GEN_TP="${GEN_TP:-${TP:-2}}"
+fi
+N="${N:-8}"
+TEMPERATURE="${TEMPERATURE:-1.0}"
+TOP_P="${TOP_P:-1.0}"
+TOP_K="${TOP_K:--1}"
+ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
+UPDATE_WEIGHTS_BUCKET_MB="${UPDATE_WEIGHTS_BUCKET_MB:-2048}"
+
+# ── Megatron training parallelism ────────────────────────────────────────
+if [[ "${TRAINER_MODE}" == "separate_async" ]]; then
+ TRAIN_TP="${TRAIN_TP:-${TP:-${N_GPUS_PER_NODE}}}"
+else
+ TRAIN_TP="${TRAIN_TP:-${TP:-8}}"
+fi
+TRAIN_PP="${TRAIN_PP:-1}"
+TRAIN_CP="${TRAIN_CP:-1}"
+OFFLOAD="${OFFLOAD:-True}"
+OPTIMIZER_OFFLOAD_FRACTION="${OPTIMIZER_OFFLOAD_FRACTION:-${OFFLOAD_FRACTION:-1.0}}"  # OFFLOAD_FRACTION is still honoured as a fallback name
+USE_MBRIDGE="${USE_MBRIDGE:-True}"
+PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}"
+
+# ── Agent parameters ─────────────────────────────────────────────────────
+# AGENT_MAX_TURNS is the agent's turn budget inside the sandbox: it becomes the
+# mini-swe-agent step_limit (read by the runner via the AGENT_MAX_TURNS env var).
+# Note: the trainer's multi_turn.max_assistant_turns is NOT enforced on the
+# blackbox rollout path (AgentFrameworkRolloutAdapter), so it is not exposed here.
+RUNNER="${RUNNER:-mini_swe}"
+AGENT_MAX_TURNS="${AGENT_MAX_TURNS:-100}"
+if [[ "${RUNNER}" == "mini_swe" ]]; then
+ AGENT_RUNNER_FQN="examples.blackbox_recipes.mini_swe_agent.mini_swe_agent_runner.mini_swe_agent_runner"
+ SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
+else
+ echo "Unknown RUNNER=${RUNNER}; this recipe currently supports mini_swe only" >&2
+ exit 1
+fi
+SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
+CONDA_ENV="${CONDA_ENV:-testbed}"
+GATEWAY_COUNT="${GATEWAY_COUNT:-1}"
+MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-32}"
+NUM_AGENT_WORKERS="${NUM_AGENT_WORKERS:-8}"
+RUNNER_ARGS=(
+ "actor_rollout_ref.rollout.agent.agent_loop_manager_class=uni_agent.framework.entry.AgentFrameworkRolloutAdapter"
+ "actor_rollout_ref.rollout.custom.agent_framework.gateway_count=${GATEWAY_COUNT}"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_fqn=${AGENT_RUNNER_FQN}"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.dispatch_mode=ray_task"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.max_concurrent_sessions=${MAX_CONCURRENT_SESSIONS}"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
+ "actor_rollout_ref.rollout.custom.agent_framework.agent_runners.swe_agent.runner_kwargs.conda_env=${CONDA_ENV}"
+)
+
+# ── AKernel (remote sandbox) ─────────────────────────────────────────────
+AKERNEL_SERVER_ADDRESS="${AKERNEL_SERVER_ADDRESS:-}"
+AKERNEL_TOKEN="${AKERNEL_TOKEN:-}"
+AKERNEL_TUNNEL_SSL_VERIFY="${AKERNEL_TUNNEL_SSL_VERIFY:-0}"
+
+# ── Logging & checkpointing ──────────────────────────────────────────────
+PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
+EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
+SAVE_FREQ="${SAVE_FREQ:-10}"
+TEST_FREQ="${TEST_FREQ:-10}"
+TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}"
+TOTAL_TRAINING_STEPS="${TOTAL_TRAINING_STEPS:-}"
+VAL_BEFORE_TRAIN="${VAL_BEFORE_TRAIN:-true}"
+CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}"
+TRAIN_MAX_SAMPLES="${TRAIN_MAX_SAMPLES:-${MAX_SAMPLES:--1}}"
+VAL_MAX_SAMPLES="${VAL_MAX_SAMPLES:-${MAX_SAMPLES:--1}}"
+TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-${PPO_MINI_BATCH_SIZE}}"
+VAL_BATCH_SIZE="${VAL_BATCH_SIZE:-${TRAIN_BATCH_SIZE}}"
+
+export AGENT_MAX_TURNS
+export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
+export SWE_AGENT_TOOL_IMAGE
+export SWE_AGENT_RUN_TIMEOUT
+export CONDA_ENV
+export GATEWAY_COUNT
+export AKERNEL_SERVER_ADDRESS
+export AKERNEL_TOKEN
+export AKERNEL_TUNNEL_SSL_VERIFY
+export PYTHONPATH="${REPO_ROOT}:${REPO_ROOT}/verl:${PYTHONPATH:-}"
+
+echo "=== SWE-Agent Blackbox Megatron Async Training ==="
+echo "Model: ${MODEL_PATH}"
+echo "Train data: ${TRAIN_DATA}"
+echo "Val data: ${VAL_DATA}"
+echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})"
+echo "Runner: ${RUNNER}"
+echo "Turns: agent_max_turns=${AGENT_MAX_TURNS}"
+echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}"
+echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}"
+echo "Trainer: V1 ${TRAINER_MODE}"
+if [[ "${TRAINER_MODE}" == "separate_async" ]]; then
+ echo "Resources: trainer=${NNODES}x${N_GPUS_PER_NODE}, rollout=${ROLLOUT_NNODES}x${ROLLOUT_NGPUS_PER_NODE}"
+else
+ echo "Resources: colocated=${NNODES}x${N_GPUS_PER_NODE}"
+fi
+echo "Samples: train_max=${TRAIN_MAX_SAMPLES}, val_max=${VAL_MAX_SAMPLES}"
+echo "==================================================="
+
+# ── Compute derived parameters ───────────────────────────────────────────
+ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
+INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
+
+RUNTIME_ENV_ARGS=()
+if [ -n "${RUNTIME_ENV}" ]; then
+ RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}")
+else
+ RUNTIME_ENV_JSON="$(
+ python3 - <<'PY'
+import json
+import os
+
+env_vars = {
+ key: value
+ for key in (
+ "PYTHONPATH",
+ "AKERNEL_SERVER_ADDRESS",
+ "AKERNEL_TOKEN",
+ "AKERNEL_TUNNEL_SSL_VERIFY",
+ "AGENT_MAX_TURNS",
+ "SWE_AGENT_EVAL_TIMEOUT",
+ "SWE_AGENT_TOOL_IMAGE",
+ "SWE_AGENT_RUN_TIMEOUT",
+ "CONDA_ENV",
+ "GATEWAY_COUNT",
+ )
+ if (value := os.environ.get(key)) is not None
+}
+env_vars.setdefault("TRANSFER_QUEUE_ENABLE", "")
+env_vars.setdefault("NCCL_P2P_DISABLE", "1")
+env_vars.setdefault("NCCL_SHM_DISABLE", "1")
+print(json.dumps({"env_vars": env_vars}))
+PY
+ )"
+ RUNTIME_ENV_ARGS=(--runtime-env-json "${RUNTIME_ENV_JSON}")
+fi
+
+# ── Ensure Ray is running ────────────────────────────────────────────────
+if [[ "${TRAINER_MODE}" == "separate_async" ]]; then
+ TOTAL_GPUS=$(( NNODES * N_GPUS_PER_NODE + ROLLOUT_NNODES * ROLLOUT_NGPUS_PER_NODE ))
+else
+ TOTAL_GPUS=$(( NNODES * N_GPUS_PER_NODE ))
+fi
+if ! timeout "${RAY_STATUS_TIMEOUT}" ray status &>/dev/null; then
+ echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..."
+ ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats
+else
+ echo "Ray cluster already running."
+fi
+
+# ── Launch ────────────────────────────────────────────────────────────────
+WORKING_DIR="${WORKING_DIR:-$(pwd)}"
+
+MAIN_CMD=(
+ python3 -m verl.trainer.main_ppo
+ --config-name="${CONFIG_NAME}" \
+ --config-path="${REPO_ROOT}/examples/blackbox_recipes/mini_swe_agent/config" \
+ hydra.searchpath=[pkg://verl.trainer.config] \
+ +ray_kwargs.ray_init.address="${RAY_INIT_ADDRESS}" \
+ trainer.use_v1=True \
+ trainer.v1.trainer_mode="${TRAINER_MODE}" \
+ trainer.v1.colocate_async.num_warmup_batches=${NUM_WARMUP_BATCHES} \
+ trainer.v1.separate_async.num_warmup_batches=${SEPARATE_NUM_WARMUP_BATCHES} \
+ trainer.v1.separate_async.parameter_sync_step=${PARAMETER_SYNC_STEP} \
+ transfer_queue.enable=True \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ data.train_files="['${TRAIN_DATA}']" \
+ data.val_files="['${VAL_DATA}']" \
+ data.train_max_samples=${TRAIN_MAX_SAMPLES} \
+ data.val_max_samples=${VAL_MAX_SAMPLES} \
+ data.train_batch_size=${TRAIN_BATCH_SIZE} \
+ data.val_batch_size=${VAL_BATCH_SIZE} \
+ data.max_prompt_length=${PROMPT_LENGTH} \
+ data.max_response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.n=${N} \
+ actor_rollout_ref.rollout.name=${ENGINE} \
+ actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
+ actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
+ actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \
+ actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \
+ actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
+ actor_rollout_ref.rollout.top_p=${TOP_P} \
+ actor_rollout_ref.rollout.top_k=${TOP_K} \
+ actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=${UPDATE_WEIGHTS_BUCKET_MB} \
+ actor_rollout_ref.rollout.nnodes=${ROLLOUT_NNODES} \
+ actor_rollout_ref.rollout.n_gpus_per_node=${ROLLOUT_NGPUS_PER_NODE} \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
+ actor_rollout_ref.rollout.agent.num_workers=${NUM_AGENT_WORKERS} \
+ "${RUNNER_ARGS[@]}" \
+ actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \
+ actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \
+ actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
+ actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \
+ actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
+ actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
+ actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
+ trainer.project_name="${PROJECT_NAME}" \
+ trainer.experiment_name="${EXPERIMENT_NAME}" \
+ trainer.total_epochs=${TOTAL_EPOCHS} \
+ trainer.val_before_train=${VAL_BEFORE_TRAIN} \
+ trainer.save_freq=${SAVE_FREQ} \
+ trainer.test_freq=${TEST_FREQ} \
+ trainer.default_local_dir="${CKPTS_DIR}" \
+ trainer.nnodes=${NNODES} \
+ trainer.n_gpus_per_node=${N_GPUS_PER_NODE} \
+ "$@"
+)
+
+if [[ -n "${TOTAL_TRAINING_STEPS}" ]]; then
+ MAIN_CMD+=(trainer.total_training_steps=${TOTAL_TRAINING_STEPS})
+fi
+
+if [[ "${RAY_SUBMIT_MODE}" == "job" ]]; then
+ ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" -- "${MAIN_CMD[@]}"
+elif [[ "${RAY_SUBMIT_MODE}" == "local" ]]; then
+ "${MAIN_CMD[@]}"
+else
+ echo "Unknown RAY_SUBMIT_MODE=${RAY_SUBMIT_MODE}; expected job or local" >&2
+ exit 1
+fi
diff --git a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py b/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
deleted file mode 100644
index b03dc7d7..00000000
--- a/examples/blackbox_recipes/mini_swe_agent/subprocess_runner.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""Ray-based subprocess runner for agent_runner execution.
-
-Launches agent_runner in a separate Ray worker process to prevent blocking
-operations (sleep, sync I/O, etc.) from stalling the framework's event loop.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import logging
-from typing import Any
-
-import ray
-
-from uni_agent.trainer.framework.types import SessionHandle
-
-logger = logging.getLogger(__name__)
-
-
-class _StubSessionRuntime:
- """Captures reward_info from agent_runner's complete_session call."""
-
- def __init__(self):
- self.reward_info: dict[str, Any] | None = None
-
- async def complete_session(self, session_id: str, reward_info: dict[str, Any] | None = None):
- self.reward_info = reward_info
-
-
-@ray.remote(num_cpus=0)
-def remote_agent_run(
- agent_runner_fqn: str,
- raw_prompt,
- session_id: str,
- base_url: str,
- sample_index: int,
- runner_kwargs: dict,
-) -> dict[str, Any] | None:
- """Run agent_runner in a dedicated Ray worker process."""
- from verl.utils.import_utils import load_class_from_fqn
-
- agent_runner = load_class_from_fqn(agent_runner_fqn)
- stub_runtime = _StubSessionRuntime()
- handle = SessionHandle(session_id=session_id, base_url=base_url)
-
- async def _run():
- try:
- await agent_runner(
- raw_prompt=raw_prompt,
- session=handle,
- sample_index=sample_index,
- session_runtime=stub_runtime,
- **runner_kwargs,
- )
- return stub_runtime.reward_info
- except Exception as e:
- logger.error("remote_agent_run failed: session_id=%s, sample=%d, error=%s",
- session_id, sample_index, e, exc_info=True)
- raise
-
- return asyncio.run(_run())
diff --git a/examples/blackbox_recipes/sandbox/sandbox.py b/examples/blackbox_recipes/sandbox_client.py
similarity index 66%
rename from examples/blackbox_recipes/sandbox/sandbox.py
rename to examples/blackbox_recipes/sandbox_client.py
index e6e46a8d..631c8722 100644
--- a/examples/blackbox_recipes/sandbox/sandbox.py
+++ b/examples/blackbox_recipes/sandbox_client.py
@@ -1,5 +1,7 @@
-"""OpenYuanRong (AKernel) remote sandbox command execution.
+"""AKernel remote sandbox command execution.
+AKernel is an agent sandbox infra collaboratively developed by the
+OpenYuanrong team and the Ant AKernel team.
Uses ``akernel_sdk.Sandbox`` with sidecar ``Mount`` to inject the
mini-swe-agent tool image. Supports upstream tunnel so the agent
inside the sandbox can reach the gateway via ``http://127.0.0.1:``.
@@ -10,6 +12,7 @@
import asyncio
import logging
import os
+import uuid
from dataclasses import dataclass
from typing import Any
from urllib.parse import urlparse
@@ -23,24 +26,31 @@ class CommandResult:
stderr: str
exit_code: int
+
logger = logging.getLogger(__name__)
DEFAULT_PROXY_PORT = 38197
def _configure_akernel_env() -> None:
- """Map OPENYUANRONG_* env vars to AKERNEL_* before importing akernel_sdk."""
- server = os.getenv("OPENYUANRONG_SERVER_ADDRESS")
- token = os.getenv("OPENYUANRONG_TOKEN")
- tunnel_ssl_verify = os.getenv("OPENYUANRONG_TUNNEL_SSL_VERIFY", "0")
+ """Validate AKernel credentials and map the tunnel SSL flag for akernel_sdk.
+
+ ``akernel_sdk`` reads ``AKERNEL_SERVER_ADDRESS`` / ``AKERNEL_TOKEN`` directly,
+ so only the tunnel SSL flag needs to be translated to ``TUNNEL_SSL_VERIFY``.
+ """
+ server = os.getenv("AKERNEL_SERVER_ADDRESS")
+ token = os.getenv("AKERNEL_TOKEN")
if not server or not token:
- raise ValueError(
- "OPENYUANRONG_SERVER_ADDRESS and OPENYUANRONG_TOKEN "
- "environment variables must be set for YR sandbox"
- )
- os.environ["AKERNEL_SERVER_ADDRESS"] = server
- os.environ["AKERNEL_TOKEN"] = token
- os.environ["TUNNEL_SSL_VERIFY"] = tunnel_ssl_verify
+ raise ValueError("AKERNEL_SERVER_ADDRESS and AKERNEL_TOKEN environment variables must be set for sandbox")
+ os.environ["TUNNEL_SSL_VERIFY"] = os.getenv("AKERNEL_TUNNEL_SSL_VERIFY", "0")
+
+
+def _resolve_sandbox_name() -> str | None:
+ """Return ``{prefix}{random}`` when ``SANDBOX_NAME_PREFIX`` env is set."""
+ prefix = os.getenv("SANDBOX_NAME_PREFIX")
+ if not prefix:
+ return None
+ return f"{prefix}{uuid.uuid4().hex[:8]}"
def extract_upstream(gateway_url: str) -> str:
@@ -71,8 +81,8 @@ def rewrite_gateway_url(
return f"http://127.0.0.1:{proxy_port}{path}"
-class YRSandbox:
- """Command execution via OpenYuanRong (AKernel) remote sandbox."""
+class SandboxClient:
+ """Command execution via remote sandbox."""
def __init__(self, sandbox: Any) -> None:
self._sandbox = sandbox
@@ -81,7 +91,6 @@ def __init__(self, sandbox: Any) -> None:
def sandbox_id(self) -> str:
return getattr(self._sandbox, "sandbox_id", "unknown")
-
@classmethod
async def create(
cls,
@@ -97,10 +106,10 @@ async def create(
mem_limit: int = 8192,
idle_timeout: int = 7200,
sidecar_target: str = "/opt/mini-swe-agent",
- max_retries: int = 5,
+ max_retries: int = 10,
**sandbox_kwargs: Any,
- ) -> "YRSandbox":
- """Create an OpenYuanRong sandbox with sidecar tool mounted.
+ ) -> SandboxClient:
+ """Create a sandbox client with sidecar tool mounted.
The sidecar image is mounted at ``sidecar_target`` inside the
sandbox via ``akernel_sdk.Mount``.
@@ -127,25 +136,35 @@ async def create(
sb_kwargs["proxy_port"] = proxy_port
if env:
sb_kwargs["env"] = env
+ name = _resolve_sandbox_name()
+ if name is not None:
+ sb_kwargs["name"] = name
sb_kwargs.update(sandbox_kwargs)
logger.info(
- "Creating YR sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s)",
- image, cpu, memory, sidecar_image, sidecar_target, upstream or "none",
+ "Creating sandbox (image=%s, cpu=%d, memory=%d, sidecar=%s:%s, upstream=%s, name=%s)",
+ image,
+ cpu,
+ memory,
+ sidecar_image,
+ sidecar_target,
+ upstream or "none",
+ name or "auto",
)
last_error: Exception | None = None
for retry in range(max_retries):
sandbox = None
try:
sandbox = await asyncio.to_thread(lambda: Sandbox(**sb_kwargs))
- logger.info("YR sandbox created: %s", getattr(sandbox, "sandbox_id", "?"))
+ logger.info("sandbox created: %s", getattr(sandbox, "sandbox_id", "?"))
return cls(sandbox=sandbox)
except Exception as exc:
last_error = exc
sandbox_id = getattr(sandbox, "sandbox_id", None)
logger.critical(
- "Failed to create YR sandbox (sandbox_id=%s): %s",
- sandbox_id or "n/a", exc,
+ "Failed to create sandbox (sandbox_id=%s): %s",
+ sandbox_id or "n/a",
+ exc,
)
if sandbox is not None:
try:
@@ -153,17 +172,19 @@ async def create(
except Exception:
pass
if retry < max_retries - 1:
- sleep_time = min(30, 2 ** retry)
- logger.info("Retrying YR sandbox creation in %d seconds...", sleep_time)
+ sleep_time = min(30, 2**retry)
+ logger.info("Retrying sandbox creation in %d seconds...", sleep_time)
await asyncio.sleep(sleep_time)
- raise RuntimeError(f"Failed to create YR sandbox after {max_retries} retries") from last_error
+ raise RuntimeError(f"Failed to create sandbox after {max_retries} retries") from last_error
async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult:
- """Execute *cmd* inside the OpenYuanRong sandbox via ``sandbox.commands.run``."""
+ """Execute *cmd* inside the sandbox via ``sandbox.commands.run``."""
try:
result = await asyncio.to_thread(
- self._sandbox.commands.run, cmd, timeout=timeout,
+ self._sandbox.commands.run,
+ cmd,
+ timeout=timeout,
)
return CommandResult(
stdout=getattr(result, "stdout", ""),
@@ -174,15 +195,15 @@ async def run(self, cmd: str, *, timeout: int = 600) -> CommandResult:
return CommandResult(stdout="", stderr=str(e), exit_code=-1)
async def cleanup(self) -> None:
- """Kill the OpenYuanRong sandbox if still running."""
+ """Kill the sandbox if still running."""
if self._sandbox is not None:
sandbox_id = getattr(self._sandbox, "sandbox_id", "?")
try:
if self._sandbox.is_running():
await asyncio.to_thread(self._sandbox.kill)
- logger.info("YR sandbox %s killed", sandbox_id)
+ logger.info("sandbox %s killed", sandbox_id)
else:
- logger.info("YR sandbox %s already stopped", sandbox_id)
+ logger.info("sandbox %s already stopped", sandbox_id)
except Exception as e:
- logger.warning("Failed to kill YR sandbox %s: %s", sandbox_id, e)
+ logger.warning("Failed to kill sandbox %s: %s", sandbox_id, e)
self._sandbox = None
diff --git a/examples/blackbox_recipes/scripts/build_tool.sh b/examples/blackbox_recipes/scripts/build_tool.sh
deleted file mode 100755
index e5158629..00000000
--- a/examples/blackbox_recipes/scripts/build_tool.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env bash
-# Build a SWE blackbox sidecar tool image.
-#
-# Usage:
-# bash examples/swe_agent_blackbox/build_tool.sh
-# bash examples/swe_agent_blackbox/build_tool.sh --tool claude_code
-# bash examples/swe_agent_blackbox/build_tool.sh --pip-index https://pypi.tuna.tsinghua.edu.cn/simple/
-# bash examples/swe_agent_blackbox/build_tool.sh --npm-registry https://registry.npmmirror.com
-# bash examples/swe_agent_blackbox/build_tool.sh --tool-version latest
-# bash examples/swe_agent_blackbox/build_tool.sh --registry reg.antgroup-inc.cn/myrepo
-#
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-TOOL_KIND="${TOOL_KIND:-mini_swe}"
-IMAGE_TAG="${TOOL_TAG:-latest}"
-TOOL_VERSION="${TOOL_VERSION:-latest}"
-
-# Parse args
-REGISTRY=""
-PIP_INDEX_URL="${PIP_INDEX_URL:-}"
-NPM_REGISTRY="${NPM_REGISTRY:-}"
-while [[ $# -gt 0 ]]; do
- case "$1" in
- --tool) TOOL_KIND="$2"; shift 2 ;;
- --registry) REGISTRY="$2"; shift 2 ;;
- --pip-index) PIP_INDEX_URL="$2"; shift 2 ;;
- --npm-registry) NPM_REGISTRY="$2"; shift 2 ;;
- --tool-version) TOOL_VERSION="$2"; shift 2 ;;
- *) echo "Unknown arg: $1"; exit 1 ;;
- esac
-done
-
-BUILD_ARGS=()
-DOCKERFILE="${SCRIPT_DIR}/Dockerfile.mini-swe-agent-tool"
-if [[ "${TOOL_KIND}" == "claude" ]]; then
- TOOL_KIND="claude_code"
-fi
-if [[ "${TOOL_KIND}" == "claude_code" ]]; then
- IMAGE_NAME="${TOOL_IMAGE:-claude-code-tool}"
- DOCKERFILE="${SCRIPT_DIR}/Dockerfile.claude-code-tool"
- BUILD_ARGS+=(--build-arg "TOOL_VERSION=${TOOL_VERSION}")
- if [[ -n "${NPM_REGISTRY}" ]]; then
- BUILD_ARGS+=(--build-arg "NPM_REGISTRY=${NPM_REGISTRY}")
- fi
-elif [[ "${TOOL_KIND}" == "mini_swe" ]]; then
- IMAGE_NAME="${TOOL_IMAGE:-mini-swe-agent-tool}"
- if [[ -n "${PIP_INDEX_URL}" ]]; then
- BUILD_ARGS+=(--build-arg PIP_INDEX_URL="${PIP_INDEX_URL}")
- fi
-else
- echo "Unknown tool: ${TOOL_KIND}; expected mini_swe or claude_code"
- exit 1
-fi
-
-echo "==> Building ${TOOL_KIND} tool image: ${IMAGE_NAME}:${IMAGE_TAG}"
-docker build \
- -f "${DOCKERFILE}" \
- -t "${IMAGE_NAME}:${IMAGE_TAG}" \
- "${BUILD_ARGS[@]}" \
- "${SCRIPT_DIR}/"
-
-if [[ -n "${REGISTRY}" ]]; then
- FULL_TAG="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
- echo "==> Tagging and pushing: ${FULL_TAG}"
- docker tag "${IMAGE_NAME}:${IMAGE_TAG}" "${FULL_TAG}"
- docker push "${FULL_TAG}"
- echo " Pushed."
-fi
-
-echo ""
-echo "Tool image ready: ${IMAGE_NAME}:${IMAGE_TAG}"
-if [[ -n "${REGISTRY}" ]]; then
- echo " Remote sandbox: ${FULL_TAG}"
-fi
diff --git a/examples/blackbox_recipes/scripts/run_infer.sh b/examples/blackbox_recipes/scripts/run_infer.sh
deleted file mode 100755
index d5703aa6..00000000
--- a/examples/blackbox_recipes/scripts/run_infer.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env bash
-# Parallel inference for the blackbox SWE-agent recipe.
-#
-# Usage:
-# bash examples/swe_agent_blackbox/scripts/run_infer.sh
-
-set -euo pipefail
-
-# ── Model & data ─────────────────────────────────────────────────────────
-MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}"
-DATA_PATH="${DATA_PATH:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
-
-# ── Inference parameters ─────────────────────────────────────────────────
-MAX_SAMPLES="${MAX_SAMPLES:--1}"
-PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
-RESPONSE_LENGTH="${RESPONSE_LENGTH:-65536}"
-TEMPERATURE="${TEMPERATURE:-1.0}"
-TOP_P="${TOP_P:-1.0}"
-N="${N:-8}"
-ENGINE="${ENGINE:-vllm}"
-TP="${TP:-4}"
-N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-8}"
-GATEWAY_COUNT="${GATEWAY_COUNT:-1}"
-MAX_CONCURRENT_SESSIONS="${MAX_CONCURRENT_SESSIONS:-2}"
-
-# ── Agent parameters ─────────────────────────────────────────────────────
-RUNNER="${RUNNER:-uniagent}"
-AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
-export SWE_AGENT_MAX_TURNS="${SWE_AGENT_MAX_TURNS:-100}"
-export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
-SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-}"
-SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
-
-# ── Logging ──────────────────────────────────────────────────────────────
-export VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
-export ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.5}"
-
-echo "=== SWE-Agent Blackbox Inference ==="
-echo "Model: ${MODEL_PATH}"
-echo "Data: ${DATA_PATH}"
-echo "Max samples: ${MAX_SAMPLES}"
-echo "Engine: ${ENGINE} (TP=${TP})"
-echo "Runner: ${RUNNER}"
-echo "Gateway count: ${GATEWAY_COUNT}"
-echo "Max concurrent sessions: ${MAX_CONCURRENT_SESSIONS}"
-echo "====================================="
-
-python examples/swe_agent_blackbox/parallel_infer.py \
- --model-path "${MODEL_PATH}" \
- --data-path "${DATA_PATH}" \
- --max-samples "${MAX_SAMPLES}" \
- --prompt-length "${PROMPT_LENGTH}" \
- --response-length "${RESPONSE_LENGTH}" \
- --temperature "${TEMPERATURE}" \
- --top-p "${TOP_P}" \
- --n "${N}" \
- --engine "${ENGINE}" \
- --tensor-parallel-size "${TP}" \
- --max-turns "${SWE_AGENT_MAX_TURNS}" \
- --runner "${RUNNER}" \
- --agent-config-path "${AGENT_CONFIG_PATH}" \
- --n-gpus-per-node "${N_GPUS_PER_NODE}" \
- --gateway-count "${GATEWAY_COUNT}" \
- --max-concurrent-sessions "${MAX_CONCURRENT_SESSIONS}" \
- --tool-image "${SWE_AGENT_TOOL_IMAGE}" \
- --run-timeout "${SWE_AGENT_RUN_TIMEOUT}"
diff --git a/examples/blackbox_recipes/scripts/run_train.sh b/examples/blackbox_recipes/scripts/run_train.sh
deleted file mode 100755
index cf08005d..00000000
--- a/examples/blackbox_recipes/scripts/run_train.sh
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env bash
-# Training launch script for the blackbox SWE-agent recipe.
-#
-# Uses GRPO + AgentFrameworkRolloutAdapter with reward computed in-process
-# by the agent runner, then passed through the reward worker's compute_score.
-#
-# Usage:
-# bash examples/swe_agent_blackbox/scripts/run_train.sh
-#
-# All configurable via environment variables (see defaults below).
-
-set -euo pipefail
-
-# ── Model & data ─────────────────────────────────────────────────────────
-MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3-Coder-30B-A3B-Instruct}"
-TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
-VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
-
-# ── Hardware ─────────────────────────────────────────────────────────────
-NNODES="${NNODES:-1}"
-NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
-
-# ── Training parameters ─────────────────────────────────────────────────
-TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}"
-PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
-RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
-ACTOR_LR="${ACTOR_LR:-1e-6}"
-TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}"
-SAVE_FREQ="${SAVE_FREQ:-10}"
-TEST_FREQ="${TEST_FREQ:-10}"
-
-# ── Rollout parameters ──────────────────────────────────────────────────
-ENGINE="${ENGINE:-vllm}"
-TP="${TP:-4}"
-ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
-N="${N:-8}"
-TEMPERATURE="${TEMPERATURE:-1.0}"
-
-# ── Agent parameters ─────────────────────────────────────────────────────
-RUNNER="${RUNNER:-mini_swe}"
-MAX_TURNS="${MAX_TURNS:-100}"
-AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
-COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
-if [[ "${RUNNER}" == "claude_code" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
-elif [[ "${RUNNER}" == "mini_swe" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
-elif [[ "${RUNNER}" == "uniagent" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE=""
-else
- echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
- exit 1
-fi
-SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
-RUNNER_ARGS=(
- "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
-)
-if [[ "${RUNNER}" != "uniagent" ]]; then
- RUNNER_ARGS+=(
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
- )
-fi
-
-# ── Logging ──────────────────────────────────────────────────────────────
-PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
-EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
-VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
-
-export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
-export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
-export VERL_LOGGING_LEVEL
-
-# ── Environment for NCCL ─────────────────────────────────────────────────
-export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}"
-export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}"
-
-echo "=== SWE-Agent Blackbox Training ==="
-echo "Model: ${MODEL_PATH}"
-echo "Train data: ${TRAIN_DATA}"
-echo "Val data: ${VAL_DATA}"
-echo "Engine: ${ENGINE} (TP=${TP})"
-echo "Runner: ${RUNNER}"
-echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}"
-echo "Epochs: ${TOTAL_EPOCHS}"
-echo "====================================="
-
-python3 -m verl.trainer.main_ppo_sync \
- --config-name=swe_agent_blackbox \
- --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
- actor_rollout_ref.model.path="${MODEL_PATH}" \
- data.train_files="['${TRAIN_DATA}']" \
- data.val_files="['${VAL_DATA}']" \
- data.train_batch_size=${TRAIN_BATCH_SIZE} \
- data.max_prompt_length=${PROMPT_LENGTH} \
- data.max_response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.name=${ENGINE} \
- actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
- actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
- actor_rollout_ref.rollout.n=${N} \
- actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
- actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
- actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH + 1024)) \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
- actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
- actor_rollout_ref.rollout.nnodes=${NNODES} \
- actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
- trainer.nnodes=${NNODES} \
- trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
- trainer.total_epochs=${TOTAL_EPOCHS} \
- trainer.save_freq=${SAVE_FREQ} \
- trainer.test_freq=${TEST_FREQ} \
- trainer.project_name=${PROJECT_NAME} \
- trainer.experiment_name=${EXPERIMENT_NAME} \
- actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
- actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
- "${RUNNER_ARGS[@]}" \
- "$@"
diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh b/examples/blackbox_recipes/scripts/run_train_megatron_async.sh
deleted file mode 100755
index db3a8264..00000000
--- a/examples/blackbox_recipes/scripts/run_train_megatron_async.sh
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env bash
-# Megatron + TQ fully-async training for the blackbox SWE-agent recipe.
-#
-# Uses FullyAsyncAgentFrameworkRolloutAdapter + SWEAgentFramework with Megatron backend.
-# Data flows through TransferQueue (zero-copy) with ReplayBuffer flow control.
-#
-# Usage:
-# bash examples/swe_agent_blackbox/scripts/run_train_megatron_async.sh
-#
-# All configurable via environment variables (see defaults below).
-
-set -euo pipefail
-
-# ── Model & data ─────────────────────────────────────────────────────────
-MODEL_PATH="${MODEL_PATH:-${HOME}/models/Qwen3.5-9B}"
-TRAIN_DATA="${TRAIN_DATA:-${HOME}/data/swe_agent/swe_rebench_filtered.parquet}"
-VAL_DATA="${VAL_DATA:-${HOME}/data/swe_agent/swe_bench_verified.parquet}"
-RUNTIME_ENV="${RUNTIME_ENV:-}"
-
-# ── Hardware ─────────────────────────────────────────────────────────────
-NNODES_TRAIN="${NNODES_TRAIN:-1}"
-NNODES_ROLLOUT="${NNODES_ROLLOUT:-1}"
-NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
-
-# ── Algorithm ────────────────────────────────────────────────────────────
-CLIP_RATIO_LOW="${CLIP_RATIO_LOW:-0.2}"
-CLIP_RATIO_HIGH="${CLIP_RATIO_HIGH:-0.28}"
-ACTOR_LR="${ACTOR_LR:-1e-6}"
-
-# ── Sequence lengths ─────────────────────────────────────────────────────
-PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
-RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
-MAX_MODEL_LEN=$((PROMPT_LENGTH + RESPONSE_LENGTH))
-
-# ── Rollout parameters ───────────────────────────────────────────────────
-ENGINE="${ENGINE:-vllm}"
-GEN_TP="${GEN_TP:-2}"
-N="${N:-8}"
-TEMPERATURE="${TEMPERATURE:-1.0}"
-ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
-
-# ── Megatron training parallelism ────────────────────────────────────────
-TRAIN_TP="${TRAIN_TP:-8}"
-TRAIN_PP="${TRAIN_PP:-1}"
-TRAIN_CP="${TRAIN_CP:-1}"
-OFFLOAD="${OFFLOAD:-True}"
-OPTIMIZER_OFFLOAD_FRACTION="${OFFLOAD_FRACTION:-1.0}"
-USE_MBRIDGE="${USE_MBRIDGE:-True}"
-PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}"
-
-# ── Agent parameters ─────────────────────────────────────────────────────
-RUNNER="${RUNNER:-mini_swe}"
-MAX_TURNS="${MAX_TURNS:-100}"
-AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
-COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
-if [[ "${RUNNER}" == "claude_code" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
-elif [[ "${RUNNER}" == "mini_swe" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
-elif [[ "${RUNNER}" == "uniagent" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE=""
-else
- echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
- exit 1
-fi
-SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
-CONDA_ENV="${CONDA_ENV:-testbed}"
-RUNNER_ARGS=(
- "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
-)
-if [[ "${RUNNER}" != "uniagent" ]]; then
- RUNNER_ARGS+=(
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.conda_env=${CONDA_ENV}"
- )
-fi
-
-# ── OpenYuanRong (YR remote sandbox) ─────────────────────────────────────
-OPENYUANRONG_SERVER_ADDRESS="${OPENYUANRONG_SERVER_ADDRESS:-}"
-OPENYUANRONG_TOKEN="${OPENYUANRONG_TOKEN:-}"
-OPENYUANRONG_TUNNEL_SSL_VERIFY="${OPENYUANRONG_TUNNEL_SSL_VERIFY:-0}"
-
-# ── Async training ───────────────────────────────────────────────────────
-TOTAL_ROLLOUT_STEPS="${TOTAL_ROLLOUT_STEPS:-100000}"
-STALENESS_THRESHOLD="${STALENESS_THRESHOLD:-1.0}"
-TRIGGER_SYNC_STEP="${TRIGGER_SYNC_STEP:-4}"
-PARTIAL_ROLLOUT="${PARTIAL_ROLLOUT:-True}"
-
-# ── Logging & checkpointing ──────────────────────────────────────────────
-PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
-EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
-SAVE_FREQ="${SAVE_FREQ:-10}"
-TEST_FREQ="${TEST_FREQ:-10}"
-CKPTS_DIR="${CKPTS_DIR:-checkpoints/${PROJECT_NAME}/${EXPERIMENT_NAME}}"
-
-export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
-export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
-export OPENYUANRONG_SERVER_ADDRESS
-export OPENYUANRONG_TOKEN
-export OPENYUANRONG_TUNNEL_SSL_VERIFY
-
-echo "=== SWE-Agent Blackbox Megatron Async Training ==="
-echo "Model: ${MODEL_PATH}"
-echo "Train data: ${TRAIN_DATA}"
-echo "Val data: ${VAL_DATA}"
-echo "Engine: ${ENGINE} (gen_tp=${GEN_TP}, train_tp=${TRAIN_TP})"
-echo "Runner: ${RUNNER}"
-echo "Batch: n=${N}, mini_bsz=${PPO_MINI_BATCH_SIZE}"
-echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}"
-echo "Nodes: train=${NNODES_TRAIN}, rollout=${NNODES_ROLLOUT}"
-echo "==================================================="
-
-# ── Compute derived parameters ───────────────────────────────────────────
-ACTOR_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
-INFER_PPO_MAX_TOKEN_LEN=$(( (PROMPT_LENGTH + RESPONSE_LENGTH) / TRAIN_CP ))
-
-RUNTIME_ENV_ARGS=()
-if [ -n "${RUNTIME_ENV}" ]; then
- RUNTIME_ENV_ARGS=(--runtime-env "${RUNTIME_ENV}")
-fi
-
-# ── Ensure Ray is running ────────────────────────────────────────────────
-TOTAL_GPUS=$(( (NNODES_TRAIN + NNODES_ROLLOUT) * NGPUS_PER_NODE ))
-if ! ray status &>/dev/null; then
- echo "Starting Ray cluster (${TOTAL_GPUS} GPUs)..."
- ray start --head --num-gpus="${TOTAL_GPUS}" --disable-usage-stats
-else
- echo "Ray cluster already running."
-fi
-
-# ── Launch ────────────────────────────────────────────────────────────────
-WORKING_DIR="${WORKING_DIR:-$(pwd)}"
-
-ray job submit --no-wait --working-dir="${WORKING_DIR}" "${RUNTIME_ENV_ARGS[@]}" \
- -- python3 -m verl.experimental.fully_async_policy.fully_async_main \
- --config-name=swe_agent_blackbox_megatron_async \
- --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
- hydra.searchpath=[pkg://verl.trainer.config] \
- actor_rollout_ref.model.path="${MODEL_PATH}" \
- data.train_files="['${TRAIN_DATA}']" \
- data.val_files="['${VAL_DATA}']" \
- data.max_prompt_length=${PROMPT_LENGTH} \
- data.max_response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.n=${N} \
- actor_rollout_ref.rollout.name=${ENGINE} \
- actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
- actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.max_model_len=${MAX_MODEL_LEN} \
- actor_rollout_ref.rollout.max_num_batched_tokens=${MAX_MODEL_LEN} \
- actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
- actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} \
- actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
- actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
- actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
- "${RUNNER_ARGS[@]}" \
- actor_rollout_ref.actor.clip_ratio_low=${CLIP_RATIO_LOW} \
- actor_rollout_ref.actor.clip_ratio_high=${CLIP_RATIO_HIGH} \
- actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \
- actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ACTOR_PPO_MAX_TOKEN_LEN} \
- actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
- actor_rollout_ref.actor.optim.lr_decay_steps=${TOTAL_ROLLOUT_STEPS} \
- +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${OPTIMIZER_OFFLOAD_FRACTION} \
- +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
- +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
- +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
- actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \
- actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \
- actor_rollout_ref.actor.megatron.optimizer_offload=${OFFLOAD} \
- actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \
- actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
- actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \
- actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
- actor_rollout_ref.ref.megatron.param_offload=${OFFLOAD} \
- actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TRAIN_TP} \
- actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
- actor_rollout_ref.ref.megatron.context_parallel_size=${TRAIN_CP} \
- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
- actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
- actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${INFER_PPO_MAX_TOKEN_LEN} \
- trainer.project_name="${PROJECT_NAME}" \
- trainer.experiment_name="${EXPERIMENT_NAME}" \
- trainer.save_freq=${SAVE_FREQ} \
- trainer.test_freq=${TEST_FREQ} \
- trainer.default_local_dir="${CKPTS_DIR}" \
- trainer.nnodes=${NNODES_TRAIN} \
- trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
- rollout.nnodes=${NNODES_ROLLOUT} \
- rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
- rollout.total_rollout_steps=${TOTAL_ROLLOUT_STEPS} \
- async_training.staleness_threshold=${STALENESS_THRESHOLD} \
- async_training.trigger_parameter_sync_step=${TRIGGER_SYNC_STEP} \
- async_training.partial_rollout=${PARTIAL_ROLLOUT} \
- "$@"
diff --git a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh b/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
deleted file mode 100755
index 1a0c19d3..00000000
--- a/examples/blackbox_recipes/scripts/run_train_megatron_sync.sh
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env bash
-# Megatron sync training for the blackbox SWE-agent recipe.
-#
-# Uses main_ppo_sync + Megatron backend with the same blackbox agent infrastructure
-# (AgentFrameworkRolloutAdapter, subprocess_runner, SWEAgentFramework).
-#
-# Usage:
-# bash examples/swe_agent_blackbox/scripts/run_train_megatron_sync.sh
-#
-# All configurable via environment variables (see defaults below).
-
-set -euo pipefail
-
-# ── Model & data ─────────────────────────────────────────────────────────
-MODEL_PATH="${MODEL_PATH:-$HOME/models/Qwen3.5-9B}"
-TRAIN_DATA="${TRAIN_DATA:-$HOME/data/swe_agent/swe_rebench_filtered.parquet}"
-VAL_DATA="${VAL_DATA:-$HOME/data/swe_agent/swe_bench_verified.parquet}"
-
-# ── Hardware ─────────────────────────────────────────────────────────────
-NNODES="${NNODES:-1}"
-NGPUS_PER_NODE="${NGPUS_PER_NODE:-8}"
-
-# ── Training parameters ─────────────────────────────────────────────────
-TRAIN_BATCH_SIZE="${TRAIN_BATCH_SIZE:-128}"
-PROMPT_LENGTH="${PROMPT_LENGTH:-4096}"
-RESPONSE_LENGTH="${RESPONSE_LENGTH:-131072}"
-ACTOR_LR="${ACTOR_LR:-1e-6}"
-TOTAL_EPOCHS="${TOTAL_EPOCHS:-10}"
-SAVE_FREQ="${SAVE_FREQ:-10}"
-TEST_FREQ="${TEST_FREQ:-10}"
-PPO_MINI_BATCH_SIZE="${PPO_MINI_BATCH_SIZE:-16}"
-
-# ── Rollout parameters ──────────────────────────────────────────────────
-ENGINE="${ENGINE:-vllm}"
-TP="${TP:-4}"
-ROLLOUT_GPU_MEM_UTIL="${ROLLOUT_GPU_MEM_UTIL:-0.7}"
-N="${N:-8}"
-TEMPERATURE="${TEMPERATURE:-1.0}"
-
-# ── Megatron parallelism ────────────────────────────────────────────────
-TRAIN_TP="${TRAIN_TP:-8}"
-TRAIN_PP="${TRAIN_PP:-1}"
-TRAIN_CP="${TRAIN_CP:-1}"
-OFFLOAD="${OFFLOAD:-true}"
-USE_MBRIDGE="${USE_MBRIDGE:-true}"
-
-# ── Agent parameters ─────────────────────────────────────────────────────
-RUNNER="${RUNNER:-mini_swe}"
-MAX_TURNS="${MAX_TURNS:-100}"
-AGENT_CONFIG_PATH="${AGENT_CONFIG_PATH:-examples/swe_agent_blackbox/config/agent_config.yaml}"
-COMPLETION_TIMEOUT="${COMPLETION_TIMEOUT:-600}"
-if [[ "${RUNNER}" == "claude_code" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.claude_code_runner.claude_code_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-claude-code-tool:latest}"
-elif [[ "${RUNNER}" == "mini_swe" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.mini_swe_agent_runner.mini_swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE="${SWE_AGENT_TOOL_IMAGE:-swr.cn-east-3.myhuaweicloud.com/openyuanrong/mini-swe-agent-tool:latest}"
-elif [[ "${RUNNER}" == "uniagent" ]]; then
- AGENT_RUNNER_FQN="examples.swe_agent_blackbox.agent_runner.swe_agent_runner"
- SWE_AGENT_TOOL_IMAGE=""
-else
- echo "Unknown RUNNER=${RUNNER}; expected mini_swe, claude_code, or uniagent" >&2
- exit 1
-fi
-SWE_AGENT_RUN_TIMEOUT="${SWE_AGENT_RUN_TIMEOUT:-7200}"
-RUNNER_ARGS=(
- "actor_rollout_ref.rollout.custom.agent_framework.agent_runner_fqn=${AGENT_RUNNER_FQN}"
-)
-if [[ "${RUNNER}" != "uniagent" ]]; then
- RUNNER_ARGS+=(
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.tool_image=${SWE_AGENT_TOOL_IMAGE}"
- "+actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.run_timeout=${SWE_AGENT_RUN_TIMEOUT}"
- )
-fi
-
-# ── Logging ──────────────────────────────────────────────────────────────
-PROJECT_NAME="${PROJECT_NAME:-swe_agent_blackbox}"
-EXPERIMENT_NAME="${EXPERIMENT_NAME:-swe_agent_$(date +%Y%m%d_%H%M)}"
-VERL_LOGGING_LEVEL="${VERL_LOGGING_LEVEL:-INFO}"
-
-export SWE_AGENT_MAX_TURNS="${MAX_TURNS}"
-export SWE_AGENT_EVAL_TIMEOUT="${SWE_AGENT_EVAL_TIMEOUT:-600}"
-export VERL_LOGGING_LEVEL
-
-# ── Environment for NCCL ────────────────────────────────────────────────
-export NCCL_P2P_DISABLE="${NCCL_P2P_DISABLE:-1}"
-export NCCL_SHM_DISABLE="${NCCL_SHM_DISABLE:-1}"
-
-echo "=== SWE-Agent Blackbox Megatron Sync Training ==="
-echo "Model: ${MODEL_PATH}"
-echo "Train data: ${TRAIN_DATA}"
-echo "Val data: ${VAL_DATA}"
-echo "Engine: ${ENGINE} (gen_tp=${TP}, train_tp=${TRAIN_TP})"
-echo "Runner: ${RUNNER}"
-echo "Batch size: ${TRAIN_BATCH_SIZE}, N=${N}"
-echo "Sequence: prompt=${PROMPT_LENGTH}, response=${RESPONSE_LENGTH}"
-echo "==============================================="
-
-python3 -m verl.trainer.main_ppo_sync \
- --config-name=swe_agent_blackbox_megatron_sync \
- --config-path="$(pwd)/examples/swe_agent_blackbox/config" \
- hydra.searchpath=[pkg://verl.trainer.config] \
- actor_rollout_ref.model.path="${MODEL_PATH}" \
- data.train_files="['${TRAIN_DATA}']" \
- data.val_files="['${VAL_DATA}']" \
- data.train_batch_size=${TRAIN_BATCH_SIZE} \
- data.max_prompt_length=${PROMPT_LENGTH} \
- data.max_response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.name=${ENGINE} \
- actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
- actor_rollout_ref.rollout.gpu_memory_utilization=${ROLLOUT_GPU_MEM_UTIL} \
- actor_rollout_ref.rollout.n=${N} \
- actor_rollout_ref.rollout.temperature=${TEMPERATURE} \
- actor_rollout_ref.rollout.prompt_length=${PROMPT_LENGTH} \
- actor_rollout_ref.rollout.response_length=${RESPONSE_LENGTH} \
- actor_rollout_ref.rollout.max_model_len=$((PROMPT_LENGTH + RESPONSE_LENGTH)) \
- actor_rollout_ref.rollout.multi_turn.max_assistant_turns=${MAX_TURNS} \
- actor_rollout_ref.actor.optim.lr=${ACTOR_LR} \
- actor_rollout_ref.actor.ppo_mini_batch_size=${PPO_MINI_BATCH_SIZE} \
- actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TRAIN_TP} \
- actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${TRAIN_PP} \
- actor_rollout_ref.actor.megatron.context_parallel_size=${TRAIN_CP} \
- actor_rollout_ref.actor.megatron.param_offload=${OFFLOAD} \
- actor_rollout_ref.actor.megatron.grad_offload=${OFFLOAD} \
- actor_rollout_ref.actor.megatron.use_mbridge=${USE_MBRIDGE} \
- actor_rollout_ref.rollout.nnodes=${NNODES} \
- actor_rollout_ref.rollout.n_gpus_per_node=${NGPUS_PER_NODE} \
- trainer.nnodes=${NNODES} \
- trainer.n_gpus_per_node=${NGPUS_PER_NODE} \
- trainer.total_epochs=${TOTAL_EPOCHS} \
- trainer.save_freq=${SAVE_FREQ} \
- trainer.test_freq=${TEST_FREQ} \
- trainer.project_name=${PROJECT_NAME} \
- trainer.experiment_name=${EXPERIMENT_NAME} \
- actor_rollout_ref.rollout.custom.agent_framework.agent_runner_kwargs.agent_config_path="${AGENT_CONFIG_PATH}" \
- actor_rollout_ref.rollout.custom.agent_framework.completion_timeout_seconds=${COMPLETION_TIMEOUT} \
- "${RUNNER_ARGS[@]}" \
- "$@"
diff --git a/examples/data_preprocess/r2e_gym_subset_filtered.py b/examples/data_preprocess/r2e_gym_subset_filtered.py
index c97afbdc..eeafbc0a 100644
--- a/examples/data_preprocess/r2e_gym_subset_filtered.py
+++ b/examples/data_preprocess/r2e_gym_subset_filtered.py
@@ -17,6 +17,13 @@ def get_image_name(dataset_id: str, instance_id: str) -> str:
assert len(parts) == 2
instance_number = parts[1].lower()
return PUB_VOLCES_IMG_URL_R2E.format(instance_number=instance_number)
+elif impl == "openyuanrong":
+
+ def get_image_name(dataset_id: str, instance_id: str) -> str:
+ parts = instance_id.split("__")
+ assert len(parts) == 2
+ instance_number = parts[1].lower()
+ return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/r2e-gym-subset/{instance_number}:latest"
else:
raise ValueError(f"Invalid deployment implementation: {impl}")
diff --git a/examples/data_preprocess/swe_bench_verified.py b/examples/data_preprocess/swe_bench_verified.py
index 8f26ad16..3a56695e 100644
--- a/examples/data_preprocess/swe_bench_verified.py
+++ b/examples/data_preprocess/swe_bench_verified.py
@@ -18,6 +18,15 @@ def get_image_name(dataset_id: str, instance_id: str) -> str:
project_name = parts[0].lower()
instance_number = parts[1].lower()
return f"swebench/sweb.eval.x86_64.{project_name}_1776_{instance_number}"
+elif impl == "openyuanrong":
+
+ def get_image_name(dataset_id: str, instance_id: str) -> str:
+ assert dataset_id == "swe-bench-verified"
+ parts = instance_id.split("__")
+ assert len(parts) == 2
+ project_name = parts[0].lower()
+ instance_number = parts[1].lower()
+ return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/swe-bench-verified/sweb.eval.x86_64.{project_name}_1776_{instance_number}:v2"
else:
raise ValueError(f"Invalid deployment implementation: {impl}")
diff --git a/examples/data_preprocess/swe_rebench.py b/examples/data_preprocess/swe_rebench.py
index 3add8b28..1cb907df 100644
--- a/examples/data_preprocess/swe_rebench.py
+++ b/examples/data_preprocess/swe_rebench.py
@@ -17,6 +17,14 @@ def get_image_name(dataset_id, instance_id):
project_name = parts[0].lower()
instance_number = parts[1].lower()
return f"swerebench/sweb.eval.x86_64.{project_name}_1776_{instance_number}"
+elif impl == "openyuanrong":
+
+ def get_image_name(dataset_id: str, instance_id: str) -> str:
+ parts = instance_id.split("__")
+ assert len(parts) == 2
+ project_name = parts[0].lower()
+ instance_number = parts[1].lower()
+ return f"swr.cn-east-3.myhuaweicloud.com/openyuanrong/swe-rebench/{project_name}_1776_{instance_number}:latest"
else:
raise ValueError(f"Invalid deployment implementation: {impl}")
diff --git a/uni_agent/gateway/session/codec.py b/uni_agent/gateway/session/codec.py
index 747dbde8..b808183d 100644
--- a/uni_agent/gateway/session/codec.py
+++ b/uni_agent/gateway/session/codec.py
@@ -7,8 +7,8 @@
from uuid import uuid4
from verl.experimental.agent_loop.tool_parser import ToolParser
-from verl.utils.chat_template import apply_chat_template as _apply_chat_template
-from verl.utils.chat_template import initialize_system_prompt
+from verl.utils.tokenizer.chat_template import apply_chat_template as _apply_chat_template
+from verl.utils.tokenizer.chat_template import initialize_system_prompt
from verl.utils.tokenizer import normalize_token_ids
diff --git a/uni_agent/interaction/model.py b/uni_agent/interaction/model.py
index bfa0651f..1184688f 100644
--- a/uni_agent/interaction/model.py
+++ b/uni_agent/interaction/model.py
@@ -138,7 +138,7 @@ async def query(
return response_str, [], rollout_cache, generation_info
async def _get_new_message_ids(self, new_messages: list[dict[str, Any]]) -> list[int]:
- from verl.utils.chat_template import apply_chat_template
+ from verl.utils.tokenizer.chat_template import apply_chat_template
from verl.utils.tokenizer import normalize_token_ids
tokenized_prompt = await self.loop.run_in_executor(
@@ -154,7 +154,7 @@ async def _get_new_message_ids(self, new_messages: list[dict[str, Any]]) -> list
@cached_property
def message_boundary_tokens(self) -> list[int]:
- from verl.utils.chat_template import apply_chat_template
+ from verl.utils.tokenizer.chat_template import apply_chat_template
from verl.utils.tokenizer import normalize_token_ids
dummy_history = [
diff --git a/verl b/verl
index 7aed6b23..6fef6a7a 160000
--- a/verl
+++ b/verl
@@ -1 +1 @@
-Subproject commit 7aed6b230776f963fa09509c10d9c3a767d1102c
+Subproject commit 6fef6a7a699435cad84e8907e9121457e41eed04